1 /* Licensed to the Apache Software Foundation (ASF) under one or more
2 * contributor license agreements. See the NOTICE file distributed with
3 * this work for additional information regarding copyright ownership.
4 * The ASF licenses this file to You under the Apache License, Version 2.0
5 * (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "apr.h"
18 #include "apr_private.h"
19 #include "apr_errno.h"
20 #include "apr_arch_utf8.h"
21
22 /* Implementation of RFC 3629, "UTF-8, a transformation format of ISO 10646"
23 * with particular attention to canonical translation forms (see section 10
24 * "Security Considerations" of the RFC for more info).
25 *
26 * Since several architectures including Windows support unicode, with UCS2
27 * used as the actual storage conventions by that architecture, these functions
28 * exist to transform or validate UCS2 strings into APR's 'char' type
29 * convention. It is left up to the operating system to determine the
30 * validity of the string, e.g. normative forms, in the context of
31 * its native language support. Other file systems which support filename
32 * characters of 0x80-0xff but have no explicit requirement for Unicode
33 * will find this function useful only for validating the character sequences
34 * and rejecting poorly encoded UTF8 sequences.
35 *
36 * Len UCS-4 range (hex) UTF-8 octet sequence (binary)
37 * 1:2 00000000-0000007F 0xxxxxxx
38 * 2:2 00000080-000007FF 110XXXXx 10xxxxxx
39 * 3:2 00000800-0000FFFF 1110XXXX 10Xxxxxx 10xxxxxx
40 * 4:4 00010000-001FFFFF 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx
41 * 00200000-03FFFFFF 111110XX 10XXXxxx 10xxxxxx 10xxxxxx 10xxxxxx
42 * 04000000-7FFFFFFF 1111110X 10XXXXxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
43 *
44 * One of the X bits must be 1 to avoid overlong representation of ucs2 values.
45 *
46 * For conversion into ucs2, the 4th form is limited in range to 0010 FFFF,
47 * and the final two forms are used only by full ucs4, per RFC 3629;
48 *
49 * "Pairs of UCS-2 values between D800 and DFFF (surrogate pairs in
50 * Unicode parlance), being actually UCS-4 characters transformed
51 * through UTF-16, need special treatment: the UTF-16 transformation
52 * must be undone, yielding a UCS-4 character that is then transformed
53 * as above."
54 *
55 * From RFC2781 UTF-16: the compressed ISO 10646 encoding bitmask
56 *
57 * U' = U - 0x10000
58 * U' = 00000000 0000yyyy yyyyyyxx xxxxxxxx
59 * W1 = 110110yy yyyyyyyy
60 * W2 = 110111xx xxxxxxxx
61 * Max U' = 0000 00001111 11111111 11111111
62 * Max U = 0000 00010000 11111111 11111111
63 *
64 * Len in the table above is a mapping of bytes used for utf8:ucs2 values,
65 * which results in these conclusions of maximum allocations;
66 *
67 * apr_conv_utf8_to_ucs2 out bytes:sizeof(in) * 1 <= Req <= sizeof(in) * 2
68 * apr_conv_ucs2_to_utf8 out words:sizeof(in) / 2 <= Req <= sizeof(in) * 3 / 2
69 */
70
apr_conv_utf8_to_ucs2(const char * in,apr_size_t * inbytes,apr_wchar_t * out,apr_size_t * outwords)71 APR_DECLARE(apr_status_t) apr_conv_utf8_to_ucs2(const char *in,
72 apr_size_t *inbytes,
73 apr_wchar_t *out,
74 apr_size_t *outwords)
75 {
76 apr_int64_t newch, mask;
77 apr_size_t expect, eating;
78 int ch;
79
80 while (*inbytes && *outwords)
81 {
82 ch = (unsigned char)(*in++);
83 if (!(ch & 0200)) {
84 /* US-ASCII-7 plain text
85 */
86 --*inbytes;
87 --*outwords;
88 *(out++) = ch;
89 }
90 else
91 {
92 if ((ch & 0300) != 0300) {
93 /* Multibyte Continuation is out of place
94 */
95 return APR_EINVAL;
96 }
97 else
98 {
99 /* Multibyte Sequence Lead Character
100 *
101 * Compute the expected bytes while adjusting
102 * or lead byte and leading zeros mask.
103 */
104 mask = 0340;
105 expect = 1;
106 while ((ch & mask) == mask) {
107 mask |= mask >> 1;
108 if (++expect > 3) /* (truly 5 for ucs-4) */
109 return APR_EINVAL;
110 }
111 newch = ch & ~mask;
112 eating = expect + 1;
113 if (*inbytes <= expect)
114 return APR_INCOMPLETE;
115 /* Reject values of excessive leading 0 bits
116 * utf-8 _demands_ the shortest possible byte length
117 */
118 if (expect == 1) {
119 if (!(newch & 0036))
120 return APR_EINVAL;
121 }
122 else {
123 /* Reject values of excessive leading 0 bits
124 */
125 if (!newch && !((unsigned char)*in & 0077 & (mask << 1)))
126 return APR_EINVAL;
127 if (expect == 2) {
128 /* Reject values D800-DFFF when not utf16 encoded
129 * (may not be an appropriate restriction for ucs-4)
130 */
131 if (newch == 0015 && ((unsigned char)*in & 0040))
132 return APR_EINVAL;
133 }
134 else if (expect == 3) {
135 /* Short circuit values > 110000
136 */
137 if (newch > 4)
138 return APR_EINVAL;
139 if (newch == 4 && ((unsigned char)*in & 0060))
140 return APR_EINVAL;
141 }
142 }
143 /* Where the boolean (expect > 2) is true, we will need
144 * an extra word for the output.
145 */
146 if (*outwords < (apr_size_t)(expect > 2) + 1)
147 break; /* buffer full */
148 while (expect--)
149 {
150 /* Multibyte Continuation must be legal */
151 if (((ch = (unsigned char)*(in++)) & 0300) != 0200)
152 return APR_EINVAL;
153 newch <<= 6;
154 newch |= (ch & 0077);
155 }
156 *inbytes -= eating;
157 /* newch is now a true ucs-4 character
158 *
159 * now we need to fold to ucs-2
160 */
161 if (newch < 0x10000)
162 {
163 --*outwords;
164 *(out++) = (apr_wchar_t) newch;
165 }
166 else
167 {
168 *outwords -= 2;
169 newch -= 0x10000;
170 *(out++) = (apr_wchar_t) (0xD800 | (newch >> 10));
171 *(out++) = (apr_wchar_t) (0xDC00 | (newch & 0x03FF));
172 }
173 }
174 }
175 }
176 /* Buffer full 'errors' aren't errors, the client must inspect both
177 * the inbytes and outwords values
178 */
179 return APR_SUCCESS;
180 }
181
apr_conv_ucs2_to_utf8(const apr_wchar_t * in,apr_size_t * inwords,char * out,apr_size_t * outbytes)182 APR_DECLARE(apr_status_t) apr_conv_ucs2_to_utf8(const apr_wchar_t *in,
183 apr_size_t *inwords,
184 char *out,
185 apr_size_t *outbytes)
186 {
187 apr_int64_t newch, require;
188 apr_size_t need;
189 char *invout;
190 int ch;
191
192 while (*inwords && *outbytes)
193 {
194 ch = (unsigned short)(*in++);
195 if (ch < 0x80)
196 {
197 --*inwords;
198 --*outbytes;
199 *(out++) = (unsigned char) ch;
200 }
201 else
202 {
203 if ((ch & 0xFC00) == 0xDC00) {
204 /* Invalid Leading ucs-2 Multiword Continuation Character
205 */
206 return APR_EINVAL;
207 }
208 if ((ch & 0xFC00) == 0xD800) {
209 /* Leading ucs-2 Multiword Character
210 */
211 if (*inwords < 2) {
212 /* Missing ucs-2 Multiword Continuation Character
213 */
214 return APR_INCOMPLETE;
215 }
216 if (((unsigned short)(*in) & 0xFC00) != 0xDC00) {
217 /* Invalid ucs-2 Multiword Continuation Character
218 */
219 return APR_EINVAL;
220 }
221 newch = (ch & 0x03FF) << 10 | ((unsigned short)(*in++) & 0x03FF);
222 newch += 0x10000;
223 }
224 else {
225 /* ucs-2 Single Word Character
226 */
227 newch = ch;
228 }
229 /* Determine the absolute minimum utf-8 bytes required
230 */
231 require = newch >> 11;
232 need = 1;
233 while (require)
234 require >>= 5, ++need;
235 if (need >= *outbytes)
236 break; /* Insufficient buffer */
237 *inwords -= (need > 2) + 1;
238 *outbytes -= need + 1;
239 /* Compute the utf-8 characters in last to first order,
240 * calculating the lead character length bits along the way.
241 */
242 ch = 0200;
243 out += need + 1;
244 invout = out;
245 while (need--) {
246 ch |= ch >> 1;
247 *(--invout) = (unsigned char)(0200 | (newch & 0077));
248 newch >>= 6;
249 }
250 /* Compute the lead utf-8 character and move the dest offset
251 */
252 *(--invout) = (unsigned char)(ch | newch);
253 }
254 }
255 /* Buffer full 'errors' aren't errors, the client must inspect both
256 * the inwords and outbytes values
257 */
258 return APR_SUCCESS;
259 }
260