xref: /aosp_15_r20/external/cronet/third_party/apache-portable-runtime/src/misc/win32/utf8.c (revision 6777b5387eb2ff775bb5750e3f5d96f37fb7352b)
1 /* Licensed to the Apache Software Foundation (ASF) under one or more
2  * contributor license agreements.  See the NOTICE file distributed with
3  * this work for additional information regarding copyright ownership.
4  * The ASF licenses this file to You under the Apache License, Version 2.0
5  * (the "License"); you may not use this file except in compliance with
6  * the License.  You may obtain a copy of the License at
7  *
8  *     http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "apr.h"
18 #include "apr_private.h"
19 #include "apr_errno.h"
20 #include "apr_arch_utf8.h"
21 
22 /* Implementation of RFC 3629, "UTF-8, a transformation format of ISO 10646"
23  * with particular attention to canonical translation forms (see section 10
24  * "Security Considerations" of the RFC for more info).
25  *
26  * Since several architectures including Windows support unicode, with UCS2
27  * used as the actual storage conventions by that architecture, these functions
28  * exist to transform or validate UCS2 strings into APR's 'char' type
29  * convention.  It is left up to the operating system to determine the
30  * validity of the string, e.g. normative forms, in the context of
31  * its native language support.  Other file systems which support filename
32  * characters of 0x80-0xff but have no explicit requirement for Unicode
33  * will find this function useful only for validating the character sequences
34  * and rejecting poorly encoded UTF8 sequences.
35  *
36  * Len UCS-4 range (hex) UTF-8 octet sequence (binary)
37  * 1:2 00000000-0000007F 0xxxxxxx
38  * 2:2 00000080-000007FF 110XXXXx 10xxxxxx
39  * 3:2 00000800-0000FFFF 1110XXXX 10Xxxxxx 10xxxxxx
40  * 4:4 00010000-001FFFFF 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx
41  *     00200000-03FFFFFF 111110XX 10XXXxxx 10xxxxxx 10xxxxxx 10xxxxxx
42  *     04000000-7FFFFFFF 1111110X 10XXXXxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
43  *
44  * One of the X bits must be 1 to avoid overlong representation of ucs2 values.
45  *
46  * For conversion into ucs2, the 4th form is limited in range to 0010 FFFF,
47  * and the final two forms are used only by full ucs4, per RFC 3629;
48  *
49  *   "Pairs of UCS-2 values between D800 and DFFF (surrogate pairs in
50  *   Unicode parlance), being actually UCS-4 characters transformed
51  *   through UTF-16, need special treatment: the UTF-16 transformation
52  *   must be undone, yielding a UCS-4 character that is then transformed
53  *   as above."
54  *
55  * From RFC2781 UTF-16: the compressed ISO 10646 encoding bitmask
56  *
57  *  U' = U - 0x10000
58  *  U' = 00000000 0000yyyy yyyyyyxx xxxxxxxx
59  *                    W1 = 110110yy yyyyyyyy
60  *                    W2 = 110111xx xxxxxxxx
61  *  Max U' = 0000 00001111 11111111 11111111
62  *  Max U  = 0000 00010000 11111111 11111111
63  *
64  * Len in the table above is a mapping of bytes used for utf8:ucs2 values,
65  * which results in these conclusions of maximum allocations;
66  *
67  *  apr_conv_utf8_to_ucs2 out bytes:sizeof(in) * 1 <= Req <= sizeof(in) * 2
68  *  apr_conv_ucs2_to_utf8 out words:sizeof(in) / 2 <= Req <= sizeof(in) * 3 / 2
69  */
70 
apr_conv_utf8_to_ucs2(const char * in,apr_size_t * inbytes,apr_wchar_t * out,apr_size_t * outwords)71 APR_DECLARE(apr_status_t) apr_conv_utf8_to_ucs2(const char *in,
72                                                 apr_size_t *inbytes,
73                                                 apr_wchar_t *out,
74                                                 apr_size_t *outwords)
75 {
76     apr_int64_t newch, mask;
77     apr_size_t expect, eating;
78     int ch;
79 
80     while (*inbytes && *outwords)
81     {
82         ch = (unsigned char)(*in++);
83         if (!(ch & 0200)) {
84             /* US-ASCII-7 plain text
85              */
86             --*inbytes;
87             --*outwords;
88             *(out++) = ch;
89         }
90         else
91         {
92             if ((ch & 0300) != 0300) {
93                 /* Multibyte Continuation is out of place
94                  */
95                 return APR_EINVAL;
96             }
97             else
98             {
99                 /* Multibyte Sequence Lead Character
100                  *
101                  * Compute the expected bytes while adjusting
102                  * or lead byte and leading zeros mask.
103                  */
104                 mask = 0340;
105                 expect = 1;
106                 while ((ch & mask) == mask) {
107                     mask |= mask >> 1;
108                     if (++expect > 3) /* (truly 5 for ucs-4) */
109                         return APR_EINVAL;
110                 }
111                 newch = ch & ~mask;
112                 eating = expect + 1;
113                 if (*inbytes <= expect)
114                     return APR_INCOMPLETE;
115                 /* Reject values of excessive leading 0 bits
116                  * utf-8 _demands_ the shortest possible byte length
117                  */
118                 if (expect == 1) {
119                     if (!(newch & 0036))
120                         return APR_EINVAL;
121                 }
122                 else {
123                     /* Reject values of excessive leading 0 bits
124                      */
125                     if (!newch && !((unsigned char)*in & 0077 & (mask << 1)))
126                         return APR_EINVAL;
127                     if (expect == 2) {
128                         /* Reject values D800-DFFF when not utf16 encoded
129                          * (may not be an appropriate restriction for ucs-4)
130                          */
131                         if (newch == 0015 && ((unsigned char)*in & 0040))
132                             return APR_EINVAL;
133                     }
134                     else if (expect == 3) {
135                         /* Short circuit values > 110000
136                          */
137                         if (newch > 4)
138                             return APR_EINVAL;
139                         if (newch == 4 && ((unsigned char)*in & 0060))
140                             return APR_EINVAL;
141                     }
142                 }
143                 /* Where the boolean (expect > 2) is true, we will need
144                  * an extra word for the output.
145                  */
146                 if (*outwords < (apr_size_t)(expect > 2) + 1)
147                     break; /* buffer full */
148                 while (expect--)
149                 {
150                     /* Multibyte Continuation must be legal */
151                     if (((ch = (unsigned char)*(in++)) & 0300) != 0200)
152                         return APR_EINVAL;
153                     newch <<= 6;
154                     newch |= (ch & 0077);
155                 }
156                 *inbytes -= eating;
157                 /* newch is now a true ucs-4 character
158                  *
159                  * now we need to fold to ucs-2
160                  */
161                 if (newch < 0x10000)
162                 {
163                     --*outwords;
164                     *(out++) = (apr_wchar_t) newch;
165                 }
166                 else
167                 {
168                     *outwords -= 2;
169                     newch -= 0x10000;
170                     *(out++) = (apr_wchar_t) (0xD800 | (newch >> 10));
171                     *(out++) = (apr_wchar_t) (0xDC00 | (newch & 0x03FF));
172                 }
173             }
174         }
175     }
176     /* Buffer full 'errors' aren't errors, the client must inspect both
177      * the inbytes and outwords values
178      */
179     return APR_SUCCESS;
180 }
181 
apr_conv_ucs2_to_utf8(const apr_wchar_t * in,apr_size_t * inwords,char * out,apr_size_t * outbytes)182 APR_DECLARE(apr_status_t) apr_conv_ucs2_to_utf8(const apr_wchar_t *in,
183                                                 apr_size_t *inwords,
184                                                 char *out,
185                                                 apr_size_t *outbytes)
186 {
187     apr_int64_t newch, require;
188     apr_size_t need;
189     char *invout;
190     int ch;
191 
192     while (*inwords && *outbytes)
193     {
194         ch = (unsigned short)(*in++);
195         if (ch < 0x80)
196         {
197             --*inwords;
198             --*outbytes;
199             *(out++) = (unsigned char) ch;
200         }
201         else
202         {
203             if ((ch & 0xFC00) == 0xDC00) {
204                 /* Invalid Leading ucs-2 Multiword Continuation Character
205                  */
206                 return APR_EINVAL;
207             }
208             if ((ch & 0xFC00) == 0xD800) {
209                 /* Leading ucs-2 Multiword Character
210                  */
211                 if (*inwords < 2) {
212                     /* Missing ucs-2 Multiword Continuation Character
213                      */
214                     return APR_INCOMPLETE;
215                 }
216                 if (((unsigned short)(*in) & 0xFC00) != 0xDC00) {
217                     /* Invalid ucs-2 Multiword Continuation Character
218                      */
219                     return APR_EINVAL;
220                 }
221                 newch = (ch & 0x03FF) << 10 | ((unsigned short)(*in++) & 0x03FF);
222                 newch += 0x10000;
223             }
224             else {
225                 /* ucs-2 Single Word Character
226                  */
227                 newch = ch;
228             }
229             /* Determine the absolute minimum utf-8 bytes required
230              */
231             require = newch >> 11;
232             need = 1;
233             while (require)
234                 require >>= 5, ++need;
235             if (need >= *outbytes)
236                 break; /* Insufficient buffer */
237             *inwords -= (need > 2) + 1;
238             *outbytes -= need + 1;
239             /* Compute the utf-8 characters in last to first order,
240              * calculating the lead character length bits along the way.
241              */
242             ch = 0200;
243             out += need + 1;
244             invout = out;
245             while (need--) {
246                 ch |= ch >> 1;
247                 *(--invout) = (unsigned char)(0200 | (newch & 0077));
248                 newch >>= 6;
249             }
250             /* Compute the lead utf-8 character and move the dest offset
251              */
252             *(--invout) = (unsigned char)(ch | newch);
253         }
254     }
255     /* Buffer full 'errors' aren't errors, the client must inspect both
256      * the inwords and outbytes values
257      */
258     return APR_SUCCESS;
259 }
260