xref: /aosp_15_r20/art/libdexfile/dex/utf.cc (revision 795d594fd825385562da6b089ea9b2033f3abf5a)
1*795d594fSAndroid Build Coastguard Worker /*
2*795d594fSAndroid Build Coastguard Worker  * Copyright (C) 2011 The Android Open Source Project
3*795d594fSAndroid Build Coastguard Worker  *
4*795d594fSAndroid Build Coastguard Worker  * Licensed under the Apache License, Version 2.0 (the "License");
5*795d594fSAndroid Build Coastguard Worker  * you may not use this file except in compliance with the License.
6*795d594fSAndroid Build Coastguard Worker  * You may obtain a copy of the License at
7*795d594fSAndroid Build Coastguard Worker  *
8*795d594fSAndroid Build Coastguard Worker  *      http://www.apache.org/licenses/LICENSE-2.0
9*795d594fSAndroid Build Coastguard Worker  *
10*795d594fSAndroid Build Coastguard Worker  * Unless required by applicable law or agreed to in writing, software
11*795d594fSAndroid Build Coastguard Worker  * distributed under the License is distributed on an "AS IS" BASIS,
12*795d594fSAndroid Build Coastguard Worker  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13*795d594fSAndroid Build Coastguard Worker  * See the License for the specific language governing permissions and
14*795d594fSAndroid Build Coastguard Worker  * limitations under the License.
15*795d594fSAndroid Build Coastguard Worker  */
16*795d594fSAndroid Build Coastguard Worker 
17*795d594fSAndroid Build Coastguard Worker #include "utf.h"
18*795d594fSAndroid Build Coastguard Worker 
19*795d594fSAndroid Build Coastguard Worker #include <android-base/logging.h>
20*795d594fSAndroid Build Coastguard Worker #include <android-base/stringprintf.h>
21*795d594fSAndroid Build Coastguard Worker #include <android-base/strings.h>
22*795d594fSAndroid Build Coastguard Worker 
23*795d594fSAndroid Build Coastguard Worker #include "base/casts.h"
24*795d594fSAndroid Build Coastguard Worker #include "utf-inl.h"
25*795d594fSAndroid Build Coastguard Worker 
26*795d594fSAndroid Build Coastguard Worker namespace art {
27*795d594fSAndroid Build Coastguard Worker 
28*795d594fSAndroid Build Coastguard Worker using android::base::StringAppendF;
29*795d594fSAndroid Build Coastguard Worker 
30*795d594fSAndroid Build Coastguard Worker // This is used only from debugger and test code.
CountModifiedUtf8Chars(const char * utf8)31*795d594fSAndroid Build Coastguard Worker size_t CountModifiedUtf8Chars(const char* utf8) {
32*795d594fSAndroid Build Coastguard Worker   return CountModifiedUtf8Chars(utf8, strlen(utf8));
33*795d594fSAndroid Build Coastguard Worker }
34*795d594fSAndroid Build Coastguard Worker 
35*795d594fSAndroid Build Coastguard Worker /*
36*795d594fSAndroid Build Coastguard Worker  * This does not validate UTF8 rules (nor did older code). But it gets the right answer
37*795d594fSAndroid Build Coastguard Worker  * for valid UTF-8 and that's fine because it's used only to size a buffer for later
38*795d594fSAndroid Build Coastguard Worker  * conversion.
39*795d594fSAndroid Build Coastguard Worker  *
40*795d594fSAndroid Build Coastguard Worker  * Modified UTF-8 consists of a series of bytes up to 21 bit Unicode code points as follows:
41*795d594fSAndroid Build Coastguard Worker  * U+0001  - U+007F   0xxxxxxx
42*795d594fSAndroid Build Coastguard Worker  * U+0080  - U+07FF   110xxxxx 10xxxxxx
43*795d594fSAndroid Build Coastguard Worker  * U+0800  - U+FFFF   1110xxxx 10xxxxxx 10xxxxxx
44*795d594fSAndroid Build Coastguard Worker  * U+10000 - U+1FFFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
45*795d594fSAndroid Build Coastguard Worker  *
46*795d594fSAndroid Build Coastguard Worker  * U+0000 is encoded using the 2nd form to avoid nulls inside strings (this differs from
47*795d594fSAndroid Build Coastguard Worker  * standard UTF-8).
48*795d594fSAndroid Build Coastguard Worker  * The four byte encoding converts to two utf16 characters.
49*795d594fSAndroid Build Coastguard Worker  */
CountModifiedUtf8Chars(const char * utf8,size_t byte_count)50*795d594fSAndroid Build Coastguard Worker size_t CountModifiedUtf8Chars(const char* utf8, size_t byte_count) {
51*795d594fSAndroid Build Coastguard Worker   DCHECK_LE(byte_count, strlen(utf8));
52*795d594fSAndroid Build Coastguard Worker   size_t len = 0;
53*795d594fSAndroid Build Coastguard Worker   const char* end = utf8 + byte_count;
54*795d594fSAndroid Build Coastguard Worker   for (; utf8 < end; ++utf8) {
55*795d594fSAndroid Build Coastguard Worker     int ic = *utf8;
56*795d594fSAndroid Build Coastguard Worker     len++;
57*795d594fSAndroid Build Coastguard Worker     if (LIKELY((ic & 0x80) == 0)) {
58*795d594fSAndroid Build Coastguard Worker       // One-byte encoding.
59*795d594fSAndroid Build Coastguard Worker       continue;
60*795d594fSAndroid Build Coastguard Worker     }
61*795d594fSAndroid Build Coastguard Worker     // Two- or three-byte encoding.
62*795d594fSAndroid Build Coastguard Worker     utf8++;
63*795d594fSAndroid Build Coastguard Worker     if ((ic & 0x20) == 0) {
64*795d594fSAndroid Build Coastguard Worker       // Two-byte encoding.
65*795d594fSAndroid Build Coastguard Worker       continue;
66*795d594fSAndroid Build Coastguard Worker     }
67*795d594fSAndroid Build Coastguard Worker     utf8++;
68*795d594fSAndroid Build Coastguard Worker     if ((ic & 0x10) == 0) {
69*795d594fSAndroid Build Coastguard Worker       // Three-byte encoding.
70*795d594fSAndroid Build Coastguard Worker       continue;
71*795d594fSAndroid Build Coastguard Worker     }
72*795d594fSAndroid Build Coastguard Worker 
73*795d594fSAndroid Build Coastguard Worker     // Four-byte encoding: needs to be converted into a surrogate
74*795d594fSAndroid Build Coastguard Worker     // pair.
75*795d594fSAndroid Build Coastguard Worker     utf8++;
76*795d594fSAndroid Build Coastguard Worker     len++;
77*795d594fSAndroid Build Coastguard Worker   }
78*795d594fSAndroid Build Coastguard Worker   return len;
79*795d594fSAndroid Build Coastguard Worker }
80*795d594fSAndroid Build Coastguard Worker 
81*795d594fSAndroid Build Coastguard Worker // This is used only from debugger and test code.
ConvertModifiedUtf8ToUtf16(uint16_t * utf16_data_out,const char * utf8_data_in)82*795d594fSAndroid Build Coastguard Worker void ConvertModifiedUtf8ToUtf16(uint16_t* utf16_data_out, const char* utf8_data_in) {
83*795d594fSAndroid Build Coastguard Worker   while (*utf8_data_in != '\0') {
84*795d594fSAndroid Build Coastguard Worker     const uint32_t ch = GetUtf16FromUtf8(&utf8_data_in);
85*795d594fSAndroid Build Coastguard Worker     const uint16_t leading = GetLeadingUtf16Char(ch);
86*795d594fSAndroid Build Coastguard Worker     const uint16_t trailing = GetTrailingUtf16Char(ch);
87*795d594fSAndroid Build Coastguard Worker 
88*795d594fSAndroid Build Coastguard Worker     *utf16_data_out++ = leading;
89*795d594fSAndroid Build Coastguard Worker     if (trailing != 0) {
90*795d594fSAndroid Build Coastguard Worker       *utf16_data_out++ = trailing;
91*795d594fSAndroid Build Coastguard Worker     }
92*795d594fSAndroid Build Coastguard Worker   }
93*795d594fSAndroid Build Coastguard Worker }
94*795d594fSAndroid Build Coastguard Worker 
ConvertModifiedUtf8ToUtf16(uint16_t * utf16_data_out,size_t out_chars,const char * utf8_data_in,size_t in_bytes)95*795d594fSAndroid Build Coastguard Worker void ConvertModifiedUtf8ToUtf16(uint16_t* utf16_data_out, size_t out_chars,
96*795d594fSAndroid Build Coastguard Worker                                 const char* utf8_data_in, size_t in_bytes) {
97*795d594fSAndroid Build Coastguard Worker   const char *in_start = utf8_data_in;
98*795d594fSAndroid Build Coastguard Worker   const char *in_end = utf8_data_in + in_bytes;
99*795d594fSAndroid Build Coastguard Worker   uint16_t *out_p = utf16_data_out;
100*795d594fSAndroid Build Coastguard Worker 
101*795d594fSAndroid Build Coastguard Worker   if (LIKELY(out_chars == in_bytes)) {
102*795d594fSAndroid Build Coastguard Worker     // Common case where all characters are ASCII.
103*795d594fSAndroid Build Coastguard Worker     for (const char *p = in_start; p < in_end;) {
104*795d594fSAndroid Build Coastguard Worker       // Safe even if char is signed because ASCII characters always have
105*795d594fSAndroid Build Coastguard Worker       // the high bit cleared.
106*795d594fSAndroid Build Coastguard Worker       *out_p++ = dchecked_integral_cast<uint16_t>(*p++);
107*795d594fSAndroid Build Coastguard Worker     }
108*795d594fSAndroid Build Coastguard Worker     return;
109*795d594fSAndroid Build Coastguard Worker   }
110*795d594fSAndroid Build Coastguard Worker 
111*795d594fSAndroid Build Coastguard Worker   // String contains non-ASCII characters.
112*795d594fSAndroid Build Coastguard Worker   for (const char *p = in_start; p < in_end;) {
113*795d594fSAndroid Build Coastguard Worker     const uint32_t ch = GetUtf16FromUtf8(&p);
114*795d594fSAndroid Build Coastguard Worker     const uint16_t leading = GetLeadingUtf16Char(ch);
115*795d594fSAndroid Build Coastguard Worker     const uint16_t trailing = GetTrailingUtf16Char(ch);
116*795d594fSAndroid Build Coastguard Worker 
117*795d594fSAndroid Build Coastguard Worker     *out_p++ = leading;
118*795d594fSAndroid Build Coastguard Worker     if (trailing != 0) {
119*795d594fSAndroid Build Coastguard Worker       *out_p++ = trailing;
120*795d594fSAndroid Build Coastguard Worker     }
121*795d594fSAndroid Build Coastguard Worker   }
122*795d594fSAndroid Build Coastguard Worker }
123*795d594fSAndroid Build Coastguard Worker 
ConvertUtf16ToModifiedUtf8(char * utf8_out,size_t byte_count,const uint16_t * utf16_in,size_t char_count)124*795d594fSAndroid Build Coastguard Worker void ConvertUtf16ToModifiedUtf8(char* utf8_out, size_t byte_count,
125*795d594fSAndroid Build Coastguard Worker                                 const uint16_t* utf16_in, size_t char_count) {
126*795d594fSAndroid Build Coastguard Worker   if (LIKELY(byte_count == char_count)) {
127*795d594fSAndroid Build Coastguard Worker     // Common case where all characters are ASCII.
128*795d594fSAndroid Build Coastguard Worker     const uint16_t *utf16_end = utf16_in + char_count;
129*795d594fSAndroid Build Coastguard Worker     for (const uint16_t *p = utf16_in; p < utf16_end;) {
130*795d594fSAndroid Build Coastguard Worker       *utf8_out++ = dchecked_integral_cast<char>(*p++);
131*795d594fSAndroid Build Coastguard Worker     }
132*795d594fSAndroid Build Coastguard Worker     return;
133*795d594fSAndroid Build Coastguard Worker   }
134*795d594fSAndroid Build Coastguard Worker 
135*795d594fSAndroid Build Coastguard Worker   // String contains non-ASCII characters.
136*795d594fSAndroid Build Coastguard Worker   // FIXME: We should not emit 4-byte sequences. Bug: 192935764
137*795d594fSAndroid Build Coastguard Worker   auto append = [&](char c) { *utf8_out++ = c; };
138*795d594fSAndroid Build Coastguard Worker   ConvertUtf16ToUtf8</*kUseShortZero=*/ false,
139*795d594fSAndroid Build Coastguard Worker                      /*kUse4ByteSequence=*/ true,
140*795d594fSAndroid Build Coastguard Worker                      /*kReplaceBadSurrogates=*/ false>(utf16_in, char_count, append);
141*795d594fSAndroid Build Coastguard Worker }
142*795d594fSAndroid Build Coastguard Worker 
ComputeUtf16HashFromModifiedUtf8(const char * utf8,size_t utf16_length)143*795d594fSAndroid Build Coastguard Worker int32_t ComputeUtf16HashFromModifiedUtf8(const char* utf8, size_t utf16_length) {
144*795d594fSAndroid Build Coastguard Worker   uint32_t hash = 0;
145*795d594fSAndroid Build Coastguard Worker   while (utf16_length != 0u) {
146*795d594fSAndroid Build Coastguard Worker     const uint32_t pair = GetUtf16FromUtf8(&utf8);
147*795d594fSAndroid Build Coastguard Worker     const uint16_t first = GetLeadingUtf16Char(pair);
148*795d594fSAndroid Build Coastguard Worker     hash = hash * 31 + first;
149*795d594fSAndroid Build Coastguard Worker     --utf16_length;
150*795d594fSAndroid Build Coastguard Worker     const uint16_t second = GetTrailingUtf16Char(pair);
151*795d594fSAndroid Build Coastguard Worker     if (second != 0) {
152*795d594fSAndroid Build Coastguard Worker       hash = hash * 31 + second;
153*795d594fSAndroid Build Coastguard Worker       DCHECK_NE(utf16_length, 0u);
154*795d594fSAndroid Build Coastguard Worker       --utf16_length;
155*795d594fSAndroid Build Coastguard Worker     }
156*795d594fSAndroid Build Coastguard Worker   }
157*795d594fSAndroid Build Coastguard Worker   return static_cast<int32_t>(hash);
158*795d594fSAndroid Build Coastguard Worker }
159*795d594fSAndroid Build Coastguard Worker 
ComputeModifiedUtf8Hash(const char * chars)160*795d594fSAndroid Build Coastguard Worker uint32_t ComputeModifiedUtf8Hash(const char* chars) {
161*795d594fSAndroid Build Coastguard Worker   uint32_t hash = StartModifiedUtf8Hash();
162*795d594fSAndroid Build Coastguard Worker   while (*chars != '\0') {
163*795d594fSAndroid Build Coastguard Worker     hash = UpdateModifiedUtf8Hash(hash, *chars);
164*795d594fSAndroid Build Coastguard Worker     ++chars;
165*795d594fSAndroid Build Coastguard Worker   }
166*795d594fSAndroid Build Coastguard Worker   return hash;
167*795d594fSAndroid Build Coastguard Worker }
168*795d594fSAndroid Build Coastguard Worker 
ComputeModifiedUtf8Hash(std::string_view chars)169*795d594fSAndroid Build Coastguard Worker uint32_t ComputeModifiedUtf8Hash(std::string_view chars) {
170*795d594fSAndroid Build Coastguard Worker   return UpdateModifiedUtf8Hash(StartModifiedUtf8Hash(), chars);
171*795d594fSAndroid Build Coastguard Worker }
172*795d594fSAndroid Build Coastguard Worker 
CompareModifiedUtf8ToUtf16AsCodePointValues(const char * utf8,const uint16_t * utf16,size_t utf16_length)173*795d594fSAndroid Build Coastguard Worker int CompareModifiedUtf8ToUtf16AsCodePointValues(const char* utf8, const uint16_t* utf16,
174*795d594fSAndroid Build Coastguard Worker                                                 size_t utf16_length) {
175*795d594fSAndroid Build Coastguard Worker   for (;;) {
176*795d594fSAndroid Build Coastguard Worker     if (*utf8 == '\0') {
177*795d594fSAndroid Build Coastguard Worker       return (utf16_length == 0) ? 0 : -1;
178*795d594fSAndroid Build Coastguard Worker     } else if (utf16_length == 0) {
179*795d594fSAndroid Build Coastguard Worker       return 1;
180*795d594fSAndroid Build Coastguard Worker     }
181*795d594fSAndroid Build Coastguard Worker 
182*795d594fSAndroid Build Coastguard Worker     const uint32_t pair = GetUtf16FromUtf8(&utf8);
183*795d594fSAndroid Build Coastguard Worker 
184*795d594fSAndroid Build Coastguard Worker     // First compare the leading utf16 char.
185*795d594fSAndroid Build Coastguard Worker     const uint16_t lhs = GetLeadingUtf16Char(pair);
186*795d594fSAndroid Build Coastguard Worker     const uint16_t rhs = *utf16++;
187*795d594fSAndroid Build Coastguard Worker     --utf16_length;
188*795d594fSAndroid Build Coastguard Worker     if (lhs != rhs) {
189*795d594fSAndroid Build Coastguard Worker       return lhs > rhs ? 1 : -1;
190*795d594fSAndroid Build Coastguard Worker     }
191*795d594fSAndroid Build Coastguard Worker 
192*795d594fSAndroid Build Coastguard Worker     // Then compare the trailing utf16 char. First check if there
193*795d594fSAndroid Build Coastguard Worker     // are any characters left to consume.
194*795d594fSAndroid Build Coastguard Worker     const uint16_t lhs2 = GetTrailingUtf16Char(pair);
195*795d594fSAndroid Build Coastguard Worker     if (lhs2 != 0) {
196*795d594fSAndroid Build Coastguard Worker       if (utf16_length == 0) {
197*795d594fSAndroid Build Coastguard Worker         return 1;
198*795d594fSAndroid Build Coastguard Worker       }
199*795d594fSAndroid Build Coastguard Worker 
200*795d594fSAndroid Build Coastguard Worker       const uint16_t rhs2 = *utf16++;
201*795d594fSAndroid Build Coastguard Worker       --utf16_length;
202*795d594fSAndroid Build Coastguard Worker       if (lhs2 != rhs2) {
203*795d594fSAndroid Build Coastguard Worker         return lhs2 > rhs2 ? 1 : -1;
204*795d594fSAndroid Build Coastguard Worker       }
205*795d594fSAndroid Build Coastguard Worker     }
206*795d594fSAndroid Build Coastguard Worker   }
207*795d594fSAndroid Build Coastguard Worker }
208*795d594fSAndroid Build Coastguard Worker 
CountModifiedUtf8BytesInUtf16(const uint16_t * chars,size_t char_count)209*795d594fSAndroid Build Coastguard Worker size_t CountModifiedUtf8BytesInUtf16(const uint16_t* chars, size_t char_count) {
210*795d594fSAndroid Build Coastguard Worker   // FIXME: We should not emit 4-byte sequences. Bug: 192935764
211*795d594fSAndroid Build Coastguard Worker   size_t result = 0;
212*795d594fSAndroid Build Coastguard Worker   auto append = [&]([[maybe_unused]] char c) { ++result; };
213*795d594fSAndroid Build Coastguard Worker   ConvertUtf16ToUtf8</*kUseShortZero=*/ false,
214*795d594fSAndroid Build Coastguard Worker                      /*kUse4ByteSequence=*/ true,
215*795d594fSAndroid Build Coastguard Worker                      /*kReplaceBadSurrogates=*/ false>(chars, char_count, append);
216*795d594fSAndroid Build Coastguard Worker   return result;
217*795d594fSAndroid Build Coastguard Worker }
218*795d594fSAndroid Build Coastguard Worker 
// Returns true when `ch` lies outside the printable ASCII range ' ' (0x20) to '~' (0x7e).
static inline constexpr bool NeedsEscaping(uint16_t ch) {
  constexpr uint16_t kFirstPrintable = ' ';
  constexpr uint16_t kLastPrintable = '~';
  return !(kFirstPrintable <= ch && ch <= kLastPrintable);
}
222*795d594fSAndroid Build Coastguard Worker 
PrintableChar(uint16_t ch)223*795d594fSAndroid Build Coastguard Worker std::string PrintableChar(uint16_t ch) {
224*795d594fSAndroid Build Coastguard Worker   std::string result;
225*795d594fSAndroid Build Coastguard Worker   result += '\'';
226*795d594fSAndroid Build Coastguard Worker   if (NeedsEscaping(ch)) {
227*795d594fSAndroid Build Coastguard Worker     StringAppendF(&result, "\\u%04x", ch);
228*795d594fSAndroid Build Coastguard Worker   } else {
229*795d594fSAndroid Build Coastguard Worker     result += static_cast<std::string::value_type>(ch);
230*795d594fSAndroid Build Coastguard Worker   }
231*795d594fSAndroid Build Coastguard Worker   result += '\'';
232*795d594fSAndroid Build Coastguard Worker   return result;
233*795d594fSAndroid Build Coastguard Worker }
234*795d594fSAndroid Build Coastguard Worker 
PrintableString(const char * utf8)235*795d594fSAndroid Build Coastguard Worker std::string PrintableString(const char* utf8) {
236*795d594fSAndroid Build Coastguard Worker   std::string result;
237*795d594fSAndroid Build Coastguard Worker   result += '"';
238*795d594fSAndroid Build Coastguard Worker   const char* p = utf8;
239*795d594fSAndroid Build Coastguard Worker   size_t char_count = CountModifiedUtf8Chars(p);
240*795d594fSAndroid Build Coastguard Worker   for (size_t i = 0; i < char_count; ++i) {
241*795d594fSAndroid Build Coastguard Worker     uint32_t ch = GetUtf16FromUtf8(&p);
242*795d594fSAndroid Build Coastguard Worker     if (ch == '\\') {
243*795d594fSAndroid Build Coastguard Worker       result += "\\\\";
244*795d594fSAndroid Build Coastguard Worker     } else if (ch == '\n') {
245*795d594fSAndroid Build Coastguard Worker       result += "\\n";
246*795d594fSAndroid Build Coastguard Worker     } else if (ch == '\r') {
247*795d594fSAndroid Build Coastguard Worker       result += "\\r";
248*795d594fSAndroid Build Coastguard Worker     } else if (ch == '\t') {
249*795d594fSAndroid Build Coastguard Worker       result += "\\t";
250*795d594fSAndroid Build Coastguard Worker     } else {
251*795d594fSAndroid Build Coastguard Worker       const uint16_t leading = GetLeadingUtf16Char(ch);
252*795d594fSAndroid Build Coastguard Worker 
253*795d594fSAndroid Build Coastguard Worker       if (NeedsEscaping(leading)) {
254*795d594fSAndroid Build Coastguard Worker         StringAppendF(&result, "\\u%04x", leading);
255*795d594fSAndroid Build Coastguard Worker       } else {
256*795d594fSAndroid Build Coastguard Worker         result += static_cast<std::string::value_type>(leading);
257*795d594fSAndroid Build Coastguard Worker       }
258*795d594fSAndroid Build Coastguard Worker 
259*795d594fSAndroid Build Coastguard Worker       const uint32_t trailing = GetTrailingUtf16Char(ch);
260*795d594fSAndroid Build Coastguard Worker       if (trailing != 0) {
261*795d594fSAndroid Build Coastguard Worker         // All high surrogates will need escaping.
262*795d594fSAndroid Build Coastguard Worker         StringAppendF(&result, "\\u%04x", trailing);
263*795d594fSAndroid Build Coastguard Worker         // Account for the surrogate pair.
264*795d594fSAndroid Build Coastguard Worker         ++i;
265*795d594fSAndroid Build Coastguard Worker         DCHECK_LT(i, char_count);
266*795d594fSAndroid Build Coastguard Worker       }
267*795d594fSAndroid Build Coastguard Worker     }
268*795d594fSAndroid Build Coastguard Worker   }
269*795d594fSAndroid Build Coastguard Worker   result += '"';
270*795d594fSAndroid Build Coastguard Worker   return result;
271*795d594fSAndroid Build Coastguard Worker }
272*795d594fSAndroid Build Coastguard Worker 
273*795d594fSAndroid Build Coastguard Worker }  // namespace art
274