/*
 * Copyright (C) 2011 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16*795d594fSAndroid Build Coastguard Worker
17*795d594fSAndroid Build Coastguard Worker #include "utf.h"
18*795d594fSAndroid Build Coastguard Worker
19*795d594fSAndroid Build Coastguard Worker #include <android-base/logging.h>
20*795d594fSAndroid Build Coastguard Worker #include <android-base/stringprintf.h>
21*795d594fSAndroid Build Coastguard Worker #include <android-base/strings.h>
22*795d594fSAndroid Build Coastguard Worker
23*795d594fSAndroid Build Coastguard Worker #include "base/casts.h"
24*795d594fSAndroid Build Coastguard Worker #include "utf-inl.h"
25*795d594fSAndroid Build Coastguard Worker
26*795d594fSAndroid Build Coastguard Worker namespace art {
27*795d594fSAndroid Build Coastguard Worker
28*795d594fSAndroid Build Coastguard Worker using android::base::StringAppendF;
29*795d594fSAndroid Build Coastguard Worker
30*795d594fSAndroid Build Coastguard Worker // This is used only from debugger and test code.
CountModifiedUtf8Chars(const char * utf8)31*795d594fSAndroid Build Coastguard Worker size_t CountModifiedUtf8Chars(const char* utf8) {
32*795d594fSAndroid Build Coastguard Worker return CountModifiedUtf8Chars(utf8, strlen(utf8));
33*795d594fSAndroid Build Coastguard Worker }
34*795d594fSAndroid Build Coastguard Worker
35*795d594fSAndroid Build Coastguard Worker /*
36*795d594fSAndroid Build Coastguard Worker * This does not validate UTF8 rules (nor did older code). But it gets the right answer
37*795d594fSAndroid Build Coastguard Worker * for valid UTF-8 and that's fine because it's used only to size a buffer for later
38*795d594fSAndroid Build Coastguard Worker * conversion.
39*795d594fSAndroid Build Coastguard Worker *
40*795d594fSAndroid Build Coastguard Worker * Modified UTF-8 consists of a series of bytes up to 21 bit Unicode code points as follows:
41*795d594fSAndroid Build Coastguard Worker * U+0001 - U+007F 0xxxxxxx
42*795d594fSAndroid Build Coastguard Worker * U+0080 - U+07FF 110xxxxx 10xxxxxx
43*795d594fSAndroid Build Coastguard Worker * U+0800 - U+FFFF 1110xxxx 10xxxxxx 10xxxxxx
44*795d594fSAndroid Build Coastguard Worker * U+10000 - U+1FFFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
45*795d594fSAndroid Build Coastguard Worker *
46*795d594fSAndroid Build Coastguard Worker * U+0000 is encoded using the 2nd form to avoid nulls inside strings (this differs from
47*795d594fSAndroid Build Coastguard Worker * standard UTF-8).
48*795d594fSAndroid Build Coastguard Worker * The four byte encoding converts to two utf16 characters.
49*795d594fSAndroid Build Coastguard Worker */
CountModifiedUtf8Chars(const char * utf8,size_t byte_count)50*795d594fSAndroid Build Coastguard Worker size_t CountModifiedUtf8Chars(const char* utf8, size_t byte_count) {
51*795d594fSAndroid Build Coastguard Worker DCHECK_LE(byte_count, strlen(utf8));
52*795d594fSAndroid Build Coastguard Worker size_t len = 0;
53*795d594fSAndroid Build Coastguard Worker const char* end = utf8 + byte_count;
54*795d594fSAndroid Build Coastguard Worker for (; utf8 < end; ++utf8) {
55*795d594fSAndroid Build Coastguard Worker int ic = *utf8;
56*795d594fSAndroid Build Coastguard Worker len++;
57*795d594fSAndroid Build Coastguard Worker if (LIKELY((ic & 0x80) == 0)) {
58*795d594fSAndroid Build Coastguard Worker // One-byte encoding.
59*795d594fSAndroid Build Coastguard Worker continue;
60*795d594fSAndroid Build Coastguard Worker }
61*795d594fSAndroid Build Coastguard Worker // Two- or three-byte encoding.
62*795d594fSAndroid Build Coastguard Worker utf8++;
63*795d594fSAndroid Build Coastguard Worker if ((ic & 0x20) == 0) {
64*795d594fSAndroid Build Coastguard Worker // Two-byte encoding.
65*795d594fSAndroid Build Coastguard Worker continue;
66*795d594fSAndroid Build Coastguard Worker }
67*795d594fSAndroid Build Coastguard Worker utf8++;
68*795d594fSAndroid Build Coastguard Worker if ((ic & 0x10) == 0) {
69*795d594fSAndroid Build Coastguard Worker // Three-byte encoding.
70*795d594fSAndroid Build Coastguard Worker continue;
71*795d594fSAndroid Build Coastguard Worker }
72*795d594fSAndroid Build Coastguard Worker
73*795d594fSAndroid Build Coastguard Worker // Four-byte encoding: needs to be converted into a surrogate
74*795d594fSAndroid Build Coastguard Worker // pair.
75*795d594fSAndroid Build Coastguard Worker utf8++;
76*795d594fSAndroid Build Coastguard Worker len++;
77*795d594fSAndroid Build Coastguard Worker }
78*795d594fSAndroid Build Coastguard Worker return len;
79*795d594fSAndroid Build Coastguard Worker }
80*795d594fSAndroid Build Coastguard Worker
81*795d594fSAndroid Build Coastguard Worker // This is used only from debugger and test code.
ConvertModifiedUtf8ToUtf16(uint16_t * utf16_data_out,const char * utf8_data_in)82*795d594fSAndroid Build Coastguard Worker void ConvertModifiedUtf8ToUtf16(uint16_t* utf16_data_out, const char* utf8_data_in) {
83*795d594fSAndroid Build Coastguard Worker while (*utf8_data_in != '\0') {
84*795d594fSAndroid Build Coastguard Worker const uint32_t ch = GetUtf16FromUtf8(&utf8_data_in);
85*795d594fSAndroid Build Coastguard Worker const uint16_t leading = GetLeadingUtf16Char(ch);
86*795d594fSAndroid Build Coastguard Worker const uint16_t trailing = GetTrailingUtf16Char(ch);
87*795d594fSAndroid Build Coastguard Worker
88*795d594fSAndroid Build Coastguard Worker *utf16_data_out++ = leading;
89*795d594fSAndroid Build Coastguard Worker if (trailing != 0) {
90*795d594fSAndroid Build Coastguard Worker *utf16_data_out++ = trailing;
91*795d594fSAndroid Build Coastguard Worker }
92*795d594fSAndroid Build Coastguard Worker }
93*795d594fSAndroid Build Coastguard Worker }
94*795d594fSAndroid Build Coastguard Worker
ConvertModifiedUtf8ToUtf16(uint16_t * utf16_data_out,size_t out_chars,const char * utf8_data_in,size_t in_bytes)95*795d594fSAndroid Build Coastguard Worker void ConvertModifiedUtf8ToUtf16(uint16_t* utf16_data_out, size_t out_chars,
96*795d594fSAndroid Build Coastguard Worker const char* utf8_data_in, size_t in_bytes) {
97*795d594fSAndroid Build Coastguard Worker const char *in_start = utf8_data_in;
98*795d594fSAndroid Build Coastguard Worker const char *in_end = utf8_data_in + in_bytes;
99*795d594fSAndroid Build Coastguard Worker uint16_t *out_p = utf16_data_out;
100*795d594fSAndroid Build Coastguard Worker
101*795d594fSAndroid Build Coastguard Worker if (LIKELY(out_chars == in_bytes)) {
102*795d594fSAndroid Build Coastguard Worker // Common case where all characters are ASCII.
103*795d594fSAndroid Build Coastguard Worker for (const char *p = in_start; p < in_end;) {
104*795d594fSAndroid Build Coastguard Worker // Safe even if char is signed because ASCII characters always have
105*795d594fSAndroid Build Coastguard Worker // the high bit cleared.
106*795d594fSAndroid Build Coastguard Worker *out_p++ = dchecked_integral_cast<uint16_t>(*p++);
107*795d594fSAndroid Build Coastguard Worker }
108*795d594fSAndroid Build Coastguard Worker return;
109*795d594fSAndroid Build Coastguard Worker }
110*795d594fSAndroid Build Coastguard Worker
111*795d594fSAndroid Build Coastguard Worker // String contains non-ASCII characters.
112*795d594fSAndroid Build Coastguard Worker for (const char *p = in_start; p < in_end;) {
113*795d594fSAndroid Build Coastguard Worker const uint32_t ch = GetUtf16FromUtf8(&p);
114*795d594fSAndroid Build Coastguard Worker const uint16_t leading = GetLeadingUtf16Char(ch);
115*795d594fSAndroid Build Coastguard Worker const uint16_t trailing = GetTrailingUtf16Char(ch);
116*795d594fSAndroid Build Coastguard Worker
117*795d594fSAndroid Build Coastguard Worker *out_p++ = leading;
118*795d594fSAndroid Build Coastguard Worker if (trailing != 0) {
119*795d594fSAndroid Build Coastguard Worker *out_p++ = trailing;
120*795d594fSAndroid Build Coastguard Worker }
121*795d594fSAndroid Build Coastguard Worker }
122*795d594fSAndroid Build Coastguard Worker }
123*795d594fSAndroid Build Coastguard Worker
ConvertUtf16ToModifiedUtf8(char * utf8_out,size_t byte_count,const uint16_t * utf16_in,size_t char_count)124*795d594fSAndroid Build Coastguard Worker void ConvertUtf16ToModifiedUtf8(char* utf8_out, size_t byte_count,
125*795d594fSAndroid Build Coastguard Worker const uint16_t* utf16_in, size_t char_count) {
126*795d594fSAndroid Build Coastguard Worker if (LIKELY(byte_count == char_count)) {
127*795d594fSAndroid Build Coastguard Worker // Common case where all characters are ASCII.
128*795d594fSAndroid Build Coastguard Worker const uint16_t *utf16_end = utf16_in + char_count;
129*795d594fSAndroid Build Coastguard Worker for (const uint16_t *p = utf16_in; p < utf16_end;) {
130*795d594fSAndroid Build Coastguard Worker *utf8_out++ = dchecked_integral_cast<char>(*p++);
131*795d594fSAndroid Build Coastguard Worker }
132*795d594fSAndroid Build Coastguard Worker return;
133*795d594fSAndroid Build Coastguard Worker }
134*795d594fSAndroid Build Coastguard Worker
135*795d594fSAndroid Build Coastguard Worker // String contains non-ASCII characters.
136*795d594fSAndroid Build Coastguard Worker // FIXME: We should not emit 4-byte sequences. Bug: 192935764
137*795d594fSAndroid Build Coastguard Worker auto append = [&](char c) { *utf8_out++ = c; };
138*795d594fSAndroid Build Coastguard Worker ConvertUtf16ToUtf8</*kUseShortZero=*/ false,
139*795d594fSAndroid Build Coastguard Worker /*kUse4ByteSequence=*/ true,
140*795d594fSAndroid Build Coastguard Worker /*kReplaceBadSurrogates=*/ false>(utf16_in, char_count, append);
141*795d594fSAndroid Build Coastguard Worker }
142*795d594fSAndroid Build Coastguard Worker
ComputeUtf16HashFromModifiedUtf8(const char * utf8,size_t utf16_length)143*795d594fSAndroid Build Coastguard Worker int32_t ComputeUtf16HashFromModifiedUtf8(const char* utf8, size_t utf16_length) {
144*795d594fSAndroid Build Coastguard Worker uint32_t hash = 0;
145*795d594fSAndroid Build Coastguard Worker while (utf16_length != 0u) {
146*795d594fSAndroid Build Coastguard Worker const uint32_t pair = GetUtf16FromUtf8(&utf8);
147*795d594fSAndroid Build Coastguard Worker const uint16_t first = GetLeadingUtf16Char(pair);
148*795d594fSAndroid Build Coastguard Worker hash = hash * 31 + first;
149*795d594fSAndroid Build Coastguard Worker --utf16_length;
150*795d594fSAndroid Build Coastguard Worker const uint16_t second = GetTrailingUtf16Char(pair);
151*795d594fSAndroid Build Coastguard Worker if (second != 0) {
152*795d594fSAndroid Build Coastguard Worker hash = hash * 31 + second;
153*795d594fSAndroid Build Coastguard Worker DCHECK_NE(utf16_length, 0u);
154*795d594fSAndroid Build Coastguard Worker --utf16_length;
155*795d594fSAndroid Build Coastguard Worker }
156*795d594fSAndroid Build Coastguard Worker }
157*795d594fSAndroid Build Coastguard Worker return static_cast<int32_t>(hash);
158*795d594fSAndroid Build Coastguard Worker }
159*795d594fSAndroid Build Coastguard Worker
ComputeModifiedUtf8Hash(const char * chars)160*795d594fSAndroid Build Coastguard Worker uint32_t ComputeModifiedUtf8Hash(const char* chars) {
161*795d594fSAndroid Build Coastguard Worker uint32_t hash = StartModifiedUtf8Hash();
162*795d594fSAndroid Build Coastguard Worker while (*chars != '\0') {
163*795d594fSAndroid Build Coastguard Worker hash = UpdateModifiedUtf8Hash(hash, *chars);
164*795d594fSAndroid Build Coastguard Worker ++chars;
165*795d594fSAndroid Build Coastguard Worker }
166*795d594fSAndroid Build Coastguard Worker return hash;
167*795d594fSAndroid Build Coastguard Worker }
168*795d594fSAndroid Build Coastguard Worker
ComputeModifiedUtf8Hash(std::string_view chars)169*795d594fSAndroid Build Coastguard Worker uint32_t ComputeModifiedUtf8Hash(std::string_view chars) {
170*795d594fSAndroid Build Coastguard Worker return UpdateModifiedUtf8Hash(StartModifiedUtf8Hash(), chars);
171*795d594fSAndroid Build Coastguard Worker }
172*795d594fSAndroid Build Coastguard Worker
CompareModifiedUtf8ToUtf16AsCodePointValues(const char * utf8,const uint16_t * utf16,size_t utf16_length)173*795d594fSAndroid Build Coastguard Worker int CompareModifiedUtf8ToUtf16AsCodePointValues(const char* utf8, const uint16_t* utf16,
174*795d594fSAndroid Build Coastguard Worker size_t utf16_length) {
175*795d594fSAndroid Build Coastguard Worker for (;;) {
176*795d594fSAndroid Build Coastguard Worker if (*utf8 == '\0') {
177*795d594fSAndroid Build Coastguard Worker return (utf16_length == 0) ? 0 : -1;
178*795d594fSAndroid Build Coastguard Worker } else if (utf16_length == 0) {
179*795d594fSAndroid Build Coastguard Worker return 1;
180*795d594fSAndroid Build Coastguard Worker }
181*795d594fSAndroid Build Coastguard Worker
182*795d594fSAndroid Build Coastguard Worker const uint32_t pair = GetUtf16FromUtf8(&utf8);
183*795d594fSAndroid Build Coastguard Worker
184*795d594fSAndroid Build Coastguard Worker // First compare the leading utf16 char.
185*795d594fSAndroid Build Coastguard Worker const uint16_t lhs = GetLeadingUtf16Char(pair);
186*795d594fSAndroid Build Coastguard Worker const uint16_t rhs = *utf16++;
187*795d594fSAndroid Build Coastguard Worker --utf16_length;
188*795d594fSAndroid Build Coastguard Worker if (lhs != rhs) {
189*795d594fSAndroid Build Coastguard Worker return lhs > rhs ? 1 : -1;
190*795d594fSAndroid Build Coastguard Worker }
191*795d594fSAndroid Build Coastguard Worker
192*795d594fSAndroid Build Coastguard Worker // Then compare the trailing utf16 char. First check if there
193*795d594fSAndroid Build Coastguard Worker // are any characters left to consume.
194*795d594fSAndroid Build Coastguard Worker const uint16_t lhs2 = GetTrailingUtf16Char(pair);
195*795d594fSAndroid Build Coastguard Worker if (lhs2 != 0) {
196*795d594fSAndroid Build Coastguard Worker if (utf16_length == 0) {
197*795d594fSAndroid Build Coastguard Worker return 1;
198*795d594fSAndroid Build Coastguard Worker }
199*795d594fSAndroid Build Coastguard Worker
200*795d594fSAndroid Build Coastguard Worker const uint16_t rhs2 = *utf16++;
201*795d594fSAndroid Build Coastguard Worker --utf16_length;
202*795d594fSAndroid Build Coastguard Worker if (lhs2 != rhs2) {
203*795d594fSAndroid Build Coastguard Worker return lhs2 > rhs2 ? 1 : -1;
204*795d594fSAndroid Build Coastguard Worker }
205*795d594fSAndroid Build Coastguard Worker }
206*795d594fSAndroid Build Coastguard Worker }
207*795d594fSAndroid Build Coastguard Worker }
208*795d594fSAndroid Build Coastguard Worker
CountModifiedUtf8BytesInUtf16(const uint16_t * chars,size_t char_count)209*795d594fSAndroid Build Coastguard Worker size_t CountModifiedUtf8BytesInUtf16(const uint16_t* chars, size_t char_count) {
210*795d594fSAndroid Build Coastguard Worker // FIXME: We should not emit 4-byte sequences. Bug: 192935764
211*795d594fSAndroid Build Coastguard Worker size_t result = 0;
212*795d594fSAndroid Build Coastguard Worker auto append = [&]([[maybe_unused]] char c) { ++result; };
213*795d594fSAndroid Build Coastguard Worker ConvertUtf16ToUtf8</*kUseShortZero=*/ false,
214*795d594fSAndroid Build Coastguard Worker /*kUse4ByteSequence=*/ true,
215*795d594fSAndroid Build Coastguard Worker /*kReplaceBadSurrogates=*/ false>(chars, char_count, append);
216*795d594fSAndroid Build Coastguard Worker return result;
217*795d594fSAndroid Build Coastguard Worker }
218*795d594fSAndroid Build Coastguard Worker
// True when `ch` lies outside the printable ASCII range ' ' (0x20) through '~' (0x7e).
static inline constexpr bool NeedsEscaping(uint16_t ch) {
  return !(ch >= ' ' && ch <= '~');
}
222*795d594fSAndroid Build Coastguard Worker
PrintableChar(uint16_t ch)223*795d594fSAndroid Build Coastguard Worker std::string PrintableChar(uint16_t ch) {
224*795d594fSAndroid Build Coastguard Worker std::string result;
225*795d594fSAndroid Build Coastguard Worker result += '\'';
226*795d594fSAndroid Build Coastguard Worker if (NeedsEscaping(ch)) {
227*795d594fSAndroid Build Coastguard Worker StringAppendF(&result, "\\u%04x", ch);
228*795d594fSAndroid Build Coastguard Worker } else {
229*795d594fSAndroid Build Coastguard Worker result += static_cast<std::string::value_type>(ch);
230*795d594fSAndroid Build Coastguard Worker }
231*795d594fSAndroid Build Coastguard Worker result += '\'';
232*795d594fSAndroid Build Coastguard Worker return result;
233*795d594fSAndroid Build Coastguard Worker }
234*795d594fSAndroid Build Coastguard Worker
PrintableString(const char * utf8)235*795d594fSAndroid Build Coastguard Worker std::string PrintableString(const char* utf8) {
236*795d594fSAndroid Build Coastguard Worker std::string result;
237*795d594fSAndroid Build Coastguard Worker result += '"';
238*795d594fSAndroid Build Coastguard Worker const char* p = utf8;
239*795d594fSAndroid Build Coastguard Worker size_t char_count = CountModifiedUtf8Chars(p);
240*795d594fSAndroid Build Coastguard Worker for (size_t i = 0; i < char_count; ++i) {
241*795d594fSAndroid Build Coastguard Worker uint32_t ch = GetUtf16FromUtf8(&p);
242*795d594fSAndroid Build Coastguard Worker if (ch == '\\') {
243*795d594fSAndroid Build Coastguard Worker result += "\\\\";
244*795d594fSAndroid Build Coastguard Worker } else if (ch == '\n') {
245*795d594fSAndroid Build Coastguard Worker result += "\\n";
246*795d594fSAndroid Build Coastguard Worker } else if (ch == '\r') {
247*795d594fSAndroid Build Coastguard Worker result += "\\r";
248*795d594fSAndroid Build Coastguard Worker } else if (ch == '\t') {
249*795d594fSAndroid Build Coastguard Worker result += "\\t";
250*795d594fSAndroid Build Coastguard Worker } else {
251*795d594fSAndroid Build Coastguard Worker const uint16_t leading = GetLeadingUtf16Char(ch);
252*795d594fSAndroid Build Coastguard Worker
253*795d594fSAndroid Build Coastguard Worker if (NeedsEscaping(leading)) {
254*795d594fSAndroid Build Coastguard Worker StringAppendF(&result, "\\u%04x", leading);
255*795d594fSAndroid Build Coastguard Worker } else {
256*795d594fSAndroid Build Coastguard Worker result += static_cast<std::string::value_type>(leading);
257*795d594fSAndroid Build Coastguard Worker }
258*795d594fSAndroid Build Coastguard Worker
259*795d594fSAndroid Build Coastguard Worker const uint32_t trailing = GetTrailingUtf16Char(ch);
260*795d594fSAndroid Build Coastguard Worker if (trailing != 0) {
261*795d594fSAndroid Build Coastguard Worker // All high surrogates will need escaping.
262*795d594fSAndroid Build Coastguard Worker StringAppendF(&result, "\\u%04x", trailing);
263*795d594fSAndroid Build Coastguard Worker // Account for the surrogate pair.
264*795d594fSAndroid Build Coastguard Worker ++i;
265*795d594fSAndroid Build Coastguard Worker DCHECK_LT(i, char_count);
266*795d594fSAndroid Build Coastguard Worker }
267*795d594fSAndroid Build Coastguard Worker }
268*795d594fSAndroid Build Coastguard Worker }
269*795d594fSAndroid Build Coastguard Worker result += '"';
270*795d594fSAndroid Build Coastguard Worker return result;
271*795d594fSAndroid Build Coastguard Worker }
272*795d594fSAndroid Build Coastguard Worker
273*795d594fSAndroid Build Coastguard Worker } // namespace art
274