/*
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16*795d594fSAndroid Build Coastguard Worker
17*795d594fSAndroid Build Coastguard Worker #include "utf.h"
18*795d594fSAndroid Build Coastguard Worker
19*795d594fSAndroid Build Coastguard Worker #include <map>
20*795d594fSAndroid Build Coastguard Worker #include <vector>
21*795d594fSAndroid Build Coastguard Worker
22*795d594fSAndroid Build Coastguard Worker #include <android-base/stringprintf.h>
23*795d594fSAndroid Build Coastguard Worker
24*795d594fSAndroid Build Coastguard Worker #include "gtest/gtest.h"
25*795d594fSAndroid Build Coastguard Worker #include "utf-inl.h"
26*795d594fSAndroid Build Coastguard Worker
27*795d594fSAndroid Build Coastguard Worker namespace art {
28*795d594fSAndroid Build Coastguard Worker
29*795d594fSAndroid Build Coastguard Worker class UtfTest : public testing::Test {};
30*795d594fSAndroid Build Coastguard Worker
// The leading UTF-16 unit is expected to be the low 16 bits of the packed value.
TEST_F(UtfTest, GetLeadingUtf16Char) {
  const uint32_t packed = 0xeeeeffff;
  EXPECT_EQ(0xffff, GetLeadingUtf16Char(packed));
}
34*795d594fSAndroid Build Coastguard Worker
// The trailing UTF-16 unit is expected to be the high 16 bits of the packed
// value, and zero when those bits are clear.
TEST_F(UtfTest, GetTrailingUtf16Char) {
  // High half populated.
  EXPECT_EQ(0xffff, GetTrailingUtf16Char(0xffffeeee));

  // High half empty: no trailing unit.
  EXPECT_EQ(0, GetTrailingUtf16Char(0x0000aaaa));
}
39*795d594fSAndroid Build Coastguard Worker
// Expects that `end` points exactly `expected` bytes past `start`.
//
// The expansion intentionally carries no trailing semicolon: the caller writes
// it, so the macro behaves like a single statement and can be used safely in
// an unbraced if/else.
#define EXPECT_ARRAY_POSITION(expected, end, start) \
  EXPECT_EQ(static_cast<uintptr_t>(expected), \
            reinterpret_cast<uintptr_t>(end) - reinterpret_cast<uintptr_t>(start))
43*795d594fSAndroid Build Coastguard Worker
// UTF-8 test input holding one sequence of each length (one, two, three and
// four bytes), followed by a zero terminator.
static const uint8_t kAllSequences[] = {
    0x24,                    // 1 byte:  U+0024
    0xc2, 0xa2,              // 2 bytes: U+00A2
    0xe2, 0x82, 0xac,        // 3 bytes: U+20AC
    0xf0, 0x9f, 0x8f, 0xa0,  // 4 bytes: U+1F3E0
    0x00,                    // terminator
};
52*795d594fSAndroid Build Coastguard Worker
// Test input in which each half of a surrogate pair (together denoting code
// point U+10400) is encoded as its own three-byte sequence, followed by a
// zero terminator.
static const uint8_t kSurrogateEncoding[] = {
    0xed, 0xa0, 0x81,  // leading surrogate 0xd801
    0xed, 0xb0, 0x80,  // trailing surrogate 0xdc00
    0x00,              // terminator
};
60*795d594fSAndroid Build Coastguard Worker
// Decodes kAllSequences one sequence at a time, checking after each step the
// UTF-16 unit(s) produced and how far the input cursor has advanced.
TEST_F(UtfTest, GetUtf16FromUtf8) {
  const char* const begin = reinterpret_cast<const char*>(kAllSequences);
  const char* cursor = begin;

  // One byte in, one UTF-16 unit out.
  uint32_t decoded = GetUtf16FromUtf8(&cursor);
  EXPECT_EQ(0x24, GetLeadingUtf16Char(decoded));
  EXPECT_EQ(0, GetTrailingUtf16Char(decoded));
  EXPECT_ARRAY_POSITION(1, cursor, begin);

  // Two bytes in, one UTF-16 unit out.
  decoded = GetUtf16FromUtf8(&cursor);
  EXPECT_EQ(0xa2, GetLeadingUtf16Char(decoded));
  EXPECT_EQ(0, GetTrailingUtf16Char(decoded));
  EXPECT_ARRAY_POSITION(3, cursor, begin);

  // Three bytes in, one UTF-16 unit out.
  decoded = GetUtf16FromUtf8(&cursor);
  EXPECT_EQ(0x20ac, GetLeadingUtf16Char(decoded));
  EXPECT_EQ(0, GetTrailingUtf16Char(decoded));
  EXPECT_ARRAY_POSITION(6, cursor, begin);

  // Four bytes in, a surrogate pair (two UTF-16 units) out.
  decoded = GetUtf16FromUtf8(&cursor);
  EXPECT_EQ(0xd83c, GetLeadingUtf16Char(decoded));
  EXPECT_EQ(0xdfe0, GetTrailingUtf16Char(decoded));
  EXPECT_ARRAY_POSITION(10, cursor, begin);

  // The terminator decodes to zero and is consumed as a single byte.
  decoded = GetUtf16FromUtf8(&cursor);
  EXPECT_EQ(0, GetLeadingUtf16Char(decoded));
  EXPECT_EQ(0, GetTrailingUtf16Char(decoded));
  EXPECT_ARRAY_POSITION(11, cursor, begin);
}
96*795d594fSAndroid Build Coastguard Worker
// Surrogate halves that were each encoded as a three-byte sequence must come
// back unchanged, one UTF-16 unit per call, without being merged into a pair.
TEST_F(UtfTest, GetUtf16FromUtf8_SurrogatesPassThrough) {
  const char* const begin = reinterpret_cast<const char*>(kSurrogateEncoding);
  const char* cursor = begin;

  // First three bytes: the leading surrogate on its own.
  uint32_t decoded = GetUtf16FromUtf8(&cursor);
  EXPECT_EQ(0xd801, GetLeadingUtf16Char(decoded));
  EXPECT_EQ(0, GetTrailingUtf16Char(decoded));
  EXPECT_ARRAY_POSITION(3, cursor, begin);

  // Next three bytes: the trailing surrogate on its own.
  decoded = GetUtf16FromUtf8(&cursor);
  EXPECT_EQ(0xdc00, GetLeadingUtf16Char(decoded));
  EXPECT_EQ(0, GetTrailingUtf16Char(decoded));
  EXPECT_ARRAY_POSITION(6, cursor, begin);
}
112*795d594fSAndroid Build Coastguard Worker
// Checks the UTF-16 unit count reported for the two zero-terminated inputs.
TEST_F(UtfTest, CountModifiedUtf8Chars) {
  const char* const all_sequences = reinterpret_cast<const char*>(kAllSequences);
  const char* const surrogates = reinterpret_cast<const char*>(kSurrogateEncoding);

  // 1 + 1 + 1 units for the short sequences, 2 for the four-byte sequence.
  EXPECT_EQ(5u, CountModifiedUtf8Chars(all_sequences));
  // Two three-byte sequences, one unit each.
  EXPECT_EQ(2u, CountModifiedUtf8Chars(surrogates));
}
117*795d594fSAndroid Build Coastguard Worker
AssertConversion(const std::vector<uint16_t> & input,const std::vector<uint8_t> & expected)118*795d594fSAndroid Build Coastguard Worker static void AssertConversion(const std::vector<uint16_t>& input,
119*795d594fSAndroid Build Coastguard Worker const std::vector<uint8_t>& expected) {
120*795d594fSAndroid Build Coastguard Worker ASSERT_EQ(expected.size(), CountModifiedUtf8BytesInUtf16(&input[0], input.size()));
121*795d594fSAndroid Build Coastguard Worker
122*795d594fSAndroid Build Coastguard Worker std::vector<uint8_t> output(expected.size());
123*795d594fSAndroid Build Coastguard Worker ConvertUtf16ToModifiedUtf8(reinterpret_cast<char*>(&output[0]), expected.size(),
124*795d594fSAndroid Build Coastguard Worker &input[0], input.size());
125*795d594fSAndroid Build Coastguard Worker EXPECT_EQ(expected, output);
126*795d594fSAndroid Build Coastguard Worker }
127*795d594fSAndroid Build Coastguard Worker
// Covers each output length of the UTF-16 -> modified UTF-8 conversion, first
// in isolation and then combined in a single input.
TEST_F(UtfTest, CountAndConvertUtf8Bytes) {
  // A well-formed surrogate pair becomes one four-byte sequence.
  AssertConversion({ 0xd801, 0xdc00 }, { 0xf0, 0x90, 0x90, 0x80 });

  // Lone units from the trailing-surrogate range (0xdc00-0xdfff) are emitted
  // as three-byte sequences.
  AssertConversion({ 0xdef0 }, { 0xed, 0xbb, 0xb0 });
  AssertConversion({ 0xdcff }, { 0xed, 0xb3, 0xbf });

  // Two-byte sequence.
  AssertConversion({ 0x0101 }, { 0xc4, 0x81 });

  // Two-byte special case: zero is written in the overlong form 0xc0 0x80.
  AssertConversion({ 0x0101, 0x0000 }, { 0xc4, 0x81, 0xc0, 0x80 });

  // One-byte sequences.
  AssertConversion({ 'h', 'e', 'l', 'l', 'o' }, { 0x68, 0x65, 0x6c, 0x6c, 0x6f });

  // All of the above in one input.
  const std::vector<uint16_t> mixed_in = {
      0xd802, 0xdc02,  // Surrogate pair.
      0xdef0, 0xdcff,  // Three-byte sequences.
      0x0101, 0x0000,  // Two-byte sequences.
      'p', 'p',        // One-byte sequences.
  };
  const std::vector<uint8_t> mixed_out = {
      0xf0, 0x90, 0xa0, 0x82,
      0xed, 0xbb, 0xb0, 0xed, 0xb3, 0xbf,
      0xc4, 0x81, 0xc0, 0x80,
      0x70, 0x70,
  };
  AssertConversion(mixed_in, mixed_out);
}
157*795d594fSAndroid Build Coastguard Worker
// Unpaired or wrongly paired surrogates must each be emitted as a three-byte
// sequence, whatever surrounds them.
TEST_F(UtfTest, CountAndConvertUtf8Bytes_UnpairedSurrogate) {
  using Cases = std::map<std::vector<uint16_t>, std::vector<uint8_t>>;

  // Unpaired leading surrogate as the very last unit of the input.
  AssertConversion({ 'h', 'e', 0xd801 }, { 'h', 'e', 0xed, 0xa0, 0x81 });

  // Unpaired (or incorrectly paired) surrogates in the middle of the input:
  // every prefix x surrogate case x suffix combination is checked.
  const Cases prefixes = {
      {{ 'h' }, { 'h' }},
      {{ 0 }, { 0xc0, 0x80 }},
      {{ 0x81 }, { 0xc2, 0x81 }},
      {{ 0x801 }, { 0xe0, 0xa0, 0x81 }},
  };
  const Cases suffixes = {
      {{ 'e' }, { 'e' }},
      {{ 0 }, { 0xc0, 0x80 }},
      {{ 0x7ff }, { 0xdf, 0xbf }},
      {{ 0xffff }, { 0xef, 0xbf, 0xbf }},
  };
  const Cases surrogate_cases = {
      {{ 0xd801 }, { 0xed, 0xa0, 0x81 }},
      {{ 0xdc00 }, { 0xed, 0xb0, 0x80 }},
      {{ 0xd801, 0xd801 }, { 0xed, 0xa0, 0x81, 0xed, 0xa0, 0x81 }},
      {{ 0xdc00, 0xdc00 }, { 0xed, 0xb0, 0x80, 0xed, 0xb0, 0x80 }},
  };

  for (const auto& head : prefixes) {
    for (const auto& middle : surrogate_cases) {
      for (const auto& tail : suffixes) {
        // Input: prefix units + surrogate units + suffix units.
        std::vector<uint16_t> utf16(head.first);
        utf16.insert(utf16.end(), middle.first.begin(), middle.first.end());
        utf16.insert(utf16.end(), tail.first.begin(), tail.first.end());

        // Expected output: the matching byte sequences in the same order.
        std::vector<uint8_t> utf8(head.second);
        utf8.insert(utf8.end(), middle.second.begin(), middle.second.end());
        utf8.insert(utf8.end(), tail.second.begin(), tail.second.end());

        AssertConversion(utf16, utf8);
      }
    }
  }
}
200*795d594fSAndroid Build Coastguard Worker
// Old versions of functions, here to compare answers with optimized versions.
202*795d594fSAndroid Build Coastguard Worker
// Reference implementation: returns the number of UTF-16 units needed to
// represent the zero-terminated modified UTF-8 string `utf8`.
//
// Only the lead byte of each sequence is inspected; continuation bytes are
// skipped without validation, so the input must be well formed. One-, two-
// and three-byte sequences count as one unit, four-byte sequences as two.
//
// Declared static, like the other reference helpers in this file, so that it
// keeps internal linkage.
static size_t CountModifiedUtf8Chars_reference(const char* utf8) {
  size_t len = 0;
  int ic;
  while ((ic = *utf8++) != '\0') {
    len++;
    if ((ic & 0x80) == 0) {
      // One-byte encoding.
      continue;
    }
    // Two-, three- or four-byte encoding: skip the first continuation byte.
    utf8++;
    if ((ic & 0x20) == 0) {
      // Two-byte encoding.
      continue;
    }
    utf8++;
    if ((ic & 0x10) == 0) {
      // Three-byte encoding.
      continue;
    }

    // Four-byte encoding: decodes to a surrogate pair, so it accounts for a
    // second UTF-16 unit.
    utf8++;
    len++;
  }
  return len;
}
231*795d594fSAndroid Build Coastguard Worker
// Reference implementation: returns how many modified UTF-8 bytes are needed
// to encode the `char_count` UTF-16 units starting at `chars`.
static size_t CountModifiedUtf8BytesInUtf16_reference(const uint16_t* chars, size_t char_count) {
  size_t total = 0;
  for (size_t i = 0; i < char_count; ++i) {
    const uint16_t unit = chars[i];

    // 0x0001..0x007f: single byte. Zero is excluded and falls into the
    // two-byte case below.
    if (unit >= 0x0001 && unit <= 0x007f) {
      total += 1;
      continue;
    }

    // 0x0000 and 0x0080..0x07ff: two bytes.
    if (unit <= 0x07ff) {
      total += 2;
      continue;
    }

    // A leading surrogate immediately followed by a trailing surrogate is a
    // proper pair: four bytes for both units together.
    const bool is_leading_surrogate = (unit >= 0xd800 && unit <= 0xdbff);
    if (is_leading_surrogate && i + 1 < char_count) {
      const uint16_t next = chars[i + 1];
      if (next >= 0xdc00 && next <= 0xdfff) {
        total += 4;
        ++i;  // The trailing half has been accounted for.
        continue;
      }
    }

    // Everything else above 0x07ff takes three bytes, including a leading
    // surrogate at the end of the input or one that is not followed by a
    // trailing surrogate.
    total += 3;
  }
  return total;
}
265*795d594fSAndroid Build Coastguard Worker
// Reference implementation: encodes the `char_count` UTF-16 units at
// `utf16_in` as modified UTF-8 into `utf8_out`.
//
// No terminator is written and no bounds are checked; the caller must supply
// a buffer large enough for the encoded bytes.
static void ConvertUtf16ToModifiedUtf8_reference(char* utf8_out, const uint16_t* utf16_in,
                                                 size_t char_count) {
  char* dst = utf8_out;
  for (size_t i = 0; i < char_count; ++i) {
    const uint16_t unit = utf16_in[i];

    // 0x0001..0x007f: copied through as a single byte.
    if (unit >= 0x0001 && unit <= 0x007f) {
      *dst++ = static_cast<char>(unit);
      continue;
    }

    // A leading surrogate with at least one more unit available may form a
    // pair. Without a following unit (or with one outside the trailing range)
    // it is unpaired and falls through to the three-byte case; unpaired
    // surrogates can occur as a part of "normal" operation.
    const bool is_leading_surrogate = (unit >= 0xd800 && unit <= 0xdbff);
    if (is_leading_surrogate && i + 1 < char_count) {
      const uint16_t next = utf16_in[i + 1];
      if (next >= 0xdc00 && next <= 0xdfff) {
        ++i;  // Consume the trailing half as well.
        // 0x035fdc00 == (0xd800 << 10) + 0xdc00 - 0x10000: removes both
        // surrogate bases and adds the supplementary-plane offset in one step.
        const uint32_t code_point = (unit << 10) + next - 0x035fdc00;
        *dst++ = static_cast<char>((code_point >> 18) | 0xf0);
        *dst++ = static_cast<char>(((code_point >> 12) & 0x3f) | 0x80);
        *dst++ = static_cast<char>(((code_point >> 6) & 0x3f) | 0x80);
        *dst++ = static_cast<char>((code_point & 0x3f) | 0x80);
        continue;
      }
    }

    if (unit > 0x07ff) {
      // Three-byte encoding.
      *dst++ = static_cast<char>((unit >> 12) | 0xe0);
      *dst++ = static_cast<char>(((unit >> 6) & 0x3f) | 0x80);
      *dst++ = static_cast<char>((unit & 0x3f) | 0x80);
    } else {
      // Two-byte encoding: 0x0080..0x07ff, and zero in its overlong form.
      *dst++ = static_cast<char>((unit >> 6) | 0xc0);
      *dst++ = static_cast<char>((unit & 0x3f) | 0x80);
    }
  }
}
308*795d594fSAndroid Build Coastguard Worker
// Exhaustive test of converting a single code point to UTF-16, then UTF-8, and back again.
310*795d594fSAndroid Build Coastguard Worker
// Splits `code_point` into a UTF-16 surrogate pair, written to `first`
// (leading unit) and `second` (trailing unit). No range check is performed on
// `code_point`.
static void codePointToSurrogatePair(uint32_t code_point, uint16_t &first, uint16_t &second) {
  // 0xd7c0 == 0xd800 - (0x10000 >> 10): the 0x10000 offset is folded into the
  // base added to the upper bits.
  const uint32_t leading = 0xd7c0 + (code_point >> 10);
  // The low ten bits are carried by the trailing unit.
  const uint32_t trailing = 0xdc00 + (code_point & 0x03ff);
  first = static_cast<uint16_t>(leading);
  second = static_cast<uint16_t>(trailing);
}
315*795d594fSAndroid Build Coastguard Worker
testConversions(uint16_t * buf,int char_count)316*795d594fSAndroid Build Coastguard Worker static void testConversions(uint16_t *buf, int char_count) {
317*795d594fSAndroid Build Coastguard Worker char bytes_test[8] = { 0 }, bytes_reference[8] = { 0 };
318*795d594fSAndroid Build Coastguard Worker uint16_t out_buf_test[4] = { 0 }, out_buf_reference[4] = { 0 };
319*795d594fSAndroid Build Coastguard Worker int byte_count_test, byte_count_reference;
320*795d594fSAndroid Build Coastguard Worker int char_count_test, char_count_reference;
321*795d594fSAndroid Build Coastguard Worker
322*795d594fSAndroid Build Coastguard Worker // Calculate the number of utf-8 bytes for the utf-16 chars.
323*795d594fSAndroid Build Coastguard Worker byte_count_reference = CountModifiedUtf8BytesInUtf16_reference(buf, char_count);
324*795d594fSAndroid Build Coastguard Worker byte_count_test = CountModifiedUtf8BytesInUtf16(buf, char_count);
325*795d594fSAndroid Build Coastguard Worker EXPECT_EQ(byte_count_reference, byte_count_test);
326*795d594fSAndroid Build Coastguard Worker
327*795d594fSAndroid Build Coastguard Worker // Convert the utf-16 string to utf-8 bytes.
328*795d594fSAndroid Build Coastguard Worker ConvertUtf16ToModifiedUtf8_reference(bytes_reference, buf, char_count);
329*795d594fSAndroid Build Coastguard Worker ConvertUtf16ToModifiedUtf8(bytes_test, byte_count_test, buf, char_count);
330*795d594fSAndroid Build Coastguard Worker for (int i = 0; i < byte_count_test; ++i) {
331*795d594fSAndroid Build Coastguard Worker EXPECT_EQ(bytes_reference[i], bytes_test[i]);
332*795d594fSAndroid Build Coastguard Worker }
333*795d594fSAndroid Build Coastguard Worker
334*795d594fSAndroid Build Coastguard Worker // Calculate the number of utf-16 chars from the utf-8 bytes.
335*795d594fSAndroid Build Coastguard Worker bytes_reference[byte_count_reference] = 0; // Reference function needs null termination.
336*795d594fSAndroid Build Coastguard Worker char_count_reference = CountModifiedUtf8Chars_reference(bytes_reference);
337*795d594fSAndroid Build Coastguard Worker char_count_test = CountModifiedUtf8Chars(bytes_test, byte_count_test);
338*795d594fSAndroid Build Coastguard Worker EXPECT_EQ(char_count, char_count_reference);
339*795d594fSAndroid Build Coastguard Worker EXPECT_EQ(char_count, char_count_test);
340*795d594fSAndroid Build Coastguard Worker
341*795d594fSAndroid Build Coastguard Worker // Convert the utf-8 bytes back to utf-16 chars.
342*795d594fSAndroid Build Coastguard Worker // Does not need copied _reference version of the function because the original
343*795d594fSAndroid Build Coastguard Worker // function with the old API is retained for debug/testing code.
344*795d594fSAndroid Build Coastguard Worker ConvertModifiedUtf8ToUtf16(out_buf_reference, bytes_reference);
345*795d594fSAndroid Build Coastguard Worker ConvertModifiedUtf8ToUtf16(out_buf_test, char_count_test, bytes_test, byte_count_test);
346*795d594fSAndroid Build Coastguard Worker for (int i = 0; i < char_count_test; ++i) {
347*795d594fSAndroid Build Coastguard Worker EXPECT_EQ(buf[i], out_buf_reference[i]);
348*795d594fSAndroid Build Coastguard Worker EXPECT_EQ(buf[i], out_buf_test[i]);
349*795d594fSAndroid Build Coastguard Worker }
350*795d594fSAndroid Build Coastguard Worker }
351*795d594fSAndroid Build Coastguard Worker
// Runs testConversions for every code point from 0 to 0x10ffff except the
// surrogate range, embedding each between 'h' and 'e' and exercising several
// sub-ranges of the resulting buffer.
TEST_F(UtfTest, ExhaustiveBidirectionalCodePointCheck) {
  for (int code_point = 0; code_point <= 0x10ffff; ++code_point) {
    if (code_point >= 0xd800 && code_point <= 0xdfff) {
      // According to the Unicode standard, no character will ever be assigned
      // to these code points, and they cannot be encoded into either utf-16
      // or utf-8.
      continue;
    }

    uint16_t units[4] = { 0 };
    units[0] = 'h';
    if (code_point > 0xffff) {
      // Supplementary code point: 'h', leading surrogate, trailing surrogate, 'e'.
      codePointToSurrogatePair(code_point, units[1], units[2]);
      units[3] = 'e';
      testConversions(units, 2);
      testConversions(units, 3);
      testConversions(units, 4);
      testConversions(units + 1, 1);
      testConversions(units + 1, 2);
      testConversions(units + 1, 3);
    } else {
      // Code point that fits in a single UTF-16 unit: 'h', unit, 'e'.
      units[1] = static_cast<uint16_t>(code_point);
      units[2] = 'e';
      testConversions(units, 2);
      testConversions(units, 3);
      testConversions(units + 1, 1);
      testConversions(units + 1, 2);
    }
  }
}
382*795d594fSAndroid Build Coastguard Worker
// The hash of a one-character string whose only byte is 0x80 must equal that
// byte's unsigned value.
// NOTE(review): presumably guards against sign extension of `char` inside the
// hash; confirm against ComputeModifiedUtf8Hash.
TEST_F(UtfTest, NonAscii) {
  const char kHighBitCharacter = '\x80';
  const char text[] = { kHighBitCharacter, '\0' };
  const uint8_t expected_hash = static_cast<uint8_t>(kHighBitCharacter);

  const uint32_t actual_hash = ComputeModifiedUtf8Hash(text);
  EXPECT_EQ(expected_hash, actual_hash);
}
389*795d594fSAndroid Build Coastguard Worker
// PrintableString must render a four-byte sequence as two \uXXXX escapes, one
// per surrogate half, wrapped in double quotes.
TEST_F(UtfTest, PrintableStringUtf8) {
  // Note: This is UTF-8, not Modified-UTF-8.
  const uint8_t kFourByteSequence[] = { 0xf0, 0x90, 0x80, 0x80, 0 };
  const char* text = reinterpret_cast<const char*>(kFourByteSequence);

  // Decode the sequence to obtain the surrogate halves the output is built from.
  const char* cursor = text;
  uint32_t decoded = GetUtf16FromUtf8(&cursor);
  ASSERT_EQ(*cursor, '\0');  // The whole sequence must have been consumed.
  uint16_t high = GetLeadingUtf16Char(decoded);
  uint16_t low = GetTrailingUtf16Char(decoded);
  ASSERT_NE(0u, low);  // A non-zero trailing unit means a real pair was decoded.

  std::string wanted = android::base::StringPrintf("\"\\u%04x\\u%04x\"",
                                                   static_cast<unsigned>(high),
                                                   static_cast<unsigned>(low));
  std::string rendered = PrintableString(text);
  EXPECT_EQ(wanted, rendered);
}
407*795d594fSAndroid Build Coastguard Worker
408*795d594fSAndroid Build Coastguard Worker } // namespace art
409