1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include <stddef.h>
6
7 #include <algorithm>
8
9 #include "base/logging.h"
10 #include "base/macros.h"
11 #include "base/strings/string_piece.h"
12 #include "base/strings/utf_offset_string_conversions.h"
13 #include "testing/gtest/include/gtest/gtest.h"
14
15 namespace base {
16
17 namespace {
18
19 static const size_t kNpos = string16::npos;
20
21 } // namespace
22
// Verifies how a single offset into a string is remapped when the string is
// converted UTF-8 -> UTF-16 and UTF-16 -> UTF-8. Each table row holds the
// input string, an offset into it, and the offset expected after conversion
// (kNpos when the input offset has no counterpart in the output).
TEST(UTFOffsetStringConversionsTest, AdjustOffset) {
  struct UTF8ToUTF16Case {
    const char* utf8;
    size_t input_offset;
    size_t output_offset;
  } utf8_to_utf16_cases[] = {
    // Empty input.
    {"", 0, 0},
    {"", kNpos, kNpos},
    // Two 3-byte sequences.
    {"\xe4\xbd\xa0\xe5\xa5\xbd", 1, kNpos},
    {"\xe4\xbd\xa0\xe5\xa5\xbd", 3, 1},
    // UTF-8-encoded surrogate (U+DC00) followed by 'z'.
    {"\xed\xb0\x80z", 3, 3},
    // 'A', one 4-byte sequence, 'z'.
    {"A\xF0\x90\x8C\x80z", 1, 1},
    {"A\xF0\x90\x8C\x80z", 2, kNpos},
    {"A\xF0\x90\x8C\x80z", 5, 3},
    {"A\xF0\x90\x8C\x80z", 6, 4},
    {"A\xF0\x90\x8C\x80z", kNpos, kNpos},
  };
  for (size_t i = 0; i < arraysize(utf8_to_utf16_cases); ++i) {
    const UTF8ToUTF16Case& test_case = utf8_to_utf16_cases[i];
    std::vector<size_t> offsets(1, test_case.input_offset);
    UTF8ToUTF16AndAdjustOffsets(test_case.utf8, &offsets);
    // Stream the row index so a failure identifies the offending table entry
    // (matches the UTF-16 -> UTF-8 loop below).
    EXPECT_EQ(test_case.output_offset, offsets[0]) << i;
  }

  struct UTF16ToUTF8Case {
    char16 utf16[10];
    size_t input_offset;
    size_t output_offset;
  } utf16_to_utf8_cases[] = {
    // Empty input.
    {{}, 0, 0},
    // Converted to 3-byte utf-8 sequences
    {{0x5909, 0x63DB}, 3, kNpos},
    {{0x5909, 0x63DB}, 2, 6},
    {{0x5909, 0x63DB}, 1, 3},
    {{0x5909, 0x63DB}, 0, 0},
    // Converted to 2-byte utf-8 sequences
    {{'A', 0x00bc, 0x00be, 'z'}, 1, 1},
    {{'A', 0x00bc, 0x00be, 'z'}, 2, 3},
    {{'A', 0x00bc, 0x00be, 'z'}, 3, 5},
    {{'A', 0x00bc, 0x00be, 'z'}, 4, 6},
    // Surrogate pair
    {{'A', 0xd800, 0xdf00, 'z'}, 1, 1},
    {{'A', 0xd800, 0xdf00, 'z'}, 2, kNpos},
    {{'A', 0xd800, 0xdf00, 'z'}, 3, 5},
    {{'A', 0xd800, 0xdf00, 'z'}, 4, 6},
  };
  for (size_t i = 0; i < arraysize(utf16_to_utf8_cases); ++i) {
    const UTF16ToUTF8Case& test_case = utf16_to_utf8_cases[i];
    std::vector<size_t> offsets(1, test_case.input_offset);
    UTF16ToUTF8AndAdjustOffsets(test_case.utf16, &offsets);
    EXPECT_EQ(test_case.output_offset, offsets[0]) << i;
  }
}
78
// Runs OffsetAdjuster::AdjustOffset() with an empty adjustment list and a
// limit of 10 over the offsets 0..19, first ascending and then descending.
// In both orders exactly 11 offsets are expected to come back as something
// other than kNpos.
TEST(UTFOffsetStringConversionsTest, LimitOffsets) {
  const OffsetAdjuster::Adjustments kNoAdjustments;
  const size_t kLimit = 10;
  const size_t kItems = 20;

  // Ascending pass: each value is adjusted in place right after insertion.
  std::vector<size_t> values;
  for (size_t value = 0; value < kItems; ++value) {
    values.push_back(value);
    OffsetAdjuster::AdjustOffset(kNoAdjustments, &values.back(), kLimit);
  }
  // Everything that is not kNpos survived the limit.
  size_t surviving =
      values.size() -
      static_cast<size_t>(std::count(values.begin(), values.end(), kNpos));
  EXPECT_EQ(11U, surviving);

  // Reverse the values in the vector and try again.
  values.clear();
  for (size_t remaining = kItems; remaining > 0; --remaining) {
    values.push_back(remaining - 1);
    OffsetAdjuster::AdjustOffset(kNoAdjustments, &values.back(), kLimit);
  }
  surviving =
      values.size() -
      static_cast<size_t>(std::count(values.begin(), values.end(), kNpos));
  EXPECT_EQ(11U, surviving);
}
110
// Exercises OffsetAdjuster::AdjustOffsets(): every offset 0..N of an original
// string is run through a list of adjustments and compared with the offset
// expected in the transformed string. Offsets with no counterpart in the
// transformed string are expected to become kNpos.
TEST(UTFOffsetStringConversionsTest, AdjustOffsets) {
  // Imagine we have strings as shown in the following cases where the
  // X's represent encoded characters.
  // 1: abcXXXdef ==> abcXdef
  {
    std::vector<size_t> offsets;
    for (size_t t = 0; t <= 9; ++t)
      offsets.push_back(t);
    OffsetAdjuster::Adjustments adjustments;
    adjustments.push_back(OffsetAdjuster::Adjustment(3, 3, 1));
    OffsetAdjuster::AdjustOffsets(adjustments, &offsets);
    const size_t expected_1[] = {0, 1, 2, 3, kNpos, kNpos, 4, 5, 6, 7};
    // Fatal on mismatch: the loop below indexes |offsets| up to
    // arraysize(expected_1) and must not read past the end of the vector.
    ASSERT_EQ(arraysize(expected_1), offsets.size());
    for (size_t i = 0; i < arraysize(expected_1); ++i)
      EXPECT_EQ(expected_1[i], offsets[i]) << "case 1, offset " << i;
  }

  // 2: XXXaXXXXbcXXXXXXXdefXXX ==> XaXXbcXXXXdefX
  {
    std::vector<size_t> offsets;
    for (size_t t = 0; t <= 23; ++t)
      offsets.push_back(t);
    OffsetAdjuster::Adjustments adjustments;
    adjustments.push_back(OffsetAdjuster::Adjustment(0, 3, 1));
    adjustments.push_back(OffsetAdjuster::Adjustment(4, 4, 2));
    adjustments.push_back(OffsetAdjuster::Adjustment(10, 7, 4));
    adjustments.push_back(OffsetAdjuster::Adjustment(20, 3, 1));
    OffsetAdjuster::AdjustOffsets(adjustments, &offsets);
    const size_t expected_2[] = {
      0, kNpos, kNpos, 1, 2, kNpos, kNpos, kNpos, 4, 5, 6, kNpos, kNpos, kNpos,
      kNpos, kNpos, kNpos, 10, 11, 12, 13, kNpos, kNpos, 14
    };
    // Fatal on mismatch so the indexed comparison below stays in bounds.
    ASSERT_EQ(arraysize(expected_2), offsets.size());
    for (size_t i = 0; i < arraysize(expected_2); ++i)
      EXPECT_EQ(expected_2[i], offsets[i]) << "case 2, offset " << i;
  }

  // 3: XXXaXXXXbcdXXXeXX ==> aXXXXbcdXXXe
  {
    std::vector<size_t> offsets;
    for (size_t t = 0; t <= 17; ++t)
      offsets.push_back(t);
    OffsetAdjuster::Adjustments adjustments;
    adjustments.push_back(OffsetAdjuster::Adjustment(0, 3, 0));
    adjustments.push_back(OffsetAdjuster::Adjustment(4, 4, 4));
    adjustments.push_back(OffsetAdjuster::Adjustment(11, 3, 3));
    adjustments.push_back(OffsetAdjuster::Adjustment(15, 2, 0));
    OffsetAdjuster::AdjustOffsets(adjustments, &offsets);
    const size_t expected_3[] = {
      0, kNpos, kNpos, 0, 1, kNpos, kNpos, kNpos, 5, 6, 7, 8, kNpos, kNpos, 11,
      12, kNpos, 12
    };
    // Fatal on mismatch so the indexed comparison below stays in bounds.
    ASSERT_EQ(arraysize(expected_3), offsets.size());
    for (size_t i = 0; i < arraysize(expected_3); ++i)
      EXPECT_EQ(expected_3[i], offsets[i]) << "case 3, offset " << i;
  }
}
168
// Exercises OffsetAdjuster::UnadjustOffsets(): every offset 0..N of a
// transformed string is run backwards through a list of adjustments and
// compared with the offset expected in the original string. Offsets with no
// counterpart in the original string are expected to become kNpos.
TEST(UTFOffsetStringConversionsTest, UnadjustOffsets) {
  // Imagine we have strings as shown in the following cases where the
  // X's represent encoded characters.
  // 1: abcXXXdef ==> abcXdef
  {
    std::vector<size_t> offsets;
    for (size_t t = 0; t <= 7; ++t)
      offsets.push_back(t);
    OffsetAdjuster::Adjustments adjustments;
    adjustments.push_back(OffsetAdjuster::Adjustment(3, 3, 1));
    OffsetAdjuster::UnadjustOffsets(adjustments, &offsets);
    const size_t expected_1[] = {0, 1, 2, 3, 6, 7, 8, 9};
    // Fatal on mismatch: the loop below indexes |offsets| up to
    // arraysize(expected_1) and must not read past the end of the vector.
    ASSERT_EQ(arraysize(expected_1), offsets.size());
    for (size_t i = 0; i < arraysize(expected_1); ++i)
      EXPECT_EQ(expected_1[i], offsets[i]) << "case 1, offset " << i;
  }

  // 2: XXXaXXXXbcXXXXXXXdefXXX ==> XaXXbcXXXXdefX
  {
    std::vector<size_t> offsets;
    for (size_t t = 0; t <= 14; ++t)
      offsets.push_back(t);
    OffsetAdjuster::Adjustments adjustments;
    adjustments.push_back(OffsetAdjuster::Adjustment(0, 3, 1));
    adjustments.push_back(OffsetAdjuster::Adjustment(4, 4, 2));
    adjustments.push_back(OffsetAdjuster::Adjustment(10, 7, 4));
    adjustments.push_back(OffsetAdjuster::Adjustment(20, 3, 1));
    OffsetAdjuster::UnadjustOffsets(adjustments, &offsets);
    const size_t expected_2[] = {
      0, 3, 4, kNpos, 8, 9, 10, kNpos, kNpos, kNpos, 17, 18, 19, 20, 23
    };
    // Fatal on mismatch so the indexed comparison below stays in bounds.
    ASSERT_EQ(arraysize(expected_2), offsets.size());
    for (size_t i = 0; i < arraysize(expected_2); ++i)
      EXPECT_EQ(expected_2[i], offsets[i]) << "case 2, offset " << i;
  }

  // 3: XXXaXXXXbcdXXXeXX ==> aXXXXbcdXXXe
  {
    std::vector<size_t> offsets;
    for (size_t t = 0; t <= 12; ++t)
      offsets.push_back(t);
    OffsetAdjuster::Adjustments adjustments;
    adjustments.push_back(OffsetAdjuster::Adjustment(0, 3, 0));
    adjustments.push_back(OffsetAdjuster::Adjustment(4, 4, 4));
    adjustments.push_back(OffsetAdjuster::Adjustment(11, 3, 3));
    adjustments.push_back(OffsetAdjuster::Adjustment(15, 2, 0));
    OffsetAdjuster::UnadjustOffsets(adjustments, &offsets);
    const size_t expected_3[] = {
      0,  // this could just as easily be 3
      4, kNpos, kNpos, kNpos, 8, 9, 10, 11, kNpos, kNpos, 14,
      15  // this could just as easily be 17
    };
    // Fatal on mismatch so the indexed comparison below stays in bounds.
    ASSERT_EQ(arraysize(expected_3), offsets.size());
    for (size_t i = 0; i < arraysize(expected_3); ++i)
      EXPECT_EQ(expected_3[i], offsets[i]) << "case 3, offset " << i;
  }
}
226
227 // MergeSequentialAdjustments is used by net/base/escape.{h,cc} and
228 // net/base/net_util.{h,cc}. The two tests EscapeTest.AdjustOffset and
229 // NetUtilTest.FormatUrlWithOffsets test its behavior extensively. This
230 // is simply a short, additional test.
TEST(UTFOffsetStringConversionsTest, MergeSequentialAdjustments) {
  typedef OffsetAdjuster::Adjustment Adjustment;

  // The imaginary input string is "abcdefghijklmnopqrstuvwxyz".

  // First pass over the input:
  //  - drop the leading "a"
  //  - fold "bc" into a single character (call it ".")
  //  - drop the "f"
  //  - drop the "tuv"
  // which should leave ".deghijklmnopqrswxyz".
  OffsetAdjuster::Adjustments first_pass;
  first_pass.push_back(Adjustment(0, 1, 0));
  first_pass.push_back(Adjustment(1, 2, 1));
  first_pass.push_back(Adjustment(5, 1, 0));
  first_pass.push_back(Adjustment(19, 3, 0));

  // Second pass, expressed in offsets of the already-adjusted string:
  //  - fold the "." (formerly "bc") together with "d" into one character
  //    (call it "?")
  //  - drop the "egh"
  //  - expand the "i" into two characters (call them "12")
  //  - fold "jkl" into one character (call it "@")
  //  - expand the "z" into two characters (call them "34")
  // which should leave "?12@mnopqrswxy34".
  //
  // |merged| starts out holding the second pass and is rewritten in place by
  // the merge call below.
  OffsetAdjuster::Adjustments merged;
  merged.push_back(Adjustment(0, 2, 1));
  merged.push_back(Adjustment(2, 3, 0));
  merged.push_back(Adjustment(5, 1, 2));
  merged.push_back(Adjustment(6, 3, 1));
  merged.push_back(Adjustment(19, 1, 2));

  OffsetAdjuster::MergeSequentialAdjustments(first_pass, &merged);

  // Expected merged adjustments, relative to the original input string.
  struct ExpectedAdjustment {
    size_t original_offset;
    size_t original_length;
    size_t output_length;
  };
  const ExpectedAdjustment kExpected[] = {
    // Combine abcd into "?". It would also be reasonable for the merge to
    // produce two adjustments here instead (remove a, then combine bcd into
    // "?"); this test pins down the current behavior.
    {0, 4, 1},
    // Remove efgh.
    {4, 4, 0},
    // Expand i into "12".
    {8, 1, 2},
    // Combine jkl into "@".
    {9, 3, 1},
    // Remove tuv.
    {19, 3, 0},
    // Expand z into "34".
    {25, 1, 2},
  };

  ASSERT_EQ(arraysize(kExpected), merged.size());
  for (size_t i = 0; i < arraysize(kExpected); ++i) {
    EXPECT_EQ(kExpected[i].original_offset, merged[i].original_offset);
    EXPECT_EQ(kExpected[i].original_length, merged[i].original_length);
    EXPECT_EQ(kExpected[i].output_length, merged[i].output_length);
  }
}
299
300 } // namespace base
301