xref: /aosp_15_r20/external/cronet/url/url_canon_icu_unittest.cc (revision 6777b5387eb2ff775bb5750e3f5d96f37fb7352b)
1 // Copyright 2014 The Chromium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "url/url_canon_icu.h"
6 
7 #include <stddef.h>
8 
9 #include "base/logging.h"
10 #include "base/memory/raw_ptr.h"
11 #include "testing/gtest/include/gtest/gtest.h"
12 #include "third_party/icu/source/common/unicode/ucnv.h"
13 #include "url/url_canon.h"
14 #include "url/url_canon_icu_test_helpers.h"
15 #include "url/url_canon_stdstring.h"
16 #include "url/url_test_utils.h"
17 
18 namespace url {
19 
20 namespace {
21 
// Exercises ICUCharsetConverter::ConvertFromUTF16 in two ways:
//  1. A table of UTF-16 inputs is converted to a named encoding and compared
//     against the expected output bytes.
//  2. Inputs whose lengths straddle the capacity given to RawCanonOutput are
//     converted to make sure the output is resized as needed.
TEST(URLCanonIcuTest, ICUCharsetConverter) {
  struct ICUCase {
    const wchar_t* input;
    const char* encoding;
    const char* expected;
  } icu_cases[] = {
      // UTF-8.
      {L"Hello, world", "utf-8", "Hello, world"},
      {L"\x4f60\x597d", "utf-8", "\xe4\xbd\xa0\xe5\xa5\xbd"},
      // Non-BMP UTF-8.
      {L"!\xd800\xdf00!", "utf-8", "!\xf0\x90\x8c\x80!"},
      // Big5
      {L"\x4f60\x597d", "big5", "\xa7\x41\xa6\x6e"},
      // Unrepresentable character in the destination set. The expected text
      // contains "%26%231758%3B", the percent-escaped form of "&#1758;"
      // (U+06DE is 1758 in decimal).
      {L"hello\x4f60\x06de\x597dworld", "big5",
       "hello\xa7\x41%26%231758%3B\xa6\x6eworld"},
  };

  for (const ICUCase& test_case : icu_cases) {
    // Tag any failure below with the encoding of the table entry under test.
    SCOPED_TRACE(testing::Message() << "encoding: " << test_case.encoding);

    test::UConvScoper conv(test_case.encoding);
    ASSERT_TRUE(conv.converter() != nullptr);
    ICUCharsetConverter converter(conv.converter());

    std::string str;
    StdStringCanonOutput output(&str);

    std::u16string input_str(
        test_utils::TruncateWStringToUTF16(test_case.input));
    int input_len = static_cast<int>(input_str.length());
    converter.ConvertFromUTF16(input_str.c_str(), input_len, &output);
    output.Complete();

    EXPECT_STREQ(test_case.expected, str.c_str());
  }

  // Test string sizes around the resize boundary for the output to make sure
  // the converter resizes as needed.
  constexpr int kStaticSize = 16;
  test::UConvScoper conv("utf-8");
  ASSERT_TRUE(conv.converter() != nullptr);
  ICUCharsetConverter converter(conv.converter());
  for (int i = kStaticSize - 2; i <= kStaticSize + 2; i++) {
    SCOPED_TRACE(testing::Message() << "input length: " << i);

    // Make a string of |i| ASCII 'a' characters.
    const std::u16string input(static_cast<size_t>(i), u'a');

    RawCanonOutput<kStaticSize> output;
    converter.ConvertFromUTF16(input.c_str(), static_cast<int>(input.length()),
                               &output);
    // Each ASCII character converts to exactly one output unit in UTF-8, so
    // the lengths must match.
    EXPECT_EQ(input.length(), output.length());
  }
}
75 
TEST(URLCanonIcuTest,QueryWithConverter)76 TEST(URLCanonIcuTest, QueryWithConverter) {
77   struct QueryCase {
78     const char* input8;
79     const wchar_t* input16;
80     const char* encoding;
81     const char* expected;
82   } query_cases[] = {
83       // Regular ASCII case in some different encodings.
84     {"foo=bar", L"foo=bar", "utf-8", "?foo=bar"},
85     {"foo=bar", L"foo=bar", "shift_jis", "?foo=bar"},
86     {"foo=bar", L"foo=bar", "gb2312", "?foo=bar"},
87       // Chinese input/output
88     {"q=\xe4\xbd\xa0\xe5\xa5\xbd", L"q=\x4f60\x597d", "gb2312",
89       "?q=%C4%E3%BA%C3"},
90     {"q=\xe4\xbd\xa0\xe5\xa5\xbd", L"q=\x4f60\x597d", "big5", "?q=%A7A%A6n"},
91       // Unencodable character in the destination character set should be
92       // escaped. The escape sequence unescapes to be the entity name:
93       // "?q=&#20320;"
94     {"q=Chinese\xef\xbc\xa7", L"q=Chinese\xff27", "iso-8859-1",
95       "?q=Chinese%26%2365319%3B"},
96   };
97 
98   for (size_t i = 0; i < std::size(query_cases); i++) {
99     Component out_comp;
100 
101     test::UConvScoper conv(query_cases[i].encoding);
102     ASSERT_TRUE(!query_cases[i].encoding || conv.converter());
103     ICUCharsetConverter converter(conv.converter());
104 
105     if (query_cases[i].input8) {
106       int len = static_cast<int>(strlen(query_cases[i].input8));
107       Component in_comp(0, len);
108       std::string out_str;
109 
110       StdStringCanonOutput output(&out_str);
111       CanonicalizeQuery(query_cases[i].input8, in_comp, &converter, &output,
112                         &out_comp);
113       output.Complete();
114 
115       EXPECT_EQ(query_cases[i].expected, out_str);
116     }
117 
118     if (query_cases[i].input16) {
119       std::u16string input16(
120           test_utils::TruncateWStringToUTF16(query_cases[i].input16));
121       int len = static_cast<int>(input16.length());
122       Component in_comp(0, len);
123       std::string out_str;
124 
125       StdStringCanonOutput output(&out_str);
126       CanonicalizeQuery(input16.c_str(), in_comp, &converter, &output,
127                         &out_comp);
128       output.Complete();
129 
130       EXPECT_EQ(query_cases[i].expected, out_str);
131     }
132   }
133 
134   // Extra test for input with embedded NULL;
135   std::string out_str;
136   StdStringCanonOutput output(&out_str);
137   Component out_comp;
138   CanonicalizeQuery("a \x00z\x01", Component(0, 5), NULL, &output, &out_comp);
139   output.Complete();
140   EXPECT_EQ("?a%20%00z%01", out_str);
141 }
142 
143 }  // namespace
144 
145 }  // namespace url
146