xref: /aosp_15_r20/external/cronet/base/i18n/streaming_utf8_validator_unittest.cc (revision 6777b5387eb2ff775bb5750e3f5d96f37fb7352b)
1 // Copyright 2014 The Chromium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "base/i18n/streaming_utf8_validator.h"
6 
7 #include <stddef.h>
8 #include <stdint.h>
9 #include <stdio.h>
10 #include <string.h>
11 
12 #include <string>
13 #include <string_view>
14 
15 #include "base/functional/bind.h"
16 #include "base/location.h"
17 #include "base/logging.h"
18 #include "base/memory/ref_counted.h"
19 #include "base/strings/string_util.h"
20 #include "base/strings/stringprintf.h"
21 #include "base/strings/utf_string_conversion_utils.h"
22 #include "base/synchronization/lock.h"
23 #include "base/task/thread_pool.h"
24 #include "base/test/task_environment.h"
25 #include "testing/gtest/include/gtest/gtest.h"
26 #include "third_party/icu/source/common/unicode/utf8.h"
27 
28 namespace base {
29 namespace {
30 
31 // Avoid having to qualify the enum values in the tests.
32 const StreamingUtf8Validator::State VALID_ENDPOINT =
33     StreamingUtf8Validator::VALID_ENDPOINT;
34 const StreamingUtf8Validator::State VALID_MIDPOINT =
35     StreamingUtf8Validator::VALID_MIDPOINT;
36 const StreamingUtf8Validator::State INVALID = StreamingUtf8Validator::INVALID;
37 
// Number of consecutive 32-bit values (2^24) that a single thorough-test task
// checks.
constexpr uint32_t kThoroughTestChunkSize = uint32_t{1} << 24;
39 
40 class StreamingUtf8ValidatorThoroughTest : public ::testing::Test {
41  protected:
42   StreamingUtf8ValidatorThoroughTest() = default;
43 
44   // This uses the same logic as base::IsStringUTF8 except it considers
45   // non-characters valid (and doesn't require a string as input).
IsStringUtf8(const uint8_t * src,int32_t src_len)46   static bool IsStringUtf8(const uint8_t* src, int32_t src_len) {
47     int32_t char_index = 0;
48     while (char_index < src_len) {
49       base_icu::UChar32 code_point;
50       U8_NEXT(src, char_index, src_len, code_point);
51       if (!base::IsValidCodepoint(code_point))
52         return false;
53     }
54     return true;
55   }
56 
57   // Converts the passed-in integer to a 4 byte string and then
58   // verifies that IsStringUtf8 and StreamingUtf8Validator agree on
59   // whether it is valid UTF-8 or not.
TestNumber(uint32_t n) const60   void TestNumber(uint32_t n) const {
61     uint8_t test[sizeof n];
62     memcpy(test, &n, sizeof n);
63     StreamingUtf8Validator validator;
64     EXPECT_EQ(IsStringUtf8(test, sizeof n),
65               validator.AddBytes(test) == VALID_ENDPOINT)
66         << "Difference of opinion for \""
67         << base::StringPrintf("\\x%02X\\x%02X\\x%02X\\x%02X", test[0], test[1],
68                               test[2], test[3])
69         << "\"";
70   }
71 
72  public:
73   // Tests the 4-byte sequences corresponding to the |size| integers
74   // starting at |begin|. This is intended to be run from a worker
75   // pool. Signals |all_done_| at the end if it thinks all tasks are
76   // finished.
TestRange(uint32_t begin,uint32_t size)77   void TestRange(uint32_t begin, uint32_t size) {
78     for (uint32_t i = 0; i < size; ++i) {
79       TestNumber(begin + i);
80     }
81     base::AutoLock al(lock_);
82     ++tasks_finished_;
83     LOG(INFO) << tasks_finished_ << " / " << tasks_dispatched_
84               << " tasks done\n";
85   }
86 
87  protected:
88   base::Lock lock_;
89   int tasks_dispatched_ = 0;
90   int tasks_finished_ = 0;
91 };
92 
93 // Enable locally to verify that this class accepts exactly the same set of
94 // 4-byte strings as ICU-based validation. This tests every possible 4-byte
95 // string, so it is too slow to run routinely on low-powered machines.
TEST_F(StreamingUtf8ValidatorThoroughTest,DISABLED_TestEverything)96 TEST_F(StreamingUtf8ValidatorThoroughTest, DISABLED_TestEverything) {
97   base::test::TaskEnvironment task_environment;
98   {
99     base::AutoLock al(lock_);
100     uint32_t begin = 0;
101     do {
102       base::ThreadPool::PostTask(
103           FROM_HERE, {base::TaskShutdownBehavior::BLOCK_SHUTDOWN},
104           base::BindOnce(&StreamingUtf8ValidatorThoroughTest::TestRange,
105                          base::Unretained(this), begin,
106                          kThoroughTestChunkSize));
107       ++tasks_dispatched_;
108       begin += kThoroughTestChunkSize;
109     } while (begin != 0);
110   }
111 }
112 
113 // These valid and invalid UTF-8 sequences are based on the tests from
114 // base/strings/string_util_unittest.cc
115 
// Complete, valid UTF-8 sequences. Each entry must encode exactly one code
// point, because the partial (truncated) sequences used by the tests are built
// by taking non-empty prefixes of these strings.
const char* const valid[] = {
    "\r",
    "\n",
    "a",
    "\xc2\x81",
    "\xe1\x80\xbf",
    "\xf1\x80\xa0\xbf",
    "\xef\xbb\xbf",  // UTF-8 BOM
};

// One past the last element of |valid|.
const char* const* const valid_end = std::end(valid);
125 
// Byte sequences that must never be accepted as UTF-8. The tests expect the
// validator to report INVALID for each entry, whether it is fed on its own or
// combined with another sequence.
const char* const invalid[] = {
    // always invalid bytes
    "\xc0", "\xc1",
    "\xf5", "\xf6", "\xf7",
    "\xf8", "\xf9", "\xfa", "\xfb", "\xfc", "\xfd", "\xfe", "\xff",
    // surrogate code points (U+D800, U+D80F, U+DFFF). The middle entry used
    // to be "\xed\x0a\x8f", whose second byte is a line feed rather than a
    // continuation byte, so it did not actually encode a surrogate.
    "\xed\xa0\x80", "\xed\xa0\x8f", "\xed\xbf\xbf",
    //
    // overlong sequences
    "\xc0\x80",              // U+0000
    "\xc1\x80",              // "A"
    "\xc1\x81",              // "B"
    "\xe0\x80\x80",          // U+0000
    "\xe0\x82\x80",          // U+0080
    "\xe0\x9f\xbf",          // U+07ff
    "\xf0\x80\x80\x8D",      // U+000D
    "\xf0\x80\x82\x91",      // U+0091
    "\xf0\x80\xa0\x80",      // U+0800
    "\xf0\x8f\xbb\xbf",      // U+FEFF (BOM)
    "\xf8\x80\x80\x80\xbf",  // U+003F
    "\xfc\x80\x80\x80\xa0\xa5",
    //
    // Beyond U+10FFFF
    "\xf4\x90\x80\x80",          // U+110000
    "\xf8\xa0\xbf\x80\xbf",      // 5 bytes
    "\xfc\x9c\xbf\x80\xbf\x80",  // 6 bytes
    //
    // BOMs in UTF-16(BE|LE)
    "\xfe\xff", "\xff\xfe",
};

// One past the last element of |invalid|.
const char* const* const invalid_end = invalid + std::size(invalid);
158 
159 // A ForwardIterator which returns all the non-empty prefixes of the elements of
160 // "valid".
161 class PartialIterator {
162  public:
163   // The constructor returns the first iterator, ie. it is equivalent to
164   // begin().
PartialIterator()165   PartialIterator() : index_(0), prefix_length_(0) { Advance(); }
166   // The trivial destructor left intentionally undefined.
167   // This is a value type; the default copy constructor and assignment operator
168   // generated by the compiler are used.
169 
end()170   static PartialIterator end() { return PartialIterator(std::size(valid), 1); }
171 
operator ++()172   PartialIterator& operator++() {
173     Advance();
174     return *this;
175   }
176 
operator *() const177   std::string_view operator*() const {
178     return std::string_view(valid[index_], prefix_length_);
179   }
180 
operator ==(const PartialIterator & rhs) const181   bool operator==(const PartialIterator& rhs) const {
182     return index_ == rhs.index_ && prefix_length_ == rhs.prefix_length_;
183   }
184 
operator !=(const PartialIterator & rhs) const185   bool operator!=(const PartialIterator& rhs) const { return !(rhs == *this); }
186 
187  private:
188   // This constructor is used by the end() method.
PartialIterator(size_t index,size_t prefix_length)189   PartialIterator(size_t index, size_t prefix_length)
190       : index_(index), prefix_length_(prefix_length) {}
191 
Advance()192   void Advance() {
193     if (index_ < std::size(valid) && prefix_length_ < strlen(valid[index_]))
194       ++prefix_length_;
195     while (index_ < std::size(valid) &&
196            prefix_length_ == strlen(valid[index_])) {
197       ++index_;
198       prefix_length_ = 1;
199     }
200   }
201 
202   // The UTF-8 sequence, as an offset into the |valid| array.
203   size_t index_;
204   size_t prefix_length_;
205 };
206 
207 // A test fixture for tests which test one UTF-8 sequence (or invalid
208 // byte sequence) at a time.
209 class StreamingUtf8ValidatorSingleSequenceTest : public ::testing::Test {
210  protected:
211   // Iterator must be convertible when de-referenced to std::string_view.
212   template <typename Iterator>
CheckRange(Iterator begin,Iterator end,StreamingUtf8Validator::State expected)213   void CheckRange(Iterator begin,
214                   Iterator end,
215                   StreamingUtf8Validator::State expected) {
216     for (Iterator it = begin; it != end; ++it) {
217       StreamingUtf8Validator validator;
218       std::string_view sequence = *it;
219       EXPECT_EQ(expected, validator.AddBytes(base::as_byte_span(sequence)))
220           << "Failed for \"" << sequence << "\"";
221     }
222   }
223 
224   // Adding input a byte at a time should make absolutely no difference.
225   template <typename Iterator>
CheckRangeByteAtATime(Iterator begin,Iterator end,StreamingUtf8Validator::State expected)226   void CheckRangeByteAtATime(Iterator begin,
227                              Iterator end,
228                              StreamingUtf8Validator::State expected) {
229     for (Iterator it = begin; it != end; ++it) {
230       StreamingUtf8Validator validator;
231       std::string_view sequence = *it;
232       StreamingUtf8Validator::State state = VALID_ENDPOINT;
233       for (const auto& cit : sequence) {
234         state = validator.AddBytes(base::as_bytes(base::make_span(&cit, 1u)));
235       }
236       EXPECT_EQ(expected, state) << "Failed for \"" << sequence << "\"";
237     }
238   }
239 };
240 
241 // A test fixture for tests which test the concatenation of byte sequences.
242 class StreamingUtf8ValidatorDoubleSequenceTest : public ::testing::Test {
243  protected:
244   // Check every possible concatenation of byte sequences from two
245   // ranges, and verify that the combination matches the expected
246   // state.
247   template <typename Iterator1, typename Iterator2>
CheckCombinations(Iterator1 begin1,Iterator1 end1,Iterator2 begin2,Iterator2 end2,StreamingUtf8Validator::State expected)248   void CheckCombinations(Iterator1 begin1,
249                          Iterator1 end1,
250                          Iterator2 begin2,
251                          Iterator2 end2,
252                          StreamingUtf8Validator::State expected) {
253     StreamingUtf8Validator validator;
254     for (Iterator1 it1 = begin1; it1 != end1; ++it1) {
255       std::string_view c1 = *it1;
256       for (Iterator2 it2 = begin2; it2 != end2; ++it2) {
257         std::string_view c2 = *it2;
258         validator.AddBytes(base::as_byte_span(c1));
259         EXPECT_EQ(expected, validator.AddBytes(base::as_byte_span(c2)))
260             << "Failed for \"" << c1 << c2 << "\"";
261         validator.Reset();
262       }
263     }
264   }
265 };
266 
TEST(StreamingUtf8ValidatorTest, NothingIsValid) {
  // Adding an empty span to a fresh validator leaves it at a valid endpoint.
  StreamingUtf8Validator validator;
  EXPECT_EQ(VALID_ENDPOINT, validator.AddBytes({}));
}
270 
271 // Because the members of the |valid| array need to be non-zero length
272 // sequences and are measured with strlen(), |valid| cannot be used it
273 // to test the NUL character '\0', so the NUL character gets its own
274 // test.
TEST(StreamingUtf8ValidatorTest,NulIsValid)275 TEST(StreamingUtf8ValidatorTest, NulIsValid) {
276   static const char kNul[] = "\x00";
277   EXPECT_EQ(VALID_ENDPOINT, StreamingUtf8Validator().AddBytes(
278                                 base::as_bytes(base::make_span(kNul, 1u))));
279 }
280 
281 // Just a basic sanity test before we start getting fancy.
TEST(StreamingUtf8ValidatorTest,HelloWorld)282 TEST(StreamingUtf8ValidatorTest, HelloWorld) {
283   static const char kHelloWorld[] = "Hello, World!";
284   EXPECT_EQ(VALID_ENDPOINT,
285             StreamingUtf8Validator().AddBytes(base::as_bytes(
286                 base::make_span(kHelloWorld, strlen(kHelloWorld)))));
287 }
288 
289 // Check that the Reset() method works.
TEST(StreamingUtf8ValidatorTest,ResetWorks)290 TEST(StreamingUtf8ValidatorTest, ResetWorks) {
291   StreamingUtf8Validator validator;
292   EXPECT_EQ(INVALID,
293             validator.AddBytes(base::as_bytes(base::make_span("\xC0", 1u))));
294   EXPECT_EQ(INVALID,
295             validator.AddBytes(base::as_bytes(base::make_span("a", 1u))));
296   validator.Reset();
297   EXPECT_EQ(VALID_ENDPOINT,
298             validator.AddBytes(base::as_bytes(base::make_span("a", 1u))));
299 }
300 
TEST_F(StreamingUtf8ValidatorSingleSequenceTest, Valid) {
  // Each complete sequence in |valid| must end on a character boundary.
  const StreamingUtf8Validator::State expected = VALID_ENDPOINT;
  CheckRange(valid, valid_end, expected);
}
304 
TEST_F(StreamingUtf8ValidatorSingleSequenceTest, Partial) {
  // Each truncated sequence must leave the validator in the middle of a
  // character.
  const PartialIterator first;
  const PartialIterator last = PartialIterator::end();
  CheckRange(first, last, VALID_MIDPOINT);
}
308 
TEST_F(StreamingUtf8ValidatorSingleSequenceTest, Invalid) {
  // Each sequence in |invalid| must be rejected.
  const StreamingUtf8Validator::State expected = INVALID;
  CheckRange(invalid, invalid_end, expected);
}
312 
TEST_F(StreamingUtf8ValidatorSingleSequenceTest, ValidByByte) {
  // Complete sequences fed one byte at a time must still end on a character
  // boundary.
  const StreamingUtf8Validator::State expected = VALID_ENDPOINT;
  CheckRangeByteAtATime(valid, valid_end, expected);
}
316 
TEST_F(StreamingUtf8ValidatorSingleSequenceTest, PartialByByte) {
  // Truncated sequences fed one byte at a time must still stop
  // mid-character.
  const PartialIterator first;
  const PartialIterator last = PartialIterator::end();
  CheckRangeByteAtATime(first, last, VALID_MIDPOINT);
}
321 
TEST_F(StreamingUtf8ValidatorSingleSequenceTest, InvalidByByte) {
  // Invalid sequences fed one byte at a time must still be rejected.
  const StreamingUtf8Validator::State expected = INVALID;
  CheckRangeByteAtATime(invalid, invalid_end, expected);
}
325 
TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, ValidPlusValidIsValid) {
  // Two complete characters in a row end on a character boundary.
  const StreamingUtf8Validator::State expected = VALID_ENDPOINT;
  CheckCombinations(valid, valid_end, valid, valid_end, expected);
}
329 
TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, ValidPlusPartialIsPartial) {
  // A complete character followed by a truncated one stops mid-character.
  const PartialIterator partial_begin;
  const PartialIterator partial_end = PartialIterator::end();
  CheckCombinations(valid, valid_end, partial_begin, partial_end,
                    VALID_MIDPOINT);
}
337 
TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, PartialPlusValidIsInvalid) {
  // A truncated character followed by the start of a new character is
  // rejected.
  const PartialIterator partial_begin;
  const PartialIterator partial_end = PartialIterator::end();
  CheckCombinations(partial_begin, partial_end, valid, valid_end, INVALID);
}
342 
TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, PartialPlusPartialIsInvalid) {
  // Two truncated characters in a row are rejected.
  const PartialIterator partial_begin;
  const PartialIterator partial_end = PartialIterator::end();
  CheckCombinations(partial_begin, partial_end, partial_begin, partial_end,
                    INVALID);
}
350 
TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, ValidPlusInvalidIsInvalid) {
  // An invalid sequence is rejected even after a complete character.
  const StreamingUtf8Validator::State expected = INVALID;
  CheckCombinations(valid, valid_end, invalid, invalid_end, expected);
}
354 
TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, InvalidPlusValidIsInvalid) {
  // Once invalid input has been seen, following it with a complete character
  // does not change the verdict.
  const StreamingUtf8Validator::State expected = INVALID;
  CheckCombinations(invalid, invalid_end, valid, valid_end, expected);
}
358 
TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, InvalidPlusInvalidIsInvalid) {
  // Two invalid sequences in a row are rejected.
  const StreamingUtf8Validator::State expected = INVALID;
  CheckCombinations(invalid, invalid_end, invalid, invalid_end, expected);
}
362 
TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, InvalidPlusPartialIsInvalid) {
  // Once invalid input has been seen, following it with a truncated
  // character does not change the verdict.
  const PartialIterator partial_begin;
  const PartialIterator partial_end = PartialIterator::end();
  CheckCombinations(invalid, invalid_end, partial_begin, partial_end, INVALID);
}
367 
TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, PartialPlusInvalidIsInvalid) {
  // A truncated character followed by an invalid sequence is rejected.
  const PartialIterator partial_begin;
  const PartialIterator partial_end = PartialIterator::end();
  CheckCombinations(partial_begin, partial_end, invalid, invalid_end, INVALID);
}
372 
TEST(StreamingUtf8ValidatorValidateTest, EmptyIsValid) {
  // Validate() accepts the empty string.
  const std::string empty;
  EXPECT_TRUE(StreamingUtf8Validator::Validate(empty));
}
376 
TEST(StreamingUtf8ValidatorValidateTest, SimpleValidCase) {
  // A complete two-byte sequence is accepted.
  const std::string two_byte_character = "\xc2\x81";
  EXPECT_TRUE(StreamingUtf8Validator::Validate(two_byte_character));
}
380 
TEST(StreamingUtf8ValidatorValidateTest, SimpleInvalidCase) {
  // The overlong two-byte encoding of U+0000 is rejected.
  const std::string overlong_nul = "\xc0\x80";
  EXPECT_FALSE(StreamingUtf8Validator::Validate(overlong_nul));
}
384 
TEST(StreamingUtf8ValidatorValidateTest, TruncatedIsInvalid) {
  // A lead byte with its continuation byte missing is rejected by the
  // one-shot Validate() call.
  const std::string lone_lead_byte = "\xc2";
  EXPECT_FALSE(StreamingUtf8Validator::Validate(lone_lead_byte));
}
388 
389 }  // namespace
390 }  // namespace base
391