xref: /aosp_15_r20/external/pigweed/pw_string/utf_codecs_test.cc (revision 61c4878ac05f98d0ceed94b57d316916de578985)
1 // Copyright 2024 The Pigweed Authors
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
4 // use this file except in compliance with the License. You may obtain a copy of
5 // the License at
6 //
7 //     https://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
11 // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
12 // License for the specific language governing permissions and limitations under
13 // the License.
14 
15 #include "pw_string/utf_codecs.h"
16 
17 #include <array>
18 #include <string>
19 #include <string_view>
20 
21 #include "pw_unit_test/framework.h"
22 
23 namespace pw {
24 namespace {
25 
// Probes utf::IsValidCodepoint() on both sides of each boundary the test
// cares about: the UTF-16 surrogate range (U+D800..U+DFFF) and the top of the
// Unicode code space (U+10FFFF).
TEST(UtfCodecs, IsValidCodepoint) {
  constexpr uint32_t kFirstSurrogate = 0xD800u;
  constexpr uint32_t kFirstAfterSurrogates = 0xE000u;
  constexpr uint32_t kLastCodePoint = 0x10FFFFu;

  // Everything below the surrogate range is accepted.
  EXPECT_TRUE(utf::IsValidCodepoint(0u));
  EXPECT_TRUE(utf::IsValidCodepoint(kFirstSurrogate - 1u));

  // Both ends of the surrogate range are rejected.
  EXPECT_FALSE(utf::IsValidCodepoint(kFirstSurrogate));
  EXPECT_FALSE(utf::IsValidCodepoint(kFirstAfterSurrogates - 1u));

  // From the end of the surrogates up to the last code point is accepted.
  EXPECT_TRUE(utf::IsValidCodepoint(kFirstAfterSurrogates));
  EXPECT_TRUE(utf::IsValidCodepoint(kLastCodePoint));

  // Anything past the last code point is rejected.
  EXPECT_FALSE(utf::IsValidCodepoint(kLastCodePoint + 1));
  EXPECT_FALSE(utf::IsValidCodepoint(0xFFFFFFFFu));
}
36 
// Probes utf::IsValidCharacter() around the surrogate range, around the
// U+FDD0..U+FDEF block, near the top of the code space, and at two values
// ending in 0xFFFE.
TEST(UtfCodecs, IsValidCharacter) {
  constexpr uint32_t kFirstSurrogate = 0xD800u;
  constexpr uint32_t kFirstAfterSurrogates = 0xE000u;
  constexpr uint32_t kRejectedBlockFirst = 0xFDD0u;
  constexpr uint32_t kRejectedBlockLast = 0xFDEFu;
  constexpr uint32_t kLastCodePoint = 0x10FFFFu;

  // Surrogate range boundaries.
  EXPECT_TRUE(utf::IsValidCharacter(0u));
  EXPECT_TRUE(utf::IsValidCharacter(kFirstSurrogate - 1u));
  EXPECT_FALSE(utf::IsValidCharacter(kFirstSurrogate));
  EXPECT_FALSE(utf::IsValidCharacter(kFirstAfterSurrogates - 1u));
  EXPECT_TRUE(utf::IsValidCharacter(kFirstAfterSurrogates));

  // U+FDD0..U+FDEF is rejected; its immediate neighbours are accepted.
  EXPECT_TRUE(utf::IsValidCharacter(kRejectedBlockFirst - 1u));
  EXPECT_FALSE(utf::IsValidCharacter(kRejectedBlockFirst));
  EXPECT_FALSE(utf::IsValidCharacter(kRejectedBlockLast));
  EXPECT_TRUE(utf::IsValidCharacter(kRejectedBlockLast + 1u));

  // U+10FFFD is accepted; one past the last code point is not.
  EXPECT_TRUE(utf::IsValidCharacter(kLastCodePoint - 2u));
  EXPECT_FALSE(utf::IsValidCharacter(kLastCodePoint + 1u));

  // Values ending in 0xFFFE are rejected in more than one plane.
  EXPECT_FALSE(utf::IsValidCharacter(0xFFFEu));
  EXPECT_FALSE(utf::IsValidCharacter(0x1FFFEu));
}
52 
// Checks utf8::IsStringValid() against plain ASCII, a string containing a
// four-byte sequence, the empty string, and a buffer of bytes that are never
// legal in UTF-8.
TEST(UtfCodecs, IsStringUTF8) {
  EXPECT_TRUE(pw::utf8::IsStringValid("Just some ascii!"));

  // "Test" followed by one character outside the Basic Multilingual Plane.
  // The bytes are escaped (here U+1F496 = F0 9F 92 96) so the literal cannot
  // be damaged by a source-encoding round trip.
  // NOTE(review): the glyph originally used here was lost to such damage;
  // confirm U+1F496 is an acceptable stand-in.
  EXPECT_TRUE(pw::utf8::IsStringValid("Test\xF0\x9F\x92\x96"));

  EXPECT_TRUE(pw::utf8::IsStringValid(""));

  // 0xFF cannot appear anywhere in well-formed UTF-8.
  const std::array<char, 4> invalid = {static_cast<char>(0xFF),
                                       static_cast<char>(0xFF),
                                       static_cast<char>(0xFF),
                                       static_cast<char>(0xFF)};
  EXPECT_FALSE(pw::utf8::IsStringValid({invalid.data(), invalid.size()}));
}
61 
TEST(UtfCodecs,ReadCharacter)62 TEST(UtfCodecs, ReadCharacter) {
63   {
64     const char str[] = "$";
65     const size_t char_byte_size = sizeof(str) - 1;
66 
67     auto rslt = pw::utf8::ReadCodePoint(str);
68     EXPECT_TRUE(rslt.ok());
69     EXPECT_EQ(rslt->size(), char_byte_size);
70     EXPECT_EQ(rslt->code_point(), 0x0024u);
71   }
72 
73   {
74     const char str[] = "£";
75     const size_t char_byte_size = sizeof(str) - 1;
76 
77     auto rslt = pw::utf8::ReadCodePoint(str);
78     EXPECT_TRUE(rslt.ok());
79     EXPECT_EQ(rslt->size(), char_byte_size);
80     EXPECT_EQ(rslt->code_point(), 0x00A3u);
81 
82     // Read too small.
83     rslt = pw::utf8::ReadCodePoint({str, char_byte_size - 1});
84     EXPECT_FALSE(rslt.ok());
85 
86     // Continuation bits are incorrect.
87     std::string bad_continuation{str};
88     uint8_t last_byte = static_cast<uint8_t>(bad_continuation.back());
89     last_byte &= 0x7F;
90     bad_continuation.back() = static_cast<char>(last_byte);
91     rslt = pw::utf8::ReadCodePoint(bad_continuation);
92     EXPECT_FALSE(rslt.ok());
93   }
94 
95   {
96     const char str[] = "€";
97     const size_t char_byte_size = sizeof(str) - 1;
98 
99     auto rslt = pw::utf8::ReadCodePoint(str);
100     EXPECT_TRUE(rslt.ok());
101     EXPECT_EQ(rslt->size(), char_byte_size);
102     EXPECT_EQ(rslt->code_point(), 0x20ACu);
103 
104     // Read too small.
105     rslt = pw::utf8::ReadCodePoint({str, char_byte_size - 1});
106     EXPECT_FALSE(rslt.ok());
107 
108     // Continuation bits are incorrect.
109     std::string bad_continuation{str};
110     uint8_t last_byte = static_cast<uint8_t>(bad_continuation.back());
111     last_byte &= 0x7F;
112     bad_continuation.back() = static_cast<char>(last_byte);
113     rslt = pw::utf8::ReadCodePoint(bad_continuation);
114     EXPECT_FALSE(rslt.ok());
115   }
116 
117   {
118     const char str[] = "��";
119     const size_t char_byte_size = sizeof(str) - 1;
120 
121     auto rslt = pw::utf8::ReadCodePoint(str);
122     EXPECT_TRUE(rslt.ok());
123     EXPECT_EQ(rslt->size(), char_byte_size);
124     EXPECT_EQ(rslt->code_point(), 0x10348u);
125 
126     // Read too small.
127     rslt = pw::utf8::ReadCodePoint({str, char_byte_size - 1});
128     EXPECT_FALSE(rslt.ok());
129 
130     // Continuation bits are incorrect.
131     std::string bad_continuation{str};
132     uint8_t last_byte = static_cast<uint8_t>(bad_continuation.back());
133     last_byte &= 0x7F;
134     bad_continuation.back() = static_cast<char>(last_byte);
135     rslt = pw::utf8::ReadCodePoint(bad_continuation);
136     EXPECT_FALSE(rslt.ok());
137   }
138 
139   {
140     const char str[] = {
141         char(0xFF), char(0xFF), char(0xFF), char(0xFF), char(0)};
142 
143     auto rslt = pw::utf8::ReadCodePoint(str);
144     EXPECT_FALSE(rslt.ok());
145     EXPECT_TRUE(rslt.status().IsInvalidArgument());
146   }
147 
148   {
149     const char str[] = "";
150 
151     auto rslt = pw::utf8::ReadCodePoint(str);
152     EXPECT_FALSE(rslt.ok());
153     EXPECT_TRUE(rslt.status().IsInvalidArgument());
154   }
155 
156   {
157     // Encode a code point that ends up being an invalid utf-8 encoding.
158     const uint32_t invalid_utf8_code_point = 0xD800u + 1u;
159     auto encoded = pw::utf8::EncodeCodePoint(invalid_utf8_code_point);
160     EXPECT_TRUE(encoded.ok());
161 
162     // Reading it back should fail validation.
163     auto rslt = pw::utf8::ReadCodePoint(encoded->as_view());
164     EXPECT_FALSE(rslt.ok());
165     EXPECT_TRUE(rslt.status().IsOutOfRange());
166   }
167 }
168 
// Confirms that ReadCodePoint(), IsStringValid() and EncodeCodePoint() can
// all be evaluated in constant expressions, using "$" (U+0024) as the sample.
// The static_asserts are the real checks; the EXPECT_* lines repeat the same
// facts at run time.
TEST(UtfCodecs, FunctionsAreConstexpr) {
  constexpr char kDollar[] = "$";
  constexpr size_t kDollarByteSize = sizeof(kDollar) - 1;
  constexpr uint32_t kDollarCodePoint = 0x0024u;

  // Decoding at compile time.
  constexpr pw::Result<pw::utf::CodePointAndSize> decoded =
      pw::utf8::ReadCodePoint(kDollar);
  static_assert(decoded.ok());

  EXPECT_TRUE(decoded.ok());
  EXPECT_EQ(decoded->size(), kDollarByteSize);
  EXPECT_EQ(decoded->code_point(), kDollarCodePoint);

  // Validation at compile time.
  constexpr bool is_valid = pw::utf8::IsStringValid(kDollar);
  static_assert(is_valid);

  EXPECT_TRUE(is_valid);

  // Encoding at compile time.
  constexpr auto encoded = pw::utf8::EncodeCodePoint(kDollarCodePoint);
  static_assert(encoded.ok());
}
192 
TEST(UtfCodecs,WriteCodePoint)193 TEST(UtfCodecs, WriteCodePoint) {
194   {
195     const char str[] = "$";
196     const size_t char_byte_size = sizeof(str) - 1;
197     const uint32_t code_point = 0x0024u;
198 
199     std::array<char, 2> buffer{};
200     pw::StringBuilder out(buffer);
201     auto rslt = pw::utf8::WriteCodePoint(code_point, out);
202     EXPECT_TRUE(rslt.ok());
203     EXPECT_EQ(out.size(), char_byte_size);
204     EXPECT_EQ(out, std::string_view(str, char_byte_size));
205   }
206 
207   {
208     const char str[] = "£";
209     const size_t char_byte_size = sizeof(str) - 1;
210     const uint32_t code_point = 0x00A3u;
211 
212     std::array<char, 3> buffer{};
213     pw::StringBuilder out(buffer);
214     auto rslt = pw::utf8::WriteCodePoint(code_point, out);
215     EXPECT_TRUE(rslt.ok());
216     EXPECT_EQ(out.size(), char_byte_size);
217     EXPECT_EQ(out, std::string_view(str, char_byte_size));
218   }
219 
220   {
221     const char str[] = "€";
222     const size_t char_byte_size = sizeof(str) - 1;
223     const uint32_t code_point = 0x20ACu;
224 
225     std::array<char, 4> buffer{};
226     pw::StringBuilder out(buffer);
227     auto rslt = pw::utf8::WriteCodePoint(code_point, out);
228     EXPECT_TRUE(rslt.ok());
229     EXPECT_EQ(out.size(), char_byte_size);
230     EXPECT_EQ(out, std::string_view(str, char_byte_size));
231   }
232 
233   {
234     const char str[] = "��";
235     const size_t char_byte_size = sizeof(str) - 1;
236     const uint32_t code_point = 0x10348u;
237 
238     std::array<char, 5> buffer{};
239     pw::StringBuilder out(buffer);
240     auto rslt = pw::utf8::WriteCodePoint(code_point, out);
241     EXPECT_TRUE(rslt.ok());
242     EXPECT_EQ(out.size(), char_byte_size);
243     EXPECT_EQ(out, std::string_view(str, char_byte_size));
244   }
245 
246   {
247     const uint32_t code_point = 0xFFFFFFFFu;
248 
249     std::array<char, 4> buffer{};
250     pw::StringBuilder out(buffer);
251     EXPECT_FALSE(pw::utf8::WriteCodePoint(code_point, out).ok());
252     // Nothing should be written.
253     EXPECT_EQ(out.view(), std::string_view{});
254   }
255 
256   {
257     const char str[] = "��";
258     const uint32_t code_point = 0x10348u;
259 
260     std::array<char, 3> buffer{};
261     pw::StringBuilder out(buffer);
262     // Buffer was too small so it should have a failure status.
263     EXPECT_FALSE(pw::utf8::WriteCodePoint(code_point, out).ok());
264     // We expect the first two code units to be written.
265     EXPECT_EQ(out.view(), std::string_view(str, 2));
266   }
267 }
268 
// Exercises utf8::EncodeCodePoint() for a one-, two-, three- and four-byte
// encoding, and for a value that is not a code point.
//
// A successful encode is checked with ASSERT_TRUE rather than EXPECT_TRUE
// because the result is dereferenced on the next line.
TEST(UtfCodecs, EncodeCodePoint) {
  {
    // U+0024, one byte.
    const char str[] = "$";
    const size_t char_byte_size = sizeof(str) - 1;
    const uint32_t code_point = 0x0024u;

    auto encoded = pw::utf8::EncodeCodePoint(code_point);
    ASSERT_TRUE(encoded.ok());
    EXPECT_EQ(encoded->as_view(), std::string_view(str, char_byte_size));
  }

  {
    // U+00A3, two bytes.
    const char str[] = "£";
    const size_t char_byte_size = sizeof(str) - 1;
    const uint32_t code_point = 0x00A3u;

    auto encoded = pw::utf8::EncodeCodePoint(code_point);
    ASSERT_TRUE(encoded.ok());
    EXPECT_EQ(encoded->as_view(), std::string_view(str, char_byte_size));
  }

  {
    // U+20AC, three bytes.
    const char str[] = "€";
    const size_t char_byte_size = sizeof(str) - 1;
    const uint32_t code_point = 0x20ACu;

    auto encoded = pw::utf8::EncodeCodePoint(code_point);
    ASSERT_TRUE(encoded.ok());
    EXPECT_EQ(encoded->as_view(), std::string_view(str, char_byte_size));
  }

  {
    // U+10348, four bytes (F0 90 8D 88). Written as escaped bytes so the
    // literal survives any source-encoding round trip.
    const char str[] = "\xF0\x90\x8D\x88";
    const size_t char_byte_size = sizeof(str) - 1;
    const uint32_t code_point = 0x10348u;

    auto encoded = pw::utf8::EncodeCodePoint(code_point);
    ASSERT_TRUE(encoded.ok());
    EXPECT_EQ(encoded->as_view(), std::string_view(str, char_byte_size));
  }

  {
    // Far beyond U+10FFFF: not encodable.
    const uint32_t code_point = 0xFFFFFFFFu;

    EXPECT_FALSE(pw::utf8::EncodeCodePoint(code_point).ok());
  }
}
316 
317 }  // namespace
318 }  // namespace pw
319