1 // Copyright 2024 The Pigweed Authors
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
4 // use this file except in compliance with the License. You may obtain a copy of
5 // the License at
6 //
7 // https://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
11 // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
12 // License for the specific language governing permissions and limitations under
13 // the License.
14
15 #include "pw_string/utf_codecs.h"
16
17 #include <array>
18 #include <string>
19 #include <string_view>
20
21 #include "pw_unit_test/framework.h"
22
23 namespace pw {
24 namespace {
25
// Checks utf::IsValidCodepoint on both sides of each boundary the test cares
// about: zero, the UTF-16 surrogate range, and the top of the Unicode range.
TEST(UtfCodecs, IsValidCodepoint) {
  constexpr auto kFirstSurrogate = 0xD800u;
  constexpr auto kFirstAfterSurrogates = 0xE000u;
  constexpr auto kLargestCodepoint = 0x10FFFFu;

  // Lowest value.
  EXPECT_TRUE(utf::IsValidCodepoint(0u));

  // Surrogate range: rejected at both ends, accepted just outside of it.
  EXPECT_TRUE(utf::IsValidCodepoint(kFirstSurrogate - 1u));
  EXPECT_FALSE(utf::IsValidCodepoint(kFirstSurrogate));
  EXPECT_FALSE(utf::IsValidCodepoint(kFirstAfterSurrogates - 1u));
  EXPECT_TRUE(utf::IsValidCodepoint(kFirstAfterSurrogates));

  // Upper limit: the largest code point is accepted, anything above is not.
  EXPECT_TRUE(utf::IsValidCodepoint(kLargestCodepoint));
  EXPECT_FALSE(utf::IsValidCodepoint(kLargestCodepoint + 1));
  EXPECT_FALSE(utf::IsValidCodepoint(0xFFFFFFFFu));
}
36
// Checks utf::IsValidCharacter around the surrogate range, the U+FDD0..U+FDEF
// block, the top of the Unicode range, and two values ending in 0xFFFE.
TEST(UtfCodecs, IsValidCharacter) {
  constexpr auto kFirstSurrogate = 0xD800u;
  constexpr auto kFirstAfterSurrogates = 0xE000u;
  constexpr auto kRejectedBlockFirst = 0xFDD0u;
  constexpr auto kRejectedBlockLast = 0xFDEFu;
  constexpr auto kLargestCodepoint = 0x10FFFFu;

  // Lowest value.
  EXPECT_TRUE(utf::IsValidCharacter(0u));

  // Surrogate range: rejected at both ends, accepted just outside of it.
  EXPECT_TRUE(utf::IsValidCharacter(kFirstSurrogate - 1u));
  EXPECT_FALSE(utf::IsValidCharacter(kFirstSurrogate));
  EXPECT_FALSE(utf::IsValidCharacter(kFirstAfterSurrogates - 1u));
  EXPECT_TRUE(utf::IsValidCharacter(kFirstAfterSurrogates));

  // U+FDD0..U+FDEF: rejected at both ends, accepted just outside of it.
  EXPECT_TRUE(utf::IsValidCharacter(kRejectedBlockFirst - 1u));
  EXPECT_FALSE(utf::IsValidCharacter(kRejectedBlockFirst));
  EXPECT_FALSE(utf::IsValidCharacter(kRejectedBlockLast));
  EXPECT_TRUE(utf::IsValidCharacter(kRejectedBlockLast + 1u));

  // Near and beyond the top of the range.
  EXPECT_TRUE(utf::IsValidCharacter(kLargestCodepoint - 2u));
  EXPECT_FALSE(utf::IsValidCharacter(kLargestCodepoint + 1u));

  // Values whose low 16 bits are 0xFFFE are rejected.
  EXPECT_FALSE(utf::IsValidCharacter(0xFFFEu));
  EXPECT_FALSE(utf::IsValidCharacter(0x1FFFEu));
}
52
// Checks pw::utf8::IsStringValid: ASCII text and the empty string are
// accepted, a buffer of four 0xFF bytes is rejected.
TEST(UtfCodecs, IsStringUTF8) {
  // Accepted inputs.
  EXPECT_TRUE(pw::utf8::IsStringValid("Just some ascii!"));
  EXPECT_TRUE(pw::utf8::IsStringValid("Test"));
  EXPECT_TRUE(pw::utf8::IsStringValid(""));

  // Rejected input: every byte is 0xFF. The buffer has no terminator, so it
  // is handed over as an explicit pointer + length view.
  std::array<char, 4> all_ff_bytes{};
  all_ff_bytes.fill(static_cast<char>(0xFF));
  const std::string_view all_ff_view(all_ff_bytes.data(), all_ff_bytes.size());
  EXPECT_FALSE(pw::utf8::IsStringValid(all_ff_view));
}
61
// Exercises pw::utf8::ReadCodePoint on 1-, 2-, 3- and 4-byte encodings. For
// the multi-byte encodings it also checks that a truncated read and a
// corrupted final byte are rejected. The remaining sub-cases cover input made
// of 0xFF bytes, empty input, and an encoded surrogate value.
//
// Results are checked with ASSERT_TRUE before being dereferenced, so a failed
// read stops the test instead of accessing the value of a non-OK result.
TEST(UtfCodecs, ReadCharacter) {
  {
    // U+0024, one byte.
    const char str[] = "$";
    const size_t char_byte_size = sizeof(str) - 1;

    auto rslt = pw::utf8::ReadCodePoint(str);
    ASSERT_TRUE(rslt.ok());
    EXPECT_EQ(rslt->size(), char_byte_size);
    EXPECT_EQ(rslt->code_point(), 0x0024u);
  }

  {
    // U+00A3, two bytes.
    const char str[] = "£";
    const size_t char_byte_size = sizeof(str) - 1;

    auto rslt = pw::utf8::ReadCodePoint(str);
    ASSERT_TRUE(rslt.ok());
    EXPECT_EQ(rslt->size(), char_byte_size);
    EXPECT_EQ(rslt->code_point(), 0x00A3u);

    // Read too small.
    rslt = pw::utf8::ReadCodePoint({str, char_byte_size - 1});
    EXPECT_FALSE(rslt.ok());

    // Continuation bits are incorrect: clear the top bit of the last byte.
    std::string bad_continuation{str};
    uint8_t last_byte = static_cast<uint8_t>(bad_continuation.back());
    last_byte &= 0x7F;
    bad_continuation.back() = static_cast<char>(last_byte);
    rslt = pw::utf8::ReadCodePoint(bad_continuation);
    EXPECT_FALSE(rslt.ok());
  }

  {
    // U+20AC, three bytes.
    const char str[] = "€";
    const size_t char_byte_size = sizeof(str) - 1;

    auto rslt = pw::utf8::ReadCodePoint(str);
    ASSERT_TRUE(rslt.ok());
    EXPECT_EQ(rslt->size(), char_byte_size);
    EXPECT_EQ(rslt->code_point(), 0x20ACu);

    // Read too small.
    rslt = pw::utf8::ReadCodePoint({str, char_byte_size - 1});
    EXPECT_FALSE(rslt.ok());

    // Continuation bits are incorrect: clear the top bit of the last byte.
    std::string bad_continuation{str};
    uint8_t last_byte = static_cast<uint8_t>(bad_continuation.back());
    last_byte &= 0x7F;
    bad_continuation.back() = static_cast<char>(last_byte);
    rslt = pw::utf8::ReadCodePoint(bad_continuation);
    EXPECT_FALSE(rslt.ok());
  }

  {
    // U+10348, four bytes. Spelled with hex escapes so the literal stays pure
    // ASCII and cannot be dropped or altered by tools that mishandle
    // characters outside the Basic Multilingual Plane.
    const char str[] = "\xF0\x90\x8D\x88";
    const size_t char_byte_size = sizeof(str) - 1;

    auto rslt = pw::utf8::ReadCodePoint(str);
    ASSERT_TRUE(rslt.ok());
    EXPECT_EQ(rslt->size(), char_byte_size);
    EXPECT_EQ(rslt->code_point(), 0x10348u);

    // Read too small.
    rslt = pw::utf8::ReadCodePoint({str, char_byte_size - 1});
    EXPECT_FALSE(rslt.ok());

    // Continuation bits are incorrect: clear the top bit of the last byte.
    std::string bad_continuation{str};
    uint8_t last_byte = static_cast<uint8_t>(bad_continuation.back());
    last_byte &= 0x7F;
    bad_continuation.back() = static_cast<char>(last_byte);
    rslt = pw::utf8::ReadCodePoint(bad_continuation);
    EXPECT_FALSE(rslt.ok());
  }

  {
    // Four 0xFF bytes followed by a terminator.
    const char str[] = {
        char(0xFF), char(0xFF), char(0xFF), char(0xFF), char(0)};

    auto rslt = pw::utf8::ReadCodePoint(str);
    EXPECT_FALSE(rslt.ok());
    EXPECT_TRUE(rslt.status().IsInvalidArgument());
  }

  {
    // Empty input (intentionally an empty literal).
    const char str[] = "";

    auto rslt = pw::utf8::ReadCodePoint(str);
    EXPECT_FALSE(rslt.ok());
    EXPECT_TRUE(rslt.status().IsInvalidArgument());
  }

  {
    // Encode a code point that ends up being an invalid utf-8 encoding.
    const uint32_t invalid_utf8_code_point = 0xD800u + 1u;
    auto encoded = pw::utf8::EncodeCodePoint(invalid_utf8_code_point);
    ASSERT_TRUE(encoded.ok());

    // Reading it back should fail validation.
    auto rslt = pw::utf8::ReadCodePoint(encoded->as_view());
    EXPECT_FALSE(rslt.ok());
    EXPECT_TRUE(rslt.status().IsOutOfRange());
  }
}
168
// Confirms that ReadCodePoint, IsStringValid and EncodeCodePoint can each be
// evaluated in a constant expression, using the one-byte character '$'.
TEST(UtfCodecs, FunctionsAreConstexpr) {
  constexpr char kDollar[] = "$";
  constexpr size_t kDollarByteSize = sizeof(kDollar) - 1;
  constexpr uint32_t kDollarCodePoint = 0x0024u;

  // Decoding: checked at compile time, then again at run time.
  constexpr pw::Result<pw::utf::CodePointAndSize> decoded =
      pw::utf8::ReadCodePoint(kDollar);
  static_assert(decoded.ok());

  EXPECT_TRUE(decoded.ok());
  EXPECT_EQ(decoded->size(), kDollarByteSize);
  EXPECT_EQ(decoded->code_point(), 0x0024u);

  // Validation: checked at compile time, then again at run time.
  constexpr bool is_valid = pw::utf8::IsStringValid(kDollar);
  static_assert(is_valid);

  EXPECT_TRUE(is_valid);

  // Encoding: checked at compile time only.
  constexpr auto encoded_dollar = pw::utf8::EncodeCodePoint(kDollarCodePoint);
  static_assert(encoded_dollar.ok());
}
192
// Exercises pw::utf8::WriteCodePoint into a pw::StringBuilder for 1-, 2-, 3-
// and 4-byte encodings, then for a value that cannot be encoded and for a
// destination buffer that is too small.
TEST(UtfCodecs, WriteCodePoint) {
  {
    // U+0024, one byte.
    const char str[] = "$";
    const size_t char_byte_size = sizeof(str) - 1;
    const uint32_t code_point = 0x0024u;

    std::array<char, 2> buffer{};
    pw::StringBuilder out(buffer);
    auto rslt = pw::utf8::WriteCodePoint(code_point, out);
    EXPECT_TRUE(rslt.ok());
    EXPECT_EQ(out.size(), char_byte_size);
    EXPECT_EQ(out, std::string_view(str, char_byte_size));
  }

  {
    // U+00A3, two bytes.
    const char str[] = "£";
    const size_t char_byte_size = sizeof(str) - 1;
    const uint32_t code_point = 0x00A3u;

    std::array<char, 3> buffer{};
    pw::StringBuilder out(buffer);
    auto rslt = pw::utf8::WriteCodePoint(code_point, out);
    EXPECT_TRUE(rslt.ok());
    EXPECT_EQ(out.size(), char_byte_size);
    EXPECT_EQ(out, std::string_view(str, char_byte_size));
  }

  {
    // U+20AC, three bytes.
    const char str[] = "€";
    const size_t char_byte_size = sizeof(str) - 1;
    const uint32_t code_point = 0x20ACu;

    std::array<char, 4> buffer{};
    pw::StringBuilder out(buffer);
    auto rslt = pw::utf8::WriteCodePoint(code_point, out);
    EXPECT_TRUE(rslt.ok());
    EXPECT_EQ(out.size(), char_byte_size);
    EXPECT_EQ(out, std::string_view(str, char_byte_size));
  }

  {
    // U+10348, four bytes. Spelled with hex escapes so the literal stays pure
    // ASCII and cannot be dropped or altered by tools that mishandle
    // characters outside the Basic Multilingual Plane.
    const char str[] = "\xF0\x90\x8D\x88";
    const size_t char_byte_size = sizeof(str) - 1;
    const uint32_t code_point = 0x10348u;

    std::array<char, 5> buffer{};
    pw::StringBuilder out(buffer);
    auto rslt = pw::utf8::WriteCodePoint(code_point, out);
    EXPECT_TRUE(rslt.ok());
    EXPECT_EQ(out.size(), char_byte_size);
    EXPECT_EQ(out, std::string_view(str, char_byte_size));
  }

  {
    // A value that cannot be encoded.
    const uint32_t code_point = 0xFFFFFFFFu;

    std::array<char, 4> buffer{};
    pw::StringBuilder out(buffer);
    EXPECT_FALSE(pw::utf8::WriteCodePoint(code_point, out).ok());
    // Nothing should be written.
    EXPECT_EQ(out.view(), std::string_view{});
  }

  {
    // U+10348 again (four bytes), written into a buffer that cannot hold it.
    const char str[] = "\xF0\x90\x8D\x88";
    const uint32_t code_point = 0x10348u;

    std::array<char, 3> buffer{};
    pw::StringBuilder out(buffer);
    // Buffer was too small so it should have a failure status.
    EXPECT_FALSE(pw::utf8::WriteCodePoint(code_point, out).ok());
    // We expect the first two code units to be written (presumably because
    // StringBuilder keeps one byte of the 3-byte buffer for a terminator --
    // confirm against the StringBuilder documentation).
    EXPECT_EQ(out.view(), std::string_view(str, 2));
  }
}
268
// Exercises pw::utf8::EncodeCodePoint for 1-, 2-, 3- and 4-byte encodings and
// for a value that cannot be encoded.
//
// Results are checked with ASSERT_TRUE before being dereferenced, so a failed
// encode stops the test instead of accessing the value of a non-OK result.
TEST(UtfCodecs, EncodeCodePoint) {
  {
    // U+0024, one byte.
    const char str[] = "$";
    const size_t char_byte_size = sizeof(str) - 1;
    const uint32_t code_point = 0x0024u;

    auto rslt = pw::utf8::EncodeCodePoint(code_point);
    ASSERT_TRUE(rslt.ok());
    EXPECT_EQ(rslt->as_view(), std::string_view(str, char_byte_size));
  }

  {
    // U+00A3, two bytes.
    const char str[] = "£";
    const size_t char_byte_size = sizeof(str) - 1;
    const uint32_t code_point = 0x00A3u;

    auto rslt = pw::utf8::EncodeCodePoint(code_point);
    ASSERT_TRUE(rslt.ok());
    EXPECT_EQ(rslt->as_view(), std::string_view(str, char_byte_size));
  }

  {
    // U+20AC, three bytes.
    const char str[] = "€";
    const size_t char_byte_size = sizeof(str) - 1;
    const uint32_t code_point = 0x20ACu;

    auto rslt = pw::utf8::EncodeCodePoint(code_point);
    ASSERT_TRUE(rslt.ok());
    EXPECT_EQ(rslt->as_view(), std::string_view(str, char_byte_size));
  }

  {
    // U+10348, four bytes. Spelled with hex escapes so the literal stays pure
    // ASCII and cannot be dropped or altered by tools that mishandle
    // characters outside the Basic Multilingual Plane.
    const char str[] = "\xF0\x90\x8D\x88";
    const size_t char_byte_size = sizeof(str) - 1;
    const uint32_t code_point = 0x10348u;

    auto rslt = pw::utf8::EncodeCodePoint(code_point);
    ASSERT_TRUE(rslt.ok());
    EXPECT_EQ(rslt->as_view(), std::string_view(str, char_byte_size));
  }

  {
    // A value that cannot be encoded.
    const uint32_t code_point = 0xFFFFFFFFu;

    EXPECT_FALSE(pw::utf8::EncodeCodePoint(code_point).ok());
  }
}
316
317 } // namespace
318 } // namespace pw
319