1 // Copyright 2024 The Pigweed Authors
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
4 // use this file except in compliance with the License. You may obtain a copy of
5 // the License at
6 //
7 // https://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
11 // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
12 // License for the specific language governing permissions and limitations under
13 // the License.
14
15 #pragma once
16
17 /// Provides basic helpers for reading and writing UTF-8 encoded strings.
18
19 #include <array>
20 #include <cstdint>
21 #include <string_view>
22
23 #include "pw_result/result.h"
24 #include "pw_status/status.h"
25 #include "pw_string/string_builder.h"
26
27 namespace pw {
28 namespace utf {
/// Reports whether `code_point` is a Unicode scalar value.
///
/// Two ranges are rejected: the UTF-16 surrogate block (`[0xD800, 0xDFFF]`)
/// and everything above `0x10FFFF`, the highest codepoint allowed.
/// Non-characters and unassigned codepoints are still accepted.
///
/// @param code_point The value to classify.
/// @return `true` when `code_point` lies in `[0, 0xD7FF]` or
///     `[0xE000, 0x10FFFF]`, `false` otherwise.
constexpr inline bool IsValidCodepoint(uint32_t code_point) {
  // Anything beyond the last Unicode plane can never be valid.
  if (code_point > 0x10FFFFu) {
    return false;
  }
  // Within the Unicode range, only the surrogate block is excluded.
  const bool in_surrogate_block =
      code_point >= 0xD800u && code_point <= 0xDFFFu;
  return !in_surrogate_block;
}
38
/// Reports whether `code_point` is a valid character.
///
/// A valid character is a valid code point (not a surrogate, not above
/// `0x10FFFF`) that is also not a non-character. The non-characters are
/// `U+FDD0..U+FDEF` and every codepoint whose low 16 bits are `0xFFFE` or
/// `0xFFFF`.
///
/// @param code_point The value to classify.
/// @return `true` when `code_point` is a valid, non-"non-character" code point.
constexpr inline bool IsValidCharacter(uint32_t code_point) {
  // Beyond the last Unicode plane.
  if (code_point > 0x10FFFFu) {
    return false;
  }
  // Surrogate block.
  if (code_point >= 0xD800u && code_point <= 0xDFFFu) {
    return false;
  }
  // Contiguous block of non-characters.
  if (code_point >= 0xFDD0u && code_point <= 0xFDEFu) {
    return false;
  }
  // Clearing the lowest bit maps both xxFFFE and xxFFFF onto 0xFFFE, so one
  // comparison covers the last two codepoints of every plane.
  return (code_point & 0xFFFEu) != 0xFFFEu;
}
49
/// @class CodePointAndSize
///
/// Provides a combined view of a valid codepoint and the number of bytes its
/// encoding requires. The maximum valid codepoint is `0x10FFFFU`, which
/// requires 21 bits to represent. This combined view keeps the codepoint in
/// the low 28 bits of a 32-bit word and uses the top 4 bits to hold the number
/// of bytes required to represent the codepoint when UTF encoded.
class CodePointAndSize final {
 public:
  /// Creates a combined view of a code point and its encoded size.
  ///
  /// @param code_point The code point to store. Only the low 28 bits are kept,
  ///     so stray high bits cannot leak into the stored size.
  /// @param size The encoded length in bytes. Only the low 4 bits survive the
  ///     shift into the size field.
  explicit constexpr CodePointAndSize(uint32_t code_point, size_t size)
      : code_point_((static_cast<uint32_t>(size) << kSizeShift) |
                    (code_point & kCodePointMask)) {}

  constexpr CodePointAndSize(const CodePointAndSize&) = default;
  constexpr CodePointAndSize& operator=(const CodePointAndSize&) = default;
  constexpr CodePointAndSize(CodePointAndSize&&) = default;
  constexpr CodePointAndSize& operator=(CodePointAndSize&&) = default;

  /// Returns the code point this represents.
  constexpr uint32_t code_point() const { return code_point_ & kCodePointMask; }

  /// Returns the number of bytes required to encode this codepoint.
  constexpr size_t size() const {
    return (code_point_ & kSizeMask) >> kSizeShift;
  }

 private:
  /// Number of high bits reserved for the encoded size.
  static constexpr size_t kSizeBits = 4;
  /// Selects the code point portion (low 28 bits) of the packed word.
  static constexpr uint32_t kCodePointMask = ~0U >> kSizeBits;
  /// Selects the size portion (top 4 bits) of the packed word.
  static constexpr uint32_t kSizeMask = ~kCodePointMask;
  /// Distance the size is shifted left to land in the top bits.
  static constexpr size_t kSizeShift = sizeof(uint32_t) * 8 - kSizeBits;
  /// Packed storage: size in the top 4 bits, code point in the remainder.
  uint32_t code_point_;
};
83 } // namespace utf
84
85 namespace utf8 {
86 /// @brief Reads the first code point from a UTF-8 encoded `str`.
87 ///
88 /// This is a very basic decoder without much thought for performance and very
89 /// basic validation that the correct number of bytes are available and that
90 /// each byte of a multibyte sequence has a continuation character. See
91 /// `pw::utf8::EncodeCharacter()` for encoding details.
92 ///
93 /// @return @rst
94 ///
95 /// .. pw-status-codes::
96 ///
97 /// OK: The decoded code point and the number of bytes read.
98 ///
99 /// INVALID_ARGUMENT: The string was empty or malformed.
100 ///
101 /// OUT_OF_RANGE: The decoded code point was not in the valid range.
102 ///
103 /// @endrst
ReadCodePoint(std::string_view str)104 constexpr pw::Result<utf::CodePointAndSize> ReadCodePoint(
105 std::string_view str) {
106 if (str.empty()) {
107 return pw::Status::InvalidArgument();
108 }
109
110 const uint8_t leading_byte = static_cast<uint8_t>(str.front());
111 size_t byte_count = 0;
112 uint32_t code_point = 0xFFFFFFFFu;
113
114 if (leading_byte <= 0x7F) {
115 byte_count = 1;
116 // b0xxx xxxx
117 code_point = leading_byte;
118 } else if (leading_byte <= 0xDF) {
119 byte_count = 2;
120 if (str.size() < byte_count) {
121 return pw::Status::InvalidArgument();
122 }
123 // b110x xxxx 10xx xxxx
124 if ((str[1] & 0xC0) != 0x80) {
125 // Invalid continuation
126 return pw::Status::InvalidArgument();
127 }
128 code_point = (static_cast<uint32_t>(str[0] & 0x1F) << 6) +
129 static_cast<uint32_t>(str[1] & 0x3F);
130 } else if (leading_byte <= 0xEF) {
131 byte_count = 3;
132 if (str.size() < byte_count) {
133 return pw::Status::InvalidArgument();
134 }
135 if ((str[1] & 0xC0) != 0x80 || (str[2] & 0xC0) != 0x80) {
136 // Invalid continuation
137 return pw::Status::InvalidArgument();
138 }
139 // b1110 xxxx 10xx xxxx 10xx xxxx
140 code_point = (static_cast<uint32_t>(str[0] & 0x0F) << 12) +
141 (static_cast<uint32_t>(str[1] & 0x3F) << 6) +
142 static_cast<uint32_t>(str[2] & 0x3F);
143 } else if (leading_byte <= 0xF7) {
144 byte_count = 4;
145 if (str.size() < byte_count) {
146 return pw::Status::InvalidArgument();
147 }
148 if ((str[1] & 0xC0) != 0x80 || (str[2] & 0xC0) != 0x80 ||
149 (str[3] & 0xC0) != 0x80) {
150 // Invalid continuation
151 return pw::Status::InvalidArgument();
152 }
153 // b1111 0xxx 10xx xxxx 10xx xxxx 10xx xxxx
154 code_point = (static_cast<uint32_t>(str[0] & 0x07) << 18) +
155 (static_cast<uint32_t>(str[1] & 0x3F) << 12) +
156 (static_cast<uint32_t>(str[2] & 0x3F) << 6) +
157 static_cast<uint32_t>(str[3] & 0x3F);
158 } else {
159 return pw::Status::InvalidArgument();
160 }
161
162 // Validate the decoded value.
163 if (utf::IsValidCodepoint(code_point)) {
164 return utf::CodePointAndSize(code_point, byte_count);
165 }
166
167 return pw::Status::OutOfRange();
168 }
169
170 /// Determines if `str` is a valid UTF-8 string.
IsStringValid(std::string_view str)171 constexpr bool IsStringValid(std::string_view str) {
172 while (!str.empty()) {
173 auto rslt = utf8::ReadCodePoint(str);
174 if (!rslt.ok() || !utf::IsValidCharacter(rslt->code_point())) {
175 return false;
176 }
177 str = str.substr(rslt->size());
178 }
179 return true;
180 }
181
/// Encapsulates the result of encoding a single code point as UTF-8.
///
/// Stores up to four encoded bytes inline together with the number of bytes
/// that are in use. Because a move constructor is declared, the implicit copy
/// operations are not available.
class EncodedCodePoint {
 public:
  /// Creates an encoded code point.
  ///
  /// @param size Number of leading bytes of `data` that are meaningful. Values
  ///     larger than the storage (4 bytes) are clamped so `as_view()` can
  ///     never extend past the internal array.
  /// @param data The encoded bytes; unused trailing bytes are ignored.
  constexpr EncodedCodePoint(uint32_t size, std::array<char, 4> data)
      : size_(size <= kMaxSize ? size : kMaxSize), data_(data) {}
  constexpr EncodedCodePoint(EncodedCodePoint&& encoded) = default;

  /// Returns a view of the encoded bytes.
  ///
  /// The view points into this object's storage, so it must not outlive it.
  constexpr std::string_view as_view() const {
    return std::string_view(data_.data(), size_);
  }

 private:
  /// Capacity of `data_`: UTF-8 uses at most four bytes per code point.
  static constexpr uint32_t kMaxSize = 4;
  /// Number of bytes of `data_` in use.
  uint32_t size_;
  /// Inline storage for the encoded bytes.
  std::array<char, 4> data_;
};
194
195 /// @brief Encodes a single code point as UTF-8.
196 ///
197 /// UTF-8 encodes as 1-4 bytes from a range of `[0, 0x10FFFF]`.
198 ///
199 /// 1-byte encoding has a top bit of zero:
200 /// @code
201 /// [0, 0x7F] 1-bytes: b0xxx xxxx
202 /// @endcode
203 /// N-bytes sequences are denoted by annotating the top N+1 bits of the leading
204 /// byte and then using a 2-bit continuation marker on the following bytes.
205 /// @code
206 /// [0x00080, 0x0007FF] 2-bytes: b110x xxxx 10xx xxxx
207 /// [0x00800, 0x00FFFF] 3-bytes: b1110 xxxx 10xx xxxx 10xx xxxx
208 /// [0x10000, 0x10FFFF] 4-bytes: b1111 0xxx 10xx xxxx 10xx xxxx 10xx xxxx
209 /// @endcode
210 ///
211 /// @return @rst
212 ///
213 /// .. pw-status-codes::
214 ///
215 /// OK: The codepoint encoded as UTF-8.
216 ///
217 /// OUT_OF_RANGE: The code point was not in the valid range for UTF-8
218 /// encoding.
219 ///
220 /// @endrst
EncodeCodePoint(uint32_t code_point)221 constexpr Result<EncodedCodePoint> EncodeCodePoint(uint32_t code_point) {
222 if (code_point <= 0x7F) {
223 return EncodedCodePoint{1, {static_cast<char>(code_point)}};
224 }
225 if (code_point <= 0x7FF) {
226 return EncodedCodePoint{2,
227 {static_cast<char>(0xC0 | (code_point >> 6)),
228 static_cast<char>(0x80 | (code_point & 0x3F))}};
229 }
230 if (code_point <= 0xFFFF) {
231 return EncodedCodePoint{
232 3,
233 {static_cast<char>(0xE0 | (code_point >> 12)),
234 static_cast<char>(0x80 | ((code_point >> 6) & 0x3F)),
235 static_cast<char>(0x80 | (code_point & 0x3F))}};
236 }
237 if (code_point <= 0x10FFFF) {
238 return EncodedCodePoint{
239 4,
240 {static_cast<char>(0xF0 | (code_point >> 18)),
241 static_cast<char>(0x80 | ((code_point >> 12) & 0x3F)),
242 static_cast<char>(0x80 | ((code_point >> 6) & 0x3F)),
243 static_cast<char>(0x80 | (code_point & 0x3F))}};
244 }
245
246 return pw::Status::OutOfRange();
247 }
248
249 /// Helper that writes a code point to the provided `pw::StringBuilder`.
///
/// @param code_point The code point to write.
/// @param output The `pw::StringBuilder` that receives the output.
///
/// @return A `Status` describing the outcome.
///
/// NOTE(review): only the declaration is visible in this header. Presumably
/// the code point is UTF-8 encoded via `EncodeCodePoint()` and appended to
/// `output`, with encoding or append failures reported through the returned
/// `Status` -- confirm against the definition.
250 Status WriteCodePoint(uint32_t code_point, pw::StringBuilder& output);
251
252 } // namespace utf8
253
254 } // namespace pw
255