1 // Scintilla source code edit control
2 /** @file UniConversion.cxx
3 ** Functions to handle UTF-8 and UTF-16 strings.
4 **/
5 // Copyright 1998-2001 by Neil Hodgson <[email protected]>
6 // The License.txt file describes the conditions under which this software may be distributed.
7
8 #include <cstdlib>
9
10 #include <stdexcept>
11 #include <string>
12 #include <string_view>
13
14 #include "UniConversion.h"
15
16 using namespace Scintilla;
17
18 namespace Scintilla {
19
UTF8Length(std::wstring_view wsv)20 size_t UTF8Length(std::wstring_view wsv) noexcept {
21 size_t len = 0;
22 for (size_t i = 0; i < wsv.length() && wsv[i];) {
23 const unsigned int uch = wsv[i];
24 if (uch < 0x80) {
25 len++;
26 } else if (uch < 0x800) {
27 len += 2;
28 } else if ((uch >= SURROGATE_LEAD_FIRST) &&
29 (uch <= SURROGATE_TRAIL_LAST)) {
30 len += 4;
31 i++;
32 } else {
33 len += 3;
34 }
35 i++;
36 }
37 return len;
38 }
39
UTF8PositionFromUTF16Position(std::string_view u8Text,size_t positionUTF16)40 size_t UTF8PositionFromUTF16Position(std::string_view u8Text, size_t positionUTF16) noexcept {
41 size_t positionUTF8 = 0;
42 for (size_t lengthUTF16 = 0; (positionUTF8 < u8Text.length()) && (lengthUTF16 < positionUTF16);) {
43 const unsigned char uch = u8Text[positionUTF8];
44 const unsigned int byteCount = UTF8BytesOfLead[uch];
45 lengthUTF16 += UTF16LengthFromUTF8ByteCount(byteCount);
46 positionUTF8 += byteCount;
47 }
48
49 return positionUTF8;
50 }
51
UTF8FromUTF16(std::wstring_view wsv,char * putf,size_t len)52 void UTF8FromUTF16(std::wstring_view wsv, char *putf, size_t len) noexcept {
53 size_t k = 0;
54 for (size_t i = 0; i < wsv.length() && wsv[i];) {
55 const unsigned int uch = wsv[i];
56 if (uch < 0x80) {
57 putf[k++] = static_cast<char>(uch);
58 } else if (uch < 0x800) {
59 putf[k++] = static_cast<char>(0xC0 | (uch >> 6));
60 putf[k++] = static_cast<char>(0x80 | (uch & 0x3f));
61 } else if ((uch >= SURROGATE_LEAD_FIRST) &&
62 (uch <= SURROGATE_TRAIL_LAST)) {
63 // Half a surrogate pair
64 i++;
65 const unsigned int xch = 0x10000 + ((uch & 0x3ff) << 10) + (wsv[i] & 0x3ff);
66 putf[k++] = static_cast<char>(0xF0 | (xch >> 18));
67 putf[k++] = static_cast<char>(0x80 | ((xch >> 12) & 0x3f));
68 putf[k++] = static_cast<char>(0x80 | ((xch >> 6) & 0x3f));
69 putf[k++] = static_cast<char>(0x80 | (xch & 0x3f));
70 } else {
71 putf[k++] = static_cast<char>(0xE0 | (uch >> 12));
72 putf[k++] = static_cast<char>(0x80 | ((uch >> 6) & 0x3f));
73 putf[k++] = static_cast<char>(0x80 | (uch & 0x3f));
74 }
75 i++;
76 }
77 if (k < len)
78 putf[k] = '\0';
79 }
80
// Encode a single character value as UTF-8 followed by a terminating NUL.
// Between 1 and 4 encoded bytes are written depending on the magnitude of uch,
// so putf must have room for 5 chars. The value is not checked for validity.
void UTF8FromUTF32Character(int uch, char *putf) noexcept {
	char *out = putf;
	if (uch < 0x80) {
		// Single byte
		*out++ = static_cast<char>(uch);
	} else if (uch < 0x800) {
		// Lead byte carries the top 5 bits, trail byte the low 6
		*out++ = static_cast<char>(0xC0 | (uch >> 6));
		*out++ = static_cast<char>(0x80 | (uch & 0x3f));
	} else if (uch < 0x10000) {
		*out++ = static_cast<char>(0xE0 | (uch >> 12));
		*out++ = static_cast<char>(0x80 | ((uch >> 6) & 0x3f));
		*out++ = static_cast<char>(0x80 | (uch & 0x3f));
	} else {
		*out++ = static_cast<char>(0xF0 | (uch >> 18));
		*out++ = static_cast<char>(0x80 | ((uch >> 12) & 0x3f));
		*out++ = static_cast<char>(0x80 | ((uch >> 6) & 0x3f));
		*out++ = static_cast<char>(0x80 | (uch & 0x3f));
	}
	*out = '\0';
}
100
UTF16Length(std::string_view svu8)101 size_t UTF16Length(std::string_view svu8) noexcept {
102 size_t ulen = 0;
103 for (size_t i = 0; i< svu8.length();) {
104 const unsigned char ch = svu8[i];
105 const unsigned int byteCount = UTF8BytesOfLead[ch];
106 const unsigned int utf16Len = UTF16LengthFromUTF8ByteCount(byteCount);
107 i += byteCount;
108 ulen += (i > svu8.length()) ? 1 : utf16Len;
109 }
110 return ulen;
111 }
112
// Extract the value carried by a UTF-8 trail byte.
// A trail byte has the form 0b10xx'xxxx: clearing the two marker bits
// leaves the 6 value bits.
constexpr unsigned char TrailByteValue(unsigned char c) {
	constexpr unsigned int trailValueMask = 0x3F;
	return static_cast<unsigned char>(c & trailValueMask);
}
118
UTF16FromUTF8(std::string_view svu8,wchar_t * tbuf,size_t tlen)119 size_t UTF16FromUTF8(std::string_view svu8, wchar_t *tbuf, size_t tlen) {
120 size_t ui = 0;
121 for (size_t i = 0; i < svu8.length();) {
122 unsigned char ch = svu8[i];
123 const unsigned int byteCount = UTF8BytesOfLead[ch];
124 unsigned int value;
125
126 if (i + byteCount > svu8.length()) {
127 // Trying to read past end but still have space to write
128 if (ui < tlen) {
129 tbuf[ui] = ch;
130 ui++;
131 }
132 break;
133 }
134
135 const size_t outLen = UTF16LengthFromUTF8ByteCount(byteCount);
136 if (ui + outLen > tlen) {
137 throw std::runtime_error("UTF16FromUTF8: attempted write beyond end");
138 }
139
140 i++;
141 switch (byteCount) {
142 case 1:
143 tbuf[ui] = ch;
144 break;
145 case 2:
146 value = (ch & 0x1F) << 6;
147 ch = svu8[i++];
148 value += TrailByteValue(ch);
149 tbuf[ui] = static_cast<wchar_t>(value);
150 break;
151 case 3:
152 value = (ch & 0xF) << 12;
153 ch = svu8[i++];
154 value += (TrailByteValue(ch) << 6);
155 ch = svu8[i++];
156 value += TrailByteValue(ch);
157 tbuf[ui] = static_cast<wchar_t>(value);
158 break;
159 default:
160 // Outside the BMP so need two surrogates
161 value = (ch & 0x7) << 18;
162 ch = svu8[i++];
163 value += TrailByteValue(ch) << 12;
164 ch = svu8[i++];
165 value += TrailByteValue(ch) << 6;
166 ch = svu8[i++];
167 value += TrailByteValue(ch);
168 tbuf[ui] = static_cast<wchar_t>(((value - 0x10000) >> 10) + SURROGATE_LEAD_FIRST);
169 ui++;
170 tbuf[ui] = static_cast<wchar_t>((value & 0x3ff) + SURROGATE_TRAIL_FIRST);
171 break;
172 }
173 ui++;
174 }
175 return ui;
176 }
177
UTF32Length(std::string_view svu8)178 size_t UTF32Length(std::string_view svu8) noexcept {
179 size_t ulen = 0;
180 for (size_t i = 0; i < svu8.length();) {
181 const unsigned char ch = svu8[i];
182 const unsigned int byteCount = UTF8BytesOfLead[ch];
183 i += byteCount;
184 ulen++;
185 }
186 return ulen;
187 }
188
UTF32FromUTF8(std::string_view svu8,unsigned int * tbuf,size_t tlen)189 size_t UTF32FromUTF8(std::string_view svu8, unsigned int *tbuf, size_t tlen) {
190 size_t ui = 0;
191 for (size_t i = 0; i < svu8.length();) {
192 unsigned char ch = svu8[i];
193 const unsigned int byteCount = UTF8BytesOfLead[ch];
194 unsigned int value;
195
196 if (i + byteCount > svu8.length()) {
197 // Trying to read past end but still have space to write
198 if (ui < tlen) {
199 tbuf[ui] = ch;
200 ui++;
201 }
202 break;
203 }
204
205 if (ui == tlen) {
206 throw std::runtime_error("UTF32FromUTF8: attempted write beyond end");
207 }
208
209 i++;
210 switch (byteCount) {
211 case 1:
212 value = ch;
213 break;
214 case 2:
215 value = (ch & 0x1F) << 6;
216 ch = svu8[i++];
217 value += TrailByteValue(ch);
218 break;
219 case 3:
220 value = (ch & 0xF) << 12;
221 ch = svu8[i++];
222 value += TrailByteValue(ch) << 6;
223 ch = svu8[i++];
224 value += TrailByteValue(ch);
225 break;
226 default:
227 value = (ch & 0x7) << 18;
228 ch = svu8[i++];
229 value += TrailByteValue(ch) << 12;
230 ch = svu8[i++];
231 value += TrailByteValue(ch) << 6;
232 ch = svu8[i++];
233 value += TrailByteValue(ch);
234 break;
235 }
236 tbuf[ui] = value;
237 ui++;
238 }
239 return ui;
240 }
241
WStringFromUTF8(std::string_view svu8)242 std::wstring WStringFromUTF8(std::string_view svu8) {
243 if constexpr (sizeof(wchar_t) == 2) {
244 const size_t len16 = UTF16Length(svu8);
245 std::wstring ws(len16, 0);
246 UTF16FromUTF8(svu8, &ws[0], len16);
247 return ws;
248 } else {
249 const size_t len32 = UTF32Length(svu8);
250 std::wstring ws(len32, 0);
251 UTF32FromUTF8(svu8, reinterpret_cast<unsigned int *>(&ws[0]), len32);
252 return ws;
253 }
254 }
255
UTF16FromUTF32Character(unsigned int val,wchar_t * tbuf)256 unsigned int UTF16FromUTF32Character(unsigned int val, wchar_t *tbuf) noexcept {
257 if (val < SUPPLEMENTAL_PLANE_FIRST) {
258 tbuf[0] = static_cast<wchar_t>(val);
259 return 1;
260 } else {
261 tbuf[0] = static_cast<wchar_t>(((val - SUPPLEMENTAL_PLANE_FIRST) >> 10) + SURROGATE_LEAD_FIRST);
262 tbuf[1] = static_cast<wchar_t>((val & 0x3ff) + SURROGATE_TRAIL_FIRST);
263 return 2;
264 }
265 }
266
// Length in bytes of the UTF-8 sequence introduced by each possible byte value.
// Bytes that cannot begin a multi-byte sequence map to 1 so that they are stepped
// over one at a time: ASCII, the trail bytes 0x80-0xBF, 0xC0 and 0xC1, and 0xF5-0xFF.
const unsigned char UTF8BytesOfLead[256] = {
	// 0x00 - 0x7F: ASCII
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	// 0x80 - 0xBF: trail bytes
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	// 0xC0 - 0xDF: 2 byte leads except for 0xC0 and 0xC1
	1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	// 0xE0 - 0xEF: 3 byte leads
	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
	// 0xF0 - 0xFF: 4 byte leads up to 0xF4
	4, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
};
285
286 // Return both the width of the first character in the string and a status
287 // saying whether it is valid or invalid.
288 // Most invalid sequences return a width of 1 so are treated as isolated bytes but
289 // the non-characters *FFFE, *FFFF and FDD0 .. FDEF return 3 or 4 as they can be
290 // reasonably treated as code points in some circumstances. They will, however,
291 // not have associated glyphs.
UTF8Classify(const unsigned char * us,size_t len)292 int UTF8Classify(const unsigned char *us, size_t len) noexcept {
293 // For the rules: http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8
294 if (us[0] < 0x80) {
295 // ASCII
296 return 1;
297 }
298
299 const size_t byteCount = UTF8BytesOfLead[us[0]];
300 if (byteCount == 1 || byteCount > len) {
301 // Invalid lead byte
302 return UTF8MaskInvalid | 1;
303 }
304
305 if (!UTF8IsTrailByte(us[1])) {
306 // Invalid trail byte
307 return UTF8MaskInvalid | 1;
308 }
309
310 switch (byteCount) {
311 case 2:
312 return 2;
313
314 case 3:
315 if (UTF8IsTrailByte(us[2])) {
316 if ((*us == 0xe0) && ((us[1] & 0xe0) == 0x80)) {
317 // Overlong
318 return UTF8MaskInvalid | 1;
319 }
320 if ((*us == 0xed) && ((us[1] & 0xe0) == 0xa0)) {
321 // Surrogate
322 return UTF8MaskInvalid | 1;
323 }
324 if ((*us == 0xef) && (us[1] == 0xbf) && (us[2] == 0xbe)) {
325 // U+FFFE non-character - 3 bytes long
326 return UTF8MaskInvalid | 3;
327 }
328 if ((*us == 0xef) && (us[1] == 0xbf) && (us[2] == 0xbf)) {
329 // U+FFFF non-character - 3 bytes long
330 return UTF8MaskInvalid | 3;
331 }
332 if ((*us == 0xef) && (us[1] == 0xb7) && (((us[2] & 0xf0) == 0x90) || ((us[2] & 0xf0) == 0xa0))) {
333 // U+FDD0 .. U+FDEF
334 return UTF8MaskInvalid | 3;
335 }
336 return 3;
337 }
338 break;
339
340 default:
341 if (UTF8IsTrailByte(us[2]) && UTF8IsTrailByte(us[3])) {
342 if (((us[1] & 0xf) == 0xf) && (us[2] == 0xbf) && ((us[3] == 0xbe) || (us[3] == 0xbf))) {
343 // *FFFE or *FFFF non-character
344 return UTF8MaskInvalid | 4;
345 }
346 if (*us == 0xf4) {
347 // Check if encoding a value beyond the last Unicode character 10FFFF
348 if (us[1] > 0x8f) {
349 return UTF8MaskInvalid | 1;
350 }
351 } else if ((*us == 0xf0) && ((us[1] & 0xf0) == 0x80)) {
352 // Overlong
353 return UTF8MaskInvalid | 1;
354 }
355 return 4;
356 }
357 break;
358 }
359
360 return UTF8MaskInvalid | 1;
361 }
362
UTF8DrawBytes(const unsigned char * us,int len)363 int UTF8DrawBytes(const unsigned char *us, int len) noexcept {
364 const int utf8StatusNext = UTF8Classify(us, len);
365 return (utf8StatusNext & UTF8MaskInvalid) ? 1 : (utf8StatusNext & UTF8MaskWidth);
366 }
367
UTF8IsValid(std::string_view svu8)368 bool UTF8IsValid(std::string_view svu8) noexcept {
369 const unsigned char *us = reinterpret_cast<const unsigned char *>(svu8.data());
370 size_t remaining = svu8.length();
371 while (remaining > 0) {
372 const int utf8Status = UTF8Classify(us, remaining);
373 if (utf8Status & UTF8MaskInvalid) {
374 return false;
375 } else {
376 const int lenChar = utf8Status & UTF8MaskWidth;
377 us += lenChar;
378 remaining -= lenChar;
379 }
380 }
381 return remaining == 0;
382 }
383
384 // Replace invalid bytes in UTF-8 with the replacement character
FixInvalidUTF8(const std::string & text)385 std::string FixInvalidUTF8(const std::string &text) {
386 std::string result;
387 const char *s = text.c_str();
388 size_t remaining = text.size();
389 while (remaining > 0) {
390 const int utf8Status = UTF8Classify(reinterpret_cast<const unsigned char *>(s), remaining);
391 if (utf8Status & UTF8MaskInvalid) {
392 // Replacement character 0xFFFD = UTF8:"efbfbd".
393 result.append("\xef\xbf\xbd");
394 s++;
395 remaining--;
396 } else {
397 const size_t len = utf8Status & UTF8MaskWidth;
398 result.append(s, len);
399 s += len;
400 remaining -= len;
401 }
402 }
403 return result;
404 }
405
406 }
407