xref: /MusicPlayer2/scintilla/src/UniConversion.cxx (revision 8af74909132ed5e696cb05b6689ae4baf14c1c96)
// Scintilla source code edit control
/** @file UniConversion.cxx
 ** Functions to handle UTF-8 and UTF-16 strings.
 **/
// Copyright 1998-2001 by Neil Hodgson <neilh@scintilla.org>
// The License.txt file describes the conditions under which this software may be distributed.
7*8af74909SZhong Yang 
8*8af74909SZhong Yang #include <cstdlib>
9*8af74909SZhong Yang 
10*8af74909SZhong Yang #include <stdexcept>
11*8af74909SZhong Yang #include <string>
12*8af74909SZhong Yang #include <string_view>
13*8af74909SZhong Yang 
14*8af74909SZhong Yang #include "UniConversion.h"
15*8af74909SZhong Yang 
16*8af74909SZhong Yang using namespace Scintilla;
17*8af74909SZhong Yang 
18*8af74909SZhong Yang namespace Scintilla {
19*8af74909SZhong Yang 
UTF8Length(std::wstring_view wsv)20*8af74909SZhong Yang size_t UTF8Length(std::wstring_view wsv) noexcept {
21*8af74909SZhong Yang 	size_t len = 0;
22*8af74909SZhong Yang 	for (size_t i = 0; i < wsv.length() && wsv[i];) {
23*8af74909SZhong Yang 		const unsigned int uch = wsv[i];
24*8af74909SZhong Yang 		if (uch < 0x80) {
25*8af74909SZhong Yang 			len++;
26*8af74909SZhong Yang 		} else if (uch < 0x800) {
27*8af74909SZhong Yang 			len += 2;
28*8af74909SZhong Yang 		} else if ((uch >= SURROGATE_LEAD_FIRST) &&
29*8af74909SZhong Yang 			(uch <= SURROGATE_TRAIL_LAST)) {
30*8af74909SZhong Yang 			len += 4;
31*8af74909SZhong Yang 			i++;
32*8af74909SZhong Yang 		} else {
33*8af74909SZhong Yang 			len += 3;
34*8af74909SZhong Yang 		}
35*8af74909SZhong Yang 		i++;
36*8af74909SZhong Yang 	}
37*8af74909SZhong Yang 	return len;
38*8af74909SZhong Yang }
39*8af74909SZhong Yang 
UTF8PositionFromUTF16Position(std::string_view u8Text,size_t positionUTF16)40*8af74909SZhong Yang size_t UTF8PositionFromUTF16Position(std::string_view u8Text, size_t positionUTF16) noexcept {
41*8af74909SZhong Yang 	size_t positionUTF8 = 0;
42*8af74909SZhong Yang 	for (size_t lengthUTF16 = 0; (positionUTF8 < u8Text.length()) && (lengthUTF16 < positionUTF16);) {
43*8af74909SZhong Yang 		const unsigned char uch = u8Text[positionUTF8];
44*8af74909SZhong Yang 		const unsigned int byteCount = UTF8BytesOfLead[uch];
45*8af74909SZhong Yang 		lengthUTF16 += UTF16LengthFromUTF8ByteCount(byteCount);
46*8af74909SZhong Yang 		positionUTF8 += byteCount;
47*8af74909SZhong Yang 	}
48*8af74909SZhong Yang 
49*8af74909SZhong Yang 	return positionUTF8;
50*8af74909SZhong Yang }
51*8af74909SZhong Yang 
UTF8FromUTF16(std::wstring_view wsv,char * putf,size_t len)52*8af74909SZhong Yang void UTF8FromUTF16(std::wstring_view wsv, char *putf, size_t len) noexcept {
53*8af74909SZhong Yang 	size_t k = 0;
54*8af74909SZhong Yang 	for (size_t i = 0; i < wsv.length() && wsv[i];) {
55*8af74909SZhong Yang 		const unsigned int uch = wsv[i];
56*8af74909SZhong Yang 		if (uch < 0x80) {
57*8af74909SZhong Yang 			putf[k++] = static_cast<char>(uch);
58*8af74909SZhong Yang 		} else if (uch < 0x800) {
59*8af74909SZhong Yang 			putf[k++] = static_cast<char>(0xC0 | (uch >> 6));
60*8af74909SZhong Yang 			putf[k++] = static_cast<char>(0x80 | (uch & 0x3f));
61*8af74909SZhong Yang 		} else if ((uch >= SURROGATE_LEAD_FIRST) &&
62*8af74909SZhong Yang 			(uch <= SURROGATE_TRAIL_LAST)) {
63*8af74909SZhong Yang 			// Half a surrogate pair
64*8af74909SZhong Yang 			i++;
65*8af74909SZhong Yang 			const unsigned int xch = 0x10000 + ((uch & 0x3ff) << 10) + (wsv[i] & 0x3ff);
66*8af74909SZhong Yang 			putf[k++] = static_cast<char>(0xF0 | (xch >> 18));
67*8af74909SZhong Yang 			putf[k++] = static_cast<char>(0x80 | ((xch >> 12) & 0x3f));
68*8af74909SZhong Yang 			putf[k++] = static_cast<char>(0x80 | ((xch >> 6) & 0x3f));
69*8af74909SZhong Yang 			putf[k++] = static_cast<char>(0x80 | (xch & 0x3f));
70*8af74909SZhong Yang 		} else {
71*8af74909SZhong Yang 			putf[k++] = static_cast<char>(0xE0 | (uch >> 12));
72*8af74909SZhong Yang 			putf[k++] = static_cast<char>(0x80 | ((uch >> 6) & 0x3f));
73*8af74909SZhong Yang 			putf[k++] = static_cast<char>(0x80 | (uch & 0x3f));
74*8af74909SZhong Yang 		}
75*8af74909SZhong Yang 		i++;
76*8af74909SZhong Yang 	}
77*8af74909SZhong Yang 	if (k < len)
78*8af74909SZhong Yang 		putf[k] = '\0';
79*8af74909SZhong Yang }
80*8af74909SZhong Yang 
// Encode the single code point uch as UTF-8 into putf and append a NUL.
// One to four encoded bytes are written followed by the terminator, so putf
// must have room for up to 5 chars.
// Values below 0x80 (including negative ones) are stored as one byte.
void UTF8FromUTF32Character(int uch, char *putf) noexcept {
	char *out = putf;
	if (uch < 0x80) {
		*out++ = static_cast<char>(uch);
	} else {
		// Pick the number of trail bytes and the matching lead byte marker.
		int trailCount = 3;
		int leadMark = 0xF0;
		if (uch < 0x800) {
			trailCount = 1;
			leadMark = 0xC0;
		} else if (uch < 0x10000) {
			trailCount = 2;
			leadMark = 0xE0;
		}
		*out++ = static_cast<char>(leadMark | (uch >> (6 * trailCount)));
		// Each trail byte carries the next 6 bits, most significant first.
		for (int shift = 6 * (trailCount - 1); shift >= 0; shift -= 6) {
			*out++ = static_cast<char>(0x80 | ((uch >> shift) & 0x3f));
		}
	}
	*out = '\0';
}
100*8af74909SZhong Yang 
UTF16Length(std::string_view svu8)101*8af74909SZhong Yang size_t UTF16Length(std::string_view svu8) noexcept {
102*8af74909SZhong Yang 	size_t ulen = 0;
103*8af74909SZhong Yang 	for (size_t i = 0; i< svu8.length();) {
104*8af74909SZhong Yang 		const unsigned char ch = svu8[i];
105*8af74909SZhong Yang 		const unsigned int byteCount = UTF8BytesOfLead[ch];
106*8af74909SZhong Yang 		const unsigned int utf16Len = UTF16LengthFromUTF8ByteCount(byteCount);
107*8af74909SZhong Yang 		i += byteCount;
108*8af74909SZhong Yang 		ulen += (i > svu8.length()) ? 1 : utf16Len;
109*8af74909SZhong Yang 	}
110*8af74909SZhong Yang 	return ulen;
111*8af74909SZhong Yang }
112*8af74909SZhong Yang 
// Extract the payload of a UTF-8 trail byte.
// A trail byte has the form 10xxxxxx: the two high bits mark it as a trail
// byte and the low six bits hold the value, so only those six are kept.
constexpr unsigned char TrailByteValue(unsigned char c) {
	constexpr unsigned char payloadMask = 0x3F;
	return c & payloadMask;
}
118*8af74909SZhong Yang 
UTF16FromUTF8(std::string_view svu8,wchar_t * tbuf,size_t tlen)119*8af74909SZhong Yang size_t UTF16FromUTF8(std::string_view svu8, wchar_t *tbuf, size_t tlen) {
120*8af74909SZhong Yang 	size_t ui = 0;
121*8af74909SZhong Yang 	for (size_t i = 0; i < svu8.length();) {
122*8af74909SZhong Yang 		unsigned char ch = svu8[i];
123*8af74909SZhong Yang 		const unsigned int byteCount = UTF8BytesOfLead[ch];
124*8af74909SZhong Yang 		unsigned int value;
125*8af74909SZhong Yang 
126*8af74909SZhong Yang 		if (i + byteCount > svu8.length()) {
127*8af74909SZhong Yang 			// Trying to read past end but still have space to write
128*8af74909SZhong Yang 			if (ui < tlen) {
129*8af74909SZhong Yang 				tbuf[ui] = ch;
130*8af74909SZhong Yang 				ui++;
131*8af74909SZhong Yang 			}
132*8af74909SZhong Yang 			break;
133*8af74909SZhong Yang 		}
134*8af74909SZhong Yang 
135*8af74909SZhong Yang 		const size_t outLen = UTF16LengthFromUTF8ByteCount(byteCount);
136*8af74909SZhong Yang 		if (ui + outLen > tlen) {
137*8af74909SZhong Yang 			throw std::runtime_error("UTF16FromUTF8: attempted write beyond end");
138*8af74909SZhong Yang 		}
139*8af74909SZhong Yang 
140*8af74909SZhong Yang 		i++;
141*8af74909SZhong Yang 		switch (byteCount) {
142*8af74909SZhong Yang 		case 1:
143*8af74909SZhong Yang 			tbuf[ui] = ch;
144*8af74909SZhong Yang 			break;
145*8af74909SZhong Yang 		case 2:
146*8af74909SZhong Yang 			value = (ch & 0x1F) << 6;
147*8af74909SZhong Yang 			ch = svu8[i++];
148*8af74909SZhong Yang 			value += TrailByteValue(ch);
149*8af74909SZhong Yang 			tbuf[ui] = static_cast<wchar_t>(value);
150*8af74909SZhong Yang 			break;
151*8af74909SZhong Yang 		case 3:
152*8af74909SZhong Yang 			value = (ch & 0xF) << 12;
153*8af74909SZhong Yang 			ch = svu8[i++];
154*8af74909SZhong Yang 			value += (TrailByteValue(ch) << 6);
155*8af74909SZhong Yang 			ch = svu8[i++];
156*8af74909SZhong Yang 			value += TrailByteValue(ch);
157*8af74909SZhong Yang 			tbuf[ui] = static_cast<wchar_t>(value);
158*8af74909SZhong Yang 			break;
159*8af74909SZhong Yang 		default:
160*8af74909SZhong Yang 			// Outside the BMP so need two surrogates
161*8af74909SZhong Yang 			value = (ch & 0x7) << 18;
162*8af74909SZhong Yang 			ch = svu8[i++];
163*8af74909SZhong Yang 			value += TrailByteValue(ch) << 12;
164*8af74909SZhong Yang 			ch = svu8[i++];
165*8af74909SZhong Yang 			value += TrailByteValue(ch) << 6;
166*8af74909SZhong Yang 			ch = svu8[i++];
167*8af74909SZhong Yang 			value += TrailByteValue(ch);
168*8af74909SZhong Yang 			tbuf[ui] = static_cast<wchar_t>(((value - 0x10000) >> 10) + SURROGATE_LEAD_FIRST);
169*8af74909SZhong Yang 			ui++;
170*8af74909SZhong Yang 			tbuf[ui] = static_cast<wchar_t>((value & 0x3ff) + SURROGATE_TRAIL_FIRST);
171*8af74909SZhong Yang 			break;
172*8af74909SZhong Yang 		}
173*8af74909SZhong Yang 		ui++;
174*8af74909SZhong Yang 	}
175*8af74909SZhong Yang 	return ui;
176*8af74909SZhong Yang }
177*8af74909SZhong Yang 
UTF32Length(std::string_view svu8)178*8af74909SZhong Yang size_t UTF32Length(std::string_view svu8) noexcept {
179*8af74909SZhong Yang 	size_t ulen = 0;
180*8af74909SZhong Yang 	for (size_t i = 0; i < svu8.length();) {
181*8af74909SZhong Yang 		const unsigned char ch = svu8[i];
182*8af74909SZhong Yang 		const unsigned int byteCount = UTF8BytesOfLead[ch];
183*8af74909SZhong Yang 		i += byteCount;
184*8af74909SZhong Yang 		ulen++;
185*8af74909SZhong Yang 	}
186*8af74909SZhong Yang 	return ulen;
187*8af74909SZhong Yang }
188*8af74909SZhong Yang 
UTF32FromUTF8(std::string_view svu8,unsigned int * tbuf,size_t tlen)189*8af74909SZhong Yang size_t UTF32FromUTF8(std::string_view svu8, unsigned int *tbuf, size_t tlen) {
190*8af74909SZhong Yang 	size_t ui = 0;
191*8af74909SZhong Yang 	for (size_t i = 0; i < svu8.length();) {
192*8af74909SZhong Yang 		unsigned char ch = svu8[i];
193*8af74909SZhong Yang 		const unsigned int byteCount = UTF8BytesOfLead[ch];
194*8af74909SZhong Yang 		unsigned int value;
195*8af74909SZhong Yang 
196*8af74909SZhong Yang 		if (i + byteCount > svu8.length()) {
197*8af74909SZhong Yang 			// Trying to read past end but still have space to write
198*8af74909SZhong Yang 			if (ui < tlen) {
199*8af74909SZhong Yang 				tbuf[ui] = ch;
200*8af74909SZhong Yang 				ui++;
201*8af74909SZhong Yang 			}
202*8af74909SZhong Yang 			break;
203*8af74909SZhong Yang 		}
204*8af74909SZhong Yang 
205*8af74909SZhong Yang 		if (ui == tlen) {
206*8af74909SZhong Yang 			throw std::runtime_error("UTF32FromUTF8: attempted write beyond end");
207*8af74909SZhong Yang 		}
208*8af74909SZhong Yang 
209*8af74909SZhong Yang 		i++;
210*8af74909SZhong Yang 		switch (byteCount) {
211*8af74909SZhong Yang 		case 1:
212*8af74909SZhong Yang 			value = ch;
213*8af74909SZhong Yang 			break;
214*8af74909SZhong Yang 		case 2:
215*8af74909SZhong Yang 			value = (ch & 0x1F) << 6;
216*8af74909SZhong Yang 			ch = svu8[i++];
217*8af74909SZhong Yang 			value += TrailByteValue(ch);
218*8af74909SZhong Yang 			break;
219*8af74909SZhong Yang 		case 3:
220*8af74909SZhong Yang 			value = (ch & 0xF) << 12;
221*8af74909SZhong Yang 			ch = svu8[i++];
222*8af74909SZhong Yang 			value += TrailByteValue(ch) << 6;
223*8af74909SZhong Yang 			ch = svu8[i++];
224*8af74909SZhong Yang 			value += TrailByteValue(ch);
225*8af74909SZhong Yang 			break;
226*8af74909SZhong Yang 		default:
227*8af74909SZhong Yang 			value = (ch & 0x7) << 18;
228*8af74909SZhong Yang 			ch = svu8[i++];
229*8af74909SZhong Yang 			value += TrailByteValue(ch) << 12;
230*8af74909SZhong Yang 			ch = svu8[i++];
231*8af74909SZhong Yang 			value += TrailByteValue(ch) << 6;
232*8af74909SZhong Yang 			ch = svu8[i++];
233*8af74909SZhong Yang 			value += TrailByteValue(ch);
234*8af74909SZhong Yang 			break;
235*8af74909SZhong Yang 		}
236*8af74909SZhong Yang 		tbuf[ui] = value;
237*8af74909SZhong Yang 		ui++;
238*8af74909SZhong Yang 	}
239*8af74909SZhong Yang 	return ui;
240*8af74909SZhong Yang }
241*8af74909SZhong Yang 
WStringFromUTF8(std::string_view svu8)242*8af74909SZhong Yang std::wstring WStringFromUTF8(std::string_view svu8) {
243*8af74909SZhong Yang 	if constexpr (sizeof(wchar_t) == 2) {
244*8af74909SZhong Yang 		const size_t len16 = UTF16Length(svu8);
245*8af74909SZhong Yang 		std::wstring ws(len16, 0);
246*8af74909SZhong Yang 		UTF16FromUTF8(svu8, &ws[0], len16);
247*8af74909SZhong Yang 		return ws;
248*8af74909SZhong Yang 	} else {
249*8af74909SZhong Yang 		const size_t len32 = UTF32Length(svu8);
250*8af74909SZhong Yang 		std::wstring ws(len32, 0);
251*8af74909SZhong Yang 		UTF32FromUTF8(svu8, reinterpret_cast<unsigned int *>(&ws[0]), len32);
252*8af74909SZhong Yang 		return ws;
253*8af74909SZhong Yang 	}
254*8af74909SZhong Yang }
255*8af74909SZhong Yang 
UTF16FromUTF32Character(unsigned int val,wchar_t * tbuf)256*8af74909SZhong Yang unsigned int UTF16FromUTF32Character(unsigned int val, wchar_t *tbuf) noexcept {
257*8af74909SZhong Yang 	if (val < SUPPLEMENTAL_PLANE_FIRST) {
258*8af74909SZhong Yang 		tbuf[0] = static_cast<wchar_t>(val);
259*8af74909SZhong Yang 		return 1;
260*8af74909SZhong Yang 	} else {
261*8af74909SZhong Yang 		tbuf[0] = static_cast<wchar_t>(((val - SUPPLEMENTAL_PLANE_FIRST) >> 10) + SURROGATE_LEAD_FIRST);
262*8af74909SZhong Yang 		tbuf[1] = static_cast<wchar_t>((val & 0x3ff) + SURROGATE_TRAIL_FIRST);
263*8af74909SZhong Yang 		return 2;
264*8af74909SZhong Yang 	}
265*8af74909SZhong Yang }
266*8af74909SZhong Yang 
// Width in bytes of the UTF-8 sequence introduced by each possible byte.
// Bytes that cannot start a multi-byte sequence (ASCII, trail bytes 80-BF,
// C0, C1 and F5-FF) map to 1 so that they are stepped over one at a time.
const unsigned char UTF8BytesOfLead[256] = {
	// 00 - 7F: ASCII
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	// 80 - BF: trail bytes
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	// C0 - DF: C0 and C1 are 1, C2 onwards lead 2 byte sequences
	1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	// E0 - EF: lead 3 byte sequences
	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
	// F0 - FF: F0 to F4 lead 4 byte sequences, F5 onwards are 1
	4, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
};
285*8af74909SZhong Yang 
286*8af74909SZhong Yang // Return both the width of the first character in the string and a status
287*8af74909SZhong Yang // saying whether it is valid or invalid.
288*8af74909SZhong Yang // Most invalid sequences return a width of 1 so are treated as isolated bytes but
289*8af74909SZhong Yang // the non-characters *FFFE, *FFFF and FDD0 .. FDEF return 3 or 4 as they can be
290*8af74909SZhong Yang // reasonably treated as code points in some circumstances. They will, however,
291*8af74909SZhong Yang // not have associated glyphs.
UTF8Classify(const unsigned char * us,size_t len)292*8af74909SZhong Yang int UTF8Classify(const unsigned char *us, size_t len) noexcept {
293*8af74909SZhong Yang 	// For the rules: http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8
294*8af74909SZhong Yang 	if (us[0] < 0x80) {
295*8af74909SZhong Yang 		// ASCII
296*8af74909SZhong Yang 		return 1;
297*8af74909SZhong Yang 	}
298*8af74909SZhong Yang 
299*8af74909SZhong Yang 	const size_t byteCount = UTF8BytesOfLead[us[0]];
300*8af74909SZhong Yang 	if (byteCount == 1 || byteCount > len) {
301*8af74909SZhong Yang 		// Invalid lead byte
302*8af74909SZhong Yang 		return UTF8MaskInvalid | 1;
303*8af74909SZhong Yang 	}
304*8af74909SZhong Yang 
305*8af74909SZhong Yang 	if (!UTF8IsTrailByte(us[1])) {
306*8af74909SZhong Yang 		// Invalid trail byte
307*8af74909SZhong Yang 		return UTF8MaskInvalid | 1;
308*8af74909SZhong Yang 	}
309*8af74909SZhong Yang 
310*8af74909SZhong Yang 	switch (byteCount) {
311*8af74909SZhong Yang 	case 2:
312*8af74909SZhong Yang 		return 2;
313*8af74909SZhong Yang 
314*8af74909SZhong Yang 	case 3:
315*8af74909SZhong Yang 		if (UTF8IsTrailByte(us[2])) {
316*8af74909SZhong Yang 			if ((*us == 0xe0) && ((us[1] & 0xe0) == 0x80)) {
317*8af74909SZhong Yang 				// Overlong
318*8af74909SZhong Yang 				return UTF8MaskInvalid | 1;
319*8af74909SZhong Yang 			}
320*8af74909SZhong Yang 			if ((*us == 0xed) && ((us[1] & 0xe0) == 0xa0)) {
321*8af74909SZhong Yang 				// Surrogate
322*8af74909SZhong Yang 				return UTF8MaskInvalid | 1;
323*8af74909SZhong Yang 			}
324*8af74909SZhong Yang 			if ((*us == 0xef) && (us[1] == 0xbf) && (us[2] == 0xbe)) {
325*8af74909SZhong Yang 				// U+FFFE non-character - 3 bytes long
326*8af74909SZhong Yang 				return UTF8MaskInvalid | 3;
327*8af74909SZhong Yang 			}
328*8af74909SZhong Yang 			if ((*us == 0xef) && (us[1] == 0xbf) && (us[2] == 0xbf)) {
329*8af74909SZhong Yang 				// U+FFFF non-character - 3 bytes long
330*8af74909SZhong Yang 				return UTF8MaskInvalid | 3;
331*8af74909SZhong Yang 			}
332*8af74909SZhong Yang 			if ((*us == 0xef) && (us[1] == 0xb7) && (((us[2] & 0xf0) == 0x90) || ((us[2] & 0xf0) == 0xa0))) {
333*8af74909SZhong Yang 				// U+FDD0 .. U+FDEF
334*8af74909SZhong Yang 				return UTF8MaskInvalid | 3;
335*8af74909SZhong Yang 			}
336*8af74909SZhong Yang 			return 3;
337*8af74909SZhong Yang 		}
338*8af74909SZhong Yang 		break;
339*8af74909SZhong Yang 
340*8af74909SZhong Yang 	default:
341*8af74909SZhong Yang 		if (UTF8IsTrailByte(us[2]) && UTF8IsTrailByte(us[3])) {
342*8af74909SZhong Yang 			if (((us[1] & 0xf) == 0xf) && (us[2] == 0xbf) && ((us[3] == 0xbe) || (us[3] == 0xbf))) {
343*8af74909SZhong Yang 				// *FFFE or *FFFF non-character
344*8af74909SZhong Yang 				return UTF8MaskInvalid | 4;
345*8af74909SZhong Yang 			}
346*8af74909SZhong Yang 			if (*us == 0xf4) {
347*8af74909SZhong Yang 				// Check if encoding a value beyond the last Unicode character 10FFFF
348*8af74909SZhong Yang 				if (us[1] > 0x8f) {
349*8af74909SZhong Yang 					return UTF8MaskInvalid | 1;
350*8af74909SZhong Yang 				}
351*8af74909SZhong Yang 			} else if ((*us == 0xf0) && ((us[1] & 0xf0) == 0x80)) {
352*8af74909SZhong Yang 				// Overlong
353*8af74909SZhong Yang 				return UTF8MaskInvalid | 1;
354*8af74909SZhong Yang 			}
355*8af74909SZhong Yang 			return 4;
356*8af74909SZhong Yang 		}
357*8af74909SZhong Yang 		break;
358*8af74909SZhong Yang 	}
359*8af74909SZhong Yang 
360*8af74909SZhong Yang 	return UTF8MaskInvalid | 1;
361*8af74909SZhong Yang }
362*8af74909SZhong Yang 
UTF8DrawBytes(const unsigned char * us,int len)363*8af74909SZhong Yang int UTF8DrawBytes(const unsigned char *us, int len) noexcept {
364*8af74909SZhong Yang 	const int utf8StatusNext = UTF8Classify(us, len);
365*8af74909SZhong Yang 	return (utf8StatusNext & UTF8MaskInvalid) ? 1 : (utf8StatusNext & UTF8MaskWidth);
366*8af74909SZhong Yang }
367*8af74909SZhong Yang 
UTF8IsValid(std::string_view svu8)368*8af74909SZhong Yang bool UTF8IsValid(std::string_view svu8) noexcept {
369*8af74909SZhong Yang 	const unsigned char *us = reinterpret_cast<const unsigned char *>(svu8.data());
370*8af74909SZhong Yang 	size_t remaining = svu8.length();
371*8af74909SZhong Yang 	while (remaining > 0) {
372*8af74909SZhong Yang 		const int utf8Status = UTF8Classify(us, remaining);
373*8af74909SZhong Yang 		if (utf8Status & UTF8MaskInvalid) {
374*8af74909SZhong Yang 			return false;
375*8af74909SZhong Yang 		} else {
376*8af74909SZhong Yang 			const int lenChar = utf8Status & UTF8MaskWidth;
377*8af74909SZhong Yang 			us += lenChar;
378*8af74909SZhong Yang 			remaining -= lenChar;
379*8af74909SZhong Yang 		}
380*8af74909SZhong Yang 	}
381*8af74909SZhong Yang 	return remaining == 0;
382*8af74909SZhong Yang }
383*8af74909SZhong Yang 
384*8af74909SZhong Yang // Replace invalid bytes in UTF-8 with the replacement character
FixInvalidUTF8(const std::string & text)385*8af74909SZhong Yang std::string FixInvalidUTF8(const std::string &text) {
386*8af74909SZhong Yang 	std::string result;
387*8af74909SZhong Yang 	const char *s = text.c_str();
388*8af74909SZhong Yang 	size_t remaining = text.size();
389*8af74909SZhong Yang 	while (remaining > 0) {
390*8af74909SZhong Yang 		const int utf8Status = UTF8Classify(reinterpret_cast<const unsigned char *>(s), remaining);
391*8af74909SZhong Yang 		if (utf8Status & UTF8MaskInvalid) {
392*8af74909SZhong Yang 			// Replacement character 0xFFFD = UTF8:"efbfbd".
393*8af74909SZhong Yang 			result.append("\xef\xbf\xbd");
394*8af74909SZhong Yang 			s++;
395*8af74909SZhong Yang 			remaining--;
396*8af74909SZhong Yang 		} else {
397*8af74909SZhong Yang 			const size_t len = utf8Status & UTF8MaskWidth;
398*8af74909SZhong Yang 			result.append(s, len);
399*8af74909SZhong Yang 			s += len;
400*8af74909SZhong Yang 			remaining -= len;
401*8af74909SZhong Yang 		}
402*8af74909SZhong Yang 	}
403*8af74909SZhong Yang 	return result;
404*8af74909SZhong Yang }
405*8af74909SZhong Yang 
406*8af74909SZhong Yang }
407