// Scintilla source code edit control
/** @file UniConversion.h
 ** Functions to handle UTF-8 and UTF-16 strings.
 **/
// Copyright 1998-2001 by Neil Hodgson <neilh@scintilla.org>
// The License.txt file describes the conditions under which this software may be distributed.

#ifndef UNICONVERSION_H
#define UNICONVERSION_H

// This header includes nothing itself but uses size_t, std::string, std::wstring,
// std::string_view and std::wstring_view, so <cstddef>, <string> and <string_view>
// must already have been included by the including file.

namespace Scintilla {

// Largest number of bytes used to encode one character in UTF-8.
constexpr int UTF8MaxBytes = 4;

// U+FFFD REPLACEMENT CHARACTER.
constexpr int unicodeReplacementChar = 0xFFFD;

// Conversion and length routines between UTF-8, UTF-16 and UTF-32.
// These are only declared here; the definitions are not part of this header.
size_t UTF8Length(std::wstring_view wsv) noexcept;
size_t UTF8PositionFromUTF16Position(std::string_view u8Text, size_t positionUTF16) noexcept;
void UTF8FromUTF16(std::wstring_view wsv, char *putf, size_t len) noexcept;
void UTF8FromUTF32Character(int uch, char *putf) noexcept;
size_t UTF16Length(std::string_view svu8) noexcept;
size_t UTF16FromUTF8(std::string_view svu8, wchar_t *tbuf, size_t tlen);
size_t UTF32Length(std::string_view svu8) noexcept;
size_t UTF32FromUTF8(std::string_view svu8, unsigned int *tbuf, size_t tlen);
// WStringFromUTF8 does the right thing when wchar_t is 2 or 4 bytes so
// works on both Windows and Unix.
std::wstring WStringFromUTF8(std::string_view svu8);
unsigned int UTF16FromUTF32Character(unsigned int val, wchar_t *tbuf) noexcept;
bool UTF8IsValid(std::string_view svu8) noexcept;
std::string FixInvalidUTF8(const std::string &text);

// Table indexed by a byte value. UnicodeFromUTF8 uses the entry for a lead byte
// as the number of bytes in the sequence that the byte starts.
extern const unsigned char UTF8BytesOfLead[256];

// Decode the character that starts at us[0] without performing any validation.
// The sequence length comes from UTF8BytesOfLead[us[0]]: lengths 1, 2 and 3 are
// decoded as such and every other table value is decoded as a 4 byte sequence.
// The caller must ensure that many bytes are readable at us.
inline int UnicodeFromUTF8(const unsigned char *us) noexcept {
	switch (UTF8BytesOfLead[us[0]]) {
	case 1:
		return us[0];
	case 2:
		return ((us[0] & 0x1F) << 6) + (us[1] & 0x3F);
	case 3:
		return ((us[0] & 0xF) << 12) + ((us[1] & 0x3F) << 6) + (us[2] & 0x3F);
	default:
		return ((us[0] & 0x7) << 18) + ((us[1] & 0x3F) << 12) + ((us[2] & 0x3F) << 6) + (us[3] & 0x3F);
	}
}

// True for bytes in the range 0x80..0xBF, the continuation bytes of a UTF-8 sequence.
constexpr bool UTF8IsTrailByte(unsigned char ch) noexcept {
	return (ch >= 0x80) && (ch < 0xc0);
}

// True when ch is in the ASCII range 0..0x7F.
// Negative values are rejected explicitly: a byte >= 0x80 held in a signed plain
// char becomes negative when promoted to int and must not be reported as ASCII.
constexpr bool UTF8IsAscii(int ch) noexcept {
	return (ch >= 0) && (ch < 0x80);
}

// Bit masks for the value returned by UTF8Classify.
// NOTE(review): presumably the bits under UTF8MaskWidth hold the byte width and
// UTF8MaskInvalid is set for an invalid sequence - confirm against the implementation.
enum { UTF8MaskWidth=0x7, UTF8MaskInvalid=0x8 };
int UTF8Classify(const unsigned char *us, size_t len) noexcept;
// Convenience overload that forwards the bytes and length of sv to the pointer form.
inline int UTF8Classify(std::string_view sv) noexcept {
	return UTF8Classify(reinterpret_cast<const unsigned char *>(sv.data()), sv.length());
}

// Similar to UTF8Classify but returns a length of 1 for invalid bytes
// instead of setting the invalid flag
int UTF8DrawBytes(const unsigned char *us, int len) noexcept;

// Line separator is U+2028 \xe2\x80\xa8
// Paragraph separator is U+2029 \xe2\x80\xa9
constexpr int UTF8SeparatorLength = 3;
// True when the bytes at us encode U+2028 or U+2029. May read us[0] to us[2].
inline bool UTF8IsSeparator(const unsigned char *us) noexcept {
	return (us[0] == 0xe2) && (us[1] == 0x80) && ((us[2] == 0xa8) || (us[2] == 0xa9));
}

// NEL is U+0085 \xc2\x85
constexpr int UTF8NELLength = 2;
// True when the bytes at us encode U+0085. May read us[0] and us[1].
inline bool UTF8IsNEL(const unsigned char *us) noexcept {
	return (us[0] == 0xc2) && (us[1] == 0x85);
}

// Is the sequence of 3 char a UTF-8 line end? Only the last two char are tested for a NEL.
constexpr bool UTF8IsMultibyteLineEnd(unsigned char ch0, unsigned char ch1, unsigned char ch2) noexcept {
	return
		((ch0 == 0xe2) && (ch1 == 0x80) && ((ch2 == 0xa8) || (ch2 == 0xa9))) ||
		((ch1 == 0xc2) && (ch2 == 0x85));
}

// Boundaries of the UTF-16 surrogate ranges and the first code point that
// lies beyond the Basic Multilingual Plane.
enum { SURROGATE_LEAD_FIRST = 0xD800 };
enum { SURROGATE_LEAD_LAST = 0xDBFF };
enum { SURROGATE_TRAIL_FIRST = 0xDC00 };
enum { SURROGATE_TRAIL_LAST = 0xDFFF };
enum { SUPPLEMENTAL_PLANE_FIRST = 0x10000 };

// Number of UTF-16 code units in the character that begins with uch:
// 2 when uch is a lead surrogate, otherwise 1.
constexpr unsigned int UTF16CharLength(wchar_t uch) noexcept {
	return ((uch >= SURROGATE_LEAD_FIRST) && (uch <= SURROGATE_LEAD_LAST)) ? 2 : 1;
}

// Number of UTF-16 code units needed for a character that occupies byteCount
// bytes in UTF-8: fewer than 4 bytes gives 1 unit, anything else gives 2.
constexpr unsigned int UTF16LengthFromUTF8ByteCount(unsigned int byteCount) noexcept {
	return (byteCount < 4) ? 1 : 2;
}

}

#endif