1*993b0882SAndroid Build Coastguard Worker /* 2*993b0882SAndroid Build Coastguard Worker * Copyright (C) 2018 The Android Open Source Project 3*993b0882SAndroid Build Coastguard Worker * 4*993b0882SAndroid Build Coastguard Worker * Licensed under the Apache License, Version 2.0 (the "License"); 5*993b0882SAndroid Build Coastguard Worker * you may not use this file except in compliance with the License. 6*993b0882SAndroid Build Coastguard Worker * You may obtain a copy of the License at 7*993b0882SAndroid Build Coastguard Worker * 8*993b0882SAndroid Build Coastguard Worker * http://www.apache.org/licenses/LICENSE-2.0 9*993b0882SAndroid Build Coastguard Worker * 10*993b0882SAndroid Build Coastguard Worker * Unless required by applicable law or agreed to in writing, software 11*993b0882SAndroid Build Coastguard Worker * distributed under the License is distributed on an "AS IS" BASIS, 12*993b0882SAndroid Build Coastguard Worker * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13*993b0882SAndroid Build Coastguard Worker * See the License for the specific language governing permissions and 14*993b0882SAndroid Build Coastguard Worker * limitations under the License. 15*993b0882SAndroid Build Coastguard Worker */ 16*993b0882SAndroid Build Coastguard Worker 17*993b0882SAndroid Build Coastguard Worker #ifndef LIBTEXTCLASSIFIER_UTILS_UTF8_UNILIB_H_ 18*993b0882SAndroid Build Coastguard Worker #define LIBTEXTCLASSIFIER_UTILS_UTF8_UNILIB_H_ 19*993b0882SAndroid Build Coastguard Worker 20*993b0882SAndroid Build Coastguard Worker #include "utils/base/integral_types.h" 21*993b0882SAndroid Build Coastguard Worker #include "utils/utf8/unicodetext.h" 22*993b0882SAndroid Build Coastguard Worker #include "utils/utf8/unilib-common.h" 23*993b0882SAndroid Build Coastguard Worker 24*993b0882SAndroid Build Coastguard Worker #if defined TC3_UNILIB_ICU 25*993b0882SAndroid Build Coastguard Worker #include "utils/utf8/unilib-icu.h" 26*993b0882SAndroid Build Coastguard Worker #define INIT_UNILIB_FOR_TESTING(VAR) VAR() 27*993b0882SAndroid Build Coastguard Worker #elif defined TC3_UNILIB_JAVAICU 28*993b0882SAndroid Build Coastguard Worker #include "utils/utf8/unilib-javaicu.h" 29*993b0882SAndroid Build Coastguard Worker #define INIT_UNILIB_FOR_TESTING(VAR) VAR(nullptr) 30*993b0882SAndroid Build Coastguard Worker #elif defined TC3_UNILIB_APPLE 31*993b0882SAndroid Build Coastguard Worker #include "utils/utf8/unilib-apple.h" 32*993b0882SAndroid Build Coastguard Worker #define INIT_UNILIB_FOR_TESTING(VAR) VAR() 33*993b0882SAndroid Build Coastguard Worker #else 34*993b0882SAndroid Build Coastguard Worker #error No TC3_UNILIB implementation specified. 35*993b0882SAndroid Build Coastguard Worker #endif 36*993b0882SAndroid Build Coastguard Worker 37*993b0882SAndroid Build Coastguard Worker namespace libtextclassifier3 { 38*993b0882SAndroid Build Coastguard Worker 39*993b0882SAndroid Build Coastguard Worker class UniLib : public UniLibBase { 40*993b0882SAndroid Build Coastguard Worker public: 41*993b0882SAndroid Build Coastguard Worker using UniLibBase::UniLibBase; 42*993b0882SAndroid Build Coastguard Worker 43*993b0882SAndroid Build Coastguard Worker // Lowercase a unicode string. ToLowerText(const UnicodeText & text)44*993b0882SAndroid Build Coastguard Worker UnicodeText ToLowerText(const UnicodeText& text) const { 45*993b0882SAndroid Build Coastguard Worker UnicodeText result; 46*993b0882SAndroid Build Coastguard Worker for (const char32 codepoint : text) { 47*993b0882SAndroid Build Coastguard Worker result.push_back(ToLower(codepoint)); 48*993b0882SAndroid Build Coastguard Worker } 49*993b0882SAndroid Build Coastguard Worker return result; 50*993b0882SAndroid Build Coastguard Worker } 51*993b0882SAndroid Build Coastguard Worker 52*993b0882SAndroid Build Coastguard Worker // Uppercase a unicode string. ToUpperText(const UnicodeText & text)53*993b0882SAndroid Build Coastguard Worker UnicodeText ToUpperText(const UnicodeText& text) const { 54*993b0882SAndroid Build Coastguard Worker UnicodeText result; 55*993b0882SAndroid Build Coastguard Worker for (const char32 codepoint : text) { 56*993b0882SAndroid Build Coastguard Worker result.push_back(UniLibBase::ToUpper(codepoint)); 57*993b0882SAndroid Build Coastguard Worker } 58*993b0882SAndroid Build Coastguard Worker return result; 59*993b0882SAndroid Build Coastguard Worker } 60*993b0882SAndroid Build Coastguard Worker IsLowerText(const UnicodeText & text)61*993b0882SAndroid Build Coastguard Worker bool IsLowerText(const UnicodeText& text) const { 62*993b0882SAndroid Build Coastguard Worker for (const char32 codepoint : text) { 63*993b0882SAndroid Build Coastguard Worker if (!IsLower(codepoint)) { 64*993b0882SAndroid Build Coastguard Worker return false; 65*993b0882SAndroid Build Coastguard Worker } 66*993b0882SAndroid Build Coastguard Worker } 67*993b0882SAndroid Build Coastguard Worker return true; 68*993b0882SAndroid Build Coastguard Worker } 69*993b0882SAndroid Build Coastguard Worker IsUpperText(const UnicodeText & text)70*993b0882SAndroid Build Coastguard Worker bool IsUpperText(const UnicodeText& text) const { 71*993b0882SAndroid Build Coastguard Worker for (const char32 codepoint : text) { 72*993b0882SAndroid Build Coastguard Worker if (!IsUpper(codepoint)) { 73*993b0882SAndroid Build Coastguard Worker return false; 74*993b0882SAndroid Build Coastguard Worker } 75*993b0882SAndroid Build Coastguard Worker } 76*993b0882SAndroid Build Coastguard Worker return true; 77*993b0882SAndroid Build Coastguard Worker } 78*993b0882SAndroid Build Coastguard Worker IsDigits(const UnicodeText & text)79*993b0882SAndroid Build Coastguard Worker bool IsDigits(const UnicodeText& text) const { 80*993b0882SAndroid Build Coastguard Worker for (const char32 codepoint : text) { 81*993b0882SAndroid Build Coastguard Worker if (!IsDigit(codepoint)) { 82*993b0882SAndroid Build Coastguard Worker return false; 83*993b0882SAndroid Build Coastguard Worker } 84*993b0882SAndroid Build Coastguard Worker } 85*993b0882SAndroid Build Coastguard Worker return true; 86*993b0882SAndroid Build Coastguard Worker } 87*993b0882SAndroid Build Coastguard Worker IsPercentage(char32 codepoint)88*993b0882SAndroid Build Coastguard Worker bool IsPercentage(char32 codepoint) const { 89*993b0882SAndroid Build Coastguard Worker return libtextclassifier3::IsPercentage(codepoint); 90*993b0882SAndroid Build Coastguard Worker } 91*993b0882SAndroid Build Coastguard Worker IsSlash(char32 codepoint)92*993b0882SAndroid Build Coastguard Worker bool IsSlash(char32 codepoint) const { 93*993b0882SAndroid Build Coastguard Worker return libtextclassifier3::IsSlash(codepoint); 94*993b0882SAndroid Build Coastguard Worker } 95*993b0882SAndroid Build Coastguard Worker IsMinus(char32 codepoint)96*993b0882SAndroid Build Coastguard Worker bool IsMinus(char32 codepoint) const { 97*993b0882SAndroid Build Coastguard Worker return libtextclassifier3::IsMinus(codepoint); 98*993b0882SAndroid Build Coastguard Worker } 99*993b0882SAndroid Build Coastguard Worker IsNumberSign(char32 codepoint)100*993b0882SAndroid Build Coastguard Worker bool IsNumberSign(char32 codepoint) const { 101*993b0882SAndroid Build Coastguard Worker return libtextclassifier3::IsNumberSign(codepoint); 102*993b0882SAndroid Build Coastguard Worker } 103*993b0882SAndroid Build Coastguard Worker IsDot(char32 codepoint)104*993b0882SAndroid Build Coastguard Worker bool IsDot(char32 codepoint) const { 105*993b0882SAndroid Build Coastguard Worker return libtextclassifier3::IsDot(codepoint); 106*993b0882SAndroid Build Coastguard Worker } 107*993b0882SAndroid Build Coastguard Worker IsApostrophe(char32 codepoint)108*993b0882SAndroid Build Coastguard Worker bool IsApostrophe(char32 codepoint) const { 109*993b0882SAndroid Build Coastguard Worker return libtextclassifier3::IsApostrophe(codepoint); 110*993b0882SAndroid Build Coastguard Worker } 111*993b0882SAndroid Build Coastguard Worker IsQuotation(char32 codepoint)112*993b0882SAndroid Build Coastguard Worker bool IsQuotation(char32 codepoint) const { 113*993b0882SAndroid Build Coastguard Worker return libtextclassifier3::IsQuotation(codepoint); 114*993b0882SAndroid Build Coastguard Worker } 115*993b0882SAndroid Build Coastguard Worker IsAmpersand(char32 codepoint)116*993b0882SAndroid Build Coastguard Worker bool IsAmpersand(char32 codepoint) const { 117*993b0882SAndroid Build Coastguard Worker return libtextclassifier3::IsAmpersand(codepoint); 118*993b0882SAndroid Build Coastguard Worker } 119*993b0882SAndroid Build Coastguard Worker IsLatinLetter(char32 codepoint)120*993b0882SAndroid Build Coastguard Worker bool IsLatinLetter(char32 codepoint) const { 121*993b0882SAndroid Build Coastguard Worker return libtextclassifier3::IsLatinLetter(codepoint); 122*993b0882SAndroid Build Coastguard Worker } 123*993b0882SAndroid Build Coastguard Worker IsArabicLetter(char32 codepoint)124*993b0882SAndroid Build Coastguard Worker bool IsArabicLetter(char32 codepoint) const { 125*993b0882SAndroid Build Coastguard Worker return libtextclassifier3::IsArabicLetter(codepoint); 126*993b0882SAndroid Build Coastguard Worker } 127*993b0882SAndroid Build Coastguard Worker IsCyrillicLetter(char32 codepoint)128*993b0882SAndroid Build Coastguard Worker bool IsCyrillicLetter(char32 codepoint) const { 129*993b0882SAndroid Build Coastguard Worker return libtextclassifier3::IsCyrillicLetter(codepoint); 130*993b0882SAndroid Build Coastguard Worker } 131*993b0882SAndroid Build Coastguard Worker IsChineseLetter(char32 codepoint)132*993b0882SAndroid Build Coastguard Worker bool IsChineseLetter(char32 codepoint) const { 133*993b0882SAndroid Build Coastguard Worker return libtextclassifier3::IsChineseLetter(codepoint); 134*993b0882SAndroid Build Coastguard Worker } 135*993b0882SAndroid Build Coastguard Worker IsJapaneseLetter(char32 codepoint)136*993b0882SAndroid Build Coastguard Worker bool IsJapaneseLetter(char32 codepoint) const { 137*993b0882SAndroid Build Coastguard Worker return libtextclassifier3::IsJapaneseLetter(codepoint); 138*993b0882SAndroid Build Coastguard Worker } 139*993b0882SAndroid Build Coastguard Worker IsKoreanLetter(char32 codepoint)140*993b0882SAndroid Build Coastguard Worker bool IsKoreanLetter(char32 codepoint) const { 141*993b0882SAndroid Build Coastguard Worker return libtextclassifier3::IsKoreanLetter(codepoint); 142*993b0882SAndroid Build Coastguard Worker } 143*993b0882SAndroid Build Coastguard Worker IsThaiLetter(char32 codepoint)144*993b0882SAndroid Build Coastguard Worker bool IsThaiLetter(char32 codepoint) const { 145*993b0882SAndroid Build Coastguard Worker return libtextclassifier3::IsThaiLetter(codepoint); 146*993b0882SAndroid Build Coastguard Worker } 147*993b0882SAndroid Build Coastguard Worker IsCJTletter(char32 codepoint)148*993b0882SAndroid Build Coastguard Worker bool IsCJTletter(char32 codepoint) const { 149*993b0882SAndroid Build Coastguard Worker return libtextclassifier3::IsCJTletter(codepoint); 150*993b0882SAndroid Build Coastguard Worker } 151*993b0882SAndroid Build Coastguard Worker IsLetter(char32 codepoint)152*993b0882SAndroid Build Coastguard Worker bool IsLetter(char32 codepoint) const { 153*993b0882SAndroid Build Coastguard Worker return libtextclassifier3::IsLetter(codepoint); 154*993b0882SAndroid Build Coastguard Worker } 155*993b0882SAndroid Build Coastguard Worker IsValidUtf8(const UnicodeText & text)156*993b0882SAndroid Build Coastguard Worker bool IsValidUtf8(const UnicodeText& text) const { 157*993b0882SAndroid Build Coastguard Worker // Basic check of structural validity of UTF8. 158*993b0882SAndroid Build Coastguard Worker if (!text.is_valid()) { 159*993b0882SAndroid Build Coastguard Worker return false; 160*993b0882SAndroid Build Coastguard Worker } 161*993b0882SAndroid Build Coastguard Worker // In addition to that, we declare that a valid UTF8 is when the number of 162*993b0882SAndroid Build Coastguard Worker // codepoints in the string as measured by ICU is the same as the number of 163*993b0882SAndroid Build Coastguard Worker // codepoints as measured by UnicodeText. Because if we don't do this check, 164*993b0882SAndroid Build Coastguard Worker // the indices might differ, and cause trouble, because the assumption 165*993b0882SAndroid Build Coastguard Worker // throughout the code is that ICU indices and UnicodeText indices are the 166*993b0882SAndroid Build Coastguard Worker // same. 167*993b0882SAndroid Build Coastguard Worker // NOTE: This is not perfect, as this doesn't check the alignment of the 168*993b0882SAndroid Build Coastguard Worker // codepoints, but for the practical purposes should be enough. 169*993b0882SAndroid Build Coastguard Worker const StatusOr<int32> icu_length = Length(text); 170*993b0882SAndroid Build Coastguard Worker if (!icu_length.ok()) { 171*993b0882SAndroid Build Coastguard Worker return false; 172*993b0882SAndroid Build Coastguard Worker } 173*993b0882SAndroid Build Coastguard Worker 174*993b0882SAndroid Build Coastguard Worker if (icu_length.ValueOrDie() != text.size_codepoints()) { 175*993b0882SAndroid Build Coastguard Worker return false; 176*993b0882SAndroid Build Coastguard Worker } 177*993b0882SAndroid Build Coastguard Worker 178*993b0882SAndroid Build Coastguard Worker return true; 179*993b0882SAndroid Build Coastguard Worker } 180*993b0882SAndroid Build Coastguard Worker }; 181*993b0882SAndroid Build Coastguard Worker 182*993b0882SAndroid Build Coastguard Worker } // namespace libtextclassifier3 183*993b0882SAndroid Build Coastguard Worker #endif // LIBTEXTCLASSIFIER_UTILS_UTF8_UNILIB_H_ 184