xref: /aosp_15_r20/external/libtextclassifier/native/utils/utf8/unilib.h (revision 993b0882672172b81d12fad7a7ac0c3e5c824a12)
1*993b0882SAndroid Build Coastguard Worker /*
2*993b0882SAndroid Build Coastguard Worker  * Copyright (C) 2018 The Android Open Source Project
3*993b0882SAndroid Build Coastguard Worker  *
4*993b0882SAndroid Build Coastguard Worker  * Licensed under the Apache License, Version 2.0 (the "License");
5*993b0882SAndroid Build Coastguard Worker  * you may not use this file except in compliance with the License.
6*993b0882SAndroid Build Coastguard Worker  * You may obtain a copy of the License at
7*993b0882SAndroid Build Coastguard Worker  *
8*993b0882SAndroid Build Coastguard Worker  *      http://www.apache.org/licenses/LICENSE-2.0
9*993b0882SAndroid Build Coastguard Worker  *
10*993b0882SAndroid Build Coastguard Worker  * Unless required by applicable law or agreed to in writing, software
11*993b0882SAndroid Build Coastguard Worker  * distributed under the License is distributed on an "AS IS" BASIS,
12*993b0882SAndroid Build Coastguard Worker  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13*993b0882SAndroid Build Coastguard Worker  * See the License for the specific language governing permissions and
14*993b0882SAndroid Build Coastguard Worker  * limitations under the License.
15*993b0882SAndroid Build Coastguard Worker  */
16*993b0882SAndroid Build Coastguard Worker 
17*993b0882SAndroid Build Coastguard Worker #ifndef LIBTEXTCLASSIFIER_UTILS_UTF8_UNILIB_H_
18*993b0882SAndroid Build Coastguard Worker #define LIBTEXTCLASSIFIER_UTILS_UTF8_UNILIB_H_
19*993b0882SAndroid Build Coastguard Worker 
20*993b0882SAndroid Build Coastguard Worker #include "utils/base/integral_types.h"
21*993b0882SAndroid Build Coastguard Worker #include "utils/utf8/unicodetext.h"
22*993b0882SAndroid Build Coastguard Worker #include "utils/utf8/unilib-common.h"
23*993b0882SAndroid Build Coastguard Worker 
24*993b0882SAndroid Build Coastguard Worker #if defined TC3_UNILIB_ICU
25*993b0882SAndroid Build Coastguard Worker #include "utils/utf8/unilib-icu.h"
26*993b0882SAndroid Build Coastguard Worker #define INIT_UNILIB_FOR_TESTING(VAR) VAR()
27*993b0882SAndroid Build Coastguard Worker #elif defined TC3_UNILIB_JAVAICU
28*993b0882SAndroid Build Coastguard Worker #include "utils/utf8/unilib-javaicu.h"
29*993b0882SAndroid Build Coastguard Worker #define INIT_UNILIB_FOR_TESTING(VAR) VAR(nullptr)
30*993b0882SAndroid Build Coastguard Worker #elif defined TC3_UNILIB_APPLE
31*993b0882SAndroid Build Coastguard Worker #include "utils/utf8/unilib-apple.h"
32*993b0882SAndroid Build Coastguard Worker #define INIT_UNILIB_FOR_TESTING(VAR) VAR()
33*993b0882SAndroid Build Coastguard Worker #else
34*993b0882SAndroid Build Coastguard Worker #error No TC3_UNILIB implementation specified.
35*993b0882SAndroid Build Coastguard Worker #endif
36*993b0882SAndroid Build Coastguard Worker 
37*993b0882SAndroid Build Coastguard Worker namespace libtextclassifier3 {
38*993b0882SAndroid Build Coastguard Worker 
39*993b0882SAndroid Build Coastguard Worker class UniLib : public UniLibBase {
40*993b0882SAndroid Build Coastguard Worker  public:
41*993b0882SAndroid Build Coastguard Worker   using UniLibBase::UniLibBase;
42*993b0882SAndroid Build Coastguard Worker 
43*993b0882SAndroid Build Coastguard Worker   // Lowercase a unicode string.
ToLowerText(const UnicodeText & text)44*993b0882SAndroid Build Coastguard Worker   UnicodeText ToLowerText(const UnicodeText& text) const {
45*993b0882SAndroid Build Coastguard Worker     UnicodeText result;
46*993b0882SAndroid Build Coastguard Worker     for (const char32 codepoint : text) {
47*993b0882SAndroid Build Coastguard Worker       result.push_back(ToLower(codepoint));
48*993b0882SAndroid Build Coastguard Worker     }
49*993b0882SAndroid Build Coastguard Worker     return result;
50*993b0882SAndroid Build Coastguard Worker   }
51*993b0882SAndroid Build Coastguard Worker 
52*993b0882SAndroid Build Coastguard Worker   // Uppercase a unicode string.
ToUpperText(const UnicodeText & text)53*993b0882SAndroid Build Coastguard Worker   UnicodeText ToUpperText(const UnicodeText& text) const {
54*993b0882SAndroid Build Coastguard Worker     UnicodeText result;
55*993b0882SAndroid Build Coastguard Worker     for (const char32 codepoint : text) {
56*993b0882SAndroid Build Coastguard Worker       result.push_back(UniLibBase::ToUpper(codepoint));
57*993b0882SAndroid Build Coastguard Worker     }
58*993b0882SAndroid Build Coastguard Worker     return result;
59*993b0882SAndroid Build Coastguard Worker   }
60*993b0882SAndroid Build Coastguard Worker 
IsLowerText(const UnicodeText & text)61*993b0882SAndroid Build Coastguard Worker   bool IsLowerText(const UnicodeText& text) const {
62*993b0882SAndroid Build Coastguard Worker     for (const char32 codepoint : text) {
63*993b0882SAndroid Build Coastguard Worker       if (!IsLower(codepoint)) {
64*993b0882SAndroid Build Coastguard Worker         return false;
65*993b0882SAndroid Build Coastguard Worker       }
66*993b0882SAndroid Build Coastguard Worker     }
67*993b0882SAndroid Build Coastguard Worker     return true;
68*993b0882SAndroid Build Coastguard Worker   }
69*993b0882SAndroid Build Coastguard Worker 
IsUpperText(const UnicodeText & text)70*993b0882SAndroid Build Coastguard Worker   bool IsUpperText(const UnicodeText& text) const {
71*993b0882SAndroid Build Coastguard Worker     for (const char32 codepoint : text) {
72*993b0882SAndroid Build Coastguard Worker       if (!IsUpper(codepoint)) {
73*993b0882SAndroid Build Coastguard Worker         return false;
74*993b0882SAndroid Build Coastguard Worker       }
75*993b0882SAndroid Build Coastguard Worker     }
76*993b0882SAndroid Build Coastguard Worker     return true;
77*993b0882SAndroid Build Coastguard Worker   }
78*993b0882SAndroid Build Coastguard Worker 
IsDigits(const UnicodeText & text)79*993b0882SAndroid Build Coastguard Worker   bool IsDigits(const UnicodeText& text) const {
80*993b0882SAndroid Build Coastguard Worker     for (const char32 codepoint : text) {
81*993b0882SAndroid Build Coastguard Worker       if (!IsDigit(codepoint)) {
82*993b0882SAndroid Build Coastguard Worker         return false;
83*993b0882SAndroid Build Coastguard Worker       }
84*993b0882SAndroid Build Coastguard Worker     }
85*993b0882SAndroid Build Coastguard Worker     return true;
86*993b0882SAndroid Build Coastguard Worker   }
87*993b0882SAndroid Build Coastguard Worker 
IsPercentage(char32 codepoint)88*993b0882SAndroid Build Coastguard Worker   bool IsPercentage(char32 codepoint) const {
89*993b0882SAndroid Build Coastguard Worker     return libtextclassifier3::IsPercentage(codepoint);
90*993b0882SAndroid Build Coastguard Worker   }
91*993b0882SAndroid Build Coastguard Worker 
IsSlash(char32 codepoint)92*993b0882SAndroid Build Coastguard Worker   bool IsSlash(char32 codepoint) const {
93*993b0882SAndroid Build Coastguard Worker     return libtextclassifier3::IsSlash(codepoint);
94*993b0882SAndroid Build Coastguard Worker   }
95*993b0882SAndroid Build Coastguard Worker 
IsMinus(char32 codepoint)96*993b0882SAndroid Build Coastguard Worker   bool IsMinus(char32 codepoint) const {
97*993b0882SAndroid Build Coastguard Worker     return libtextclassifier3::IsMinus(codepoint);
98*993b0882SAndroid Build Coastguard Worker   }
99*993b0882SAndroid Build Coastguard Worker 
IsNumberSign(char32 codepoint)100*993b0882SAndroid Build Coastguard Worker   bool IsNumberSign(char32 codepoint) const {
101*993b0882SAndroid Build Coastguard Worker     return libtextclassifier3::IsNumberSign(codepoint);
102*993b0882SAndroid Build Coastguard Worker   }
103*993b0882SAndroid Build Coastguard Worker 
IsDot(char32 codepoint)104*993b0882SAndroid Build Coastguard Worker   bool IsDot(char32 codepoint) const {
105*993b0882SAndroid Build Coastguard Worker     return libtextclassifier3::IsDot(codepoint);
106*993b0882SAndroid Build Coastguard Worker   }
107*993b0882SAndroid Build Coastguard Worker 
IsApostrophe(char32 codepoint)108*993b0882SAndroid Build Coastguard Worker   bool IsApostrophe(char32 codepoint) const {
109*993b0882SAndroid Build Coastguard Worker     return libtextclassifier3::IsApostrophe(codepoint);
110*993b0882SAndroid Build Coastguard Worker   }
111*993b0882SAndroid Build Coastguard Worker 
IsQuotation(char32 codepoint)112*993b0882SAndroid Build Coastguard Worker   bool IsQuotation(char32 codepoint) const {
113*993b0882SAndroid Build Coastguard Worker     return libtextclassifier3::IsQuotation(codepoint);
114*993b0882SAndroid Build Coastguard Worker   }
115*993b0882SAndroid Build Coastguard Worker 
IsAmpersand(char32 codepoint)116*993b0882SAndroid Build Coastguard Worker   bool IsAmpersand(char32 codepoint) const {
117*993b0882SAndroid Build Coastguard Worker     return libtextclassifier3::IsAmpersand(codepoint);
118*993b0882SAndroid Build Coastguard Worker   }
119*993b0882SAndroid Build Coastguard Worker 
IsLatinLetter(char32 codepoint)120*993b0882SAndroid Build Coastguard Worker   bool IsLatinLetter(char32 codepoint) const {
121*993b0882SAndroid Build Coastguard Worker     return libtextclassifier3::IsLatinLetter(codepoint);
122*993b0882SAndroid Build Coastguard Worker   }
123*993b0882SAndroid Build Coastguard Worker 
IsArabicLetter(char32 codepoint)124*993b0882SAndroid Build Coastguard Worker   bool IsArabicLetter(char32 codepoint) const {
125*993b0882SAndroid Build Coastguard Worker     return libtextclassifier3::IsArabicLetter(codepoint);
126*993b0882SAndroid Build Coastguard Worker   }
127*993b0882SAndroid Build Coastguard Worker 
IsCyrillicLetter(char32 codepoint)128*993b0882SAndroid Build Coastguard Worker   bool IsCyrillicLetter(char32 codepoint) const {
129*993b0882SAndroid Build Coastguard Worker     return libtextclassifier3::IsCyrillicLetter(codepoint);
130*993b0882SAndroid Build Coastguard Worker   }
131*993b0882SAndroid Build Coastguard Worker 
IsChineseLetter(char32 codepoint)132*993b0882SAndroid Build Coastguard Worker   bool IsChineseLetter(char32 codepoint) const {
133*993b0882SAndroid Build Coastguard Worker     return libtextclassifier3::IsChineseLetter(codepoint);
134*993b0882SAndroid Build Coastguard Worker   }
135*993b0882SAndroid Build Coastguard Worker 
IsJapaneseLetter(char32 codepoint)136*993b0882SAndroid Build Coastguard Worker   bool IsJapaneseLetter(char32 codepoint) const {
137*993b0882SAndroid Build Coastguard Worker     return libtextclassifier3::IsJapaneseLetter(codepoint);
138*993b0882SAndroid Build Coastguard Worker   }
139*993b0882SAndroid Build Coastguard Worker 
IsKoreanLetter(char32 codepoint)140*993b0882SAndroid Build Coastguard Worker   bool IsKoreanLetter(char32 codepoint) const {
141*993b0882SAndroid Build Coastguard Worker     return libtextclassifier3::IsKoreanLetter(codepoint);
142*993b0882SAndroid Build Coastguard Worker   }
143*993b0882SAndroid Build Coastguard Worker 
IsThaiLetter(char32 codepoint)144*993b0882SAndroid Build Coastguard Worker   bool IsThaiLetter(char32 codepoint) const {
145*993b0882SAndroid Build Coastguard Worker     return libtextclassifier3::IsThaiLetter(codepoint);
146*993b0882SAndroid Build Coastguard Worker   }
147*993b0882SAndroid Build Coastguard Worker 
IsCJTletter(char32 codepoint)148*993b0882SAndroid Build Coastguard Worker   bool IsCJTletter(char32 codepoint) const {
149*993b0882SAndroid Build Coastguard Worker     return libtextclassifier3::IsCJTletter(codepoint);
150*993b0882SAndroid Build Coastguard Worker   }
151*993b0882SAndroid Build Coastguard Worker 
IsLetter(char32 codepoint)152*993b0882SAndroid Build Coastguard Worker   bool IsLetter(char32 codepoint) const {
153*993b0882SAndroid Build Coastguard Worker     return libtextclassifier3::IsLetter(codepoint);
154*993b0882SAndroid Build Coastguard Worker   }
155*993b0882SAndroid Build Coastguard Worker 
IsValidUtf8(const UnicodeText & text)156*993b0882SAndroid Build Coastguard Worker   bool IsValidUtf8(const UnicodeText& text) const {
157*993b0882SAndroid Build Coastguard Worker     // Basic check of structural validity of UTF8.
158*993b0882SAndroid Build Coastguard Worker     if (!text.is_valid()) {
159*993b0882SAndroid Build Coastguard Worker       return false;
160*993b0882SAndroid Build Coastguard Worker     }
161*993b0882SAndroid Build Coastguard Worker     // In addition to that, we declare that a valid UTF8 is when the number of
162*993b0882SAndroid Build Coastguard Worker     // codepoints in the string as measured by ICU is the same as the number of
163*993b0882SAndroid Build Coastguard Worker     // codepoints as measured by UnicodeText. Because if we don't do this check,
164*993b0882SAndroid Build Coastguard Worker     // the indices might differ, and cause trouble, because the assumption
165*993b0882SAndroid Build Coastguard Worker     // throughout the code is that ICU indices and UnicodeText indices are the
166*993b0882SAndroid Build Coastguard Worker     // same.
167*993b0882SAndroid Build Coastguard Worker     // NOTE: This is not perfect, as this doesn't check the alignment of the
168*993b0882SAndroid Build Coastguard Worker     // codepoints, but for the practical purposes should be enough.
169*993b0882SAndroid Build Coastguard Worker     const StatusOr<int32> icu_length = Length(text);
170*993b0882SAndroid Build Coastguard Worker     if (!icu_length.ok()) {
171*993b0882SAndroid Build Coastguard Worker       return false;
172*993b0882SAndroid Build Coastguard Worker     }
173*993b0882SAndroid Build Coastguard Worker 
174*993b0882SAndroid Build Coastguard Worker     if (icu_length.ValueOrDie() != text.size_codepoints()) {
175*993b0882SAndroid Build Coastguard Worker       return false;
176*993b0882SAndroid Build Coastguard Worker     }
177*993b0882SAndroid Build Coastguard Worker 
178*993b0882SAndroid Build Coastguard Worker     return true;
179*993b0882SAndroid Build Coastguard Worker   }
180*993b0882SAndroid Build Coastguard Worker };
181*993b0882SAndroid Build Coastguard Worker 
182*993b0882SAndroid Build Coastguard Worker }  // namespace libtextclassifier3
183*993b0882SAndroid Build Coastguard Worker #endif  // LIBTEXTCLASSIFIER_UTILS_UTF8_UNILIB_H_
184