/*
 * Copyright (C) 2018 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16*993b0882SAndroid Build Coastguard Worker
// An implementation of Unilib that uses Android Java interfaces via JNI. The
// performance critical ops have been re-implemented in C++.
// Specifically, this class must be compatible with API level 14 (ICS).
20*993b0882SAndroid Build Coastguard Worker
21*993b0882SAndroid Build Coastguard Worker #ifndef LIBTEXTCLASSIFIER_UTILS_UTF8_UNILIB_JAVAICU_H_
22*993b0882SAndroid Build Coastguard Worker #define LIBTEXTCLASSIFIER_UTILS_UTF8_UNILIB_JAVAICU_H_
23*993b0882SAndroid Build Coastguard Worker
24*993b0882SAndroid Build Coastguard Worker #include <jni.h>
25*993b0882SAndroid Build Coastguard Worker
26*993b0882SAndroid Build Coastguard Worker #include <memory>
27*993b0882SAndroid Build Coastguard Worker #include <mutex> // NOLINT
28*993b0882SAndroid Build Coastguard Worker #include <string>
29*993b0882SAndroid Build Coastguard Worker
30*993b0882SAndroid Build Coastguard Worker #include "utils/base/integral_types.h"
31*993b0882SAndroid Build Coastguard Worker #include "utils/java/jni-base.h"
32*993b0882SAndroid Build Coastguard Worker #include "utils/java/jni-cache.h"
33*993b0882SAndroid Build Coastguard Worker #include "utils/java/jni-helper.h"
34*993b0882SAndroid Build Coastguard Worker #include "utils/utf8/unicodetext.h"
35*993b0882SAndroid Build Coastguard Worker #include "utils/utf8/unilib-common.h"
36*993b0882SAndroid Build Coastguard Worker
37*993b0882SAndroid Build Coastguard Worker namespace libtextclassifier3 {
38*993b0882SAndroid Build Coastguard Worker
39*993b0882SAndroid Build Coastguard Worker class UniLibBase {
40*993b0882SAndroid Build Coastguard Worker public:
41*993b0882SAndroid Build Coastguard Worker UniLibBase();
42*993b0882SAndroid Build Coastguard Worker explicit UniLibBase(const std::shared_ptr<JniCache>& jni_cache);
43*993b0882SAndroid Build Coastguard Worker
44*993b0882SAndroid Build Coastguard Worker bool ParseInt32(const UnicodeText& text, int32* result) const;
45*993b0882SAndroid Build Coastguard Worker bool ParseInt64(const UnicodeText& text, int64* result) const;
46*993b0882SAndroid Build Coastguard Worker bool ParseDouble(const UnicodeText& text, double* result) const;
47*993b0882SAndroid Build Coastguard Worker
48*993b0882SAndroid Build Coastguard Worker bool IsOpeningBracket(char32 codepoint) const;
49*993b0882SAndroid Build Coastguard Worker bool IsClosingBracket(char32 codepoint) const;
50*993b0882SAndroid Build Coastguard Worker bool IsWhitespace(char32 codepoint) const;
51*993b0882SAndroid Build Coastguard Worker bool IsDigit(char32 codepoint) const;
52*993b0882SAndroid Build Coastguard Worker bool IsLower(char32 codepoint) const;
53*993b0882SAndroid Build Coastguard Worker bool IsUpper(char32 codepoint) const;
54*993b0882SAndroid Build Coastguard Worker bool IsPunctuation(char32 codepoint) const;
55*993b0882SAndroid Build Coastguard Worker
56*993b0882SAndroid Build Coastguard Worker char32 ToLower(char32 codepoint) const;
57*993b0882SAndroid Build Coastguard Worker char32 ToUpper(char32 codepoint) const;
58*993b0882SAndroid Build Coastguard Worker char32 GetPairedBracket(char32 codepoint) const;
59*993b0882SAndroid Build Coastguard Worker
60*993b0882SAndroid Build Coastguard Worker StatusOr<int32> Length(const UnicodeText& text) const;
61*993b0882SAndroid Build Coastguard Worker
62*993b0882SAndroid Build Coastguard Worker // Forward declaration for friend.
63*993b0882SAndroid Build Coastguard Worker class RegexPattern;
64*993b0882SAndroid Build Coastguard Worker
65*993b0882SAndroid Build Coastguard Worker class RegexMatcher {
66*993b0882SAndroid Build Coastguard Worker public:
67*993b0882SAndroid Build Coastguard Worker static constexpr int kError = -1;
68*993b0882SAndroid Build Coastguard Worker static constexpr int kNoError = 0;
69*993b0882SAndroid Build Coastguard Worker
70*993b0882SAndroid Build Coastguard Worker // Checks whether the input text matches the pattern exactly.
71*993b0882SAndroid Build Coastguard Worker bool Matches(int* status) const;
72*993b0882SAndroid Build Coastguard Worker
73*993b0882SAndroid Build Coastguard Worker // Approximate Matches() implementation implemented using Find(). It uses
74*993b0882SAndroid Build Coastguard Worker // the first Find() result and then checks that it spans the whole input.
75*993b0882SAndroid Build Coastguard Worker // NOTE: Unlike Matches() it can result in false negatives.
76*993b0882SAndroid Build Coastguard Worker // NOTE: Resets the matcher, so the current Find() state will be lost.
77*993b0882SAndroid Build Coastguard Worker bool ApproximatelyMatches(int* status);
78*993b0882SAndroid Build Coastguard Worker
79*993b0882SAndroid Build Coastguard Worker // Finds occurrences of the pattern in the input text.
80*993b0882SAndroid Build Coastguard Worker // Can be called repeatedly to find all occurrences. A call will update
81*993b0882SAndroid Build Coastguard Worker // internal state, so that 'Start', 'End' and 'Group' can be called to get
82*993b0882SAndroid Build Coastguard Worker // information about the match.
83*993b0882SAndroid Build Coastguard Worker // NOTE: Any call to ApproximatelyMatches() in between Find() calls will
84*993b0882SAndroid Build Coastguard Worker // modify the state.
85*993b0882SAndroid Build Coastguard Worker bool Find(int* status);
86*993b0882SAndroid Build Coastguard Worker
87*993b0882SAndroid Build Coastguard Worker // Gets the start offset of the last match (from 'Find').
88*993b0882SAndroid Build Coastguard Worker // Sets status to 'kError' if 'Find'
89*993b0882SAndroid Build Coastguard Worker // was not called previously.
90*993b0882SAndroid Build Coastguard Worker int Start(int* status) const;
91*993b0882SAndroid Build Coastguard Worker
92*993b0882SAndroid Build Coastguard Worker // Gets the start offset of the specified group of the last match.
93*993b0882SAndroid Build Coastguard Worker // (from 'Find').
94*993b0882SAndroid Build Coastguard Worker // Sets status to 'kError' if an invalid group was specified or if 'Find'
95*993b0882SAndroid Build Coastguard Worker // was not called previously.
96*993b0882SAndroid Build Coastguard Worker int Start(int group_idx, int* status) const;
97*993b0882SAndroid Build Coastguard Worker
98*993b0882SAndroid Build Coastguard Worker // Gets the end offset of the last match (from 'Find').
99*993b0882SAndroid Build Coastguard Worker // Sets status to 'kError' if 'Find'
100*993b0882SAndroid Build Coastguard Worker // was not called previously.
101*993b0882SAndroid Build Coastguard Worker int End(int* status) const;
102*993b0882SAndroid Build Coastguard Worker
103*993b0882SAndroid Build Coastguard Worker // Gets the end offset of the specified group of the last match.
104*993b0882SAndroid Build Coastguard Worker // (from 'Find').
105*993b0882SAndroid Build Coastguard Worker // Sets status to 'kError' if an invalid group was specified or if 'Find'
106*993b0882SAndroid Build Coastguard Worker // was not called previously.
107*993b0882SAndroid Build Coastguard Worker int End(int group_idx, int* status) const;
108*993b0882SAndroid Build Coastguard Worker
109*993b0882SAndroid Build Coastguard Worker // Gets the text of the last match (from 'Find').
110*993b0882SAndroid Build Coastguard Worker // Sets status to 'kError' if 'Find' was not called previously.
111*993b0882SAndroid Build Coastguard Worker UnicodeText Group(int* status) const;
112*993b0882SAndroid Build Coastguard Worker
113*993b0882SAndroid Build Coastguard Worker // Gets the text of the specified group of the last match (from 'Find').
114*993b0882SAndroid Build Coastguard Worker // Sets status to 'kError' if an invalid group was specified or if 'Find'
115*993b0882SAndroid Build Coastguard Worker // was not called previously.
116*993b0882SAndroid Build Coastguard Worker UnicodeText Group(int group_idx, int* status) const;
117*993b0882SAndroid Build Coastguard Worker
118*993b0882SAndroid Build Coastguard Worker // Returns the matched text (the 0th capturing group).
Text()119*993b0882SAndroid Build Coastguard Worker std::string Text() const {
120*993b0882SAndroid Build Coastguard Worker StatusOr<std::string> status_or_result =
121*993b0882SAndroid Build Coastguard Worker JStringToUtf8String(jni_cache_->GetEnv(), text_.get());
122*993b0882SAndroid Build Coastguard Worker if (!status_or_result.ok()) {
123*993b0882SAndroid Build Coastguard Worker TC3_LOG(ERROR) << "JStringToUtf8String failed.";
124*993b0882SAndroid Build Coastguard Worker return "";
125*993b0882SAndroid Build Coastguard Worker }
126*993b0882SAndroid Build Coastguard Worker return status_or_result.ValueOrDie();
127*993b0882SAndroid Build Coastguard Worker }
128*993b0882SAndroid Build Coastguard Worker
129*993b0882SAndroid Build Coastguard Worker private:
130*993b0882SAndroid Build Coastguard Worker friend class RegexPattern;
131*993b0882SAndroid Build Coastguard Worker RegexMatcher(const JniCache* jni_cache, ScopedGlobalRef<jobject> matcher,
132*993b0882SAndroid Build Coastguard Worker ScopedGlobalRef<jstring> text);
133*993b0882SAndroid Build Coastguard Worker bool UpdateLastFindOffset() const;
134*993b0882SAndroid Build Coastguard Worker
135*993b0882SAndroid Build Coastguard Worker const JniCache* jni_cache_;
136*993b0882SAndroid Build Coastguard Worker ScopedGlobalRef<jobject> matcher_;
137*993b0882SAndroid Build Coastguard Worker ScopedGlobalRef<jstring> text_;
138*993b0882SAndroid Build Coastguard Worker mutable int last_find_offset_ = 0;
139*993b0882SAndroid Build Coastguard Worker mutable int last_find_offset_codepoints_ = 0;
140*993b0882SAndroid Build Coastguard Worker mutable bool last_find_offset_dirty_ = true;
141*993b0882SAndroid Build Coastguard Worker };
142*993b0882SAndroid Build Coastguard Worker
143*993b0882SAndroid Build Coastguard Worker class RegexPattern {
144*993b0882SAndroid Build Coastguard Worker public:
145*993b0882SAndroid Build Coastguard Worker std::unique_ptr<RegexMatcher> Matcher(const UnicodeText& context) const;
146*993b0882SAndroid Build Coastguard Worker
147*993b0882SAndroid Build Coastguard Worker private:
148*993b0882SAndroid Build Coastguard Worker friend class UniLibBase;
149*993b0882SAndroid Build Coastguard Worker RegexPattern(const JniCache* jni_cache, const UnicodeText& pattern,
150*993b0882SAndroid Build Coastguard Worker bool lazy);
151*993b0882SAndroid Build Coastguard Worker Status LockedInitializeIfNotAlready() const;
152*993b0882SAndroid Build Coastguard Worker
153*993b0882SAndroid Build Coastguard Worker const JniCache* jni_cache_;
154*993b0882SAndroid Build Coastguard Worker
155*993b0882SAndroid Build Coastguard Worker // These members need to be mutable because of the lazy initialization.
156*993b0882SAndroid Build Coastguard Worker // NOTE: The Matcher method first ensures (using a lock) that the
157*993b0882SAndroid Build Coastguard Worker // initialization was attempted (by using LockedInitializeIfNotAlready) and
158*993b0882SAndroid Build Coastguard Worker // then can access them without locking.
159*993b0882SAndroid Build Coastguard Worker mutable std::mutex mutex_;
160*993b0882SAndroid Build Coastguard Worker mutable ScopedGlobalRef<jobject> pattern_;
161*993b0882SAndroid Build Coastguard Worker mutable bool initialized_;
162*993b0882SAndroid Build Coastguard Worker mutable bool initialization_failure_;
163*993b0882SAndroid Build Coastguard Worker mutable UnicodeText pattern_text_;
164*993b0882SAndroid Build Coastguard Worker };
165*993b0882SAndroid Build Coastguard Worker
166*993b0882SAndroid Build Coastguard Worker class BreakIterator {
167*993b0882SAndroid Build Coastguard Worker public:
168*993b0882SAndroid Build Coastguard Worker int Next();
169*993b0882SAndroid Build Coastguard Worker
170*993b0882SAndroid Build Coastguard Worker static constexpr int kDone = -1;
171*993b0882SAndroid Build Coastguard Worker
172*993b0882SAndroid Build Coastguard Worker private:
173*993b0882SAndroid Build Coastguard Worker friend class UniLibBase;
174*993b0882SAndroid Build Coastguard Worker BreakIterator(const JniCache* jni_cache, const UnicodeText& text);
175*993b0882SAndroid Build Coastguard Worker
176*993b0882SAndroid Build Coastguard Worker const JniCache* jni_cache_;
177*993b0882SAndroid Build Coastguard Worker ScopedGlobalRef<jstring> text_;
178*993b0882SAndroid Build Coastguard Worker ScopedGlobalRef<jobject> iterator_;
179*993b0882SAndroid Build Coastguard Worker int last_break_index_;
180*993b0882SAndroid Build Coastguard Worker int last_unicode_index_;
181*993b0882SAndroid Build Coastguard Worker };
182*993b0882SAndroid Build Coastguard Worker
183*993b0882SAndroid Build Coastguard Worker std::unique_ptr<RegexPattern> CreateRegexPattern(
184*993b0882SAndroid Build Coastguard Worker const UnicodeText& regex) const;
185*993b0882SAndroid Build Coastguard Worker std::unique_ptr<RegexPattern> CreateLazyRegexPattern(
186*993b0882SAndroid Build Coastguard Worker const UnicodeText& regex) const;
187*993b0882SAndroid Build Coastguard Worker std::unique_ptr<BreakIterator> CreateBreakIterator(
188*993b0882SAndroid Build Coastguard Worker const UnicodeText& text) const;
189*993b0882SAndroid Build Coastguard Worker
190*993b0882SAndroid Build Coastguard Worker private:
191*993b0882SAndroid Build Coastguard Worker template <class T>
192*993b0882SAndroid Build Coastguard Worker bool ParseInt(const UnicodeText& text, T* result) const;
193*993b0882SAndroid Build Coastguard Worker
194*993b0882SAndroid Build Coastguard Worker std::shared_ptr<JniCache> jni_cache_;
195*993b0882SAndroid Build Coastguard Worker };
196*993b0882SAndroid Build Coastguard Worker
197*993b0882SAndroid Build Coastguard Worker template <class T>
ParseInt(const UnicodeText & text,T * result)198*993b0882SAndroid Build Coastguard Worker bool UniLibBase::ParseInt(const UnicodeText& text, T* result) const {
199*993b0882SAndroid Build Coastguard Worker if (!jni_cache_) {
200*993b0882SAndroid Build Coastguard Worker return false;
201*993b0882SAndroid Build Coastguard Worker }
202*993b0882SAndroid Build Coastguard Worker
203*993b0882SAndroid Build Coastguard Worker // Avoid throwing exceptions when the text is unlikely to be a number.
204*993b0882SAndroid Build Coastguard Worker int32 result32 = 0;
205*993b0882SAndroid Build Coastguard Worker if (!PassesIntPreChesks(text, result32)) {
206*993b0882SAndroid Build Coastguard Worker return false;
207*993b0882SAndroid Build Coastguard Worker }
208*993b0882SAndroid Build Coastguard Worker
209*993b0882SAndroid Build Coastguard Worker JNIEnv* env = jni_cache_->GetEnv();
210*993b0882SAndroid Build Coastguard Worker TC3_ASSIGN_OR_RETURN_FALSE(const ScopedLocalRef<jstring> text_java,
211*993b0882SAndroid Build Coastguard Worker jni_cache_->ConvertToJavaString(text));
212*993b0882SAndroid Build Coastguard Worker TC3_ASSIGN_OR_RETURN_FALSE(
213*993b0882SAndroid Build Coastguard Worker *result,
214*993b0882SAndroid Build Coastguard Worker JniHelper::CallStaticIntMethod<T>(
215*993b0882SAndroid Build Coastguard Worker env,
216*993b0882SAndroid Build Coastguard Worker /*print_exception_on_error=*/false, jni_cache_->integer_class.get(),
217*993b0882SAndroid Build Coastguard Worker jni_cache_->integer_parse_int, text_java.get()));
218*993b0882SAndroid Build Coastguard Worker return true;
219*993b0882SAndroid Build Coastguard Worker }
220*993b0882SAndroid Build Coastguard Worker
221*993b0882SAndroid Build Coastguard Worker } // namespace libtextclassifier3
222*993b0882SAndroid Build Coastguard Worker
223*993b0882SAndroid Build Coastguard Worker #endif // LIBTEXTCLASSIFIER_UTILS_UTF8_UNILIB_JAVAICU_H_
224