xref: /aosp_15_r20/external/libtextclassifier/native/annotator/number/number.h (revision 993b0882672172b81d12fad7a7ac0c3e5c824a12)
/*
 * Copyright (C) 2018 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16*993b0882SAndroid Build Coastguard Worker 
17*993b0882SAndroid Build Coastguard Worker #ifndef LIBTEXTCLASSIFIER_ANNOTATOR_NUMBER_NUMBER_H_
18*993b0882SAndroid Build Coastguard Worker #define LIBTEXTCLASSIFIER_ANNOTATOR_NUMBER_NUMBER_H_
19*993b0882SAndroid Build Coastguard Worker 
20*993b0882SAndroid Build Coastguard Worker #include <string>
21*993b0882SAndroid Build Coastguard Worker #include <unordered_set>
22*993b0882SAndroid Build Coastguard Worker #include <vector>
23*993b0882SAndroid Build Coastguard Worker 
24*993b0882SAndroid Build Coastguard Worker #include "annotator/model_generated.h"
25*993b0882SAndroid Build Coastguard Worker #include "annotator/types.h"
26*993b0882SAndroid Build Coastguard Worker #include "utils/base/logging.h"
27*993b0882SAndroid Build Coastguard Worker #include "utils/container/sorted-strings-table.h"
28*993b0882SAndroid Build Coastguard Worker #include "utils/tokenizer.h"
29*993b0882SAndroid Build Coastguard Worker #include "utils/utf8/unicodetext.h"
30*993b0882SAndroid Build Coastguard Worker 
31*993b0882SAndroid Build Coastguard Worker namespace libtextclassifier3 {
32*993b0882SAndroid Build Coastguard Worker 
33*993b0882SAndroid Build Coastguard Worker // Annotator of numbers in text.
34*993b0882SAndroid Build Coastguard Worker //
35*993b0882SAndroid Build Coastguard Worker // Integer supported values are in range [-1 000 000 000, 1 000 000 000].
36*993b0882SAndroid Build Coastguard Worker // Doble supposted values are in range [-999999999.999999999,
37*993b0882SAndroid Build Coastguard Worker // 999999999.999999999].
38*993b0882SAndroid Build Coastguard Worker class NumberAnnotator {
39*993b0882SAndroid Build Coastguard Worker  public:
NumberAnnotator(const NumberAnnotatorOptions * options,const UniLib * unilib)40*993b0882SAndroid Build Coastguard Worker   explicit NumberAnnotator(const NumberAnnotatorOptions* options,
41*993b0882SAndroid Build Coastguard Worker                            const UniLib* unilib)
42*993b0882SAndroid Build Coastguard Worker       : options_(options),
43*993b0882SAndroid Build Coastguard Worker         unilib_(unilib),
44*993b0882SAndroid Build Coastguard Worker         tokenizer_(Tokenizer(TokenizationType_LETTER_DIGIT, unilib,
45*993b0882SAndroid Build Coastguard Worker                              /*codepoint_ranges=*/{},
46*993b0882SAndroid Build Coastguard Worker                              /*internal_tokenizer_codepoint_ranges=*/{},
47*993b0882SAndroid Build Coastguard Worker                              /*split_on_script_change=*/false,
48*993b0882SAndroid Build Coastguard Worker                              /*icu_preserve_whitespace_tokens=*/true)),
49*993b0882SAndroid Build Coastguard Worker         percent_suffixes_(FromFlatbufferStringToUnordredSet(
50*993b0882SAndroid Build Coastguard Worker             options_->percentage_pieces_string())),
51*993b0882SAndroid Build Coastguard Worker         max_number_of_digits_(options->max_number_of_digits()) {}
52*993b0882SAndroid Build Coastguard Worker 
53*993b0882SAndroid Build Coastguard Worker   // Classifies given text, and if it is a number, it passes the result in
54*993b0882SAndroid Build Coastguard Worker   // 'classification_result' and returns true, otherwise returns false.
55*993b0882SAndroid Build Coastguard Worker   bool ClassifyText(const UnicodeText& context, CodepointSpan selection_indices,
56*993b0882SAndroid Build Coastguard Worker                     AnnotationUsecase annotation_usecase,
57*993b0882SAndroid Build Coastguard Worker                     ClassificationResult* classification_result) const;
58*993b0882SAndroid Build Coastguard Worker 
59*993b0882SAndroid Build Coastguard Worker   // Finds all number instances in the input text. Returns true in any case.
60*993b0882SAndroid Build Coastguard Worker   bool FindAll(const UnicodeText& context_unicode,
61*993b0882SAndroid Build Coastguard Worker                AnnotationUsecase annotation_usecase, ModeFlag mode,
62*993b0882SAndroid Build Coastguard Worker                std::vector<AnnotatedSpan>* result) const;
63*993b0882SAndroid Build Coastguard Worker 
64*993b0882SAndroid Build Coastguard Worker  private:
65*993b0882SAndroid Build Coastguard Worker   // Converts a Flatbuffer string containing zero-separated percent suffixes
66*993b0882SAndroid Build Coastguard Worker   // to an unordered set.
67*993b0882SAndroid Build Coastguard Worker   static std::unordered_set<std::string> FromFlatbufferStringToUnordredSet(
68*993b0882SAndroid Build Coastguard Worker       const flatbuffers::String* flatbuffer_percent_strings);
69*993b0882SAndroid Build Coastguard Worker 
70*993b0882SAndroid Build Coastguard Worker   // Checks if the annotated numbers from the context represent percentages.
71*993b0882SAndroid Build Coastguard Worker   // If yes, replaces the collection type and the annotation boundary in the
72*993b0882SAndroid Build Coastguard Worker   // result.
73*993b0882SAndroid Build Coastguard Worker   void FindPercentages(const UnicodeText& context,
74*993b0882SAndroid Build Coastguard Worker                        std::vector<AnnotatedSpan>* result) const;
75*993b0882SAndroid Build Coastguard Worker 
76*993b0882SAndroid Build Coastguard Worker   // Checks if the tokens from in the interval [start_index-2, start_index] are
77*993b0882SAndroid Build Coastguard Worker   // valid characters that can preced a number context.
78*993b0882SAndroid Build Coastguard Worker   bool TokensAreValidStart(const std::vector<Token>& tokens,
79*993b0882SAndroid Build Coastguard Worker                            int start_index) const;
80*993b0882SAndroid Build Coastguard Worker 
81*993b0882SAndroid Build Coastguard Worker   // Checks if the tokens in the interval (..., prefix_end_index] are a valid
82*993b0882SAndroid Build Coastguard Worker   // number prefix.
83*993b0882SAndroid Build Coastguard Worker   bool TokensAreValidNumberPrefix(const std::vector<Token>& tokens,
84*993b0882SAndroid Build Coastguard Worker                                   int prefix_end_index) const;
85*993b0882SAndroid Build Coastguard Worker 
86*993b0882SAndroid Build Coastguard Worker   // Checks if the tokens from in the interval [ending_index, ending_index+2]
87*993b0882SAndroid Build Coastguard Worker   // are valid characters that can follow a number context.
88*993b0882SAndroid Build Coastguard Worker   bool TokensAreValidEnding(const std::vector<Token>& tokens,
89*993b0882SAndroid Build Coastguard Worker                             int ending_index) const;
90*993b0882SAndroid Build Coastguard Worker 
91*993b0882SAndroid Build Coastguard Worker   // Checks if the tokens in the interval [suffix_start_index, ...) are a valid
92*993b0882SAndroid Build Coastguard Worker   // number suffix.
93*993b0882SAndroid Build Coastguard Worker   bool TokensAreValidNumberSuffix(const std::vector<Token>& tokens,
94*993b0882SAndroid Build Coastguard Worker                                   int suffix_start_index) const;
95*993b0882SAndroid Build Coastguard Worker 
96*993b0882SAndroid Build Coastguard Worker   // Checks if the tokens in the interval [suffix_start_index, ...) are a valid
97*993b0882SAndroid Build Coastguard Worker   // percent suffix. If false, returns -1, else returns the end codepoint.
98*993b0882SAndroid Build Coastguard Worker   int FindPercentSuffixEndCodepoint(const std::vector<Token>& tokens,
99*993b0882SAndroid Build Coastguard Worker                                     int suffix_token_start_index) const;
100*993b0882SAndroid Build Coastguard Worker 
101*993b0882SAndroid Build Coastguard Worker   // Checks if the given text represents a number (either int or double).
102*993b0882SAndroid Build Coastguard Worker   bool TryParseNumber(const UnicodeText& token_text, bool is_negative,
103*993b0882SAndroid Build Coastguard Worker                       int64* parsed_int_value,
104*993b0882SAndroid Build Coastguard Worker                       double* parsed_double_value) const;
105*993b0882SAndroid Build Coastguard Worker 
106*993b0882SAndroid Build Coastguard Worker   // Checks if a word contains only CJT characters.
107*993b0882SAndroid Build Coastguard Worker   bool IsCJTterm(UnicodeText::const_iterator token_begin_it,
108*993b0882SAndroid Build Coastguard Worker                  int token_length) const;
109*993b0882SAndroid Build Coastguard Worker 
110*993b0882SAndroid Build Coastguard Worker   AnnotatedSpan CreateAnnotatedSpan(int start, int end, int int_value,
111*993b0882SAndroid Build Coastguard Worker                                     double double_value,
112*993b0882SAndroid Build Coastguard Worker                                     const std::string collection, float score,
113*993b0882SAndroid Build Coastguard Worker                                     float priority_score) const;
114*993b0882SAndroid Build Coastguard Worker 
115*993b0882SAndroid Build Coastguard Worker   const NumberAnnotatorOptions* options_;
116*993b0882SAndroid Build Coastguard Worker   const UniLib* unilib_;
117*993b0882SAndroid Build Coastguard Worker   const Tokenizer tokenizer_;
118*993b0882SAndroid Build Coastguard Worker   const std::unordered_set<std::string> percent_suffixes_;
119*993b0882SAndroid Build Coastguard Worker   const int max_number_of_digits_;
120*993b0882SAndroid Build Coastguard Worker };
121*993b0882SAndroid Build Coastguard Worker 
122*993b0882SAndroid Build Coastguard Worker }  // namespace libtextclassifier3
123*993b0882SAndroid Build Coastguard Worker 
124*993b0882SAndroid Build Coastguard Worker #endif  // LIBTEXTCLASSIFIER_ANNOTATOR_NUMBER_NUMBER_H_
125