1*993b0882SAndroid Build Coastguard Worker /* 2*993b0882SAndroid Build Coastguard Worker * Copyright (C) 2018 The Android Open Source Project 3*993b0882SAndroid Build Coastguard Worker * 4*993b0882SAndroid Build Coastguard Worker * Licensed under the Apache License, Version 2.0 (the "License"); 5*993b0882SAndroid Build Coastguard Worker * you may not use this file except in compliance with the License. 6*993b0882SAndroid Build Coastguard Worker * You may obtain a copy of the License at 7*993b0882SAndroid Build Coastguard Worker * 8*993b0882SAndroid Build Coastguard Worker * http://www.apache.org/licenses/LICENSE-2.0 9*993b0882SAndroid Build Coastguard Worker * 10*993b0882SAndroid Build Coastguard Worker * Unless required by applicable law or agreed to in writing, software 11*993b0882SAndroid Build Coastguard Worker * distributed under the License is distributed on an "AS IS" BASIS, 12*993b0882SAndroid Build Coastguard Worker * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13*993b0882SAndroid Build Coastguard Worker * See the License for the specific language governing permissions and 14*993b0882SAndroid Build Coastguard Worker * limitations under the License. 
15*993b0882SAndroid Build Coastguard Worker */ 16*993b0882SAndroid Build Coastguard Worker 17*993b0882SAndroid Build Coastguard Worker #ifndef LIBTEXTCLASSIFIER_ANNOTATOR_NUMBER_NUMBER_H_ 18*993b0882SAndroid Build Coastguard Worker #define LIBTEXTCLASSIFIER_ANNOTATOR_NUMBER_NUMBER_H_ 19*993b0882SAndroid Build Coastguard Worker 20*993b0882SAndroid Build Coastguard Worker #include <string> 21*993b0882SAndroid Build Coastguard Worker #include <unordered_set> 22*993b0882SAndroid Build Coastguard Worker #include <vector> 23*993b0882SAndroid Build Coastguard Worker 24*993b0882SAndroid Build Coastguard Worker #include "annotator/model_generated.h" 25*993b0882SAndroid Build Coastguard Worker #include "annotator/types.h" 26*993b0882SAndroid Build Coastguard Worker #include "utils/base/logging.h" 27*993b0882SAndroid Build Coastguard Worker #include "utils/container/sorted-strings-table.h" 28*993b0882SAndroid Build Coastguard Worker #include "utils/tokenizer.h" 29*993b0882SAndroid Build Coastguard Worker #include "utils/utf8/unicodetext.h" 30*993b0882SAndroid Build Coastguard Worker 31*993b0882SAndroid Build Coastguard Worker namespace libtextclassifier3 { 32*993b0882SAndroid Build Coastguard Worker 33*993b0882SAndroid Build Coastguard Worker // Annotator of numbers in text. 34*993b0882SAndroid Build Coastguard Worker // 35*993b0882SAndroid Build Coastguard Worker // Integer supported values are in range [-1 000 000 000, 1 000 000 000]. 36*993b0882SAndroid Build Coastguard Worker // Double supported values are in range [-999999999.999999999, 37*993b0882SAndroid Build Coastguard Worker // 999999999.999999999]. 
38*993b0882SAndroid Build Coastguard Worker class NumberAnnotator { 39*993b0882SAndroid Build Coastguard Worker public: NumberAnnotator(const NumberAnnotatorOptions * options,const UniLib * unilib)40*993b0882SAndroid Build Coastguard Worker explicit NumberAnnotator(const NumberAnnotatorOptions* options, 41*993b0882SAndroid Build Coastguard Worker const UniLib* unilib) 42*993b0882SAndroid Build Coastguard Worker : options_(options), 43*993b0882SAndroid Build Coastguard Worker unilib_(unilib), 44*993b0882SAndroid Build Coastguard Worker tokenizer_(Tokenizer(TokenizationType_LETTER_DIGIT, unilib, 45*993b0882SAndroid Build Coastguard Worker /*codepoint_ranges=*/{}, 46*993b0882SAndroid Build Coastguard Worker /*internal_tokenizer_codepoint_ranges=*/{}, 47*993b0882SAndroid Build Coastguard Worker /*split_on_script_change=*/false, 48*993b0882SAndroid Build Coastguard Worker /*icu_preserve_whitespace_tokens=*/true)), 49*993b0882SAndroid Build Coastguard Worker percent_suffixes_(FromFlatbufferStringToUnordredSet( 50*993b0882SAndroid Build Coastguard Worker options_->percentage_pieces_string())), 51*993b0882SAndroid Build Coastguard Worker max_number_of_digits_(options->max_number_of_digits()) {} 52*993b0882SAndroid Build Coastguard Worker 53*993b0882SAndroid Build Coastguard Worker // Classifies given text, and if it is a number, it passes the result in 54*993b0882SAndroid Build Coastguard Worker // 'classification_result' and returns true, otherwise returns false. 55*993b0882SAndroid Build Coastguard Worker bool ClassifyText(const UnicodeText& context, CodepointSpan selection_indices, 56*993b0882SAndroid Build Coastguard Worker AnnotationUsecase annotation_usecase, 57*993b0882SAndroid Build Coastguard Worker ClassificationResult* classification_result) const; 58*993b0882SAndroid Build Coastguard Worker 59*993b0882SAndroid Build Coastguard Worker // Finds all number instances in the input text. Returns true in any case. 
60*993b0882SAndroid Build Coastguard Worker bool FindAll(const UnicodeText& context_unicode, 61*993b0882SAndroid Build Coastguard Worker AnnotationUsecase annotation_usecase, ModeFlag mode, 62*993b0882SAndroid Build Coastguard Worker std::vector<AnnotatedSpan>* result) const; 63*993b0882SAndroid Build Coastguard Worker 64*993b0882SAndroid Build Coastguard Worker private: 65*993b0882SAndroid Build Coastguard Worker // Converts a Flatbuffer string containing zero-separated percent suffixes 66*993b0882SAndroid Build Coastguard Worker // to an unordered set. 67*993b0882SAndroid Build Coastguard Worker static std::unordered_set<std::string> FromFlatbufferStringToUnordredSet( 68*993b0882SAndroid Build Coastguard Worker const flatbuffers::String* flatbuffer_percent_strings); 69*993b0882SAndroid Build Coastguard Worker 70*993b0882SAndroid Build Coastguard Worker // Checks if the annotated numbers from the context represent percentages. 71*993b0882SAndroid Build Coastguard Worker // If yes, replaces the collection type and the annotation boundary in the 72*993b0882SAndroid Build Coastguard Worker // result. 73*993b0882SAndroid Build Coastguard Worker void FindPercentages(const UnicodeText& context, 74*993b0882SAndroid Build Coastguard Worker std::vector<AnnotatedSpan>* result) const; 75*993b0882SAndroid Build Coastguard Worker 76*993b0882SAndroid Build Coastguard Worker // Checks if the tokens from in the interval [start_index-2, start_index] are 77*993b0882SAndroid Build Coastguard Worker // valid characters that can preced a number context. 78*993b0882SAndroid Build Coastguard Worker bool TokensAreValidStart(const std::vector<Token>& tokens, 79*993b0882SAndroid Build Coastguard Worker int start_index) const; 80*993b0882SAndroid Build Coastguard Worker 81*993b0882SAndroid Build Coastguard Worker // Checks if the tokens in the interval (..., prefix_end_index] are a valid 82*993b0882SAndroid Build Coastguard Worker // number prefix. 
83*993b0882SAndroid Build Coastguard Worker bool TokensAreValidNumberPrefix(const std::vector<Token>& tokens, 84*993b0882SAndroid Build Coastguard Worker int prefix_end_index) const; 85*993b0882SAndroid Build Coastguard Worker 86*993b0882SAndroid Build Coastguard Worker // Checks if the tokens from in the interval [ending_index, ending_index+2] 87*993b0882SAndroid Build Coastguard Worker // are valid characters that can follow a number context. 88*993b0882SAndroid Build Coastguard Worker bool TokensAreValidEnding(const std::vector<Token>& tokens, 89*993b0882SAndroid Build Coastguard Worker int ending_index) const; 90*993b0882SAndroid Build Coastguard Worker 91*993b0882SAndroid Build Coastguard Worker // Checks if the tokens in the interval [suffix_start_index, ...) are a valid 92*993b0882SAndroid Build Coastguard Worker // number suffix. 93*993b0882SAndroid Build Coastguard Worker bool TokensAreValidNumberSuffix(const std::vector<Token>& tokens, 94*993b0882SAndroid Build Coastguard Worker int suffix_start_index) const; 95*993b0882SAndroid Build Coastguard Worker 96*993b0882SAndroid Build Coastguard Worker // Checks if the tokens in the interval [suffix_start_index, ...) are a valid 97*993b0882SAndroid Build Coastguard Worker // percent suffix. If false, returns -1, else returns the end codepoint. 98*993b0882SAndroid Build Coastguard Worker int FindPercentSuffixEndCodepoint(const std::vector<Token>& tokens, 99*993b0882SAndroid Build Coastguard Worker int suffix_token_start_index) const; 100*993b0882SAndroid Build Coastguard Worker 101*993b0882SAndroid Build Coastguard Worker // Checks if the given text represents a number (either int or double). 
102*993b0882SAndroid Build Coastguard Worker bool TryParseNumber(const UnicodeText& token_text, bool is_negative, 103*993b0882SAndroid Build Coastguard Worker int64* parsed_int_value, 104*993b0882SAndroid Build Coastguard Worker double* parsed_double_value) const; 105*993b0882SAndroid Build Coastguard Worker 106*993b0882SAndroid Build Coastguard Worker // Checks if a word contains only CJT characters. 107*993b0882SAndroid Build Coastguard Worker bool IsCJTterm(UnicodeText::const_iterator token_begin_it, 108*993b0882SAndroid Build Coastguard Worker int token_length) const; 109*993b0882SAndroid Build Coastguard Worker 110*993b0882SAndroid Build Coastguard Worker AnnotatedSpan CreateAnnotatedSpan(int start, int end, int int_value, 111*993b0882SAndroid Build Coastguard Worker double double_value, 112*993b0882SAndroid Build Coastguard Worker const std::string collection, float score, 113*993b0882SAndroid Build Coastguard Worker float priority_score) const; 114*993b0882SAndroid Build Coastguard Worker 115*993b0882SAndroid Build Coastguard Worker const NumberAnnotatorOptions* options_; 116*993b0882SAndroid Build Coastguard Worker const UniLib* unilib_; 117*993b0882SAndroid Build Coastguard Worker const Tokenizer tokenizer_; 118*993b0882SAndroid Build Coastguard Worker const std::unordered_set<std::string> percent_suffixes_; 119*993b0882SAndroid Build Coastguard Worker const int max_number_of_digits_; 120*993b0882SAndroid Build Coastguard Worker }; 121*993b0882SAndroid Build Coastguard Worker 122*993b0882SAndroid Build Coastguard Worker } // namespace libtextclassifier3 123*993b0882SAndroid Build Coastguard Worker 124*993b0882SAndroid Build Coastguard Worker #endif // LIBTEXTCLASSIFIER_ANNOTATOR_NUMBER_NUMBER_H_ 125