1*993b0882SAndroid Build Coastguard Worker /* 2*993b0882SAndroid Build Coastguard Worker * Copyright (C) 2018 The Android Open Source Project 3*993b0882SAndroid Build Coastguard Worker * 4*993b0882SAndroid Build Coastguard Worker * Licensed under the Apache License, Version 2.0 (the "License"); 5*993b0882SAndroid Build Coastguard Worker * you may not use this file except in compliance with the License. 6*993b0882SAndroid Build Coastguard Worker * You may obtain a copy of the License at 7*993b0882SAndroid Build Coastguard Worker * 8*993b0882SAndroid Build Coastguard Worker * http://www.apache.org/licenses/LICENSE-2.0 9*993b0882SAndroid Build Coastguard Worker * 10*993b0882SAndroid Build Coastguard Worker * Unless required by applicable law or agreed to in writing, software 11*993b0882SAndroid Build Coastguard Worker * distributed under the License is distributed on an "AS IS" BASIS, 12*993b0882SAndroid Build Coastguard Worker * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13*993b0882SAndroid Build Coastguard Worker * See the License for the specific language governing permissions and 14*993b0882SAndroid Build Coastguard Worker * limitations under the License. 15*993b0882SAndroid Build Coastguard Worker */ 16*993b0882SAndroid Build Coastguard Worker 17*993b0882SAndroid Build Coastguard Worker // A lexer that (splits) and classifies tokens. 18*993b0882SAndroid Build Coastguard Worker // 19*993b0882SAndroid Build Coastguard Worker // Any whitespace gets absorbed into the token that follows them in the text. 20*993b0882SAndroid Build Coastguard Worker // For example, if the text contains: 21*993b0882SAndroid Build Coastguard Worker // 22*993b0882SAndroid Build Coastguard Worker // ...hello there world... 23*993b0882SAndroid Build Coastguard Worker // | | | 24*993b0882SAndroid Build Coastguard Worker // offset=16 39 52 25*993b0882SAndroid Build Coastguard Worker // 26*993b0882SAndroid Build Coastguard Worker // then the output will be: 27*993b0882SAndroid Build Coastguard Worker // 28*993b0882SAndroid Build Coastguard Worker // "hello" [?, 16) 29*993b0882SAndroid Build Coastguard Worker // "there" [16, 44) <-- note "16" NOT "39" 30*993b0882SAndroid Build Coastguard Worker // "world" [44, ?) <-- note "44" NOT "52" 31*993b0882SAndroid Build Coastguard Worker // 32*993b0882SAndroid Build Coastguard Worker // This makes it appear to the Matcher as if the tokens are adjacent. 33*993b0882SAndroid Build Coastguard Worker 34*993b0882SAndroid Build Coastguard Worker #ifndef LIBTEXTCLASSIFIER_UTILS_GRAMMAR_PARSING_LEXER_H_ 35*993b0882SAndroid Build Coastguard Worker #define LIBTEXTCLASSIFIER_UTILS_GRAMMAR_PARSING_LEXER_H_ 36*993b0882SAndroid Build Coastguard Worker 37*993b0882SAndroid Build Coastguard Worker #include <vector> 38*993b0882SAndroid Build Coastguard Worker 39*993b0882SAndroid Build Coastguard Worker #include "annotator/types.h" 40*993b0882SAndroid Build Coastguard Worker #include "utils/grammar/parsing/parse-tree.h" 41*993b0882SAndroid Build Coastguard Worker #include "utils/grammar/types.h" 42*993b0882SAndroid Build Coastguard Worker #include "utils/strings/stringpiece.h" 43*993b0882SAndroid Build Coastguard Worker #include "utils/utf8/unicodetext.h" 44*993b0882SAndroid Build Coastguard Worker #include "utils/utf8/unilib.h" 45*993b0882SAndroid Build Coastguard Worker 46*993b0882SAndroid Build Coastguard Worker namespace libtextclassifier3::grammar { 47*993b0882SAndroid Build Coastguard Worker 48*993b0882SAndroid Build Coastguard Worker // A lexical symbol with an identified meaning that represents raw tokens, 49*993b0882SAndroid Build Coastguard Worker // token categories or predefined text matches. 50*993b0882SAndroid Build Coastguard Worker // It is the unit fed to the grammar matcher. 51*993b0882SAndroid Build Coastguard Worker struct Symbol { 52*993b0882SAndroid Build Coastguard Worker // The type of the lexical symbol. 53*993b0882SAndroid Build Coastguard Worker enum class Type { 54*993b0882SAndroid Build Coastguard Worker // A raw token. 55*993b0882SAndroid Build Coastguard Worker TYPE_TERM, 56*993b0882SAndroid Build Coastguard Worker 57*993b0882SAndroid Build Coastguard Worker // A symbol representing a string of digits. 58*993b0882SAndroid Build Coastguard Worker TYPE_DIGITS, 59*993b0882SAndroid Build Coastguard Worker 60*993b0882SAndroid Build Coastguard Worker // Punctuation characters. 61*993b0882SAndroid Build Coastguard Worker TYPE_PUNCTUATION, 62*993b0882SAndroid Build Coastguard Worker 63*993b0882SAndroid Build Coastguard Worker // A predefined parse tree. 64*993b0882SAndroid Build Coastguard Worker TYPE_PARSE_TREE 65*993b0882SAndroid Build Coastguard Worker }; 66*993b0882SAndroid Build Coastguard Worker 67*993b0882SAndroid Build Coastguard Worker explicit Symbol() = default; 68*993b0882SAndroid Build Coastguard Worker 69*993b0882SAndroid Build Coastguard Worker // Constructs a symbol of a given type with an anchor in the text. SymbolSymbol70*993b0882SAndroid Build Coastguard Worker Symbol(const Type type, const CodepointSpan codepoint_span, 71*993b0882SAndroid Build Coastguard Worker const int match_offset, StringPiece lexeme) 72*993b0882SAndroid Build Coastguard Worker : type(type), 73*993b0882SAndroid Build Coastguard Worker codepoint_span(codepoint_span), 74*993b0882SAndroid Build Coastguard Worker match_offset(match_offset), 75*993b0882SAndroid Build Coastguard Worker lexeme(lexeme) {} 76*993b0882SAndroid Build Coastguard Worker 77*993b0882SAndroid Build Coastguard Worker // Constructs a symbol from a pre-defined parse tree. SymbolSymbol78*993b0882SAndroid Build Coastguard Worker explicit Symbol(ParseTree* parse_tree) 79*993b0882SAndroid Build Coastguard Worker : type(Type::TYPE_PARSE_TREE), 80*993b0882SAndroid Build Coastguard Worker codepoint_span(parse_tree->codepoint_span), 81*993b0882SAndroid Build Coastguard Worker match_offset(parse_tree->match_offset), 82*993b0882SAndroid Build Coastguard Worker parse_tree(parse_tree) {} 83*993b0882SAndroid Build Coastguard Worker 84*993b0882SAndroid Build Coastguard Worker // The type of the symbol. 85*993b0882SAndroid Build Coastguard Worker Type type; 86*993b0882SAndroid Build Coastguard Worker 87*993b0882SAndroid Build Coastguard Worker // The span in the text as codepoint offsets. 88*993b0882SAndroid Build Coastguard Worker CodepointSpan codepoint_span; 89*993b0882SAndroid Build Coastguard Worker 90*993b0882SAndroid Build Coastguard Worker // The match start offset (including preceding whitespace) as codepoint 91*993b0882SAndroid Build Coastguard Worker // offset. 92*993b0882SAndroid Build Coastguard Worker int match_offset; 93*993b0882SAndroid Build Coastguard Worker 94*993b0882SAndroid Build Coastguard Worker // The symbol text value. 95*993b0882SAndroid Build Coastguard Worker StringPiece lexeme; 96*993b0882SAndroid Build Coastguard Worker 97*993b0882SAndroid Build Coastguard Worker // The predefined parse tree. 98*993b0882SAndroid Build Coastguard Worker ParseTree* parse_tree; 99*993b0882SAndroid Build Coastguard Worker }; 100*993b0882SAndroid Build Coastguard Worker 101*993b0882SAndroid Build Coastguard Worker class Lexer { 102*993b0882SAndroid Build Coastguard Worker public: Lexer(const UniLib * unilib)103*993b0882SAndroid Build Coastguard Worker explicit Lexer(const UniLib* unilib) : unilib_(*unilib) {} 104*993b0882SAndroid Build Coastguard Worker 105*993b0882SAndroid Build Coastguard Worker // Processes a single token. 106*993b0882SAndroid Build Coastguard Worker // Splits a token into classified symbols. 107*993b0882SAndroid Build Coastguard Worker void AppendTokenSymbols(const StringPiece value, int match_offset, 108*993b0882SAndroid Build Coastguard Worker const CodepointSpan codepoint_span, 109*993b0882SAndroid Build Coastguard Worker std::vector<Symbol>* symbols) const; 110*993b0882SAndroid Build Coastguard Worker 111*993b0882SAndroid Build Coastguard Worker private: 112*993b0882SAndroid Build Coastguard Worker // Gets the type of a character. 113*993b0882SAndroid Build Coastguard Worker Symbol::Type GetSymbolType(const UnicodeText::const_iterator& it) const; 114*993b0882SAndroid Build Coastguard Worker 115*993b0882SAndroid Build Coastguard Worker const UniLib& unilib_; 116*993b0882SAndroid Build Coastguard Worker }; 117*993b0882SAndroid Build Coastguard Worker 118*993b0882SAndroid Build Coastguard Worker } // namespace libtextclassifier3::grammar 119*993b0882SAndroid Build Coastguard Worker 120*993b0882SAndroid Build Coastguard Worker #endif // LIBTEXTCLASSIFIER_UTILS_GRAMMAR_PARSING_LEXER_H_ 121