xref: /aosp_15_r20/external/libtextclassifier/native/utils/grammar/parsing/lexer.h (revision 993b0882672172b81d12fad7a7ac0c3e5c824a12)
1*993b0882SAndroid Build Coastguard Worker /*
2*993b0882SAndroid Build Coastguard Worker  * Copyright (C) 2018 The Android Open Source Project
3*993b0882SAndroid Build Coastguard Worker  *
4*993b0882SAndroid Build Coastguard Worker  * Licensed under the Apache License, Version 2.0 (the "License");
5*993b0882SAndroid Build Coastguard Worker  * you may not use this file except in compliance with the License.
6*993b0882SAndroid Build Coastguard Worker  * You may obtain a copy of the License at
7*993b0882SAndroid Build Coastguard Worker  *
8*993b0882SAndroid Build Coastguard Worker  *      http://www.apache.org/licenses/LICENSE-2.0
9*993b0882SAndroid Build Coastguard Worker  *
10*993b0882SAndroid Build Coastguard Worker  * Unless required by applicable law or agreed to in writing, software
11*993b0882SAndroid Build Coastguard Worker  * distributed under the License is distributed on an "AS IS" BASIS,
12*993b0882SAndroid Build Coastguard Worker  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13*993b0882SAndroid Build Coastguard Worker  * See the License for the specific language governing permissions and
14*993b0882SAndroid Build Coastguard Worker  * limitations under the License.
15*993b0882SAndroid Build Coastguard Worker  */
16*993b0882SAndroid Build Coastguard Worker 
// A lexer that (splits) and classifies tokens.
//
// Any whitespace gets absorbed into the token that follows it in the text.
// For example, if the text contains:
//
//      ...hello                       there        world...
//              |                      |            |
//              offset=16              39           52
//
// then the output will be:
//
//      "hello" [?, 16)
//      "there" [16, 44)      <-- note "16" NOT "39"
//      "world" [44, ?)       <-- note "44" NOT "52"
//
// This makes it appear to the Matcher as if the tokens are adjacent.
33*993b0882SAndroid Build Coastguard Worker 
34*993b0882SAndroid Build Coastguard Worker #ifndef LIBTEXTCLASSIFIER_UTILS_GRAMMAR_PARSING_LEXER_H_
35*993b0882SAndroid Build Coastguard Worker #define LIBTEXTCLASSIFIER_UTILS_GRAMMAR_PARSING_LEXER_H_
36*993b0882SAndroid Build Coastguard Worker 
37*993b0882SAndroid Build Coastguard Worker #include <vector>
38*993b0882SAndroid Build Coastguard Worker 
39*993b0882SAndroid Build Coastguard Worker #include "annotator/types.h"
40*993b0882SAndroid Build Coastguard Worker #include "utils/grammar/parsing/parse-tree.h"
41*993b0882SAndroid Build Coastguard Worker #include "utils/grammar/types.h"
42*993b0882SAndroid Build Coastguard Worker #include "utils/strings/stringpiece.h"
43*993b0882SAndroid Build Coastguard Worker #include "utils/utf8/unicodetext.h"
44*993b0882SAndroid Build Coastguard Worker #include "utils/utf8/unilib.h"
45*993b0882SAndroid Build Coastguard Worker 
46*993b0882SAndroid Build Coastguard Worker namespace libtextclassifier3::grammar {
47*993b0882SAndroid Build Coastguard Worker 
48*993b0882SAndroid Build Coastguard Worker // A lexical symbol with an identified meaning that represents raw tokens,
49*993b0882SAndroid Build Coastguard Worker // token categories or predefined text matches.
50*993b0882SAndroid Build Coastguard Worker // It is the unit fed to the grammar matcher.
51*993b0882SAndroid Build Coastguard Worker struct Symbol {
52*993b0882SAndroid Build Coastguard Worker   // The type of the lexical symbol.
53*993b0882SAndroid Build Coastguard Worker   enum class Type {
54*993b0882SAndroid Build Coastguard Worker     // A raw token.
55*993b0882SAndroid Build Coastguard Worker     TYPE_TERM,
56*993b0882SAndroid Build Coastguard Worker 
57*993b0882SAndroid Build Coastguard Worker     // A symbol representing a string of digits.
58*993b0882SAndroid Build Coastguard Worker     TYPE_DIGITS,
59*993b0882SAndroid Build Coastguard Worker 
60*993b0882SAndroid Build Coastguard Worker     // Punctuation characters.
61*993b0882SAndroid Build Coastguard Worker     TYPE_PUNCTUATION,
62*993b0882SAndroid Build Coastguard Worker 
63*993b0882SAndroid Build Coastguard Worker     // A predefined parse tree.
64*993b0882SAndroid Build Coastguard Worker     TYPE_PARSE_TREE
65*993b0882SAndroid Build Coastguard Worker   };
66*993b0882SAndroid Build Coastguard Worker 
67*993b0882SAndroid Build Coastguard Worker   explicit Symbol() = default;
68*993b0882SAndroid Build Coastguard Worker 
69*993b0882SAndroid Build Coastguard Worker   // Constructs a symbol of a given type with an anchor in the text.
SymbolSymbol70*993b0882SAndroid Build Coastguard Worker   Symbol(const Type type, const CodepointSpan codepoint_span,
71*993b0882SAndroid Build Coastguard Worker          const int match_offset, StringPiece lexeme)
72*993b0882SAndroid Build Coastguard Worker       : type(type),
73*993b0882SAndroid Build Coastguard Worker         codepoint_span(codepoint_span),
74*993b0882SAndroid Build Coastguard Worker         match_offset(match_offset),
75*993b0882SAndroid Build Coastguard Worker         lexeme(lexeme) {}
76*993b0882SAndroid Build Coastguard Worker 
77*993b0882SAndroid Build Coastguard Worker   // Constructs a symbol from a pre-defined parse tree.
SymbolSymbol78*993b0882SAndroid Build Coastguard Worker   explicit Symbol(ParseTree* parse_tree)
79*993b0882SAndroid Build Coastguard Worker       : type(Type::TYPE_PARSE_TREE),
80*993b0882SAndroid Build Coastguard Worker         codepoint_span(parse_tree->codepoint_span),
81*993b0882SAndroid Build Coastguard Worker         match_offset(parse_tree->match_offset),
82*993b0882SAndroid Build Coastguard Worker         parse_tree(parse_tree) {}
83*993b0882SAndroid Build Coastguard Worker 
84*993b0882SAndroid Build Coastguard Worker   // The type of the symbol.
85*993b0882SAndroid Build Coastguard Worker   Type type;
86*993b0882SAndroid Build Coastguard Worker 
87*993b0882SAndroid Build Coastguard Worker   // The span in the text as codepoint offsets.
88*993b0882SAndroid Build Coastguard Worker   CodepointSpan codepoint_span;
89*993b0882SAndroid Build Coastguard Worker 
90*993b0882SAndroid Build Coastguard Worker   // The match start offset (including preceding whitespace) as codepoint
91*993b0882SAndroid Build Coastguard Worker   // offset.
92*993b0882SAndroid Build Coastguard Worker   int match_offset;
93*993b0882SAndroid Build Coastguard Worker 
94*993b0882SAndroid Build Coastguard Worker   // The symbol text value.
95*993b0882SAndroid Build Coastguard Worker   StringPiece lexeme;
96*993b0882SAndroid Build Coastguard Worker 
97*993b0882SAndroid Build Coastguard Worker   // The predefined parse tree.
98*993b0882SAndroid Build Coastguard Worker   ParseTree* parse_tree;
99*993b0882SAndroid Build Coastguard Worker };
100*993b0882SAndroid Build Coastguard Worker 
101*993b0882SAndroid Build Coastguard Worker class Lexer {
102*993b0882SAndroid Build Coastguard Worker  public:
Lexer(const UniLib * unilib)103*993b0882SAndroid Build Coastguard Worker   explicit Lexer(const UniLib* unilib) : unilib_(*unilib) {}
104*993b0882SAndroid Build Coastguard Worker 
105*993b0882SAndroid Build Coastguard Worker   // Processes a single token.
106*993b0882SAndroid Build Coastguard Worker   // Splits a token into classified symbols.
107*993b0882SAndroid Build Coastguard Worker   void AppendTokenSymbols(const StringPiece value, int match_offset,
108*993b0882SAndroid Build Coastguard Worker                           const CodepointSpan codepoint_span,
109*993b0882SAndroid Build Coastguard Worker                           std::vector<Symbol>* symbols) const;
110*993b0882SAndroid Build Coastguard Worker 
111*993b0882SAndroid Build Coastguard Worker  private:
112*993b0882SAndroid Build Coastguard Worker   // Gets the type of a character.
113*993b0882SAndroid Build Coastguard Worker   Symbol::Type GetSymbolType(const UnicodeText::const_iterator& it) const;
114*993b0882SAndroid Build Coastguard Worker 
115*993b0882SAndroid Build Coastguard Worker   const UniLib& unilib_;
116*993b0882SAndroid Build Coastguard Worker };
117*993b0882SAndroid Build Coastguard Worker 
118*993b0882SAndroid Build Coastguard Worker }  // namespace libtextclassifier3::grammar
119*993b0882SAndroid Build Coastguard Worker 
120*993b0882SAndroid Build Coastguard Worker #endif  // LIBTEXTCLASSIFIER_UTILS_GRAMMAR_PARSING_LEXER_H_
121