xref: /aosp_15_r20/external/icing/icing/query/advanced_query_parser/lexer.h (revision 8b6cd535a057e39b3b86660c4aa06c99747c2136)
1 // Copyright (C) 2022 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #ifndef ICING_QUERY_ADVANCED_QUERY_PARSER_LEXER_H_
16 #define ICING_QUERY_ADVANCED_QUERY_PARSER_LEXER_H_
17 
18 #include <cstdint>
19 #include <string>
20 #include <string_view>
21 #include <vector>
22 
23 #include "icing/text_classifier/lib3/utils/base/statusor.h"
24 
25 namespace icing {
26 namespace lib {
27 
28 class Lexer {
29  public:
30   enum class Language { QUERY, SCORING };
31 
32   // The maximum number of tokens allowed, in order to prevent stack overflow
33   // issues in the parsers or visitors.
34   static constexpr uint32_t kMaxNumTokens = 2048;
35 
36   enum class TokenType {
37     COMMA,       // ','
38     DOT,         // '.'
39     PLUS,        // '+'            Not allowed in QUERY language.
40     MINUS,       // '-'
41     STAR,        // '*'            Not allowed in SCORING language.
42     TIMES,       // '*'            Not allowed in QUERY language.
43     DIV,         // '/'            Not allowed in QUERY language.
44     LPAREN,      // '('
45     RPAREN,      // ')'
46     COMPARATOR,  // '<=' | '<' | '>=' | '>' | '!=' | '==' | ':'
47                  //                Not allowed in SCORING language.
48     AND,         // 'AND' | '&&'   Not allowed in SCORING language.
49     OR,          // 'OR' | '||'    Not allowed in SCORING language.
50     NOT,         // 'NOT'          Not allowed in SCORING language.
51     STRING,      // String literal surrounded by quotation marks. The
52                  // original_text of a STRING token will not include quotation
53                  // marks.
54     TEXT,        // A sequence of chars that are not any above-listed operator
55     FUNCTION_NAME,  // A TEXT followed by LPAREN.
56     // Whitespaces not inside a string literal will be skipped.
57     // WS: " " | "\t" | "\n" | "\r" | "\f" -> skip ;
58   };
59 
60   struct LexerToken {
61     // For STRING, text will contain the raw original text of the token
62     // in between quotation marks, without unescaping.
63     //
64     // For TEXT, text will contain the text of the token after unescaping all
65     // escaped characters.
66     //
67     // For FUNCTION_NAME, this field will contain the name of the function.
68     //
69     // For COMPARATOR, this field will contain the comparator.
70     //
71     // For other types, this field will be empty.
72     std::string text;
73 
74     // Lifecycle is dependent on the lifecycle of the string pointed to by
75     // query_.
76     std::string_view original_text;
77 
78     // The type of the token.
79     TokenType type;
80   };
81 
Lexer(std::string_view query,Language language)82   explicit Lexer(std::string_view query, Language language)
83       : query_(query), language_(language) {
84     Advance();
85   }
86 
87   // Get a vector of LexerToken after lexing the query given in the constructor.
88   //
89   // Returns:
90   //   A vector of LexerToken on success
91   //   INVALID_ARGUMENT on syntax error.
92   libtextclassifier3::StatusOr<std::vector<LexerToken>> ExtractTokens() &&;
93 
94  private:
95   // Advance to current_index_ + n.
96   void Advance(uint32_t n = 1) {
97     if (current_index_ + n >= query_.size()) {
98       current_index_ = query_.size();
99       current_char_ = '\0';
100     } else {
101       current_index_ += n;
102       current_char_ = query_[current_index_];
103     }
104   }
105 
106   // Get the character at current_index_ + n.
107   char PeekNext(uint32_t n = 1) {
108     if (current_index_ + n >= query_.size()) {
109       return '\0';
110     } else {
111       return query_[current_index_ + n];
112     }
113   }
114 
SyntaxError(std::string error)115   void SyntaxError(std::string error) {
116     current_index_ = query_.size();
117     current_char_ = '\0';
118     error_ = std::move(error);
119   }
120 
121   // Try to match a whitespace token and skip it.
122   bool ConsumeWhitespace();
123 
124   // Try to match a single-char token other than '<' and '>'.
125   bool ConsumeSingleChar();
126   bool ConsumeQuerySingleChar();
127   bool ConsumeScoringSingleChar();
128   bool ConsumeGeneralSingleChar();
129 
130   // Try to match a comparator token other than ':'.
131   bool ConsumeComparator();
132 
133   // Try to match '&&' and '||'.
134   // 'AND' and 'OR' will be handled in Text() instead, so that 'ANDfoo' and
135   // 'fooOR' is a TEXT, instead of an 'AND' or 'OR'.
136   bool ConsumeAndOr();
137 
138   // Try to match a string literal.
139   bool ConsumeStringLiteral();
140 
141   // Try to match a non-text.
ConsumeNonText()142   bool ConsumeNonText() {
143     return ConsumeWhitespace() || ConsumeSingleChar() ||
144            (language_ == Language::QUERY && ConsumeComparator()) ||
145            (language_ == Language::QUERY && ConsumeAndOr()) ||
146            ConsumeStringLiteral();
147   }
148 
149   // Try to match TEXT, FUNCTION_NAME, 'AND', 'OR' and 'NOT'.
150   // REQUIRES: ConsumeNonText() must be called immediately before calling this
151   // function.
152   bool ConsumeText();
153 
154   std::string_view query_;
155   std::string error_;
156   Language language_;
157   int32_t current_index_ = -1;
158   char current_char_ = '\0';
159   std::vector<LexerToken> tokens_;
160 
161   // Stores whether the lexer is currently inspecting a TEXT segment while
162   // handling current_char_.
163   bool in_text_ = false;
164 };
165 
166 }  // namespace lib
167 }  // namespace icing
168 
169 #endif  // ICING_QUERY_ADVANCED_QUERY_PARSER_LEXER_H_
170