1 // Copyright (C) 2022 Google LLC 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 #ifndef ICING_QUERY_ADVANCED_QUERY_PARSER_LEXER_H_ 16 #define ICING_QUERY_ADVANCED_QUERY_PARSER_LEXER_H_ 17 18 #include <cstdint> 19 #include <string> 20 #include <string_view> 21 #include <vector> 22 23 #include "icing/text_classifier/lib3/utils/base/statusor.h" 24 25 namespace icing { 26 namespace lib { 27 28 class Lexer { 29 public: 30 enum class Language { QUERY, SCORING }; 31 32 // The maximum number of tokens allowed, in order to prevent stack overflow 33 // issues in the parsers or visitors. 34 static constexpr uint32_t kMaxNumTokens = 2048; 35 36 enum class TokenType { 37 COMMA, // ',' 38 DOT, // '.' 39 PLUS, // '+' Not allowed in QUERY language. 40 MINUS, // '-' 41 STAR, // '*' Not allowed in SCORING language. 42 TIMES, // '*' Not allowed in QUERY language. 43 DIV, // '/' Not allowed in QUERY language. 44 LPAREN, // '(' 45 RPAREN, // ')' 46 COMPARATOR, // '<=' | '<' | '>=' | '>' | '!=' | '==' | ':' 47 // Not allowed in SCORING language. 48 AND, // 'AND' | '&&' Not allowed in SCORING language. 49 OR, // 'OR' | '||' Not allowed in SCORING language. 50 NOT, // 'NOT' Not allowed in SCORING language. 51 STRING, // String literal surrounded by quotation marks. The 52 // original_text of a STRING token will not include quotation 53 // marks. 54 TEXT, // A sequence of chars that are not any above-listed operator 55 FUNCTION_NAME, // A TEXT followed by LPAREN. 56 // Whitespaces not inside a string literal will be skipped. 57 // WS: " " | "\t" | "\n" | "\r" | "\f" -> skip ; 58 }; 59 60 struct LexerToken { 61 // For STRING, text will contain the raw original text of the token 62 // in between quotation marks, without unescaping. 63 // 64 // For TEXT, text will contain the text of the token after unescaping all 65 // escaped characters. 66 // 67 // For FUNCTION_NAME, this field will contain the name of the function. 68 // 69 // For COMPARATOR, this field will contain the comparator. 70 // 71 // For other types, this field will be empty. 72 std::string text; 73 74 // Lifecycle is dependent on the lifecycle of the string pointed to by 75 // query_. 76 std::string_view original_text; 77 78 // The type of the token. 79 TokenType type; 80 }; 81 Lexer(std::string_view query,Language language)82 explicit Lexer(std::string_view query, Language language) 83 : query_(query), language_(language) { 84 Advance(); 85 } 86 87 // Get a vector of LexerToken after lexing the query given in the constructor. 88 // 89 // Returns: 90 // A vector of LexerToken on success 91 // INVALID_ARGUMENT on syntax error. 92 libtextclassifier3::StatusOr<std::vector<LexerToken>> ExtractTokens() &&; 93 94 private: 95 // Advance to current_index_ + n. 96 void Advance(uint32_t n = 1) { 97 if (current_index_ + n >= query_.size()) { 98 current_index_ = query_.size(); 99 current_char_ = '\0'; 100 } else { 101 current_index_ += n; 102 current_char_ = query_[current_index_]; 103 } 104 } 105 106 // Get the character at current_index_ + n. 107 char PeekNext(uint32_t n = 1) { 108 if (current_index_ + n >= query_.size()) { 109 return '\0'; 110 } else { 111 return query_[current_index_ + n]; 112 } 113 } 114 SyntaxError(std::string error)115 void SyntaxError(std::string error) { 116 current_index_ = query_.size(); 117 current_char_ = '\0'; 118 error_ = std::move(error); 119 } 120 121 // Try to match a whitespace token and skip it. 122 bool ConsumeWhitespace(); 123 124 // Try to match a single-char token other than '<' and '>'. 125 bool ConsumeSingleChar(); 126 bool ConsumeQuerySingleChar(); 127 bool ConsumeScoringSingleChar(); 128 bool ConsumeGeneralSingleChar(); 129 130 // Try to match a comparator token other than ':'. 131 bool ConsumeComparator(); 132 133 // Try to match '&&' and '||'. 134 // 'AND' and 'OR' will be handled in Text() instead, so that 'ANDfoo' and 135 // 'fooOR' is a TEXT, instead of an 'AND' or 'OR'. 136 bool ConsumeAndOr(); 137 138 // Try to match a string literal. 139 bool ConsumeStringLiteral(); 140 141 // Try to match a non-text. ConsumeNonText()142 bool ConsumeNonText() { 143 return ConsumeWhitespace() || ConsumeSingleChar() || 144 (language_ == Language::QUERY && ConsumeComparator()) || 145 (language_ == Language::QUERY && ConsumeAndOr()) || 146 ConsumeStringLiteral(); 147 } 148 149 // Try to match TEXT, FUNCTION_NAME, 'AND', 'OR' and 'NOT'. 150 // REQUIRES: ConsumeNonText() must be called immediately before calling this 151 // function. 152 bool ConsumeText(); 153 154 std::string_view query_; 155 std::string error_; 156 Language language_; 157 int32_t current_index_ = -1; 158 char current_char_ = '\0'; 159 std::vector<LexerToken> tokens_; 160 161 // Stores whether the lexer is currently inspecting a TEXT segment while 162 // handling current_char_. 163 bool in_text_ = false; 164 }; 165 166 } // namespace lib 167 } // namespace icing 168 169 #endif // ICING_QUERY_ADVANCED_QUERY_PARSER_LEXER_H_ 170