xref: /aosp_15_r20/external/icing/icing/tokenization/token.h (revision 8b6cd535a057e39b3b86660c4aa06c99747c2136)
1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #ifndef ICING_TOKENIZATION_TOKEN_H_
16 #define ICING_TOKENIZATION_TOKEN_H_
17 
18 #include <string_view>
19 
20 namespace icing {
21 namespace lib {
22 
// A single token produced by tokenization: a type tag plus a non-owning view
// of the token's text.
struct Token {
  // Classifies what a token represents.
  enum class Type {
    // Common types
    REGULAR,  // A token without special meanings, the value of it will be
              // indexed or searched directly

    VERBATIM,  // A token that should be indexed and searched without any
               // modifications to the raw text

    // An RFC822 section with the content in RFC822_TOKEN tokenizes as follows:
    RFC822_NAME,                     // "User", "Johnsson"
    RFC822_COMMENT,                  // "A", "comment", "here"
    RFC822_LOCAL_ADDRESS,            // "user.name"
    RFC822_HOST_ADDRESS,             // "domain.name.com"
    RFC822_ADDRESS,                  // "user.name@domain.name.com"
    RFC822_ADDRESS_COMPONENT_LOCAL,  // "user", "name"
    RFC822_ADDRESS_COMPONENT_HOST,   // "domain", "name", "com"
    RFC822_TOKEN,  // "User Johnsson (A comment) <user.name@domain.name.com>"

    // Types only used in raw query
    QUERY_OR,         // Indicates OR logic between its left and right tokens
    QUERY_EXCLUSION,  // Indicates exclusion operation on next token
    QUERY_PROPERTY,   // Indicates property restrict on next token
    QUERY_LEFT_PARENTHESES,   // Left parentheses
    QUERY_RIGHT_PARENTHESES,  // Right parentheses

    // Types used in URL tokenization
    URL_SCHEME,  // "http", "https", "ftp", "content"
    URL_USERNAME,
    URL_PASSWORD,
    URL_HOST_COMMON_PART,  // Hosts are split into two types, common and
                           // significant. Common are e.g: www, ww2, .com, etc.
    URL_HOST_SIGNIFICANT_PART,
    URL_PORT,
    URL_PATH_PART,  // Tokenized path, e.g. /abc-d/e.fg -> [abc-d], [e.fg]
    URL_QUERY,      // After ?, before #, e.g. "param1=value-1&param2=value-2"
    URL_REF,        // Anything after #. Could be anything
    URL_SUFFIX,
    URL_SUFFIX_INNERMOST,

    TRIGRAM,  // Trigram token of the text

    // Indicates errors
    INVALID,
  };

  // Builds a token of `type_in` whose content is `text_in` (empty when
  // omitted).
  //
  // Only a view of the text is stored, no copy is made, so the input text
  // should outlive the Token instance.
  explicit Token(Type type_in, std::string_view text_in = "")
      : type(type_in), text(text_in) {}

  // The type of token
  Type type;

  // The content of token. Non-owning: refers to the text passed to the
  // constructor.
  std::string_view text;
};
79 
80 }  // namespace lib
81 }  // namespace icing
82 
83 #endif  // ICING_TOKENIZATION_TOKEN_H_
84