xref: /aosp_15_r20/external/icing/icing/tokenization/rfc822-tokenizer.cc (revision 8b6cd535a057e39b3b86660c4aa06c99747c2136)
1 // Copyright (C) 2022 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "icing/tokenization/rfc822-tokenizer.h"
16 
17 #include <algorithm>
18 #include <deque>
19 #include <queue>
20 #include <string_view>
21 #include <utility>
22 #include <vector>
23 
24 #include "icing/tokenization/token.h"
25 #include "icing/tokenization/tokenizer.h"
26 #include "icing/util/character-iterator.h"
27 #include "icing/util/i18n-utils.h"
28 #include "icing/util/status-macros.h"
29 #include "unicode/umachine.h"
30 
31 namespace icing {
32 namespace lib {
33 
34 namespace {
IsDelimiter(UChar32 c)35 bool IsDelimiter(UChar32 c) { return c == ',' || c == ';' || c == '\n'; }
36 }  // namespace
37 
38 class Rfc822TokenIterator : public Tokenizer::Iterator {
39  public:
40   // Cursor is the index into the string_view, text_end_ is the length.
Rfc822TokenIterator(std::string_view text)41   explicit Rfc822TokenIterator(std::string_view text)
42       : text_(std::move(text)),
43         iterator_(text, 0, 0, 0),
44         text_end_(text.length()),
45         token_index_(-1) {}
46 
47   // Advance will move token_index_ past the end of tokens_
Advance()48   bool Advance() override {
49     // Stop the token index on a RFC822 token, or one past the end, where the
50     // next RFC822 token will be if more are generated.
51     do {
52       token_index_++;
53     } while (token_index_ < tokens_.size() &&
54              tokens_[token_index_].type != Token::Type::RFC822_TOKEN);
55 
56     // There is still something left, possible if we rewinded and call Advance
57     if (token_index_ < tokens_.size()) {
58       return true;
59     }
60 
61     // Done with the entire string_view.
62     if (iterator_.utf8_index() >= text_end_) {
63       return false;
64     }
65 
66     // Parsing a new email, update the current email marker.
67     AdvancePastWhitespace();
68 
69     // This may return false, as in the case of "<alex>,,", where after
70     // processing <alex>, there are no more tokens.
71     return GetNextRfc822Token();
72   }
73 
74   // Returns the current token group, an RFC822_TOKEN along with all it's
75   // subtokens. For example, "[email protected]" will return all tokens generated
76   // from that text.
77   //
78   // Returns:
79   //   A vector of Tokens on success
80   //   An empty vector if the token list is empty
81   //   An empty vector if the index is past the end of the token list
GetTokens() const82   std::vector<Token> GetTokens() const override {
83     std::vector<Token> result;
84     if (token_index_ < tokens_.size() && token_index_ >= 0) {
85       int index = token_index_;
86       do {
87         result.push_back(tokens_[index]);
88       } while (++index < tokens_.size() &&
89                tokens_[index].type != Token::Type::RFC822_TOKEN);
90     }
91     return result;
92   }
93 
ResetToTokenStartingAfter(int32_t utf32_offset)94   bool ResetToTokenStartingAfter(int32_t utf32_offset) override {
95     CharacterIterator tracker(text_);
96     for (int new_index = 0; new_index < tokens_.size(); ++new_index) {
97       const Token& t = tokens_[new_index];
98       if (t.type != Token::Type::RFC822_TOKEN) {
99         continue;
100       }
101 
102       tracker.AdvanceToUtf8(t.text.begin() - text_.begin());
103       if (tracker.utf32_index() > utf32_offset) {
104         token_index_ = new_index;
105         return true;
106       }
107     }
108 
109     return false;
110   }
111 
112   // This will attempt to reset the token_index to point to the last token
113   // ending before an offset. If it fails, due to there not being any tokens
114   // before the offset, the token index will become -1.
ResetToTokenEndingBefore(int32_t utf32_offset)115   bool ResetToTokenEndingBefore(int32_t utf32_offset) override {
116     // First, advance until we pass offset or Advance is false
117     if (tokens_.empty()) {
118       if (!Advance()) {
119         // No tokens available, and Advancing doesn't get more, so return false.
120         return false;
121       }
122     }
123 
124     CharacterIterator tracker(text_);
125 
126     // Keep advancing until we parse all the emails, or run past the offset.
127     // Advance will always make token_index_ point to an RFC822_TOKEN, so we can
128     // look at that tokens text end to determine if it ends before the offset.
129     // This first loop will guarantee that we end up either past the offset or
130     // at the end.
131     do {
132       tracker.AdvanceToUtf8(tokens_[token_index_].text.end() - text_.begin());
133 
134       // When we Advance and have to convert names to email addresses, it's
135       // possible that multiple RFC822 tokens are added. We need to advance
136       // through these one at a time, we cannot skip to the top of the line.
137     } while (tracker.utf32_index() <= utf32_offset && Advance());
138 
139     // We are either past the offset or at the end. Either way, we now work
140     // backwards and reset to the first (highest index) RFC822_TOKEN we find.
141     while (--token_index_ >= 0) {
142       if (tokens_[token_index_].type != Token::Type::RFC822_TOKEN) {
143         continue;
144       }
145 
146       tracker.MoveToUtf8(tokens_[token_index_].text.end() - text_.begin());
147       if (tracker.utf32_index() <= utf32_offset) {
148         return true;
149       }
150     }
151     return false;
152   }
153 
154   // Returns a character iterator to the start of the token.
CalculateTokenStart()155   libtextclassifier3::StatusOr<CharacterIterator> CalculateTokenStart()
156       override {
157     CharacterIterator token_start = iterator_;
158     token_start.MoveToUtf8(GetTokens().at(0).text.begin() - text_.begin());
159     return token_start;
160   }
161 
162   // Returns a character iterator to right after the end of the token.
CalculateTokenEndExclusive()163   libtextclassifier3::StatusOr<CharacterIterator> CalculateTokenEndExclusive()
164       override {
165     CharacterIterator token_end = iterator_;
166     token_end.MoveToUtf8(GetTokens().at(0).text.end() - text_.begin());
167     return token_end;
168   }
169 
170   // Reset to start moves to the state we're in after the first Advance().
ResetToStart()171   bool ResetToStart() override {
172     token_index_ = -1;
173     return Advance();
174   }
175 
176  private:
177   // Advance until the next email delimiter, generating as many tokens as
178   // necessary.
GetNextRfc822Token()179   bool GetNextRfc822Token() {
180     if (iterator_.utf8_index() >= text_end_) {
181       return false;
182     }
183 
184     int token_start = iterator_.utf8_index();
185     bool address_found = false;
186     bool name_found = false;
187     std::vector<Token> next_tokens;
188     Token rfc822(Token::Type::RFC822_TOKEN);
189 
190     // We start at unquoted and run until a ",;\n<( .
191     while (iterator_.utf8_index() < text_end_) {
192       UChar32 c = iterator_.GetCurrentChar();
193       if (IsDelimiter(c)) {
194         // End of the token, advance cursor past all delimiters then quit.
195         rfc822.text =
196             text_.substr(token_start, iterator_.utf8_index() - token_start);
197 
198         UChar32 delimiter;
199         do {
200           AdvanceCursor();
201           delimiter = iterator_.GetCurrentChar();
202           // If we get current char on the end, it is not a delimiter so this
203           // loop will end
204         } while (IsDelimiter(delimiter));
205 
206         break;
207       }
208 
209       std::vector<Token> consume_result;
210       if (c == '"') {
211         consume_result = ConsumeQuotedSection();
212         name_found |= !consume_result.empty();
213       } else if (c == '(') {
214         consume_result = ConsumeParenthesizedSection();
215       } else if (c == '<') {
216         // Only set address_found to true if ConsumeAdress returns true.
217         // Otherwise, keep address_found as is to prevent setting address_found
218         // back to false if it is true.
219         consume_result = ConsumeAddress();
220         address_found |= !consume_result.empty();
221       } else {
222         consume_result = ConsumeUnquotedSection();
223         name_found |= !consume_result.empty();
224       }
225       next_tokens.insert(next_tokens.end(), consume_result.begin(),
226                          consume_result.end());
227     }
228     if (iterator_.utf8_index() >= text_end_) {
229       rfc822.text = text_.substr(token_start, text_end_ - token_start);
230     }
231 
232     // If an address is found, use the tokens we have.
233     // If an address isn't found, and a name isn't found, also use the tokens
234     // we have.
235     // If an address isn't found but a name is, convert name Tokens to email
236     // Tokens.
237     if (!address_found && name_found) {
238       // We don't add the rfc822 token, as it will be handled by
239       // ConvertNameToEmail.
240       std::vector<Token> converted_tokens = ConvertNameToEmail(next_tokens);
241       tokens_.insert(tokens_.end(), converted_tokens.begin(),
242                      converted_tokens.end());
243     } else {
244       if (next_tokens.empty()) {
245         // Tokens may not be generated in the case of ",,,,,,"
246         return false;
247       } else {
248         // If tokens were generated, push back the RFC822 token for them
249         tokens_.push_back(rfc822);
250         tokens_.insert(tokens_.end(), next_tokens.begin(), next_tokens.end());
251       }
252     }
253 
254     return true;
255   }
256 
257   // We allow for the "First Last <email>" format, but if there is no email in
258   // brackets, we won't allow for unquoted spaces. For example, the input
259   // "[email protected] [email protected]" has an unquoted space, so we will split
260   // it into two emails. We don't need to find more tokens, we just need to
261   // find @ signs and spaces and convert name tokens to parts of the email.
ConvertNameToEmail(std::vector<Token> & name_tokens)262   std::vector<Token> ConvertNameToEmail(std::vector<Token>& name_tokens) {
263     if (name_tokens.empty()) {
264       return name_tokens;
265     }
266 
267     // There will only be names and comments, and they will be in order.
268     std::vector<Token> converted_tokens;
269 
270     // Start at the beginning of the current email.
271     CharacterIterator scanner(text_);
272 
273     scanner.MoveToUtf8(name_tokens[0].text.begin() - text_.begin());
274     int token_processed_index = 0;
275 
276     bool in_quote = false;
277     // Setting at_sign_index to before the beginning, it'll only be set to
278     // something else if we find an @ sign
279     const char* at_sign_index = nullptr;
280 
281     // Run to the end
282     while (scanner.utf8_index() < iterator_.utf8_index()) {
283       const char* end_of_token = nullptr;
284       UChar32 c = scanner.GetCurrentChar();
285       if (c == '\\') {
286         // Skip the slash, as well as the following token.
287         scanner.AdvanceToUtf32(scanner.utf32_index() + 1);
288         scanner.AdvanceToUtf32(scanner.utf32_index() + 1);
289         continue;
290       }
291       if (c == '"') {
292         in_quote = !in_quote;
293       }
294       if (c == '@') {
295         at_sign_index = text_.begin() + scanner.utf8_index();
296       }
297 
298       // If the next character is the end OR we hit an unquoted space.
299       if (scanner.utf8_index() + i18n_utils::GetUtf8Length(c) ==
300               iterator_.utf8_index() ||
301           (!in_quote && c == ' ')) {
302         if (!in_quote && c == ' ') {
303           end_of_token = text_.begin() + scanner.utf8_index();
304         } else {
305           end_of_token = text_.begin() + iterator_.utf8_index();
306         }
307         std::deque<Token> more_tokens = ConvertOneNameToEmail(
308             name_tokens, at_sign_index, end_of_token, token_processed_index);
309         converted_tokens.insert(converted_tokens.end(), more_tokens.begin(),
310                                 more_tokens.end());
311         // Reset the at_sign_index
312         at_sign_index = nullptr;
313       }
314       scanner.AdvanceToUtf32(scanner.utf32_index() + 1);
315     }
316 
317     // It's possible we left something out.
318     if (token_processed_index < name_tokens.size()) {
319       std::deque<Token> more_tokens =
320           ConvertOneNameToEmail(name_tokens, at_sign_index,
321                                 name_tokens[name_tokens.size() - 1].text.end(),
322                                 token_processed_index);
323       converted_tokens.insert(converted_tokens.end(), more_tokens.begin(),
324                               more_tokens.end());
325     }
326 
327     return converted_tokens;
328   }
329 
330   // Once a name is determined to be an address, convert its tokens to address
331   // tokens.
ConvertOneNameToEmail(const std::vector<Token> & name_tokens,const char * at_sign_index,const char * end_of_token,int & token_processed_index)332   std::deque<Token> ConvertOneNameToEmail(const std::vector<Token>& name_tokens,
333                                           const char* at_sign_index,
334                                           const char* end_of_token,
335                                           int& token_processed_index) {
336     const char* address_start = nullptr;
337     const char* local_address_end = nullptr;
338     const char* host_address_start = nullptr;
339     const char* address_end = nullptr;
340     const char* token_start = nullptr;
341     const char* token_end = nullptr;
342     std::deque<Token> converted_tokens;
343 
344     // Transform tokens up to end of token pointer.
345 
346     for (; token_processed_index < name_tokens.size();
347          ++token_processed_index) {
348       const Token& token = name_tokens[token_processed_index];
349 
350       if (token.text.end() > end_of_token) {
351         break;
352       }
353       std::string_view text = token.text;
354       // We need to do this both for comment and name tokens. Comment tokens
355       // will get a corresponding RFC822 token, but not an address or local
356       // address.
357       if (token_start == nullptr) {
358         token_start = text.begin();
359       }
360       token_end = text.end();
361 
362       if (token.type == Token::Type::RFC822_COMMENT) {
363         // Comment tokens will stay as they are.
364         converted_tokens.push_back(token);
365       } else if (token.type == Token::Type::RFC822_NAME) {
366         // Names need to be converted to address tokens. We keep the order of
367         // which the name tokens appeared. Name tokens that appear before an
368         // @ sign in the name will become RFC822_ADDRESS_COMPONENT_LOCAL, and
369         // those after will become RFC822_ADDRESS_COMPONENT_HOST. We aren't
370         // able to determine RFC822_ADDRESS, RFC822_LOCAL_ADDRESS, and
371         // RFC_HOST_ADDRESS before checking the name tokens, so they will be
372         // added after the component tokens.
373         if (address_start == nullptr) {
374           address_start = text.begin();
375         }
376         address_end = text.end();
377         if (text.begin() > at_sign_index) {
378           if (host_address_start == nullptr) {
379             host_address_start = text.begin();
380           }
381           // Once this is hit, we switch to COMPONENT_HOST and mark end of the
382           // local address
383           converted_tokens.push_back(
384               Token(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, token.text));
385         } else {
386           local_address_end = text.end();
387           converted_tokens.push_back(
388               Token(Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL, token.text));
389         }
390       }
391     }
392 
393     if (address_start != nullptr) {
394       converted_tokens.push_back(
395           Token(Token::Type::RFC822_ADDRESS,
396                 std::string_view(address_start, address_end - address_start)));
397       if (local_address_end != nullptr) {
398         converted_tokens.push_back(
399             Token(Token::Type::RFC822_LOCAL_ADDRESS,
400                   std::string_view(address_start,
401                                    local_address_end - address_start)));
402       }
403     }
404 
405     if (host_address_start != nullptr && host_address_start < address_end) {
406       converted_tokens.push_back(
407           Token(Token::Type::RFC822_HOST_ADDRESS,
408                 text_.substr(host_address_start - text_.begin(),
409                              address_end - host_address_start)));
410     }
411 
412     if (token_start != nullptr) {
413       converted_tokens.push_front(
414           Token(Token::Type::RFC822_TOKEN,
415                 std::string_view(token_start, token_end - token_start)));
416     }
417 
418     return converted_tokens;
419   }
420 
421   // Returns name tokens in an unquoted section. This is useful in case we do
422   // not find an address and have to use the name. An unquoted section may look
423   // like "Alex Sav", or "[email protected]". In the absense of a bracketed email
424   // address, the unquoted section will be used as the email address along with
425   // the quoted section.
ConsumeUnquotedSection()426   std::vector<Token> ConsumeUnquotedSection() {
427     UChar32 c;
428 
429     int token_start = -1;
430     std::vector<Token> next_tokens;
431 
432     // Advance to another state or a character marking the end of token, one
433     // of \n,; .
434     while (iterator_.utf8_index() < text_end_) {
435       c = iterator_.GetCurrentChar();
436 
437       if (i18n_utils::IsAlphaNumeric(c)) {
438         if (token_start == -1) {
439           // Start recording
440           token_start = iterator_.utf8_index();
441         }
442         AdvanceCursor();
443 
444       } else {
445         if (token_start != -1) {
446           // The character is non alphabetic, save a token.
447           next_tokens.push_back(Token(
448               Token::Type::RFC822_NAME,
449               text_.substr(token_start, iterator_.utf8_index() - token_start)));
450           token_start = -1;
451         }
452 
453         if (c == '"' || c == '<' || c == '(' || IsDelimiter(c)) {
454           // Stay on the token.
455           break;
456         }
457 
458         AdvanceCursor();
459       }
460     }
461     if (token_start != -1) {
462       next_tokens.push_back(Token(
463           Token::Type::RFC822_NAME,
464           text_.substr(token_start, iterator_.utf8_index() - token_start)));
465     }
466     return next_tokens;
467   }
468 
469   // Names that are within quotes should have all characters blindly
470   // unescaped. When a name is made into an address, it isn't re-escaped.
471 
472   // Returns name tokens found in a quoted section. This is useful in case we do
473   // not find an address and have to use the name. The quoted section may
474   // contain whitespaces.
ConsumeQuotedSection()475   std::vector<Token> ConsumeQuotedSection() {
476     // Get past the first quote.
477     AdvanceCursor();
478 
479     bool end_quote_found = false;
480     std::vector<Token> next_tokens;
481     UChar32 c;
482 
483     int token_start = -1;
484 
485     while (!end_quote_found && (iterator_.utf8_index() < text_end_)) {
486       c = iterator_.GetCurrentChar();
487 
488       if (i18n_utils::IsAlphaNumeric(c)) {
489         if (token_start == -1) {
490           // Start tracking the token.
491           token_start = iterator_.utf8_index();
492         }
493         AdvanceCursor();
494 
495       } else {
496         // Non- alphabetic
497         if (c == '\\') {
498           // A backslash, let's look at the next character.
499           CharacterIterator temp = iterator_;
500           temp.AdvanceToUtf32(iterator_.utf32_index() + 1);
501           UChar32 n = temp.GetCurrentChar();
502           if (i18n_utils::IsAlphaNumeric(n)) {
503             // The next character is alphabetic, skip the slash and don't end
504             // the last token. For quoted sections, the only things that are
505             // escaped are double quotes and slashes. For example, in "a\lex",
506             // an l appears after the slash. We want to treat this as if it
507             // was just "alex". So we tokenize it as <RFC822_NAME, "a\lex">.
508             AdvanceCursor();
509           } else {
510             // Not alphabetic, so save the last token if necessary.
511             if (token_start != -1) {
512               next_tokens.push_back(
513                   Token(Token::Type::RFC822_NAME,
514                         text_.substr(token_start,
515                                      iterator_.utf8_index() - token_start)));
516               token_start = -1;
517             }
518 
519             // Skip the backslash.
520             AdvanceCursor();
521 
522             if (n == '"' || n == '\\' || n == '@') {
523               // Skip these too if they're next.
524               AdvanceCursor();
525             }
526           }
527         } else {
528           // Not a backslash.
529 
530           if (token_start != -1) {
531             next_tokens.push_back(
532                 Token(Token::Type::RFC822_NAME,
533                       text_.substr(token_start,
534                                    iterator_.utf8_index() - token_start)));
535             token_start = -1;
536           }
537 
538           if (c == '"') {
539             end_quote_found = true;
540           }
541           // Advance one more time to get past the non-alphabetic character.
542           AdvanceCursor();
543         }
544       }
545     }
546     if (token_start != -1) {
547       next_tokens.push_back(Token(
548           Token::Type::RFC822_NAME,
549           text_.substr(token_start, iterator_.utf8_index() - token_start)));
550     }
551     return next_tokens;
552   }
553 
554   // '(', ')', '\\' chars should be escaped. All other escaped chars should be
555   // unescaped.
ConsumeParenthesizedSection()556   std::vector<Token> ConsumeParenthesizedSection() {
557     // Skip the initial (
558     AdvanceCursor();
559 
560     int paren_layer = 1;
561     UChar32 c;
562     std::vector<Token> next_tokens;
563 
564     int token_start = -1;
565 
566     while (paren_layer > 0 && (iterator_.utf8_index() < text_end_)) {
567       c = iterator_.GetCurrentChar();
568 
569       if (i18n_utils::IsAlphaNumeric(c)) {
570         if (token_start == -1) {
571           // Start tracking a token.
572           token_start = iterator_.utf8_index();
573         }
574         AdvanceCursor();
575       } else {
576         // Non alphabetic.
577         if (c == '\\') {
578           // A backslash, let's look at the next character.
579           UChar32 n = i18n_utils::GetUChar32At(text_.begin(), text_.length(),
580                                                iterator_.utf8_index() + 1);
581           if (i18n_utils::IsAlphaNumeric(n)) {
582             // Alphabetic, skip the slash and don't end the last token.
583             AdvanceCursor();
584           } else {
585             // Not alphabetic, save the last token if necessary.
586             if (token_start != -1) {
587               next_tokens.push_back(
588                   Token(Token::Type::RFC822_COMMENT,
589                         text_.substr(token_start,
590                                      iterator_.utf8_index() - token_start)));
591               token_start = -1;
592             }
593 
594             // Skip the backslash.
595             AdvanceCursor();
596 
597             if (n == ')' || n == '(' || n == '\\') {
598               // Skip these too if they're next.
599               AdvanceCursor();
600             }
601           }
602         } else {
603           // Not a backslash.
604           if (token_start != -1) {
605             next_tokens.push_back(
606                 Token(Token::Type::RFC822_COMMENT,
607                       text_.substr(token_start,
608                                    iterator_.utf8_index() - token_start)));
609             token_start = -1;
610           }
611 
612           if (c == '(') {
613             paren_layer++;
614           } else if (c == ')') {
615             paren_layer--;
616           }
617           AdvanceCursor();
618         }
619       }
620     }
621 
622     if (token_start != -1) {
623       // Ran past the end of text_ without getting the last token.
624 
625       // substr returns "a view of the substring [pos, pos + // rcount), where
626       // rcount is the smaller of count and size() - pos" therefore the count
627       // argument can be any value >= this->cursor - token_start. Therefore,
628       // ignoring the mutation warning.
629       next_tokens.push_back(Token(
630           Token::Type::RFC822_COMMENT,
631           text_.substr(token_start, iterator_.utf8_index() - token_start)));
632     }
633     return next_tokens;
634   }
635 
636   // Returns tokens found in the address.
ConsumeAddress()637   std::vector<Token> ConsumeAddress() {
638     // Skip the first <.
639     AdvanceCursor();
640 
641     // Save the start position.
642     CharacterIterator address_start_iterator = iterator_;
643     std::vector<Token> next_tokens;
644 
645     // Place the at sign on the '<', so that if no at_sign is found, the default
646     // is that the entire address is the host part.
647     int at_sign = -1;
648     int address_end = -1;
649 
650     UChar32 c = iterator_.GetCurrentChar();
651     // Quick scan for @ and > signs.
652     while (c != '>' && iterator_.utf8_index() < text_end_) {
653       AdvanceCursor();
654       c = iterator_.GetCurrentChar();
655       if (c == '@') {
656         at_sign = iterator_.utf8_index();
657       }
658     }
659 
660     if (iterator_.utf8_index() <= address_start_iterator.utf8_index()) {
661       // There is nothing between the brackets, either we have "<" or "<>".
662       return next_tokens;
663     }
664 
665     // Either we find a > or run to the end, either way this is the end of the
666     // address. The ending bracket will be handled by ConsumeUnquoted.
667     address_end = iterator_.utf8_index();
668 
669     // Reset to the start.
670     iterator_ = address_start_iterator;
671 
672     int address_start = address_start_iterator.utf8_index();
673 
674     Token::Type type = Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL;
675 
676     // Create a local address token.
677     if (at_sign != -1) {
678       next_tokens.push_back(
679           Token(Token::Type::RFC822_LOCAL_ADDRESS,
680                 text_.substr(address_start, at_sign - address_start)));
681     } else {
682       // All the tokens in the address are host components.
683       type = Token::Type::RFC822_ADDRESS_COMPONENT_HOST;
684       // If no @ is found, treat the entire address as the host address.
685       at_sign = address_start - 1;
686     }
687 
688     // The only case where we don't have a host address part is something like
689     // <localaddress@>. If there is no @, the at_sign is the default -1, and the
690     // host address is [0, address_end).
691     int host_address_start = at_sign + 1;
692     if (host_address_start < address_end) {
693       next_tokens.push_back(Token(
694           Token::Type::RFC822_HOST_ADDRESS,
695           text_.substr(host_address_start, address_end - host_address_start)));
696     }
697 
698     next_tokens.push_back(
699         Token(Token::Type::RFC822_ADDRESS,
700               text_.substr(address_start, address_end - address_start)));
701 
702     int token_start = -1;
703 
704     while (iterator_.utf8_index() < address_end) {
705       c = iterator_.GetCurrentChar();
706 
707       if (i18n_utils::IsAlphaNumeric(c)) {
708         if (token_start == -1) {
709           token_start = iterator_.utf8_index();
710         }
711       } else {
712         // non alphabetic
713         if (c == '\\') {
714           // A backslash, let's look at the next character.
715           CharacterIterator temp = iterator_;
716           temp.AdvanceToUtf32(iterator_.utf32_index() + 1);
717           UChar32 n = temp.GetCurrentChar();
718           if (!i18n_utils::IsAlphaNumeric(n)) {
719             // Not alphabetic, end the last token if necessary.
720             if (token_start != -1) {
721               next_tokens.push_back(Token(
722                   type, text_.substr(token_start,
723                                      iterator_.utf8_index() - token_start)));
724               token_start = -1;
725             }
726           }
727         } else {
728           // Not backslash.
729           if (token_start != -1) {
730             next_tokens.push_back(Token(
731                 type, text_.substr(token_start,
732                                    iterator_.utf8_index() - token_start)));
733             token_start = -1;
734           }
735           // Switch to host component tokens.
736           if (iterator_.utf8_index() == at_sign) {
737             type = Token::Type::RFC822_ADDRESS_COMPONENT_HOST;
738           }
739         }
740       }
741       AdvanceCursor();
742     }
743     if (token_start != -1) {
744       next_tokens.push_back(Token(
745           type,
746           text_.substr(token_start, iterator_.utf8_index() - token_start)));
747     }
748     // Unquoted will handle the closing bracket > if these is one.
749     return next_tokens;
750   }
751 
AdvanceCursor()752   void AdvanceCursor() {
753     iterator_.AdvanceToUtf32(iterator_.utf32_index() + 1);
754   }
755 
AdvancePastWhitespace()756   void AdvancePastWhitespace() {
757     while (i18n_utils::IsWhitespaceAt(text_, iterator_.utf8_index())) {
758       AdvanceCursor();
759     }
760   }
761 
762   std::string_view text_;
763   CharacterIterator iterator_;
764   int text_end_;
765 
766   // A temporary store of Tokens. As we advance through the provided string,
767   // we parse entire addresses at a time rather than one token at a time.
768   // However, since we call the tokenizer with Advance() alternating with
769   // GetToken(), we need to store tokens for subsequent GetToken calls if
770   // Advance generates multiple tokens (it usually does). A vector is used as
771   // we need to iterate back and forth through tokens during snippeting. It is
772   // cleared by the destructor.
773   std::vector<Token> tokens_;
774   // Index to keep track of where we are in tokens_. This will always be set to
775   // point to an RFC822_TOKEN, or one past the end of the tokens_ vector. The
776   // only exception is before the first Advance call.
777   int token_index_;
778 };
779 
780 libtextclassifier3::StatusOr<std::unique_ptr<Tokenizer::Iterator>>
Tokenize(std::string_view text) const781 Rfc822Tokenizer::Tokenize(std::string_view text) const {
782   return std::make_unique<Rfc822TokenIterator>(text);
783 }
784 
TokenizeAll(std::string_view text) const785 libtextclassifier3::StatusOr<std::vector<Token>> Rfc822Tokenizer::TokenizeAll(
786     std::string_view text) const {
787   ICING_ASSIGN_OR_RETURN(std::unique_ptr<Tokenizer::Iterator> iterator,
788                          Tokenize(text));
789   std::vector<Token> tokens;
790   while (iterator->Advance()) {
791     std::vector<Token> batch_tokens = iterator->GetTokens();
792     tokens.insert(tokens.end(), batch_tokens.begin(), batch_tokens.end());
793   }
794   return tokens;
795 }
796 
797 }  // namespace lib
798 }  // namespace icing
799