1 // Copyright (C) 2022 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "icing/tokenization/rfc822-tokenizer.h"
16
17 #include <algorithm>
18 #include <deque>
19 #include <queue>
20 #include <string_view>
21 #include <utility>
22 #include <vector>
23
24 #include "icing/tokenization/token.h"
25 #include "icing/tokenization/tokenizer.h"
26 #include "icing/util/character-iterator.h"
27 #include "icing/util/i18n-utils.h"
28 #include "icing/util/status-macros.h"
29 #include "unicode/umachine.h"
30
31 namespace icing {
32 namespace lib {
33
34 namespace {
IsDelimiter(UChar32 c)35 bool IsDelimiter(UChar32 c) { return c == ',' || c == ';' || c == '\n'; }
36 } // namespace
37
38 class Rfc822TokenIterator : public Tokenizer::Iterator {
39 public:
40 // Cursor is the index into the string_view, text_end_ is the length.
Rfc822TokenIterator(std::string_view text)41 explicit Rfc822TokenIterator(std::string_view text)
42 : text_(std::move(text)),
43 iterator_(text, 0, 0, 0),
44 text_end_(text.length()),
45 token_index_(-1) {}
46
47 // Advance will move token_index_ past the end of tokens_
Advance()48 bool Advance() override {
49 // Stop the token index on a RFC822 token, or one past the end, where the
50 // next RFC822 token will be if more are generated.
51 do {
52 token_index_++;
53 } while (token_index_ < tokens_.size() &&
54 tokens_[token_index_].type != Token::Type::RFC822_TOKEN);
55
56 // There is still something left, possible if we rewinded and call Advance
57 if (token_index_ < tokens_.size()) {
58 return true;
59 }
60
61 // Done with the entire string_view.
62 if (iterator_.utf8_index() >= text_end_) {
63 return false;
64 }
65
66 // Parsing a new email, update the current email marker.
67 AdvancePastWhitespace();
68
69 // This may return false, as in the case of "<alex>,,", where after
70 // processing <alex>, there are no more tokens.
71 return GetNextRfc822Token();
72 }
73
74 // Returns the current token group, an RFC822_TOKEN along with all it's
75 // subtokens. For example, "[email protected]" will return all tokens generated
76 // from that text.
77 //
78 // Returns:
79 // A vector of Tokens on success
80 // An empty vector if the token list is empty
81 // An empty vector if the index is past the end of the token list
GetTokens() const82 std::vector<Token> GetTokens() const override {
83 std::vector<Token> result;
84 if (token_index_ < tokens_.size() && token_index_ >= 0) {
85 int index = token_index_;
86 do {
87 result.push_back(tokens_[index]);
88 } while (++index < tokens_.size() &&
89 tokens_[index].type != Token::Type::RFC822_TOKEN);
90 }
91 return result;
92 }
93
ResetToTokenStartingAfter(int32_t utf32_offset)94 bool ResetToTokenStartingAfter(int32_t utf32_offset) override {
95 CharacterIterator tracker(text_);
96 for (int new_index = 0; new_index < tokens_.size(); ++new_index) {
97 const Token& t = tokens_[new_index];
98 if (t.type != Token::Type::RFC822_TOKEN) {
99 continue;
100 }
101
102 tracker.AdvanceToUtf8(t.text.begin() - text_.begin());
103 if (tracker.utf32_index() > utf32_offset) {
104 token_index_ = new_index;
105 return true;
106 }
107 }
108
109 return false;
110 }
111
112 // This will attempt to reset the token_index to point to the last token
113 // ending before an offset. If it fails, due to there not being any tokens
114 // before the offset, the token index will become -1.
ResetToTokenEndingBefore(int32_t utf32_offset)115 bool ResetToTokenEndingBefore(int32_t utf32_offset) override {
116 // First, advance until we pass offset or Advance is false
117 if (tokens_.empty()) {
118 if (!Advance()) {
119 // No tokens available, and Advancing doesn't get more, so return false.
120 return false;
121 }
122 }
123
124 CharacterIterator tracker(text_);
125
126 // Keep advancing until we parse all the emails, or run past the offset.
127 // Advance will always make token_index_ point to an RFC822_TOKEN, so we can
128 // look at that tokens text end to determine if it ends before the offset.
129 // This first loop will guarantee that we end up either past the offset or
130 // at the end.
131 do {
132 tracker.AdvanceToUtf8(tokens_[token_index_].text.end() - text_.begin());
133
134 // When we Advance and have to convert names to email addresses, it's
135 // possible that multiple RFC822 tokens are added. We need to advance
136 // through these one at a time, we cannot skip to the top of the line.
137 } while (tracker.utf32_index() <= utf32_offset && Advance());
138
139 // We are either past the offset or at the end. Either way, we now work
140 // backwards and reset to the first (highest index) RFC822_TOKEN we find.
141 while (--token_index_ >= 0) {
142 if (tokens_[token_index_].type != Token::Type::RFC822_TOKEN) {
143 continue;
144 }
145
146 tracker.MoveToUtf8(tokens_[token_index_].text.end() - text_.begin());
147 if (tracker.utf32_index() <= utf32_offset) {
148 return true;
149 }
150 }
151 return false;
152 }
153
154 // Returns a character iterator to the start of the token.
CalculateTokenStart()155 libtextclassifier3::StatusOr<CharacterIterator> CalculateTokenStart()
156 override {
157 CharacterIterator token_start = iterator_;
158 token_start.MoveToUtf8(GetTokens().at(0).text.begin() - text_.begin());
159 return token_start;
160 }
161
162 // Returns a character iterator to right after the end of the token.
CalculateTokenEndExclusive()163 libtextclassifier3::StatusOr<CharacterIterator> CalculateTokenEndExclusive()
164 override {
165 CharacterIterator token_end = iterator_;
166 token_end.MoveToUtf8(GetTokens().at(0).text.end() - text_.begin());
167 return token_end;
168 }
169
170 // Reset to start moves to the state we're in after the first Advance().
ResetToStart()171 bool ResetToStart() override {
172 token_index_ = -1;
173 return Advance();
174 }
175
176 private:
177 // Advance until the next email delimiter, generating as many tokens as
178 // necessary.
GetNextRfc822Token()179 bool GetNextRfc822Token() {
180 if (iterator_.utf8_index() >= text_end_) {
181 return false;
182 }
183
184 int token_start = iterator_.utf8_index();
185 bool address_found = false;
186 bool name_found = false;
187 std::vector<Token> next_tokens;
188 Token rfc822(Token::Type::RFC822_TOKEN);
189
190 // We start at unquoted and run until a ",;\n<( .
191 while (iterator_.utf8_index() < text_end_) {
192 UChar32 c = iterator_.GetCurrentChar();
193 if (IsDelimiter(c)) {
194 // End of the token, advance cursor past all delimiters then quit.
195 rfc822.text =
196 text_.substr(token_start, iterator_.utf8_index() - token_start);
197
198 UChar32 delimiter;
199 do {
200 AdvanceCursor();
201 delimiter = iterator_.GetCurrentChar();
202 // If we get current char on the end, it is not a delimiter so this
203 // loop will end
204 } while (IsDelimiter(delimiter));
205
206 break;
207 }
208
209 std::vector<Token> consume_result;
210 if (c == '"') {
211 consume_result = ConsumeQuotedSection();
212 name_found |= !consume_result.empty();
213 } else if (c == '(') {
214 consume_result = ConsumeParenthesizedSection();
215 } else if (c == '<') {
216 // Only set address_found to true if ConsumeAdress returns true.
217 // Otherwise, keep address_found as is to prevent setting address_found
218 // back to false if it is true.
219 consume_result = ConsumeAddress();
220 address_found |= !consume_result.empty();
221 } else {
222 consume_result = ConsumeUnquotedSection();
223 name_found |= !consume_result.empty();
224 }
225 next_tokens.insert(next_tokens.end(), consume_result.begin(),
226 consume_result.end());
227 }
228 if (iterator_.utf8_index() >= text_end_) {
229 rfc822.text = text_.substr(token_start, text_end_ - token_start);
230 }
231
232 // If an address is found, use the tokens we have.
233 // If an address isn't found, and a name isn't found, also use the tokens
234 // we have.
235 // If an address isn't found but a name is, convert name Tokens to email
236 // Tokens.
237 if (!address_found && name_found) {
238 // We don't add the rfc822 token, as it will be handled by
239 // ConvertNameToEmail.
240 std::vector<Token> converted_tokens = ConvertNameToEmail(next_tokens);
241 tokens_.insert(tokens_.end(), converted_tokens.begin(),
242 converted_tokens.end());
243 } else {
244 if (next_tokens.empty()) {
245 // Tokens may not be generated in the case of ",,,,,,"
246 return false;
247 } else {
248 // If tokens were generated, push back the RFC822 token for them
249 tokens_.push_back(rfc822);
250 tokens_.insert(tokens_.end(), next_tokens.begin(), next_tokens.end());
251 }
252 }
253
254 return true;
255 }
256
257 // We allow for the "First Last <email>" format, but if there is no email in
258 // brackets, we won't allow for unquoted spaces. For example, the input
259 // "[email protected] [email protected]" has an unquoted space, so we will split
260 // it into two emails. We don't need to find more tokens, we just need to
261 // find @ signs and spaces and convert name tokens to parts of the email.
ConvertNameToEmail(std::vector<Token> & name_tokens)262 std::vector<Token> ConvertNameToEmail(std::vector<Token>& name_tokens) {
263 if (name_tokens.empty()) {
264 return name_tokens;
265 }
266
267 // There will only be names and comments, and they will be in order.
268 std::vector<Token> converted_tokens;
269
270 // Start at the beginning of the current email.
271 CharacterIterator scanner(text_);
272
273 scanner.MoveToUtf8(name_tokens[0].text.begin() - text_.begin());
274 int token_processed_index = 0;
275
276 bool in_quote = false;
277 // Setting at_sign_index to before the beginning, it'll only be set to
278 // something else if we find an @ sign
279 const char* at_sign_index = nullptr;
280
281 // Run to the end
282 while (scanner.utf8_index() < iterator_.utf8_index()) {
283 const char* end_of_token = nullptr;
284 UChar32 c = scanner.GetCurrentChar();
285 if (c == '\\') {
286 // Skip the slash, as well as the following token.
287 scanner.AdvanceToUtf32(scanner.utf32_index() + 1);
288 scanner.AdvanceToUtf32(scanner.utf32_index() + 1);
289 continue;
290 }
291 if (c == '"') {
292 in_quote = !in_quote;
293 }
294 if (c == '@') {
295 at_sign_index = text_.begin() + scanner.utf8_index();
296 }
297
298 // If the next character is the end OR we hit an unquoted space.
299 if (scanner.utf8_index() + i18n_utils::GetUtf8Length(c) ==
300 iterator_.utf8_index() ||
301 (!in_quote && c == ' ')) {
302 if (!in_quote && c == ' ') {
303 end_of_token = text_.begin() + scanner.utf8_index();
304 } else {
305 end_of_token = text_.begin() + iterator_.utf8_index();
306 }
307 std::deque<Token> more_tokens = ConvertOneNameToEmail(
308 name_tokens, at_sign_index, end_of_token, token_processed_index);
309 converted_tokens.insert(converted_tokens.end(), more_tokens.begin(),
310 more_tokens.end());
311 // Reset the at_sign_index
312 at_sign_index = nullptr;
313 }
314 scanner.AdvanceToUtf32(scanner.utf32_index() + 1);
315 }
316
317 // It's possible we left something out.
318 if (token_processed_index < name_tokens.size()) {
319 std::deque<Token> more_tokens =
320 ConvertOneNameToEmail(name_tokens, at_sign_index,
321 name_tokens[name_tokens.size() - 1].text.end(),
322 token_processed_index);
323 converted_tokens.insert(converted_tokens.end(), more_tokens.begin(),
324 more_tokens.end());
325 }
326
327 return converted_tokens;
328 }
329
330 // Once a name is determined to be an address, convert its tokens to address
331 // tokens.
ConvertOneNameToEmail(const std::vector<Token> & name_tokens,const char * at_sign_index,const char * end_of_token,int & token_processed_index)332 std::deque<Token> ConvertOneNameToEmail(const std::vector<Token>& name_tokens,
333 const char* at_sign_index,
334 const char* end_of_token,
335 int& token_processed_index) {
336 const char* address_start = nullptr;
337 const char* local_address_end = nullptr;
338 const char* host_address_start = nullptr;
339 const char* address_end = nullptr;
340 const char* token_start = nullptr;
341 const char* token_end = nullptr;
342 std::deque<Token> converted_tokens;
343
344 // Transform tokens up to end of token pointer.
345
346 for (; token_processed_index < name_tokens.size();
347 ++token_processed_index) {
348 const Token& token = name_tokens[token_processed_index];
349
350 if (token.text.end() > end_of_token) {
351 break;
352 }
353 std::string_view text = token.text;
354 // We need to do this both for comment and name tokens. Comment tokens
355 // will get a corresponding RFC822 token, but not an address or local
356 // address.
357 if (token_start == nullptr) {
358 token_start = text.begin();
359 }
360 token_end = text.end();
361
362 if (token.type == Token::Type::RFC822_COMMENT) {
363 // Comment tokens will stay as they are.
364 converted_tokens.push_back(token);
365 } else if (token.type == Token::Type::RFC822_NAME) {
366 // Names need to be converted to address tokens. We keep the order of
367 // which the name tokens appeared. Name tokens that appear before an
368 // @ sign in the name will become RFC822_ADDRESS_COMPONENT_LOCAL, and
369 // those after will become RFC822_ADDRESS_COMPONENT_HOST. We aren't
370 // able to determine RFC822_ADDRESS, RFC822_LOCAL_ADDRESS, and
371 // RFC_HOST_ADDRESS before checking the name tokens, so they will be
372 // added after the component tokens.
373 if (address_start == nullptr) {
374 address_start = text.begin();
375 }
376 address_end = text.end();
377 if (text.begin() > at_sign_index) {
378 if (host_address_start == nullptr) {
379 host_address_start = text.begin();
380 }
381 // Once this is hit, we switch to COMPONENT_HOST and mark end of the
382 // local address
383 converted_tokens.push_back(
384 Token(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, token.text));
385 } else {
386 local_address_end = text.end();
387 converted_tokens.push_back(
388 Token(Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL, token.text));
389 }
390 }
391 }
392
393 if (address_start != nullptr) {
394 converted_tokens.push_back(
395 Token(Token::Type::RFC822_ADDRESS,
396 std::string_view(address_start, address_end - address_start)));
397 if (local_address_end != nullptr) {
398 converted_tokens.push_back(
399 Token(Token::Type::RFC822_LOCAL_ADDRESS,
400 std::string_view(address_start,
401 local_address_end - address_start)));
402 }
403 }
404
405 if (host_address_start != nullptr && host_address_start < address_end) {
406 converted_tokens.push_back(
407 Token(Token::Type::RFC822_HOST_ADDRESS,
408 text_.substr(host_address_start - text_.begin(),
409 address_end - host_address_start)));
410 }
411
412 if (token_start != nullptr) {
413 converted_tokens.push_front(
414 Token(Token::Type::RFC822_TOKEN,
415 std::string_view(token_start, token_end - token_start)));
416 }
417
418 return converted_tokens;
419 }
420
421 // Returns name tokens in an unquoted section. This is useful in case we do
422 // not find an address and have to use the name. An unquoted section may look
423 // like "Alex Sav", or "[email protected]". In the absense of a bracketed email
424 // address, the unquoted section will be used as the email address along with
425 // the quoted section.
ConsumeUnquotedSection()426 std::vector<Token> ConsumeUnquotedSection() {
427 UChar32 c;
428
429 int token_start = -1;
430 std::vector<Token> next_tokens;
431
432 // Advance to another state or a character marking the end of token, one
433 // of \n,; .
434 while (iterator_.utf8_index() < text_end_) {
435 c = iterator_.GetCurrentChar();
436
437 if (i18n_utils::IsAlphaNumeric(c)) {
438 if (token_start == -1) {
439 // Start recording
440 token_start = iterator_.utf8_index();
441 }
442 AdvanceCursor();
443
444 } else {
445 if (token_start != -1) {
446 // The character is non alphabetic, save a token.
447 next_tokens.push_back(Token(
448 Token::Type::RFC822_NAME,
449 text_.substr(token_start, iterator_.utf8_index() - token_start)));
450 token_start = -1;
451 }
452
453 if (c == '"' || c == '<' || c == '(' || IsDelimiter(c)) {
454 // Stay on the token.
455 break;
456 }
457
458 AdvanceCursor();
459 }
460 }
461 if (token_start != -1) {
462 next_tokens.push_back(Token(
463 Token::Type::RFC822_NAME,
464 text_.substr(token_start, iterator_.utf8_index() - token_start)));
465 }
466 return next_tokens;
467 }
468
469 // Names that are within quotes should have all characters blindly
470 // unescaped. When a name is made into an address, it isn't re-escaped.
471
472 // Returns name tokens found in a quoted section. This is useful in case we do
473 // not find an address and have to use the name. The quoted section may
474 // contain whitespaces.
ConsumeQuotedSection()475 std::vector<Token> ConsumeQuotedSection() {
476 // Get past the first quote.
477 AdvanceCursor();
478
479 bool end_quote_found = false;
480 std::vector<Token> next_tokens;
481 UChar32 c;
482
483 int token_start = -1;
484
485 while (!end_quote_found && (iterator_.utf8_index() < text_end_)) {
486 c = iterator_.GetCurrentChar();
487
488 if (i18n_utils::IsAlphaNumeric(c)) {
489 if (token_start == -1) {
490 // Start tracking the token.
491 token_start = iterator_.utf8_index();
492 }
493 AdvanceCursor();
494
495 } else {
496 // Non- alphabetic
497 if (c == '\\') {
498 // A backslash, let's look at the next character.
499 CharacterIterator temp = iterator_;
500 temp.AdvanceToUtf32(iterator_.utf32_index() + 1);
501 UChar32 n = temp.GetCurrentChar();
502 if (i18n_utils::IsAlphaNumeric(n)) {
503 // The next character is alphabetic, skip the slash and don't end
504 // the last token. For quoted sections, the only things that are
505 // escaped are double quotes and slashes. For example, in "a\lex",
506 // an l appears after the slash. We want to treat this as if it
507 // was just "alex". So we tokenize it as <RFC822_NAME, "a\lex">.
508 AdvanceCursor();
509 } else {
510 // Not alphabetic, so save the last token if necessary.
511 if (token_start != -1) {
512 next_tokens.push_back(
513 Token(Token::Type::RFC822_NAME,
514 text_.substr(token_start,
515 iterator_.utf8_index() - token_start)));
516 token_start = -1;
517 }
518
519 // Skip the backslash.
520 AdvanceCursor();
521
522 if (n == '"' || n == '\\' || n == '@') {
523 // Skip these too if they're next.
524 AdvanceCursor();
525 }
526 }
527 } else {
528 // Not a backslash.
529
530 if (token_start != -1) {
531 next_tokens.push_back(
532 Token(Token::Type::RFC822_NAME,
533 text_.substr(token_start,
534 iterator_.utf8_index() - token_start)));
535 token_start = -1;
536 }
537
538 if (c == '"') {
539 end_quote_found = true;
540 }
541 // Advance one more time to get past the non-alphabetic character.
542 AdvanceCursor();
543 }
544 }
545 }
546 if (token_start != -1) {
547 next_tokens.push_back(Token(
548 Token::Type::RFC822_NAME,
549 text_.substr(token_start, iterator_.utf8_index() - token_start)));
550 }
551 return next_tokens;
552 }
553
554 // '(', ')', '\\' chars should be escaped. All other escaped chars should be
555 // unescaped.
ConsumeParenthesizedSection()556 std::vector<Token> ConsumeParenthesizedSection() {
557 // Skip the initial (
558 AdvanceCursor();
559
560 int paren_layer = 1;
561 UChar32 c;
562 std::vector<Token> next_tokens;
563
564 int token_start = -1;
565
566 while (paren_layer > 0 && (iterator_.utf8_index() < text_end_)) {
567 c = iterator_.GetCurrentChar();
568
569 if (i18n_utils::IsAlphaNumeric(c)) {
570 if (token_start == -1) {
571 // Start tracking a token.
572 token_start = iterator_.utf8_index();
573 }
574 AdvanceCursor();
575 } else {
576 // Non alphabetic.
577 if (c == '\\') {
578 // A backslash, let's look at the next character.
579 UChar32 n = i18n_utils::GetUChar32At(text_.begin(), text_.length(),
580 iterator_.utf8_index() + 1);
581 if (i18n_utils::IsAlphaNumeric(n)) {
582 // Alphabetic, skip the slash and don't end the last token.
583 AdvanceCursor();
584 } else {
585 // Not alphabetic, save the last token if necessary.
586 if (token_start != -1) {
587 next_tokens.push_back(
588 Token(Token::Type::RFC822_COMMENT,
589 text_.substr(token_start,
590 iterator_.utf8_index() - token_start)));
591 token_start = -1;
592 }
593
594 // Skip the backslash.
595 AdvanceCursor();
596
597 if (n == ')' || n == '(' || n == '\\') {
598 // Skip these too if they're next.
599 AdvanceCursor();
600 }
601 }
602 } else {
603 // Not a backslash.
604 if (token_start != -1) {
605 next_tokens.push_back(
606 Token(Token::Type::RFC822_COMMENT,
607 text_.substr(token_start,
608 iterator_.utf8_index() - token_start)));
609 token_start = -1;
610 }
611
612 if (c == '(') {
613 paren_layer++;
614 } else if (c == ')') {
615 paren_layer--;
616 }
617 AdvanceCursor();
618 }
619 }
620 }
621
622 if (token_start != -1) {
623 // Ran past the end of text_ without getting the last token.
624
625 // substr returns "a view of the substring [pos, pos + // rcount), where
626 // rcount is the smaller of count and size() - pos" therefore the count
627 // argument can be any value >= this->cursor - token_start. Therefore,
628 // ignoring the mutation warning.
629 next_tokens.push_back(Token(
630 Token::Type::RFC822_COMMENT,
631 text_.substr(token_start, iterator_.utf8_index() - token_start)));
632 }
633 return next_tokens;
634 }
635
636 // Returns tokens found in the address.
ConsumeAddress()637 std::vector<Token> ConsumeAddress() {
638 // Skip the first <.
639 AdvanceCursor();
640
641 // Save the start position.
642 CharacterIterator address_start_iterator = iterator_;
643 std::vector<Token> next_tokens;
644
645 // Place the at sign on the '<', so that if no at_sign is found, the default
646 // is that the entire address is the host part.
647 int at_sign = -1;
648 int address_end = -1;
649
650 UChar32 c = iterator_.GetCurrentChar();
651 // Quick scan for @ and > signs.
652 while (c != '>' && iterator_.utf8_index() < text_end_) {
653 AdvanceCursor();
654 c = iterator_.GetCurrentChar();
655 if (c == '@') {
656 at_sign = iterator_.utf8_index();
657 }
658 }
659
660 if (iterator_.utf8_index() <= address_start_iterator.utf8_index()) {
661 // There is nothing between the brackets, either we have "<" or "<>".
662 return next_tokens;
663 }
664
665 // Either we find a > or run to the end, either way this is the end of the
666 // address. The ending bracket will be handled by ConsumeUnquoted.
667 address_end = iterator_.utf8_index();
668
669 // Reset to the start.
670 iterator_ = address_start_iterator;
671
672 int address_start = address_start_iterator.utf8_index();
673
674 Token::Type type = Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL;
675
676 // Create a local address token.
677 if (at_sign != -1) {
678 next_tokens.push_back(
679 Token(Token::Type::RFC822_LOCAL_ADDRESS,
680 text_.substr(address_start, at_sign - address_start)));
681 } else {
682 // All the tokens in the address are host components.
683 type = Token::Type::RFC822_ADDRESS_COMPONENT_HOST;
684 // If no @ is found, treat the entire address as the host address.
685 at_sign = address_start - 1;
686 }
687
688 // The only case where we don't have a host address part is something like
689 // <localaddress@>. If there is no @, the at_sign is the default -1, and the
690 // host address is [0, address_end).
691 int host_address_start = at_sign + 1;
692 if (host_address_start < address_end) {
693 next_tokens.push_back(Token(
694 Token::Type::RFC822_HOST_ADDRESS,
695 text_.substr(host_address_start, address_end - host_address_start)));
696 }
697
698 next_tokens.push_back(
699 Token(Token::Type::RFC822_ADDRESS,
700 text_.substr(address_start, address_end - address_start)));
701
702 int token_start = -1;
703
704 while (iterator_.utf8_index() < address_end) {
705 c = iterator_.GetCurrentChar();
706
707 if (i18n_utils::IsAlphaNumeric(c)) {
708 if (token_start == -1) {
709 token_start = iterator_.utf8_index();
710 }
711 } else {
712 // non alphabetic
713 if (c == '\\') {
714 // A backslash, let's look at the next character.
715 CharacterIterator temp = iterator_;
716 temp.AdvanceToUtf32(iterator_.utf32_index() + 1);
717 UChar32 n = temp.GetCurrentChar();
718 if (!i18n_utils::IsAlphaNumeric(n)) {
719 // Not alphabetic, end the last token if necessary.
720 if (token_start != -1) {
721 next_tokens.push_back(Token(
722 type, text_.substr(token_start,
723 iterator_.utf8_index() - token_start)));
724 token_start = -1;
725 }
726 }
727 } else {
728 // Not backslash.
729 if (token_start != -1) {
730 next_tokens.push_back(Token(
731 type, text_.substr(token_start,
732 iterator_.utf8_index() - token_start)));
733 token_start = -1;
734 }
735 // Switch to host component tokens.
736 if (iterator_.utf8_index() == at_sign) {
737 type = Token::Type::RFC822_ADDRESS_COMPONENT_HOST;
738 }
739 }
740 }
741 AdvanceCursor();
742 }
743 if (token_start != -1) {
744 next_tokens.push_back(Token(
745 type,
746 text_.substr(token_start, iterator_.utf8_index() - token_start)));
747 }
748 // Unquoted will handle the closing bracket > if these is one.
749 return next_tokens;
750 }
751
AdvanceCursor()752 void AdvanceCursor() {
753 iterator_.AdvanceToUtf32(iterator_.utf32_index() + 1);
754 }
755
AdvancePastWhitespace()756 void AdvancePastWhitespace() {
757 while (i18n_utils::IsWhitespaceAt(text_, iterator_.utf8_index())) {
758 AdvanceCursor();
759 }
760 }
761
762 std::string_view text_;
763 CharacterIterator iterator_;
764 int text_end_;
765
766 // A temporary store of Tokens. As we advance through the provided string,
767 // we parse entire addresses at a time rather than one token at a time.
768 // However, since we call the tokenizer with Advance() alternating with
769 // GetToken(), we need to store tokens for subsequent GetToken calls if
770 // Advance generates multiple tokens (it usually does). A vector is used as
771 // we need to iterate back and forth through tokens during snippeting. It is
772 // cleared by the destructor.
773 std::vector<Token> tokens_;
774 // Index to keep track of where we are in tokens_. This will always be set to
775 // point to an RFC822_TOKEN, or one past the end of the tokens_ vector. The
776 // only exception is before the first Advance call.
777 int token_index_;
778 };
779
780 libtextclassifier3::StatusOr<std::unique_ptr<Tokenizer::Iterator>>
Tokenize(std::string_view text) const781 Rfc822Tokenizer::Tokenize(std::string_view text) const {
782 return std::make_unique<Rfc822TokenIterator>(text);
783 }
784
TokenizeAll(std::string_view text) const785 libtextclassifier3::StatusOr<std::vector<Token>> Rfc822Tokenizer::TokenizeAll(
786 std::string_view text) const {
787 ICING_ASSIGN_OR_RETURN(std::unique_ptr<Tokenizer::Iterator> iterator,
788 Tokenize(text));
789 std::vector<Token> tokens;
790 while (iterator->Advance()) {
791 std::vector<Token> batch_tokens = iterator->GetTokens();
792 tokens.insert(tokens.end(), batch_tokens.begin(), batch_tokens.end());
793 }
794 return tokens;
795 }
796
797 } // namespace lib
798 } // namespace icing
799