1*993b0882SAndroid Build Coastguard Worker /*
2*993b0882SAndroid Build Coastguard Worker * Copyright (C) 2018 The Android Open Source Project
3*993b0882SAndroid Build Coastguard Worker *
4*993b0882SAndroid Build Coastguard Worker * Licensed under the Apache License, Version 2.0 (the "License");
5*993b0882SAndroid Build Coastguard Worker * you may not use this file except in compliance with the License.
6*993b0882SAndroid Build Coastguard Worker * You may obtain a copy of the License at
7*993b0882SAndroid Build Coastguard Worker *
8*993b0882SAndroid Build Coastguard Worker * http://www.apache.org/licenses/LICENSE-2.0
9*993b0882SAndroid Build Coastguard Worker *
10*993b0882SAndroid Build Coastguard Worker * Unless required by applicable law or agreed to in writing, software
11*993b0882SAndroid Build Coastguard Worker * distributed under the License is distributed on an "AS IS" BASIS,
12*993b0882SAndroid Build Coastguard Worker * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13*993b0882SAndroid Build Coastguard Worker * See the License for the specific language governing permissions and
14*993b0882SAndroid Build Coastguard Worker * limitations under the License.
15*993b0882SAndroid Build Coastguard Worker */
16*993b0882SAndroid Build Coastguard Worker
17*993b0882SAndroid Build Coastguard Worker #include "utils/grammar/parsing/lexer.h"
18*993b0882SAndroid Build Coastguard Worker
19*993b0882SAndroid Build Coastguard Worker namespace libtextclassifier3::grammar {
20*993b0882SAndroid Build Coastguard Worker
GetSymbolType(const UnicodeText::const_iterator & it) const21*993b0882SAndroid Build Coastguard Worker Symbol::Type Lexer::GetSymbolType(const UnicodeText::const_iterator& it) const {
22*993b0882SAndroid Build Coastguard Worker if (unilib_.IsPunctuation(*it)) {
23*993b0882SAndroid Build Coastguard Worker return Symbol::Type::TYPE_PUNCTUATION;
24*993b0882SAndroid Build Coastguard Worker } else if (unilib_.IsDigit(*it)) {
25*993b0882SAndroid Build Coastguard Worker return Symbol::Type::TYPE_DIGITS;
26*993b0882SAndroid Build Coastguard Worker } else {
27*993b0882SAndroid Build Coastguard Worker return Symbol::Type::TYPE_TERM;
28*993b0882SAndroid Build Coastguard Worker }
29*993b0882SAndroid Build Coastguard Worker }
30*993b0882SAndroid Build Coastguard Worker
AppendTokenSymbols(const StringPiece value,int match_offset,const CodepointSpan codepoint_span,std::vector<Symbol> * symbols) const31*993b0882SAndroid Build Coastguard Worker void Lexer::AppendTokenSymbols(const StringPiece value, int match_offset,
32*993b0882SAndroid Build Coastguard Worker const CodepointSpan codepoint_span,
33*993b0882SAndroid Build Coastguard Worker std::vector<Symbol>* symbols) const {
34*993b0882SAndroid Build Coastguard Worker // Possibly split token.
35*993b0882SAndroid Build Coastguard Worker UnicodeText token_unicode = UTF8ToUnicodeText(value.data(), value.size(),
36*993b0882SAndroid Build Coastguard Worker /*do_copy=*/false);
37*993b0882SAndroid Build Coastguard Worker int next_match_offset = match_offset;
38*993b0882SAndroid Build Coastguard Worker auto token_end = token_unicode.end();
39*993b0882SAndroid Build Coastguard Worker auto it = token_unicode.begin();
40*993b0882SAndroid Build Coastguard Worker Symbol::Type type = GetSymbolType(it);
41*993b0882SAndroid Build Coastguard Worker CodepointIndex sub_token_start = codepoint_span.first;
42*993b0882SAndroid Build Coastguard Worker while (it != token_end) {
43*993b0882SAndroid Build Coastguard Worker auto next = std::next(it);
44*993b0882SAndroid Build Coastguard Worker int num_codepoints = 1;
45*993b0882SAndroid Build Coastguard Worker Symbol::Type next_type;
46*993b0882SAndroid Build Coastguard Worker while (next != token_end) {
47*993b0882SAndroid Build Coastguard Worker next_type = GetSymbolType(next);
48*993b0882SAndroid Build Coastguard Worker if (type == Symbol::Type::TYPE_PUNCTUATION || next_type != type) {
49*993b0882SAndroid Build Coastguard Worker break;
50*993b0882SAndroid Build Coastguard Worker }
51*993b0882SAndroid Build Coastguard Worker ++next;
52*993b0882SAndroid Build Coastguard Worker ++num_codepoints;
53*993b0882SAndroid Build Coastguard Worker }
54*993b0882SAndroid Build Coastguard Worker symbols->emplace_back(
55*993b0882SAndroid Build Coastguard Worker type, CodepointSpan{sub_token_start, sub_token_start + num_codepoints},
56*993b0882SAndroid Build Coastguard Worker /*match_offset=*/next_match_offset,
57*993b0882SAndroid Build Coastguard Worker /*lexeme=*/
58*993b0882SAndroid Build Coastguard Worker StringPiece(it.utf8_data(), next.utf8_data() - it.utf8_data()));
59*993b0882SAndroid Build Coastguard Worker next_match_offset = sub_token_start + num_codepoints;
60*993b0882SAndroid Build Coastguard Worker it = next;
61*993b0882SAndroid Build Coastguard Worker type = next_type;
62*993b0882SAndroid Build Coastguard Worker sub_token_start = next_match_offset;
63*993b0882SAndroid Build Coastguard Worker }
64*993b0882SAndroid Build Coastguard Worker }
65*993b0882SAndroid Build Coastguard Worker
66*993b0882SAndroid Build Coastguard Worker } // namespace libtextclassifier3::grammar
67