xref: /aosp_15_r20/external/libtextclassifier/native/utils/grammar/parsing/matcher.cc (revision 993b0882672172b81d12fad7a7ac0c3e5c824a12)
1*993b0882SAndroid Build Coastguard Worker /*
2*993b0882SAndroid Build Coastguard Worker  * Copyright (C) 2018 The Android Open Source Project
3*993b0882SAndroid Build Coastguard Worker  *
4*993b0882SAndroid Build Coastguard Worker  * Licensed under the Apache License, Version 2.0 (the "License");
5*993b0882SAndroid Build Coastguard Worker  * you may not use this file except in compliance with the License.
6*993b0882SAndroid Build Coastguard Worker  * You may obtain a copy of the License at
7*993b0882SAndroid Build Coastguard Worker  *
8*993b0882SAndroid Build Coastguard Worker  *      http://www.apache.org/licenses/LICENSE-2.0
9*993b0882SAndroid Build Coastguard Worker  *
10*993b0882SAndroid Build Coastguard Worker  * Unless required by applicable law or agreed to in writing, software
11*993b0882SAndroid Build Coastguard Worker  * distributed under the License is distributed on an "AS IS" BASIS,
12*993b0882SAndroid Build Coastguard Worker  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13*993b0882SAndroid Build Coastguard Worker  * See the License for the specific language governing permissions and
14*993b0882SAndroid Build Coastguard Worker  * limitations under the License.
15*993b0882SAndroid Build Coastguard Worker  */
16*993b0882SAndroid Build Coastguard Worker 
17*993b0882SAndroid Build Coastguard Worker #include "utils/grammar/parsing/matcher.h"
18*993b0882SAndroid Build Coastguard Worker 
19*993b0882SAndroid Build Coastguard Worker #include <iostream>
20*993b0882SAndroid Build Coastguard Worker #include <limits>
21*993b0882SAndroid Build Coastguard Worker 
22*993b0882SAndroid Build Coastguard Worker #include "utils/base/endian.h"
23*993b0882SAndroid Build Coastguard Worker #include "utils/base/logging.h"
24*993b0882SAndroid Build Coastguard Worker #include "utils/base/macros.h"
25*993b0882SAndroid Build Coastguard Worker #include "utils/grammar/types.h"
26*993b0882SAndroid Build Coastguard Worker #include "utils/strings/utf8.h"
27*993b0882SAndroid Build Coastguard Worker 
28*993b0882SAndroid Build Coastguard Worker namespace libtextclassifier3::grammar {
29*993b0882SAndroid Build Coastguard Worker namespace {
30*993b0882SAndroid Build Coastguard Worker 
31*993b0882SAndroid Build Coastguard Worker // Iterator that just enumerates the bytes in a utf8 text.
32*993b0882SAndroid Build Coastguard Worker struct ByteIterator {
ByteIteratorlibtextclassifier3::grammar::__anon014f103e0111::ByteIterator33*993b0882SAndroid Build Coastguard Worker   explicit ByteIterator(StringPiece text)
34*993b0882SAndroid Build Coastguard Worker       : data(text.data()), end(text.data() + text.size()) {}
35*993b0882SAndroid Build Coastguard Worker 
Nextlibtextclassifier3::grammar::__anon014f103e0111::ByteIterator36*993b0882SAndroid Build Coastguard Worker   inline char Next() {
37*993b0882SAndroid Build Coastguard Worker     TC3_DCHECK(HasNext());
38*993b0882SAndroid Build Coastguard Worker     const char c = data[0];
39*993b0882SAndroid Build Coastguard Worker     data++;
40*993b0882SAndroid Build Coastguard Worker     return c;
41*993b0882SAndroid Build Coastguard Worker   }
HasNextlibtextclassifier3::grammar::__anon014f103e0111::ByteIterator42*993b0882SAndroid Build Coastguard Worker   inline bool HasNext() const { return data < end; }
43*993b0882SAndroid Build Coastguard Worker 
44*993b0882SAndroid Build Coastguard Worker   const char* data;
45*993b0882SAndroid Build Coastguard Worker   const char* end;
46*993b0882SAndroid Build Coastguard Worker };
47*993b0882SAndroid Build Coastguard Worker 
48*993b0882SAndroid Build Coastguard Worker // Iterator that lowercases a utf8 string on the fly and enumerates the bytes.
49*993b0882SAndroid Build Coastguard Worker struct LowercasingByteIterator {
LowercasingByteIteratorlibtextclassifier3::grammar::__anon014f103e0111::LowercasingByteIterator50*993b0882SAndroid Build Coastguard Worker   LowercasingByteIterator(const UniLib* unilib, StringPiece text)
51*993b0882SAndroid Build Coastguard Worker       : unilib(*unilib),
52*993b0882SAndroid Build Coastguard Worker         data(text.data()),
53*993b0882SAndroid Build Coastguard Worker         end(text.data() + text.size()),
54*993b0882SAndroid Build Coastguard Worker         buffer_pos(0),
55*993b0882SAndroid Build Coastguard Worker         buffer_size(0) {}
56*993b0882SAndroid Build Coastguard Worker 
Nextlibtextclassifier3::grammar::__anon014f103e0111::LowercasingByteIterator57*993b0882SAndroid Build Coastguard Worker   inline char Next() {
58*993b0882SAndroid Build Coastguard Worker     // Queue next character.
59*993b0882SAndroid Build Coastguard Worker     if (buffer_pos >= buffer_size) {
60*993b0882SAndroid Build Coastguard Worker       buffer_pos = 0;
61*993b0882SAndroid Build Coastguard Worker 
62*993b0882SAndroid Build Coastguard Worker       // Lower-case the next character. The character and its lower-cased
63*993b0882SAndroid Build Coastguard Worker       // counterpart may be represented with a different number of bytes in
64*993b0882SAndroid Build Coastguard Worker       // utf8.
65*993b0882SAndroid Build Coastguard Worker       buffer_size =
66*993b0882SAndroid Build Coastguard Worker           ValidRuneToChar(unilib.ToLower(ValidCharToRune(data)), buffer);
67*993b0882SAndroid Build Coastguard Worker       data += GetNumBytesForUTF8Char(data);
68*993b0882SAndroid Build Coastguard Worker     }
69*993b0882SAndroid Build Coastguard Worker     TC3_DCHECK_LT(buffer_pos, buffer_size);
70*993b0882SAndroid Build Coastguard Worker     return buffer[buffer_pos++];
71*993b0882SAndroid Build Coastguard Worker   }
72*993b0882SAndroid Build Coastguard Worker 
HasNextlibtextclassifier3::grammar::__anon014f103e0111::LowercasingByteIterator73*993b0882SAndroid Build Coastguard Worker   inline bool HasNext() const {
74*993b0882SAndroid Build Coastguard Worker     // Either we are not at the end of the data or didn't consume all bytes of
75*993b0882SAndroid Build Coastguard Worker     // the current character.
76*993b0882SAndroid Build Coastguard Worker     return (data < end || buffer_pos < buffer_size);
77*993b0882SAndroid Build Coastguard Worker   }
78*993b0882SAndroid Build Coastguard Worker 
79*993b0882SAndroid Build Coastguard Worker   const UniLib& unilib;
80*993b0882SAndroid Build Coastguard Worker   const char* data;
81*993b0882SAndroid Build Coastguard Worker   const char* end;
82*993b0882SAndroid Build Coastguard Worker 
83*993b0882SAndroid Build Coastguard Worker   // Each unicode codepoint can have up to 4 utf8 encoding bytes.
84*993b0882SAndroid Build Coastguard Worker   char buffer[4];
85*993b0882SAndroid Build Coastguard Worker   int buffer_pos;
86*993b0882SAndroid Build Coastguard Worker   int buffer_size;
87*993b0882SAndroid Build Coastguard Worker };
88*993b0882SAndroid Build Coastguard Worker 
89*993b0882SAndroid Build Coastguard Worker // Searches a terminal match within a sorted table of terminals.
90*993b0882SAndroid Build Coastguard Worker // Using `LowercasingByteIterator` allows to lower-case the query string on the
91*993b0882SAndroid Build Coastguard Worker // fly.
92*993b0882SAndroid Build Coastguard Worker template <typename T>
FindTerminal(T input_iterator,const char * strings,const uint32 * offsets,const int num_terminals,int * terminal_index)93*993b0882SAndroid Build Coastguard Worker const char* FindTerminal(T input_iterator, const char* strings,
94*993b0882SAndroid Build Coastguard Worker                          const uint32* offsets, const int num_terminals,
95*993b0882SAndroid Build Coastguard Worker                          int* terminal_index) {
96*993b0882SAndroid Build Coastguard Worker   int left = 0;
97*993b0882SAndroid Build Coastguard Worker   int right = num_terminals;
98*993b0882SAndroid Build Coastguard Worker   int span_size = right - left;
99*993b0882SAndroid Build Coastguard Worker   int match_length = 0;
100*993b0882SAndroid Build Coastguard Worker 
101*993b0882SAndroid Build Coastguard Worker   // Loop invariant:
102*993b0882SAndroid Build Coastguard Worker   // At the ith iteration, all strings in the range `left` ... `right` match the
103*993b0882SAndroid Build Coastguard Worker   // input on the first `match_length` characters.
104*993b0882SAndroid Build Coastguard Worker   while (true) {
105*993b0882SAndroid Build Coastguard Worker     const unsigned char c =
106*993b0882SAndroid Build Coastguard Worker         static_cast<const unsigned char>(input_iterator.Next());
107*993b0882SAndroid Build Coastguard Worker 
108*993b0882SAndroid Build Coastguard Worker     // We find the possible range of strings in `left` ... `right` matching the
109*993b0882SAndroid Build Coastguard Worker     // `match_length` + 1 character with two binary searches:
110*993b0882SAndroid Build Coastguard Worker     //    1) `lower_bound` to find the start of the range of matching strings.
111*993b0882SAndroid Build Coastguard Worker     //    2) `upper_bound` to find the non-inclusive end of the range.
112*993b0882SAndroid Build Coastguard Worker     left =
113*993b0882SAndroid Build Coastguard Worker         (std::lower_bound(
114*993b0882SAndroid Build Coastguard Worker              offsets + left, offsets + right, c,
115*993b0882SAndroid Build Coastguard Worker              [strings, match_length](uint32 string_offset, uint32 c) -> bool {
116*993b0882SAndroid Build Coastguard Worker                return static_cast<unsigned char>(
117*993b0882SAndroid Build Coastguard Worker                           strings[string_offset + match_length]) <
118*993b0882SAndroid Build Coastguard Worker                       LittleEndian::ToHost32(c);
119*993b0882SAndroid Build Coastguard Worker              }) -
120*993b0882SAndroid Build Coastguard Worker          offsets);
121*993b0882SAndroid Build Coastguard Worker     right =
122*993b0882SAndroid Build Coastguard Worker         (std::upper_bound(
123*993b0882SAndroid Build Coastguard Worker              offsets + left, offsets + right, c,
124*993b0882SAndroid Build Coastguard Worker              [strings, match_length](uint32 c, uint32 string_offset) -> bool {
125*993b0882SAndroid Build Coastguard Worker                return LittleEndian::ToHost32(c) <
126*993b0882SAndroid Build Coastguard Worker                       static_cast<unsigned char>(
127*993b0882SAndroid Build Coastguard Worker                           strings[string_offset + match_length]);
128*993b0882SAndroid Build Coastguard Worker              }) -
129*993b0882SAndroid Build Coastguard Worker          offsets);
130*993b0882SAndroid Build Coastguard Worker     span_size = right - left;
131*993b0882SAndroid Build Coastguard Worker     if (span_size <= 0) {
132*993b0882SAndroid Build Coastguard Worker       return nullptr;
133*993b0882SAndroid Build Coastguard Worker     }
134*993b0882SAndroid Build Coastguard Worker     ++match_length;
135*993b0882SAndroid Build Coastguard Worker 
136*993b0882SAndroid Build Coastguard Worker     // By the loop invariant and due to the fact that the strings are sorted,
137*993b0882SAndroid Build Coastguard Worker     // a matching string will be at `left` now.
138*993b0882SAndroid Build Coastguard Worker     if (!input_iterator.HasNext()) {
139*993b0882SAndroid Build Coastguard Worker       const int string_offset = LittleEndian::ToHost32(offsets[left]);
140*993b0882SAndroid Build Coastguard Worker       if (strings[string_offset + match_length] == 0) {
141*993b0882SAndroid Build Coastguard Worker         *terminal_index = left;
142*993b0882SAndroid Build Coastguard Worker         return &strings[string_offset];
143*993b0882SAndroid Build Coastguard Worker       }
144*993b0882SAndroid Build Coastguard Worker       return nullptr;
145*993b0882SAndroid Build Coastguard Worker     }
146*993b0882SAndroid Build Coastguard Worker   }
147*993b0882SAndroid Build Coastguard Worker 
148*993b0882SAndroid Build Coastguard Worker   // No match found.
149*993b0882SAndroid Build Coastguard Worker   return nullptr;
150*993b0882SAndroid Build Coastguard Worker }
151*993b0882SAndroid Build Coastguard Worker 
152*993b0882SAndroid Build Coastguard Worker // Finds terminal matches in the terminal rules hash tables.
153*993b0882SAndroid Build Coastguard Worker // In case a match is found, `terminal` will be set to point into the
154*993b0882SAndroid Build Coastguard Worker // terminals string pool.
155*993b0882SAndroid Build Coastguard Worker template <typename T>
FindTerminalMatches(T input_iterator,const RulesSet * rules_set,const RulesSet_::Rules_::TerminalRulesMap * terminal_rules,StringPiece * terminal)156*993b0882SAndroid Build Coastguard Worker const RulesSet_::LhsSet* FindTerminalMatches(
157*993b0882SAndroid Build Coastguard Worker     T input_iterator, const RulesSet* rules_set,
158*993b0882SAndroid Build Coastguard Worker     const RulesSet_::Rules_::TerminalRulesMap* terminal_rules,
159*993b0882SAndroid Build Coastguard Worker     StringPiece* terminal) {
160*993b0882SAndroid Build Coastguard Worker   const int terminal_size = terminal->size();
161*993b0882SAndroid Build Coastguard Worker   if (terminal_size < terminal_rules->min_terminal_length() ||
162*993b0882SAndroid Build Coastguard Worker       terminal_size > terminal_rules->max_terminal_length()) {
163*993b0882SAndroid Build Coastguard Worker     return nullptr;
164*993b0882SAndroid Build Coastguard Worker   }
165*993b0882SAndroid Build Coastguard Worker   int terminal_index;
166*993b0882SAndroid Build Coastguard Worker   if (const char* terminal_match = FindTerminal(
167*993b0882SAndroid Build Coastguard Worker           input_iterator, rules_set->terminals()->data(),
168*993b0882SAndroid Build Coastguard Worker           terminal_rules->terminal_offsets()->data(),
169*993b0882SAndroid Build Coastguard Worker           terminal_rules->terminal_offsets()->size(), &terminal_index)) {
170*993b0882SAndroid Build Coastguard Worker     *terminal = StringPiece(terminal_match, terminal->length());
171*993b0882SAndroid Build Coastguard Worker     return rules_set->lhs_set()->Get(
172*993b0882SAndroid Build Coastguard Worker         terminal_rules->lhs_set_index()->Get(terminal_index));
173*993b0882SAndroid Build Coastguard Worker   }
174*993b0882SAndroid Build Coastguard Worker   return nullptr;
175*993b0882SAndroid Build Coastguard Worker }
176*993b0882SAndroid Build Coastguard Worker 
177*993b0882SAndroid Build Coastguard Worker // Finds unary rules matches.
FindUnaryRulesMatches(const RulesSet * rules_set,const RulesSet_::Rules * rules,const Nonterm nonterminal)178*993b0882SAndroid Build Coastguard Worker const RulesSet_::LhsSet* FindUnaryRulesMatches(const RulesSet* rules_set,
179*993b0882SAndroid Build Coastguard Worker                                                const RulesSet_::Rules* rules,
180*993b0882SAndroid Build Coastguard Worker                                                const Nonterm nonterminal) {
181*993b0882SAndroid Build Coastguard Worker   if (!rules->unary_rules()) {
182*993b0882SAndroid Build Coastguard Worker     return nullptr;
183*993b0882SAndroid Build Coastguard Worker   }
184*993b0882SAndroid Build Coastguard Worker   if (const RulesSet_::Rules_::UnaryRulesEntry* entry =
185*993b0882SAndroid Build Coastguard Worker           rules->unary_rules()->LookupByKey(nonterminal)) {
186*993b0882SAndroid Build Coastguard Worker     return rules_set->lhs_set()->Get(entry->value());
187*993b0882SAndroid Build Coastguard Worker   }
188*993b0882SAndroid Build Coastguard Worker   return nullptr;
189*993b0882SAndroid Build Coastguard Worker }
190*993b0882SAndroid Build Coastguard Worker 
191*993b0882SAndroid Build Coastguard Worker // Finds binary rules matches.
FindBinaryRulesMatches(const RulesSet * rules_set,const RulesSet_::Rules * rules,const TwoNonterms nonterminals)192*993b0882SAndroid Build Coastguard Worker const RulesSet_::LhsSet* FindBinaryRulesMatches(
193*993b0882SAndroid Build Coastguard Worker     const RulesSet* rules_set, const RulesSet_::Rules* rules,
194*993b0882SAndroid Build Coastguard Worker     const TwoNonterms nonterminals) {
195*993b0882SAndroid Build Coastguard Worker   if (!rules->binary_rules()) {
196*993b0882SAndroid Build Coastguard Worker     return nullptr;
197*993b0882SAndroid Build Coastguard Worker   }
198*993b0882SAndroid Build Coastguard Worker 
199*993b0882SAndroid Build Coastguard Worker   // Lookup in rules hash table.
200*993b0882SAndroid Build Coastguard Worker   const uint32 bucket_index =
201*993b0882SAndroid Build Coastguard Worker       BinaryRuleHasher()(nonterminals) % rules->binary_rules()->size();
202*993b0882SAndroid Build Coastguard Worker 
203*993b0882SAndroid Build Coastguard Worker   // Get hash table bucket.
204*993b0882SAndroid Build Coastguard Worker   if (const RulesSet_::Rules_::BinaryRuleTableBucket* bucket =
205*993b0882SAndroid Build Coastguard Worker           rules->binary_rules()->Get(bucket_index)) {
206*993b0882SAndroid Build Coastguard Worker     if (bucket->rules() == nullptr) {
207*993b0882SAndroid Build Coastguard Worker       return nullptr;
208*993b0882SAndroid Build Coastguard Worker     }
209*993b0882SAndroid Build Coastguard Worker 
210*993b0882SAndroid Build Coastguard Worker     // Check all entries in the chain.
211*993b0882SAndroid Build Coastguard Worker     for (const RulesSet_::Rules_::BinaryRule* rule : *bucket->rules()) {
212*993b0882SAndroid Build Coastguard Worker       if (rule->rhs_first() == nonterminals.first &&
213*993b0882SAndroid Build Coastguard Worker           rule->rhs_second() == nonterminals.second) {
214*993b0882SAndroid Build Coastguard Worker         return rules_set->lhs_set()->Get(rule->lhs_set_index());
215*993b0882SAndroid Build Coastguard Worker       }
216*993b0882SAndroid Build Coastguard Worker     }
217*993b0882SAndroid Build Coastguard Worker   }
218*993b0882SAndroid Build Coastguard Worker 
219*993b0882SAndroid Build Coastguard Worker   return nullptr;
220*993b0882SAndroid Build Coastguard Worker }
221*993b0882SAndroid Build Coastguard Worker 
GetLhs(const RulesSet * rules_set,const int lhs_entry,Nonterm * nonterminal,CallbackId * callback,int64 * param,int8 * max_whitespace_gap)222*993b0882SAndroid Build Coastguard Worker inline void GetLhs(const RulesSet* rules_set, const int lhs_entry,
223*993b0882SAndroid Build Coastguard Worker                    Nonterm* nonterminal, CallbackId* callback, int64* param,
224*993b0882SAndroid Build Coastguard Worker                    int8* max_whitespace_gap) {
225*993b0882SAndroid Build Coastguard Worker   if (lhs_entry > 0) {
226*993b0882SAndroid Build Coastguard Worker     // Direct encoding of the nonterminal.
227*993b0882SAndroid Build Coastguard Worker     *nonterminal = lhs_entry;
228*993b0882SAndroid Build Coastguard Worker     *callback = kNoCallback;
229*993b0882SAndroid Build Coastguard Worker     *param = 0;
230*993b0882SAndroid Build Coastguard Worker     *max_whitespace_gap = -1;
231*993b0882SAndroid Build Coastguard Worker   } else {
232*993b0882SAndroid Build Coastguard Worker     const RulesSet_::Lhs* lhs = rules_set->lhs()->Get(-lhs_entry);
233*993b0882SAndroid Build Coastguard Worker     *nonterminal = lhs->nonterminal();
234*993b0882SAndroid Build Coastguard Worker     *callback = lhs->callback_id();
235*993b0882SAndroid Build Coastguard Worker     *param = lhs->callback_param();
236*993b0882SAndroid Build Coastguard Worker     *max_whitespace_gap = lhs->max_whitespace_gap();
237*993b0882SAndroid Build Coastguard Worker   }
238*993b0882SAndroid Build Coastguard Worker }
239*993b0882SAndroid Build Coastguard Worker 
240*993b0882SAndroid Build Coastguard Worker }  // namespace
241*993b0882SAndroid Build Coastguard Worker 
Finish()242*993b0882SAndroid Build Coastguard Worker void Matcher::Finish() {
243*993b0882SAndroid Build Coastguard Worker   // Check any pending items.
244*993b0882SAndroid Build Coastguard Worker   ProcessPendingExclusionMatches();
245*993b0882SAndroid Build Coastguard Worker }
246*993b0882SAndroid Build Coastguard Worker 
QueueForProcessing(ParseTree * item)247*993b0882SAndroid Build Coastguard Worker void Matcher::QueueForProcessing(ParseTree* item) {
248*993b0882SAndroid Build Coastguard Worker   // Push element to the front.
249*993b0882SAndroid Build Coastguard Worker   item->next = pending_items_;
250*993b0882SAndroid Build Coastguard Worker   pending_items_ = item;
251*993b0882SAndroid Build Coastguard Worker }
252*993b0882SAndroid Build Coastguard Worker 
QueueForPostCheck(ExclusionNode * item)253*993b0882SAndroid Build Coastguard Worker void Matcher::QueueForPostCheck(ExclusionNode* item) {
254*993b0882SAndroid Build Coastguard Worker   // Push element to the front.
255*993b0882SAndroid Build Coastguard Worker   item->next = pending_exclusion_items_;
256*993b0882SAndroid Build Coastguard Worker   pending_exclusion_items_ = item;
257*993b0882SAndroid Build Coastguard Worker }
258*993b0882SAndroid Build Coastguard Worker 
AddTerminal(const CodepointSpan codepoint_span,const int match_offset,StringPiece terminal)259*993b0882SAndroid Build Coastguard Worker void Matcher::AddTerminal(const CodepointSpan codepoint_span,
260*993b0882SAndroid Build Coastguard Worker                           const int match_offset, StringPiece terminal) {
261*993b0882SAndroid Build Coastguard Worker   TC3_CHECK_GE(codepoint_span.second, last_end_);
262*993b0882SAndroid Build Coastguard Worker 
263*993b0882SAndroid Build Coastguard Worker   // Finish any pending post-checks.
264*993b0882SAndroid Build Coastguard Worker   if (codepoint_span.second > last_end_) {
265*993b0882SAndroid Build Coastguard Worker     ProcessPendingExclusionMatches();
266*993b0882SAndroid Build Coastguard Worker   }
267*993b0882SAndroid Build Coastguard Worker 
268*993b0882SAndroid Build Coastguard Worker   last_end_ = codepoint_span.second;
269*993b0882SAndroid Build Coastguard Worker   for (const RulesSet_::Rules* shard : rules_shards_) {
270*993b0882SAndroid Build Coastguard Worker     // Try case-sensitive matches.
271*993b0882SAndroid Build Coastguard Worker     if (const RulesSet_::LhsSet* lhs_set =
272*993b0882SAndroid Build Coastguard Worker             FindTerminalMatches(ByteIterator(terminal), rules_,
273*993b0882SAndroid Build Coastguard Worker                                 shard->terminal_rules(), &terminal)) {
274*993b0882SAndroid Build Coastguard Worker       // `terminal` points now into the rules string pool, providing a
275*993b0882SAndroid Build Coastguard Worker       // stable reference.
276*993b0882SAndroid Build Coastguard Worker       ExecuteLhsSet(
277*993b0882SAndroid Build Coastguard Worker           codepoint_span, match_offset,
278*993b0882SAndroid Build Coastguard Worker           /*whitespace_gap=*/(codepoint_span.first - match_offset),
279*993b0882SAndroid Build Coastguard Worker           [terminal](ParseTree* parse_tree) {
280*993b0882SAndroid Build Coastguard Worker             parse_tree->terminal = terminal.data();
281*993b0882SAndroid Build Coastguard Worker             parse_tree->rhs2 = nullptr;
282*993b0882SAndroid Build Coastguard Worker           },
283*993b0882SAndroid Build Coastguard Worker           lhs_set);
284*993b0882SAndroid Build Coastguard Worker     }
285*993b0882SAndroid Build Coastguard Worker 
286*993b0882SAndroid Build Coastguard Worker     // Try case-insensitive matches.
287*993b0882SAndroid Build Coastguard Worker     if (const RulesSet_::LhsSet* lhs_set = FindTerminalMatches(
288*993b0882SAndroid Build Coastguard Worker             LowercasingByteIterator(&unilib_, terminal), rules_,
289*993b0882SAndroid Build Coastguard Worker             shard->lowercase_terminal_rules(), &terminal)) {
290*993b0882SAndroid Build Coastguard Worker       // `terminal` points now into the rules string pool, providing a
291*993b0882SAndroid Build Coastguard Worker       // stable reference.
292*993b0882SAndroid Build Coastguard Worker       ExecuteLhsSet(
293*993b0882SAndroid Build Coastguard Worker           codepoint_span, match_offset,
294*993b0882SAndroid Build Coastguard Worker           /*whitespace_gap=*/(codepoint_span.first - match_offset),
295*993b0882SAndroid Build Coastguard Worker           [terminal](ParseTree* parse_tree) {
296*993b0882SAndroid Build Coastguard Worker             parse_tree->terminal = terminal.data();
297*993b0882SAndroid Build Coastguard Worker             parse_tree->rhs2 = nullptr;
298*993b0882SAndroid Build Coastguard Worker           },
299*993b0882SAndroid Build Coastguard Worker           lhs_set);
300*993b0882SAndroid Build Coastguard Worker     }
301*993b0882SAndroid Build Coastguard Worker   }
302*993b0882SAndroid Build Coastguard Worker   ProcessPendingSet();
303*993b0882SAndroid Build Coastguard Worker }
304*993b0882SAndroid Build Coastguard Worker 
AddParseTree(ParseTree * parse_tree)305*993b0882SAndroid Build Coastguard Worker void Matcher::AddParseTree(ParseTree* parse_tree) {
306*993b0882SAndroid Build Coastguard Worker   TC3_CHECK_GE(parse_tree->codepoint_span.second, last_end_);
307*993b0882SAndroid Build Coastguard Worker 
308*993b0882SAndroid Build Coastguard Worker   // Finish any pending post-checks.
309*993b0882SAndroid Build Coastguard Worker   if (parse_tree->codepoint_span.second > last_end_) {
310*993b0882SAndroid Build Coastguard Worker     ProcessPendingExclusionMatches();
311*993b0882SAndroid Build Coastguard Worker   }
312*993b0882SAndroid Build Coastguard Worker 
313*993b0882SAndroid Build Coastguard Worker   last_end_ = parse_tree->codepoint_span.second;
314*993b0882SAndroid Build Coastguard Worker   QueueForProcessing(parse_tree);
315*993b0882SAndroid Build Coastguard Worker   ProcessPendingSet();
316*993b0882SAndroid Build Coastguard Worker }
317*993b0882SAndroid Build Coastguard Worker 
ExecuteLhsSet(const CodepointSpan codepoint_span,const int match_offset_bytes,const int whitespace_gap,const std::function<void (ParseTree *)> & initializer_fn,const RulesSet_::LhsSet * lhs_set)318*993b0882SAndroid Build Coastguard Worker void Matcher::ExecuteLhsSet(
319*993b0882SAndroid Build Coastguard Worker     const CodepointSpan codepoint_span, const int match_offset_bytes,
320*993b0882SAndroid Build Coastguard Worker     const int whitespace_gap,
321*993b0882SAndroid Build Coastguard Worker     const std::function<void(ParseTree*)>& initializer_fn,
322*993b0882SAndroid Build Coastguard Worker     const RulesSet_::LhsSet* lhs_set) {
323*993b0882SAndroid Build Coastguard Worker   TC3_CHECK(lhs_set);
324*993b0882SAndroid Build Coastguard Worker   ParseTree* parse_tree = nullptr;
325*993b0882SAndroid Build Coastguard Worker   Nonterm prev_lhs = kUnassignedNonterm;
326*993b0882SAndroid Build Coastguard Worker   for (const int32 lhs_entry : *lhs_set->lhs()) {
327*993b0882SAndroid Build Coastguard Worker     Nonterm lhs;
328*993b0882SAndroid Build Coastguard Worker     CallbackId callback_id;
329*993b0882SAndroid Build Coastguard Worker     int64 callback_param;
330*993b0882SAndroid Build Coastguard Worker     int8 max_whitespace_gap;
331*993b0882SAndroid Build Coastguard Worker     GetLhs(rules_, lhs_entry, &lhs, &callback_id, &callback_param,
332*993b0882SAndroid Build Coastguard Worker            &max_whitespace_gap);
333*993b0882SAndroid Build Coastguard Worker 
334*993b0882SAndroid Build Coastguard Worker     // Check that the allowed whitespace gap limit is followed.
335*993b0882SAndroid Build Coastguard Worker     if (max_whitespace_gap >= 0 && whitespace_gap > max_whitespace_gap) {
336*993b0882SAndroid Build Coastguard Worker       continue;
337*993b0882SAndroid Build Coastguard Worker     }
338*993b0882SAndroid Build Coastguard Worker 
339*993b0882SAndroid Build Coastguard Worker     // Handle callbacks.
340*993b0882SAndroid Build Coastguard Worker     switch (static_cast<DefaultCallback>(callback_id)) {
341*993b0882SAndroid Build Coastguard Worker       case DefaultCallback::kAssertion: {
342*993b0882SAndroid Build Coastguard Worker         AssertionNode* assertion_node = arena_->AllocAndInit<AssertionNode>(
343*993b0882SAndroid Build Coastguard Worker             lhs, codepoint_span, match_offset_bytes,
344*993b0882SAndroid Build Coastguard Worker             /*negative=*/(callback_param != 0));
345*993b0882SAndroid Build Coastguard Worker         initializer_fn(assertion_node);
346*993b0882SAndroid Build Coastguard Worker         QueueForProcessing(assertion_node);
347*993b0882SAndroid Build Coastguard Worker         continue;
348*993b0882SAndroid Build Coastguard Worker       }
349*993b0882SAndroid Build Coastguard Worker       case DefaultCallback::kMapping: {
350*993b0882SAndroid Build Coastguard Worker         MappingNode* mapping_node = arena_->AllocAndInit<MappingNode>(
351*993b0882SAndroid Build Coastguard Worker             lhs, codepoint_span, match_offset_bytes, /*id=*/callback_param);
352*993b0882SAndroid Build Coastguard Worker         initializer_fn(mapping_node);
353*993b0882SAndroid Build Coastguard Worker         QueueForProcessing(mapping_node);
354*993b0882SAndroid Build Coastguard Worker         continue;
355*993b0882SAndroid Build Coastguard Worker       }
356*993b0882SAndroid Build Coastguard Worker       case DefaultCallback::kExclusion: {
357*993b0882SAndroid Build Coastguard Worker         // We can only check the exclusion once all matches up to this position
358*993b0882SAndroid Build Coastguard Worker         // have been processed. Schedule and post check later.
359*993b0882SAndroid Build Coastguard Worker         ExclusionNode* exclusion_node = arena_->AllocAndInit<ExclusionNode>(
360*993b0882SAndroid Build Coastguard Worker             lhs, codepoint_span, match_offset_bytes,
361*993b0882SAndroid Build Coastguard Worker             /*exclusion_nonterm=*/callback_param);
362*993b0882SAndroid Build Coastguard Worker         initializer_fn(exclusion_node);
363*993b0882SAndroid Build Coastguard Worker         QueueForPostCheck(exclusion_node);
364*993b0882SAndroid Build Coastguard Worker         continue;
365*993b0882SAndroid Build Coastguard Worker       }
366*993b0882SAndroid Build Coastguard Worker       case DefaultCallback::kSemanticExpression: {
367*993b0882SAndroid Build Coastguard Worker         SemanticExpressionNode* expression_node =
368*993b0882SAndroid Build Coastguard Worker             arena_->AllocAndInit<SemanticExpressionNode>(
369*993b0882SAndroid Build Coastguard Worker                 lhs, codepoint_span, match_offset_bytes,
370*993b0882SAndroid Build Coastguard Worker                 /*expression=*/
371*993b0882SAndroid Build Coastguard Worker                 rules_->semantic_expression()->Get(callback_param));
372*993b0882SAndroid Build Coastguard Worker         initializer_fn(expression_node);
373*993b0882SAndroid Build Coastguard Worker         QueueForProcessing(expression_node);
374*993b0882SAndroid Build Coastguard Worker         continue;
375*993b0882SAndroid Build Coastguard Worker       }
376*993b0882SAndroid Build Coastguard Worker       default:
377*993b0882SAndroid Build Coastguard Worker         break;
378*993b0882SAndroid Build Coastguard Worker     }
379*993b0882SAndroid Build Coastguard Worker 
380*993b0882SAndroid Build Coastguard Worker     if (prev_lhs != lhs) {
381*993b0882SAndroid Build Coastguard Worker       prev_lhs = lhs;
382*993b0882SAndroid Build Coastguard Worker       parse_tree = arena_->AllocAndInit<ParseTree>(
383*993b0882SAndroid Build Coastguard Worker           lhs, codepoint_span, match_offset_bytes, ParseTree::Type::kDefault);
384*993b0882SAndroid Build Coastguard Worker       initializer_fn(parse_tree);
385*993b0882SAndroid Build Coastguard Worker       QueueForProcessing(parse_tree);
386*993b0882SAndroid Build Coastguard Worker     }
387*993b0882SAndroid Build Coastguard Worker 
388*993b0882SAndroid Build Coastguard Worker     if (static_cast<DefaultCallback>(callback_id) ==
389*993b0882SAndroid Build Coastguard Worker         DefaultCallback::kRootRule) {
390*993b0882SAndroid Build Coastguard Worker       chart_.AddDerivation(Derivation{parse_tree, /*rule_id=*/callback_param});
391*993b0882SAndroid Build Coastguard Worker     }
392*993b0882SAndroid Build Coastguard Worker   }
393*993b0882SAndroid Build Coastguard Worker }
394*993b0882SAndroid Build Coastguard Worker 
ProcessPendingSet()395*993b0882SAndroid Build Coastguard Worker void Matcher::ProcessPendingSet() {
396*993b0882SAndroid Build Coastguard Worker   while (pending_items_) {
397*993b0882SAndroid Build Coastguard Worker     // Process.
398*993b0882SAndroid Build Coastguard Worker     ParseTree* item = pending_items_;
399*993b0882SAndroid Build Coastguard Worker     pending_items_ = pending_items_->next;
400*993b0882SAndroid Build Coastguard Worker 
401*993b0882SAndroid Build Coastguard Worker     // Add it to the chart.
402*993b0882SAndroid Build Coastguard Worker     chart_.Add(item);
403*993b0882SAndroid Build Coastguard Worker 
404*993b0882SAndroid Build Coastguard Worker     // Check unary rules that trigger.
405*993b0882SAndroid Build Coastguard Worker     for (const RulesSet_::Rules* shard : rules_shards_) {
406*993b0882SAndroid Build Coastguard Worker       if (const RulesSet_::LhsSet* lhs_set =
407*993b0882SAndroid Build Coastguard Worker               FindUnaryRulesMatches(rules_, shard, item->lhs)) {
408*993b0882SAndroid Build Coastguard Worker         ExecuteLhsSet(
409*993b0882SAndroid Build Coastguard Worker             item->codepoint_span, item->match_offset,
410*993b0882SAndroid Build Coastguard Worker             /*whitespace_gap=*/
411*993b0882SAndroid Build Coastguard Worker             (item->codepoint_span.first - item->match_offset),
412*993b0882SAndroid Build Coastguard Worker             [item](ParseTree* parse_tree) {
413*993b0882SAndroid Build Coastguard Worker               parse_tree->rhs1 = nullptr;
414*993b0882SAndroid Build Coastguard Worker               parse_tree->rhs2 = item;
415*993b0882SAndroid Build Coastguard Worker             },
416*993b0882SAndroid Build Coastguard Worker             lhs_set);
417*993b0882SAndroid Build Coastguard Worker       }
418*993b0882SAndroid Build Coastguard Worker     }
419*993b0882SAndroid Build Coastguard Worker 
420*993b0882SAndroid Build Coastguard Worker     // Check binary rules that trigger.
421*993b0882SAndroid Build Coastguard Worker     // Lookup by begin.
422*993b0882SAndroid Build Coastguard Worker     for (Chart<>::Iterator it = chart_.MatchesEndingAt(item->match_offset);
423*993b0882SAndroid Build Coastguard Worker          !it.Done(); it.Next()) {
424*993b0882SAndroid Build Coastguard Worker       const ParseTree* prev = it.Item();
425*993b0882SAndroid Build Coastguard Worker       for (const RulesSet_::Rules* shard : rules_shards_) {
426*993b0882SAndroid Build Coastguard Worker         if (const RulesSet_::LhsSet* lhs_set =
427*993b0882SAndroid Build Coastguard Worker                 FindBinaryRulesMatches(rules_, shard, {prev->lhs, item->lhs})) {
428*993b0882SAndroid Build Coastguard Worker           ExecuteLhsSet(
429*993b0882SAndroid Build Coastguard Worker               /*codepoint_span=*/
430*993b0882SAndroid Build Coastguard Worker               {prev->codepoint_span.first, item->codepoint_span.second},
431*993b0882SAndroid Build Coastguard Worker               prev->match_offset,
432*993b0882SAndroid Build Coastguard Worker               /*whitespace_gap=*/
433*993b0882SAndroid Build Coastguard Worker               (item->codepoint_span.first -
434*993b0882SAndroid Build Coastguard Worker                item->match_offset),  // Whitespace gap is the gap
435*993b0882SAndroid Build Coastguard Worker                                      // between the two parts.
436*993b0882SAndroid Build Coastguard Worker               [prev, item](ParseTree* parse_tree) {
437*993b0882SAndroid Build Coastguard Worker                 parse_tree->rhs1 = prev;
438*993b0882SAndroid Build Coastguard Worker                 parse_tree->rhs2 = item;
439*993b0882SAndroid Build Coastguard Worker               },
440*993b0882SAndroid Build Coastguard Worker               lhs_set);
441*993b0882SAndroid Build Coastguard Worker         }
442*993b0882SAndroid Build Coastguard Worker       }
443*993b0882SAndroid Build Coastguard Worker     }
444*993b0882SAndroid Build Coastguard Worker   }
445*993b0882SAndroid Build Coastguard Worker }
446*993b0882SAndroid Build Coastguard Worker 
ProcessPendingExclusionMatches()447*993b0882SAndroid Build Coastguard Worker void Matcher::ProcessPendingExclusionMatches() {
448*993b0882SAndroid Build Coastguard Worker   while (pending_exclusion_items_) {
449*993b0882SAndroid Build Coastguard Worker     ExclusionNode* item = pending_exclusion_items_;
450*993b0882SAndroid Build Coastguard Worker     pending_exclusion_items_ = static_cast<ExclusionNode*>(item->next);
451*993b0882SAndroid Build Coastguard Worker 
452*993b0882SAndroid Build Coastguard Worker     // Check that the exclusion condition is fulfilled.
453*993b0882SAndroid Build Coastguard Worker     if (!chart_.HasMatch(item->exclusion_nonterm, item->codepoint_span)) {
454*993b0882SAndroid Build Coastguard Worker       AddParseTree(item);
455*993b0882SAndroid Build Coastguard Worker     }
456*993b0882SAndroid Build Coastguard Worker   }
457*993b0882SAndroid Build Coastguard Worker }
458*993b0882SAndroid Build Coastguard Worker 
459*993b0882SAndroid Build Coastguard Worker }  // namespace libtextclassifier3::grammar
460