xref: /aosp_15_r20/external/libtextclassifier/native/utils/grammar/utils/rules.h (revision 993b0882672172b81d12fad7a7ac0c3e5c824a12)
1*993b0882SAndroid Build Coastguard Worker /*
2*993b0882SAndroid Build Coastguard Worker  * Copyright (C) 2018 The Android Open Source Project
3*993b0882SAndroid Build Coastguard Worker  *
4*993b0882SAndroid Build Coastguard Worker  * Licensed under the Apache License, Version 2.0 (the "License");
5*993b0882SAndroid Build Coastguard Worker  * you may not use this file except in compliance with the License.
6*993b0882SAndroid Build Coastguard Worker  * You may obtain a copy of the License at
7*993b0882SAndroid Build Coastguard Worker  *
8*993b0882SAndroid Build Coastguard Worker  *      http://www.apache.org/licenses/LICENSE-2.0
9*993b0882SAndroid Build Coastguard Worker  *
10*993b0882SAndroid Build Coastguard Worker  * Unless required by applicable law or agreed to in writing, software
11*993b0882SAndroid Build Coastguard Worker  * distributed under the License is distributed on an "AS IS" BASIS,
12*993b0882SAndroid Build Coastguard Worker  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13*993b0882SAndroid Build Coastguard Worker  * See the License for the specific language governing permissions and
14*993b0882SAndroid Build Coastguard Worker  * limitations under the License.
15*993b0882SAndroid Build Coastguard Worker  */
16*993b0882SAndroid Build Coastguard Worker 
17*993b0882SAndroid Build Coastguard Worker // Utility functions for pre-processing, creating and testing context free
18*993b0882SAndroid Build Coastguard Worker // grammars.
19*993b0882SAndroid Build Coastguard Worker 
20*993b0882SAndroid Build Coastguard Worker #ifndef LIBTEXTCLASSIFIER_UTILS_GRAMMAR_UTILS_RULES_H_
21*993b0882SAndroid Build Coastguard Worker #define LIBTEXTCLASSIFIER_UTILS_GRAMMAR_UTILS_RULES_H_
22*993b0882SAndroid Build Coastguard Worker 
23*993b0882SAndroid Build Coastguard Worker #include <unordered_map>
24*993b0882SAndroid Build Coastguard Worker #include <vector>
25*993b0882SAndroid Build Coastguard Worker 
26*993b0882SAndroid Build Coastguard Worker #include "utils/grammar/types.h"
27*993b0882SAndroid Build Coastguard Worker #include "utils/grammar/utils/ir.h"
28*993b0882SAndroid Build Coastguard Worker 
29*993b0882SAndroid Build Coastguard Worker namespace libtextclassifier3::grammar {
30*993b0882SAndroid Build Coastguard Worker 
31*993b0882SAndroid Build Coastguard Worker // Special nonterminals.
32*993b0882SAndroid Build Coastguard Worker constexpr const char* kFiller = "<filler>";
33*993b0882SAndroid Build Coastguard Worker 
34*993b0882SAndroid Build Coastguard Worker // All rules for a grammar will be collected in a rules object.
35*993b0882SAndroid Build Coastguard Worker //
36*993b0882SAndroid Build Coastguard Worker //    Rules r;
37*993b0882SAndroid Build Coastguard Worker //    r.Add("<date>", {"<monthname>", "<day>", <year>"});
38*993b0882SAndroid Build Coastguard Worker //    r.Add("<monthname>", {"January"});
39*993b0882SAndroid Build Coastguard Worker //    ...
40*993b0882SAndroid Build Coastguard Worker //    r.Add("<monthname>", {"December"});
41*993b0882SAndroid Build Coastguard Worker //    r.Add("<day>", {"<string_of_digits>"});
42*993b0882SAndroid Build Coastguard Worker //    r.Add("<year>", {"<string_of_digits>"});
43*993b0882SAndroid Build Coastguard Worker //
44*993b0882SAndroid Build Coastguard Worker // The Add() method adds a rule with a given lhs, rhs/
45*993b0882SAndroid Build Coastguard Worker // The rhs is just a list of terminals and nonterminals.  Anything
46*993b0882SAndroid Build Coastguard Worker // surrounded in angle brackets is considered a nonterminal.  A "?" can follow
47*993b0882SAndroid Build Coastguard Worker // any element of the RHS, like this:
48*993b0882SAndroid Build Coastguard Worker //
49*993b0882SAndroid Build Coastguard Worker //    r.Add("<date>", {"<monthname>", "<day>?", ",?", "<year>"});
50*993b0882SAndroid Build Coastguard Worker //
51*993b0882SAndroid Build Coastguard Worker // This indicates that the <day> and "," parts of the rhs are optional.
52*993b0882SAndroid Build Coastguard Worker // (This is just notational shorthand for adding a bunch of rules.)
53*993b0882SAndroid Build Coastguard Worker //
54*993b0882SAndroid Build Coastguard Worker // Once you're done adding rules, r.Finalize() lowers the rule set into an
55*993b0882SAndroid Build Coastguard Worker // internal representation.
56*993b0882SAndroid Build Coastguard Worker class Rules {
57*993b0882SAndroid Build Coastguard Worker  public:
Rules(const LocaleShardMap & locale_shard_map)58*993b0882SAndroid Build Coastguard Worker   explicit Rules(const LocaleShardMap& locale_shard_map)
59*993b0882SAndroid Build Coastguard Worker       : locale_shard_map_(locale_shard_map) {}
60*993b0882SAndroid Build Coastguard Worker 
61*993b0882SAndroid Build Coastguard Worker   // Represents one item in a right-hand side, a single terminal or nonterminal.
62*993b0882SAndroid Build Coastguard Worker   struct RhsElement {
RhsElementRhsElement63*993b0882SAndroid Build Coastguard Worker     RhsElement() {}
RhsElementRhsElement64*993b0882SAndroid Build Coastguard Worker     explicit RhsElement(const std::string& terminal, const bool is_optional)
65*993b0882SAndroid Build Coastguard Worker         : is_terminal(true),
66*993b0882SAndroid Build Coastguard Worker           terminal(terminal),
67*993b0882SAndroid Build Coastguard Worker           is_optional(is_optional),
68*993b0882SAndroid Build Coastguard Worker           is_constituent(false) {}
69*993b0882SAndroid Build Coastguard Worker     explicit RhsElement(const int nonterminal, const bool is_optional,
70*993b0882SAndroid Build Coastguard Worker                         const bool is_constituent = true)
is_terminalRhsElement71*993b0882SAndroid Build Coastguard Worker         : is_terminal(false),
72*993b0882SAndroid Build Coastguard Worker           nonterminal(nonterminal),
73*993b0882SAndroid Build Coastguard Worker           is_optional(is_optional),
74*993b0882SAndroid Build Coastguard Worker           is_constituent(is_constituent) {}
75*993b0882SAndroid Build Coastguard Worker     bool is_terminal;
76*993b0882SAndroid Build Coastguard Worker     std::string terminal;
77*993b0882SAndroid Build Coastguard Worker     int nonterminal;
78*993b0882SAndroid Build Coastguard Worker     bool is_optional;
79*993b0882SAndroid Build Coastguard Worker 
80*993b0882SAndroid Build Coastguard Worker     // Whether the element is a constituent of a rule - these are the explicit
81*993b0882SAndroid Build Coastguard Worker     // nonterminals, but not terminals or implicitly added anchors.
82*993b0882SAndroid Build Coastguard Worker     bool is_constituent;
83*993b0882SAndroid Build Coastguard Worker   };
84*993b0882SAndroid Build Coastguard Worker 
85*993b0882SAndroid Build Coastguard Worker   // Represents the right-hand side, and possibly callback, of one rule.
86*993b0882SAndroid Build Coastguard Worker   struct Rule {
87*993b0882SAndroid Build Coastguard Worker     std::vector<RhsElement> rhs;
88*993b0882SAndroid Build Coastguard Worker     CallbackId callback = kNoCallback;
89*993b0882SAndroid Build Coastguard Worker     int64 callback_param = 0;
90*993b0882SAndroid Build Coastguard Worker     int8 max_whitespace_gap = -1;
91*993b0882SAndroid Build Coastguard Worker     bool case_sensitive = false;
92*993b0882SAndroid Build Coastguard Worker     int shard = 0;
93*993b0882SAndroid Build Coastguard Worker   };
94*993b0882SAndroid Build Coastguard Worker 
95*993b0882SAndroid Build Coastguard Worker   struct NontermInfo {
96*993b0882SAndroid Build Coastguard Worker     // The name of the non-terminal, if defined.
97*993b0882SAndroid Build Coastguard Worker     std::string name;
98*993b0882SAndroid Build Coastguard Worker 
99*993b0882SAndroid Build Coastguard Worker     // Whether the nonterminal is provided via an annotation.
100*993b0882SAndroid Build Coastguard Worker     bool from_annotation = false;
101*993b0882SAndroid Build Coastguard Worker 
102*993b0882SAndroid Build Coastguard Worker     // Rules that have this non-terminal as the lhs.
103*993b0882SAndroid Build Coastguard Worker     std::vector<int> rules;
104*993b0882SAndroid Build Coastguard Worker 
105*993b0882SAndroid Build Coastguard Worker     // Regex rules that have this non-terminal as the lhs.
106*993b0882SAndroid Build Coastguard Worker     std::vector<int> regex_rules;
107*993b0882SAndroid Build Coastguard Worker   };
108*993b0882SAndroid Build Coastguard Worker 
109*993b0882SAndroid Build Coastguard Worker   // Adds a rule `lhs ::= rhs` with the given callback id and parameter.
110*993b0882SAndroid Build Coastguard Worker   // Note: Nonterminal names are in angle brackets and cannot contain
111*993b0882SAndroid Build Coastguard Worker   // whitespace. The `rhs` is a list of components, each of which is either:
112*993b0882SAndroid Build Coastguard Worker   //  * A nonterminal name (in angle brackets)
113*993b0882SAndroid Build Coastguard Worker   //  * A terminal
114*993b0882SAndroid Build Coastguard Worker   // optionally followed by a `?` which indicates that the component is
115*993b0882SAndroid Build Coastguard Worker   // optional. The `rhs` must contain at least one non-optional component.
116*993b0882SAndroid Build Coastguard Worker   void Add(const std::string& lhs, const std::vector<std::string>& rhs,
117*993b0882SAndroid Build Coastguard Worker            const CallbackId callback = kNoCallback,
118*993b0882SAndroid Build Coastguard Worker            const int64 callback_param = 0, int8 max_whitespace_gap = -1,
119*993b0882SAndroid Build Coastguard Worker            bool case_sensitive = false, int shard = 0);
120*993b0882SAndroid Build Coastguard Worker 
121*993b0882SAndroid Build Coastguard Worker   // Adds a rule `lhs ::= rhs` with the given callback id and parameter.
122*993b0882SAndroid Build Coastguard Worker   // The `rhs` must contain at least one non-optional component.
123*993b0882SAndroid Build Coastguard Worker   void Add(int lhs, const std::vector<RhsElement>& rhs,
124*993b0882SAndroid Build Coastguard Worker            CallbackId callback = kNoCallback, int64 callback_param = 0,
125*993b0882SAndroid Build Coastguard Worker            int8 max_whitespace_gap = -1, bool case_sensitive = false,
126*993b0882SAndroid Build Coastguard Worker            int shard = 0);
127*993b0882SAndroid Build Coastguard Worker 
128*993b0882SAndroid Build Coastguard Worker   // Adds a rule `lhs ::= rhs` with exclusion.
129*993b0882SAndroid Build Coastguard Worker   // The rule only matches, if `excluded_nonterminal` doesn't match the same
130*993b0882SAndroid Build Coastguard Worker   // span.
131*993b0882SAndroid Build Coastguard Worker   void AddWithExclusion(const std::string& lhs,
132*993b0882SAndroid Build Coastguard Worker                         const std::vector<std::string>& rhs,
133*993b0882SAndroid Build Coastguard Worker                         const std::string& excluded_nonterminal,
134*993b0882SAndroid Build Coastguard Worker                         int8 max_whitespace_gap = -1,
135*993b0882SAndroid Build Coastguard Worker                         bool case_sensitive = false, int shard = 0);
136*993b0882SAndroid Build Coastguard Worker 
137*993b0882SAndroid Build Coastguard Worker   // Adds an assertion callback.
138*993b0882SAndroid Build Coastguard Worker   void AddAssertion(const std::string& lhs, const std::vector<std::string>& rhs,
139*993b0882SAndroid Build Coastguard Worker                     bool negative = true, int8 max_whitespace_gap = -1,
140*993b0882SAndroid Build Coastguard Worker                     bool case_sensitive = false, int shard = 0);
141*993b0882SAndroid Build Coastguard Worker 
142*993b0882SAndroid Build Coastguard Worker   // Adds a mapping callback.
143*993b0882SAndroid Build Coastguard Worker   void AddValueMapping(const std::string& lhs,
144*993b0882SAndroid Build Coastguard Worker                        const std::vector<std::string>& rhs, int64 value,
145*993b0882SAndroid Build Coastguard Worker                        int8 max_whitespace_gap = -1,
146*993b0882SAndroid Build Coastguard Worker                        bool case_sensitive = false, int shard = 0);
147*993b0882SAndroid Build Coastguard Worker   void AddValueMapping(int lhs, const std::vector<RhsElement>& rhs, int64 value,
148*993b0882SAndroid Build Coastguard Worker                        int8 max_whitespace_gap = -1,
149*993b0882SAndroid Build Coastguard Worker                        bool case_sensitive = false, int shard = 0);
150*993b0882SAndroid Build Coastguard Worker 
151*993b0882SAndroid Build Coastguard Worker   // Adds a regex rule.
152*993b0882SAndroid Build Coastguard Worker   void AddRegex(const std::string& lhs, const std::string& regex_pattern);
153*993b0882SAndroid Build Coastguard Worker   void AddRegex(int lhs, const std::string& regex_pattern);
154*993b0882SAndroid Build Coastguard Worker 
155*993b0882SAndroid Build Coastguard Worker   // Creates a nonterminal with the given name, if one doesn't already exist.
156*993b0882SAndroid Build Coastguard Worker   int AddNonterminal(const std::string& nonterminal_name);
157*993b0882SAndroid Build Coastguard Worker 
158*993b0882SAndroid Build Coastguard Worker   // Creates a new nonterminal.
159*993b0882SAndroid Build Coastguard Worker   int AddNewNonterminal();
160*993b0882SAndroid Build Coastguard Worker 
161*993b0882SAndroid Build Coastguard Worker   // Defines a nonterminal for an externally provided annotation.
162*993b0882SAndroid Build Coastguard Worker   int AddAnnotation(const std::string& annotation_name);
163*993b0882SAndroid Build Coastguard Worker 
164*993b0882SAndroid Build Coastguard Worker   // Defines a nonterminal for an externally provided annotation.
165*993b0882SAndroid Build Coastguard Worker   void BindAnnotation(const std::string& nonterminal_name,
166*993b0882SAndroid Build Coastguard Worker                       const std::string& annotation_name);
167*993b0882SAndroid Build Coastguard Worker 
168*993b0882SAndroid Build Coastguard Worker   // Adds an alias for a nonterminal. This is a separate name for the same
169*993b0882SAndroid Build Coastguard Worker   // nonterminal.
170*993b0882SAndroid Build Coastguard Worker   void AddAlias(const std::string& nonterminal_name, const std::string& alias);
171*993b0882SAndroid Build Coastguard Worker 
172*993b0882SAndroid Build Coastguard Worker   // Lowers the rule set into the intermediate representation.
173*993b0882SAndroid Build Coastguard Worker   // Treats nonterminals given by the argument `predefined_nonterminals` as
174*993b0882SAndroid Build Coastguard Worker   // defined externally. This allows to define rules that are dependent on
175*993b0882SAndroid Build Coastguard Worker   // non-terminals produced by e.g. existing text annotations and that will be
176*993b0882SAndroid Build Coastguard Worker   // fed to the matcher by the lexer.
177*993b0882SAndroid Build Coastguard Worker   Ir Finalize(const std::set<std::string>& predefined_nonterminals = {}) const;
178*993b0882SAndroid Build Coastguard Worker 
nonterminals()179*993b0882SAndroid Build Coastguard Worker   const std::vector<NontermInfo>& nonterminals() const { return nonterminals_; }
rules()180*993b0882SAndroid Build Coastguard Worker   const std::vector<Rule>& rules() const { return rules_; }
181*993b0882SAndroid Build Coastguard Worker 
182*993b0882SAndroid Build Coastguard Worker  private:
183*993b0882SAndroid Build Coastguard Worker   void ExpandOptionals(
184*993b0882SAndroid Build Coastguard Worker       int lhs, const std::vector<RhsElement>& rhs, CallbackId callback,
185*993b0882SAndroid Build Coastguard Worker       int64 callback_param, int8 max_whitespace_gap, bool case_sensitive,
186*993b0882SAndroid Build Coastguard Worker       int shard, std::vector<int>::const_iterator optional_element_indices,
187*993b0882SAndroid Build Coastguard Worker       std::vector<int>::const_iterator optional_element_indices_end,
188*993b0882SAndroid Build Coastguard Worker       std::vector<bool>* omit_these);
189*993b0882SAndroid Build Coastguard Worker 
190*993b0882SAndroid Build Coastguard Worker   // Applies optimizations to the right hand side of a rule.
191*993b0882SAndroid Build Coastguard Worker   std::vector<RhsElement> OptimizeRhs(const std::vector<RhsElement>& rhs,
192*993b0882SAndroid Build Coastguard Worker                                       int shard = 0);
193*993b0882SAndroid Build Coastguard Worker 
194*993b0882SAndroid Build Coastguard Worker   // Removes start and end anchors in case they are followed (respectively
195*993b0882SAndroid Build Coastguard Worker   // preceded) by unbounded filler.
196*993b0882SAndroid Build Coastguard Worker   std::vector<RhsElement> ResolveAnchors(
197*993b0882SAndroid Build Coastguard Worker       const std::vector<RhsElement>& rhs) const;
198*993b0882SAndroid Build Coastguard Worker 
199*993b0882SAndroid Build Coastguard Worker   // Rewrites fillers in a rule.
200*993b0882SAndroid Build Coastguard Worker   // Fillers in a rule such as `lhs ::= <a> <filler> <b>` could be lowered as
201*993b0882SAndroid Build Coastguard Worker   // <tokens> ::= <token>
202*993b0882SAndroid Build Coastguard Worker   // <tokens> ::= <tokens> <token>
203*993b0882SAndroid Build Coastguard Worker   // This has the disadvantage that it will produce a match for each possible
204*993b0882SAndroid Build Coastguard Worker   // span in the text, which is quadratic in the number of tokens.
205*993b0882SAndroid Build Coastguard Worker   // It can be more efficiently written as:
206*993b0882SAndroid Build Coastguard Worker   // `lhs ::= <a_with_tokens> <b>` with
207*993b0882SAndroid Build Coastguard Worker   // `<a_with_tokens> ::= <a>`
208*993b0882SAndroid Build Coastguard Worker   // `<a_with_tokens> ::= <a_with_tokens> <token>`
209*993b0882SAndroid Build Coastguard Worker   // In this each occurrence of `<a>` can start a sequence of tokens.
210*993b0882SAndroid Build Coastguard Worker   std::vector<RhsElement> ResolveFillers(const std::vector<RhsElement>& rhs,
211*993b0882SAndroid Build Coastguard Worker                                          int shard = 0);
212*993b0882SAndroid Build Coastguard Worker 
213*993b0882SAndroid Build Coastguard Worker   // Checks whether an element denotes a specific nonterminal.
214*993b0882SAndroid Build Coastguard Worker   bool IsNonterminalOfName(const RhsElement& element,
215*993b0882SAndroid Build Coastguard Worker                            const std::string& nonterminal) const;
216*993b0882SAndroid Build Coastguard Worker 
217*993b0882SAndroid Build Coastguard Worker   // Checks whether the fillers are used in any active rule.
218*993b0882SAndroid Build Coastguard Worker   bool UsesFillers() const;
219*993b0882SAndroid Build Coastguard Worker 
220*993b0882SAndroid Build Coastguard Worker   const LocaleShardMap& locale_shard_map_;
221*993b0882SAndroid Build Coastguard Worker 
222*993b0882SAndroid Build Coastguard Worker   // Non-terminal to id map.
223*993b0882SAndroid Build Coastguard Worker   std::unordered_map<std::string, int> nonterminal_names_;
224*993b0882SAndroid Build Coastguard Worker   std::vector<NontermInfo> nonterminals_;
225*993b0882SAndroid Build Coastguard Worker   std::unordered_map<std::string, std::string> nonterminal_alias_;
226*993b0882SAndroid Build Coastguard Worker   std::unordered_map<std::string, int> annotation_nonterminals_;
227*993b0882SAndroid Build Coastguard Worker 
228*993b0882SAndroid Build Coastguard Worker   // Rules.
229*993b0882SAndroid Build Coastguard Worker   std::vector<Rule> rules_;
230*993b0882SAndroid Build Coastguard Worker   std::vector<std::string> regex_rules_;
231*993b0882SAndroid Build Coastguard Worker };
232*993b0882SAndroid Build Coastguard Worker 
233*993b0882SAndroid Build Coastguard Worker }  // namespace libtextclassifier3::grammar
234*993b0882SAndroid Build Coastguard Worker 
235*993b0882SAndroid Build Coastguard Worker #endif  // LIBTEXTCLASSIFIER_UTILS_GRAMMAR_UTILS_RULES_H_
236