1*993b0882SAndroid Build Coastguard Worker /* 2*993b0882SAndroid Build Coastguard Worker * Copyright (C) 2018 The Android Open Source Project 3*993b0882SAndroid Build Coastguard Worker * 4*993b0882SAndroid Build Coastguard Worker * Licensed under the Apache License, Version 2.0 (the "License"); 5*993b0882SAndroid Build Coastguard Worker * you may not use this file except in compliance with the License. 6*993b0882SAndroid Build Coastguard Worker * You may obtain a copy of the License at 7*993b0882SAndroid Build Coastguard Worker * 8*993b0882SAndroid Build Coastguard Worker * http://www.apache.org/licenses/LICENSE-2.0 9*993b0882SAndroid Build Coastguard Worker * 10*993b0882SAndroid Build Coastguard Worker * Unless required by applicable law or agreed to in writing, software 11*993b0882SAndroid Build Coastguard Worker * distributed under the License is distributed on an "AS IS" BASIS, 12*993b0882SAndroid Build Coastguard Worker * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13*993b0882SAndroid Build Coastguard Worker * See the License for the specific language governing permissions and 14*993b0882SAndroid Build Coastguard Worker * limitations under the License. 15*993b0882SAndroid Build Coastguard Worker */ 16*993b0882SAndroid Build Coastguard Worker 17*993b0882SAndroid Build Coastguard Worker // Utility functions for pre-processing, creating and testing context free 18*993b0882SAndroid Build Coastguard Worker // grammars. 19*993b0882SAndroid Build Coastguard Worker 20*993b0882SAndroid Build Coastguard Worker #ifndef LIBTEXTCLASSIFIER_UTILS_GRAMMAR_UTILS_RULES_H_ 21*993b0882SAndroid Build Coastguard Worker #define LIBTEXTCLASSIFIER_UTILS_GRAMMAR_UTILS_RULES_H_ 22*993b0882SAndroid Build Coastguard Worker 23*993b0882SAndroid Build Coastguard Worker #include <unordered_map> 24*993b0882SAndroid Build Coastguard Worker #include <vector> 25*993b0882SAndroid Build Coastguard Worker 26*993b0882SAndroid Build Coastguard Worker #include "utils/grammar/types.h" 27*993b0882SAndroid Build Coastguard Worker #include "utils/grammar/utils/ir.h" 28*993b0882SAndroid Build Coastguard Worker 29*993b0882SAndroid Build Coastguard Worker namespace libtextclassifier3::grammar { 30*993b0882SAndroid Build Coastguard Worker 31*993b0882SAndroid Build Coastguard Worker // Special nonterminals. 32*993b0882SAndroid Build Coastguard Worker constexpr const char* kFiller = "<filler>"; 33*993b0882SAndroid Build Coastguard Worker 34*993b0882SAndroid Build Coastguard Worker // All rules for a grammar will be collected in a rules object. 35*993b0882SAndroid Build Coastguard Worker // 36*993b0882SAndroid Build Coastguard Worker // Rules r; 37*993b0882SAndroid Build Coastguard Worker // r.Add("<date>", {"<monthname>", "<day>", <year>"}); 38*993b0882SAndroid Build Coastguard Worker // r.Add("<monthname>", {"January"}); 39*993b0882SAndroid Build Coastguard Worker // ... 40*993b0882SAndroid Build Coastguard Worker // r.Add("<monthname>", {"December"}); 41*993b0882SAndroid Build Coastguard Worker // r.Add("<day>", {"<string_of_digits>"}); 42*993b0882SAndroid Build Coastguard Worker // r.Add("<year>", {"<string_of_digits>"}); 43*993b0882SAndroid Build Coastguard Worker // 44*993b0882SAndroid Build Coastguard Worker // The Add() method adds a rule with a given lhs, rhs/ 45*993b0882SAndroid Build Coastguard Worker // The rhs is just a list of terminals and nonterminals. Anything 46*993b0882SAndroid Build Coastguard Worker // surrounded in angle brackets is considered a nonterminal. A "?" can follow 47*993b0882SAndroid Build Coastguard Worker // any element of the RHS, like this: 48*993b0882SAndroid Build Coastguard Worker // 49*993b0882SAndroid Build Coastguard Worker // r.Add("<date>", {"<monthname>", "<day>?", ",?", "<year>"}); 50*993b0882SAndroid Build Coastguard Worker // 51*993b0882SAndroid Build Coastguard Worker // This indicates that the <day> and "," parts of the rhs are optional. 52*993b0882SAndroid Build Coastguard Worker // (This is just notational shorthand for adding a bunch of rules.) 53*993b0882SAndroid Build Coastguard Worker // 54*993b0882SAndroid Build Coastguard Worker // Once you're done adding rules, r.Finalize() lowers the rule set into an 55*993b0882SAndroid Build Coastguard Worker // internal representation. 56*993b0882SAndroid Build Coastguard Worker class Rules { 57*993b0882SAndroid Build Coastguard Worker public: Rules(const LocaleShardMap & locale_shard_map)58*993b0882SAndroid Build Coastguard Worker explicit Rules(const LocaleShardMap& locale_shard_map) 59*993b0882SAndroid Build Coastguard Worker : locale_shard_map_(locale_shard_map) {} 60*993b0882SAndroid Build Coastguard Worker 61*993b0882SAndroid Build Coastguard Worker // Represents one item in a right-hand side, a single terminal or nonterminal. 62*993b0882SAndroid Build Coastguard Worker struct RhsElement { RhsElementRhsElement63*993b0882SAndroid Build Coastguard Worker RhsElement() {} RhsElementRhsElement64*993b0882SAndroid Build Coastguard Worker explicit RhsElement(const std::string& terminal, const bool is_optional) 65*993b0882SAndroid Build Coastguard Worker : is_terminal(true), 66*993b0882SAndroid Build Coastguard Worker terminal(terminal), 67*993b0882SAndroid Build Coastguard Worker is_optional(is_optional), 68*993b0882SAndroid Build Coastguard Worker is_constituent(false) {} 69*993b0882SAndroid Build Coastguard Worker explicit RhsElement(const int nonterminal, const bool is_optional, 70*993b0882SAndroid Build Coastguard Worker const bool is_constituent = true) is_terminalRhsElement71*993b0882SAndroid Build Coastguard Worker : is_terminal(false), 72*993b0882SAndroid Build Coastguard Worker nonterminal(nonterminal), 73*993b0882SAndroid Build Coastguard Worker is_optional(is_optional), 74*993b0882SAndroid Build Coastguard Worker is_constituent(is_constituent) {} 75*993b0882SAndroid Build Coastguard Worker bool is_terminal; 76*993b0882SAndroid Build Coastguard Worker std::string terminal; 77*993b0882SAndroid Build Coastguard Worker int nonterminal; 78*993b0882SAndroid Build Coastguard Worker bool is_optional; 79*993b0882SAndroid Build Coastguard Worker 80*993b0882SAndroid Build Coastguard Worker // Whether the element is a constituent of a rule - these are the explicit 81*993b0882SAndroid Build Coastguard Worker // nonterminals, but not terminals or implicitly added anchors. 82*993b0882SAndroid Build Coastguard Worker bool is_constituent; 83*993b0882SAndroid Build Coastguard Worker }; 84*993b0882SAndroid Build Coastguard Worker 85*993b0882SAndroid Build Coastguard Worker // Represents the right-hand side, and possibly callback, of one rule. 86*993b0882SAndroid Build Coastguard Worker struct Rule { 87*993b0882SAndroid Build Coastguard Worker std::vector<RhsElement> rhs; 88*993b0882SAndroid Build Coastguard Worker CallbackId callback = kNoCallback; 89*993b0882SAndroid Build Coastguard Worker int64 callback_param = 0; 90*993b0882SAndroid Build Coastguard Worker int8 max_whitespace_gap = -1; 91*993b0882SAndroid Build Coastguard Worker bool case_sensitive = false; 92*993b0882SAndroid Build Coastguard Worker int shard = 0; 93*993b0882SAndroid Build Coastguard Worker }; 94*993b0882SAndroid Build Coastguard Worker 95*993b0882SAndroid Build Coastguard Worker struct NontermInfo { 96*993b0882SAndroid Build Coastguard Worker // The name of the non-terminal, if defined. 97*993b0882SAndroid Build Coastguard Worker std::string name; 98*993b0882SAndroid Build Coastguard Worker 99*993b0882SAndroid Build Coastguard Worker // Whether the nonterminal is provided via an annotation. 100*993b0882SAndroid Build Coastguard Worker bool from_annotation = false; 101*993b0882SAndroid Build Coastguard Worker 102*993b0882SAndroid Build Coastguard Worker // Rules that have this non-terminal as the lhs. 103*993b0882SAndroid Build Coastguard Worker std::vector<int> rules; 104*993b0882SAndroid Build Coastguard Worker 105*993b0882SAndroid Build Coastguard Worker // Regex rules that have this non-terminal as the lhs. 106*993b0882SAndroid Build Coastguard Worker std::vector<int> regex_rules; 107*993b0882SAndroid Build Coastguard Worker }; 108*993b0882SAndroid Build Coastguard Worker 109*993b0882SAndroid Build Coastguard Worker // Adds a rule `lhs ::= rhs` with the given callback id and parameter. 110*993b0882SAndroid Build Coastguard Worker // Note: Nonterminal names are in angle brackets and cannot contain 111*993b0882SAndroid Build Coastguard Worker // whitespace. The `rhs` is a list of components, each of which is either: 112*993b0882SAndroid Build Coastguard Worker // * A nonterminal name (in angle brackets) 113*993b0882SAndroid Build Coastguard Worker // * A terminal 114*993b0882SAndroid Build Coastguard Worker // optionally followed by a `?` which indicates that the component is 115*993b0882SAndroid Build Coastguard Worker // optional. The `rhs` must contain at least one non-optional component. 116*993b0882SAndroid Build Coastguard Worker void Add(const std::string& lhs, const std::vector<std::string>& rhs, 117*993b0882SAndroid Build Coastguard Worker const CallbackId callback = kNoCallback, 118*993b0882SAndroid Build Coastguard Worker const int64 callback_param = 0, int8 max_whitespace_gap = -1, 119*993b0882SAndroid Build Coastguard Worker bool case_sensitive = false, int shard = 0); 120*993b0882SAndroid Build Coastguard Worker 121*993b0882SAndroid Build Coastguard Worker // Adds a rule `lhs ::= rhs` with the given callback id and parameter. 122*993b0882SAndroid Build Coastguard Worker // The `rhs` must contain at least one non-optional component. 123*993b0882SAndroid Build Coastguard Worker void Add(int lhs, const std::vector<RhsElement>& rhs, 124*993b0882SAndroid Build Coastguard Worker CallbackId callback = kNoCallback, int64 callback_param = 0, 125*993b0882SAndroid Build Coastguard Worker int8 max_whitespace_gap = -1, bool case_sensitive = false, 126*993b0882SAndroid Build Coastguard Worker int shard = 0); 127*993b0882SAndroid Build Coastguard Worker 128*993b0882SAndroid Build Coastguard Worker // Adds a rule `lhs ::= rhs` with exclusion. 129*993b0882SAndroid Build Coastguard Worker // The rule only matches, if `excluded_nonterminal` doesn't match the same 130*993b0882SAndroid Build Coastguard Worker // span. 131*993b0882SAndroid Build Coastguard Worker void AddWithExclusion(const std::string& lhs, 132*993b0882SAndroid Build Coastguard Worker const std::vector<std::string>& rhs, 133*993b0882SAndroid Build Coastguard Worker const std::string& excluded_nonterminal, 134*993b0882SAndroid Build Coastguard Worker int8 max_whitespace_gap = -1, 135*993b0882SAndroid Build Coastguard Worker bool case_sensitive = false, int shard = 0); 136*993b0882SAndroid Build Coastguard Worker 137*993b0882SAndroid Build Coastguard Worker // Adds an assertion callback. 138*993b0882SAndroid Build Coastguard Worker void AddAssertion(const std::string& lhs, const std::vector<std::string>& rhs, 139*993b0882SAndroid Build Coastguard Worker bool negative = true, int8 max_whitespace_gap = -1, 140*993b0882SAndroid Build Coastguard Worker bool case_sensitive = false, int shard = 0); 141*993b0882SAndroid Build Coastguard Worker 142*993b0882SAndroid Build Coastguard Worker // Adds a mapping callback. 143*993b0882SAndroid Build Coastguard Worker void AddValueMapping(const std::string& lhs, 144*993b0882SAndroid Build Coastguard Worker const std::vector<std::string>& rhs, int64 value, 145*993b0882SAndroid Build Coastguard Worker int8 max_whitespace_gap = -1, 146*993b0882SAndroid Build Coastguard Worker bool case_sensitive = false, int shard = 0); 147*993b0882SAndroid Build Coastguard Worker void AddValueMapping(int lhs, const std::vector<RhsElement>& rhs, int64 value, 148*993b0882SAndroid Build Coastguard Worker int8 max_whitespace_gap = -1, 149*993b0882SAndroid Build Coastguard Worker bool case_sensitive = false, int shard = 0); 150*993b0882SAndroid Build Coastguard Worker 151*993b0882SAndroid Build Coastguard Worker // Adds a regex rule. 152*993b0882SAndroid Build Coastguard Worker void AddRegex(const std::string& lhs, const std::string& regex_pattern); 153*993b0882SAndroid Build Coastguard Worker void AddRegex(int lhs, const std::string& regex_pattern); 154*993b0882SAndroid Build Coastguard Worker 155*993b0882SAndroid Build Coastguard Worker // Creates a nonterminal with the given name, if one doesn't already exist. 156*993b0882SAndroid Build Coastguard Worker int AddNonterminal(const std::string& nonterminal_name); 157*993b0882SAndroid Build Coastguard Worker 158*993b0882SAndroid Build Coastguard Worker // Creates a new nonterminal. 159*993b0882SAndroid Build Coastguard Worker int AddNewNonterminal(); 160*993b0882SAndroid Build Coastguard Worker 161*993b0882SAndroid Build Coastguard Worker // Defines a nonterminal for an externally provided annotation. 162*993b0882SAndroid Build Coastguard Worker int AddAnnotation(const std::string& annotation_name); 163*993b0882SAndroid Build Coastguard Worker 164*993b0882SAndroid Build Coastguard Worker // Defines a nonterminal for an externally provided annotation. 165*993b0882SAndroid Build Coastguard Worker void BindAnnotation(const std::string& nonterminal_name, 166*993b0882SAndroid Build Coastguard Worker const std::string& annotation_name); 167*993b0882SAndroid Build Coastguard Worker 168*993b0882SAndroid Build Coastguard Worker // Adds an alias for a nonterminal. This is a separate name for the same 169*993b0882SAndroid Build Coastguard Worker // nonterminal. 170*993b0882SAndroid Build Coastguard Worker void AddAlias(const std::string& nonterminal_name, const std::string& alias); 171*993b0882SAndroid Build Coastguard Worker 172*993b0882SAndroid Build Coastguard Worker // Lowers the rule set into the intermediate representation. 173*993b0882SAndroid Build Coastguard Worker // Treats nonterminals given by the argument `predefined_nonterminals` as 174*993b0882SAndroid Build Coastguard Worker // defined externally. This allows to define rules that are dependent on 175*993b0882SAndroid Build Coastguard Worker // non-terminals produced by e.g. existing text annotations and that will be 176*993b0882SAndroid Build Coastguard Worker // fed to the matcher by the lexer. 177*993b0882SAndroid Build Coastguard Worker Ir Finalize(const std::set<std::string>& predefined_nonterminals = {}) const; 178*993b0882SAndroid Build Coastguard Worker nonterminals()179*993b0882SAndroid Build Coastguard Worker const std::vector<NontermInfo>& nonterminals() const { return nonterminals_; } rules()180*993b0882SAndroid Build Coastguard Worker const std::vector<Rule>& rules() const { return rules_; } 181*993b0882SAndroid Build Coastguard Worker 182*993b0882SAndroid Build Coastguard Worker private: 183*993b0882SAndroid Build Coastguard Worker void ExpandOptionals( 184*993b0882SAndroid Build Coastguard Worker int lhs, const std::vector<RhsElement>& rhs, CallbackId callback, 185*993b0882SAndroid Build Coastguard Worker int64 callback_param, int8 max_whitespace_gap, bool case_sensitive, 186*993b0882SAndroid Build Coastguard Worker int shard, std::vector<int>::const_iterator optional_element_indices, 187*993b0882SAndroid Build Coastguard Worker std::vector<int>::const_iterator optional_element_indices_end, 188*993b0882SAndroid Build Coastguard Worker std::vector<bool>* omit_these); 189*993b0882SAndroid Build Coastguard Worker 190*993b0882SAndroid Build Coastguard Worker // Applies optimizations to the right hand side of a rule. 191*993b0882SAndroid Build Coastguard Worker std::vector<RhsElement> OptimizeRhs(const std::vector<RhsElement>& rhs, 192*993b0882SAndroid Build Coastguard Worker int shard = 0); 193*993b0882SAndroid Build Coastguard Worker 194*993b0882SAndroid Build Coastguard Worker // Removes start and end anchors in case they are followed (respectively 195*993b0882SAndroid Build Coastguard Worker // preceded) by unbounded filler. 196*993b0882SAndroid Build Coastguard Worker std::vector<RhsElement> ResolveAnchors( 197*993b0882SAndroid Build Coastguard Worker const std::vector<RhsElement>& rhs) const; 198*993b0882SAndroid Build Coastguard Worker 199*993b0882SAndroid Build Coastguard Worker // Rewrites fillers in a rule. 200*993b0882SAndroid Build Coastguard Worker // Fillers in a rule such as `lhs ::= <a> <filler> <b>` could be lowered as 201*993b0882SAndroid Build Coastguard Worker // <tokens> ::= <token> 202*993b0882SAndroid Build Coastguard Worker // <tokens> ::= <tokens> <token> 203*993b0882SAndroid Build Coastguard Worker // This has the disadvantage that it will produce a match for each possible 204*993b0882SAndroid Build Coastguard Worker // span in the text, which is quadratic in the number of tokens. 205*993b0882SAndroid Build Coastguard Worker // It can be more efficiently written as: 206*993b0882SAndroid Build Coastguard Worker // `lhs ::= <a_with_tokens> <b>` with 207*993b0882SAndroid Build Coastguard Worker // `<a_with_tokens> ::= <a>` 208*993b0882SAndroid Build Coastguard Worker // `<a_with_tokens> ::= <a_with_tokens> <token>` 209*993b0882SAndroid Build Coastguard Worker // In this each occurrence of `<a>` can start a sequence of tokens. 210*993b0882SAndroid Build Coastguard Worker std::vector<RhsElement> ResolveFillers(const std::vector<RhsElement>& rhs, 211*993b0882SAndroid Build Coastguard Worker int shard = 0); 212*993b0882SAndroid Build Coastguard Worker 213*993b0882SAndroid Build Coastguard Worker // Checks whether an element denotes a specific nonterminal. 214*993b0882SAndroid Build Coastguard Worker bool IsNonterminalOfName(const RhsElement& element, 215*993b0882SAndroid Build Coastguard Worker const std::string& nonterminal) const; 216*993b0882SAndroid Build Coastguard Worker 217*993b0882SAndroid Build Coastguard Worker // Checks whether the fillers are used in any active rule. 218*993b0882SAndroid Build Coastguard Worker bool UsesFillers() const; 219*993b0882SAndroid Build Coastguard Worker 220*993b0882SAndroid Build Coastguard Worker const LocaleShardMap& locale_shard_map_; 221*993b0882SAndroid Build Coastguard Worker 222*993b0882SAndroid Build Coastguard Worker // Non-terminal to id map. 223*993b0882SAndroid Build Coastguard Worker std::unordered_map<std::string, int> nonterminal_names_; 224*993b0882SAndroid Build Coastguard Worker std::vector<NontermInfo> nonterminals_; 225*993b0882SAndroid Build Coastguard Worker std::unordered_map<std::string, std::string> nonterminal_alias_; 226*993b0882SAndroid Build Coastguard Worker std::unordered_map<std::string, int> annotation_nonterminals_; 227*993b0882SAndroid Build Coastguard Worker 228*993b0882SAndroid Build Coastguard Worker // Rules. 229*993b0882SAndroid Build Coastguard Worker std::vector<Rule> rules_; 230*993b0882SAndroid Build Coastguard Worker std::vector<std::string> regex_rules_; 231*993b0882SAndroid Build Coastguard Worker }; 232*993b0882SAndroid Build Coastguard Worker 233*993b0882SAndroid Build Coastguard Worker } // namespace libtextclassifier3::grammar 234*993b0882SAndroid Build Coastguard Worker 235*993b0882SAndroid Build Coastguard Worker #endif // LIBTEXTCLASSIFIER_UTILS_GRAMMAR_UTILS_RULES_H_ 236