/*
 * Copyright (C) 2018 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef LIBTEXTCLASSIFIER_UTILS_GRAMMAR_PARSING_PARSER_H_
#define LIBTEXTCLASSIFIER_UTILS_GRAMMAR_PARSING_PARSER_H_

#include <memory>
#include <vector>

#include "annotator/types.h"
#include "utils/base/arena.h"
#include "utils/grammar/parsing/derivation.h"
#include "utils/grammar/parsing/lexer.h"
#include "utils/grammar/parsing/matcher.h"
#include "utils/grammar/rules_generated.h"
#include "utils/grammar/text-context.h"
#include "utils/i18n/locale.h"
#include "utils/utf8/unilib.h"

namespace libtextclassifier3::grammar {

// Syntactic parsing pass.
// The parser validates and deduplicates candidates produced by the grammar
// matcher. It augments the parse trees with derivation information for semantic
// evaluation.
38*993b0882SAndroid Build Coastguard Worker class Parser { 39*993b0882SAndroid Build Coastguard Worker public: 40*993b0882SAndroid Build Coastguard Worker explicit Parser(const UniLib* unilib, const RulesSet* rules); 41*993b0882SAndroid Build Coastguard Worker 42*993b0882SAndroid Build Coastguard Worker // Parses an input text and returns the root rule derivations. 43*993b0882SAndroid Build Coastguard Worker std::vector<Derivation> Parse(const TextContext& input, 44*993b0882SAndroid Build Coastguard Worker UnsafeArena* arena) const; 45*993b0882SAndroid Build Coastguard Worker 46*993b0882SAndroid Build Coastguard Worker private: 47*993b0882SAndroid Build Coastguard Worker struct RegexAnnotator { 48*993b0882SAndroid Build Coastguard Worker std::unique_ptr<UniLib::RegexPattern> pattern; 49*993b0882SAndroid Build Coastguard Worker Nonterm nonterm; 50*993b0882SAndroid Build Coastguard Worker }; 51*993b0882SAndroid Build Coastguard Worker 52*993b0882SAndroid Build Coastguard Worker // Uncompresses and build the defined regex annotators. 53*993b0882SAndroid Build Coastguard Worker std::vector<RegexAnnotator> BuildRegexAnnotators() const; 54*993b0882SAndroid Build Coastguard Worker 55*993b0882SAndroid Build Coastguard Worker // Produces symbols for a text input to feed to a matcher. 56*993b0882SAndroid Build Coastguard Worker // These are symbols for each tokens from the lexer, existing text annotations 57*993b0882SAndroid Build Coastguard Worker // and regex annotations. 58*993b0882SAndroid Build Coastguard Worker // The symbols are sorted with increasing end-positions to satisfy the matcher 59*993b0882SAndroid Build Coastguard Worker // requirements. 60*993b0882SAndroid Build Coastguard Worker std::vector<Symbol> SortedSymbolsForInput(const TextContext& input, 61*993b0882SAndroid Build Coastguard Worker UnsafeArena* arena) const; 62*993b0882SAndroid Build Coastguard Worker 63*993b0882SAndroid Build Coastguard Worker // Emits a symbol to the matcher. 
64*993b0882SAndroid Build Coastguard Worker void EmitSymbol(const Symbol& symbol, UnsafeArena* arena, 65*993b0882SAndroid Build Coastguard Worker Matcher* matcher) const; 66*993b0882SAndroid Build Coastguard Worker 67*993b0882SAndroid Build Coastguard Worker const UniLib& unilib_; 68*993b0882SAndroid Build Coastguard Worker const RulesSet* rules_; 69*993b0882SAndroid Build Coastguard Worker const Lexer lexer_; 70*993b0882SAndroid Build Coastguard Worker 71*993b0882SAndroid Build Coastguard Worker // Pre-defined nonterminals. 72*993b0882SAndroid Build Coastguard Worker const RulesSet_::Nonterminals* nonterminals_; 73*993b0882SAndroid Build Coastguard Worker 74*993b0882SAndroid Build Coastguard Worker // Pre-parsed locales of the rules. 75*993b0882SAndroid Build Coastguard Worker const std::vector<std::vector<Locale>> rules_locales_; 76*993b0882SAndroid Build Coastguard Worker 77*993b0882SAndroid Build Coastguard Worker std::vector<RegexAnnotator> regex_annotators_; 78*993b0882SAndroid Build Coastguard Worker }; 79*993b0882SAndroid Build Coastguard Worker 80*993b0882SAndroid Build Coastguard Worker } // namespace libtextclassifier3::grammar 81*993b0882SAndroid Build Coastguard Worker 82*993b0882SAndroid Build Coastguard Worker #endif // LIBTEXTCLASSIFIER_UTILS_GRAMMAR_PARSING_PARSER_H_ 83