1*993b0882SAndroid Build Coastguard Worker /* 2*993b0882SAndroid Build Coastguard Worker * Copyright (C) 2018 The Android Open Source Project 3*993b0882SAndroid Build Coastguard Worker * 4*993b0882SAndroid Build Coastguard Worker * Licensed under the Apache License, Version 2.0 (the "License"); 5*993b0882SAndroid Build Coastguard Worker * you may not use this file except in compliance with the License. 6*993b0882SAndroid Build Coastguard Worker * You may obtain a copy of the License at 7*993b0882SAndroid Build Coastguard Worker * 8*993b0882SAndroid Build Coastguard Worker * http://www.apache.org/licenses/LICENSE-2.0 9*993b0882SAndroid Build Coastguard Worker * 10*993b0882SAndroid Build Coastguard Worker * Unless required by applicable law or agreed to in writing, software 11*993b0882SAndroid Build Coastguard Worker * distributed under the License is distributed on an "AS IS" BASIS, 12*993b0882SAndroid Build Coastguard Worker * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13*993b0882SAndroid Build Coastguard Worker * See the License for the specific language governing permissions and 14*993b0882SAndroid Build Coastguard Worker * limitations under the License. 15*993b0882SAndroid Build Coastguard Worker */ 16*993b0882SAndroid Build Coastguard Worker 17*993b0882SAndroid Build Coastguard Worker #ifndef LIBTEXTCLASSIFIER_ANNOTATOR_DATETIME_REGEX_PARSER_H_ 18*993b0882SAndroid Build Coastguard Worker #define LIBTEXTCLASSIFIER_ANNOTATOR_DATETIME_REGEX_PARSER_H_ 19*993b0882SAndroid Build Coastguard Worker 20*993b0882SAndroid Build Coastguard Worker #include <memory> 21*993b0882SAndroid Build Coastguard Worker #include <string> 22*993b0882SAndroid Build Coastguard Worker #include <unordered_map> 23*993b0882SAndroid Build Coastguard Worker #include <unordered_set> 24*993b0882SAndroid Build Coastguard Worker #include <vector> 25*993b0882SAndroid Build Coastguard Worker 26*993b0882SAndroid Build Coastguard Worker #include "annotator/datetime/extractor.h" 27*993b0882SAndroid Build Coastguard Worker #include "annotator/datetime/parser.h" 28*993b0882SAndroid Build Coastguard Worker #include "annotator/model_generated.h" 29*993b0882SAndroid Build Coastguard Worker #include "annotator/types.h" 30*993b0882SAndroid Build Coastguard Worker #include "utils/base/integral_types.h" 31*993b0882SAndroid Build Coastguard Worker #include "utils/base/statusor.h" 32*993b0882SAndroid Build Coastguard Worker #include "utils/calendar/calendar.h" 33*993b0882SAndroid Build Coastguard Worker #include "utils/strings/stringpiece.h" 34*993b0882SAndroid Build Coastguard Worker #include "utils/utf8/unicodetext.h" 35*993b0882SAndroid Build Coastguard Worker #include "utils/utf8/unilib.h" 36*993b0882SAndroid Build Coastguard Worker #include "utils/zlib/zlib.h" 37*993b0882SAndroid Build Coastguard Worker 38*993b0882SAndroid Build Coastguard Worker namespace libtextclassifier3 { 39*993b0882SAndroid Build Coastguard Worker 40*993b0882SAndroid Build Coastguard Worker // Parses datetime expressions in the input and resolves them to actual absolute 41*993b0882SAndroid Build Coastguard Worker // time. 42*993b0882SAndroid Build Coastguard Worker class RegexDatetimeParser : public DatetimeParser { 43*993b0882SAndroid Build Coastguard Worker public: 44*993b0882SAndroid Build Coastguard Worker static std::unique_ptr<DatetimeParser> Instance( 45*993b0882SAndroid Build Coastguard Worker const DatetimeModel* model, const UniLib* unilib, 46*993b0882SAndroid Build Coastguard Worker const CalendarLib* calendarlib, ZlibDecompressor* decompressor); 47*993b0882SAndroid Build Coastguard Worker 48*993b0882SAndroid Build Coastguard Worker // Parses the dates in 'input' and fills result. Makes sure that the results 49*993b0882SAndroid Build Coastguard Worker // do not overlap. 50*993b0882SAndroid Build Coastguard Worker // If 'anchor_start_end' is true the extracted results need to start at the 51*993b0882SAndroid Build Coastguard Worker // beginning of 'input' and end at the end of it. 52*993b0882SAndroid Build Coastguard Worker StatusOr<std::vector<DatetimeParseResultSpan>> Parse( 53*993b0882SAndroid Build Coastguard Worker const std::string& input, int64 reference_time_ms_utc, 54*993b0882SAndroid Build Coastguard Worker const std::string& reference_timezone, const LocaleList& locale_list, 55*993b0882SAndroid Build Coastguard Worker ModeFlag mode, AnnotationUsecase annotation_usecase, 56*993b0882SAndroid Build Coastguard Worker bool anchor_start_end) const override; 57*993b0882SAndroid Build Coastguard Worker 58*993b0882SAndroid Build Coastguard Worker // Same as above but takes UnicodeText. 59*993b0882SAndroid Build Coastguard Worker StatusOr<std::vector<DatetimeParseResultSpan>> Parse( 60*993b0882SAndroid Build Coastguard Worker const UnicodeText& input, int64 reference_time_ms_utc, 61*993b0882SAndroid Build Coastguard Worker const std::string& reference_timezone, const LocaleList& locale_list, 62*993b0882SAndroid Build Coastguard Worker ModeFlag mode, AnnotationUsecase annotation_usecase, 63*993b0882SAndroid Build Coastguard Worker bool anchor_start_end) const override; 64*993b0882SAndroid Build Coastguard Worker 65*993b0882SAndroid Build Coastguard Worker protected: 66*993b0882SAndroid Build Coastguard Worker explicit RegexDatetimeParser(const DatetimeModel* model, const UniLib* unilib, 67*993b0882SAndroid Build Coastguard Worker const CalendarLib* calendarlib, 68*993b0882SAndroid Build Coastguard Worker ZlibDecompressor* decompressor); 69*993b0882SAndroid Build Coastguard Worker 70*993b0882SAndroid Build Coastguard Worker // Returns a list of locale ids for given locale spec string (collection of 71*993b0882SAndroid Build Coastguard Worker // locale names). 72*993b0882SAndroid Build Coastguard Worker std::vector<int> ParseAndExpandLocales( 73*993b0882SAndroid Build Coastguard Worker const std::vector<StringPiece>& locales) const; 74*993b0882SAndroid Build Coastguard Worker 75*993b0882SAndroid Build Coastguard Worker // Helper function that finds datetime spans, only using the rules associated 76*993b0882SAndroid Build Coastguard Worker // with the given locales. 77*993b0882SAndroid Build Coastguard Worker StatusOr<std::vector<DatetimeParseResultSpan>> FindSpansUsingLocales( 78*993b0882SAndroid Build Coastguard Worker const std::vector<int>& locale_ids, const UnicodeText& input, 79*993b0882SAndroid Build Coastguard Worker const int64 reference_time_ms_utc, const std::string& reference_timezone, 80*993b0882SAndroid Build Coastguard Worker ModeFlag mode, AnnotationUsecase annotation_usecase, 81*993b0882SAndroid Build Coastguard Worker bool anchor_start_end, const std::string& reference_locale, 82*993b0882SAndroid Build Coastguard Worker std::unordered_set<int>* executed_rules) const; 83*993b0882SAndroid Build Coastguard Worker 84*993b0882SAndroid Build Coastguard Worker StatusOr<std::vector<DatetimeParseResultSpan>> ParseWithRule( 85*993b0882SAndroid Build Coastguard Worker const CompiledRule& rule, const UnicodeText& input, 86*993b0882SAndroid Build Coastguard Worker int64 reference_time_ms_utc, const std::string& reference_timezone, 87*993b0882SAndroid Build Coastguard Worker const std::string& reference_locale, const int locale_id, 88*993b0882SAndroid Build Coastguard Worker bool anchor_start_end) const; 89*993b0882SAndroid Build Coastguard Worker 90*993b0882SAndroid Build Coastguard Worker // Converts the current match in 'matcher' into DatetimeParseResult. 91*993b0882SAndroid Build Coastguard Worker bool ExtractDatetime(const CompiledRule& rule, 92*993b0882SAndroid Build Coastguard Worker const UniLib::RegexMatcher& matcher, 93*993b0882SAndroid Build Coastguard Worker int64 reference_time_ms_utc, 94*993b0882SAndroid Build Coastguard Worker const std::string& reference_timezone, 95*993b0882SAndroid Build Coastguard Worker const std::string& reference_locale, int locale_id, 96*993b0882SAndroid Build Coastguard Worker std::vector<DatetimeParseResult>* results, 97*993b0882SAndroid Build Coastguard Worker CodepointSpan* result_span) const; 98*993b0882SAndroid Build Coastguard Worker 99*993b0882SAndroid Build Coastguard Worker // Parse and extract information from current match in 'matcher'. 100*993b0882SAndroid Build Coastguard Worker StatusOr<std::vector<DatetimeParseResultSpan>> HandleParseMatch( 101*993b0882SAndroid Build Coastguard Worker const CompiledRule& rule, const UniLib::RegexMatcher& matcher, 102*993b0882SAndroid Build Coastguard Worker int64 reference_time_ms_utc, const std::string& reference_timezone, 103*993b0882SAndroid Build Coastguard Worker const std::string& reference_locale, int locale_id) const; 104*993b0882SAndroid Build Coastguard Worker 105*993b0882SAndroid Build Coastguard Worker private: 106*993b0882SAndroid Build Coastguard Worker bool initialized_; 107*993b0882SAndroid Build Coastguard Worker const UniLib& unilib_; 108*993b0882SAndroid Build Coastguard Worker const CalendarLib& calendarlib_; 109*993b0882SAndroid Build Coastguard Worker std::vector<CompiledRule> rules_; 110*993b0882SAndroid Build Coastguard Worker std::unordered_map<int, std::vector<int>> locale_to_rules_; 111*993b0882SAndroid Build Coastguard Worker std::vector<std::unique_ptr<const UniLib::RegexPattern>> extractor_rules_; 112*993b0882SAndroid Build Coastguard Worker std::unordered_map<DatetimeExtractorType, std::unordered_map<int, int>> 113*993b0882SAndroid Build Coastguard Worker type_and_locale_to_extractor_rule_; 114*993b0882SAndroid Build Coastguard Worker std::unordered_map<std::string, int> locale_string_to_id_; 115*993b0882SAndroid Build Coastguard Worker std::vector<int> default_locale_ids_; 116*993b0882SAndroid Build Coastguard Worker bool use_extractors_for_locating_; 117*993b0882SAndroid Build Coastguard Worker bool generate_alternative_interpretations_when_ambiguous_; 118*993b0882SAndroid Build Coastguard Worker bool prefer_future_for_unspecified_date_; 119*993b0882SAndroid Build Coastguard Worker }; 120*993b0882SAndroid Build Coastguard Worker 121*993b0882SAndroid Build Coastguard Worker } // namespace libtextclassifier3 122*993b0882SAndroid Build Coastguard Worker 123*993b0882SAndroid Build Coastguard Worker #endif // LIBTEXTCLASSIFIER_ANNOTATOR_DATETIME_REGEX_PARSER_H_ 124