xref: /aosp_15_r20/external/libtextclassifier/native/annotator/datetime/regex-parser.cc (revision 993b0882672172b81d12fad7a7ac0c3e5c824a12)
1*993b0882SAndroid Build Coastguard Worker /*
2*993b0882SAndroid Build Coastguard Worker  * Copyright (C) 2018 The Android Open Source Project
3*993b0882SAndroid Build Coastguard Worker  *
4*993b0882SAndroid Build Coastguard Worker  * Licensed under the Apache License, Version 2.0 (the "License");
5*993b0882SAndroid Build Coastguard Worker  * you may not use this file except in compliance with the License.
6*993b0882SAndroid Build Coastguard Worker  * You may obtain a copy of the License at
7*993b0882SAndroid Build Coastguard Worker  *
8*993b0882SAndroid Build Coastguard Worker  *      http://www.apache.org/licenses/LICENSE-2.0
9*993b0882SAndroid Build Coastguard Worker  *
10*993b0882SAndroid Build Coastguard Worker  * Unless required by applicable law or agreed to in writing, software
11*993b0882SAndroid Build Coastguard Worker  * distributed under the License is distributed on an "AS IS" BASIS,
12*993b0882SAndroid Build Coastguard Worker  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13*993b0882SAndroid Build Coastguard Worker  * See the License for the specific language governing permissions and
14*993b0882SAndroid Build Coastguard Worker  * limitations under the License.
15*993b0882SAndroid Build Coastguard Worker  */
16*993b0882SAndroid Build Coastguard Worker 
17*993b0882SAndroid Build Coastguard Worker #include "annotator/datetime/regex-parser.h"
18*993b0882SAndroid Build Coastguard Worker 
19*993b0882SAndroid Build Coastguard Worker #include <algorithm>
20*993b0882SAndroid Build Coastguard Worker #include <iterator>
21*993b0882SAndroid Build Coastguard Worker #include <set>
22*993b0882SAndroid Build Coastguard Worker #include <unordered_set>
23*993b0882SAndroid Build Coastguard Worker 
24*993b0882SAndroid Build Coastguard Worker #include "annotator/datetime/extractor.h"
25*993b0882SAndroid Build Coastguard Worker #include "annotator/datetime/utils.h"
26*993b0882SAndroid Build Coastguard Worker #include "utils/base/statusor.h"
27*993b0882SAndroid Build Coastguard Worker #include "utils/calendar/calendar.h"
28*993b0882SAndroid Build Coastguard Worker #include "utils/i18n/locale.h"
29*993b0882SAndroid Build Coastguard Worker #include "utils/strings/split.h"
30*993b0882SAndroid Build Coastguard Worker #include "utils/zlib/zlib_regex.h"
31*993b0882SAndroid Build Coastguard Worker 
32*993b0882SAndroid Build Coastguard Worker namespace libtextclassifier3 {
Instance(const DatetimeModel * model,const UniLib * unilib,const CalendarLib * calendarlib,ZlibDecompressor * decompressor)33*993b0882SAndroid Build Coastguard Worker std::unique_ptr<DatetimeParser> RegexDatetimeParser::Instance(
34*993b0882SAndroid Build Coastguard Worker     const DatetimeModel* model, const UniLib* unilib,
35*993b0882SAndroid Build Coastguard Worker     const CalendarLib* calendarlib, ZlibDecompressor* decompressor) {
36*993b0882SAndroid Build Coastguard Worker   std::unique_ptr<RegexDatetimeParser> result(
37*993b0882SAndroid Build Coastguard Worker       new RegexDatetimeParser(model, unilib, calendarlib, decompressor));
38*993b0882SAndroid Build Coastguard Worker   if (!result->initialized_) {
39*993b0882SAndroid Build Coastguard Worker     result.reset();
40*993b0882SAndroid Build Coastguard Worker   }
41*993b0882SAndroid Build Coastguard Worker   return result;
42*993b0882SAndroid Build Coastguard Worker }
43*993b0882SAndroid Build Coastguard Worker 
RegexDatetimeParser(const DatetimeModel * model,const UniLib * unilib,const CalendarLib * calendarlib,ZlibDecompressor * decompressor)44*993b0882SAndroid Build Coastguard Worker RegexDatetimeParser::RegexDatetimeParser(const DatetimeModel* model,
45*993b0882SAndroid Build Coastguard Worker                                          const UniLib* unilib,
46*993b0882SAndroid Build Coastguard Worker                                          const CalendarLib* calendarlib,
47*993b0882SAndroid Build Coastguard Worker                                          ZlibDecompressor* decompressor)
48*993b0882SAndroid Build Coastguard Worker     : unilib_(*unilib), calendarlib_(*calendarlib) {
49*993b0882SAndroid Build Coastguard Worker   initialized_ = false;
50*993b0882SAndroid Build Coastguard Worker 
51*993b0882SAndroid Build Coastguard Worker   if (model == nullptr) {
52*993b0882SAndroid Build Coastguard Worker     return;
53*993b0882SAndroid Build Coastguard Worker   }
54*993b0882SAndroid Build Coastguard Worker 
55*993b0882SAndroid Build Coastguard Worker   if (model->patterns() != nullptr) {
56*993b0882SAndroid Build Coastguard Worker     for (const DatetimeModelPattern* pattern : *model->patterns()) {
57*993b0882SAndroid Build Coastguard Worker       if (pattern->regexes()) {
58*993b0882SAndroid Build Coastguard Worker         for (const DatetimeModelPattern_::Regex* regex : *pattern->regexes()) {
59*993b0882SAndroid Build Coastguard Worker           std::unique_ptr<UniLib::RegexPattern> regex_pattern =
60*993b0882SAndroid Build Coastguard Worker               UncompressMakeRegexPattern(
61*993b0882SAndroid Build Coastguard Worker                   unilib_, regex->pattern(), regex->compressed_pattern(),
62*993b0882SAndroid Build Coastguard Worker                   model->lazy_regex_compilation(), decompressor);
63*993b0882SAndroid Build Coastguard Worker           if (!regex_pattern) {
64*993b0882SAndroid Build Coastguard Worker             TC3_LOG(ERROR) << "Couldn't create rule pattern.";
65*993b0882SAndroid Build Coastguard Worker             return;
66*993b0882SAndroid Build Coastguard Worker           }
67*993b0882SAndroid Build Coastguard Worker           rules_.push_back({std::move(regex_pattern), regex, pattern});
68*993b0882SAndroid Build Coastguard Worker           if (pattern->locales()) {
69*993b0882SAndroid Build Coastguard Worker             for (int locale : *pattern->locales()) {
70*993b0882SAndroid Build Coastguard Worker               locale_to_rules_[locale].push_back(rules_.size() - 1);
71*993b0882SAndroid Build Coastguard Worker             }
72*993b0882SAndroid Build Coastguard Worker           }
73*993b0882SAndroid Build Coastguard Worker         }
74*993b0882SAndroid Build Coastguard Worker       }
75*993b0882SAndroid Build Coastguard Worker     }
76*993b0882SAndroid Build Coastguard Worker   }
77*993b0882SAndroid Build Coastguard Worker 
78*993b0882SAndroid Build Coastguard Worker   if (model->extractors() != nullptr) {
79*993b0882SAndroid Build Coastguard Worker     for (const DatetimeModelExtractor* extractor : *model->extractors()) {
80*993b0882SAndroid Build Coastguard Worker       std::unique_ptr<UniLib::RegexPattern> regex_pattern =
81*993b0882SAndroid Build Coastguard Worker           UncompressMakeRegexPattern(
82*993b0882SAndroid Build Coastguard Worker               unilib_, extractor->pattern(), extractor->compressed_pattern(),
83*993b0882SAndroid Build Coastguard Worker               model->lazy_regex_compilation(), decompressor);
84*993b0882SAndroid Build Coastguard Worker       if (!regex_pattern) {
85*993b0882SAndroid Build Coastguard Worker         TC3_LOG(ERROR) << "Couldn't create extractor pattern";
86*993b0882SAndroid Build Coastguard Worker         return;
87*993b0882SAndroid Build Coastguard Worker       }
88*993b0882SAndroid Build Coastguard Worker       extractor_rules_.push_back(std::move(regex_pattern));
89*993b0882SAndroid Build Coastguard Worker 
90*993b0882SAndroid Build Coastguard Worker       if (extractor->locales()) {
91*993b0882SAndroid Build Coastguard Worker         for (int locale : *extractor->locales()) {
92*993b0882SAndroid Build Coastguard Worker           type_and_locale_to_extractor_rule_[extractor->extractor()][locale] =
93*993b0882SAndroid Build Coastguard Worker               extractor_rules_.size() - 1;
94*993b0882SAndroid Build Coastguard Worker         }
95*993b0882SAndroid Build Coastguard Worker       }
96*993b0882SAndroid Build Coastguard Worker     }
97*993b0882SAndroid Build Coastguard Worker   }
98*993b0882SAndroid Build Coastguard Worker 
99*993b0882SAndroid Build Coastguard Worker   if (model->locales() != nullptr) {
100*993b0882SAndroid Build Coastguard Worker     for (int i = 0; i < model->locales()->size(); ++i) {
101*993b0882SAndroid Build Coastguard Worker       locale_string_to_id_[model->locales()->Get(i)->str()] = i;
102*993b0882SAndroid Build Coastguard Worker     }
103*993b0882SAndroid Build Coastguard Worker   }
104*993b0882SAndroid Build Coastguard Worker 
105*993b0882SAndroid Build Coastguard Worker   if (model->default_locales() != nullptr) {
106*993b0882SAndroid Build Coastguard Worker     for (const int locale : *model->default_locales()) {
107*993b0882SAndroid Build Coastguard Worker       default_locale_ids_.push_back(locale);
108*993b0882SAndroid Build Coastguard Worker     }
109*993b0882SAndroid Build Coastguard Worker   }
110*993b0882SAndroid Build Coastguard Worker 
111*993b0882SAndroid Build Coastguard Worker   use_extractors_for_locating_ = model->use_extractors_for_locating();
112*993b0882SAndroid Build Coastguard Worker   generate_alternative_interpretations_when_ambiguous_ =
113*993b0882SAndroid Build Coastguard Worker       model->generate_alternative_interpretations_when_ambiguous();
114*993b0882SAndroid Build Coastguard Worker   prefer_future_for_unspecified_date_ =
115*993b0882SAndroid Build Coastguard Worker       model->prefer_future_for_unspecified_date();
116*993b0882SAndroid Build Coastguard Worker 
117*993b0882SAndroid Build Coastguard Worker   initialized_ = true;
118*993b0882SAndroid Build Coastguard Worker }
119*993b0882SAndroid Build Coastguard Worker 
Parse(const std::string & input,const int64 reference_time_ms_utc,const std::string & reference_timezone,const LocaleList & locale_list,ModeFlag mode,AnnotationUsecase annotation_usecase,bool anchor_start_end) const120*993b0882SAndroid Build Coastguard Worker StatusOr<std::vector<DatetimeParseResultSpan>> RegexDatetimeParser::Parse(
121*993b0882SAndroid Build Coastguard Worker     const std::string& input, const int64 reference_time_ms_utc,
122*993b0882SAndroid Build Coastguard Worker     const std::string& reference_timezone, const LocaleList& locale_list,
123*993b0882SAndroid Build Coastguard Worker     ModeFlag mode, AnnotationUsecase annotation_usecase,
124*993b0882SAndroid Build Coastguard Worker     bool anchor_start_end) const {
125*993b0882SAndroid Build Coastguard Worker   return Parse(UTF8ToUnicodeText(input, /*do_copy=*/false),
126*993b0882SAndroid Build Coastguard Worker                reference_time_ms_utc, reference_timezone, locale_list, mode,
127*993b0882SAndroid Build Coastguard Worker                annotation_usecase, anchor_start_end);
128*993b0882SAndroid Build Coastguard Worker }
129*993b0882SAndroid Build Coastguard Worker 
130*993b0882SAndroid Build Coastguard Worker StatusOr<std::vector<DatetimeParseResultSpan>>
FindSpansUsingLocales(const std::vector<int> & locale_ids,const UnicodeText & input,const int64 reference_time_ms_utc,const std::string & reference_timezone,ModeFlag mode,AnnotationUsecase annotation_usecase,bool anchor_start_end,const std::string & reference_locale,std::unordered_set<int> * executed_rules) const131*993b0882SAndroid Build Coastguard Worker RegexDatetimeParser::FindSpansUsingLocales(
132*993b0882SAndroid Build Coastguard Worker     const std::vector<int>& locale_ids, const UnicodeText& input,
133*993b0882SAndroid Build Coastguard Worker     const int64 reference_time_ms_utc, const std::string& reference_timezone,
134*993b0882SAndroid Build Coastguard Worker     ModeFlag mode, AnnotationUsecase annotation_usecase, bool anchor_start_end,
135*993b0882SAndroid Build Coastguard Worker     const std::string& reference_locale,
136*993b0882SAndroid Build Coastguard Worker     std::unordered_set<int>* executed_rules) const {
137*993b0882SAndroid Build Coastguard Worker   std::vector<DatetimeParseResultSpan> found_spans;
138*993b0882SAndroid Build Coastguard Worker   for (const int locale_id : locale_ids) {
139*993b0882SAndroid Build Coastguard Worker     auto rules_it = locale_to_rules_.find(locale_id);
140*993b0882SAndroid Build Coastguard Worker     if (rules_it == locale_to_rules_.end()) {
141*993b0882SAndroid Build Coastguard Worker       continue;
142*993b0882SAndroid Build Coastguard Worker     }
143*993b0882SAndroid Build Coastguard Worker 
144*993b0882SAndroid Build Coastguard Worker     for (const int rule_id : rules_it->second) {
145*993b0882SAndroid Build Coastguard Worker       // Skip rules that were already executed in previous locales.
146*993b0882SAndroid Build Coastguard Worker       if (executed_rules->find(rule_id) != executed_rules->end()) {
147*993b0882SAndroid Build Coastguard Worker         continue;
148*993b0882SAndroid Build Coastguard Worker       }
149*993b0882SAndroid Build Coastguard Worker 
150*993b0882SAndroid Build Coastguard Worker       if ((rules_[rule_id].pattern->enabled_annotation_usecases() &
151*993b0882SAndroid Build Coastguard Worker            (1 << annotation_usecase)) == 0) {
152*993b0882SAndroid Build Coastguard Worker         continue;
153*993b0882SAndroid Build Coastguard Worker       }
154*993b0882SAndroid Build Coastguard Worker 
155*993b0882SAndroid Build Coastguard Worker       if (!(rules_[rule_id].pattern->enabled_modes() & mode)) {
156*993b0882SAndroid Build Coastguard Worker         continue;
157*993b0882SAndroid Build Coastguard Worker       }
158*993b0882SAndroid Build Coastguard Worker 
159*993b0882SAndroid Build Coastguard Worker       executed_rules->insert(rule_id);
160*993b0882SAndroid Build Coastguard Worker       TC3_ASSIGN_OR_RETURN(
161*993b0882SAndroid Build Coastguard Worker           const std::vector<DatetimeParseResultSpan>& found_spans_per_rule,
162*993b0882SAndroid Build Coastguard Worker           ParseWithRule(rules_[rule_id], input, reference_time_ms_utc,
163*993b0882SAndroid Build Coastguard Worker                         reference_timezone, reference_locale, locale_id,
164*993b0882SAndroid Build Coastguard Worker                         anchor_start_end));
165*993b0882SAndroid Build Coastguard Worker       found_spans.insert(std::end(found_spans),
166*993b0882SAndroid Build Coastguard Worker                          std::begin(found_spans_per_rule),
167*993b0882SAndroid Build Coastguard Worker                          std::end(found_spans_per_rule));
168*993b0882SAndroid Build Coastguard Worker     }
169*993b0882SAndroid Build Coastguard Worker   }
170*993b0882SAndroid Build Coastguard Worker   return found_spans;
171*993b0882SAndroid Build Coastguard Worker }
172*993b0882SAndroid Build Coastguard Worker 
Parse(const UnicodeText & input,const int64 reference_time_ms_utc,const std::string & reference_timezone,const LocaleList & locale_list,ModeFlag mode,AnnotationUsecase annotation_usecase,bool anchor_start_end) const173*993b0882SAndroid Build Coastguard Worker StatusOr<std::vector<DatetimeParseResultSpan>> RegexDatetimeParser::Parse(
174*993b0882SAndroid Build Coastguard Worker     const UnicodeText& input, const int64 reference_time_ms_utc,
175*993b0882SAndroid Build Coastguard Worker     const std::string& reference_timezone, const LocaleList& locale_list,
176*993b0882SAndroid Build Coastguard Worker     ModeFlag mode, AnnotationUsecase annotation_usecase,
177*993b0882SAndroid Build Coastguard Worker     bool anchor_start_end) const {
178*993b0882SAndroid Build Coastguard Worker   std::unordered_set<int> executed_rules;
179*993b0882SAndroid Build Coastguard Worker   const std::vector<int> requested_locales =
180*993b0882SAndroid Build Coastguard Worker       ParseAndExpandLocales(locale_list.GetLocaleTags());
181*993b0882SAndroid Build Coastguard Worker   TC3_ASSIGN_OR_RETURN(
182*993b0882SAndroid Build Coastguard Worker       const std::vector<DatetimeParseResultSpan>& found_spans,
183*993b0882SAndroid Build Coastguard Worker       FindSpansUsingLocales(requested_locales, input, reference_time_ms_utc,
184*993b0882SAndroid Build Coastguard Worker                             reference_timezone, mode, annotation_usecase,
185*993b0882SAndroid Build Coastguard Worker                             anchor_start_end, locale_list.GetReferenceLocale(),
186*993b0882SAndroid Build Coastguard Worker                             &executed_rules));
187*993b0882SAndroid Build Coastguard Worker   std::vector<std::pair<DatetimeParseResultSpan, int>> indexed_found_spans;
188*993b0882SAndroid Build Coastguard Worker   indexed_found_spans.reserve(found_spans.size());
189*993b0882SAndroid Build Coastguard Worker   for (int i = 0; i < found_spans.size(); i++) {
190*993b0882SAndroid Build Coastguard Worker     indexed_found_spans.push_back({found_spans[i], i});
191*993b0882SAndroid Build Coastguard Worker   }
192*993b0882SAndroid Build Coastguard Worker 
193*993b0882SAndroid Build Coastguard Worker   // Resolve conflicts by always picking the longer span and breaking ties by
194*993b0882SAndroid Build Coastguard Worker   // selecting the earlier entry in the list for a given locale.
195*993b0882SAndroid Build Coastguard Worker   std::stable_sort(indexed_found_spans.begin(), indexed_found_spans.end(),
196*993b0882SAndroid Build Coastguard Worker                    [](const std::pair<DatetimeParseResultSpan, int>& a,
197*993b0882SAndroid Build Coastguard Worker                       const std::pair<DatetimeParseResultSpan, int>& b) {
198*993b0882SAndroid Build Coastguard Worker                      if ((a.first.span.second - a.first.span.first) !=
199*993b0882SAndroid Build Coastguard Worker                          (b.first.span.second - b.first.span.first)) {
200*993b0882SAndroid Build Coastguard Worker                        return (a.first.span.second - a.first.span.first) >
201*993b0882SAndroid Build Coastguard Worker                               (b.first.span.second - b.first.span.first);
202*993b0882SAndroid Build Coastguard Worker                      } else {
203*993b0882SAndroid Build Coastguard Worker                        return a.second < b.second;
204*993b0882SAndroid Build Coastguard Worker                      }
205*993b0882SAndroid Build Coastguard Worker                    });
206*993b0882SAndroid Build Coastguard Worker 
207*993b0882SAndroid Build Coastguard Worker   std::vector<DatetimeParseResultSpan> results;
208*993b0882SAndroid Build Coastguard Worker   std::vector<DatetimeParseResultSpan> resolved_found_spans;
209*993b0882SAndroid Build Coastguard Worker   resolved_found_spans.reserve(indexed_found_spans.size());
210*993b0882SAndroid Build Coastguard Worker   for (auto& span_index_pair : indexed_found_spans) {
211*993b0882SAndroid Build Coastguard Worker     resolved_found_spans.push_back(span_index_pair.first);
212*993b0882SAndroid Build Coastguard Worker   }
213*993b0882SAndroid Build Coastguard Worker 
214*993b0882SAndroid Build Coastguard Worker   std::set<int, std::function<bool(int, int)>> chosen_indices_set(
215*993b0882SAndroid Build Coastguard Worker       [&resolved_found_spans](int a, int b) {
216*993b0882SAndroid Build Coastguard Worker         return resolved_found_spans[a].span.first <
217*993b0882SAndroid Build Coastguard Worker                resolved_found_spans[b].span.first;
218*993b0882SAndroid Build Coastguard Worker       });
219*993b0882SAndroid Build Coastguard Worker   for (int i = 0; i < resolved_found_spans.size(); ++i) {
220*993b0882SAndroid Build Coastguard Worker     if (!DoesCandidateConflict(i, resolved_found_spans, chosen_indices_set)) {
221*993b0882SAndroid Build Coastguard Worker       chosen_indices_set.insert(i);
222*993b0882SAndroid Build Coastguard Worker       results.push_back(resolved_found_spans[i]);
223*993b0882SAndroid Build Coastguard Worker     }
224*993b0882SAndroid Build Coastguard Worker   }
225*993b0882SAndroid Build Coastguard Worker   return results;
226*993b0882SAndroid Build Coastguard Worker }
227*993b0882SAndroid Build Coastguard Worker 
228*993b0882SAndroid Build Coastguard Worker StatusOr<std::vector<DatetimeParseResultSpan>>
HandleParseMatch(const CompiledRule & rule,const UniLib::RegexMatcher & matcher,int64 reference_time_ms_utc,const std::string & reference_timezone,const std::string & reference_locale,int locale_id) const229*993b0882SAndroid Build Coastguard Worker RegexDatetimeParser::HandleParseMatch(const CompiledRule& rule,
230*993b0882SAndroid Build Coastguard Worker                                       const UniLib::RegexMatcher& matcher,
231*993b0882SAndroid Build Coastguard Worker                                       int64 reference_time_ms_utc,
232*993b0882SAndroid Build Coastguard Worker                                       const std::string& reference_timezone,
233*993b0882SAndroid Build Coastguard Worker                                       const std::string& reference_locale,
234*993b0882SAndroid Build Coastguard Worker                                       int locale_id) const {
235*993b0882SAndroid Build Coastguard Worker   std::vector<DatetimeParseResultSpan> results;
236*993b0882SAndroid Build Coastguard Worker   int status = UniLib::RegexMatcher::kNoError;
237*993b0882SAndroid Build Coastguard Worker   const int start = matcher.Start(&status);
238*993b0882SAndroid Build Coastguard Worker   if (status != UniLib::RegexMatcher::kNoError) {
239*993b0882SAndroid Build Coastguard Worker     return Status(StatusCode::INTERNAL,
240*993b0882SAndroid Build Coastguard Worker                   "Failed to gets the start offset of the last match.");
241*993b0882SAndroid Build Coastguard Worker   }
242*993b0882SAndroid Build Coastguard Worker 
243*993b0882SAndroid Build Coastguard Worker   const int end = matcher.End(&status);
244*993b0882SAndroid Build Coastguard Worker   if (status != UniLib::RegexMatcher::kNoError) {
245*993b0882SAndroid Build Coastguard Worker     return Status(StatusCode::INTERNAL,
246*993b0882SAndroid Build Coastguard Worker                   "Failed to gets the end offset of the last match.");
247*993b0882SAndroid Build Coastguard Worker   }
248*993b0882SAndroid Build Coastguard Worker 
249*993b0882SAndroid Build Coastguard Worker   DatetimeParseResultSpan parse_result;
250*993b0882SAndroid Build Coastguard Worker   std::vector<DatetimeParseResult> alternatives;
251*993b0882SAndroid Build Coastguard Worker   if (!ExtractDatetime(rule, matcher, reference_time_ms_utc, reference_timezone,
252*993b0882SAndroid Build Coastguard Worker                        reference_locale, locale_id, &alternatives,
253*993b0882SAndroid Build Coastguard Worker                        &parse_result.span)) {
254*993b0882SAndroid Build Coastguard Worker     return Status(StatusCode::INTERNAL, "Failed to extract Datetime.");
255*993b0882SAndroid Build Coastguard Worker   }
256*993b0882SAndroid Build Coastguard Worker 
257*993b0882SAndroid Build Coastguard Worker   if (!use_extractors_for_locating_) {
258*993b0882SAndroid Build Coastguard Worker     parse_result.span = {start, end};
259*993b0882SAndroid Build Coastguard Worker   }
260*993b0882SAndroid Build Coastguard Worker 
261*993b0882SAndroid Build Coastguard Worker   if (parse_result.span.first != kInvalidIndex &&
262*993b0882SAndroid Build Coastguard Worker       parse_result.span.second != kInvalidIndex) {
263*993b0882SAndroid Build Coastguard Worker     parse_result.target_classification_score =
264*993b0882SAndroid Build Coastguard Worker         rule.pattern->target_classification_score();
265*993b0882SAndroid Build Coastguard Worker     parse_result.priority_score = rule.pattern->priority_score();
266*993b0882SAndroid Build Coastguard Worker 
267*993b0882SAndroid Build Coastguard Worker     for (DatetimeParseResult& alternative : alternatives) {
268*993b0882SAndroid Build Coastguard Worker       parse_result.data.push_back(alternative);
269*993b0882SAndroid Build Coastguard Worker     }
270*993b0882SAndroid Build Coastguard Worker   }
271*993b0882SAndroid Build Coastguard Worker   results.push_back(parse_result);
272*993b0882SAndroid Build Coastguard Worker   return results;
273*993b0882SAndroid Build Coastguard Worker }
274*993b0882SAndroid Build Coastguard Worker 
275*993b0882SAndroid Build Coastguard Worker StatusOr<std::vector<DatetimeParseResultSpan>>
ParseWithRule(const CompiledRule & rule,const UnicodeText & input,const int64 reference_time_ms_utc,const std::string & reference_timezone,const std::string & reference_locale,const int locale_id,bool anchor_start_end) const276*993b0882SAndroid Build Coastguard Worker RegexDatetimeParser::ParseWithRule(const CompiledRule& rule,
277*993b0882SAndroid Build Coastguard Worker                                    const UnicodeText& input,
278*993b0882SAndroid Build Coastguard Worker                                    const int64 reference_time_ms_utc,
279*993b0882SAndroid Build Coastguard Worker                                    const std::string& reference_timezone,
280*993b0882SAndroid Build Coastguard Worker                                    const std::string& reference_locale,
281*993b0882SAndroid Build Coastguard Worker                                    const int locale_id,
282*993b0882SAndroid Build Coastguard Worker                                    bool anchor_start_end) const {
283*993b0882SAndroid Build Coastguard Worker   std::vector<DatetimeParseResultSpan> results;
284*993b0882SAndroid Build Coastguard Worker   std::unique_ptr<UniLib::RegexMatcher> matcher =
285*993b0882SAndroid Build Coastguard Worker       rule.compiled_regex->Matcher(input);
286*993b0882SAndroid Build Coastguard Worker   int status = UniLib::RegexMatcher::kNoError;
287*993b0882SAndroid Build Coastguard Worker   if (anchor_start_end) {
288*993b0882SAndroid Build Coastguard Worker     if (matcher->Matches(&status) && status == UniLib::RegexMatcher::kNoError) {
289*993b0882SAndroid Build Coastguard Worker       return HandleParseMatch(rule, *matcher, reference_time_ms_utc,
290*993b0882SAndroid Build Coastguard Worker                               reference_timezone, reference_locale, locale_id);
291*993b0882SAndroid Build Coastguard Worker     }
292*993b0882SAndroid Build Coastguard Worker   } else {
293*993b0882SAndroid Build Coastguard Worker     while (matcher->Find(&status) && status == UniLib::RegexMatcher::kNoError) {
294*993b0882SAndroid Build Coastguard Worker       TC3_ASSIGN_OR_RETURN(
295*993b0882SAndroid Build Coastguard Worker           const std::vector<DatetimeParseResultSpan>& pattern_occurrence,
296*993b0882SAndroid Build Coastguard Worker           HandleParseMatch(rule, *matcher, reference_time_ms_utc,
297*993b0882SAndroid Build Coastguard Worker                            reference_timezone, reference_locale, locale_id));
298*993b0882SAndroid Build Coastguard Worker       results.insert(std::end(results), std::begin(pattern_occurrence),
299*993b0882SAndroid Build Coastguard Worker                      std::end(pattern_occurrence));
300*993b0882SAndroid Build Coastguard Worker     }
301*993b0882SAndroid Build Coastguard Worker   }
302*993b0882SAndroid Build Coastguard Worker   return results;
303*993b0882SAndroid Build Coastguard Worker }
304*993b0882SAndroid Build Coastguard Worker 
ParseAndExpandLocales(const std::vector<StringPiece> & locales) const305*993b0882SAndroid Build Coastguard Worker std::vector<int> RegexDatetimeParser::ParseAndExpandLocales(
306*993b0882SAndroid Build Coastguard Worker     const std::vector<StringPiece>& locales) const {
307*993b0882SAndroid Build Coastguard Worker   std::vector<int> result;
308*993b0882SAndroid Build Coastguard Worker   for (const StringPiece& locale_str : locales) {
309*993b0882SAndroid Build Coastguard Worker     auto locale_it = locale_string_to_id_.find(locale_str.ToString());
310*993b0882SAndroid Build Coastguard Worker     if (locale_it != locale_string_to_id_.end()) {
311*993b0882SAndroid Build Coastguard Worker       result.push_back(locale_it->second);
312*993b0882SAndroid Build Coastguard Worker     }
313*993b0882SAndroid Build Coastguard Worker 
314*993b0882SAndroid Build Coastguard Worker     const Locale locale = Locale::FromBCP47(locale_str.ToString());
315*993b0882SAndroid Build Coastguard Worker     if (!locale.IsValid()) {
316*993b0882SAndroid Build Coastguard Worker       continue;
317*993b0882SAndroid Build Coastguard Worker     }
318*993b0882SAndroid Build Coastguard Worker 
319*993b0882SAndroid Build Coastguard Worker     const std::string language = locale.Language();
320*993b0882SAndroid Build Coastguard Worker     const std::string script = locale.Script();
321*993b0882SAndroid Build Coastguard Worker     const std::string region = locale.Region();
322*993b0882SAndroid Build Coastguard Worker 
323*993b0882SAndroid Build Coastguard Worker     // First, try adding *-region locale.
324*993b0882SAndroid Build Coastguard Worker     if (!region.empty()) {
325*993b0882SAndroid Build Coastguard Worker       locale_it = locale_string_to_id_.find("*-" + region);
326*993b0882SAndroid Build Coastguard Worker       if (locale_it != locale_string_to_id_.end()) {
327*993b0882SAndroid Build Coastguard Worker         result.push_back(locale_it->second);
328*993b0882SAndroid Build Coastguard Worker       }
329*993b0882SAndroid Build Coastguard Worker     }
330*993b0882SAndroid Build Coastguard Worker     // Second, try adding language-script-* locale.
331*993b0882SAndroid Build Coastguard Worker     if (!script.empty()) {
332*993b0882SAndroid Build Coastguard Worker       locale_it = locale_string_to_id_.find(language + "-" + script + "-*");
333*993b0882SAndroid Build Coastguard Worker       if (locale_it != locale_string_to_id_.end()) {
334*993b0882SAndroid Build Coastguard Worker         result.push_back(locale_it->second);
335*993b0882SAndroid Build Coastguard Worker       }
336*993b0882SAndroid Build Coastguard Worker     }
337*993b0882SAndroid Build Coastguard Worker     // Third, try adding language-* locale.
338*993b0882SAndroid Build Coastguard Worker     if (!language.empty()) {
339*993b0882SAndroid Build Coastguard Worker       locale_it = locale_string_to_id_.find(language + "-*");
340*993b0882SAndroid Build Coastguard Worker       if (locale_it != locale_string_to_id_.end()) {
341*993b0882SAndroid Build Coastguard Worker         result.push_back(locale_it->second);
342*993b0882SAndroid Build Coastguard Worker       }
343*993b0882SAndroid Build Coastguard Worker     }
344*993b0882SAndroid Build Coastguard Worker   }
345*993b0882SAndroid Build Coastguard Worker 
346*993b0882SAndroid Build Coastguard Worker   // Add the default locales if they haven't been added already.
347*993b0882SAndroid Build Coastguard Worker   const std::unordered_set<int> result_set(result.begin(), result.end());
348*993b0882SAndroid Build Coastguard Worker   for (const int default_locale_id : default_locale_ids_) {
349*993b0882SAndroid Build Coastguard Worker     if (result_set.find(default_locale_id) == result_set.end()) {
350*993b0882SAndroid Build Coastguard Worker       result.push_back(default_locale_id);
351*993b0882SAndroid Build Coastguard Worker     }
352*993b0882SAndroid Build Coastguard Worker   }
353*993b0882SAndroid Build Coastguard Worker 
354*993b0882SAndroid Build Coastguard Worker   return result;
355*993b0882SAndroid Build Coastguard Worker }
356*993b0882SAndroid Build Coastguard Worker 
ExtractDatetime(const CompiledRule & rule,const UniLib::RegexMatcher & matcher,const int64 reference_time_ms_utc,const std::string & reference_timezone,const std::string & reference_locale,int locale_id,std::vector<DatetimeParseResult> * results,CodepointSpan * result_span) const357*993b0882SAndroid Build Coastguard Worker bool RegexDatetimeParser::ExtractDatetime(
358*993b0882SAndroid Build Coastguard Worker     const CompiledRule& rule, const UniLib::RegexMatcher& matcher,
359*993b0882SAndroid Build Coastguard Worker     const int64 reference_time_ms_utc, const std::string& reference_timezone,
360*993b0882SAndroid Build Coastguard Worker     const std::string& reference_locale, int locale_id,
361*993b0882SAndroid Build Coastguard Worker     std::vector<DatetimeParseResult>* results,
362*993b0882SAndroid Build Coastguard Worker     CodepointSpan* result_span) const {
363*993b0882SAndroid Build Coastguard Worker   DatetimeParsedData parse;
364*993b0882SAndroid Build Coastguard Worker   DatetimeExtractor extractor(rule, matcher, locale_id, &unilib_,
365*993b0882SAndroid Build Coastguard Worker                               extractor_rules_,
366*993b0882SAndroid Build Coastguard Worker                               type_and_locale_to_extractor_rule_);
367*993b0882SAndroid Build Coastguard Worker   if (!extractor.Extract(&parse, result_span)) {
368*993b0882SAndroid Build Coastguard Worker     return false;
369*993b0882SAndroid Build Coastguard Worker   }
370*993b0882SAndroid Build Coastguard Worker   std::vector<DatetimeParsedData> interpretations;
371*993b0882SAndroid Build Coastguard Worker   if (generate_alternative_interpretations_when_ambiguous_) {
372*993b0882SAndroid Build Coastguard Worker     FillInterpretations(parse, calendarlib_.GetGranularity(parse),
373*993b0882SAndroid Build Coastguard Worker                         &interpretations);
374*993b0882SAndroid Build Coastguard Worker   } else {
375*993b0882SAndroid Build Coastguard Worker     interpretations.push_back(parse);
376*993b0882SAndroid Build Coastguard Worker   }
377*993b0882SAndroid Build Coastguard Worker 
378*993b0882SAndroid Build Coastguard Worker   results->reserve(results->size() + interpretations.size());
379*993b0882SAndroid Build Coastguard Worker   for (const DatetimeParsedData& interpretation : interpretations) {
380*993b0882SAndroid Build Coastguard Worker     std::vector<DatetimeComponent> date_components;
381*993b0882SAndroid Build Coastguard Worker     interpretation.GetDatetimeComponents(&date_components);
382*993b0882SAndroid Build Coastguard Worker     DatetimeParseResult result;
383*993b0882SAndroid Build Coastguard Worker     // TODO(hassan): Text classifier only provides ambiguity limited to “AM/PM
384*993b0882SAndroid Build Coastguard Worker     //               which is encoded in the pair of DatetimeParseResult; both
385*993b0882SAndroid Build Coastguard Worker     //               corresponding to the same date, but one corresponding to
386*993b0882SAndroid Build Coastguard Worker     //               “AM” and the other one corresponding to “PM”.
387*993b0882SAndroid Build Coastguard Worker     //               Remove multiple DatetimeParseResult per datetime span,
388*993b0882SAndroid Build Coastguard Worker     //               once the ambiguities/DatetimeComponents are added in the
389*993b0882SAndroid Build Coastguard Worker     //               response. For Details see b/130355975
390*993b0882SAndroid Build Coastguard Worker     if (!calendarlib_.InterpretParseData(
391*993b0882SAndroid Build Coastguard Worker             interpretation, reference_time_ms_utc, reference_timezone,
392*993b0882SAndroid Build Coastguard Worker             reference_locale, prefer_future_for_unspecified_date_,
393*993b0882SAndroid Build Coastguard Worker             &(result.time_ms_utc), &(result.granularity))) {
394*993b0882SAndroid Build Coastguard Worker       return false;
395*993b0882SAndroid Build Coastguard Worker     }
396*993b0882SAndroid Build Coastguard Worker 
397*993b0882SAndroid Build Coastguard Worker     // Sort the date time units by component type.
398*993b0882SAndroid Build Coastguard Worker     std::stable_sort(date_components.begin(), date_components.end(),
399*993b0882SAndroid Build Coastguard Worker                      [](DatetimeComponent a, DatetimeComponent b) {
400*993b0882SAndroid Build Coastguard Worker                        return a.component_type > b.component_type;
401*993b0882SAndroid Build Coastguard Worker                      });
402*993b0882SAndroid Build Coastguard Worker     result.datetime_components.swap(date_components);
403*993b0882SAndroid Build Coastguard Worker     results->push_back(result);
404*993b0882SAndroid Build Coastguard Worker   }
405*993b0882SAndroid Build Coastguard Worker   return true;
406*993b0882SAndroid Build Coastguard Worker }
407*993b0882SAndroid Build Coastguard Worker 
408*993b0882SAndroid Build Coastguard Worker }  // namespace libtextclassifier3
409