xref: /aosp_15_r20/external/cronet/base/win/embedded_i18n/language_selector.cc (revision 6777b5387eb2ff775bb5750e3f5d96f37fb7352b)
1 // Copyright 2018 The Chromium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 //
5 // This file defines a helper class for selecting a supported language from a
6 // set of candidates. It is used to get localized strings that are directly
7 // embedded into the executable / library instead of stored in external
8 // .pak files.
9 
10 #include "base/win/embedded_i18n/language_selector.h"
11 
12 #include <algorithm>
13 #include <functional>
14 #include <string_view>
15 
16 #include "base/check_op.h"
17 #include "base/memory/raw_ptr.h"
18 #include "base/ranges/algorithm.h"
19 #include "base/strings/string_util.h"
20 #include "base/strings/utf_string_conversions.h"
21 #include "base/win/i18n.h"
22 
23 namespace base {
24 namespace win {
25 namespace i18n {
26 
27 namespace {
28 
29 using LangToOffset = LanguageSelector::LangToOffset;
30 
31 // Holds pointers to LangToOffset pairs for specific languages that are the
32 // targets of exceptions (where one language is mapped to another) or wildcards
33 // (where a raw language identifier is mapped to a specific localization).
34 struct AvailableLanguageAliases {
35   raw_ptr<const LangToOffset> en_gb_language_offset;
36   raw_ptr<const LangToOffset> en_us_language_offset;
37   raw_ptr<const LangToOffset> es_language_offset;
38   raw_ptr<const LangToOffset> es_419_language_offset;
39   raw_ptr<const LangToOffset> fil_language_offset;
40   raw_ptr<const LangToOffset> iw_language_offset;
41   raw_ptr<const LangToOffset> no_language_offset;
42   raw_ptr<const LangToOffset> pt_br_language_offset;
43   raw_ptr<const LangToOffset> zh_cn_language_offset;
44   raw_ptr<const LangToOffset> zh_tw_language_offset;
45 };
46 
47 #if DCHECK_IS_ON()
48 // Returns true if the items in the given range are sorted and lower cased.
IsArraySortedAndLowerCased(span<const LangToOffset> languages_to_offset)49 bool IsArraySortedAndLowerCased(span<const LangToOffset> languages_to_offset) {
50   return std::is_sorted(languages_to_offset.begin(),
51                         languages_to_offset.end()) &&
52          base::ranges::all_of(languages_to_offset, [](const auto& lang) {
53            auto language = AsStringPiece16(lang.first);
54            return ToLowerASCII(language) == language;
55          });
56 }
57 #endif  // DCHECK_IS_ON()
58 
59 // Determines the availability of all languages that may be used as aliases in
60 // GetAliasedLanguageOffset or GetCompatibleNeutralLanguageOffset
DetermineAvailableAliases(span<const LangToOffset> languages_to_offset)61 AvailableLanguageAliases DetermineAvailableAliases(
62     span<const LangToOffset> languages_to_offset) {
63   AvailableLanguageAliases available_aliases = {};
64 
65   for (const LangToOffset& lang_to_offset : languages_to_offset) {
66     if (lang_to_offset.first == L"en-gb")
67       available_aliases.en_gb_language_offset = &lang_to_offset;
68     else if (lang_to_offset.first == L"en-us")
69       available_aliases.en_us_language_offset = &lang_to_offset;
70     else if (lang_to_offset.first == L"es")
71       available_aliases.es_language_offset = &lang_to_offset;
72     else if (lang_to_offset.first == L"es-419")
73       available_aliases.es_419_language_offset = &lang_to_offset;
74     else if (lang_to_offset.first == L"fil")
75       available_aliases.fil_language_offset = &lang_to_offset;
76     else if (lang_to_offset.first == L"iw")
77       available_aliases.iw_language_offset = &lang_to_offset;
78     else if (lang_to_offset.first == L"no")
79       available_aliases.no_language_offset = &lang_to_offset;
80     else if (lang_to_offset.first == L"pt-br")
81       available_aliases.pt_br_language_offset = &lang_to_offset;
82     else if (lang_to_offset.first == L"zh-cn")
83       available_aliases.zh_cn_language_offset = &lang_to_offset;
84     else if (lang_to_offset.first == L"zh-tw")
85       available_aliases.zh_tw_language_offset = &lang_to_offset;
86   }
87 
88   // Fallback language must exist.
89   DCHECK(available_aliases.en_us_language_offset);
90   return available_aliases;
91 }
92 
93 // Returns true if a LangToOffset entry can be found in |languages_to_offset|
94 // that matches the |language| exactly. |offset| will store the offset of the
95 // language that matches if any. |languages_to_offset| must be sorted by
96 // language and all languages must lower case.
GetExactLanguageOffset(span<const LangToOffset> languages_to_offset,const std::wstring & language,const LangToOffset ** matched_language_to_offset)97 bool GetExactLanguageOffset(span<const LangToOffset> languages_to_offset,
98                             const std::wstring& language,
99                             const LangToOffset** matched_language_to_offset) {
100   DCHECK(matched_language_to_offset);
101 
102   // Binary search in the sorted arrays to find the offset corresponding
103   // to a given language |name|.
104   auto search_result = std::lower_bound(
105       languages_to_offset.begin(), languages_to_offset.end(), language,
106       [](const LangToOffset& left, const std::wstring& to_find) {
107         return left.first < to_find;
108       });
109   if (languages_to_offset.end() != search_result &&
110       search_result->first == language) {
111     *matched_language_to_offset = &*search_result;
112     return true;
113   }
114   return false;
115 }
116 
117 // Returns true if the current language can be aliased to another language.
GetAliasedLanguageOffset(const AvailableLanguageAliases & available_aliases,const std::wstring & language,const LangToOffset ** matched_language_to_offset)118 bool GetAliasedLanguageOffset(const AvailableLanguageAliases& available_aliases,
119                               const std::wstring& language,
120                               const LangToOffset** matched_language_to_offset) {
121   DCHECK(matched_language_to_offset);
122 
123   // Alias some English variants to British English (all others wildcard to
124   // US).
125   if (available_aliases.en_gb_language_offset &&
126       (language == L"en-au" || language == L"en-ca" || language == L"en-nz" ||
127        language == L"en-za")) {
128     *matched_language_to_offset = available_aliases.en_gb_language_offset;
129     return true;
130   }
131   // Alias es-es to es (all others wildcard to es-419).
132   if (available_aliases.es_language_offset && language == L"es-es") {
133     *matched_language_to_offset = available_aliases.es_language_offset;
134     return true;
135   }
136   // Google web properties use iw for he. Handle both just to be safe.
137   if (available_aliases.iw_language_offset && language == L"he") {
138     *matched_language_to_offset = available_aliases.iw_language_offset;
139     return true;
140   }
141   // Google web properties use no for nb. Handle both just to be safe.
142   if (available_aliases.no_language_offset && language == L"nb") {
143     *matched_language_to_offset = available_aliases.no_language_offset;
144     return true;
145   }
146   // Some Google web properties use tl for fil. Handle both just to be safe.
147   // They're not completely identical, but alias it here.
148   if (available_aliases.fil_language_offset && language == L"tl") {
149     *matched_language_to_offset = available_aliases.fil_language_offset;
150     return true;
151   }
152   if (available_aliases.zh_cn_language_offset &&
153       // Pre-Vista alias for Chinese w/ script subtag.
154       (language == L"zh-chs" ||
155        // Vista+ alias for Chinese w/ script subtag.
156        language == L"zh-hans" ||
157        // Although the wildcard entry for zh would result in this, alias zh-sg
158        // so that it will win if it precedes another valid tag in a list of
159        // candidates.
160        language == L"zh-sg")) {
161     *matched_language_to_offset = available_aliases.zh_cn_language_offset;
162     return true;
163   }
164   if (available_aliases.zh_tw_language_offset &&
165       // Pre-Vista alias for Chinese w/ script subtag.
166       (language == L"zh-cht" ||
167        // Vista+ alias for Chinese w/ script subtag.
168        language == L"zh-hant" ||
169        // Alias Hong Kong and Macau to Taiwan.
170        language == L"zh-hk" || language == L"zh-mo")) {
171     *matched_language_to_offset = available_aliases.zh_tw_language_offset;
172     return true;
173   }
174 
175   return false;
176 }
177 
178 // Returns true if the current neutral language can be aliased to another
179 // language.
GetCompatibleNeutralLanguageOffset(const AvailableLanguageAliases & available_aliases,const std::wstring & neutral_language,const LangToOffset ** matched_language_to_offset)180 bool GetCompatibleNeutralLanguageOffset(
181     const AvailableLanguageAliases& available_aliases,
182     const std::wstring& neutral_language,
183     const LangToOffset** matched_language_to_offset) {
184   DCHECK(matched_language_to_offset);
185 
186   if (available_aliases.en_us_language_offset && neutral_language == L"en") {
187     // Use the U.S. region for anything English.
188     *matched_language_to_offset = available_aliases.en_us_language_offset;
189     return true;
190   }
191   if (available_aliases.es_419_language_offset && neutral_language == L"es") {
192     // Use the Latin American region for anything Spanish.
193     *matched_language_to_offset = available_aliases.es_419_language_offset;
194     return true;
195   }
196   if (available_aliases.pt_br_language_offset && neutral_language == L"pt") {
197     // Use the Brazil region for anything Portugese.
198     *matched_language_to_offset = available_aliases.pt_br_language_offset;
199     return true;
200   }
201   if (available_aliases.zh_cn_language_offset && neutral_language == L"zh") {
202     // Use the P.R.C. region for anything Chinese.
203     *matched_language_to_offset = available_aliases.zh_cn_language_offset;
204     return true;
205   }
206 
207   return false;
208 }
209 
210 // Runs through the set of candidates, sending their downcased representation
211 // through |select_predicate|.  Returns true if the predicate selects a
212 // candidate, in which case |matched_name| is assigned the value of the
213 // candidate and |matched_offset| is assigned the language offset of the
214 // selected translation.
215 // static
SelectIf(const std::vector<std::wstring> & candidates,span<const LangToOffset> languages_to_offset,const AvailableLanguageAliases & available_aliases,const LangToOffset ** matched_language_to_offset,std::wstring * matched_name)216 bool SelectIf(const std::vector<std::wstring>& candidates,
217               span<const LangToOffset> languages_to_offset,
218               const AvailableLanguageAliases& available_aliases,
219               const LangToOffset** matched_language_to_offset,
220               std::wstring* matched_name) {
221   DCHECK(matched_language_to_offset);
222   DCHECK(matched_name);
223 
224   // Note: always perform the exact match first so that an alias is never
225   // selected in place of a future translation.
226 
227   // An earlier candidate entry matching on an exact match or alias match takes
228   // precedence over a later candidate entry matching on an exact match.
229   for (const std::wstring& scan : candidates) {
230     std::wstring lower_case_candidate =
231         AsWString(ToLowerASCII(AsStringPiece16(scan)));
232     if (GetExactLanguageOffset(languages_to_offset, lower_case_candidate,
233                                matched_language_to_offset) ||
234         GetAliasedLanguageOffset(available_aliases, lower_case_candidate,
235                                  matched_language_to_offset)) {
236       matched_name->assign(scan);
237       return true;
238     }
239   }
240 
241   // If no candidate matches exactly or by alias, try to match by locale neutral
242   // language.
243   for (const std::wstring& scan : candidates) {
244     std::wstring lower_case_candidate =
245         AsWString(ToLowerASCII(AsStringPiece16(scan)));
246 
247     // Extract the locale neutral language from the language to search and try
248     // to find an exact match for that language in the provided table.
249     std::wstring neutral_language =
250         lower_case_candidate.substr(0, lower_case_candidate.find(L'-'));
251 
252     if (GetCompatibleNeutralLanguageOffset(available_aliases, neutral_language,
253                                            matched_language_to_offset)) {
254       matched_name->assign(scan);
255       return true;
256     }
257   }
258 
259   return false;
260 }
261 
SelectLanguageMatchingCandidate(const std::vector<std::wstring> & candidates,span<const LangToOffset> languages_to_offset,size_t * selected_offset,std::wstring * matched_candidate,std::wstring * selected_language)262 void SelectLanguageMatchingCandidate(
263     const std::vector<std::wstring>& candidates,
264     span<const LangToOffset> languages_to_offset,
265     size_t* selected_offset,
266     std::wstring* matched_candidate,
267     std::wstring* selected_language) {
268   DCHECK(selected_offset);
269   DCHECK(matched_candidate);
270   DCHECK(selected_language);
271   DCHECK(!languages_to_offset.empty());
272   DCHECK_EQ(static_cast<size_t>(*selected_offset), languages_to_offset.size());
273   DCHECK(matched_candidate->empty());
274   DCHECK(selected_language->empty());
275   // Note: While DCHECK_IS_ON() seems redundant here, this is required to avoid
276   // compilation errors, since IsArraySortedAndLowerCased is not defined
277   // otherwise.
278 #if DCHECK_IS_ON()
279   DCHECK(IsArraySortedAndLowerCased(languages_to_offset))
280       << "languages_to_offset is not sorted and lower cased";
281 #endif  // DCHECK_IS_ON()
282 
283   // Get which languages that are commonly used as aliases and wildcards are
284   // available for use to match candidates.
285   AvailableLanguageAliases available_aliases =
286       DetermineAvailableAliases(languages_to_offset);
287 
288   // The fallback must exist.
289   DCHECK(available_aliases.en_us_language_offset);
290 
291   // Try to find the first matching candidate from all the language mappings
292   // that are given. Failing that, used en-us as the fallback language.
293   const LangToOffset* matched_language_to_offset = nullptr;
294   if (!SelectIf(candidates, languages_to_offset, available_aliases,
295                 &matched_language_to_offset, matched_candidate)) {
296     matched_language_to_offset = available_aliases.en_us_language_offset;
297     *matched_candidate =
298         std::wstring(available_aliases.en_us_language_offset->first);
299   }
300 
301   DCHECK(matched_language_to_offset);
302   // Get the real language being used for the matched candidate.
303   *selected_language = std::wstring(matched_language_to_offset->first);
304   *selected_offset = matched_language_to_offset->second;
305 }
306 
GetCandidatesFromSystem(std::wstring_view preferred_language)307 std::vector<std::wstring> GetCandidatesFromSystem(
308     std::wstring_view preferred_language) {
309   std::vector<std::wstring> candidates;
310 
311   // Get the initial candidate list for this particular implementation (if
312   // applicable).
313   if (!preferred_language.empty())
314     candidates.emplace_back(preferred_language);
315 
316   // Now try the UI languages.  Use the thread preferred ones since that will
317   // kindly return us a list of all kinds of fallbacks.
318   win::i18n::GetThreadPreferredUILanguageList(&candidates);
319   return candidates;
320 }
321 
322 }  // namespace
323 
LanguageSelector(std::wstring_view preferred_language,span<const LangToOffset> languages_to_offset)324 LanguageSelector::LanguageSelector(std::wstring_view preferred_language,
325                                    span<const LangToOffset> languages_to_offset)
326     : LanguageSelector(GetCandidatesFromSystem(preferred_language),
327                        languages_to_offset) {}
328 
LanguageSelector(const std::vector<std::wstring> & candidates,span<const LangToOffset> languages_to_offset)329 LanguageSelector::LanguageSelector(const std::vector<std::wstring>& candidates,
330                                    span<const LangToOffset> languages_to_offset)
331     : selected_offset_(languages_to_offset.size()) {
332   SelectLanguageMatchingCandidate(candidates, languages_to_offset,
333                                   &selected_offset_, &matched_candidate_,
334                                   &selected_language_);
335 }
336 
337 LanguageSelector::~LanguageSelector() = default;
338 
339 }  // namespace i18n
340 }  // namespace win
341 }  // namespace base
342