1 // Copyright 2018 The Chromium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 //
5 // This file defines a helper class for selecting a supported language from a
6 // set of candidates. It is used to get localized strings that are directly
7 // embedded into the executable / library instead of stored in external
8 // .pak files.
9
10 #include "base/win/embedded_i18n/language_selector.h"
11
12 #include <algorithm>
13 #include <functional>
14 #include <string_view>
15
16 #include "base/check_op.h"
17 #include "base/memory/raw_ptr.h"
18 #include "base/ranges/algorithm.h"
19 #include "base/strings/string_util.h"
20 #include "base/strings/utf_string_conversions.h"
21 #include "base/win/i18n.h"
22
23 namespace base {
24 namespace win {
25 namespace i18n {
26
27 namespace {
28
29 using LangToOffset = LanguageSelector::LangToOffset;
30
// Holds pointers to LangToOffset pairs for specific languages that are the
// targets of exceptions (where one language is mapped to another) or wildcards
// (where a raw language identifier is mapped to a specific localization).
//
// An instance is value-initialized and then populated by
// DetermineAvailableAliases(); the lookup helpers test each member for
// truthiness before using it, so a member that was never assigned simply
// disables the corresponding alias/wildcard.
struct AvailableLanguageAliases {
  // Alias target for "en-au", "en-ca", "en-nz" and "en-za".
  raw_ptr<const LangToOffset> en_gb_language_offset;
  // Wildcard target for neutral "en"; also the fallback when nothing matches.
  raw_ptr<const LangToOffset> en_us_language_offset;
  // Alias target for "es-es".
  raw_ptr<const LangToOffset> es_language_offset;
  // Wildcard target for neutral "es".
  raw_ptr<const LangToOffset> es_419_language_offset;
  // Alias target for "tl".
  raw_ptr<const LangToOffset> fil_language_offset;
  // Alias target for "he".
  raw_ptr<const LangToOffset> iw_language_offset;
  // Alias target for "nb".
  raw_ptr<const LangToOffset> no_language_offset;
  // Wildcard target for neutral "pt".
  raw_ptr<const LangToOffset> pt_br_language_offset;
  // Alias target for "zh-chs", "zh-hans" and "zh-sg"; wildcard target for
  // neutral "zh".
  raw_ptr<const LangToOffset> zh_cn_language_offset;
  // Alias target for "zh-cht", "zh-hant", "zh-hk" and "zh-mo".
  raw_ptr<const LangToOffset> zh_tw_language_offset;
};
46
47 #if DCHECK_IS_ON()
48 // Returns true if the items in the given range are sorted and lower cased.
IsArraySortedAndLowerCased(span<const LangToOffset> languages_to_offset)49 bool IsArraySortedAndLowerCased(span<const LangToOffset> languages_to_offset) {
50 return std::is_sorted(languages_to_offset.begin(),
51 languages_to_offset.end()) &&
52 base::ranges::all_of(languages_to_offset, [](const auto& lang) {
53 auto language = AsStringPiece16(lang.first);
54 return ToLowerASCII(language) == language;
55 });
56 }
57 #endif // DCHECK_IS_ON()
58
59 // Determines the availability of all languages that may be used as aliases in
60 // GetAliasedLanguageOffset or GetCompatibleNeutralLanguageOffset
DetermineAvailableAliases(span<const LangToOffset> languages_to_offset)61 AvailableLanguageAliases DetermineAvailableAliases(
62 span<const LangToOffset> languages_to_offset) {
63 AvailableLanguageAliases available_aliases = {};
64
65 for (const LangToOffset& lang_to_offset : languages_to_offset) {
66 if (lang_to_offset.first == L"en-gb")
67 available_aliases.en_gb_language_offset = &lang_to_offset;
68 else if (lang_to_offset.first == L"en-us")
69 available_aliases.en_us_language_offset = &lang_to_offset;
70 else if (lang_to_offset.first == L"es")
71 available_aliases.es_language_offset = &lang_to_offset;
72 else if (lang_to_offset.first == L"es-419")
73 available_aliases.es_419_language_offset = &lang_to_offset;
74 else if (lang_to_offset.first == L"fil")
75 available_aliases.fil_language_offset = &lang_to_offset;
76 else if (lang_to_offset.first == L"iw")
77 available_aliases.iw_language_offset = &lang_to_offset;
78 else if (lang_to_offset.first == L"no")
79 available_aliases.no_language_offset = &lang_to_offset;
80 else if (lang_to_offset.first == L"pt-br")
81 available_aliases.pt_br_language_offset = &lang_to_offset;
82 else if (lang_to_offset.first == L"zh-cn")
83 available_aliases.zh_cn_language_offset = &lang_to_offset;
84 else if (lang_to_offset.first == L"zh-tw")
85 available_aliases.zh_tw_language_offset = &lang_to_offset;
86 }
87
88 // Fallback language must exist.
89 DCHECK(available_aliases.en_us_language_offset);
90 return available_aliases;
91 }
92
93 // Returns true if a LangToOffset entry can be found in |languages_to_offset|
94 // that matches the |language| exactly. |offset| will store the offset of the
95 // language that matches if any. |languages_to_offset| must be sorted by
96 // language and all languages must lower case.
GetExactLanguageOffset(span<const LangToOffset> languages_to_offset,const std::wstring & language,const LangToOffset ** matched_language_to_offset)97 bool GetExactLanguageOffset(span<const LangToOffset> languages_to_offset,
98 const std::wstring& language,
99 const LangToOffset** matched_language_to_offset) {
100 DCHECK(matched_language_to_offset);
101
102 // Binary search in the sorted arrays to find the offset corresponding
103 // to a given language |name|.
104 auto search_result = std::lower_bound(
105 languages_to_offset.begin(), languages_to_offset.end(), language,
106 [](const LangToOffset& left, const std::wstring& to_find) {
107 return left.first < to_find;
108 });
109 if (languages_to_offset.end() != search_result &&
110 search_result->first == language) {
111 *matched_language_to_offset = &*search_result;
112 return true;
113 }
114 return false;
115 }
116
117 // Returns true if the current language can be aliased to another language.
GetAliasedLanguageOffset(const AvailableLanguageAliases & available_aliases,const std::wstring & language,const LangToOffset ** matched_language_to_offset)118 bool GetAliasedLanguageOffset(const AvailableLanguageAliases& available_aliases,
119 const std::wstring& language,
120 const LangToOffset** matched_language_to_offset) {
121 DCHECK(matched_language_to_offset);
122
123 // Alias some English variants to British English (all others wildcard to
124 // US).
125 if (available_aliases.en_gb_language_offset &&
126 (language == L"en-au" || language == L"en-ca" || language == L"en-nz" ||
127 language == L"en-za")) {
128 *matched_language_to_offset = available_aliases.en_gb_language_offset;
129 return true;
130 }
131 // Alias es-es to es (all others wildcard to es-419).
132 if (available_aliases.es_language_offset && language == L"es-es") {
133 *matched_language_to_offset = available_aliases.es_language_offset;
134 return true;
135 }
136 // Google web properties use iw for he. Handle both just to be safe.
137 if (available_aliases.iw_language_offset && language == L"he") {
138 *matched_language_to_offset = available_aliases.iw_language_offset;
139 return true;
140 }
141 // Google web properties use no for nb. Handle both just to be safe.
142 if (available_aliases.no_language_offset && language == L"nb") {
143 *matched_language_to_offset = available_aliases.no_language_offset;
144 return true;
145 }
146 // Some Google web properties use tl for fil. Handle both just to be safe.
147 // They're not completely identical, but alias it here.
148 if (available_aliases.fil_language_offset && language == L"tl") {
149 *matched_language_to_offset = available_aliases.fil_language_offset;
150 return true;
151 }
152 if (available_aliases.zh_cn_language_offset &&
153 // Pre-Vista alias for Chinese w/ script subtag.
154 (language == L"zh-chs" ||
155 // Vista+ alias for Chinese w/ script subtag.
156 language == L"zh-hans" ||
157 // Although the wildcard entry for zh would result in this, alias zh-sg
158 // so that it will win if it precedes another valid tag in a list of
159 // candidates.
160 language == L"zh-sg")) {
161 *matched_language_to_offset = available_aliases.zh_cn_language_offset;
162 return true;
163 }
164 if (available_aliases.zh_tw_language_offset &&
165 // Pre-Vista alias for Chinese w/ script subtag.
166 (language == L"zh-cht" ||
167 // Vista+ alias for Chinese w/ script subtag.
168 language == L"zh-hant" ||
169 // Alias Hong Kong and Macau to Taiwan.
170 language == L"zh-hk" || language == L"zh-mo")) {
171 *matched_language_to_offset = available_aliases.zh_tw_language_offset;
172 return true;
173 }
174
175 return false;
176 }
177
178 // Returns true if the current neutral language can be aliased to another
179 // language.
GetCompatibleNeutralLanguageOffset(const AvailableLanguageAliases & available_aliases,const std::wstring & neutral_language,const LangToOffset ** matched_language_to_offset)180 bool GetCompatibleNeutralLanguageOffset(
181 const AvailableLanguageAliases& available_aliases,
182 const std::wstring& neutral_language,
183 const LangToOffset** matched_language_to_offset) {
184 DCHECK(matched_language_to_offset);
185
186 if (available_aliases.en_us_language_offset && neutral_language == L"en") {
187 // Use the U.S. region for anything English.
188 *matched_language_to_offset = available_aliases.en_us_language_offset;
189 return true;
190 }
191 if (available_aliases.es_419_language_offset && neutral_language == L"es") {
192 // Use the Latin American region for anything Spanish.
193 *matched_language_to_offset = available_aliases.es_419_language_offset;
194 return true;
195 }
196 if (available_aliases.pt_br_language_offset && neutral_language == L"pt") {
197 // Use the Brazil region for anything Portugese.
198 *matched_language_to_offset = available_aliases.pt_br_language_offset;
199 return true;
200 }
201 if (available_aliases.zh_cn_language_offset && neutral_language == L"zh") {
202 // Use the P.R.C. region for anything Chinese.
203 *matched_language_to_offset = available_aliases.zh_cn_language_offset;
204 return true;
205 }
206
207 return false;
208 }
209
210 // Runs through the set of candidates, sending their downcased representation
211 // through |select_predicate|. Returns true if the predicate selects a
212 // candidate, in which case |matched_name| is assigned the value of the
213 // candidate and |matched_offset| is assigned the language offset of the
214 // selected translation.
215 // static
SelectIf(const std::vector<std::wstring> & candidates,span<const LangToOffset> languages_to_offset,const AvailableLanguageAliases & available_aliases,const LangToOffset ** matched_language_to_offset,std::wstring * matched_name)216 bool SelectIf(const std::vector<std::wstring>& candidates,
217 span<const LangToOffset> languages_to_offset,
218 const AvailableLanguageAliases& available_aliases,
219 const LangToOffset** matched_language_to_offset,
220 std::wstring* matched_name) {
221 DCHECK(matched_language_to_offset);
222 DCHECK(matched_name);
223
224 // Note: always perform the exact match first so that an alias is never
225 // selected in place of a future translation.
226
227 // An earlier candidate entry matching on an exact match or alias match takes
228 // precedence over a later candidate entry matching on an exact match.
229 for (const std::wstring& scan : candidates) {
230 std::wstring lower_case_candidate =
231 AsWString(ToLowerASCII(AsStringPiece16(scan)));
232 if (GetExactLanguageOffset(languages_to_offset, lower_case_candidate,
233 matched_language_to_offset) ||
234 GetAliasedLanguageOffset(available_aliases, lower_case_candidate,
235 matched_language_to_offset)) {
236 matched_name->assign(scan);
237 return true;
238 }
239 }
240
241 // If no candidate matches exactly or by alias, try to match by locale neutral
242 // language.
243 for (const std::wstring& scan : candidates) {
244 std::wstring lower_case_candidate =
245 AsWString(ToLowerASCII(AsStringPiece16(scan)));
246
247 // Extract the locale neutral language from the language to search and try
248 // to find an exact match for that language in the provided table.
249 std::wstring neutral_language =
250 lower_case_candidate.substr(0, lower_case_candidate.find(L'-'));
251
252 if (GetCompatibleNeutralLanguageOffset(available_aliases, neutral_language,
253 matched_language_to_offset)) {
254 matched_name->assign(scan);
255 return true;
256 }
257 }
258
259 return false;
260 }
261
SelectLanguageMatchingCandidate(const std::vector<std::wstring> & candidates,span<const LangToOffset> languages_to_offset,size_t * selected_offset,std::wstring * matched_candidate,std::wstring * selected_language)262 void SelectLanguageMatchingCandidate(
263 const std::vector<std::wstring>& candidates,
264 span<const LangToOffset> languages_to_offset,
265 size_t* selected_offset,
266 std::wstring* matched_candidate,
267 std::wstring* selected_language) {
268 DCHECK(selected_offset);
269 DCHECK(matched_candidate);
270 DCHECK(selected_language);
271 DCHECK(!languages_to_offset.empty());
272 DCHECK_EQ(static_cast<size_t>(*selected_offset), languages_to_offset.size());
273 DCHECK(matched_candidate->empty());
274 DCHECK(selected_language->empty());
275 // Note: While DCHECK_IS_ON() seems redundant here, this is required to avoid
276 // compilation errors, since IsArraySortedAndLowerCased is not defined
277 // otherwise.
278 #if DCHECK_IS_ON()
279 DCHECK(IsArraySortedAndLowerCased(languages_to_offset))
280 << "languages_to_offset is not sorted and lower cased";
281 #endif // DCHECK_IS_ON()
282
283 // Get which languages that are commonly used as aliases and wildcards are
284 // available for use to match candidates.
285 AvailableLanguageAliases available_aliases =
286 DetermineAvailableAliases(languages_to_offset);
287
288 // The fallback must exist.
289 DCHECK(available_aliases.en_us_language_offset);
290
291 // Try to find the first matching candidate from all the language mappings
292 // that are given. Failing that, used en-us as the fallback language.
293 const LangToOffset* matched_language_to_offset = nullptr;
294 if (!SelectIf(candidates, languages_to_offset, available_aliases,
295 &matched_language_to_offset, matched_candidate)) {
296 matched_language_to_offset = available_aliases.en_us_language_offset;
297 *matched_candidate =
298 std::wstring(available_aliases.en_us_language_offset->first);
299 }
300
301 DCHECK(matched_language_to_offset);
302 // Get the real language being used for the matched candidate.
303 *selected_language = std::wstring(matched_language_to_offset->first);
304 *selected_offset = matched_language_to_offset->second;
305 }
306
GetCandidatesFromSystem(std::wstring_view preferred_language)307 std::vector<std::wstring> GetCandidatesFromSystem(
308 std::wstring_view preferred_language) {
309 std::vector<std::wstring> candidates;
310
311 // Get the initial candidate list for this particular implementation (if
312 // applicable).
313 if (!preferred_language.empty())
314 candidates.emplace_back(preferred_language);
315
316 // Now try the UI languages. Use the thread preferred ones since that will
317 // kindly return us a list of all kinds of fallbacks.
318 win::i18n::GetThreadPreferredUILanguageList(&candidates);
319 return candidates;
320 }
321
322 } // namespace
323
// Constructs a selector from the system's language preferences. The candidate
// list is |preferred_language| (if non-empty) followed by the languages
// appended by GetThreadPreferredUILanguageList (see GetCandidatesFromSystem);
// the actual selection is delegated to the candidate-list constructor.
LanguageSelector::LanguageSelector(std::wstring_view preferred_language,
                                   span<const LangToOffset> languages_to_offset)
    : LanguageSelector(GetCandidatesFromSystem(preferred_language),
                       languages_to_offset) {}
328
// Constructs a selector for an explicit, ordered list of |candidates|.
// |selected_offset_| starts out as |languages_to_offset.size()|, which is the
// initial value SelectLanguageMatchingCandidate DCHECKs for; that call then
// fills in |selected_offset_|, |matched_candidate_| and |selected_language_|.
LanguageSelector::LanguageSelector(const std::vector<std::wstring>& candidates,
                                   span<const LangToOffset> languages_to_offset)
    : selected_offset_(languages_to_offset.size()) {
  SelectLanguageMatchingCandidate(candidates, languages_to_offset,
                                  &selected_offset_, &matched_candidate_,
                                  &selected_language_);
}
336
// Defaulted out of line; no custom teardown is performed here.
LanguageSelector::~LanguageSelector() = default;
338
339 } // namespace i18n
340 } // namespace win
341 } // namespace base
342