xref: /aosp_15_r20/external/cronet/third_party/ced/src/util/languages/languages.h (revision 6777b5387eb2ff775bb5750e3f5d96f37fb7352b)
1 // Copyright 2016 Google Inc.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 //
15 ////////////////////////////////////////////////////////////////////////////////
16 
17 #ifndef UTIL_LANGUAGES_LANGUAGES_H_
18 #define UTIL_LANGUAGES_LANGUAGES_H_
19 
20 // This interface defines the Language enum and functions that depend
21 // only on Language values.
22 
23 // A hash-function for Language, hash<Language>, is defined in
24 // i18n/languages/public/languages-hash.h
25 
26 #ifndef SWIG
27 // The Language enum is defined in languages.proto, which also describes
28 // how to add languages.
29 #include "util/languages/languages.pb.h"
30 
31 #else
32 
33 // TODO: Include a header containing swig-compatible enum.
34 
35 #endif
36 
37 const int kNumLanguages = NUM_LANGUAGES;
38 
39 // Return the default language (ENGLISH).
40 Language default_language();
41 
42 
43 // *******************************************
44 // Language predicates
45 //   IsValidLanguage()
46 //   IS_LANGUAGE_UNKNOWN()
47 //   IsCJKLanguage()
48 //   IsChineseLanguage()
49 //   IsNorwegianLanguage()
50 //   IsPortugueseLanguage()
51 //   IsRightToLeftLanguage()
52 //   IsMaybeRightToLeftLanguage()
53 //   IsSameLanguage()
54 //   IsScriptRequiringLongerSnippets()
55 // *******************************************
56 
57 // IsValidLanguage
58 // ===============
59 //
60 // Function to check if the input is within range of the Language enum. If
61 // IsValidLanguage(lang) returns true, it is safe to call
62 // static_cast<Language>(lang).
63 //
IsValidLanguage(int lang)64 inline bool IsValidLanguage(int lang) {
65   return ((lang >= 0) && (lang < kNumLanguages));
66 }
67 
68 // Return true if the language is "unknown". (This function was
69 // previously a macro, hence the spelling in all caps.)
70 //
IS_LANGUAGE_UNKNOWN(Language lang)71 inline bool IS_LANGUAGE_UNKNOWN(Language lang) {
72   return lang == TG_UNKNOWN_LANGUAGE || lang == UNKNOWN_LANGUAGE;
73 }
74 
75 // IsCJKLanguage
76 // -------------
77 //
78 // This function returns true if the language is either Chinese
79 // (simplified or traditional), Japanese, or Korean.
80 bool IsCJKLanguage(Language lang);
81 
82 // IsChineseLanguage
83 // -----------------
84 //
85 // This function returns true if the language is either Chinese
86 // (simplified or traditional)
87 bool IsChineseLanguage(Language lang);
88 
89 // IsNorwegianLanguage
90 // -------------------
91 //
92 // This function returns true if the language is any of the Norwegian
93 // languages (regular or Nynorsk).
94 bool IsNorwegianLanguage(Language lang);
95 
96 // IsPortugueseLanguage
97 // --------------------
98 //
99 // This function returns true if the language is any of the Portuguese
100 // languages (regular, Portugal or Brazil)
101 bool IsPortugueseLanguage(Language lang);
102 
103 // IsSameLanguage
104 // --------------
105 //
106 // WARNING: This function provides only a simple test on the values of
107 // the two Language arguments. It returns false if either language is
108 // invalid. It returns true if the language arguments are equal, or
109 // if they are both Chinese languages, both Norwegian languages, or
110 // both Portuguese languages, as defined by IsChineseLanguage,
111 // IsNorwegianLanguage, and IsPortugueseLanguage. Otherwise it returns
112 // false.
113 bool IsSameLanguage(Language lang1, Language lang2);
114 
115 
116 // IsRightToLeftLanguage
117 // ---------------------
118 //
119 // This function returns true if the language is only written right-to-left
120 // (E.g., Hebrew, Arabic, Persian etc.)
121 //
122 // IMPORTANT NOTE: Technically we're talking about scripts, not languages.
123 // There are languages that can be written in more than one script.
124 // Examples:
125 //   - Kurdish and Azeri ('AZERBAIJANI') can be written left-to-right in
126 //     Latin or Cyrillic script, and right-to-left in Arabic script.
127 //   - Sindhi and Punjabi are written in different scripts, depending on
128 //     region and dialect.
129 //   - Turkmen used an Arabic script historically, but not any more.
130 //   - Pashto and Uyghur can use Arabic script, but use a Roman script
131 //     on the Internet.
132 //   - Kashmiri and Urdu are written either with Arabic or Devanagari script.
133 //
134 // This function only returns true for languages that are always, unequivocally
135 // written in right-to-left script.
136 //
137 // TODO: If we want to do anything special with multi-script languages
138 // we should create new 'languages' for each language+script, as we do for
139 // traditional vs. simplified Chinese. However most such languages are rare in
140 // use and even rarer on the web, so this is unlikely to be something we'll
141 // be concerned with for a while.
142 bool IsRightToLeftLanguage(Language lang);
143 
144 // IsMaybeRightToLeftLanguage
145 // --------------------------
146 //
147 // This function returns true if the language may appear on the web in a
148 // right-to-left script (E.g., Hebrew, Arabic, Persian, Urdu, Kurdish, etc.)
149 //
150 // NOTE: See important notes under IsRightToLeftLanguage(...).
151 //
152 // This function returns true for languages that *may* appear on the web in a
153 // right-to-left script, even if they may also appear in a left-to-right
154 // script.
155 //
156 // This function should typically be used in cases where doing some work on
157 // left-to-right text would be OK (usually a no-op), and this function is used
158 // just to cut down on unnecessary work on regular, LTR text.
159 bool IsMaybeRightToLeftLanguage(Language lang);
160 
161 // IsScriptRequiringLongerSnippets
162 // -------------------------------
163 //
164 // This function returns true if the script characteristics require longer
165 // snippet length (Devanagari, Bengali, Gurmukhi,
166 // Gujarati, Oriya, Tamil, Telugu, Kannada, Malayalam).
167 // COMMENTED OUT TO REDUCE DEPENDENCIES ON GOOGLE3 CODE
168 // bool IsScriptRequiringLongerSnippets(UnicodeScript script);
169 
170 
171 // *******************************************
172 // LANGUAGE NAMES
173 //
174 // This interface defines a standard name for each valid Language,
175 // and a standard name for invalid languages. Some language names use all
176 // uppercase letters, but others use mixed case.
177 //   LanguageName() [Language to name]
178 //   LanguageEnumName() [language to enum name]
179 //   LanguageFromName() [name to Language]
180 //   default_language_name()
181 //   invalid_language_name()
182 // *******************************************
183 
184 // Given a Language, returns its standard name.
185 // Return invalid_language_name() if the language is invalid.
186 const char* LanguageName(Language lang);
187 
188 // Given a Language, return the name of the enum constant for that
189 // language. In all but a few cases, this is the same as its standard
190 // name. For example, LanguageName(CHINESE) returns "Chinese", but
191 // LanguageEnumName(CHINESE) returns "CHINESE". This is intended for
192 // code that is generating C++ code, where the enum constant is more
193 // useful than its integer value.  Return "NUM_LANGUAGES" if
194 // the language is invalid.
195 const char* LanguageEnumName(Language lang);
196 
197 // The maximum length of a standard language name.
198 const int kMaxLanguageNameSize = 50;
199 
200 // The standard name for the default language.
201 const char* default_language_name();
202 
203 // The standard name for all invalid languages.
204 const char* invalid_language_name();
205 
206 // If lang_name matches the standard name of a Language, using a
207 // case-insensitive comparison, set *language to that Language and
208 // return true.
209 // Otherwise, set *language to UNKNOWN_LANGUAGE and return false.
210 //
211 // For backwards compatibility, "HATIAN_CREOLE" is allowed as a name
212 // for HAITIAN_CREOLE, and "QUECHAU" is allowed as a name for QUECHUA.
213 // For compatibility with LanguageEnumName, "UNKNOWN_LANGUAGE" is allowed
214 // as a name for UNKNOWN_LANGUAGE (the return value is true in this case,
215 // as it is for "Unknown"), and "CHINESE_T" is allowed as a name for
216 // CHINESE_T (i.e., a synonym for "ChineseT").
217 //
218 // REQUIRES: language must not be NULL.
219 //
220 bool LanguageFromName(const char* lang_name, Language *language);
221 
222 
223 
224 // *******************************************
225 // LANGUAGE CODES
226 //
227 // This interface defines a standard code for each valid language, and
228 // a standard code for invalid languages. These are derived from ISO codes,
229 // with some Google additions.
230 //   LanguageCode()
231 //   default_language_code()
232 //   invalid_language_code()
233 //   LanguageCodeWithDialects()
234 //   LanguageCodeISO639_1()
235 //   LanguageCodeISO639_2()
236 // *******************************************
237 
238 // Given a Language, return its standard code. There are Google-specific codes:
239 //     For CHINESE_T, return "zh-TW".
240 //     For TG_UNKNOWN_LANGUAGE, return "ut".
241 //     For UNKNOWN_LANGUAGE, return "un".
242 //     For PORTUGUESE_P, return "pt-PT".
243 //     For PORTUGUESE_B, return "pt-BR".
244 //     For LIMBU, return "sit-NP".
245 //     For CHEROKEE, return "chr".
246 //     For SYRIAC, return "syr".
247 // Otherwise return the ISO 639-1 two-letter language code for lang.
248 // If lang is invalid, return invalid_language_code().
249 //
250 // NOTE: See the note below about the codes for Chinese languages.
251 //
252 const char* LanguageCode(Language lang);
253 
254 // The maximum length of a language code.
255 const int kMaxLanguageCodeSize = 50;
256 
257 // The standard code for the default language.
258 const char* default_language_code();
259 
260 // The standard code for all invalid languages.
261 const char* invalid_language_code();
262 
263 
264 // --------------------------------------------
265 // NOTE: CHINESE LANGUAGE CODES
266 //
267 // There are three functions that return codes for Chinese languages.
268 // LanguageCode(lang) and LanguageCodeWithDialects(lang) are defined here.
269 // LanguageCode(lang, encoding) is defined in i18n/encodings.lang_enc.h.
270 // The following list shows the different results.
271 //
272 // LanguageCode(CHINESE) returns "zh"
273 // LanguageCode(CHINESE_T) returns "zh-TW".
274 //
275 // LanguageCodeWithDialects(CHINESE) returns "zh-CN".
276 // LanguageCodeWithDialects(CHINESE_T) returns "zh-TW".
277 //
278 // LanguageCode(CHINESE_T, <any encoding>) returns "zh-TW".
279 // LanguageCode(CHINESE, CHINESE_BIG5) returns "zh-TW".
280 // LanguageCode(CHINESE, <any other encoding>) returns "zh-CN".
281 //
282 // --------------------------------------------
283 
284 // LanguageCodeWithDialects
285 // ------------------------
286 //
287 // If lang is CHINESE, return "zh-CN". Otherwise return LanguageCode(lang).
288 const char* LanguageCodeWithDialects(Language lang);
289 
290 // LanguageCodeISO639_1
291 // --------------------
292 //
293 // Return the ISO 639-1 two-letter language code for lang.
294 // Return invalid_language_code() if lang is invalid or does not have
295 // an ISO 639-1 two-letter language code.
296 const char* LanguageCodeISO639_1(Language lang);
297 
298 // LanguageCodeISO639_2
299 // --------------------
300 //
301 // Return the ISO 639-2 three-letter language code for lang.
302 // Return invalid_language_code() if lang is invalid or does not have
303 // an ISO 639-2 three-letter language code.
304 const char* LanguageCodeISO639_2(Language lang);
305 
306 // LanguageFromCode
307 // ----------------
308 //
309 // If lang_code matches the code for a Language, using a case-insensitive
310 // comparison, set *language to that Language and return true.
311 // Otherwise, set *language to UNKNOWN_LANGUAGE and return false.
312 //
313 // lang_code can be an ISO 639-1 (two-letter) code, an ISO 639-2
314 // (three-letter) code, or a Google-specific code (see LanguageCode).
315 //
316 // Certain language-code aliases are also allowed:
317 //   For "zh-cn" and "zh_cn", set *language to CHINESE.
318 //   For "zh-tw" and "zh_tw", set *language to CHINESE_T.
319 //   For "he", set *language to HEBREW.
320 //   For "in", set *language to INDONESIAN.
321 //   For "ji", set *language to YIDDISH.
322 //   For "fil", set *language to TAGALOG.
323 //
324 // REQUIRES: 'language' must not be NULL.
325 bool LanguageFromCode(const char* lang_code, Language *language);
326 
327 
328 // LanguageFromCodeOrName
329 // ----------------------
330 //
331 // If lang_code_or_name is a language code or a language name,
332 // set *language to the corresponding Language and return true.
333 // Otherwise set *language to UNKNOWN_LANGUAGE and return false.
334 //
335 bool LanguageFromCodeOrName(const char* lang_code_or_name,
336                             Language* language);
337 
338 // LanguageNameFromCode
339 // --------------------
340 //
341 // If language_code is the code for a Language (see LanguageFromCode),
342 // return the standard name of that language (see LanguageName).
343 // Otherwise return invalid_language_name().
344 //
345 const char* LanguageNameFromCode(const char* language_code);
346 
347 
348 // Miscellany
349 
350 // LanguageCodeToUnderscoreForm
351 // ----------------------------
352 //
353 // Given a language code, convert the dash "-" to underscore "_".
354 //
355 // Specifically, if result_length <= strlen(lang_code), set result[0]
356 // to '\0' and return false. Otherwise, copy lang_code to result,
357 // converting every dash to an underscore, converting every character
358 // before the first dash or underscore to lower case, and converting
359 // every character after the first dash or underscore to upper
360 // case. If there is no dash or underscore, convert the entire string
361 // to lower case.
362 //
363 // REQUIRES: 'lang_code' must not be NULL. 'result' must not be NULL.
364 
365 bool LanguageCodeToUnderscoreForm(const char* lang_code,
366                                   char* result,
367                                   int result_length);
368 
369 //
370 // AlwaysPutInExpectedRestrict
371 // ---------------------------
372 //
373 // For Web pages in certain top-level domains, Web Search always
374 // applies a "country restrict". If 'tld' matches one of those, using
375 // a case-SENSITIVE comparison, set *expected_language to the Language
376 // most commonly found in that top-level domain and return true.
377 // Otherwise, set *expected_language to UNKNOWN_LANGUAGE and return false.
378 bool AlwaysPutInExpectedRestrict(const char *tld, Language *expected_language);
379 
380 
381 #endif  // UTIL_LANGUAGES_LANGUAGES_H_
382