"""Internationalization and localization support.

This module provides internationalization (I18N) and localization (L10N)
support for your Python programs by providing an interface to the GNU gettext
message catalog library.

I18N refers to the operation by which a program is made aware of multiple
languages.  L10N refers to the adaptation of your program, once
internationalized, to the local language and cultural habits.

"""

# This module represents the integration of work, contributions, feedback, and
# suggestions from the following people:
#
# Martin von Loewis, who wrote the initial implementation of the underlying
# C-based libintlmodule (later renamed _gettext), along with a skeletal
# gettext.py implementation.
#
# Peter Funk, who wrote fintl.py, a fairly complete wrapper around intlmodule,
# which also included a pure-Python implementation to read .mo files if
# intlmodule wasn't available.
#
# James Henstridge, who also wrote a gettext.py module, which has some
# interesting, but currently unsupported experimental features: the notion of
# a Catalog class and instances, and the ability to add to a catalog file via
# a Python API.
#
# Barry Warsaw integrated these modules, wrote the .install() API and code,
# and conformed all C and Python code to Python's coding standards.
#
# Francois Pinard and Marc-Andre Lemburg also contributed valuably to this
# module.
#
# J. David Ibanez implemented plural forms.  Bruno Haible fixed some bugs.
#
# TODO:
# - Lazy loading of .mo files.  Currently the entire catalog is loaded into
#   memory, but that's probably bad for large translated programs.  Instead,
#   the lexical sort of original strings in GNU .mo files should be exploited
#   to do binary searches and lazy initializations.  Or you might want to use
#   the undocumented double-hash algorithm for .mo files with hash tables, but
#   you'll need to study the GNU gettext code to do this.
#
# - Support Solaris .mo file formats.  Unfortunately, we've been unable to
#   find this format documented anywhere.


import os
import re
import sys


__all__ = ['NullTranslations', 'GNUTranslations', 'Catalog',
           'bindtextdomain', 'find', 'translation', 'install',
           'textdomain', 'dgettext', 'dngettext', 'gettext',
           'ngettext', 'pgettext', 'dpgettext', 'npgettext',
           'dnpgettext'
           ]

_default_localedir = os.path.join(sys.base_prefix, 'share', 'locale')

# Expression parsing for plural form selection.
#
# The gettext library supports a small subset of C syntax.  The only
# incompatible difference is that integer literals starting with zero are
# decimal.
#
# https://www.gnu.org/software/gettext/manual/gettext.html#Plural-forms
# http://git.savannah.gnu.org/cgit/gettext.git/tree/gettext-runtime/intl/plural.y

_token_pattern = re.compile(r"""
        (?P<WHITESPACES>[ \t]+)                    | # spaces and horizontal tabs
        (?P<NUMBER>[0-9]+\b)                       | # decimal integer
        (?P<NAME>n\b)                              | # only n is allowed
        (?P<PARENTHESIS>[()])                      |
        (?P<OPERATOR>[-*/%+?:]|[><!]=?|==|&&|\|\|) | # !, *, /, %, +, -, <, >,
                                                     # <=, >=, ==, !=, &&, ||,
                                                     # ? :
                                                     # unary and bitwise ops
                                                     # not allowed
        (?P<INVALID>\w+|.)                           # invalid token
    """, re.VERBOSE|re.DOTALL)


def _tokenize(plural):
    """Yield the tokens of the plural expression, then a final '' sentinel.

    Whitespace is skipped.  Raises ValueError on any token that the
    supported C subset does not allow.
    """
    for mo in re.finditer(_token_pattern, plural):
        kind = mo.lastgroup
        if kind == 'WHITESPACES':
            continue
        value = mo.group(kind)
        if kind == 'INVALID':
            raise ValueError('invalid token in plural form: %s' % value)
        yield value
    # The empty string marks the end of input for _parse().
    yield ''


def _error(value):
    """Return (not raise) the ValueError describing an unexpected token.

    An empty *value* is the end-of-input sentinel produced by _tokenize().
    """
    if value:
        return ValueError('unexpected token in plural form: %s' % value)
    else:
        return ValueError('unexpected end of plural form')


# Binary operators grouped by precedence, lowest first.  The tuple is then
# replaced by a mapping from operator to its 1-based precedence level.
_binary_ops = (
    ('||',),
    ('&&',),
    ('==', '!='),
    ('<', '>', '<=', '>='),
    ('+', '-'),
    ('*', '/', '%'),
)
_binary_ops = {op: i for i, ops in enumerate(_binary_ops, 1) for op in ops}
# C operators whose Python spelling differs.
_c2py_ops = {'||': 'or', '&&': 'and', '/': '//'}


def _parse(tokens, priority=-1):
    """Translate a token stream into Python expression source.

    *tokens* is the iterator produced by _tokenize(); *priority* is the
    lowest binary-operator precedence level this call is allowed to consume.
    Returns a ``(python_source, next_token)`` pair, where *next_token* is the
    first token that was not consumed.  Raises ValueError on malformed input.
    """
    result = ''
    nexttok = next(tokens)
    while nexttok == '!':
        result += 'not '
        nexttok = next(tokens)

    if nexttok == '(':
        sub, nexttok = _parse(tokens)
        result = '%s(%s)' % (result, sub)
        if nexttok != ')':
            raise ValueError('unbalanced parenthesis in plural form')
    elif nexttok == 'n':
        result = '%s%s' % (result, nexttok)
    else:
        try:
            # Base 10 on purpose: literals with a leading zero are decimal.
            value = int(nexttok, 10)
        except ValueError:
            raise _error(nexttok) from None
        result = '%s%d' % (result, value)
    nexttok = next(tokens)

    # j is the precedence level of the previously applied operator; 100 is
    # a level no real operator has.
    j = 100
    while nexttok in _binary_ops:
        i = _binary_ops[nexttok]
        if i < priority:
            break
        # Break chained comparisons
        if i in (3, 4) and j in (3, 4):  # '==', '!=', '<', '>', '<=', '>='
            result = '(%s)' % result
        # Replace some C operators by their Python equivalents
        op = _c2py_ops.get(nexttok, nexttok)
        right, nexttok = _parse(tokens, i + 1)
        result = '%s %s %s' % (result, op, right)
        j = i
    if j == priority == 4:  # '<', '>', '<=', '>='
        result = '(%s)' % result

    if nexttok == '?' and priority <= 0:
        if_true, nexttok = _parse(tokens, 0)
        if nexttok != ':':
            raise _error(nexttok)
        if_false, nexttok = _parse(tokens)
        result = '%s if %s else %s' % (if_true, result, if_false)
        if priority == 0:
            result = '(%s)' % result

    return result, nexttok


def _as_int(n):
    """Check that the non-int plural value *n* can be rounded, and return it.

    Raises TypeError when round(n) is not supported; otherwise emits a
    DeprecationWarning and returns *n* itself unchanged.
    """
    try:
        # Only used as a probe; the rounded value is deliberately discarded.
        round(n)
    except TypeError:
        raise TypeError('Plural value must be an integer, got %s' %
                        (n.__class__.__name__,)) from None
    import warnings
    warnings.warn('Plural value must be an integer, got %s' %
                  (n.__class__.__name__,),
                  DeprecationWarning, 4)
    return n


def c2py(plural):
    """Gets a C expression as used in PO files for plural forms and returns a
    Python function that implements an equivalent expression.

    Raises ValueError if the expression is longer than 1000 characters,
    cannot be parsed, or nests parentheses more than 20 levels deep.
    """

    if len(plural) > 1000:
        raise ValueError('plural form expression is too long')
    try:
        result, nexttok = _parse(_tokenize(plural))
        if nexttok:
            raise _error(nexttok)

        depth = 0
        for c in result:
            if c == '(':
                depth += 1
                if depth > 20:
                    # Python compiler limit is about 90.
                    # The most complex example has 2.
                    raise ValueError('plural form expression is too complex')
            elif c == ')':
                depth -= 1

        ns = {'_as_int': _as_int}
        # `result` only contains text generated by _parse() from validated
        # tokens (n, decimal integers, operators and parentheses).
        exec('''if True:
            def func(n):
                if not isinstance(n, int):
                    n = _as_int(n)
                return int(%s)
            ''' % result, ns)
        return ns['func']
    except RecursionError:
        # Recursion error can be raised in _parse() or exec().
        raise ValueError('plural form expression is too complex')


def _expand_lang(loc):
    """Return the variants of the normalized locale name *loc*.

    The name is split into language, territory ('_XX'), codeset ('.enc') and
    modifier ('@mod').  Every combination of the components that are present
    is returned, starting with the most complete one and ending with the
    bare language.
    """
    import locale
    loc = locale.normalize(loc)
    COMPONENT_CODESET   = 1 << 0
    COMPONENT_TERRITORY = 1 << 1
    COMPONENT_MODIFIER  = 1 << 2
    # split up the locale into its base components
    mask = 0
    pos = loc.find('@')
    if pos >= 0:
        modifier = loc[pos:]
        loc = loc[:pos]
        mask |= COMPONENT_MODIFIER
    else:
        modifier = ''
    pos = loc.find('.')
    if pos >= 0:
        codeset = loc[pos:]
        loc = loc[:pos]
        mask |= COMPONENT_CODESET
    else:
        codeset = ''
    pos = loc.find('_')
    if pos >= 0:
        territory = loc[pos:]
        loc = loc[:pos]
        mask |= COMPONENT_TERRITORY
    else:
        territory = ''
    language = loc
    ret = []
    for i in range(mask+1):
        if not (i & ~mask):  # if all components for this combo exist ...
            val = language
            if i & COMPONENT_TERRITORY: val += territory
            if i & COMPONENT_CODESET:   val += codeset
            if i & COMPONENT_MODIFIER:  val += modifier
            ret.append(val)
    ret.reverse()
    return ret


class NullTranslations:
    """Translations object that returns messages untranslated.

    Lookups are delegated to the fallback translations object when one has
    been added with add_fallback().
    """

    def __init__(self, fp=None):
        """Initialize the object and, if *fp* is given, call _parse(fp)."""
        self._info = {}
        self._charset = None
        self._fallback = None
        if fp is not None:
            self._parse(fp)

    def _parse(self, fp):
        """Do nothing; subclasses override this to read a catalog file."""
        pass

    def add_fallback(self, fallback):
        """Append *fallback* to the end of the fallback chain."""
        if self._fallback:
            self._fallback.add_fallback(fallback)
        else:
            self._fallback = fallback

    def gettext(self, message):
        """Return the fallback's translation, or *message* itself."""
        if self._fallback:
            return self._fallback.gettext(message)
        return message

    def ngettext(self, msgid1, msgid2, n):
        """Return the fallback's plural translation, or *msgid1* if
        n == 1 and *msgid2* otherwise."""
        if self._fallback:
            return self._fallback.ngettext(msgid1, msgid2, n)
        if n == 1:
            return msgid1
        else:
            return msgid2

    def pgettext(self, context, message):
        """Return the fallback's contextual translation, or *message*."""
        if self._fallback:
            return self._fallback.pgettext(context, message)
        return message

    def npgettext(self, context, msgid1, msgid2, n):
        """Return the fallback's contextual plural translation, or *msgid1*
        if n == 1 and *msgid2* otherwise."""
        if self._fallback:
            return self._fallback.npgettext(context, msgid1, msgid2, n)
        if n == 1:
            return msgid1
        else:
            return msgid2

    def info(self):
        """Return the dictionary of catalog metadata."""
        return self._info

    def charset(self):
        """Return the catalog's character set, or None if it is unknown."""
        return self._charset

    def install(self, names=None):
        """Bind self.gettext to ``_`` in the builtins namespace.

        *names*, if given, is an iterable of additional method names to
        install; only 'gettext', 'ngettext', 'npgettext' and 'pgettext' are
        honoured, anything else is ignored.
        """
        import builtins
        builtins.__dict__['_'] = self.gettext
        if names is not None:
            allowed = {'gettext', 'ngettext', 'npgettext', 'pgettext'}
            for name in allowed & set(names):
                builtins.__dict__[name] = getattr(self, name)


class GNUTranslations(NullTranslations):
    """Translations object backed by a GNU .mo catalog file."""

    # Magic number of .mo files
    LE_MAGIC = 0x950412de
    BE_MAGIC = 0xde120495

    # The encoding of a msgctxt and a msgid in a .mo file is
    # msgctxt + "\x04" + msgid (gettext version >= 0.15)
    CONTEXT = "%s\x04%s"

    # Acceptable .mo versions
    VERSIONS = (0, 1)

    def _get_versions(self, version):
        """Returns a tuple of major version, minor version"""
        return (version >> 16, version & 0xffff)

    def _parse(self, fp):
        """Override this method to support alternative .mo formats.

        Reads all of *fp* (a binary file object) and fills self._catalog,
        self._info, self._charset and self.plural from it.  Raises OSError
        if the magic number or major version is not recognised, or if the
        file is truncated or otherwise corrupt.
        """
        # Delay struct import for speeding up gettext import when .mo files
        # are not used.
        from struct import error as struct_error, unpack
        filename = getattr(fp, 'name', '')
        # Parse the .mo file header, which consists of 5 little endian 32
        # bit words.
        self._catalog = catalog = {}
        self.plural = lambda n: int(n != 1)  # germanic plural by default
        buf = fp.read()
        buflen = len(buf)
        # Are we big endian or little endian?
        try:
            magic = unpack('<I', buf[:4])[0]
            if magic == self.LE_MAGIC:
                version, msgcount, masteridx, transidx = unpack('<4I', buf[4:20])
                ii = '<II'
            elif magic == self.BE_MAGIC:
                version, msgcount, masteridx, transidx = unpack('>4I', buf[4:20])
                ii = '>II'
            else:
                raise OSError(0, 'Bad magic number', filename)
        except struct_error:
            # A header shorter than 20 bytes: report it like any other
            # corruption so that callers catching OSError can fall back.
            raise OSError(0, 'File is corrupt', filename) from None

        major_version, minor_version = self._get_versions(version)

        if major_version not in self.VERSIONS:
            raise OSError(0, 'Bad version number ' + str(major_version), filename)

        # Now put all messages from the .mo file buffer into the catalog
        # dictionary.
        for _ in range(msgcount):
            try:
                mlen, moff = unpack(ii, buf[masteridx:masteridx+8])
                tlen, toff = unpack(ii, buf[transidx:transidx+8])
            except struct_error:
                # The index tables run past the end of the file.
                raise OSError(0, 'File is corrupt', filename) from None
            mend = moff + mlen
            tend = toff + tlen
            if mend < buflen and tend < buflen:
                msg = buf[moff:mend]
                tmsg = buf[toff:tend]
            else:
                raise OSError(0, 'File is corrupt', filename)
            # See if we're looking at GNU .mo conventions for metadata
            if mlen == 0:
                # Catalog description
                lastk = None
                for b_item in tmsg.split(b'\n'):
                    item = b_item.decode().strip()
                    if not item:
                        continue
                    # Skip over comment lines:
                    if item.startswith('#-#-#-#-#') and item.endswith('#-#-#-#-#'):
                        continue
                    k = v = None
                    if ':' in item:
                        k, v = item.split(':', 1)
                        k = k.strip().lower()
                        v = v.strip()
                        self._info[k] = v
                        lastk = k
                    elif lastk:
                        # Continuation line: append to the previous header.
                        self._info[lastk] += '\n' + item
                    if k == 'content-type':
                        self._charset = v.split('charset=')[1]
                    elif k == 'plural-forms':
                        v = v.split(';')
                        plural = v[1].split('plural=')[1]
                        self.plural = c2py(plural)
            # Note: we unconditionally convert both msgids and msgstrs to
            # Unicode using the character encoding specified in the charset
            # parameter of the Content-Type header.  The gettext documentation
            # strongly encourages msgids to be us-ascii, but some applications
            # require alternative encodings (e.g. Zope's ZCML and ZPT).  For
            # traditional gettext applications, the msgid conversion will
            # cause no problems since us-ascii should always be a subset of
            # the charset encoding.  We may want to fall back to 8-bit msgids
            # if the Unicode conversion fails.
            charset = self._charset or 'ascii'
            if b'\x00' in msg:
                # Plural forms
                msgid1, msgid2 = msg.split(b'\x00')
                tmsg = tmsg.split(b'\x00')
                msgid1 = str(msgid1, charset)
                for form, x in enumerate(tmsg):
                    catalog[(msgid1, form)] = str(x, charset)
            else:
                catalog[str(msg, charset)] = str(tmsg, charset)
            # advance to next entry in the seek tables
            masteridx += 8
            transidx += 8

    def gettext(self, message):
        """Return the translation of *message*; if it is not in the catalog,
        defer to the fallback or return *message* itself."""
        # A private sentinel, so that an empty translation counts as found.
        missing = object()
        tmsg = self._catalog.get(message, missing)
        if tmsg is missing:
            if self._fallback:
                return self._fallback.gettext(message)
            return message
        return tmsg

    def ngettext(self, msgid1, msgid2, n):
        """Return the plural form of *msgid1* selected by self.plural(n); if
        it is not in the catalog, defer to the fallback or return *msgid1*
        when n == 1 and *msgid2* otherwise."""
        try:
            tmsg = self._catalog[(msgid1, self.plural(n))]
        except KeyError:
            if self._fallback:
                return self._fallback.ngettext(msgid1, msgid2, n)
            if n == 1:
                tmsg = msgid1
            else:
                tmsg = msgid2
        return tmsg

    def pgettext(self, context, message):
        """Like gettext(), but look *message* up under *context*."""
        ctxt_msg_id = self.CONTEXT % (context, message)
        missing = object()
        tmsg = self._catalog.get(ctxt_msg_id, missing)
        if tmsg is missing:
            if self._fallback:
                return self._fallback.pgettext(context, message)
            return message
        return tmsg

    def npgettext(self, context, msgid1, msgid2, n):
        """Like ngettext(), but look *msgid1* up under *context*."""
        ctxt_msg_id = self.CONTEXT % (context, msgid1)
        try:
            tmsg = self._catalog[ctxt_msg_id, self.plural(n)]
        except KeyError:
            if self._fallback:
                return self._fallback.npgettext(context, msgid1, msgid2, n)
            if n == 1:
                tmsg = msgid1
            else:
                tmsg = msgid2
        return tmsg


# Locate a .mo file using the gettext strategy
def find(domain, localedir=None, languages=None, all=False):
    """Locate the .mo file(s) for *domain*.

    *localedir* defaults to the module's default locale directory.  When
    *languages* is None it is taken from the first non-empty variable among
    LANGUAGE, LC_ALL, LC_MESSAGES and LANG (colon-separated), with 'C'
    appended if absent.  Returns the first existing file path, or None; if
    *all* is true, returns the list of every existing path instead.
    """
    # Get some reasonable defaults for arguments that were not supplied
    if localedir is None:
        localedir = _default_localedir
    if languages is None:
        languages = []
        for envar in ('LANGUAGE', 'LC_ALL', 'LC_MESSAGES', 'LANG'):
            val = os.environ.get(envar)
            if val:
                languages = val.split(':')
                break
        if 'C' not in languages:
            languages.append('C')
    # now normalize and expand the languages
    nelangs = []
    for lang in languages:
        for nelang in _expand_lang(lang):
            if nelang not in nelangs:
                nelangs.append(nelang)
    # select a language
    if all:
        result = []
    else:
        result = None
    for lang in nelangs:
        # 'C' means "no translation": stop searching at that point.
        if lang == 'C':
            break
        mofile = os.path.join(localedir, lang, 'LC_MESSAGES', '%s.mo' % domain)
        if os.path.exists(mofile):
            if all:
                result.append(mofile)
            else:
                return mofile
    return result


# a mapping between absolute .mo file path and Translation object
_translations = {}


def translation(domain, localedir=None, languages=None,
                class_=None, fallback=False):
    """Return a translations object for *domain*.

    Every .mo file located by find() is loaded with *class_* (default
    GNUTranslations); the first one becomes the result and the others are
    chained as its fallbacks.  If no file is found, return a
    NullTranslations instance when *fallback* is true, otherwise raise
    FileNotFoundError.
    """
    if class_ is None:
        class_ = GNUTranslations
    mofiles = find(domain, localedir, languages, all=True)
    if not mofiles:
        if fallback:
            return NullTranslations()
        from errno import ENOENT
        raise FileNotFoundError(ENOENT,
                                'No translation file found for domain', domain)
    # Delay copy import for speeding up gettext import when .mo files
    # are not used.
    import copy
    # Avoid opening, reading, and parsing the .mo file after it's been done
    # once.
    result = None
    for mofile in mofiles:
        key = (class_, os.path.abspath(mofile))
        t = _translations.get(key)
        if t is None:
            with open(mofile, 'rb') as fp:
                t = _translations.setdefault(key, class_(fp))
        # Copy the translation object to allow setting fallbacks and
        # output charset. All other instance data is shared with the
        # cached object.
        t = copy.copy(t)
        if result is None:
            result = t
        else:
            result.add_fallback(t)
    return result


def install(domain, localedir=None, *, names=None):
    """Install the translations for *domain* into the builtins namespace.

    Uses translation() with fallback enabled, so a missing catalog installs
    a NullTranslations object instead of raising.
    """
    t = translation(domain, localedir, fallback=True)
    t.install(names)


# a mapping b/w domains and locale directories
_localedirs = {}
# current global domain, `messages' used for compatibility w/ GNU gettext
_current_domain = 'messages'


def textdomain(domain=None):
    """Set the current global domain if *domain* is given; return it."""
    global _current_domain
    if domain is not None:
        _current_domain = domain
    return _current_domain


def bindtextdomain(domain, localedir=None):
    """Bind *domain* to *localedir* if given; return the directory bound to
    *domain*, or the default locale directory if there is none."""
    global _localedirs
    if localedir is not None:
        _localedirs[domain] = localedir
    return _localedirs.get(domain, _default_localedir)


def dgettext(domain, message):
    """Translate *message* in *domain*; return *message* unchanged if the
    catalog cannot be loaded (OSError)."""
    try:
        t = translation(domain, _localedirs.get(domain, None))
    except OSError:
        return message
    return t.gettext(message)


def dngettext(domain, msgid1, msgid2, n):
    """Plural-aware dgettext(); without a usable catalog, return *msgid1*
    if n == 1 and *msgid2* otherwise."""
    try:
        t = translation(domain, _localedirs.get(domain, None))
    except OSError:
        if n == 1:
            return msgid1
        else:
            return msgid2
    return t.ngettext(msgid1, msgid2, n)


def dpgettext(domain, context, message):
    """Context-aware dgettext(); return *message* unchanged if the catalog
    cannot be loaded (OSError)."""
    try:
        t = translation(domain, _localedirs.get(domain, None))
    except OSError:
        return message
    return t.pgettext(context, message)


def dnpgettext(domain, context, msgid1, msgid2, n):
    """Context- and plural-aware dgettext(); without a usable catalog,
    return *msgid1* if n == 1 and *msgid2* otherwise."""
    try:
        t = translation(domain, _localedirs.get(domain, None))
    except OSError:
        if n == 1:
            return msgid1
        else:
            return msgid2
    return t.npgettext(context, msgid1, msgid2, n)


def gettext(message):
    """dgettext() in the current global domain."""
    return dgettext(_current_domain, message)


def ngettext(msgid1, msgid2, n):
    """dngettext() in the current global domain."""
    return dngettext(_current_domain, msgid1, msgid2, n)


def pgettext(context, message):
    """dpgettext() in the current global domain."""
    return dpgettext(_current_domain, context, message)


def npgettext(context, msgid1, msgid2, n):
    """dnpgettext() in the current global domain."""
    return dnpgettext(_current_domain, context, msgid1, msgid2, n)


# dcgettext() has been deemed unnecessary and is not implemented.

# James Henstridge's Catalog constructor from GNOME gettext.  Documented usage
# was:
#
#    import gettext
#    cat = gettext.Catalog(PACKAGE, localedir=LOCALEDIR)
#    _ = cat.gettext
#    print _('Hello World')

# The resulting catalog object currently doesn't support access through a
# dictionary API, which was supported (but apparently unused) in GNOME
# gettext.

Catalog = translation