1"""Internationalization and localization support.
2
3This module provides internationalization (I18N) and localization (L10N)
4support for your Python programs by providing an interface to the GNU gettext
5message catalog library.
6
7I18N refers to the operation by which a program is made aware of multiple
8languages.  L10N refers to the adaptation of your program, once
9internationalized, to the local language and cultural habits.
10
11"""
12
13# This module represents the integration of work, contributions, feedback, and
14# suggestions from the following people:
15#
16# Martin von Loewis, who wrote the initial implementation of the underlying
17# C-based libintlmodule (later renamed _gettext), along with a skeletal
18# gettext.py implementation.
19#
20# Peter Funk, who wrote fintl.py, a fairly complete wrapper around intlmodule,
21# which also included a pure-Python implementation to read .mo files if
22# intlmodule wasn't available.
23#
24# James Henstridge, who also wrote a gettext.py module, which has some
25# interesting, but currently unsupported experimental features: the notion of
26# a Catalog class and instances, and the ability to add to a catalog file via
27# a Python API.
28#
29# Barry Warsaw integrated these modules, wrote the .install() API and code,
30# and conformed all C and Python code to Python's coding standards.
31#
32# Francois Pinard and Marc-Andre Lemburg also contributed valuably to this
33# module.
34#
35# J. David Ibanez implemented plural forms. Bruno Haible fixed some bugs.
36#
37# TODO:
38# - Lazy loading of .mo files.  Currently the entire catalog is loaded into
39#   memory, but that's probably bad for large translated programs.  Instead,
40#   the lexical sort of original strings in GNU .mo files should be exploited
41#   to do binary searches and lazy initializations.  Or you might want to use
42#   the undocumented double-hash algorithm for .mo files with hash tables, but
43#   you'll need to study the GNU gettext code to do this.
44#
45# - Support Solaris .mo file formats.  Unfortunately, we've been unable to
46#   find this format documented anywhere.
47
48
49import os
50import re
51import sys
52
53
# Names exported by ``from gettext import *``.
__all__ = [
    'NullTranslations',
    'GNUTranslations',
    'Catalog',
    'bindtextdomain',
    'find',
    'translation',
    'install',
    'textdomain',
    'dgettext',
    'dngettext',
    'gettext',
    'ngettext',
    'pgettext',
    'dpgettext',
    'npgettext',
    'dnpgettext',
]

# Catalog directory used when a caller does not supply a localedir:
# <sys.base_prefix>/share/locale.
_default_localedir = os.path.join(sys.base_prefix, 'share', 'locale')
62
63# Expression parsing for plural form selection.
64#
65# The gettext library supports a small subset of C syntax.  The only
66# incompatible difference is that integer literals starting with zero are
67# decimal.
68#
69# https://www.gnu.org/software/gettext/manual/gettext.html#Plural-forms
70# http://git.savannah.gnu.org/cgit/gettext.git/tree/gettext-runtime/intl/plural.y
71
# One alternative per token class; the name of the group that matched tells
# _tokenize() what kind of token it is looking at.
_token_pattern = re.compile(r"""
        (?P<WHITESPACES>[ \t]+)                    | # spaces and horizontal tabs
        (?P<NUMBER>[0-9]+\b)                       | # decimal integer
        (?P<NAME>n\b)                              | # only n is allowed
        (?P<PARENTHESIS>[()])                      |
        (?P<OPERATOR>[-*/%+?:]|[><!]=?|==|&&|\|\|) | # !, *, /, %, +, -, <, >,
                                                     # <=, >=, ==, !=, &&, ||,
                                                     # ? :
                                                     # unary and bitwise ops
                                                     # not allowed
        (?P<INVALID>\w+|.)                           # invalid token
    """, re.VERBOSE|re.DOTALL)


def _tokenize(plural):
    """Yield the tokens of the plural expression *plural* as strings.

    Whitespace is dropped.  After the last real token an empty string is
    yielded to mark the end of input.  Raises ValueError as soon as text
    that is not a valid token is reached.
    """
    for match in _token_pattern.finditer(plural):
        token_type = match.lastgroup
        if token_type != 'WHITESPACES':
            token = match.group(token_type)
            if token_type == 'INVALID':
                raise ValueError('invalid token in plural form: %s' % token)
            yield token
    # End-of-input sentinel.
    yield ''
96
97
def _error(value):
    """Build (without raising) the ValueError for an unexpected token.

    A false *value* (the tokenizer's end-of-input marker is '') produces the
    "unexpected end" message; anything else is reported as the offending
    token.
    """
    if not value:
        return ValueError('unexpected end of plural form')
    return ValueError('unexpected token in plural form: %s' % value)
103
104
# Binary operators accepted in plural expressions, mapped to their precedence
# level.  Level 1 ('||') binds loosest, level 6 ('*', '/', '%') tightest.
_binary_ops = {
    operator: level
    for level, group in enumerate([
        ['||'],
        ['&&'],
        ['==', '!='],
        ['<', '>', '<=', '>='],
        ['+', '-'],
        ['*', '/', '%'],
    ], start=1)
    for operator in group
}
# C operators that are spelled differently in the generated Python source.
_c2py_ops = dict([('||', 'or'), ('&&', 'and'), ('/', '//')])
115
116
def _parse(tokens, priority=-1):
    """Translate one (sub)expression from *tokens* into Python source text.

    *tokens* is an iterator of token strings as produced by _tokenize().
    *priority* is the lowest binary-operator level this call may consume;
    an operator below that level is left for the caller.

    Returns ``(source, next_token)`` where *next_token* is the first token
    that was read but not consumed.  Raises ValueError for an unexpected
    token or an unbalanced parenthesis.
    """
    expr = ''
    tok = next(tokens)
    # Any number of leading '!' becomes a run of Python 'not '.
    while tok == '!':
        expr += 'not '
        tok = next(tokens)

    # Operand: a parenthesized subexpression, the variable n, or a number.
    if tok == '(':
        inner, tok = _parse(tokens)
        expr = '%s(%s)' % (expr, inner)
        if tok != ')':
            raise ValueError('unbalanced parenthesis in plural form')
    elif tok == 'n':
        expr = '%s%s' % (expr, tok)
    else:
        try:
            number = int(tok, 10)
        except ValueError:
            raise _error(tok) from None
        expr = '%s%d' % (expr, number)
    tok = next(tokens)

    # Level of the previously applied operator; 100 means "none yet".
    previous = 100
    while tok in _binary_ops:
        level = _binary_ops[tok]
        if level < priority:
            break
        # Break chained comparisons
        if level in (3, 4) and previous in (3, 4):  # '==', '!=', '<', '>', '<=', '>='
            expr = '(%s)' % expr
        # Replace some C operators by their Python equivalents
        python_op = _c2py_ops.get(tok, tok)
        operand, tok = _parse(tokens, level + 1)
        expr = '%s %s %s' % (expr, python_op, operand)
        previous = level
    if previous == priority == 4:  # '<', '>', '<=', '>='
        expr = '(%s)' % expr

    # Conditional operator, emitted as a Python conditional expression.
    if tok == '?' and priority <= 0:
        when_true, tok = _parse(tokens, 0)
        if tok != ':':
            raise _error(tok)
        when_false, tok = _parse(tokens)
        expr = '%s if %s else %s' % (when_true, expr, when_false)
        if priority == 0:
            expr = '(%s)' % expr

    return expr, tok
165
166
def _as_int(n):
    """Validate a non-int plural count, warning that it is deprecated.

    *n* is accepted when ``round(n)`` works; it is then returned unchanged
    after a DeprecationWarning has been issued.  Otherwise TypeError is
    raised.

    The warning is attributed to the first calling frame that does not
    belong to this module, so it points at the user's code no matter how
    many module-internal frames sit between the user and this function.
    """
    try:
        # Only used as a check that n supports rounding; the value is unused.
        round(n)
    except TypeError:
        raise TypeError('Plural value must be an integer, got %s' %
                        (n.__class__.__name__,)) from None
    import warnings
    # Count the frames between here and the first external caller.  Frames
    # whose globals carry no '__name__' are treated as internal: that is the
    # case for the plural function generated by c2py() via exec().
    frame = sys._getframe(1)
    stacklevel = 2
    while (frame.f_back is not None and
           frame.f_globals.get('__name__', __name__) == __name__):
        stacklevel += 1
        frame = frame.f_back
    warnings.warn('Plural value must be an integer, got %s' %
                  (n.__class__.__name__,),
                  DeprecationWarning, stacklevel)
    return n
178
179
def c2py(plural):
    """Gets a C expression as used in PO files for plural forms and returns a
    Python function that implements an equivalent expression.

    Raises ValueError when the expression is longer than 1000 characters,
    cannot be parsed, or nests parentheses more than 20 levels deep.
    """
    if len(plural) > 1000:
        raise ValueError('plural form expression is too long')
    try:
        expression, leftover = _parse(_tokenize(plural))
        # Anything left other than the end-of-input marker is an error.
        if leftover:
            raise _error(leftover)

        nesting = 0
        for char in expression:
            if char == ')':
                nesting -= 1
            elif char == '(':
                nesting += 1
                if nesting > 20:
                    # Python compiler limit is about 90.
                    # The most complex example has 2.
                    raise ValueError('plural form expression is too complex')

        namespace = {'_as_int': _as_int}
        source = '''if True:
            def func(n):
                if not isinstance(n, int):
                    n = _as_int(n)
                return int(%s)
            ''' % expression
        exec(source, namespace)
        return namespace['func']
    except RecursionError:
        # Recursion error can be raised in _parse() or exec().
        raise ValueError('plural form expression is too complex')
214
215
def _expand_lang(loc):
    """Return the variants of locale *loc*, most specific first.

    *loc* is normalized with locale.normalize() and split into language,
    territory ('_XX'), codeset ('.XX') and modifier ('@XX').  The result
    lists every combination of the language with a subset of the parts that
    are present, ending with the bare language.
    """
    import locale
    remainder = locale.normalize(loc)
    COMPONENT_CODESET   = 1 << 0
    COMPONENT_TERRITORY = 1 << 1
    COMPONENT_MODIFIER  = 1 << 2

    # Peel the optional parts off from the right.  Each part keeps its
    # leading separator so the pieces can simply be concatenated again; a
    # missing part is the empty string.
    present = 0
    parts = {}
    for flag, separator in ((COMPONENT_MODIFIER, '@'),
                            (COMPONENT_CODESET, '.'),
                            (COMPONENT_TERRITORY, '_')):
        remainder, found, tail = remainder.partition(separator)
        parts[flag] = found + tail
        if found:
            present |= flag
    language = remainder

    variants = []
    for combo in range(present + 1):
        if combo & ~present:
            # This combination needs a part the locale does not have.
            continue
        name = language
        if combo & COMPONENT_TERRITORY:
            name += parts[COMPONENT_TERRITORY]
        if combo & COMPONENT_CODESET:
            name += parts[COMPONENT_CODESET]
        if combo & COMPONENT_MODIFIER:
            name += parts[COMPONENT_MODIFIER]
        variants.append(name)
    # Built from least to most specific; callers want the opposite order.
    variants.reverse()
    return variants
256
257
class NullTranslations:
    """Translations object that returns messages untranslated.

    Every lookup is delegated to the fallback object when one has been
    added; otherwise the original message (or the singular/plural msgid
    selected by ``n == 1``) is returned.  Subclasses override _parse() to
    read a catalog from the file object given to the constructor.
    """

    def __init__(self, fp=None):
        """Initialize empty state and parse *fp* when it is not None."""
        self._info = {}
        self._charset = None
        self._fallback = None
        if fp is not None:
            self._parse(fp)

    def _parse(self, fp):
        """Hook for subclasses; the base class reads nothing from *fp*."""
        pass

    def add_fallback(self, fallback):
        """Append *fallback* to the end of the fallback chain."""
        if not self._fallback:
            self._fallback = fallback
            return
        self._fallback.add_fallback(fallback)

    def gettext(self, message):
        """Return the fallback's translation of *message*, or *message*."""
        fallback = self._fallback
        return fallback.gettext(message) if fallback else message

    def ngettext(self, msgid1, msgid2, n):
        """Return the fallback's plural lookup, or msgid1/msgid2 by *n*."""
        fallback = self._fallback
        if fallback:
            return fallback.ngettext(msgid1, msgid2, n)
        return msgid1 if n == 1 else msgid2

    def pgettext(self, context, message):
        """Like gettext(), with a message *context* passed to the fallback."""
        fallback = self._fallback
        return fallback.pgettext(context, message) if fallback else message

    def npgettext(self, context, msgid1, msgid2, n):
        """Like ngettext(), with a message *context* passed to the fallback."""
        fallback = self._fallback
        if fallback:
            return fallback.npgettext(context, msgid1, msgid2, n)
        return msgid1 if n == 1 else msgid2

    def info(self):
        """Return the dictionary of catalog metadata."""
        return self._info

    def charset(self):
        """Return the catalog's charset, or None when it is not known."""
        return self._charset

    def install(self, names=None):
        """Bind self.gettext to ``_`` in the builtins namespace.

        When *names* is given, each of 'gettext', 'ngettext', 'npgettext'
        and 'pgettext' that it contains is installed in builtins as well;
        any other name in it is ignored.
        """
        import builtins
        namespace = builtins.__dict__
        namespace['_'] = self.gettext
        if names is None:
            return
        allowed = {'gettext', 'ngettext', 'npgettext', 'pgettext'}
        for name in allowed.intersection(names):
            namespace[name] = getattr(self, name)
314
315
class GNUTranslations(NullTranslations):
    """Translations read from a GNU gettext binary catalog (.mo file).

    Parsing fills ``self._catalog``, a dict that maps a message id to its
    translation; plural entries are stored under ``(msgid1, index)`` keys.
    ``self.plural`` is the callable that maps a count to such an index.
    A lookup that misses the catalog goes to the fallback when there is one
    and otherwise returns the untranslated message.
    """

    # Magic number of .mo files
    LE_MAGIC = 0x950412de
    BE_MAGIC = 0xde120495

    # The encoding of a msgctxt and a msgid in a .mo file is
    # msgctxt + "\x04" + msgid (gettext version >= 0.15)
    CONTEXT = "%s\x04%s"

    # Acceptable .mo versions
    VERSIONS = (0, 1)

    def _get_versions(self, version):
        """Returns a tuple of major version, minor version"""
        return (version >> 16, version & 0xffff)

    def _parse(self, fp):
        """Override this method to support alternative .mo formats.

        Reads all of *fp* (a binary file object) and loads every message
        into ``self._catalog``.  The entry with an empty msgid is treated as
        the catalog header: its ``key: value`` lines are stored in
        ``self._info``, and the Content-Type and Plural-Forms lines set
        ``self._charset`` and ``self.plural``.

        Raises OSError for a bad magic number, an unsupported major version
        or string offsets that point outside the file.  A plural expression
        that c2py() rejects raises ValueError.
        """
        # Delay struct import for speeding up gettext import when .mo files
        # are not used.
        from struct import unpack
        filename = getattr(fp, 'name', '')
        # Parse the .mo file header, which consists of 5 32-bit words whose
        # byte order is given by the magic number.
        self._catalog = catalog = {}
        self.plural = lambda n: int(n != 1) # germanic plural by default
        buf = fp.read()
        buflen = len(buf)
        # Are we big endian or little endian?
        magic = unpack('<I', buf[:4])[0]
        if magic == self.LE_MAGIC:
            version, msgcount, masteridx, transidx = unpack('<4I', buf[4:20])
            ii = '<II'
        elif magic == self.BE_MAGIC:
            version, msgcount, masteridx, transidx = unpack('>4I', buf[4:20])
            ii = '>II'
        else:
            raise OSError(0, 'Bad magic number', filename)

        major_version, minor_version = self._get_versions(version)

        if major_version not in self.VERSIONS:
            raise OSError(0, 'Bad version number ' + str(major_version), filename)

        # Now put all messages from the .mo file buffer into the catalog
        # dictionary.
        for _ in range(msgcount):
            # Each table entry is a (length, offset) pair.
            mlen, moff = unpack(ii, buf[masteridx:masteridx+8])
            mend = moff + mlen
            tlen, toff = unpack(ii, buf[transidx:transidx+8])
            tend = toff + tlen
            if mend < buflen and tend < buflen:
                msg = buf[moff:mend]
                tmsg = buf[toff:tend]
            else:
                raise OSError(0, 'File is corrupt', filename)
            # See if we're looking at GNU .mo conventions for metadata
            if mlen == 0:
                # Catalog description
                lastk = None
                for b_item in tmsg.split(b'\n'):
                    item = b_item.decode().strip()
                    if not item:
                        continue
                    # Skip over comment lines:
                    if item.startswith('#-#-#-#-#') and item.endswith('#-#-#-#-#'):
                        continue
                    k = v = None
                    if ':' in item:
                        k, v = item.split(':', 1)
                        k = k.strip().lower()
                        v = v.strip()
                        self._info[k] = v
                        lastk = k
                    elif lastk:
                        # Continuation of the previous header line.
                        self._info[lastk] += '\n' + item
                    if k == 'content-type':
                        # A Content-Type without a charset parameter leaves
                        # the charset unset instead of failing with an
                        # IndexError.
                        if 'charset=' in v:
                            self._charset = v.split('charset=')[1]
                    elif k == 'plural-forms':
                        # Use the first ';'-separated field that carries the
                        # expression.  When there is none, the germanic
                        # default set above stays in effect instead of the
                        # header raising IndexError.
                        for field in v.split(';'):
                            if 'plural=' in field:
                                self.plural = c2py(field.split('plural=')[1])
                                break
            # Note: we unconditionally convert both msgids and msgstrs to
            # Unicode using the character encoding specified in the charset
            # parameter of the Content-Type header.  The gettext documentation
            # strongly encourages msgids to be us-ascii, but some applications
            # require alternative encodings (e.g. Zope's ZCML and ZPT).  For
            # traditional gettext applications, the msgid conversion will
            # cause no problems since us-ascii should always be a subset of
            # the charset encoding.  We may want to fall back to 8-bit msgids
            # if the Unicode conversion fails.
            charset = self._charset or 'ascii'
            if b'\x00' in msg:
                # Plural forms
                msgid1, msgid2 = msg.split(b'\x00')
                tmsg = tmsg.split(b'\x00')
                msgid1 = str(msgid1, charset)
                for index, text in enumerate(tmsg):
                    catalog[(msgid1, index)] = str(text, charset)
            else:
                catalog[str(msg, charset)] = str(tmsg, charset)
            # advance to next entry in the seek tables
            masteridx += 8
            transidx += 8

    def gettext(self, message):
        """Return the translation of *message*.

        On a catalog miss the fallback is asked when there is one; otherwise
        *message* is returned unchanged.
        """
        # Private sentinel, so that any stored value counts as a hit.
        missing = object()
        tmsg = self._catalog.get(message, missing)
        if tmsg is missing:
            if self._fallback:
                return self._fallback.gettext(message)
            return message
        return tmsg

    def ngettext(self, msgid1, msgid2, n):
        """Return the plural-aware translation of *msgid1* for count *n*.

        On a catalog miss the fallback is asked when there is one; otherwise
        *msgid1* is returned when ``n == 1`` and *msgid2* in all other cases.
        """
        try:
            tmsg = self._catalog[(msgid1, self.plural(n))]
        except KeyError:
            if self._fallback:
                return self._fallback.ngettext(msgid1, msgid2, n)
            if n == 1:
                tmsg = msgid1
            else:
                tmsg = msgid2
        return tmsg

    def pgettext(self, context, message):
        """Like gettext(), looking *message* up under *context*."""
        ctxt_msg_id = self.CONTEXT % (context, message)
        missing = object()
        tmsg = self._catalog.get(ctxt_msg_id, missing)
        if tmsg is missing:
            if self._fallback:
                return self._fallback.pgettext(context, message)
            return message
        return tmsg

    def npgettext(self, context, msgid1, msgid2, n):
        """Like ngettext(), looking *msgid1* up under *context*."""
        ctxt_msg_id = self.CONTEXT % (context, msgid1)
        try:
            tmsg = self._catalog[ctxt_msg_id, self.plural(n)]
        except KeyError:
            if self._fallback:
                return self._fallback.npgettext(context, msgid1, msgid2, n)
            if n == 1:
                tmsg = msgid1
            else:
                tmsg = msgid2
        return tmsg
464
465
466# Locate a .mo file using the gettext strategy
def find(domain, localedir=None, languages=None, all=False):
    """Locate the .mo file(s) for *domain*.

    *localedir* defaults to the module's default locale directory.  When
    *languages* is None it is taken from the first of the environment
    variables LANGUAGE, LC_ALL, LC_MESSAGES and LANG that is set to a
    non-empty value (split on ':'), with 'C' appended if missing.  Each
    language is expanded with _expand_lang() and the candidates are tried in
    order, stopping at 'C'.

    Returns the first existing ``<localedir>/<lang>/LC_MESSAGES/<domain>.mo``
    path, or None when there is none.  With a true *all*, returns the list
    of every existing path instead (possibly empty).
    """
    # Get some reasonable defaults for arguments that were not supplied
    if localedir is None:
        localedir = _default_localedir
    if languages is None:
        settings = (os.environ.get(name)
                    for name in ('LANGUAGE', 'LC_ALL', 'LC_MESSAGES', 'LANG'))
        chosen = next((value for value in settings if value), None)
        languages = chosen.split(':') if chosen else []
        if 'C' not in languages:
            languages.append('C')
    # now normalize and expand the languages, dropping duplicates while
    # keeping the first occurrence of each
    candidates = list(dict.fromkeys(
        variant
        for language in languages
        for variant in _expand_lang(language)
    ))
    # select a language
    matches = []
    for candidate in candidates:
        if candidate == 'C':
            break
        mofile = os.path.join(localedir, candidate, 'LC_MESSAGES',
                              '%s.mo' % domain)
        if not os.path.exists(mofile):
            continue
        if not all:
            return mofile
        matches.append(mofile)
    return matches if all else None
501
502
# Cache of parsed catalogs, keyed by (translation class, absolute .mo path).
_translations = dict()
505
506
def translation(domain, localedir=None, languages=None,
                class_=None, fallback=False):
    """Return a translations object for *domain*.

    The .mo files are located with find(); the first one found becomes the
    returned object and each further one is added to it as a fallback.
    *class_* (GNUTranslations by default) is instantiated with the open
    file.  When no file is found, a NullTranslations instance is returned if
    *fallback* is true and FileNotFoundError is raised otherwise.
    """
    if class_ is None:
        class_ = GNUTranslations
    mofiles = find(domain, localedir, languages, all=True)
    if not mofiles:
        if fallback:
            return NullTranslations()
        from errno import ENOENT
        raise FileNotFoundError(ENOENT,
                                'No translation file found for domain', domain)
    # Avoid opening, reading, and parsing the .mo file after it's been done
    # once.
    chain = None
    for path in mofiles:
        cache_key = (class_, os.path.abspath(path))
        cached = _translations.get(cache_key)
        if cached is None:
            with open(path, 'rb') as stream:
                cached = _translations.setdefault(cache_key, class_(stream))
        # Copy the translation object to allow setting fallbacks and
        # output charset. All other instance data is shared with the
        # cached object.
        # Delay copy import for speeding up gettext import when .mo files
        # are not used.
        import copy
        instance = copy.copy(cached)
        if chain is None:
            chain = instance
        else:
            chain.add_fallback(instance)
    return chain
539
540
def install(domain, localedir=None, *, names=None):
    """Install the translations for *domain* into the builtins namespace.

    The translations object is obtained from translation() with
    ``fallback=True`` and its install() method is called with *names*.
    """
    translation(domain, localedir, fallback=True).install(names)
544
545
# Locale directory bound to each domain by bindtextdomain().
_localedirs = dict()
# Current global domain; 'messages' is used for compatibility with GNU
# gettext.
_current_domain = 'messages'
550
551
def textdomain(domain=None):
    """Return the current global domain, first setting it to *domain* when
    *domain* is not None.
    """
    global _current_domain
    if domain is None:
        return _current_domain
    _current_domain = domain
    return domain
557
558
def bindtextdomain(domain, localedir=None):
    """Return the locale directory bound to *domain*.

    When *localedir* is not None it is first recorded as the binding for
    *domain*.  A domain without a binding reports the module's default
    locale directory.
    """
    if localedir is None:
        return _localedirs.get(domain, _default_localedir)
    _localedirs[domain] = localedir
    return localedir
564
565
def dgettext(domain, message):
    """Translate *message* using the catalog of *domain*.

    Returns *message* unchanged when the catalog cannot be loaded (OSError).
    """
    try:
        catalog = translation(domain, _localedirs.get(domain))
    except OSError:
        return message
    else:
        return catalog.gettext(message)
572
573
def dngettext(domain, msgid1, msgid2, n):
    """Plural-aware translation using the catalog of *domain*.

    When the catalog cannot be loaded (OSError), returns *msgid1* if
    ``n == 1`` and *msgid2* otherwise.
    """
    try:
        catalog = translation(domain, _localedirs.get(domain))
    except OSError:
        return msgid1 if n == 1 else msgid2
    else:
        return catalog.ngettext(msgid1, msgid2, n)
583
584
def dpgettext(domain, context, message):
    """Translate *message* under *context* using the catalog of *domain*.

    Returns *message* unchanged when the catalog cannot be loaded (OSError).
    """
    try:
        catalog = translation(domain, _localedirs.get(domain))
    except OSError:
        return message
    else:
        return catalog.pgettext(context, message)
591
592
def dnpgettext(domain, context, msgid1, msgid2, n):
    """Plural-aware translation under *context* using the catalog of *domain*.

    When the catalog cannot be loaded (OSError), returns *msgid1* if
    ``n == 1`` and *msgid2* otherwise.
    """
    try:
        catalog = translation(domain, _localedirs.get(domain))
    except OSError:
        return msgid1 if n == 1 else msgid2
    else:
        return catalog.npgettext(context, msgid1, msgid2, n)
602
603
def gettext(message):
    """Translate *message* using the current global domain."""
    domain = _current_domain
    return dgettext(domain, message)
606
607
def ngettext(msgid1, msgid2, n):
    """Plural-aware translation using the current global domain."""
    domain = _current_domain
    return dngettext(domain, msgid1, msgid2, n)
610
611
def pgettext(context, message):
    """Translate *message* under *context* using the current global domain."""
    domain = _current_domain
    return dpgettext(domain, context, message)
614
615
def npgettext(context, msgid1, msgid2, n):
    """Plural-aware translation under *context* using the current global
    domain.
    """
    domain = _current_domain
    return dnpgettext(domain, context, msgid1, msgid2, n)
618
619
620# dcgettext() has been deemed unnecessary and is not implemented.
621
622# James Henstridge's Catalog constructor from GNOME gettext.  Documented usage
623# was:
624#
625#    import gettext
626#    cat = gettext.Catalog(PACKAGE, localedir=LOCALEDIR)
627#    _ = cat.gettext
628#    print _('Hello World')
629
# The resulting catalog object currently doesn't support access through a
# dictionary API, which was supported (but apparently unused) in GNOME
# gettext.
633
# Catalog is an alias of translation(), so Catalog(...) accepts the same
# arguments and returns the same kind of object.
Catalog = translation
635