xref: /aosp_15_r20/external/fonttools/Lib/fontTools/unicodedata/__init__.py (revision e1fe3e4ad2793916b15cccdc4a7da52a7e1dd0e9)
1from __future__ import annotations
2
3from fontTools.misc.textTools import byteord, tostr
4
5import re
6from bisect import bisect_right
7from typing import Literal, TypeVar, overload
8
9
10try:
11    # use unicodedata backport compatible with python2:
12    # https://github.com/fonttools/unicodedata2
13    from unicodedata2 import *
14except ImportError:  # pragma: no cover
15    # fall back to built-in unicodedata (possibly outdated)
16    from unicodedata import *
17
18from . import Blocks, Scripts, ScriptExtensions, OTTags
19
20
# Public API of this module: the names listed here are the ones exported by
# ``from fontTools.unicodedata import *``.
__all__ = [
    # names from built-in unicodedata module
    "lookup",
    "name",
    "decimal",
    "digit",
    "numeric",
    "category",
    "bidirectional",
    "combining",
    "east_asian_width",
    "mirrored",
    "decomposition",
    "normalize",
    "unidata_version",
    "ucd_3_2_0",
    # additional functions defined in this module
    "block",
    "script",
    "script_extension",
    "script_name",
    "script_code",
    "script_horizontal_direction",
    "ot_tags_from_script",
    "ot_tag_to_script",
]
47
48
def script(char):
    """Look up the Unicode Script property of the character 'char'.

    The character is converted to its code point and the four-letter script
    code stored for the range containing that code point is returned as a
    string.

    >>> script("a")
    'Latn'
    >>> script(",")
    'Zyyy'
    >>> script(chr(0x10FFFF))
    'Zzzz'
    """
    codepoint = byteord(char)
    # 'Scripts.RANGES' is a sorted list holding only the starting breakpoint
    # of each range. 'bisect_right' returns the insertion point located to
    # the right of every entry that is <= codepoint, so the range containing
    # the code point (the last one whose start is <= codepoint) sits one
    # position before that insertion point: hence we subtract 1.
    range_index = bisect_right(Scripts.RANGES, codepoint) - 1
    return Scripts.VALUES[range_index]
72
73
def script_extension(char):
    """Look up the Script_Extensions property of the character 'char'.

    The result is a set of four-letter script codes (strings).

    >>> script_extension("a") == {'Latn'}
    True
    >>> script_extension(chr(0x060C)) == {'Rohg', 'Syrc', 'Yezi', 'Arab', 'Thaa', 'Nkoo'}
    True
    >>> script_extension(chr(0x10FFFF)) == {'Zzzz'}
    True
    """
    codepoint = byteord(char)
    # Same range lookup as in script(): the entry just before the insertion
    # point belongs to the range that contains the code point.
    position = bisect_right(ScriptExtensions.RANGES, codepoint) - 1
    extensions = ScriptExtensions.VALUES[position]
    if extensions is not None:
        return extensions
    # A None entry marks code points that are not explicitly listed for
    # Script Extensions; their value is the corresponding Script property.
    return {script(char)}
93
94
def script_name(code, default=KeyError):
    """Map a four-letter Unicode script code to its long, human-readable name.

    Underscores in the stored name are replaced by spaces.

    When 'code' is unknown, the KeyError from the lookup propagates if
    'default' is KeyError (the default) or a subclass of it. Any other
    'default' (e.g. 'Unknown' or None) is returned as a fallback value
    instead of raising.
    """
    try:
        long_name = Scripts.NAMES[code]
    except KeyError:
        fallback_requested = not (
            isinstance(default, type) and issubclass(default, KeyError)
        )
        if fallback_requested:
            return default
        # No fallback was supplied: let the original lookup error escape.
        raise
    return str(long_name.replace("_", " "))
110
111
112_normalize_re = re.compile(r"[-_ ]+")
113
114
115def _normalize_property_name(string):
116    """Remove case, strip space, '-' and '_' for loose matching."""
117    return _normalize_re.sub("", string).lower()
118
119
# Reverse lookup table built once at import time: maps each long script name
# from Scripts.NAMES, normalized with _normalize_property_name(), back to the
# key it is stored under in Scripts.NAMES.
_SCRIPT_CODES = {_normalize_property_name(v): k for k, v in Scripts.NAMES.items()}
121
122
def script_code(script_name, default=KeyError):
    """Map a long script name to its four-letter Unicode script code.

    The name is normalized with _normalize_property_name() before the lookup,
    so case, spaces, '-' and '_' do not matter.

    When no script matches, the KeyError from the lookup propagates if
    'default' is KeyError (the default) or a subclass of it. Any other
    'default' (e.g. 'Zzzz' or None) is returned as a fallback value instead
    of raising.
    """
    lookup_key = _normalize_property_name(script_name)
    try:
        found = _SCRIPT_CODES[lookup_key]
    except KeyError:
        wants_error = isinstance(default, type) and issubclass(default, KeyError)
        if not wants_error:
            return default
        # No fallback was supplied: let the original lookup error escape.
        raise
    return found
138
139
# Set of four-letter Unicode script codes for which
# script_horizontal_direction() returns "RTL"; any other known script code
# is reported as "LTR".
#
# The data on script direction is taken from Harfbuzz source code:
# https://github.com/harfbuzz/harfbuzz/blob/3.2.0/src/hb-common.cc#L514-L613
# This in turn references the following "Script_Metadata" document:
# https://docs.google.com/spreadsheets/d/1Y90M0Ie3MUJ6UVCRDOypOtijlMDLNNyyLk36T6iMu0o
RTL_SCRIPTS = {
    # Unicode-1.1 additions
    "Arab",  # Arabic
    "Hebr",  # Hebrew
    # Unicode-3.0 additions
    "Syrc",  # Syriac
    "Thaa",  # Thaana
    # Unicode-4.0 additions
    "Cprt",  # Cypriot
    # Unicode-4.1 additions
    "Khar",  # Kharoshthi
    # Unicode-5.0 additions
    "Phnx",  # Phoenician
    "Nkoo",  # Nko
    # Unicode-5.1 additions
    "Lydi",  # Lydian
    # Unicode-5.2 additions
    "Avst",  # Avestan
    "Armi",  # Imperial Aramaic
    "Phli",  # Inscriptional Pahlavi
    "Prti",  # Inscriptional Parthian
    "Sarb",  # Old South Arabian
    "Orkh",  # Old Turkic
    "Samr",  # Samaritan
    # Unicode-6.0 additions
    "Mand",  # Mandaic
    # Unicode-6.1 additions
    "Merc",  # Meroitic Cursive
    "Mero",  # Meroitic Hieroglyphs
    # Unicode-7.0 additions
    "Mani",  # Manichaean
    "Mend",  # Mende Kikakui
    "Nbat",  # Nabataean
    "Narb",  # Old North Arabian
    "Palm",  # Palmyrene
    "Phlp",  # Psalter Pahlavi
    # Unicode-8.0 additions
    "Hatr",  # Hatran
    "Hung",  # Old Hungarian
    # Unicode-9.0 additions
    "Adlm",  # Adlam
    # Unicode-11.0 additions
    "Rohg",  # Hanifi Rohingya
    "Sogo",  # Old Sogdian
    "Sogd",  # Sogdian
    # Unicode-12.0 additions
    "Elym",  # Elymaic
    # Unicode-13.0 additions
    "Chrs",  # Chorasmian
    "Yezi",  # Yezidi
    # Unicode-14.0 additions
    "Ougr",  # Old Uyghur
}
197
198
# The two direction strings script_horizontal_direction() can return.
HorizDirection = Literal["RTL", "LTR"]
# Type of a caller-supplied fallback value.
T = TypeVar("T")


@overload
def script_horizontal_direction(script_code: str, default: T) -> HorizDirection | T: ...


@overload
def script_horizontal_direction(
    script_code: str, default: type[KeyError] = KeyError
) -> HorizDirection: ...


def script_horizontal_direction(
    script_code: str, default: T | type[KeyError] = KeyError
) -> HorizDirection | T:
    """Return the horizontal direction of the script 'script_code'.

    The result is "RTL" for scripts that contain right-to-left characters
    according to the Bidi_Class property (those listed in RTL_SCRIPTS), and
    "LTR" for every other known script.

    If 'script_code' is not a key of Scripts.NAMES and 'default' is KeyError
    (the default) or a subclass of it, that exception class is raised with
    the script code as its argument. Any other 'default' is returned as a
    fallback value instead.
    """
    if script_code in Scripts.NAMES:
        return "RTL" if script_code in RTL_SCRIPTS else "LTR"

    # Unknown script code: either raise the requested error type or hand
    # back the caller's fallback.
    raise_requested = isinstance(default, type) and issubclass(default, KeyError)
    if raise_requested:
        raise default(script_code)
    return default
224
225
def block(char):
    """Look up the Unicode Block property of the character 'char'.

    The block name is returned as a string.

    >>> block("a")
    'Basic Latin'
    >>> block(chr(0x060C))
    'Arabic'
    >>> block(chr(0xEFFFF))
    'No_Block'
    """
    codepoint = byteord(char)
    # The entry just before the bisect_right insertion point is the last
    # one in Blocks.RANGES that is <= codepoint; the value at the same index
    # is returned.
    range_index = bisect_right(Blocks.RANGES, codepoint) - 1
    return Blocks.VALUES[range_index]
240
241
def ot_tags_from_script(script_code):
    """Return the list of OpenType script tags for a Unicode script code.

    Script codes listed in OTTags.SCRIPT_EXCEPTIONS map to that single tag.
    Invalid/unknown script codes yield ['DFLT']. Otherwise the tag is the
    script code with its first letter lowercased; any tags registered in
    OTTags.NEW_SCRIPT_TAGS for the script are placed before it, with the
    last registered tag first.
    """
    if script_code in OTTags.SCRIPT_EXCEPTIONS:
        return [OTTags.SCRIPT_EXCEPTIONS[script_code]]

    if script_code not in Scripts.NAMES:
        return [OTTags.DEFAULT_SCRIPT]

    derived_tag = script_code[0].lower() + script_code[1:]
    if script_code not in OTTags.NEW_SCRIPT_TAGS:
        return [derived_tag]

    # last in, first out: the derived tag ends up at the back of the list
    oldest_first = [derived_tag, *OTTags.NEW_SCRIPT_TAGS[script_code]]
    return oldest_first[::-1]
259
260
def ot_tag_to_script(tag):
    """Return the Unicode script code for an OpenType script tag.

    None is returned for the "DFLT" tag, or when no Unicode script is
    associated with the tag.

    Raises ValueError if the tag is invalid: empty after stripping
    surrounding whitespace, containing an inner space, or longer than four
    characters.
    """
    tag = tostr(tag).strip()
    well_formed = bool(tag) and " " not in tag and len(tag) <= 4
    if not well_formed:
        raise ValueError("invalid OpenType tag: %r" % tag)

    # Replace an alias tag with the tag it stands for.
    if tag in OTTags.SCRIPT_ALIASES:
        tag = OTTags.SCRIPT_ALIASES[tag]

    # pad with spaces up to the four-character tag length
    while len(tag) != 4:
        tag += " "

    if tag == OTTags.DEFAULT_SCRIPT:
        # it's unclear which Unicode script the "DFLT" OpenType tag maps to,
        # so here we return None
        return None

    # Explicit tables take precedence over the algorithmic conversion.
    if tag in OTTags.NEW_SCRIPT_TAGS_REVERSED:
        return OTTags.NEW_SCRIPT_TAGS_REVERSED[tag]

    if tag in OTTags.SCRIPT_EXCEPTIONS_REVERSED:
        return OTTags.SCRIPT_EXCEPTIONS_REVERSED[tag]

    # From here on the conversion is fully algorithmic: uppercase the first
    # character, and replace each trailing space by a repeat of the
    # preceding letter. Eg 'nko ' -> 'Nkoo'.
    candidate = tag[0].upper() + tag[1]
    for position in (2, 3):
        current = tag[position]
        candidate += candidate[position - 1] if current == " " else current

    return candidate if candidate in Scripts.NAMES else None
299