from __future__ import annotations

from fontTools.misc.textTools import byteord, tostr

import re
from bisect import bisect_right
from typing import Literal, TypeVar, overload


try:
    # prefer the unicodedata backport, which is also compatible with python2:
    # https://github.com/fonttools/unicodedata2
    from unicodedata2 import *
except ImportError:  # pragma: no cover
    # otherwise use the built-in unicodedata module (possibly outdated)
    from unicodedata import *

from . import Blocks, Scripts, ScriptExtensions, OTTags


__all__ = [
    # names from built-in unicodedata module
    "lookup",
    "name",
    "decimal",
    "digit",
    "numeric",
    "category",
    "bidirectional",
    "combining",
    "east_asian_width",
    "mirrored",
    "decomposition",
    "normalize",
    "unidata_version",
    "ucd_3_2_0",
    # additional functions
    "block",
    "script",
    "script_extension",
    "script_name",
    "script_code",
    "script_horizontal_direction",
    "ot_tags_from_script",
    "ot_tag_to_script",
]


def script(char):
    """Return the four-letter script code assigned to the Unicode character
    'char', as a string.

    >>> script("a")
    'Latn'
    >>> script(",")
    'Zyyy'
    >>> script(chr(0x10FFFF))
    'Zzzz'
    """
    codepoint = byteord(char)
    # 'Scripts.RANGES' is used here as a sorted list holding only the starting
    # breakpoint of each range. 'bisect_right(a, x)' returns the insertion
    # point that comes after (to the right of) any existing entries equal to
    # x, so that all(val <= x for val in a[:i]) and
    # all(val > x for val in a[i:]).
    # The range containing the codepoint is the one whose start is less than
    # or equal to it, i.e. the entry just before the insertion point; hence
    # we subtract 1 from the index returned.
    index = bisect_right(Scripts.RANGES, codepoint) - 1
    return Scripts.VALUES[index]


def script_extension(char):
    """Return the script extension property assigned to the Unicode character
    'char', as a set of strings.

    >>> script_extension("a") == {'Latn'}
    True
    >>> script_extension(chr(0x060C)) == {'Rohg', 'Syrc', 'Yezi', 'Arab', 'Thaa', 'Nkoo'}
    True
    >>> script_extension(chr(0x10FFFF)) == {'Zzzz'}
    True
    """
    codepoint = byteord(char)
    index = bisect_right(ScriptExtensions.RANGES, codepoint) - 1
    extensions = ScriptExtensions.VALUES[index]
    if extensions is not None:
        return extensions
    # code points not explicitly listed for Script Extensions
    # have as their value the corresponding Script property value
    return {script(char)}


def script_name(code, default=KeyError):
    """Return the long, human-readable script name given a four-letter
    Unicode script code. Underscores in the stored name are replaced with
    spaces.

    If no matching name is found, a KeyError is raised by default.

    You can use the 'default' argument to return a fallback value (e.g.
    'Unknown' or None) instead of throwing an error.
    """
    try:
        raw_name = Scripts.NAMES[code]
        return str(raw_name.replace("_", " "))
    except KeyError:
        # a KeyError class (or subclass) passed as 'default' means the caller
        # wants the lookup failure to propagate
        wants_error = isinstance(default, type) and issubclass(default, KeyError)
        if not wants_error:
            return default
        raise


_normalize_re = re.compile(r"[-_ ]+")


def _normalize_property_name(string):
    """Remove case, strip space, '-' and '_' for loose matching."""
    without_separators = _normalize_re.sub("", string)
    return without_separators.lower()


# Maps the normalized long script name to its four-letter script code.
_SCRIPT_CODES = {
    _normalize_property_name(long_name): short_code
    for short_code, long_name in Scripts.NAMES.items()
}


def script_code(script_name, default=KeyError):
    """Return the four-letter Unicode script code given the script's long name.

    The name is matched loosely: case, spaces, '-' and '_' are ignored.

    If no matching script code is found, a KeyError is raised by default.

    You can use the 'default' argument to return a fallback string (e.g.
    'Zzzz' or None) instead of throwing an error.
    """
    lookup_key = _normalize_property_name(script_name)
    try:
        return _SCRIPT_CODES[lookup_key]
    except KeyError:
        # a KeyError class (or subclass) passed as 'default' means the caller
        # wants the lookup failure to propagate
        wants_error = isinstance(default, type) and issubclass(default, KeyError)
        if not wants_error:
            return default
        raise


# The data on script direction is taken from Harfbuzz source code:
# https://github.com/harfbuzz/harfbuzz/blob/3.2.0/src/hb-common.cc#L514-L613
# This in turn references the following "Script_Metadata" document:
# https://docs.google.com/spreadsheets/d/1Y90M0Ie3MUJ6UVCRDOypOtijlMDLNNyyLk36T6iMu0o
RTL_SCRIPTS = {
    # Unicode-1.1 additions
    "Arab",  # Arabic
    "Hebr",  # Hebrew
    # Unicode-3.0 additions
    "Syrc",  # Syriac
    "Thaa",  # Thaana
    # Unicode-4.0 additions
    "Cprt",  # Cypriot
    # Unicode-4.1 additions
    "Khar",  # Kharoshthi
    # Unicode-5.0 additions
    "Phnx",  # Phoenician
    "Nkoo",  # Nko
    # Unicode-5.1 additions
    "Lydi",  # Lydian
    # Unicode-5.2 additions
    "Avst",  # Avestan
    "Armi",  # Imperial Aramaic
    "Phli",  # Inscriptional Pahlavi
    "Prti",  # Inscriptional Parthian
    "Sarb",  # Old South Arabian
    "Orkh",  # Old Turkic
    "Samr",  # Samaritan
    # Unicode-6.0 additions
    "Mand",  # Mandaic
    # Unicode-6.1 additions
    "Merc",  # Meroitic Cursive
    "Mero",  # Meroitic Hieroglyphs
    # Unicode-7.0 additions
    "Mani",  # Manichaean
    "Mend",  # Mende Kikakui
    "Nbat",  # Nabataean
    "Narb",  # Old North Arabian
    "Palm",  # Palmyrene
    "Phlp",  # Psalter Pahlavi
    # Unicode-8.0 additions
    "Hatr",  # Hatran
    "Hung",  # Old Hungarian
    # Unicode-9.0 additions
    "Adlm",  # Adlam
    # Unicode-11.0 additions
    "Rohg",  # Hanifi Rohingya
    "Sogo",  # Old Sogdian
    "Sogd",  # Sogdian
    # Unicode-12.0 additions
    "Elym",  # Elymaic
    # Unicode-13.0 additions
    "Chrs",  # Chorasmian
    "Yezi",  # Yezidi
    # Unicode-14.0 additions
    "Ougr",  # Old Uyghur
}


HorizDirection = Literal["RTL", "LTR"]
T = TypeVar("T")



@overload
def script_horizontal_direction(script_code: str, default: T) -> HorizDirection | T: ...


@overload
def script_horizontal_direction(
    script_code: str, default: type[KeyError] = KeyError
) -> HorizDirection: ...


def script_horizontal_direction(
    script_code: str, default: T | type[KeyError] = KeyError
) -> HorizDirection | T:
    """Return "RTL" for scripts that contain right-to-left characters
    according to the Bidi_Class property. Otherwise return "LTR".

    If 'script_code' is not a known script code, the 'default' value is
    returned; when 'default' is KeyError (the default) or a subclass of it,
    that exception is raised with the script code instead.
    """
    if script_code in Scripts.NAMES:
        return "RTL" if script_code in RTL_SCRIPTS else "LTR"
    # unknown script code: either raise the requested KeyError type or hand
    # back the caller's fallback value
    if isinstance(default, type) and issubclass(default, KeyError):
        raise default(script_code)
    return default


def block(char):
    """Return the block property assigned to the Unicode character 'char',
    as a string.

    >>> block("a")
    'Basic Latin'
    >>> block(chr(0x060C))
    'Arabic'
    >>> block(chr(0xEFFFF))
    'No_Block'
    """
    codepoint = byteord(char)
    # the entry just before the insertion point is the last range start that
    # is less than or equal to the codepoint
    index = bisect_right(Blocks.RANGES, codepoint) - 1
    return Blocks.VALUES[index]


def ot_tags_from_script(script_code):
    """Return a list of OpenType script tags associated with a given
    Unicode script code.
    Return ['DFLT'] script tag for invalid/unknown script codes.
    """
    if script_code in OTTags.SCRIPT_EXCEPTIONS:
        return [OTTags.SCRIPT_EXCEPTIONS[script_code]]

    if script_code not in Scripts.NAMES:
        return [OTTags.DEFAULT_SCRIPT]

    # the algorithmic tag is the script code with its first letter lowercased
    algorithmic_tag = script_code[0].lower() + script_code[1:]
    if script_code not in OTTags.NEW_SCRIPT_TAGS:
        return [algorithmic_tag]

    # last in, first out: the new tags are listed in reverse order, followed
    # by the algorithmic tag
    ordered = [algorithmic_tag, *OTTags.NEW_SCRIPT_TAGS[script_code]]
    return ordered[::-1]


def ot_tag_to_script(tag):
    """Return the Unicode script code for the given OpenType script tag, or
    None for "DFLT" tag or if there is no Unicode script associated with it.
    Raises ValueError if the tag is invalid.
    """
    tag = tostr(tag).strip()
    if not tag or " " in tag or len(tag) > 4:
        raise ValueError("invalid OpenType tag: %r" % tag)

    if tag in OTTags.SCRIPT_ALIASES:
        tag = OTTags.SCRIPT_ALIASES[tag]

    # pad with spaces up to four characters
    while len(tag) != 4:
        tag += " "

    if tag == OTTags.DEFAULT_SCRIPT:
        # it's unclear which Unicode script the "DFLT" OpenType tag maps to,
        # so here we return None
        return None

    # explicit mappings take precedence over the algorithmic conversion
    for reverse_table in (
        OTTags.NEW_SCRIPT_TAGS_REVERSED,
        OTTags.SCRIPT_EXCEPTIONS_REVERSED,
    ):
        if tag in reverse_table:
            return reverse_table[tag]

    # This side of the conversion is fully algorithmic

    # Change the first char to uppercase, then replace any spaces at the end
    # of the tag by repeating the last letter. Eg 'nko ' -> 'Nkoo'.
    candidate = tag[0].upper() + tag[1]
    for position in (2, 3):
        if tag[position] == " ":
            candidate += candidate[position - 1]
        else:
            candidate += tag[position]

    return candidate if candidate in Scripts.NAMES else None