1 /* ------------------------------------------------------------------------
2 
3    unicodedata -- Provides access to the Unicode database.
4 
5    The current version number is reported in the unidata_version constant.
6 
7    Written by Marc-Andre Lemburg ([email protected]).
8    Modified for Python 2.0 by Fredrik Lundh ([email protected])
9    Modified by Martin v. Löwis ([email protected])
10 
11    Copyright (c) Corporation for National Research Initiatives.
12 
13    ------------------------------------------------------------------------ */
14 
15 #ifndef Py_BUILD_CORE_BUILTIN
16 #  define Py_BUILD_CORE_MODULE 1
17 #endif
18 
19 #define PY_SSIZE_T_CLEAN
20 
21 #include "Python.h"
22 #include "pycore_ucnhash.h"       // _PyUnicode_Name_CAPI
23 #include "structmember.h"         // PyMemberDef
24 
25 #include <stdbool.h>
26 
27 /*[clinic input]
28 module unicodedata
29 class unicodedata.UCD 'PreviousDBVersion *' '<not used>'
30 [clinic start generated code]*/
31 /*[clinic end generated code: output=da39a3ee5e6b4b0d input=e47113e05924be43]*/
32 
33 /* character properties */
34 
/* Property record shared by all code points that have identical
   properties.  _getrecord_ex() returns a pointer to one of these. */
typedef struct {
    /* index into _PyUnicode_CategoryNames */
    const unsigned char category;
    /* combining class value 0 - 255 */
    const unsigned char combining;
    /* index into _PyUnicode_BidirectionalNames */
    const unsigned char bidirectional;
    /* true if mirrored in bidir mode */
    const unsigned char mirrored;
    /* index into _PyUnicode_EastAsianWidthNames */
    const unsigned char east_asian_width;
    /* packed two-bit quick-check fields, see is_normalized_quickcheck() */
    const unsigned char normalization_quick_check;
} _PyUnicode_DatabaseRecord;
46 
/* Difference between the current database and a previous version for one
   code point.  In the byte-sized fields 0xFF is tested by the accessors
   below as "no change"; category_changed == 0 is treated by them as
   "unassigned in the previous version".

   The sequence of fields should be the same as in merge_old_version. */
typedef struct change_record {
    const unsigned char bidir_changed;
    const unsigned char category_changed;
    const unsigned char decimal_changed;
    const unsigned char mirrored_changed;
    const unsigned char east_asian_width_changed;
    /* NOTE(review): no accessor visible in this part of the file reads
       numeric_changed -- confirm its encoding against the generator. */
    const double numeric_changed;
} change_record;
56 
57 /* data file generated by Tools/unicode/makeunicodedata.py */
58 #include "unicodedata_db.h"
59 
60 static const _PyUnicode_DatabaseRecord*
_getrecord_ex(Py_UCS4 code)61 _getrecord_ex(Py_UCS4 code)
62 {
63     int index;
64     if (code >= 0x110000)
65         index = 0;
66     else {
67         index = index1[(code>>SHIFT)];
68         index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
69     }
70 
71     return &_PyUnicode_Database_Records[index];
72 }
73 
74 /* ------------- Previous-version API ------------------------------------- */
75 typedef struct previous_version {
76     PyObject_HEAD
77     const char *name;
78     const change_record* (*getrecord)(Py_UCS4);
79     Py_UCS4 (*normalization)(Py_UCS4);
80 } PreviousDBVersion;
81 
82 #include "clinic/unicodedata.c.h"
83 
/* Fetch the change record of code point `v` from the previous-version
   object `self`.  `self` must be a PreviousDBVersion (see UCD_Check).
   Both arguments are parenthesized so that any expression may be passed. */
#define get_old_record(self, v) \
    ((((PreviousDBVersion *)(self))->getrecord)(v))
85 
86 static PyMemberDef DB_members[] = {
87         {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
88         {NULL}
89 };
90 
// Check whether self is a unicodedata.UCD instance.
// If self is NULL (when the PyCapsule C API is used), evaluate to 0.
// PyModule_Check() is used to avoid having to retrieve the ucd_type:
// anything that is neither NULL nor a module is taken to be a UCD instance.
// See the comment on unicodedata_functions for the rationale of this macro.
// The argument is parenthesized so that any expression may be passed.
#define UCD_Check(self) ((self) != NULL && !PyModule_Check(self))
96 
97 static PyObject*
new_previous_version(PyTypeObject * ucd_type,const char * name,const change_record * (* getrecord)(Py_UCS4),Py_UCS4 (* normalization)(Py_UCS4))98 new_previous_version(PyTypeObject *ucd_type,
99                      const char*name, const change_record* (*getrecord)(Py_UCS4),
100                      Py_UCS4 (*normalization)(Py_UCS4))
101 {
102     PreviousDBVersion *self;
103     self = PyObject_GC_New(PreviousDBVersion, ucd_type);
104     if (self == NULL)
105         return NULL;
106     self->name = name;
107     self->getrecord = getrecord;
108     self->normalization = normalization;
109     PyObject_GC_Track(self);
110     return (PyObject*)self;
111 }
112 
113 
114 /* --- Module API --------------------------------------------------------- */
115 
116 /*[clinic input]
117 unicodedata.UCD.decimal
118 
119     self: self
120     chr: int(accept={str})
121     default: object=NULL
122     /
123 
124 Converts a Unicode character into its equivalent decimal value.
125 
126 Returns the decimal value assigned to the character chr as integer.
127 If no such value is defined, default is returned, or, if not given,
128 ValueError is raised.
129 [clinic start generated code]*/
130 
131 static PyObject *
unicodedata_UCD_decimal_impl(PyObject * self,int chr,PyObject * default_value)132 unicodedata_UCD_decimal_impl(PyObject *self, int chr,
133                              PyObject *default_value)
134 /*[clinic end generated code: output=be23376e1a185231 input=933f8107993f23d0]*/
135 {
136     int have_old = 0;
137     long rc;
138     Py_UCS4 c = (Py_UCS4)chr;
139 
140     if (UCD_Check(self)) {
141         const change_record *old = get_old_record(self, c);
142         if (old->category_changed == 0) {
143             /* unassigned */
144             have_old = 1;
145             rc = -1;
146         }
147         else if (old->decimal_changed != 0xFF) {
148             have_old = 1;
149             rc = old->decimal_changed;
150         }
151     }
152 
153     if (!have_old)
154         rc = Py_UNICODE_TODECIMAL(c);
155     if (rc < 0) {
156         if (default_value == NULL) {
157             PyErr_SetString(PyExc_ValueError,
158                             "not a decimal");
159             return NULL;
160         }
161         else {
162             Py_INCREF(default_value);
163             return default_value;
164         }
165     }
166     return PyLong_FromLong(rc);
167 }
168 
169 /*[clinic input]
170 unicodedata.UCD.digit
171 
172     self: self
173     chr: int(accept={str})
174     default: object=NULL
175     /
176 
177 Converts a Unicode character into its equivalent digit value.
178 
179 Returns the digit value assigned to the character chr as integer.
180 If no such value is defined, default is returned, or, if not given,
181 ValueError is raised.
182 [clinic start generated code]*/
183 
184 static PyObject *
unicodedata_UCD_digit_impl(PyObject * self,int chr,PyObject * default_value)185 unicodedata_UCD_digit_impl(PyObject *self, int chr, PyObject *default_value)
186 /*[clinic end generated code: output=96e18c950171fd2f input=e27d6e4565cd29f2]*/
187 {
188     long rc;
189     Py_UCS4 c = (Py_UCS4)chr;
190     rc = Py_UNICODE_TODIGIT(c);
191     if (rc < 0) {
192         if (default_value == NULL) {
193             PyErr_SetString(PyExc_ValueError, "not a digit");
194             return NULL;
195         }
196         else {
197             Py_INCREF(default_value);
198             return default_value;
199         }
200     }
201     return PyLong_FromLong(rc);
202 }
203 
204 /*[clinic input]
205 unicodedata.UCD.numeric
206 
207     self: self
208     chr: int(accept={str})
209     default: object=NULL
210     /
211 
212 Converts a Unicode character into its equivalent numeric value.
213 
214 Returns the numeric value assigned to the character chr as float.
215 If no such value is defined, default is returned, or, if not given,
216 ValueError is raised.
217 [clinic start generated code]*/
218 
219 static PyObject *
unicodedata_UCD_numeric_impl(PyObject * self,int chr,PyObject * default_value)220 unicodedata_UCD_numeric_impl(PyObject *self, int chr,
221                              PyObject *default_value)
222 /*[clinic end generated code: output=53ce281fe85b10c4 input=fdf5871a5542893c]*/
223 {
224     int have_old = 0;
225     double rc;
226     Py_UCS4 c = (Py_UCS4)chr;
227 
228     if (UCD_Check(self)) {
229         const change_record *old = get_old_record(self, c);
230         if (old->category_changed == 0) {
231             /* unassigned */
232             have_old = 1;
233             rc = -1.0;
234         }
235         else if (old->decimal_changed != 0xFF) {
236             have_old = 1;
237             rc = old->decimal_changed;
238         }
239     }
240 
241     if (!have_old)
242         rc = Py_UNICODE_TONUMERIC(c);
243     if (rc == -1.0) {
244         if (default_value == NULL) {
245             PyErr_SetString(PyExc_ValueError, "not a numeric character");
246             return NULL;
247         }
248         else {
249             Py_INCREF(default_value);
250             return default_value;
251         }
252     }
253     return PyFloat_FromDouble(rc);
254 }
255 
256 /*[clinic input]
257 unicodedata.UCD.category
258 
259     self: self
260     chr: int(accept={str})
261     /
262 
263 Returns the general category assigned to the character chr as string.
264 [clinic start generated code]*/
265 
266 static PyObject *
unicodedata_UCD_category_impl(PyObject * self,int chr)267 unicodedata_UCD_category_impl(PyObject *self, int chr)
268 /*[clinic end generated code: output=8571539ee2e6783a input=27d6f3d85050bc06]*/
269 {
270     int index;
271     Py_UCS4 c = (Py_UCS4)chr;
272     index = (int) _getrecord_ex(c)->category;
273     if (UCD_Check(self)) {
274         const change_record *old = get_old_record(self, c);
275         if (old->category_changed != 0xFF)
276             index = old->category_changed;
277     }
278     return PyUnicode_FromString(_PyUnicode_CategoryNames[index]);
279 }
280 
281 /*[clinic input]
282 unicodedata.UCD.bidirectional
283 
284     self: self
285     chr: int(accept={str})
286     /
287 
288 Returns the bidirectional class assigned to the character chr as string.
289 
290 If no such value is defined, an empty string is returned.
291 [clinic start generated code]*/
292 
293 static PyObject *
unicodedata_UCD_bidirectional_impl(PyObject * self,int chr)294 unicodedata_UCD_bidirectional_impl(PyObject *self, int chr)
295 /*[clinic end generated code: output=d36310ce2039bb92 input=b3d8f42cebfcf475]*/
296 {
297     int index;
298     Py_UCS4 c = (Py_UCS4)chr;
299     index = (int) _getrecord_ex(c)->bidirectional;
300     if (UCD_Check(self)) {
301         const change_record *old = get_old_record(self, c);
302         if (old->category_changed == 0)
303             index = 0; /* unassigned */
304         else if (old->bidir_changed != 0xFF)
305             index = old->bidir_changed;
306     }
307     return PyUnicode_FromString(_PyUnicode_BidirectionalNames[index]);
308 }
309 
310 /*[clinic input]
311 unicodedata.UCD.combining -> int
312 
313     self: self
314     chr: int(accept={str})
315     /
316 
317 Returns the canonical combining class assigned to the character chr as integer.
318 
319 Returns 0 if no combining class is defined.
320 [clinic start generated code]*/
321 
322 static int
unicodedata_UCD_combining_impl(PyObject * self,int chr)323 unicodedata_UCD_combining_impl(PyObject *self, int chr)
324 /*[clinic end generated code: output=cad056d0cb6a5920 input=9f2d6b2a95d0a22a]*/
325 {
326     int index;
327     Py_UCS4 c = (Py_UCS4)chr;
328     index = (int) _getrecord_ex(c)->combining;
329     if (UCD_Check(self)) {
330         const change_record *old = get_old_record(self, c);
331         if (old->category_changed == 0)
332             index = 0; /* unassigned */
333     }
334     return index;
335 }
336 
337 /*[clinic input]
338 unicodedata.UCD.mirrored -> int
339 
340     self: self
341     chr: int(accept={str})
342     /
343 
344 Returns the mirrored property assigned to the character chr as integer.
345 
346 Returns 1 if the character has been identified as a "mirrored"
347 character in bidirectional text, 0 otherwise.
348 [clinic start generated code]*/
349 
350 static int
unicodedata_UCD_mirrored_impl(PyObject * self,int chr)351 unicodedata_UCD_mirrored_impl(PyObject *self, int chr)
352 /*[clinic end generated code: output=2532dbf8121b50e6 input=5dd400d351ae6f3b]*/
353 {
354     int index;
355     Py_UCS4 c = (Py_UCS4)chr;
356     index = (int) _getrecord_ex(c)->mirrored;
357     if (UCD_Check(self)) {
358         const change_record *old = get_old_record(self, c);
359         if (old->category_changed == 0)
360             index = 0; /* unassigned */
361         else if (old->mirrored_changed != 0xFF)
362             index = old->mirrored_changed;
363     }
364     return index;
365 }
366 
367 /*[clinic input]
368 unicodedata.UCD.east_asian_width
369 
370     self: self
371     chr: int(accept={str})
372     /
373 
374 Returns the east asian width assigned to the character chr as string.
375 [clinic start generated code]*/
376 
377 static PyObject *
unicodedata_UCD_east_asian_width_impl(PyObject * self,int chr)378 unicodedata_UCD_east_asian_width_impl(PyObject *self, int chr)
379 /*[clinic end generated code: output=484e8537d9ee8197 input=c4854798aab026e0]*/
380 {
381     int index;
382     Py_UCS4 c = (Py_UCS4)chr;
383     index = (int) _getrecord_ex(c)->east_asian_width;
384     if (UCD_Check(self)) {
385         const change_record *old = get_old_record(self, c);
386         if (old->category_changed == 0)
387             index = 0; /* unassigned */
388         else if (old->east_asian_width_changed != 0xFF)
389             index = old->east_asian_width_changed;
390     }
391     return PyUnicode_FromString(_PyUnicode_EastAsianWidthNames[index]);
392 }
393 
394 /*[clinic input]
395 unicodedata.UCD.decomposition
396 
397     self: self
398     chr: int(accept={str})
399     /
400 
401 Returns the character decomposition mapping assigned to the character chr as string.
402 
403 An empty string is returned in case no such mapping is defined.
404 [clinic start generated code]*/
405 
406 static PyObject *
unicodedata_UCD_decomposition_impl(PyObject * self,int chr)407 unicodedata_UCD_decomposition_impl(PyObject *self, int chr)
408 /*[clinic end generated code: output=7d699f3ec7565d27 input=e4c12459ad68507b]*/
409 {
410     char decomp[256];
411     int code, index, count;
412     size_t i;
413     unsigned int prefix_index;
414     Py_UCS4 c = (Py_UCS4)chr;
415 
416     code = (int)c;
417 
418     if (UCD_Check(self)) {
419         const change_record *old = get_old_record(self, c);
420         if (old->category_changed == 0)
421             return PyUnicode_FromString(""); /* unassigned */
422     }
423 
424     if (code < 0 || code >= 0x110000)
425         index = 0;
426     else {
427         index = decomp_index1[(code>>DECOMP_SHIFT)];
428         index = decomp_index2[(index<<DECOMP_SHIFT)+
429                              (code&((1<<DECOMP_SHIFT)-1))];
430     }
431 
432     /* high byte is number of hex bytes (usually one or two), low byte
433        is prefix code (from*/
434     count = decomp_data[index] >> 8;
435 
436     /* XXX: could allocate the PyString up front instead
437        (strlen(prefix) + 5 * count + 1 bytes) */
438 
439     /* Based on how index is calculated above and decomp_data is generated
440        from Tools/unicode/makeunicodedata.py, it should not be possible
441        to overflow decomp_prefix. */
442     prefix_index = decomp_data[index] & 255;
443     assert(prefix_index < Py_ARRAY_LENGTH(decomp_prefix));
444 
445     /* copy prefix */
446     i = strlen(decomp_prefix[prefix_index]);
447     memcpy(decomp, decomp_prefix[prefix_index], i);
448 
449     while (count-- > 0) {
450         if (i)
451             decomp[i++] = ' ';
452         assert(i < sizeof(decomp));
453         PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
454                       decomp_data[++index]);
455         i += strlen(decomp + i);
456     }
457     return PyUnicode_FromStringAndSize(decomp, i);
458 }
459 
460 static void
get_decomp_record(PyObject * self,Py_UCS4 code,int * index,int * prefix,int * count)461 get_decomp_record(PyObject *self, Py_UCS4 code,
462                   int *index, int *prefix, int *count)
463 {
464     if (code >= 0x110000) {
465         *index = 0;
466     }
467     else if (UCD_Check(self)
468              && get_old_record(self, code)->category_changed==0) {
469         /* unassigned in old version */
470         *index = 0;
471     }
472     else {
473         *index = decomp_index1[(code>>DECOMP_SHIFT)];
474         *index = decomp_index2[(*index<<DECOMP_SHIFT)+
475                                (code&((1<<DECOMP_SHIFT)-1))];
476     }
477 
478     /* high byte is number of hex bytes (usually one or two), low byte
479        is prefix code (from*/
480     *count = decomp_data[*index] >> 8;
481     *prefix = decomp_data[*index] & 255;
482 
483     (*index)++;
484 }
485 
/* Constants of the arithmetic Hangul syllable (de)composition used by
   nfd_nfkd() and nfc_nfkc(). */
#define SBase   0xAC00                  /* first precomposed syllable */
#define LBase   0x1100                  /* first leading consonant */
#define VBase   0x1161                  /* first vowel */
#define TBase   0x11A7                  /* one below the first trailing
                                           consonant */
#define LCount  19
#define VCount  21
#define TCount  28
#define NCount  (VCount * TCount)
#define SCount  (LCount * NCount)
495 
496 static PyObject*
nfd_nfkd(PyObject * self,PyObject * input,int k)497 nfd_nfkd(PyObject *self, PyObject *input, int k)
498 {
499     PyObject *result;
500     Py_UCS4 *output;
501     Py_ssize_t i, o, osize;
502     int kind;
503     const void *data;
504     /* Longest decomposition in Unicode 3.2: U+FDFA */
505     Py_UCS4 stack[20];
506     Py_ssize_t space, isize;
507     int index, prefix, count, stackptr;
508     unsigned char prev, cur;
509 
510     stackptr = 0;
511     isize = PyUnicode_GET_LENGTH(input);
512     space = isize;
513     /* Overallocate at most 10 characters. */
514     if (space > 10) {
515         if (space <= PY_SSIZE_T_MAX - 10)
516             space += 10;
517     }
518     else {
519         space *= 2;
520     }
521     osize = space;
522     output = PyMem_NEW(Py_UCS4, space);
523     if (!output) {
524         PyErr_NoMemory();
525         return NULL;
526     }
527     i = o = 0;
528     kind = PyUnicode_KIND(input);
529     data = PyUnicode_DATA(input);
530 
531     while (i < isize) {
532         stack[stackptr++] = PyUnicode_READ(kind, data, i++);
533         while(stackptr) {
534             Py_UCS4 code = stack[--stackptr];
535             /* Hangul Decomposition adds three characters in
536                a single step, so we need at least that much room. */
537             if (space < 3) {
538                 Py_UCS4 *new_output;
539                 osize += 10;
540                 space += 10;
541                 new_output = PyMem_Realloc(output, osize*sizeof(Py_UCS4));
542                 if (new_output == NULL) {
543                     PyMem_Free(output);
544                     PyErr_NoMemory();
545                     return NULL;
546                 }
547                 output = new_output;
548             }
549             /* Hangul Decomposition. */
550             if (SBase <= code && code < (SBase+SCount)) {
551                 int SIndex = code - SBase;
552                 int L = LBase + SIndex / NCount;
553                 int V = VBase + (SIndex % NCount) / TCount;
554                 int T = TBase + SIndex % TCount;
555                 output[o++] = L;
556                 output[o++] = V;
557                 space -= 2;
558                 if (T != TBase) {
559                     output[o++] = T;
560                     space --;
561                 }
562                 continue;
563             }
564             /* normalization changes */
565             if (UCD_Check(self)) {
566                 Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
567                 if (value != 0) {
568                     stack[stackptr++] = value;
569                     continue;
570                 }
571             }
572 
573             /* Other decompositions. */
574             get_decomp_record(self, code, &index, &prefix, &count);
575 
576             /* Copy character if it is not decomposable, or has a
577                compatibility decomposition, but we do NFD. */
578             if (!count || (prefix && !k)) {
579                 output[o++] = code;
580                 space--;
581                 continue;
582             }
583             /* Copy decomposition onto the stack, in reverse
584                order.  */
585             while(count) {
586                 code = decomp_data[index + (--count)];
587                 stack[stackptr++] = code;
588             }
589         }
590     }
591 
592     result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
593                                        output, o);
594     PyMem_Free(output);
595     if (!result)
596         return NULL;
597     /* result is guaranteed to be ready, as it is compact. */
598     kind = PyUnicode_KIND(result);
599     data = PyUnicode_DATA(result);
600 
601     /* Sort canonically. */
602     i = 0;
603     prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
604     for (i++; i < PyUnicode_GET_LENGTH(result); i++) {
605         cur = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
606         if (prev == 0 || cur == 0 || prev <= cur) {
607             prev = cur;
608             continue;
609         }
610         /* Non-canonical order. Need to switch *i with previous. */
611         o = i - 1;
612         while (1) {
613             Py_UCS4 tmp = PyUnicode_READ(kind, data, o+1);
614             PyUnicode_WRITE(kind, data, o+1,
615                             PyUnicode_READ(kind, data, o));
616             PyUnicode_WRITE(kind, data, o, tmp);
617             o--;
618             if (o < 0)
619                 break;
620             prev = _getrecord_ex(PyUnicode_READ(kind, data, o))->combining;
621             if (prev == 0 || prev <= cur)
622                 break;
623         }
624         prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
625     }
626     return result;
627 }
628 
629 static int
find_nfc_index(const struct reindex * nfc,Py_UCS4 code)630 find_nfc_index(const struct reindex* nfc, Py_UCS4 code)
631 {
632     unsigned int index;
633     for (index = 0; nfc[index].start; index++) {
634         unsigned int start = nfc[index].start;
635         if (code < start)
636             return -1;
637         if (code <= start + nfc[index].count) {
638             unsigned int delta = code - start;
639             return nfc[index].index + delta;
640         }
641     }
642     return -1;
643 }
644 
645 static PyObject*
nfc_nfkc(PyObject * self,PyObject * input,int k)646 nfc_nfkc(PyObject *self, PyObject *input, int k)
647 {
648     PyObject *result;
649     int kind;
650     const void *data;
651     Py_UCS4 *output;
652     Py_ssize_t i, i1, o, len;
653     int f,l,index,index1,comb;
654     Py_UCS4 code;
655     Py_ssize_t skipped[20];
656     int cskipped = 0;
657 
658     result = nfd_nfkd(self, input, k);
659     if (!result)
660         return NULL;
661     /* result will be "ready". */
662     kind = PyUnicode_KIND(result);
663     data = PyUnicode_DATA(result);
664     len = PyUnicode_GET_LENGTH(result);
665 
666     /* We allocate a buffer for the output.
667        If we find that we made no changes, we still return
668        the NFD result. */
669     output = PyMem_NEW(Py_UCS4, len);
670     if (!output) {
671         PyErr_NoMemory();
672         Py_DECREF(result);
673         return 0;
674     }
675     i = o = 0;
676 
677   again:
678     while (i < len) {
679       for (index = 0; index < cskipped; index++) {
680           if (skipped[index] == i) {
681               /* *i character is skipped.
682                  Remove from list. */
683               skipped[index] = skipped[cskipped-1];
684               cskipped--;
685               i++;
686               goto again; /* continue while */
687           }
688       }
689       /* Hangul Composition. We don't need to check for <LV,T>
690          pairs, since we always have decomposed data. */
691       code = PyUnicode_READ(kind, data, i);
692       if (LBase <= code && code < (LBase+LCount) &&
693           i + 1 < len &&
694           VBase <= PyUnicode_READ(kind, data, i+1) &&
695           PyUnicode_READ(kind, data, i+1) < (VBase+VCount)) {
696           /* check L character is a modern leading consonant (0x1100 ~ 0x1112)
697              and V character is a modern vowel (0x1161 ~ 0x1175). */
698           int LIndex, VIndex;
699           LIndex = code - LBase;
700           VIndex = PyUnicode_READ(kind, data, i+1) - VBase;
701           code = SBase + (LIndex*VCount+VIndex)*TCount;
702           i+=2;
703           if (i < len &&
704               TBase < PyUnicode_READ(kind, data, i) &&
705               PyUnicode_READ(kind, data, i) < (TBase+TCount)) {
706               /* check T character is a modern trailing consonant
707                  (0x11A8 ~ 0x11C2). */
708               code += PyUnicode_READ(kind, data, i)-TBase;
709               i++;
710           }
711           output[o++] = code;
712           continue;
713       }
714 
715       /* code is still input[i] here */
716       f = find_nfc_index(nfc_first, code);
717       if (f == -1) {
718           output[o++] = code;
719           i++;
720           continue;
721       }
722       /* Find next unblocked character. */
723       i1 = i+1;
724       comb = 0;
725       /* output base character for now; might be updated later. */
726       output[o] = PyUnicode_READ(kind, data, i);
727       while (i1 < len) {
728           Py_UCS4 code1 = PyUnicode_READ(kind, data, i1);
729           int comb1 = _getrecord_ex(code1)->combining;
730           if (comb) {
731               if (comb1 == 0)
732                   break;
733               if (comb >= comb1) {
734                   /* Character is blocked. */
735                   i1++;
736                   continue;
737               }
738           }
739           l = find_nfc_index(nfc_last, code1);
740           /* i1 cannot be combined with i. If i1
741              is a starter, we don't need to look further.
742              Otherwise, record the combining class. */
743           if (l == -1) {
744             not_combinable:
745               if (comb1 == 0)
746                   break;
747               comb = comb1;
748               i1++;
749               continue;
750           }
751           index = f*TOTAL_LAST + l;
752           index1 = comp_index[index >> COMP_SHIFT];
753           code = comp_data[(index1<<COMP_SHIFT)+
754                            (index&((1<<COMP_SHIFT)-1))];
755           if (code == 0)
756               goto not_combinable;
757 
758           /* Replace the original character. */
759           output[o] = code;
760           /* Mark the second character unused. */
761           assert(cskipped < 20);
762           skipped[cskipped++] = i1;
763           i1++;
764           f = find_nfc_index(nfc_first, output[o]);
765           if (f == -1)
766               break;
767       }
768       /* Output character was already written.
769          Just advance the indices. */
770       o++; i++;
771     }
772     if (o == len) {
773         /* No changes. Return original string. */
774         PyMem_Free(output);
775         return result;
776     }
777     Py_DECREF(result);
778     result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
779                                        output, o);
780     PyMem_Free(output);
781     return result;
782 }
783 
// Outcome of the normalization quick check.  The numeric values need to
// match the logic in makeunicodedata.py which constructs the quickcheck
// data, so they are spelled out rather than left to the compiler.
typedef enum {
    YES = 0,
    MAYBE = 1,
    NO = 2
} QuickcheckResult;
787 
788 /* Run the Unicode normalization "quickcheck" algorithm.
789  *
790  * Return YES or NO if quickcheck determines the input is certainly
791  * normalized or certainly not, and MAYBE if quickcheck is unable to
792  * tell.
793  *
794  * If `yes_only` is true, then return MAYBE as soon as we determine
795  * the answer is not YES.
796  *
797  * For background and details on the algorithm, see UAX #15:
798  *   https://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms
799  */
800 static QuickcheckResult
is_normalized_quickcheck(PyObject * self,PyObject * input,bool nfc,bool k,bool yes_only)801 is_normalized_quickcheck(PyObject *self, PyObject *input, bool nfc, bool k,
802                          bool yes_only)
803 {
804     /* UCD 3.2.0 is requested, quickchecks must be disabled. */
805     if (UCD_Check(self)) {
806         return MAYBE;
807     }
808 
809     if (PyUnicode_IS_ASCII(input)) {
810         return YES;
811     }
812 
813     Py_ssize_t i, len;
814     int kind;
815     const void *data;
816     unsigned char prev_combining = 0;
817 
818     /* The two quickcheck bits at this shift have type QuickcheckResult. */
819     int quickcheck_shift = (nfc ? 4 : 0) + (k ? 2 : 0);
820 
821     QuickcheckResult result = YES; /* certainly normalized, unless we find something */
822 
823     i = 0;
824     kind = PyUnicode_KIND(input);
825     data = PyUnicode_DATA(input);
826     len = PyUnicode_GET_LENGTH(input);
827     while (i < len) {
828         Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
829         const _PyUnicode_DatabaseRecord *record = _getrecord_ex(ch);
830 
831         unsigned char combining = record->combining;
832         if (combining && prev_combining > combining)
833             return NO; /* non-canonical sort order, not normalized */
834         prev_combining = combining;
835 
836         unsigned char quickcheck_whole = record->normalization_quick_check;
837         if (yes_only) {
838             if (quickcheck_whole & (3 << quickcheck_shift))
839                 return MAYBE;
840         } else {
841             switch ((quickcheck_whole >> quickcheck_shift) & 3) {
842             case NO:
843               return NO;
844             case MAYBE:
845               result = MAYBE; /* this string might need normalization */
846             }
847         }
848     }
849     return result;
850 }
851 
852 /*[clinic input]
853 unicodedata.UCD.is_normalized
854 
855     self: self
856     form: unicode
857     unistr as input: unicode
858     /
859 
860 Return whether the Unicode string unistr is in the normal form 'form'.
861 
862 Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.
863 [clinic start generated code]*/
864 
865 static PyObject *
unicodedata_UCD_is_normalized_impl(PyObject * self,PyObject * form,PyObject * input)866 unicodedata_UCD_is_normalized_impl(PyObject *self, PyObject *form,
867                                    PyObject *input)
868 /*[clinic end generated code: output=11e5a3694e723ca5 input=a544f14cea79e508]*/
869 {
870     if (PyUnicode_READY(input) == -1) {
871         return NULL;
872     }
873 
874     if (PyUnicode_GET_LENGTH(input) == 0) {
875         /* special case empty input strings. */
876         Py_RETURN_TRUE;
877     }
878 
879     PyObject *result;
880     bool nfc = false;
881     bool k = false;
882     QuickcheckResult m;
883 
884     PyObject *cmp;
885     int match = 0;
886 
887     if (PyUnicode_CompareWithASCIIString(form, "NFC") == 0) {
888         nfc = true;
889     }
890     else if (PyUnicode_CompareWithASCIIString(form, "NFKC") == 0) {
891         nfc = true;
892         k = true;
893     }
894     else if (PyUnicode_CompareWithASCIIString(form, "NFD") == 0) {
895         /* matches default values for `nfc` and `k` */
896     }
897     else if (PyUnicode_CompareWithASCIIString(form, "NFKD") == 0) {
898         k = true;
899     }
900     else {
901         PyErr_SetString(PyExc_ValueError, "invalid normalization form");
902         return NULL;
903     }
904 
905     m = is_normalized_quickcheck(self, input, nfc, k, false);
906 
907     if (m == MAYBE) {
908         cmp = (nfc ? nfc_nfkc : nfd_nfkd)(self, input, k);
909         if (cmp == NULL) {
910             return NULL;
911         }
912         match = PyUnicode_Compare(input, cmp);
913         Py_DECREF(cmp);
914         result = (match == 0) ? Py_True : Py_False;
915     }
916     else {
917         result = (m == YES) ? Py_True : Py_False;
918     }
919 
920     Py_INCREF(result);
921     return result;
922 }
923 
924 
925 /*[clinic input]
926 unicodedata.UCD.normalize
927 
928     self: self
929     form: unicode
930     unistr as input: unicode
931     /
932 
933 Return the normal form 'form' for the Unicode string unistr.
934 
935 Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.
936 [clinic start generated code]*/
937 
938 static PyObject *
unicodedata_UCD_normalize_impl(PyObject * self,PyObject * form,PyObject * input)939 unicodedata_UCD_normalize_impl(PyObject *self, PyObject *form,
940                                PyObject *input)
941 /*[clinic end generated code: output=05ca4385a2ad6983 input=3a5206c0ad2833fb]*/
942 {
943     if (PyUnicode_GET_LENGTH(input) == 0) {
944         /* Special case empty input strings, since resizing
945            them  later would cause internal errors. */
946         Py_INCREF(input);
947         return input;
948     }
949 
950     if (PyUnicode_CompareWithASCIIString(form, "NFC") == 0) {
951         if (is_normalized_quickcheck(self, input,
952                                      true,  false, true) == YES) {
953             Py_INCREF(input);
954             return input;
955         }
956         return nfc_nfkc(self, input, 0);
957     }
958     if (PyUnicode_CompareWithASCIIString(form, "NFKC") == 0) {
959         if (is_normalized_quickcheck(self, input,
960                                      true,  true,  true) == YES) {
961             Py_INCREF(input);
962             return input;
963         }
964         return nfc_nfkc(self, input, 1);
965     }
966     if (PyUnicode_CompareWithASCIIString(form, "NFD") == 0) {
967         if (is_normalized_quickcheck(self, input,
968                                      false, false, true) == YES) {
969             Py_INCREF(input);
970             return input;
971         }
972         return nfd_nfkd(self, input, 0);
973     }
974     if (PyUnicode_CompareWithASCIIString(form, "NFKD") == 0) {
975         if (is_normalized_quickcheck(self, input,
976                                      false, true,  true) == YES) {
977             Py_INCREF(input);
978             return input;
979         }
980         return nfd_nfkd(self, input, 1);
981     }
982     PyErr_SetString(PyExc_ValueError, "invalid normalization form");
983     return NULL;
984 }
985 
986 /* -------------------------------------------------------------------- */
987 /* unicode character name tables */
988 
989 /* data file generated by Tools/unicode/makeunicodedata.py */
990 #include "unicodename_db.h"
991 
992 /* -------------------------------------------------------------------- */
993 /* database code (cut and pasted from the unidb package) */
994 
/* Hash the first `len` bytes of `s`, treating each byte through
   Py_TOUPPER() so that the result does not depend on letter case.
   `scale` is the multiplier applied to the running hash at every step.
   Whenever bits 24-31 of the running value become non-zero they are
   XORed into the low byte and the value is masked to 24 bits.
   Returns 0 when `len` is not positive. */
static unsigned long
_gethash(const char *s, int len, int scale)
{
    unsigned long hash = 0;

    for (int pos = 0; pos < len; pos++) {
        hash = (hash * scale) + (unsigned char) Py_TOUPPER(s[pos]);
        unsigned long high = hash & 0xff000000;
        if (high) {
            hash = (hash ^ ((high>>24) & 0xff)) & 0x00ffffff;
        }
    }
    return hash;
}
1009 
/* Name fragments used to build and to parse Hangul syllable names.
   Each column is an independent list, indexed by the L, V and T values
   computed in _getucname() / _getcode():
     column 0 -- 19 entries (rows 0..18),
     column 1 -- 21 entries (rows 0..20),
     column 2 -- 28 entries (rows 0..27).
   The shorter columns are padded with NULL; an empty string is a real
   entry (a fragment that contributes nothing to the name). */
static const char * const hangul_syllables[][3] = {
    /*  col 0   col 1   col 2 */
    {   "G",    "A",    ""   },
    {   "GG",   "AE",   "G"  },
    {   "N",    "YA",   "GG" },
    {   "D",    "YAE",  "GS" },
    {   "DD",   "EO",   "N"  },
    {   "R",    "E",    "NJ" },
    {   "M",    "YEO",  "NH" },
    {   "B",    "YE",   "D"  },
    {   "BB",   "O",    "L"  },
    {   "S",    "WA",   "LG" },
    {   "SS",   "WAE",  "LM" },
    {   "",     "OE",   "LB" },
    {   "J",    "YO",   "LS" },
    {   "JJ",   "U",    "LT" },
    {   "C",    "WEO",  "LP" },
    {   "K",    "WE",   "LH" },
    {   "T",    "WI",   "M"  },
    {   "P",    "YU",   "B"  },
    {   "H",    "EU",   "BS" },
    {   NULL,   "YI",   "S"  },
    {   NULL,   "I",    "SS" },
    {   NULL,   NULL,   "NG" },
    {   NULL,   NULL,   "J"  },
    {   NULL,   NULL,   "C"  },
    {   NULL,   NULL,   "K"  },
    {   NULL,   NULL,   "T"  },
    {   NULL,   NULL,   "P"  },
    {   NULL,   NULL,   "H"  }
};
1040 
1041 /* These ranges need to match makeunicodedata.py:cjk_ranges. */
1042 static int
is_unified_ideograph(Py_UCS4 code)1043 is_unified_ideograph(Py_UCS4 code)
1044 {
1045     return
1046         (0x3400 <= code && code <= 0x4DBF)   || /* CJK Ideograph Extension A */
1047         (0x4E00 <= code && code <= 0x9FFF)   || /* CJK Ideograph */
1048         (0x20000 <= code && code <= 0x2A6DF) || /* CJK Ideograph Extension B */
1049         (0x2A700 <= code && code <= 0x2B738) || /* CJK Ideograph Extension C */
1050         (0x2B740 <= code && code <= 0x2B81D) || /* CJK Ideograph Extension D */
1051         (0x2B820 <= code && code <= 0x2CEA1) || /* CJK Ideograph Extension E */
1052         (0x2CEB0 <= code && code <= 0x2EBE0) || /* CJK Ideograph Extension F */
1053         (0x30000 <= code && code <= 0x3134A);   /* CJK Ideograph Extension G */
1054 }
1055 
/* macros used to determine if the given code point is in the PUA range that
 * we are using to store aliases and named sequences.
 * The argument is parenthesized so that an expression (e.g. `x & 0xFF`) can
 * be passed safely.  It is evaluated twice, so it must not have side
 * effects. */
#define IS_ALIAS(cp) (((cp) >= aliases_start) && ((cp) < aliases_end))
#define IS_NAMED_SEQ(cp) (((cp) >= named_sequences_start) && \
                          ((cp) < named_sequences_end))
1061 
1062 static int
_getucname(PyObject * self,Py_UCS4 code,char * buffer,int buflen,int with_alias_and_seq)1063 _getucname(PyObject *self,
1064            Py_UCS4 code, char* buffer, int buflen, int with_alias_and_seq)
1065 {
1066     /* Find the name associated with the given code point.
1067      * If with_alias_and_seq is 1, check for names in the Private Use Area 15
1068      * that we are using for aliases and named sequences. */
1069     int offset;
1070     int i;
1071     int word;
1072     const unsigned char* w;
1073 
1074     if (code >= 0x110000)
1075         return 0;
1076 
1077     /* XXX should we just skip all the code points in the PUAs here? */
1078     if (!with_alias_and_seq && (IS_ALIAS(code) || IS_NAMED_SEQ(code)))
1079         return 0;
1080 
1081     if (UCD_Check(self)) {
1082         /* in 3.2.0 there are no aliases and named sequences */
1083         const change_record *old;
1084         if (IS_ALIAS(code) || IS_NAMED_SEQ(code))
1085             return 0;
1086         old = get_old_record(self, code);
1087         if (old->category_changed == 0) {
1088             /* unassigned */
1089             return 0;
1090         }
1091     }
1092 
1093     if (SBase <= code && code < SBase+SCount) {
1094         /* Hangul syllable. */
1095         int SIndex = code - SBase;
1096         int L = SIndex / NCount;
1097         int V = (SIndex % NCount) / TCount;
1098         int T = SIndex % TCount;
1099 
1100         if (buflen < 27)
1101             /* Worst case: HANGUL SYLLABLE <10chars>. */
1102             return 0;
1103         strcpy(buffer, "HANGUL SYLLABLE ");
1104         buffer += 16;
1105         strcpy(buffer, hangul_syllables[L][0]);
1106         buffer += strlen(hangul_syllables[L][0]);
1107         strcpy(buffer, hangul_syllables[V][1]);
1108         buffer += strlen(hangul_syllables[V][1]);
1109         strcpy(buffer, hangul_syllables[T][2]);
1110         buffer += strlen(hangul_syllables[T][2]);
1111         *buffer = '\0';
1112         return 1;
1113     }
1114 
1115     if (is_unified_ideograph(code)) {
1116         if (buflen < 28)
1117             /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
1118             return 0;
1119         sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
1120         return 1;
1121     }
1122 
1123     /* get offset into phrasebook */
1124     offset = phrasebook_offset1[(code>>phrasebook_shift)];
1125     offset = phrasebook_offset2[(offset<<phrasebook_shift) +
1126                                (code&((1<<phrasebook_shift)-1))];
1127     if (!offset)
1128         return 0;
1129 
1130     i = 0;
1131 
1132     for (;;) {
1133         /* get word index */
1134         word = phrasebook[offset] - phrasebook_short;
1135         if (word >= 0) {
1136             word = (word << 8) + phrasebook[offset+1];
1137             offset += 2;
1138         } else
1139             word = phrasebook[offset++];
1140         if (i) {
1141             if (i > buflen)
1142                 return 0; /* buffer overflow */
1143             buffer[i++] = ' ';
1144         }
1145         /* copy word string from lexicon.  the last character in the
1146            word has bit 7 set.  the last word in a string ends with
1147            0x80 */
1148         w = lexicon + lexicon_offset[word];
1149         while (*w < 128) {
1150             if (i >= buflen)
1151                 return 0; /* buffer overflow */
1152             buffer[i++] = *w++;
1153         }
1154         if (i >= buflen)
1155             return 0; /* buffer overflow */
1156         buffer[i++] = *w & 127;
1157         if (*w == 128)
1158             break; /* end of word */
1159     }
1160 
1161     return 1;
1162 }
1163 
1164 static int
capi_getucname(Py_UCS4 code,char * buffer,int buflen,int with_alias_and_seq)1165 capi_getucname(Py_UCS4 code,
1166                char* buffer, int buflen,
1167                int with_alias_and_seq)
1168 {
1169     return _getucname(NULL, code, buffer, buflen, with_alias_and_seq);
1170 
1171 }
1172 
1173 static int
_cmpname(PyObject * self,int code,const char * name,int namelen)1174 _cmpname(PyObject *self, int code, const char* name, int namelen)
1175 {
1176     /* check if code corresponds to the given name */
1177     int i;
1178     char buffer[NAME_MAXLEN+1];
1179     if (!_getucname(self, code, buffer, NAME_MAXLEN, 1))
1180         return 0;
1181     for (i = 0; i < namelen; i++) {
1182         if (Py_TOUPPER(name[i]) != buffer[i])
1183             return 0;
1184     }
1185     return buffer[namelen] == '\0';
1186 }
1187 
1188 static void
find_syllable(const char * str,int * len,int * pos,int count,int column)1189 find_syllable(const char *str, int *len, int *pos, int count, int column)
1190 {
1191     int i, len1;
1192     *len = -1;
1193     for (i = 0; i < count; i++) {
1194         const char *s = hangul_syllables[i][column];
1195         len1 = Py_SAFE_DOWNCAST(strlen(s), size_t, int);
1196         if (len1 <= *len)
1197             continue;
1198         if (strncmp(str, s, len1) == 0) {
1199             *len = len1;
1200             *pos = i;
1201         }
1202     }
1203     if (*len == -1) {
1204         *len = 0;
1205     }
1206 }
1207 
1208 static int
_check_alias_and_seq(unsigned int cp,Py_UCS4 * code,int with_named_seq)1209 _check_alias_and_seq(unsigned int cp, Py_UCS4* code, int with_named_seq)
1210 {
1211     /* check if named sequences are allowed */
1212     if (!with_named_seq && IS_NAMED_SEQ(cp))
1213         return 0;
1214     /* if the code point is in the PUA range that we use for aliases,
1215      * convert it to obtain the right code point */
1216     if (IS_ALIAS(cp))
1217         *code = name_aliases[cp-aliases_start];
1218     else
1219         *code = cp;
1220     return 1;
1221 }
1222 
/* Look up a character name and store the matching code point in *code.
 * Returns 1 if the name was found and 0 otherwise (no exception is set
 * here).  Three kinds of names are handled, in this order:
 *   1. "HANGUL SYLLABLE ..." names, decoded with find_syllable();
 *   2. "CJK UNIFIED IDEOGRAPH-XXXX[X]" names, decoded from the hex digits;
 *   3. everything else, searched in the code_hash table.
 * The two prefixes are matched with strncmp(), i.e. case-sensitively,
 * whereas the table search goes through _gethash() and _cmpname(). */
static int
_getcode(PyObject* self,
         const char* name, int namelen, Py_UCS4* code, int with_named_seq)
{
    /* Return the code point associated with the given name.
     * Named aliases are resolved too (unless self != NULL (i.e. we are using
     * 3.2.0)).  If with_named_seq is 1, returns the PUA code point that we are
     * using for the named sequence, and the caller must then convert it. */
    unsigned int h, v;
    unsigned int mask = code_size-1;
    unsigned int i, incr;

    /* Check for hangul syllables. */
    if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
        int len, L = -1, V = -1, T = -1;
        const char *pos = name + 16;
        /* Consume the three name fragments one after the other; each call
           advances `pos` by the length of the fragment it matched. */
        find_syllable(pos, &len, &L, LCount, 0);
        pos += len;
        find_syllable(pos, &len, &V, VCount, 1);
        pos += len;
        find_syllable(pos, &len, &T, TCount, 2);
        pos += len;
        /* All three parts must have matched and, together with the
           prefix, must account for the whole name. */
        if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
            *code = SBase + (L*VCount+V)*TCount + T;
            return 1;
        }
        /* Otherwise, it's an illegal syllable name. */
        return 0;
    }

    /* Check for unified ideographs. */
    if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
        /* Four or five hexdigits must follow. */
        v = 0;
        name += 22;
        namelen -= 22;
        if (namelen != 4 && namelen != 5)
            return 0;
        /* Only the digits 0-9 and the upper-case letters A-F are
           accepted. */
        while (namelen--) {
            v *= 16;
            if (*name >= '0' && *name <= '9')
                v += *name - '0';
            else if (*name >= 'A' && *name <= 'F')
                v += *name - 'A' + 10;
            else
                return 0;
            name++;
        }
        if (!is_unified_ideograph(v))
            return 0;
        *code = v;
        return 1;
    }

    /* the following is the same as python's dictionary lookup, with
       only minor changes.  see the makeunicodedata script for more
       details */

    /* First probe.  A slot holding 0 is empty and ends the search. */
    h = (unsigned int) _gethash(name, namelen, code_magic);
    i = (~h) & mask;
    v = code_hash[i];
    if (!v)
        return 0;
    if (_cmpname(self, v, name, namelen)) {
        return _check_alias_and_seq(v, code, with_named_seq);
    }
    /* Collision: keep probing with a step derived from the hash.  The
       step is doubled after every probe and folded back with code_poly
       once it exceeds the mask. */
    incr = (h ^ (h >> 3)) & mask;
    if (!incr)
        incr = mask;
    for (;;) {
        i = (i + incr) & mask;
        v = code_hash[i];
        if (!v)
            return 0;
        if (_cmpname(self, v, name, namelen)) {
            return _check_alias_and_seq(v, code, with_named_seq);
        }
        incr = incr << 1;
        if (incr > mask)
            incr = incr ^ code_poly;
    }
}
1305 
1306 static int
capi_getcode(const char * name,int namelen,Py_UCS4 * code,int with_named_seq)1307 capi_getcode(const char* name, int namelen, Py_UCS4* code,
1308              int with_named_seq)
1309 {
1310     return _getcode(NULL, name, namelen, code, with_named_seq);
1311 
1312 }
1313 
1314 static void
unicodedata_destroy_capi(PyObject * capsule)1315 unicodedata_destroy_capi(PyObject *capsule)
1316 {
1317     void *capi = PyCapsule_GetPointer(capsule, PyUnicodeData_CAPSULE_NAME);
1318     PyMem_Free(capi);
1319 }
1320 
1321 static PyObject *
unicodedata_create_capi(void)1322 unicodedata_create_capi(void)
1323 {
1324     _PyUnicode_Name_CAPI *capi = PyMem_Malloc(sizeof(_PyUnicode_Name_CAPI));
1325     if (capi == NULL) {
1326         PyErr_NoMemory();
1327         return NULL;
1328     }
1329     capi->getname = capi_getucname;
1330     capi->getcode = capi_getcode;
1331 
1332     PyObject *capsule = PyCapsule_New(capi,
1333                                       PyUnicodeData_CAPSULE_NAME,
1334                                       unicodedata_destroy_capi);
1335     if (capsule == NULL) {
1336         PyMem_Free(capi);
1337     }
1338     return capsule;
1339 };
1340 
1341 
1342 /* -------------------------------------------------------------------- */
1343 /* Python bindings */
1344 
1345 /*[clinic input]
1346 unicodedata.UCD.name
1347 
1348     self: self
1349     chr: int(accept={str})
1350     default: object=NULL
1351     /
1352 
1353 Returns the name assigned to the character chr as a string.
1354 
1355 If no name is defined, default is returned, or, if not given,
1356 ValueError is raised.
1357 [clinic start generated code]*/
1358 
1359 static PyObject *
unicodedata_UCD_name_impl(PyObject * self,int chr,PyObject * default_value)1360 unicodedata_UCD_name_impl(PyObject *self, int chr, PyObject *default_value)
1361 /*[clinic end generated code: output=6bbb37a326407707 input=3e0367f534de56d9]*/
1362 {
1363     char name[NAME_MAXLEN+1];
1364     Py_UCS4 c = (Py_UCS4)chr;
1365 
1366     if (!_getucname(self, c, name, NAME_MAXLEN, 0)) {
1367         if (default_value == NULL) {
1368             PyErr_SetString(PyExc_ValueError, "no such name");
1369             return NULL;
1370         }
1371         else {
1372             Py_INCREF(default_value);
1373             return default_value;
1374         }
1375     }
1376 
1377     return PyUnicode_FromString(name);
1378 }
1379 
1380 /*[clinic input]
1381 unicodedata.UCD.lookup
1382 
1383     self: self
1384     name: str(accept={str, robuffer}, zeroes=True)
1385     /
1386 
1387 Look up character by name.
1388 
1389 If a character with the given name is found, return the
1390 corresponding character.  If not found, KeyError is raised.
1391 [clinic start generated code]*/
1392 
1393 static PyObject *
unicodedata_UCD_lookup_impl(PyObject * self,const char * name,Py_ssize_t name_length)1394 unicodedata_UCD_lookup_impl(PyObject *self, const char *name,
1395                             Py_ssize_t name_length)
1396 /*[clinic end generated code: output=7f03fc4959b242f6 input=a557be0f8607a0d6]*/
1397 {
1398     Py_UCS4 code;
1399     unsigned int index;
1400     if (name_length > NAME_MAXLEN) {
1401         PyErr_SetString(PyExc_KeyError, "name too long");
1402         return NULL;
1403     }
1404 
1405     if (!_getcode(self, name, (int)name_length, &code, 1)) {
1406         PyErr_Format(PyExc_KeyError, "undefined character name '%s'", name);
1407         return NULL;
1408     }
1409     /* check if code is in the PUA range that we use for named sequences
1410        and convert it */
1411     if (IS_NAMED_SEQ(code)) {
1412         index = code-named_sequences_start;
1413         return PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND,
1414                                          named_sequences[index].seq,
1415                                          named_sequences[index].seqlen);
1416     }
1417     return PyUnicode_FromOrdinal(code);
1418 }
1419 
// List of functions used to define module functions *AND* unicodedata.UCD
// methods: the same table is referenced by the Py_tp_methods slot of the
// UCD type and by the m_methods field of the module definition.
// For module functions, self is the module. For UCD methods, self is a UCD
// instance. The UCD_Check() macro is used to check if self is a UCD
// instance.
static PyMethodDef unicodedata_functions[] = {
    UNICODEDATA_UCD_DECIMAL_METHODDEF
    UNICODEDATA_UCD_DIGIT_METHODDEF
    UNICODEDATA_UCD_NUMERIC_METHODDEF
    UNICODEDATA_UCD_CATEGORY_METHODDEF
    UNICODEDATA_UCD_BIDIRECTIONAL_METHODDEF
    UNICODEDATA_UCD_COMBINING_METHODDEF
    UNICODEDATA_UCD_MIRRORED_METHODDEF
    UNICODEDATA_UCD_EAST_ASIAN_WIDTH_METHODDEF
    UNICODEDATA_UCD_DECOMPOSITION_METHODDEF
    UNICODEDATA_UCD_NAME_METHODDEF
    UNICODEDATA_UCD_LOOKUP_METHODDEF
    UNICODEDATA_UCD_IS_NORMALIZED_METHODDEF
    UNICODEDATA_UCD_NORMALIZE_METHODDEF
    {NULL, NULL}                /* sentinel */
};
1440 
/* tp_traverse slot of the UCD type (see ucd_type_slots).  The only object
   reported to the garbage collector is the instance's type; ucd_dealloc()
   releases the matching reference. */
static int
ucd_traverse(PreviousDBVersion *self, visitproc visit, void *arg)
{
    Py_VISIT(Py_TYPE(self));
    return 0;
}
1447 
1448 static void
ucd_dealloc(PreviousDBVersion * self)1449 ucd_dealloc(PreviousDBVersion *self)
1450 {
1451     PyTypeObject *tp = Py_TYPE(self);
1452     PyObject_GC_UnTrack(self);
1453     PyObject_GC_Del(self);
1454     Py_DECREF(tp);
1455 }
1456 
/* Slots of the unicodedata.UCD type, consumed through ucd_type_spec.
   The methods are shared with the module-level functions
   (unicodedata_functions). */
static PyType_Slot ucd_type_slots[] = {
    {Py_tp_dealloc, ucd_dealloc},
    {Py_tp_traverse, ucd_traverse},
    {Py_tp_getattro, PyObject_GenericGetAttr},
    {Py_tp_methods, unicodedata_functions},
    {Py_tp_members, DB_members},
    {0, 0}                      /* sentinel */
};
1465 
/* Specification of the unicodedata.UCD type; the type object itself is
   created from it with PyType_FromSpec() in unicodedata_exec().
   Instances are PreviousDBVersion structures built from C by
   new_previous_version(). */
static PyType_Spec ucd_type_spec = {
    .name = "unicodedata.UCD",
    .basicsize = sizeof(PreviousDBVersion),
    .flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_DISALLOW_INSTANTIATION |
              Py_TPFLAGS_HAVE_GC | Py_TPFLAGS_IMMUTABLETYPE),
    .slots = ucd_type_slots
};
1473 
/* Module docstring (used as m_doc of unicodedata_module).  The
   UNIDATA_VERSION string is spliced in at compile time. */
PyDoc_STRVAR(unicodedata_docstring,
"This module provides access to the Unicode Character Database which\n\
defines character properties for all Unicode characters. The data in\n\
this database is based on the UnicodeData.txt file version\n\
" UNIDATA_VERSION " which is publicly available from ftp://ftp.unicode.org/.\n\
\n\
The module uses the same names and symbols as defined by the\n\
UnicodeData File Format " UNIDATA_VERSION ".");
1482 
1483 static int
unicodedata_exec(PyObject * module)1484 unicodedata_exec(PyObject *module)
1485 {
1486     if (PyModule_AddStringConstant(module, "unidata_version", UNIDATA_VERSION) < 0) {
1487         return -1;
1488     }
1489 
1490     PyTypeObject *ucd_type = (PyTypeObject *)PyType_FromSpec(&ucd_type_spec);
1491     if (ucd_type == NULL) {
1492         return -1;
1493     }
1494 
1495     if (PyModule_AddType(module, ucd_type) < 0) {
1496         Py_DECREF(ucd_type);
1497         return -1;
1498     }
1499 
1500     // Unicode database version 3.2.0 used by the IDNA encoding
1501     PyObject *v;
1502     v = new_previous_version(ucd_type, "3.2.0",
1503                              get_change_3_2_0, normalization_3_2_0);
1504     Py_DECREF(ucd_type);
1505     if (v == NULL) {
1506         return -1;
1507     }
1508     if (PyModule_AddObject(module, "ucd_3_2_0", v) < 0) {
1509         Py_DECREF(v);
1510         return -1;
1511     }
1512 
1513     /* Export C API */
1514     PyObject *capsule = unicodedata_create_capi();
1515     if (capsule == NULL) {
1516         return -1;
1517     }
1518     int rc = PyModule_AddObjectRef(module, "_ucnhash_CAPI", capsule);
1519     Py_DECREF(capsule);
1520     if (rc < 0) {
1521         return -1;
1522     }
1523     return 0;
1524 }
1525 
/* Module slots: unicodedata_exec() is registered as the Py_mod_exec
   slot. */
static PyModuleDef_Slot unicodedata_slots[] = {
    {Py_mod_exec, unicodedata_exec},
    {0, NULL}                   /* sentinel */
};
1530 
/* Definition of the unicodedata module.  m_size is 0; the module's
   contents are added by unicodedata_exec() through unicodedata_slots. */
static struct PyModuleDef unicodedata_module = {
    PyModuleDef_HEAD_INIT,
    .m_name = "unicodedata",
    .m_doc = unicodedata_docstring,
    .m_size = 0,
    .m_methods = unicodedata_functions,
    .m_slots = unicodedata_slots,
};
1539 
/* Module initialization function: hands the module definition to
   PyModuleDef_Init() and returns its result. */
PyMODINIT_FUNC
PyInit_unicodedata(void)
{
    return PyModuleDef_Init(&unicodedata_module);
}
1545 
1546 
1547 /*
1548 Local variables:
1549 c-basic-offset: 4
1550 indent-tabs-mode: nil
1551 End:
1552 */
1553