1 /* ------------------------------------------------------------------------
2
3 unicodedata -- Provides access to the Unicode database.
4
5 The current version number is reported in the unidata_version constant.
6
7 Written by Marc-Andre Lemburg ([email protected]).
8 Modified for Python 2.0 by Fredrik Lundh ([email protected])
9 Modified by Martin v. Löwis ([email protected])
10
11 Copyright (c) Corporation for National Research Initiatives.
12
13 ------------------------------------------------------------------------ */
14
15 #ifndef Py_BUILD_CORE_BUILTIN
16 # define Py_BUILD_CORE_MODULE 1
17 #endif
18
19 #define PY_SSIZE_T_CLEAN
20
21 #include "Python.h"
22 #include "pycore_ucnhash.h" // _PyUnicode_Name_CAPI
23 #include "structmember.h" // PyMemberDef
24
25 #include <stdbool.h>
26
27 /*[clinic input]
28 module unicodedata
29 class unicodedata.UCD 'PreviousDBVersion *' '<not used>'
30 [clinic start generated code]*/
31 /*[clinic end generated code: output=da39a3ee5e6b4b0d input=e47113e05924be43]*/
32
33 /* character properties */
34
35 typedef struct {
36 const unsigned char category; /* index into
37 _PyUnicode_CategoryNames */
38 const unsigned char combining; /* combining class value 0 - 255 */
39 const unsigned char bidirectional; /* index into
40 _PyUnicode_BidirectionalNames */
41 const unsigned char mirrored; /* true if mirrored in bidir mode */
42 const unsigned char east_asian_width; /* index into
43 _PyUnicode_EastAsianWidth */
44 const unsigned char normalization_quick_check; /* see is_normalized() */
45 } _PyUnicode_DatabaseRecord;
46
47 typedef struct change_record {
48 /* sequence of fields should be the same as in merge_old_version */
49 const unsigned char bidir_changed;
50 const unsigned char category_changed;
51 const unsigned char decimal_changed;
52 const unsigned char mirrored_changed;
53 const unsigned char east_asian_width_changed;
54 const double numeric_changed;
55 } change_record;
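/* Illustrative note (based on how these fields are consumed below, not on
 * the generator itself): a value of 0xFF in one of the unsigned char
 * *_changed fields means "no change relative to the current database",
 * and category_changed == 0 marks a code point that was unassigned in the
 * old database version. */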
56
57 /* data file generated by Tools/unicode/makeunicodedata.py */
58 #include "unicodedata_db.h"
59
60 static const _PyUnicode_DatabaseRecord*
_getrecord_ex(Py_UCS4 code)
62 {
63 int index;
64 if (code >= 0x110000)
65 index = 0;
66 else {
67 index = index1[(code>>SHIFT)];
68 index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
69 }
70
71 return &_PyUnicode_Database_Records[index];
72 }
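/* Illustrative only: how a record from the two-level trie above is
 * typically consumed.  SHIFT, index1, index2 and _PyUnicode_Database_Records
 * all come from the generated unicodedata_db.h, so concrete values are
 * assumptions here:
 *
 *     const _PyUnicode_DatabaseRecord *rec = _getrecord_ex(0x0041);  // 'A'
 *     // rec->category indexes _PyUnicode_CategoryNames ("Lu" for 'A'),
 *     // rec->combining is the canonical combining class (0 for 'A').
 *
 * Code points >= 0x110000 share record 0, the "unassigned" record. */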
73
74 /* ------------- Previous-version API ------------------------------------- */
75 typedef struct previous_version {
76 PyObject_HEAD
77 const char *name;
78 const change_record* (*getrecord)(Py_UCS4);
79 Py_UCS4 (*normalization)(Py_UCS4);
80 } PreviousDBVersion;
81
82 #include "clinic/unicodedata.c.h"
83
84 #define get_old_record(self, v) ((((PreviousDBVersion*)self)->getrecord)(v))
85
86 static PyMemberDef DB_members[] = {
87 {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
88 {NULL}
89 };
90
// Check if self is a unicodedata.UCD instance.
// If self is NULL (when the PyCapsule C API is used), return 0.
// PyModule_Check() is used to avoid having to retrieve the ucd_type.
// See the unicodedata_functions comment for the rationale behind this macro.
95 #define UCD_Check(self) (self != NULL && !PyModule_Check(self))
96
97 static PyObject*
new_previous_version(PyTypeObject *ucd_type,
                     const char *name, const change_record* (*getrecord)(Py_UCS4),
                     Py_UCS4 (*normalization)(Py_UCS4))
101 {
102 PreviousDBVersion *self;
103 self = PyObject_GC_New(PreviousDBVersion, ucd_type);
104 if (self == NULL)
105 return NULL;
106 self->name = name;
107 self->getrecord = getrecord;
108 self->normalization = normalization;
109 PyObject_GC_Track(self);
110 return (PyObject*)self;
111 }
112
113
114 /* --- Module API --------------------------------------------------------- */
115
116 /*[clinic input]
117 unicodedata.UCD.decimal
118
119 self: self
120 chr: int(accept={str})
121 default: object=NULL
122 /
123
124 Converts a Unicode character into its equivalent decimal value.
125
126 Returns the decimal value assigned to the character chr as integer.
127 If no such value is defined, default is returned, or, if not given,
128 ValueError is raised.
129 [clinic start generated code]*/
130
131 static PyObject *
unicodedata_UCD_decimal_impl(PyObject *self, int chr,
                             PyObject *default_value)
134 /*[clinic end generated code: output=be23376e1a185231 input=933f8107993f23d0]*/
135 {
136 int have_old = 0;
137 long rc;
138 Py_UCS4 c = (Py_UCS4)chr;
139
140 if (UCD_Check(self)) {
141 const change_record *old = get_old_record(self, c);
142 if (old->category_changed == 0) {
143 /* unassigned */
144 have_old = 1;
145 rc = -1;
146 }
147 else if (old->decimal_changed != 0xFF) {
148 have_old = 1;
149 rc = old->decimal_changed;
150 }
151 }
152
153 if (!have_old)
154 rc = Py_UNICODE_TODECIMAL(c);
155 if (rc < 0) {
156 if (default_value == NULL) {
157 PyErr_SetString(PyExc_ValueError,
158 "not a decimal");
159 return NULL;
160 }
161 else {
162 Py_INCREF(default_value);
163 return default_value;
164 }
165 }
166 return PyLong_FromLong(rc);
167 }
168
169 /*[clinic input]
170 unicodedata.UCD.digit
171
172 self: self
173 chr: int(accept={str})
174 default: object=NULL
175 /
176
177 Converts a Unicode character into its equivalent digit value.
178
179 Returns the digit value assigned to the character chr as integer.
180 If no such value is defined, default is returned, or, if not given,
181 ValueError is raised.
182 [clinic start generated code]*/
183
184 static PyObject *
unicodedata_UCD_digit_impl(PyObject *self, int chr, PyObject *default_value)
186 /*[clinic end generated code: output=96e18c950171fd2f input=e27d6e4565cd29f2]*/
187 {
188 long rc;
189 Py_UCS4 c = (Py_UCS4)chr;
190 rc = Py_UNICODE_TODIGIT(c);
191 if (rc < 0) {
192 if (default_value == NULL) {
193 PyErr_SetString(PyExc_ValueError, "not a digit");
194 return NULL;
195 }
196 else {
197 Py_INCREF(default_value);
198 return default_value;
199 }
200 }
201 return PyLong_FromLong(rc);
202 }
203
204 /*[clinic input]
205 unicodedata.UCD.numeric
206
207 self: self
208 chr: int(accept={str})
209 default: object=NULL
210 /
211
212 Converts a Unicode character into its equivalent numeric value.
213
214 Returns the numeric value assigned to the character chr as float.
215 If no such value is defined, default is returned, or, if not given,
216 ValueError is raised.
217 [clinic start generated code]*/
218
219 static PyObject *
unicodedata_UCD_numeric_impl(PyObject *self, int chr,
                             PyObject *default_value)
222 /*[clinic end generated code: output=53ce281fe85b10c4 input=fdf5871a5542893c]*/
223 {
224 int have_old = 0;
225 double rc;
226 Py_UCS4 c = (Py_UCS4)chr;
227
228 if (UCD_Check(self)) {
229 const change_record *old = get_old_record(self, c);
230 if (old->category_changed == 0) {
231 /* unassigned */
232 have_old = 1;
233 rc = -1.0;
234 }
235 else if (old->decimal_changed != 0xFF) {
236 have_old = 1;
237 rc = old->decimal_changed;
238 }
239 }
240
241 if (!have_old)
242 rc = Py_UNICODE_TONUMERIC(c);
243 if (rc == -1.0) {
244 if (default_value == NULL) {
245 PyErr_SetString(PyExc_ValueError, "not a numeric character");
246 return NULL;
247 }
248 else {
249 Py_INCREF(default_value);
250 return default_value;
251 }
252 }
253 return PyFloat_FromDouble(rc);
254 }
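/* Illustrative usage of the three lookups above from Python (a sketch;
 * the values come from the current Unicode database):
 *     >>> import unicodedata
 *     >>> unicodedata.decimal('9'), unicodedata.digit('9'), unicodedata.numeric('9')
 *     (9, 9, 9.0)
 *     >>> unicodedata.numeric('\N{VULGAR FRACTION ONE HALF}')
 *     0.5
 *     >>> unicodedata.decimal('\N{VULGAR FRACTION ONE HALF}', None) is None
 *     True
 */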
255
256 /*[clinic input]
257 unicodedata.UCD.category
258
259 self: self
260 chr: int(accept={str})
261 /
262
263 Returns the general category assigned to the character chr as string.
264 [clinic start generated code]*/
265
266 static PyObject *
unicodedata_UCD_category_impl(PyObject *self, int chr)
268 /*[clinic end generated code: output=8571539ee2e6783a input=27d6f3d85050bc06]*/
269 {
270 int index;
271 Py_UCS4 c = (Py_UCS4)chr;
272 index = (int) _getrecord_ex(c)->category;
273 if (UCD_Check(self)) {
274 const change_record *old = get_old_record(self, c);
275 if (old->category_changed != 0xFF)
276 index = old->category_changed;
277 }
278 return PyUnicode_FromString(_PyUnicode_CategoryNames[index]);
279 }
280
281 /*[clinic input]
282 unicodedata.UCD.bidirectional
283
284 self: self
285 chr: int(accept={str})
286 /
287
288 Returns the bidirectional class assigned to the character chr as string.
289
290 If no such value is defined, an empty string is returned.
291 [clinic start generated code]*/
292
293 static PyObject *
unicodedata_UCD_bidirectional_impl(PyObject *self, int chr)
295 /*[clinic end generated code: output=d36310ce2039bb92 input=b3d8f42cebfcf475]*/
296 {
297 int index;
298 Py_UCS4 c = (Py_UCS4)chr;
299 index = (int) _getrecord_ex(c)->bidirectional;
300 if (UCD_Check(self)) {
301 const change_record *old = get_old_record(self, c);
302 if (old->category_changed == 0)
303 index = 0; /* unassigned */
304 else if (old->bidir_changed != 0xFF)
305 index = old->bidir_changed;
306 }
307 return PyUnicode_FromString(_PyUnicode_BidirectionalNames[index]);
308 }
309
310 /*[clinic input]
311 unicodedata.UCD.combining -> int
312
313 self: self
314 chr: int(accept={str})
315 /
316
317 Returns the canonical combining class assigned to the character chr as integer.
318
319 Returns 0 if no combining class is defined.
320 [clinic start generated code]*/
321
322 static int
unicodedata_UCD_combining_impl(PyObject *self, int chr)
324 /*[clinic end generated code: output=cad056d0cb6a5920 input=9f2d6b2a95d0a22a]*/
325 {
326 int index;
327 Py_UCS4 c = (Py_UCS4)chr;
328 index = (int) _getrecord_ex(c)->combining;
329 if (UCD_Check(self)) {
330 const change_record *old = get_old_record(self, c);
331 if (old->category_changed == 0)
332 index = 0; /* unassigned */
333 }
334 return index;
335 }
336
337 /*[clinic input]
338 unicodedata.UCD.mirrored -> int
339
340 self: self
341 chr: int(accept={str})
342 /
343
344 Returns the mirrored property assigned to the character chr as integer.
345
346 Returns 1 if the character has been identified as a "mirrored"
347 character in bidirectional text, 0 otherwise.
348 [clinic start generated code]*/
349
350 static int
unicodedata_UCD_mirrored_impl(PyObject *self, int chr)
352 /*[clinic end generated code: output=2532dbf8121b50e6 input=5dd400d351ae6f3b]*/
353 {
354 int index;
355 Py_UCS4 c = (Py_UCS4)chr;
356 index = (int) _getrecord_ex(c)->mirrored;
357 if (UCD_Check(self)) {
358 const change_record *old = get_old_record(self, c);
359 if (old->category_changed == 0)
360 index = 0; /* unassigned */
361 else if (old->mirrored_changed != 0xFF)
362 index = old->mirrored_changed;
363 }
364 return index;
365 }
366
367 /*[clinic input]
368 unicodedata.UCD.east_asian_width
369
370 self: self
371 chr: int(accept={str})
372 /
373
374 Returns the east asian width assigned to the character chr as string.
375 [clinic start generated code]*/
376
377 static PyObject *
unicodedata_UCD_east_asian_width_impl(PyObject *self, int chr)
379 /*[clinic end generated code: output=484e8537d9ee8197 input=c4854798aab026e0]*/
380 {
381 int index;
382 Py_UCS4 c = (Py_UCS4)chr;
383 index = (int) _getrecord_ex(c)->east_asian_width;
384 if (UCD_Check(self)) {
385 const change_record *old = get_old_record(self, c);
386 if (old->category_changed == 0)
387 index = 0; /* unassigned */
388 else if (old->east_asian_width_changed != 0xFF)
389 index = old->east_asian_width_changed;
390 }
391 return PyUnicode_FromString(_PyUnicode_EastAsianWidthNames[index]);
392 }
393
394 /*[clinic input]
395 unicodedata.UCD.decomposition
396
397 self: self
398 chr: int(accept={str})
399 /
400
401 Returns the character decomposition mapping assigned to the character chr as string.
402
403 An empty string is returned in case no such mapping is defined.
404 [clinic start generated code]*/
405
406 static PyObject *
unicodedata_UCD_decomposition_impl(PyObject *self, int chr)
408 /*[clinic end generated code: output=7d699f3ec7565d27 input=e4c12459ad68507b]*/
409 {
410 char decomp[256];
411 int code, index, count;
412 size_t i;
413 unsigned int prefix_index;
414 Py_UCS4 c = (Py_UCS4)chr;
415
416 code = (int)c;
417
418 if (UCD_Check(self)) {
419 const change_record *old = get_old_record(self, c);
420 if (old->category_changed == 0)
421 return PyUnicode_FromString(""); /* unassigned */
422 }
423
424 if (code < 0 || code >= 0x110000)
425 index = 0;
426 else {
427 index = decomp_index1[(code>>DECOMP_SHIFT)];
428 index = decomp_index2[(index<<DECOMP_SHIFT)+
429 (code&((1<<DECOMP_SHIFT)-1))];
430 }
431
    /* high byte is number of hex bytes (usually one or two), low byte
       is prefix code (an index into decomp_prefix). */
434 count = decomp_data[index] >> 8;
435
436 /* XXX: could allocate the PyString up front instead
437 (strlen(prefix) + 5 * count + 1 bytes) */
438
439 /* Based on how index is calculated above and decomp_data is generated
440 from Tools/unicode/makeunicodedata.py, it should not be possible
441 to overflow decomp_prefix. */
442 prefix_index = decomp_data[index] & 255;
443 assert(prefix_index < Py_ARRAY_LENGTH(decomp_prefix));
444
445 /* copy prefix */
446 i = strlen(decomp_prefix[prefix_index]);
447 memcpy(decomp, decomp_prefix[prefix_index], i);
448
449 while (count-- > 0) {
450 if (i)
451 decomp[i++] = ' ';
452 assert(i < sizeof(decomp));
453 PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
454 decomp_data[++index]);
455 i += strlen(decomp + i);
456 }
457 return PyUnicode_FromStringAndSize(decomp, i);
458 }
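/* Illustrative output format (a sketch; the exact strings come from the
 * generated decomp_data/decomp_prefix tables):
 *     >>> unicodedata.decomposition('\u00e9')    # LATIN SMALL LETTER E WITH ACUTE
 *     '0065 0301'
 *     >>> unicodedata.decomposition('\u2460')    # CIRCLED DIGIT ONE
 *     '<circle> 0031'
 *     >>> unicodedata.decomposition('a')         # no decomposition defined
 *     ''
 */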
459
460 static void
get_decomp_record(PyObject *self, Py_UCS4 code,
                  int *index, int *prefix, int *count)
463 {
464 if (code >= 0x110000) {
465 *index = 0;
466 }
467 else if (UCD_Check(self)
468 && get_old_record(self, code)->category_changed==0) {
469 /* unassigned in old version */
470 *index = 0;
471 }
472 else {
473 *index = decomp_index1[(code>>DECOMP_SHIFT)];
474 *index = decomp_index2[(*index<<DECOMP_SHIFT)+
475 (code&((1<<DECOMP_SHIFT)-1))];
476 }
477
    /* high byte is number of hex bytes (usually one or two), low byte
       is prefix code (an index into decomp_prefix). */
480 *count = decomp_data[*index] >> 8;
481 *prefix = decomp_data[*index] & 255;
482
483 (*index)++;
484 }
485
486 #define SBase 0xAC00
487 #define LBase 0x1100
488 #define VBase 0x1161
489 #define TBase 0x11A7
490 #define LCount 19
491 #define VCount 21
492 #define TCount 28
493 #define NCount (VCount*TCount)
494 #define SCount (LCount*NCount)
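/* Worked example of the arithmetic Hangul decomposition used below
 * (illustrative only): for U+AC01 HANGUL SYLLABLE GAG,
 *     SIndex = 0xAC01 - SBase                = 1
 *     L = LBase + SIndex / NCount            = 0x1100  (CHOSEONG KIYEOK)
 *     V = VBase + (SIndex % NCount) / TCount = 0x1161  (JUNGSEONG A)
 *     T = TBase + SIndex % TCount            = 0x11A8  (JONGSEONG KIYEOK)
 * so NFD('\uac01') == '\u1100\u1161\u11a8'; nfc_nfkc() below reverses the
 * computation when composing. */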
495
496 static PyObject*
nfd_nfkd(PyObject *self, PyObject *input, int k)
498 {
499 PyObject *result;
500 Py_UCS4 *output;
501 Py_ssize_t i, o, osize;
502 int kind;
503 const void *data;
504 /* Longest decomposition in Unicode 3.2: U+FDFA */
505 Py_UCS4 stack[20];
506 Py_ssize_t space, isize;
507 int index, prefix, count, stackptr;
508 unsigned char prev, cur;
509
510 stackptr = 0;
511 isize = PyUnicode_GET_LENGTH(input);
512 space = isize;
513 /* Overallocate at most 10 characters. */
514 if (space > 10) {
515 if (space <= PY_SSIZE_T_MAX - 10)
516 space += 10;
517 }
518 else {
519 space *= 2;
520 }
521 osize = space;
522 output = PyMem_NEW(Py_UCS4, space);
523 if (!output) {
524 PyErr_NoMemory();
525 return NULL;
526 }
527 i = o = 0;
528 kind = PyUnicode_KIND(input);
529 data = PyUnicode_DATA(input);
530
531 while (i < isize) {
532 stack[stackptr++] = PyUnicode_READ(kind, data, i++);
533 while(stackptr) {
534 Py_UCS4 code = stack[--stackptr];
535 /* Hangul Decomposition adds three characters in
536 a single step, so we need at least that much room. */
537 if (space < 3) {
538 Py_UCS4 *new_output;
539 osize += 10;
540 space += 10;
541 new_output = PyMem_Realloc(output, osize*sizeof(Py_UCS4));
542 if (new_output == NULL) {
543 PyMem_Free(output);
544 PyErr_NoMemory();
545 return NULL;
546 }
547 output = new_output;
548 }
549 /* Hangul Decomposition. */
550 if (SBase <= code && code < (SBase+SCount)) {
551 int SIndex = code - SBase;
552 int L = LBase + SIndex / NCount;
553 int V = VBase + (SIndex % NCount) / TCount;
554 int T = TBase + SIndex % TCount;
555 output[o++] = L;
556 output[o++] = V;
557 space -= 2;
558 if (T != TBase) {
559 output[o++] = T;
560 space --;
561 }
562 continue;
563 }
564 /* normalization changes */
565 if (UCD_Check(self)) {
566 Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
567 if (value != 0) {
568 stack[stackptr++] = value;
569 continue;
570 }
571 }
572
573 /* Other decompositions. */
574 get_decomp_record(self, code, &index, &prefix, &count);
575
576 /* Copy character if it is not decomposable, or has a
577 compatibility decomposition, but we do NFD. */
578 if (!count || (prefix && !k)) {
579 output[o++] = code;
580 space--;
581 continue;
582 }
583 /* Copy decomposition onto the stack, in reverse
584 order. */
585 while(count) {
586 code = decomp_data[index + (--count)];
587 stack[stackptr++] = code;
588 }
589 }
590 }
591
592 result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
593 output, o);
594 PyMem_Free(output);
595 if (!result)
596 return NULL;
597 /* result is guaranteed to be ready, as it is compact. */
598 kind = PyUnicode_KIND(result);
599 data = PyUnicode_DATA(result);
600
601 /* Sort canonically. */
602 i = 0;
603 prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
604 for (i++; i < PyUnicode_GET_LENGTH(result); i++) {
605 cur = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
606 if (prev == 0 || cur == 0 || prev <= cur) {
607 prev = cur;
608 continue;
609 }
610 /* Non-canonical order. Need to switch *i with previous. */
611 o = i - 1;
612 while (1) {
613 Py_UCS4 tmp = PyUnicode_READ(kind, data, o+1);
614 PyUnicode_WRITE(kind, data, o+1,
615 PyUnicode_READ(kind, data, o));
616 PyUnicode_WRITE(kind, data, o, tmp);
617 o--;
618 if (o < 0)
619 break;
620 prev = _getrecord_ex(PyUnicode_READ(kind, data, o))->combining;
621 if (prev == 0 || prev <= cur)
622 break;
623 }
624 prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
625 }
626 return result;
627 }
628
629 static int
find_nfc_index(const struct reindex* nfc, Py_UCS4 code)
631 {
632 unsigned int index;
633 for (index = 0; nfc[index].start; index++) {
634 unsigned int start = nfc[index].start;
635 if (code < start)
636 return -1;
637 if (code <= start + nfc[index].count) {
638 unsigned int delta = code - start;
639 return nfc[index].index + delta;
640 }
641 }
642 return -1;
643 }
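/* Illustrative note: the two reindex tables map a code point to a dense
 * row (nfc_first) or column (nfc_last) number.  nfc_nfkc() below combines
 * them as f*TOTAL_LAST + l and looks the pair up via comp_index/comp_data,
 * which yields either the primary composite for the pair or 0 if the pair
 * does not compose canonically. */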
644
645 static PyObject*
nfc_nfkc(PyObject *self, PyObject *input, int k)
647 {
648 PyObject *result;
649 int kind;
650 const void *data;
651 Py_UCS4 *output;
652 Py_ssize_t i, i1, o, len;
653 int f,l,index,index1,comb;
654 Py_UCS4 code;
655 Py_ssize_t skipped[20];
656 int cskipped = 0;
657
658 result = nfd_nfkd(self, input, k);
659 if (!result)
660 return NULL;
661 /* result will be "ready". */
662 kind = PyUnicode_KIND(result);
663 data = PyUnicode_DATA(result);
664 len = PyUnicode_GET_LENGTH(result);
665
666 /* We allocate a buffer for the output.
667 If we find that we made no changes, we still return
668 the NFD result. */
669 output = PyMem_NEW(Py_UCS4, len);
670 if (!output) {
671 PyErr_NoMemory();
672 Py_DECREF(result);
673 return 0;
674 }
675 i = o = 0;
676
677 again:
678 while (i < len) {
679 for (index = 0; index < cskipped; index++) {
680 if (skipped[index] == i) {
681 /* *i character is skipped.
682 Remove from list. */
683 skipped[index] = skipped[cskipped-1];
684 cskipped--;
685 i++;
686 goto again; /* continue while */
687 }
688 }
689 /* Hangul Composition. We don't need to check for <LV,T>
690 pairs, since we always have decomposed data. */
691 code = PyUnicode_READ(kind, data, i);
692 if (LBase <= code && code < (LBase+LCount) &&
693 i + 1 < len &&
694 VBase <= PyUnicode_READ(kind, data, i+1) &&
695 PyUnicode_READ(kind, data, i+1) < (VBase+VCount)) {
696 /* check L character is a modern leading consonant (0x1100 ~ 0x1112)
697 and V character is a modern vowel (0x1161 ~ 0x1175). */
698 int LIndex, VIndex;
699 LIndex = code - LBase;
700 VIndex = PyUnicode_READ(kind, data, i+1) - VBase;
701 code = SBase + (LIndex*VCount+VIndex)*TCount;
702 i+=2;
703 if (i < len &&
704 TBase < PyUnicode_READ(kind, data, i) &&
705 PyUnicode_READ(kind, data, i) < (TBase+TCount)) {
706 /* check T character is a modern trailing consonant
707 (0x11A8 ~ 0x11C2). */
708 code += PyUnicode_READ(kind, data, i)-TBase;
709 i++;
710 }
711 output[o++] = code;
712 continue;
713 }
714
715 /* code is still input[i] here */
716 f = find_nfc_index(nfc_first, code);
717 if (f == -1) {
718 output[o++] = code;
719 i++;
720 continue;
721 }
722 /* Find next unblocked character. */
723 i1 = i+1;
724 comb = 0;
725 /* output base character for now; might be updated later. */
726 output[o] = PyUnicode_READ(kind, data, i);
727 while (i1 < len) {
728 Py_UCS4 code1 = PyUnicode_READ(kind, data, i1);
729 int comb1 = _getrecord_ex(code1)->combining;
730 if (comb) {
731 if (comb1 == 0)
732 break;
733 if (comb >= comb1) {
734 /* Character is blocked. */
735 i1++;
736 continue;
737 }
738 }
739 l = find_nfc_index(nfc_last, code1);
740 /* i1 cannot be combined with i. If i1
741 is a starter, we don't need to look further.
742 Otherwise, record the combining class. */
743 if (l == -1) {
744 not_combinable:
745 if (comb1 == 0)
746 break;
747 comb = comb1;
748 i1++;
749 continue;
750 }
751 index = f*TOTAL_LAST + l;
752 index1 = comp_index[index >> COMP_SHIFT];
753 code = comp_data[(index1<<COMP_SHIFT)+
754 (index&((1<<COMP_SHIFT)-1))];
755 if (code == 0)
756 goto not_combinable;
757
758 /* Replace the original character. */
759 output[o] = code;
760 /* Mark the second character unused. */
761 assert(cskipped < 20);
762 skipped[cskipped++] = i1;
763 i1++;
764 f = find_nfc_index(nfc_first, output[o]);
765 if (f == -1)
766 break;
767 }
768 /* Output character was already written.
769 Just advance the indices. */
770 o++; i++;
771 }
772 if (o == len) {
773 /* No changes. Return original string. */
774 PyMem_Free(output);
775 return result;
776 }
777 Py_DECREF(result);
778 result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
779 output, o);
780 PyMem_Free(output);
781 return result;
782 }
783
784 // This needs to match the logic in makeunicodedata.py
785 // which constructs the quickcheck data.
786 typedef enum {YES = 0, MAYBE = 1, NO = 2} QuickcheckResult;
787
788 /* Run the Unicode normalization "quickcheck" algorithm.
789 *
790 * Return YES or NO if quickcheck determines the input is certainly
791 * normalized or certainly not, and MAYBE if quickcheck is unable to
792 * tell.
793 *
794 * If `yes_only` is true, then return MAYBE as soon as we determine
795 * the answer is not YES.
796 *
797 * For background and details on the algorithm, see UAX #15:
798 * https://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms
799 */
800 static QuickcheckResult
is_normalized_quickcheck(PyObject *self, PyObject *input, bool nfc, bool k,
                         bool yes_only)
803 {
    /* If UCD 3.2.0 is requested, quickchecks must be disabled. */
805 if (UCD_Check(self)) {
806 return MAYBE;
807 }
808
809 if (PyUnicode_IS_ASCII(input)) {
810 return YES;
811 }
812
813 Py_ssize_t i, len;
814 int kind;
815 const void *data;
816 unsigned char prev_combining = 0;
817
818 /* The two quickcheck bits at this shift have type QuickcheckResult. */
819 int quickcheck_shift = (nfc ? 4 : 0) + (k ? 2 : 0);
820
821 QuickcheckResult result = YES; /* certainly normalized, unless we find something */
822
823 i = 0;
824 kind = PyUnicode_KIND(input);
825 data = PyUnicode_DATA(input);
826 len = PyUnicode_GET_LENGTH(input);
827 while (i < len) {
828 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
829 const _PyUnicode_DatabaseRecord *record = _getrecord_ex(ch);
830
831 unsigned char combining = record->combining;
832 if (combining && prev_combining > combining)
833 return NO; /* non-canonical sort order, not normalized */
834 prev_combining = combining;
835
836 unsigned char quickcheck_whole = record->normalization_quick_check;
837 if (yes_only) {
838 if (quickcheck_whole & (3 << quickcheck_shift))
839 return MAYBE;
840 } else {
841 switch ((quickcheck_whole >> quickcheck_shift) & 3) {
842 case NO:
843 return NO;
844 case MAYBE:
845 result = MAYBE; /* this string might need normalization */
846 }
847 }
848 }
849 return result;
850 }
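/* Bit-layout sketch (assumed to match the packing done by
 * makeunicodedata.py): each record's normalization_quick_check packs four
 * two-bit QuickcheckResult values at the shifts computed above:
 *     NFD -> bits 0-1, NFKD -> bits 2-3, NFC -> bits 4-5, NFKC -> bits 6-7
 * For example, U+0301 COMBINING ACUTE ACCENT has NFC_Quick_Check=Maybe, so
 * (record->normalization_quick_check >> 4) & 3 == MAYBE for that character. */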
851
852 /*[clinic input]
853 unicodedata.UCD.is_normalized
854
855 self: self
856 form: unicode
857 unistr as input: unicode
858 /
859
860 Return whether the Unicode string unistr is in the normal form 'form'.
861
862 Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.
863 [clinic start generated code]*/
864
865 static PyObject *
unicodedata_UCD_is_normalized_impl(PyObject *self, PyObject *form,
                                   PyObject *input)
868 /*[clinic end generated code: output=11e5a3694e723ca5 input=a544f14cea79e508]*/
869 {
870 if (PyUnicode_READY(input) == -1) {
871 return NULL;
872 }
873
874 if (PyUnicode_GET_LENGTH(input) == 0) {
875 /* special case empty input strings. */
876 Py_RETURN_TRUE;
877 }
878
879 PyObject *result;
880 bool nfc = false;
881 bool k = false;
882 QuickcheckResult m;
883
884 PyObject *cmp;
885 int match = 0;
886
887 if (PyUnicode_CompareWithASCIIString(form, "NFC") == 0) {
888 nfc = true;
889 }
890 else if (PyUnicode_CompareWithASCIIString(form, "NFKC") == 0) {
891 nfc = true;
892 k = true;
893 }
894 else if (PyUnicode_CompareWithASCIIString(form, "NFD") == 0) {
895 /* matches default values for `nfc` and `k` */
896 }
897 else if (PyUnicode_CompareWithASCIIString(form, "NFKD") == 0) {
898 k = true;
899 }
900 else {
901 PyErr_SetString(PyExc_ValueError, "invalid normalization form");
902 return NULL;
903 }
904
905 m = is_normalized_quickcheck(self, input, nfc, k, false);
906
907 if (m == MAYBE) {
908 cmp = (nfc ? nfc_nfkc : nfd_nfkd)(self, input, k);
909 if (cmp == NULL) {
910 return NULL;
911 }
912 match = PyUnicode_Compare(input, cmp);
913 Py_DECREF(cmp);
914 result = (match == 0) ? Py_True : Py_False;
915 }
916 else {
917 result = (m == YES) ? Py_True : Py_False;
918 }
919
920 Py_INCREF(result);
921 return result;
922 }
923
924
925 /*[clinic input]
926 unicodedata.UCD.normalize
927
928 self: self
929 form: unicode
930 unistr as input: unicode
931 /
932
933 Return the normal form 'form' for the Unicode string unistr.
934
935 Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.
936 [clinic start generated code]*/
937
938 static PyObject *
unicodedata_UCD_normalize_impl(PyObject *self, PyObject *form,
                               PyObject *input)
941 /*[clinic end generated code: output=05ca4385a2ad6983 input=3a5206c0ad2833fb]*/
942 {
943 if (PyUnicode_GET_LENGTH(input) == 0) {
944 /* Special case empty input strings, since resizing
945 them later would cause internal errors. */
946 Py_INCREF(input);
947 return input;
948 }
949
950 if (PyUnicode_CompareWithASCIIString(form, "NFC") == 0) {
951 if (is_normalized_quickcheck(self, input,
952 true, false, true) == YES) {
953 Py_INCREF(input);
954 return input;
955 }
956 return nfc_nfkc(self, input, 0);
957 }
958 if (PyUnicode_CompareWithASCIIString(form, "NFKC") == 0) {
959 if (is_normalized_quickcheck(self, input,
960 true, true, true) == YES) {
961 Py_INCREF(input);
962 return input;
963 }
964 return nfc_nfkc(self, input, 1);
965 }
966 if (PyUnicode_CompareWithASCIIString(form, "NFD") == 0) {
967 if (is_normalized_quickcheck(self, input,
968 false, false, true) == YES) {
969 Py_INCREF(input);
970 return input;
971 }
972 return nfd_nfkd(self, input, 0);
973 }
974 if (PyUnicode_CompareWithASCIIString(form, "NFKD") == 0) {
975 if (is_normalized_quickcheck(self, input,
976 false, true, true) == YES) {
977 Py_INCREF(input);
978 return input;
979 }
980 return nfd_nfkd(self, input, 1);
981 }
982 PyErr_SetString(PyExc_ValueError, "invalid normalization form");
983 return NULL;
984 }
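/* Illustrative usage from Python (values taken from the current Unicode
 * database):
 *     >>> unicodedata.normalize('NFC', 'e\u0301')
 *     '\xe9'
 *     >>> unicodedata.normalize('NFKD', '\u2460')
 *     '1'
 *     >>> unicodedata.is_normalized('NFC', '\xe9')
 *     True
 */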
985
986 /* -------------------------------------------------------------------- */
987 /* unicode character name tables */
988
989 /* data file generated by Tools/unicode/makeunicodedata.py */
990 #include "unicodename_db.h"
991
992 /* -------------------------------------------------------------------- */
993 /* database code (cut and pasted from the unidb package) */
994
995 static unsigned long
_gethash(const char *s, int len, int scale)
997 {
998 int i;
999 unsigned long h = 0;
1000 unsigned long ix;
1001 for (i = 0; i < len; i++) {
1002 h = (h * scale) + (unsigned char) Py_TOUPPER(s[i]);
1003 ix = h & 0xff000000;
1004 if (ix)
1005 h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff;
1006 }
1007 return h;
1008 }
1009
1010 static const char * const hangul_syllables[][3] = {
1011 { "G", "A", "" },
1012 { "GG", "AE", "G" },
1013 { "N", "YA", "GG" },
1014 { "D", "YAE", "GS" },
1015 { "DD", "EO", "N", },
1016 { "R", "E", "NJ" },
1017 { "M", "YEO", "NH" },
1018 { "B", "YE", "D" },
1019 { "BB", "O", "L" },
1020 { "S", "WA", "LG" },
1021 { "SS", "WAE", "LM" },
1022 { "", "OE", "LB" },
1023 { "J", "YO", "LS" },
1024 { "JJ", "U", "LT" },
1025 { "C", "WEO", "LP" },
1026 { "K", "WE", "LH" },
1027 { "T", "WI", "M" },
1028 { "P", "YU", "B" },
1029 { "H", "EU", "BS" },
1030 { 0, "YI", "S" },
1031 { 0, "I", "SS" },
1032 { 0, 0, "NG" },
1033 { 0, 0, "J" },
1034 { 0, 0, "C" },
1035 { 0, 0, "K" },
1036 { 0, 0, "T" },
1037 { 0, 0, "P" },
1038 { 0, 0, "H" }
1039 };
1040
1041 /* These ranges need to match makeunicodedata.py:cjk_ranges. */
1042 static int
is_unified_ideograph(Py_UCS4 code)
1044 {
1045 return
1046 (0x3400 <= code && code <= 0x4DBF) || /* CJK Ideograph Extension A */
1047 (0x4E00 <= code && code <= 0x9FFF) || /* CJK Ideograph */
1048 (0x20000 <= code && code <= 0x2A6DF) || /* CJK Ideograph Extension B */
1049 (0x2A700 <= code && code <= 0x2B738) || /* CJK Ideograph Extension C */
1050 (0x2B740 <= code && code <= 0x2B81D) || /* CJK Ideograph Extension D */
1051 (0x2B820 <= code && code <= 0x2CEA1) || /* CJK Ideograph Extension E */
1052 (0x2CEB0 <= code && code <= 0x2EBE0) || /* CJK Ideograph Extension F */
1053 (0x30000 <= code && code <= 0x3134A); /* CJK Ideograph Extension G */
1054 }
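/* Illustrative note: code points in these ranges get algorithmic names,
 * e.g. unicodedata.name('\u4e00') == 'CJK UNIFIED IDEOGRAPH-4E00', and
 * _getcode() below accepts the same form with four or five hex digits. */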
1055
1056 /* macros used to determine if the given code point is in the PUA range that
1057 * we are using to store aliases and named sequences */
1058 #define IS_ALIAS(cp) ((cp >= aliases_start) && (cp < aliases_end))
1059 #define IS_NAMED_SEQ(cp) ((cp >= named_sequences_start) && \
1060 (cp < named_sequences_end))
1061
1062 static int
_getucname(PyObject *self,
           Py_UCS4 code, char* buffer, int buflen, int with_alias_and_seq)
1065 {
1066 /* Find the name associated with the given code point.
1067 * If with_alias_and_seq is 1, check for names in the Private Use Area 15
1068 * that we are using for aliases and named sequences. */
1069 int offset;
1070 int i;
1071 int word;
1072 const unsigned char* w;
1073
1074 if (code >= 0x110000)
1075 return 0;
1076
1077 /* XXX should we just skip all the code points in the PUAs here? */
1078 if (!with_alias_and_seq && (IS_ALIAS(code) || IS_NAMED_SEQ(code)))
1079 return 0;
1080
1081 if (UCD_Check(self)) {
1082 /* in 3.2.0 there are no aliases and named sequences */
1083 const change_record *old;
1084 if (IS_ALIAS(code) || IS_NAMED_SEQ(code))
1085 return 0;
1086 old = get_old_record(self, code);
1087 if (old->category_changed == 0) {
1088 /* unassigned */
1089 return 0;
1090 }
1091 }
1092
1093 if (SBase <= code && code < SBase+SCount) {
1094 /* Hangul syllable. */
1095 int SIndex = code - SBase;
1096 int L = SIndex / NCount;
1097 int V = (SIndex % NCount) / TCount;
1098 int T = SIndex % TCount;
1099
1100 if (buflen < 27)
1101 /* Worst case: HANGUL SYLLABLE <10chars>. */
1102 return 0;
1103 strcpy(buffer, "HANGUL SYLLABLE ");
1104 buffer += 16;
1105 strcpy(buffer, hangul_syllables[L][0]);
1106 buffer += strlen(hangul_syllables[L][0]);
1107 strcpy(buffer, hangul_syllables[V][1]);
1108 buffer += strlen(hangul_syllables[V][1]);
1109 strcpy(buffer, hangul_syllables[T][2]);
1110 buffer += strlen(hangul_syllables[T][2]);
1111 *buffer = '\0';
1112 return 1;
1113 }
1114
1115 if (is_unified_ideograph(code)) {
1116 if (buflen < 28)
1117 /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
1118 return 0;
1119 sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
1120 return 1;
1121 }
1122
1123 /* get offset into phrasebook */
1124 offset = phrasebook_offset1[(code>>phrasebook_shift)];
1125 offset = phrasebook_offset2[(offset<<phrasebook_shift) +
1126 (code&((1<<phrasebook_shift)-1))];
1127 if (!offset)
1128 return 0;
1129
1130 i = 0;
1131
1132 for (;;) {
1133 /* get word index */
1134 word = phrasebook[offset] - phrasebook_short;
1135 if (word >= 0) {
1136 word = (word << 8) + phrasebook[offset+1];
1137 offset += 2;
1138 } else
1139 word = phrasebook[offset++];
1140 if (i) {
1141 if (i > buflen)
1142 return 0; /* buffer overflow */
1143 buffer[i++] = ' ';
1144 }
1145 /* copy word string from lexicon. the last character in the
1146 word has bit 7 set. the last word in a string ends with
1147 0x80 */
1148 w = lexicon + lexicon_offset[word];
1149 while (*w < 128) {
1150 if (i >= buflen)
1151 return 0; /* buffer overflow */
1152 buffer[i++] = *w++;
1153 }
1154 if (i >= buflen)
1155 return 0; /* buffer overflow */
1156 buffer[i++] = *w & 127;
1157 if (*w == 128)
1158 break; /* end of word */
1159 }
1160
1161 return 1;
1162 }
1163
1164 static int
capi_getucname(Py_UCS4 code,
               char* buffer, int buflen,
               int with_alias_and_seq)
1168 {
1169 return _getucname(NULL, code, buffer, buflen, with_alias_and_seq);
1170
1171 }
1172
1173 static int
_cmpname(PyObject *self, int code, const char* name, int namelen)
1175 {
1176 /* check if code corresponds to the given name */
1177 int i;
1178 char buffer[NAME_MAXLEN+1];
1179 if (!_getucname(self, code, buffer, NAME_MAXLEN, 1))
1180 return 0;
1181 for (i = 0; i < namelen; i++) {
1182 if (Py_TOUPPER(name[i]) != buffer[i])
1183 return 0;
1184 }
1185 return buffer[namelen] == '\0';
1186 }
1187
1188 static void
find_syllable(const char *str, int *len, int *pos, int count, int column)
1190 {
1191 int i, len1;
1192 *len = -1;
1193 for (i = 0; i < count; i++) {
1194 const char *s = hangul_syllables[i][column];
1195 len1 = Py_SAFE_DOWNCAST(strlen(s), size_t, int);
1196 if (len1 <= *len)
1197 continue;
1198 if (strncmp(str, s, len1) == 0) {
1199 *len = len1;
1200 *pos = i;
1201 }
1202 }
1203 if (*len == -1) {
1204 *len = 0;
1205 }
1206 }
1207
1208 static int
_check_alias_and_seq(unsigned int cp, Py_UCS4* code, int with_named_seq)
1210 {
1211 /* check if named sequences are allowed */
1212 if (!with_named_seq && IS_NAMED_SEQ(cp))
1213 return 0;
1214 /* if the code point is in the PUA range that we use for aliases,
1215 * convert it to obtain the right code point */
1216 if (IS_ALIAS(cp))
1217 *code = name_aliases[cp-aliases_start];
1218 else
1219 *code = cp;
1220 return 1;
1221 }
1222
1223 static int
_getcode(PyObject* self,
         const char* name, int namelen, Py_UCS4* code, int with_named_seq)
1226 {
1227 /* Return the code point associated with the given name.
1228 * Named aliases are resolved too (unless self != NULL (i.e. we are using
1229 * 3.2.0)). If with_named_seq is 1, returns the PUA code point that we are
1230 * using for the named sequence, and the caller must then convert it. */
1231 unsigned int h, v;
1232 unsigned int mask = code_size-1;
1233 unsigned int i, incr;
1234
1235 /* Check for hangul syllables. */
1236 if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
1237 int len, L = -1, V = -1, T = -1;
1238 const char *pos = name + 16;
1239 find_syllable(pos, &len, &L, LCount, 0);
1240 pos += len;
1241 find_syllable(pos, &len, &V, VCount, 1);
1242 pos += len;
1243 find_syllable(pos, &len, &T, TCount, 2);
1244 pos += len;
1245 if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
1246 *code = SBase + (L*VCount+V)*TCount + T;
1247 return 1;
1248 }
1249 /* Otherwise, it's an illegal syllable name. */
1250 return 0;
1251 }
1252
1253 /* Check for unified ideographs. */
1254 if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
1255 /* Four or five hexdigits must follow. */
1256 v = 0;
1257 name += 22;
1258 namelen -= 22;
1259 if (namelen != 4 && namelen != 5)
1260 return 0;
1261 while (namelen--) {
1262 v *= 16;
1263 if (*name >= '0' && *name <= '9')
1264 v += *name - '0';
1265 else if (*name >= 'A' && *name <= 'F')
1266 v += *name - 'A' + 10;
1267 else
1268 return 0;
1269 name++;
1270 }
1271 if (!is_unified_ideograph(v))
1272 return 0;
1273 *code = v;
1274 return 1;
1275 }
1276
1277 /* the following is the same as python's dictionary lookup, with
1278 only minor changes. see the makeunicodedata script for more
1279 details */
1280
1281 h = (unsigned int) _gethash(name, namelen, code_magic);
1282 i = (~h) & mask;
1283 v = code_hash[i];
1284 if (!v)
1285 return 0;
1286 if (_cmpname(self, v, name, namelen)) {
1287 return _check_alias_and_seq(v, code, with_named_seq);
1288 }
1289 incr = (h ^ (h >> 3)) & mask;
1290 if (!incr)
1291 incr = mask;
1292 for (;;) {
1293 i = (i + incr) & mask;
1294 v = code_hash[i];
1295 if (!v)
1296 return 0;
1297 if (_cmpname(self, v, name, namelen)) {
1298 return _check_alias_and_seq(v, code, with_named_seq);
1299 }
1300 incr = incr << 1;
1301 if (incr > mask)
1302 incr = incr ^ code_poly;
1303 }
1304 }
1305
1306 static int
capi_getcode(const char* name, int namelen, Py_UCS4* code,
             int with_named_seq)
1309 {
1310 return _getcode(NULL, name, namelen, code, with_named_seq);
1311
1312 }
1313
1314 static void
unicodedata_destroy_capi(PyObject *capsule)
1316 {
1317 void *capi = PyCapsule_GetPointer(capsule, PyUnicodeData_CAPSULE_NAME);
1318 PyMem_Free(capi);
1319 }
1320
1321 static PyObject *
unicodedata_create_capi(void)
1323 {
1324 _PyUnicode_Name_CAPI *capi = PyMem_Malloc(sizeof(_PyUnicode_Name_CAPI));
1325 if (capi == NULL) {
1326 PyErr_NoMemory();
1327 return NULL;
1328 }
1329 capi->getname = capi_getucname;
1330 capi->getcode = capi_getcode;
1331
1332 PyObject *capsule = PyCapsule_New(capi,
1333 PyUnicodeData_CAPSULE_NAME,
1334 unicodedata_destroy_capi);
1335 if (capsule == NULL) {
1336 PyMem_Free(capi);
1337 }
1338 return capsule;
}
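/* Hedged consumer sketch (not part of this module): other extension code
 * can import the capsule and use the two callbacks; the struct layout is
 * the one declared in pycore_ucnhash.h.
 *
 *     _PyUnicode_Name_CAPI *api = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
 *         PyUnicodeData_CAPSULE_NAME, 0);
 *     char buf[128];
 *     if (api != NULL && api->getname(0x00E9, buf, (int)sizeof(buf), 0)) {
 *         // buf now holds "LATIN SMALL LETTER E WITH ACUTE"
 *     }
 */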
1340
1341
1342 /* -------------------------------------------------------------------- */
1343 /* Python bindings */
1344
1345 /*[clinic input]
1346 unicodedata.UCD.name
1347
1348 self: self
1349 chr: int(accept={str})
1350 default: object=NULL
1351 /
1352
1353 Returns the name assigned to the character chr as a string.
1354
1355 If no name is defined, default is returned, or, if not given,
1356 ValueError is raised.
1357 [clinic start generated code]*/
1358
1359 static PyObject *
unicodedata_UCD_name_impl(PyObject *self, int chr, PyObject *default_value)
1361 /*[clinic end generated code: output=6bbb37a326407707 input=3e0367f534de56d9]*/
1362 {
1363 char name[NAME_MAXLEN+1];
1364 Py_UCS4 c = (Py_UCS4)chr;
1365
1366 if (!_getucname(self, c, name, NAME_MAXLEN, 0)) {
1367 if (default_value == NULL) {
1368 PyErr_SetString(PyExc_ValueError, "no such name");
1369 return NULL;
1370 }
1371 else {
1372 Py_INCREF(default_value);
1373 return default_value;
1374 }
1375 }
1376
1377 return PyUnicode_FromString(name);
1378 }
1379
1380 /*[clinic input]
1381 unicodedata.UCD.lookup
1382
1383 self: self
1384 name: str(accept={str, robuffer}, zeroes=True)
1385 /
1386
1387 Look up character by name.
1388
1389 If a character with the given name is found, return the
1390 corresponding character. If not found, KeyError is raised.
1391 [clinic start generated code]*/
1392
1393 static PyObject *
unicodedata_UCD_lookup_impl(PyObject *self, const char *name,
                            Py_ssize_t name_length)
1396 /*[clinic end generated code: output=7f03fc4959b242f6 input=a557be0f8607a0d6]*/
1397 {
1398 Py_UCS4 code;
1399 unsigned int index;
1400 if (name_length > NAME_MAXLEN) {
1401 PyErr_SetString(PyExc_KeyError, "name too long");
1402 return NULL;
1403 }
1404
1405 if (!_getcode(self, name, (int)name_length, &code, 1)) {
1406 PyErr_Format(PyExc_KeyError, "undefined character name '%s'", name);
1407 return NULL;
1408 }
1409 /* check if code is in the PUA range that we use for named sequences
1410 and convert it */
1411 if (IS_NAMED_SEQ(code)) {
1412 index = code-named_sequences_start;
1413 return PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND,
1414 named_sequences[index].seq,
1415 named_sequences[index].seqlen);
1416 }
1417 return PyUnicode_FromOrdinal(code);
1418 }
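/* Illustrative usage from Python:
 *     >>> unicodedata.lookup('LATIN SMALL LETTER E WITH ACUTE')
 *     '\xe9'
 *     >>> unicodedata.lookup('HANGUL SYLLABLE GAG')
 *     '\uac01'
 *     >>> unicodedata.name('\u00e9')
 *     'LATIN SMALL LETTER E WITH ACUTE'
 */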
1419
1420 // List of functions used to define module functions *AND* unicodedata.UCD
1421 // methods. For module functions, self is the module. For UCD methods, self
1422 // is an UCD instance. The UCD_Check() macro is used to check if self is
1423 // an UCD instance.
1424 static PyMethodDef unicodedata_functions[] = {
1425 UNICODEDATA_UCD_DECIMAL_METHODDEF
1426 UNICODEDATA_UCD_DIGIT_METHODDEF
1427 UNICODEDATA_UCD_NUMERIC_METHODDEF
1428 UNICODEDATA_UCD_CATEGORY_METHODDEF
1429 UNICODEDATA_UCD_BIDIRECTIONAL_METHODDEF
1430 UNICODEDATA_UCD_COMBINING_METHODDEF
1431 UNICODEDATA_UCD_MIRRORED_METHODDEF
1432 UNICODEDATA_UCD_EAST_ASIAN_WIDTH_METHODDEF
1433 UNICODEDATA_UCD_DECOMPOSITION_METHODDEF
1434 UNICODEDATA_UCD_NAME_METHODDEF
1435 UNICODEDATA_UCD_LOOKUP_METHODDEF
1436 UNICODEDATA_UCD_IS_NORMALIZED_METHODDEF
1437 UNICODEDATA_UCD_NORMALIZE_METHODDEF
1438 {NULL, NULL} /* sentinel */
1439 };
1440
1441 static int
ucd_traverse(PreviousDBVersion *self, visitproc visit, void *arg)
1443 {
1444 Py_VISIT(Py_TYPE(self));
1445 return 0;
1446 }
1447
1448 static void
ucd_dealloc(PreviousDBVersion *self)
1450 {
1451 PyTypeObject *tp = Py_TYPE(self);
1452 PyObject_GC_UnTrack(self);
1453 PyObject_GC_Del(self);
1454 Py_DECREF(tp);
1455 }
1456
1457 static PyType_Slot ucd_type_slots[] = {
1458 {Py_tp_dealloc, ucd_dealloc},
1459 {Py_tp_traverse, ucd_traverse},
1460 {Py_tp_getattro, PyObject_GenericGetAttr},
1461 {Py_tp_methods, unicodedata_functions},
1462 {Py_tp_members, DB_members},
1463 {0, 0}
1464 };
1465
1466 static PyType_Spec ucd_type_spec = {
1467 .name = "unicodedata.UCD",
1468 .basicsize = sizeof(PreviousDBVersion),
1469 .flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_DISALLOW_INSTANTIATION |
1470 Py_TPFLAGS_HAVE_GC | Py_TPFLAGS_IMMUTABLETYPE),
1471 .slots = ucd_type_slots
1472 };
1473
1474 PyDoc_STRVAR(unicodedata_docstring,
1475 "This module provides access to the Unicode Character Database which\n\
1476 defines character properties for all Unicode characters. The data in\n\
1477 this database is based on the UnicodeData.txt file version\n\
1478 " UNIDATA_VERSION " which is publicly available from ftp://ftp.unicode.org/.\n\
1479 \n\
1480 The module uses the same names and symbols as defined by the\n\
1481 UnicodeData File Format " UNIDATA_VERSION ".");
1482
1483 static int
unicodedata_exec(PyObject *module)
1485 {
1486 if (PyModule_AddStringConstant(module, "unidata_version", UNIDATA_VERSION) < 0) {
1487 return -1;
1488 }
1489
1490 PyTypeObject *ucd_type = (PyTypeObject *)PyType_FromSpec(&ucd_type_spec);
1491 if (ucd_type == NULL) {
1492 return -1;
1493 }
1494
1495 if (PyModule_AddType(module, ucd_type) < 0) {
1496 Py_DECREF(ucd_type);
1497 return -1;
1498 }
1499
1500 // Unicode database version 3.2.0 used by the IDNA encoding
1501 PyObject *v;
1502 v = new_previous_version(ucd_type, "3.2.0",
1503 get_change_3_2_0, normalization_3_2_0);
1504 Py_DECREF(ucd_type);
1505 if (v == NULL) {
1506 return -1;
1507 }
1508 if (PyModule_AddObject(module, "ucd_3_2_0", v) < 0) {
1509 Py_DECREF(v);
1510 return -1;
1511 }
1512
1513 /* Export C API */
1514 PyObject *capsule = unicodedata_create_capi();
1515 if (capsule == NULL) {
1516 return -1;
1517 }
1518 int rc = PyModule_AddObjectRef(module, "_ucnhash_CAPI", capsule);
1519 Py_DECREF(capsule);
1520 if (rc < 0) {
1521 return -1;
1522 }
1523 return 0;
1524 }
1525
1526 static PyModuleDef_Slot unicodedata_slots[] = {
1527 {Py_mod_exec, unicodedata_exec},
1528 {0, NULL}
1529 };
1530
1531 static struct PyModuleDef unicodedata_module = {
1532 PyModuleDef_HEAD_INIT,
1533 .m_name = "unicodedata",
1534 .m_doc = unicodedata_docstring,
1535 .m_size = 0,
1536 .m_methods = unicodedata_functions,
1537 .m_slots = unicodedata_slots,
1538 };
1539
1540 PyMODINIT_FUNC
PyInit_unicodedata(void)
1542 {
1543 return PyModuleDef_Init(&unicodedata_module);
1544 }
1545
1546
1547 /*
1548 Local variables:
1549 c-basic-offset: 4
1550 indent-tabs-mode: nil
1551 End:
1552 */
1553