1""" Test script for the unicodedata module.
2
    Written by Marc-Andre Lemburg (mal@lemburg.com).
4
5    (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
6
7"""
8
9import sys
10import unittest
11import hashlib
12import subprocess
13import test.test_support
14
# Codec used to turn the joined per-character results into bytes before they
# are fed to the hash in UnicodeMethodsTest.test_method_checksum.
encoding = "utf-8"
16
17
18### Run tests
19
20class UnicodeMethodsTest(unittest.TestCase):
21
22    # update this, if the database changes
23    expectedchecksum = '4504dffd035baea02c5b9de82bebc3d65e0e0baf'
24
25    def test_method_checksum(self):
26        h = hashlib.sha1()
27        for i in range(0x10000):
28            char = unichr(i)
29            data = [
30                # Predicates (single char)
31                u"01"[char.isalnum()],
32                u"01"[char.isalpha()],
33                u"01"[char.isdecimal()],
34                u"01"[char.isdigit()],
35                u"01"[char.islower()],
36                u"01"[char.isnumeric()],
37                u"01"[char.isspace()],
38                u"01"[char.istitle()],
39                u"01"[char.isupper()],
40
41                # Predicates (multiple chars)
42                u"01"[(char + u'abc').isalnum()],
43                u"01"[(char + u'abc').isalpha()],
44                u"01"[(char + u'123').isdecimal()],
45                u"01"[(char + u'123').isdigit()],
46                u"01"[(char + u'abc').islower()],
47                u"01"[(char + u'123').isnumeric()],
48                u"01"[(char + u' \t').isspace()],
49                u"01"[(char + u'abc').istitle()],
50                u"01"[(char + u'ABC').isupper()],
51
52                # Mappings (single char)
53                char.lower(),
54                char.upper(),
55                char.title(),
56
57                # Mappings (multiple chars)
58                (char + u'abc').lower(),
59                (char + u'ABC').upper(),
60                (char + u'abc').title(),
61                (char + u'ABC').title(),
62
63                ]
64            h.update(u''.join(data).encode(encoding))
65        result = h.hexdigest()
66        self.assertEqual(result, self.expectedchecksum)
67
class UnicodeDatabaseTest(unittest.TestCase):
    # Base fixture: makes the unicodedata module available to subclasses as
    # self.db for the duration of each test.

    def setUp(self):
        # The import is done here rather than at module level: if unicodedata
        # is not available this raises an ImportError, but the other test
        # cases will still be run.
        import unicodedata as database
        self.db = database

    def tearDown(self):
        # Drop the reference installed by setUp.
        delattr(self, 'db')
78
class UnicodeFunctionsTest(UnicodeDatabaseTest):
    # Tests for the functions of the unicodedata module, reached through
    # self.db (bound by UnicodeDatabaseTest.setUp).

    # update this, if the database changes
    expectedchecksum = '6ccf1b1a36460d2694f9b0b0f0324942fe70ede6'

    def test_function_checksum(self):
        # Hash the property values of every code point below 0x10000 and
        # compare the digest with the recorded one.
        h = hashlib.sha1()

        for i in range(0x10000):
            char = unichr(i)
            data = [
                # Properties
                # -1 is passed as the default so characters without a
                # digit/numeric/decimal value do not raise ValueError.
                str(self.db.digit(char, -1)),
                str(self.db.numeric(char, -1)),
                str(self.db.decimal(char, -1)),
                self.db.category(char),
                self.db.bidirectional(char),
                self.db.decomposition(char),
                str(self.db.mirrored(char)),
                str(self.db.combining(char)),
            ]
            h.update(''.join(data))
        result = h.hexdigest()
        self.assertEqual(result, self.expectedchecksum)

    def test_digit(self):
        # With a default, characters lacking a digit value return the default.
        self.assertEqual(self.db.digit(u'A', None), None)
        self.assertEqual(self.db.digit(u'9'), 9)
        self.assertEqual(self.db.digit(u'\u215b', None), None)
        self.assertEqual(self.db.digit(u'\u2468'), 9)
        self.assertEqual(self.db.digit(u'\U00020000', None), None)

        # Missing argument, more than one character, and no value without a
        # default are all errors.
        self.assertRaises(TypeError, self.db.digit)
        self.assertRaises(TypeError, self.db.digit, u'xx')
        self.assertRaises(ValueError, self.db.digit, u'x')

    def test_numeric(self):
        self.assertEqual(self.db.numeric(u'A',None), None)
        self.assertEqual(self.db.numeric(u'9'), 9)
        self.assertEqual(self.db.numeric(u'\u215b'), 0.125)
        self.assertEqual(self.db.numeric(u'\u2468'), 9.0)
        self.assertEqual(self.db.numeric(u'\ua627'), 7.0)
        self.assertEqual(self.db.numeric(u'\U00020000', None), None)

        self.assertRaises(TypeError, self.db.numeric)
        self.assertRaises(TypeError, self.db.numeric, u'xx')
        self.assertRaises(ValueError, self.db.numeric, u'x')

    def test_decimal(self):
        self.assertEqual(self.db.decimal(u'A',None), None)
        self.assertEqual(self.db.decimal(u'9'), 9)
        self.assertEqual(self.db.decimal(u'\u215b', None), None)
        self.assertEqual(self.db.decimal(u'\u2468', None), None)
        self.assertEqual(self.db.decimal(u'\U00020000', None), None)

        self.assertRaises(TypeError, self.db.decimal)
        self.assertRaises(TypeError, self.db.decimal, u'xx')
        self.assertRaises(ValueError, self.db.decimal, u'x')

    def test_category(self):
        self.assertEqual(self.db.category(u'\uFFFE'), 'Cn')
        self.assertEqual(self.db.category(u'a'), 'Ll')
        self.assertEqual(self.db.category(u'A'), 'Lu')
        self.assertEqual(self.db.category(u'\U00020000'), 'Lo')

        self.assertRaises(TypeError, self.db.category)
        self.assertRaises(TypeError, self.db.category, u'xx')

    def test_bidirectional(self):
        self.assertEqual(self.db.bidirectional(u'\uFFFE'), '')
        self.assertEqual(self.db.bidirectional(u' '), 'WS')
        self.assertEqual(self.db.bidirectional(u'A'), 'L')
        self.assertEqual(self.db.bidirectional(u'\U00020000'), 'L')

        self.assertRaises(TypeError, self.db.bidirectional)
        self.assertRaises(TypeError, self.db.bidirectional, u'xx')

    def test_decomposition(self):
        self.assertEqual(self.db.decomposition(u'\uFFFE'),'')
        self.assertEqual(self.db.decomposition(u'\u00bc'), '<fraction> 0031 2044 0034')

        self.assertRaises(TypeError, self.db.decomposition)
        self.assertRaises(TypeError, self.db.decomposition, u'xx')

    def test_mirrored(self):
        self.assertEqual(self.db.mirrored(u'\uFFFE'), 0)
        self.assertEqual(self.db.mirrored(u'a'), 0)
        self.assertEqual(self.db.mirrored(u'\u2201'), 1)
        self.assertEqual(self.db.mirrored(u'\U00020000'), 0)

        self.assertRaises(TypeError, self.db.mirrored)
        self.assertRaises(TypeError, self.db.mirrored, u'xx')

    def test_combining(self):
        self.assertEqual(self.db.combining(u'\uFFFE'), 0)
        self.assertEqual(self.db.combining(u'a'), 0)
        self.assertEqual(self.db.combining(u'\u20e1'), 230)
        self.assertEqual(self.db.combining(u'\U00020000'), 0)

        self.assertRaises(TypeError, self.db.combining)
        self.assertRaises(TypeError, self.db.combining, u'xx')

    def test_normalize(self):
        self.assertRaises(TypeError, self.db.normalize)
        self.assertRaises(ValueError, self.db.normalize, 'unknown', u'xx')
        self.assertEqual(self.db.normalize('NFKC', u''), u'')
        # The rest can be found in test_normalization.py
        # which requires an external file.

    def test_pr29(self):
        # http://www.unicode.org/review/pr-29.html
        # See issues #1054943 and #10254.
        # Every fragment has to be a unicode literal: in a plain byte string
        # the \uXXXX sequences stay literal backslash text and the intended
        # characters would never reach normalize().
        composed = (u"\u0b47\u0300\u0b3e", u"\u1100\u0300\u1161",
                    u'Li\u030dt-s\u1e73\u0301',
                    u'\u092e\u093e\u0930\u094d\u0915 \u091c\u093c'
                    + u'\u0941\u0915\u0947\u0930\u092c\u0930\u094d\u0917',
                    u'\u0915\u093f\u0930\u094d\u0917\u093f\u091c\u093c'
                    + u'\u0938\u094d\u0924\u093e\u0928')
        # Each of these strings must come back from NFC unchanged.
        for text in composed:
            self.assertEqual(self.db.normalize('NFC', text), text)

    def test_issue10254(self):
        # Crash reported in #10254
        a = u'C\u0338' * 20  + u'C\u0327'
        b = u'C\u0338' * 20  + u'\xC7'
        self.assertEqual(self.db.normalize('NFC', a), b)

    def test_issue29456(self):
        # Fix #29456
        # The first pair expects the input back unchanged; the other two
        # expect the leading two jamo to be composed into U+AE30 while the
        # trailing character is left alone.
        u1176_str_a = u'\u1100\u1176\u11a8'
        u1176_str_b = u'\u1100\u1176\u11a8'
        u11a7_str_a = u'\u1100\u1175\u11a7'
        u11a7_str_b = u'\uae30\u11a7'
        u11c3_str_a = u'\u1100\u1175\u11c3'
        u11c3_str_b = u'\uae30\u11c3'
        self.assertEqual(self.db.normalize('NFC', u1176_str_a), u1176_str_b)
        self.assertEqual(self.db.normalize('NFC', u11a7_str_a), u11a7_str_b)
        self.assertEqual(self.db.normalize('NFC', u11c3_str_a), u11c3_str_b)


    def test_east_asian_width(self):
        eaw = self.db.east_asian_width
        # A byte string, an empty string and a two-character string are all
        # rejected.
        self.assertRaises(TypeError, eaw, 'a')
        self.assertRaises(TypeError, eaw, u'')
        self.assertRaises(TypeError, eaw, u'ra')
        self.assertEqual(eaw(u'\x1e'), 'N')
        self.assertEqual(eaw(u'\x20'), 'Na')
        self.assertEqual(eaw(u'\uC894'), 'W')
        self.assertEqual(eaw(u'\uFF66'), 'H')
        self.assertEqual(eaw(u'\uFF1F'), 'F')
        self.assertEqual(eaw(u'\u2010'), 'A')
        self.assertEqual(eaw(u'\U00020000'), 'W')
232
class UnicodeMiscTest(UnicodeDatabaseTest):
    # Assorted regression tests around unicodedata and the unicode string
    # methods backed by the same database.

    def test_failed_import_during_compiling(self):
        # Issue 4367
        # Decoding \N escapes requires the unicodedata module. If it can't be
        # imported, we shouldn't segfault.

        # This program should raise a SyntaxError in the eval.
        # The \N pieces are raw literals so the backslash is guaranteed to
        # reach the child process (and the expected message) untouched.
        code = "import sys;" \
            "sys.modules['unicodedata'] = None;" \
            r"""eval("u'\N{SOFT HYPHEN}'")"""
        args = [sys.executable, "-c", code]
        # We use a subprocess because the unicodedata module may already have
        # been loaded in this process.
        popen = subprocess.Popen(args, stderr=subprocess.PIPE)
        # communicate() drains stderr while waiting for the child and closes
        # the pipe afterwards; wait() followed by read() can deadlock if the
        # child fills the pipe buffer, and leaves the pipe open.
        _, stderr = popen.communicate()
        self.assertEqual(popen.returncode, 1)
        error = r"SyntaxError: (unicode error) \N escapes not supported " \
            "(can't load unicodedata module)"
        self.assertIn(error, stderr)

    def test_decimal_numeric_consistent(self):
        # Test that decimal and numeric are consistent,
        # i.e. if a character has a decimal value,
        # its numeric value should be the same.
        count = 0
        for i in xrange(0x10000):
            c = unichr(i)
            dec = self.db.decimal(c, -1)
            if dec != -1:
                self.assertEqual(dec, self.db.numeric(c))
                count += 1
        # should have tested at least the ASCII digits
        self.assertGreaterEqual(count, 10)

    def test_digit_numeric_consistent(self):
        # Test that digit and numeric are consistent,
        # i.e. if a character has a digit value,
        # its numeric value should be the same.
        count = 0
        for i in xrange(0x10000):
            c = unichr(i)
            digit = self.db.digit(c, -1)
            if digit != -1:
                self.assertEqual(digit, self.db.numeric(c))
                count += 1
        # should have tested at least the ASCII digits
        self.assertGreaterEqual(count, 10)

    def test_bug_1704793(self):
        self.assertEqual(self.db.lookup("GOTHIC LETTER FAIHU"), u'\U00010346')

    def test_ucd_510(self):
        # In UCD 5.1.0, a mirrored property changed wrt. UCD 3.2.0
        self.assertTrue(self.db.mirrored(u"\u0f3a"))
        self.assertFalse(self.db.ucd_3_2_0.mirrored(u"\u0f3a"))
        # Also, we now have two ways of representing
        # the upper-case mapping: as delta, or as absolute value
        self.assertEqual(u"a".upper(), u'A')
        self.assertEqual(u"\u1d79".upper(), u'\ua77d')
        self.assertEqual(u".".upper(), u".")

    def test_bug_5828(self):
        self.assertEqual(u"\u1d79".lower(), u"\u1d79")
        # Only U+0000 should have U+0000 as its upper/lower/titlecase variant
        self.assertEqual(
            [
                c for c in range(sys.maxunicode+1)
                if u"\x00" in unichr(c).lower()+unichr(c).upper()+unichr(c).title()
            ],
            [0]
        )

    def test_bug_4971(self):
        # LETTER DZ WITH CARON: DZ, Dz, dz
        self.assertEqual(u"\u01c4".title(), u"\u01c5")
        self.assertEqual(u"\u01c5".title(), u"\u01c5")
        self.assertEqual(u"\u01c6".title(), u"\u01c5")

    def test_linebreak_7643(self):
        # splitlines() must break after exactly the listed code points: a
        # break yields two lines for "<char>A", anything else yields one.
        for i in range(0x10000):
            lines = (unichr(i) + u'A').splitlines()
            if i in (0x0a, 0x0b, 0x0c, 0x0d, 0x85,
                     0x1c, 0x1d, 0x1e, 0x2028, 0x2029):
                self.assertEqual(len(lines), 2,
                                 r"\u%.4x should be a linebreak" % i)
            else:
                self.assertEqual(len(lines), 1,
                                 r"\u%.4x should not be a linebreak" % i)
321
def test_main():
    """Run the test case classes of this module through test_support."""
    # UnicodeDatabaseTest is only a fixture base class and is not listed.
    cases = (UnicodeMiscTest, UnicodeMethodsTest, UnicodeFunctionsTest)
    test.test_support.run_unittest(*cases)
328
# Run the whole suite when this file is executed directly as a script.
if __name__ == '__main__':
    test_main()
331