""" Test script for the unicodedata module.

    Written by Marc-Andre Lemburg ([email protected]).

    (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.

"""

import sys
import unittest
import hashlib
import subprocess
import test.test_support

# Encoding used to turn the collected unicode data into bytes for hashing.
encoding = 'utf-8'


### Run tests

class UnicodeMethodsTest(unittest.TestCase):
    """Checksum the results of the unicode string methods for U+0000..U+FFFF."""

    # update this, if the database changes
    expectedchecksum = '4504dffd035baea02c5b9de82bebc3d65e0e0baf'

    def test_method_checksum(self):
        # Feed the predicate and case-mapping results of every code point
        # below 0x10000 into a SHA-1 and compare against the known digest.
        h = hashlib.sha1()
        for i in range(0x10000):
            char = unichr(i)
            data = [
                # Predicates (single char)
                u"01"[char.isalnum()],
                u"01"[char.isalpha()],
                u"01"[char.isdecimal()],
                u"01"[char.isdigit()],
                u"01"[char.islower()],
                u"01"[char.isnumeric()],
                u"01"[char.isspace()],
                u"01"[char.istitle()],
                u"01"[char.isupper()],

                # Predicates (multiple chars)
                u"01"[(char + u'abc').isalnum()],
                u"01"[(char + u'abc').isalpha()],
                u"01"[(char + u'123').isdecimal()],
                u"01"[(char + u'123').isdigit()],
                u"01"[(char + u'abc').islower()],
                u"01"[(char + u'123').isnumeric()],
                u"01"[(char + u' \t').isspace()],
                u"01"[(char + u'abc').istitle()],
                u"01"[(char + u'ABC').isupper()],

                # Mappings (single char)
                char.lower(),
                char.upper(),
                char.title(),

                # Mappings (multiple chars)
                (char + u'abc').lower(),
                (char + u'ABC').upper(),
                (char + u'abc').title(),
                (char + u'ABC').title(),

                ]
            h.update(u''.join(data).encode(encoding))
        result = h.hexdigest()
        self.assertEqual(result, self.expectedchecksum)


class UnicodeDatabaseTest(unittest.TestCase):
    """Base class that exposes the unicodedata module as ``self.db``."""

    def setUp(self):
        # In case unicodedata is not available, this will raise an ImportError,
        # but the other test cases will still be run
        import unicodedata
        self.db = unicodedata

    def tearDown(self):
        del self.db


class UnicodeFunctionsTest(UnicodeDatabaseTest):
    """Tests for the individual lookup functions of the unicodedata module."""

    # update this, if the database changes
    expectedchecksum = '6ccf1b1a36460d2694f9b0b0f0324942fe70ede6'

    def test_function_checksum(self):
        # Feed the database properties of every code point below 0x10000
        # into a SHA-1 and compare against the known digest.
        h = hashlib.sha1()

        for i in range(0x10000):
            char = unichr(i)
            data = [
                # Properties
                str(self.db.digit(char, -1)),
                str(self.db.numeric(char, -1)),
                str(self.db.decimal(char, -1)),
                self.db.category(char),
                self.db.bidirectional(char),
                self.db.decomposition(char),
                str(self.db.mirrored(char)),
                str(self.db.combining(char)),
            ]
            h.update(''.join(data))
        result = h.hexdigest()
        self.assertEqual(result, self.expectedchecksum)

    def test_digit(self):
        self.assertEqual(self.db.digit(u'A', None), None)
        self.assertEqual(self.db.digit(u'9'), 9)
        self.assertEqual(self.db.digit(u'\u215b', None), None)
        self.assertEqual(self.db.digit(u'\u2468'), 9)
        self.assertEqual(self.db.digit(u'\U00020000', None), None)

        # No argument, more than one character, and a character without a
        # digit value (and no default) are all errors.
        self.assertRaises(TypeError, self.db.digit)
        self.assertRaises(TypeError, self.db.digit, u'xx')
        self.assertRaises(ValueError, self.db.digit, u'x')

    def test_numeric(self):
        self.assertEqual(self.db.numeric(u'A', None), None)
        self.assertEqual(self.db.numeric(u'9'), 9)
        self.assertEqual(self.db.numeric(u'\u215b'), 0.125)
        self.assertEqual(self.db.numeric(u'\u2468'), 9.0)
        self.assertEqual(self.db.numeric(u'\ua627'), 7.0)
        self.assertEqual(self.db.numeric(u'\U00020000', None), None)

        self.assertRaises(TypeError, self.db.numeric)
        self.assertRaises(TypeError, self.db.numeric, u'xx')
        self.assertRaises(ValueError, self.db.numeric, u'x')

    def test_decimal(self):
        self.assertEqual(self.db.decimal(u'A', None), None)
        self.assertEqual(self.db.decimal(u'9'), 9)
        self.assertEqual(self.db.decimal(u'\u215b', None), None)
        self.assertEqual(self.db.decimal(u'\u2468', None), None)
        self.assertEqual(self.db.decimal(u'\U00020000', None), None)

        self.assertRaises(TypeError, self.db.decimal)
        self.assertRaises(TypeError, self.db.decimal, u'xx')
        self.assertRaises(ValueError, self.db.decimal, u'x')

    def test_category(self):
        self.assertEqual(self.db.category(u'\uFFFE'), 'Cn')
        self.assertEqual(self.db.category(u'a'), 'Ll')
        self.assertEqual(self.db.category(u'A'), 'Lu')
        self.assertEqual(self.db.category(u'\U00020000'), 'Lo')

        self.assertRaises(TypeError, self.db.category)
        self.assertRaises(TypeError, self.db.category, u'xx')

    def test_bidirectional(self):
        self.assertEqual(self.db.bidirectional(u'\uFFFE'), '')
        self.assertEqual(self.db.bidirectional(u' '), 'WS')
        self.assertEqual(self.db.bidirectional(u'A'), 'L')
        self.assertEqual(self.db.bidirectional(u'\U00020000'), 'L')

        self.assertRaises(TypeError, self.db.bidirectional)
        self.assertRaises(TypeError, self.db.bidirectional, u'xx')

    def test_decomposition(self):
        self.assertEqual(self.db.decomposition(u'\uFFFE'), '')
        self.assertEqual(self.db.decomposition(u'\u00bc'),
                         '<fraction> 0031 2044 0034')

        self.assertRaises(TypeError, self.db.decomposition)
        self.assertRaises(TypeError, self.db.decomposition, u'xx')

    def test_mirrored(self):
        self.assertEqual(self.db.mirrored(u'\uFFFE'), 0)
        self.assertEqual(self.db.mirrored(u'a'), 0)
        self.assertEqual(self.db.mirrored(u'\u2201'), 1)
        self.assertEqual(self.db.mirrored(u'\U00020000'), 0)

        self.assertRaises(TypeError, self.db.mirrored)
        self.assertRaises(TypeError, self.db.mirrored, u'xx')

    def test_combining(self):
        self.assertEqual(self.db.combining(u'\uFFFE'), 0)
        self.assertEqual(self.db.combining(u'a'), 0)
        self.assertEqual(self.db.combining(u'\u20e1'), 230)
        self.assertEqual(self.db.combining(u'\U00020000'), 0)

        self.assertRaises(TypeError, self.db.combining)
        self.assertRaises(TypeError, self.db.combining, u'xx')

    def test_normalize(self):
        self.assertRaises(TypeError, self.db.normalize)
        self.assertRaises(ValueError, self.db.normalize, 'unknown', u'xx')
        self.assertEqual(self.db.normalize('NFKC', u''), u'')
        # The rest can be found in test_normalization.py
        # which requires an external file.

    def test_pr29(self):
        # http://www.unicode.org/review/pr-29.html
        # See issues #1054943 and #10254.
        # Every entry must be a unicode literal: a misplaced prefix such as
        # 'u\u0938...' is a byte string of literal backslash escapes in
        # Python 2 and would not exercise the intended text.
        composed = (u"\u0b47\u0300\u0b3e", u"\u1100\u0300\u1161",
                    u'Li\u030dt-s\u1e73\u0301',
                    u'\u092e\u093e\u0930\u094d\u0915 \u091c\u093c'
                    + u'\u0941\u0915\u0947\u0930\u092c\u0930\u094d\u0917',
                    u'\u0915\u093f\u0930\u094d\u0917\u093f\u091c\u093c'
                    + u'\u0938\u094d\u0924\u093e\u0928')
        for text in composed:
            self.assertEqual(self.db.normalize('NFC', text), text)

    def test_issue10254(self):
        # Crash reported in #10254
        a = u'C\u0338' * 20 + u'C\u0327'
        b = u'C\u0338' * 20 + u'\xC7'
        self.assertEqual(self.db.normalize('NFC', a), b)

    def test_issue29456(self):
        # Fix #29456
        # The U+1176 input is expected to come back from NFC unchanged,
        # so its "a" and "b" strings are deliberately identical.
        u1176_str_a = u'\u1100\u1176\u11a8'
        u1176_str_b = u'\u1100\u1176\u11a8'
        u11a7_str_a = u'\u1100\u1175\u11a7'
        u11a7_str_b = u'\uae30\u11a7'
        u11c3_str_a = u'\u1100\u1175\u11c3'
        u11c3_str_b = u'\uae30\u11c3'
        self.assertEqual(self.db.normalize('NFC', u1176_str_a), u1176_str_b)
        self.assertEqual(self.db.normalize('NFC', u11a7_str_a), u11a7_str_b)
        self.assertEqual(self.db.normalize('NFC', u11c3_str_a), u11c3_str_b)

    def test_east_asian_width(self):
        eaw = self.db.east_asian_width
        # A byte string, an empty string and a two-character string are
        # all rejected with TypeError.
        self.assertRaises(TypeError, eaw, 'a')
        self.assertRaises(TypeError, eaw, u'')
        self.assertRaises(TypeError, eaw, u'ra')
        self.assertEqual(eaw(u'\x1e'), 'N')
        self.assertEqual(eaw(u'\x20'), 'Na')
        self.assertEqual(eaw(u'\uC894'), 'W')
        self.assertEqual(eaw(u'\uFF66'), 'H')
        self.assertEqual(eaw(u'\uFF1F'), 'F')
        self.assertEqual(eaw(u'\u2010'), 'A')
        self.assertEqual(eaw(u'\U00020000'), 'W')


class UnicodeMiscTest(UnicodeDatabaseTest):
    """Regression tests for assorted unicodedata-related bug reports."""

    def test_failed_import_during_compiling(self):
        # Issue 4367
        # Decoding \N escapes requires the unicodedata module. If it can't be
        # imported, we shouldn't segfault.

        # This program should raise a SyntaxError in the eval.
        # Raw literals keep the backslash of \N verbatim so the child
        # interpreter is the one that decodes the escape.
        code = "import sys;" \
            "sys.modules['unicodedata'] = None;" \
            r"""eval("u'\N{SOFT HYPHEN}'")"""
        args = [sys.executable, "-c", code]
        # We use a subprocess because the unicodedata module may already have
        # been loaded in this process.
        popen = subprocess.Popen(args, stderr=subprocess.PIPE)
        # communicate() drains stderr while waiting, so the child cannot
        # block on a full pipe, and it closes the pipe afterwards.
        _, stderr = popen.communicate()
        self.assertEqual(popen.returncode, 1)
        error = r"SyntaxError: (unicode error) \N escapes not supported " \
            "(can't load unicodedata module)"
        self.assertIn(error, stderr)

    def test_decimal_numeric_consistent(self):
        # Test that decimal and numeric are consistent,
        # i.e. if a character has a decimal value,
        # its numeric value should be the same.
        count = 0
        for i in xrange(0x10000):
            c = unichr(i)
            dec = self.db.decimal(c, -1)
            if dec != -1:
                self.assertEqual(dec, self.db.numeric(c))
                count += 1
        # should have tested at least the ASCII digits
        self.assertGreaterEqual(count, 10)

    def test_digit_numeric_consistent(self):
        # Test that digit and numeric are consistent,
        # i.e. if a character has a digit value,
        # its numeric value should be the same.
        count = 0
        for i in xrange(0x10000):
            c = unichr(i)
            digit = self.db.digit(c, -1)
            if digit != -1:
                self.assertEqual(digit, self.db.numeric(c))
                count += 1
        # should have tested at least the ASCII digits
        self.assertGreaterEqual(count, 10)

    def test_bug_1704793(self):
        self.assertEqual(self.db.lookup("GOTHIC LETTER FAIHU"), u'\U00010346')

    def test_ucd_510(self):
        # In UCD 5.1.0, a mirrored property changed wrt. UCD 3.2.0
        self.assertTrue(self.db.mirrored(u"\u0f3a"))
        self.assertFalse(self.db.ucd_3_2_0.mirrored(u"\u0f3a"))
        # Also, we now have two ways of representing
        # the upper-case mapping: as delta, or as absolute value
        self.assertEqual(u"a".upper(), u'A')
        self.assertEqual(u"\u1d79".upper(), u'\ua77d')
        self.assertEqual(u".".upper(), u".")

    def test_bug_5828(self):
        self.assertEqual(u"\u1d79".lower(), u"\u1d79")
        # Only U+0000 should have U+0000 as its upper/lower/titlecase variant
        self.assertEqual(
            [
                c for c in range(sys.maxunicode+1)
                if u"\x00" in unichr(c).lower()+unichr(c).upper()+unichr(c).title()
            ],
            [0]
        )

    def test_bug_4971(self):
        # LETTER DZ WITH CARON: DZ, Dz, dz
        self.assertEqual(u"\u01c4".title(), u"\u01c5")
        self.assertEqual(u"\u01c5".title(), u"\u01c5")
        self.assertEqual(u"\u01c6".title(), u"\u01c5")

    def test_linebreak_7643(self):
        # splitlines() must break after exactly the code points listed
        # below and after no other code point under 0x10000.
        for i in range(0x10000):
            lines = (unichr(i) + u'A').splitlines()
            if i in (0x0a, 0x0b, 0x0c, 0x0d, 0x85,
                     0x1c, 0x1d, 0x1e, 0x2028, 0x2029):
                self.assertEqual(len(lines), 2,
                                 r"\u%.4x should be a linebreak" % i)
            else:
                self.assertEqual(len(lines), 1,
                                 r"\u%.4x should not be a linebreak" % i)


def test_main():
    """Run every test case class defined in this module."""
    test.test_support.run_unittest(
        UnicodeMiscTest,
        UnicodeMethodsTest,
        UnicodeFunctionsTest
    )


if __name__ == "__main__":
    test_main()