1// Copyright 2009 The Go Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style 3// license that can be found in the LICENSE file. 4 5package unicode_test 6 7import ( 8 "flag" 9 "fmt" 10 "runtime" 11 "sort" 12 "strings" 13 "testing" 14 . "unicode" 15) 16 17var upperTest = []rune{ 18 0x41, 19 0xc0, 20 0xd8, 21 0x100, 22 0x139, 23 0x14a, 24 0x178, 25 0x181, 26 0x376, 27 0x3cf, 28 0x13bd, 29 0x1f2a, 30 0x2102, 31 0x2c00, 32 0x2c10, 33 0x2c20, 34 0xa650, 35 0xa722, 36 0xff3a, 37 0x10400, 38 0x1d400, 39 0x1d7ca, 40} 41 42var notupperTest = []rune{ 43 0x40, 44 0x5b, 45 0x61, 46 0x185, 47 0x1b0, 48 0x377, 49 0x387, 50 0x2150, 51 0xab7d, 52 0xffff, 53 0x10000, 54} 55 56var letterTest = []rune{ 57 0x41, 58 0x61, 59 0xaa, 60 0xba, 61 0xc8, 62 0xdb, 63 0xf9, 64 0x2ec, 65 0x535, 66 0x620, 67 0x6e6, 68 0x93d, 69 0xa15, 70 0xb99, 71 0xdc0, 72 0xedd, 73 0x1000, 74 0x1200, 75 0x1312, 76 0x1401, 77 0x2c00, 78 0xa800, 79 0xf900, 80 0xfa30, 81 0xffda, 82 0xffdc, 83 0x10000, 84 0x10300, 85 0x10400, 86 0x20000, 87 0x2f800, 88 0x2fa1d, 89} 90 91var notletterTest = []rune{ 92 0x20, 93 0x35, 94 0x375, 95 0x619, 96 0x700, 97 0x1885, 98 0xfffe, 99 0x1ffff, 100 0x10ffff, 101} 102 103// Contains all the special cased Latin-1 chars. 104var spaceTest = []rune{ 105 0x09, 106 0x0a, 107 0x0b, 108 0x0c, 109 0x0d, 110 0x20, 111 0x85, 112 0xA0, 113 0x2000, 114 0x3000, 115} 116 117type caseT struct { 118 cas int 119 in, out rune 120} 121 122var caseTest = []caseT{ 123 // errors 124 {-1, '\n', 0xFFFD}, 125 {UpperCase, -1, -1}, 126 {UpperCase, 1 << 30, 1 << 30}, 127 128 // ASCII (special-cased so test carefully) 129 {UpperCase, '\n', '\n'}, 130 {UpperCase, 'a', 'A'}, 131 {UpperCase, 'A', 'A'}, 132 {UpperCase, '7', '7'}, 133 {LowerCase, '\n', '\n'}, 134 {LowerCase, 'a', 'a'}, 135 {LowerCase, 'A', 'a'}, 136 {LowerCase, '7', '7'}, 137 {TitleCase, '\n', '\n'}, 138 {TitleCase, 'a', 'A'}, 139 {TitleCase, 'A', 'A'}, 140 {TitleCase, '7', '7'}, 141 142 // Latin-1: easy to read the tests! 143 {UpperCase, 0x80, 0x80}, 144 {UpperCase, 'Å', 'Å'}, 145 {UpperCase, 'å', 'Å'}, 146 {LowerCase, 0x80, 0x80}, 147 {LowerCase, 'Å', 'å'}, 148 {LowerCase, 'å', 'å'}, 149 {TitleCase, 0x80, 0x80}, 150 {TitleCase, 'Å', 'Å'}, 151 {TitleCase, 'å', 'Å'}, 152 153 // 0131;LATIN SMALL LETTER DOTLESS I;Ll;0;L;;;;;N;;;0049;;0049 154 {UpperCase, 0x0131, 'I'}, 155 {LowerCase, 0x0131, 0x0131}, 156 {TitleCase, 0x0131, 'I'}, 157 158 // 0133;LATIN SMALL LIGATURE IJ;Ll;0;L;<compat> 0069 006A;;;;N;LATIN SMALL LETTER I J;;0132;;0132 159 {UpperCase, 0x0133, 0x0132}, 160 {LowerCase, 0x0133, 0x0133}, 161 {TitleCase, 0x0133, 0x0132}, 162 163 // 212A;KELVIN SIGN;Lu;0;L;004B;;;;N;DEGREES KELVIN;;;006B; 164 {UpperCase, 0x212A, 0x212A}, 165 {LowerCase, 0x212A, 'k'}, 166 {TitleCase, 0x212A, 0x212A}, 167 168 // From an UpperLower sequence 169 // A640;CYRILLIC CAPITAL LETTER ZEMLYA;Lu;0;L;;;;;N;;;;A641; 170 {UpperCase, 0xA640, 0xA640}, 171 {LowerCase, 0xA640, 0xA641}, 172 {TitleCase, 0xA640, 0xA640}, 173 // A641;CYRILLIC SMALL LETTER ZEMLYA;Ll;0;L;;;;;N;;;A640;;A640 174 {UpperCase, 0xA641, 0xA640}, 175 {LowerCase, 0xA641, 0xA641}, 176 {TitleCase, 0xA641, 0xA640}, 177 // A64E;CYRILLIC CAPITAL LETTER NEUTRAL YER;Lu;0;L;;;;;N;;;;A64F; 178 {UpperCase, 0xA64E, 0xA64E}, 179 {LowerCase, 0xA64E, 0xA64F}, 180 {TitleCase, 0xA64E, 0xA64E}, 181 // A65F;CYRILLIC SMALL LETTER YN;Ll;0;L;;;;;N;;;A65E;;A65E 182 {UpperCase, 0xA65F, 0xA65E}, 183 {LowerCase, 0xA65F, 0xA65F}, 184 {TitleCase, 0xA65F, 0xA65E}, 185 186 // From another UpperLower sequence 187 // 0139;LATIN CAPITAL LETTER L WITH ACUTE;Lu;0;L;004C 0301;;;;N;LATIN CAPITAL LETTER L ACUTE;;;013A; 188 {UpperCase, 0x0139, 0x0139}, 189 {LowerCase, 0x0139, 0x013A}, 190 {TitleCase, 0x0139, 0x0139}, 191 // 013F;LATIN CAPITAL LETTER L WITH MIDDLE DOT;Lu;0;L;<compat> 004C 00B7;;;;N;;;;0140; 192 {UpperCase, 0x013f, 0x013f}, 193 {LowerCase, 0x013f, 0x0140}, 194 {TitleCase, 0x013f, 0x013f}, 195 // 0148;LATIN SMALL LETTER N WITH CARON;Ll;0;L;006E 030C;;;;N;LATIN SMALL LETTER N HACEK;;0147;;0147 196 {UpperCase, 0x0148, 0x0147}, 197 {LowerCase, 0x0148, 0x0148}, 198 {TitleCase, 0x0148, 0x0147}, 199 200 // Lowercase lower than uppercase. 201 // AB78;CHEROKEE SMALL LETTER GE;Ll;0;L;;;;;N;;;13A8;;13A8 202 {UpperCase, 0xab78, 0x13a8}, 203 {LowerCase, 0xab78, 0xab78}, 204 {TitleCase, 0xab78, 0x13a8}, 205 {UpperCase, 0x13a8, 0x13a8}, 206 {LowerCase, 0x13a8, 0xab78}, 207 {TitleCase, 0x13a8, 0x13a8}, 208 209 // Last block in the 5.1.0 table 210 // 10400;DESERET CAPITAL LETTER LONG I;Lu;0;L;;;;;N;;;;10428; 211 {UpperCase, 0x10400, 0x10400}, 212 {LowerCase, 0x10400, 0x10428}, 213 {TitleCase, 0x10400, 0x10400}, 214 // 10427;DESERET CAPITAL LETTER EW;Lu;0;L;;;;;N;;;;1044F; 215 {UpperCase, 0x10427, 0x10427}, 216 {LowerCase, 0x10427, 0x1044F}, 217 {TitleCase, 0x10427, 0x10427}, 218 // 10428;DESERET SMALL LETTER LONG I;Ll;0;L;;;;;N;;;10400;;10400 219 {UpperCase, 0x10428, 0x10400}, 220 {LowerCase, 0x10428, 0x10428}, 221 {TitleCase, 0x10428, 0x10400}, 222 // 1044F;DESERET SMALL LETTER EW;Ll;0;L;;;;;N;;;10427;;10427 223 {UpperCase, 0x1044F, 0x10427}, 224 {LowerCase, 0x1044F, 0x1044F}, 225 {TitleCase, 0x1044F, 0x10427}, 226 227 // First one not in the 5.1.0 table 228 // 10450;SHAVIAN LETTER PEEP;Lo;0;L;;;;;N;;;;; 229 {UpperCase, 0x10450, 0x10450}, 230 {LowerCase, 0x10450, 0x10450}, 231 {TitleCase, 0x10450, 0x10450}, 232 233 // Non-letters with case. 234 {LowerCase, 0x2161, 0x2171}, 235 {UpperCase, 0x0345, 0x0399}, 236} 237 238func TestIsLetter(t *testing.T) { 239 for _, r := range upperTest { 240 if !IsLetter(r) { 241 t.Errorf("IsLetter(U+%04X) = false, want true", r) 242 } 243 } 244 for _, r := range letterTest { 245 if !IsLetter(r) { 246 t.Errorf("IsLetter(U+%04X) = false, want true", r) 247 } 248 } 249 for _, r := range notletterTest { 250 if IsLetter(r) { 251 t.Errorf("IsLetter(U+%04X) = true, want false", r) 252 } 253 } 254} 255 256func TestIsUpper(t *testing.T) { 257 for _, r := range upperTest { 258 if !IsUpper(r) { 259 t.Errorf("IsUpper(U+%04X) = false, want true", r) 260 } 261 } 262 for _, r := range notupperTest { 263 if IsUpper(r) { 264 t.Errorf("IsUpper(U+%04X) = true, want false", r) 265 } 266 } 267 for _, r := range notletterTest { 268 if IsUpper(r) { 269 t.Errorf("IsUpper(U+%04X) = true, want false", r) 270 } 271 } 272} 273 274func caseString(c int) string { 275 switch c { 276 case UpperCase: 277 return "UpperCase" 278 case LowerCase: 279 return "LowerCase" 280 case TitleCase: 281 return "TitleCase" 282 } 283 return "ErrorCase" 284} 285 286func TestTo(t *testing.T) { 287 for _, c := range caseTest { 288 r := To(c.cas, c.in) 289 if c.out != r { 290 t.Errorf("To(U+%04X, %s) = U+%04X want U+%04X", c.in, caseString(c.cas), r, c.out) 291 } 292 } 293} 294 295func TestToUpperCase(t *testing.T) { 296 for _, c := range caseTest { 297 if c.cas != UpperCase { 298 continue 299 } 300 r := ToUpper(c.in) 301 if c.out != r { 302 t.Errorf("ToUpper(U+%04X) = U+%04X want U+%04X", c.in, r, c.out) 303 } 304 } 305} 306 307func TestToLowerCase(t *testing.T) { 308 for _, c := range caseTest { 309 if c.cas != LowerCase { 310 continue 311 } 312 r := ToLower(c.in) 313 if c.out != r { 314 t.Errorf("ToLower(U+%04X) = U+%04X want U+%04X", c.in, r, c.out) 315 } 316 } 317} 318 319func TestToTitleCase(t *testing.T) { 320 for _, c := range caseTest { 321 if c.cas != TitleCase { 322 continue 323 } 324 r := ToTitle(c.in) 325 if c.out != r { 326 t.Errorf("ToTitle(U+%04X) = U+%04X want U+%04X", c.in, r, c.out) 327 } 328 } 329} 330 331func TestIsSpace(t *testing.T) { 332 for _, c := range spaceTest { 333 if !IsSpace(c) { 334 t.Errorf("IsSpace(U+%04X) = false; want true", c) 335 } 336 } 337 for _, c := range letterTest { 338 if IsSpace(c) { 339 t.Errorf("IsSpace(U+%04X) = true; want false", c) 340 } 341 } 342} 343 344// Check that the optimizations for IsLetter etc. agree with the tables. 345// We only need to check the Latin-1 range. 346func TestLetterOptimizations(t *testing.T) { 347 for i := rune(0); i <= MaxLatin1; i++ { 348 if Is(Letter, i) != IsLetter(i) { 349 t.Errorf("IsLetter(U+%04X) disagrees with Is(Letter)", i) 350 } 351 if Is(Upper, i) != IsUpper(i) { 352 t.Errorf("IsUpper(U+%04X) disagrees with Is(Upper)", i) 353 } 354 if Is(Lower, i) != IsLower(i) { 355 t.Errorf("IsLower(U+%04X) disagrees with Is(Lower)", i) 356 } 357 if Is(Title, i) != IsTitle(i) { 358 t.Errorf("IsTitle(U+%04X) disagrees with Is(Title)", i) 359 } 360 if Is(White_Space, i) != IsSpace(i) { 361 t.Errorf("IsSpace(U+%04X) disagrees with Is(White_Space)", i) 362 } 363 if To(UpperCase, i) != ToUpper(i) { 364 t.Errorf("ToUpper(U+%04X) disagrees with To(Upper)", i) 365 } 366 if To(LowerCase, i) != ToLower(i) { 367 t.Errorf("ToLower(U+%04X) disagrees with To(Lower)", i) 368 } 369 if To(TitleCase, i) != ToTitle(i) { 370 t.Errorf("ToTitle(U+%04X) disagrees with To(Title)", i) 371 } 372 } 373} 374 375func TestTurkishCase(t *testing.T) { 376 lower := []rune("abcçdefgğhıijklmnoöprsştuüvyz") 377 upper := []rune("ABCÇDEFGĞHIİJKLMNOÖPRSŞTUÜVYZ") 378 for i, l := range lower { 379 u := upper[i] 380 if TurkishCase.ToLower(l) != l { 381 t.Errorf("lower(U+%04X) is U+%04X not U+%04X", l, TurkishCase.ToLower(l), l) 382 } 383 if TurkishCase.ToUpper(u) != u { 384 t.Errorf("upper(U+%04X) is U+%04X not U+%04X", u, TurkishCase.ToUpper(u), u) 385 } 386 if TurkishCase.ToUpper(l) != u { 387 t.Errorf("upper(U+%04X) is U+%04X not U+%04X", l, TurkishCase.ToUpper(l), u) 388 } 389 if TurkishCase.ToLower(u) != l { 390 t.Errorf("lower(U+%04X) is U+%04X not U+%04X", u, TurkishCase.ToLower(l), l) 391 } 392 if TurkishCase.ToTitle(u) != u { 393 t.Errorf("title(U+%04X) is U+%04X not U+%04X", u, TurkishCase.ToTitle(u), u) 394 } 395 if TurkishCase.ToTitle(l) != u { 396 t.Errorf("title(U+%04X) is U+%04X not U+%04X", l, TurkishCase.ToTitle(l), u) 397 } 398 } 399} 400 401var simpleFoldTests = []string{ 402 // SimpleFold(x) returns the next equivalent rune > x or wraps 403 // around to smaller values. 404 405 // Easy cases. 406 "Aa", 407 "δΔ", 408 409 // ASCII special cases. 410 "KkK", 411 "Ssſ", 412 413 // Non-ASCII special cases. 414 "ρϱΡ", 415 "ͅΙιι", 416 417 // Extra special cases: has lower/upper but no case fold. 418 "İ", 419 "ı", 420 421 // Upper comes before lower (Cherokee). 422 "\u13b0\uab80", 423} 424 425func TestSimpleFold(t *testing.T) { 426 for _, tt := range simpleFoldTests { 427 cycle := []rune(tt) 428 r := cycle[len(cycle)-1] 429 for _, out := range cycle { 430 if r := SimpleFold(r); r != out { 431 t.Errorf("SimpleFold(%#U) = %#U, want %#U", r, r, out) 432 } 433 r = out 434 } 435 } 436 437 if r := SimpleFold(-42); r != -42 { 438 t.Errorf("SimpleFold(-42) = %v, want -42", r) 439 } 440} 441 442// Running 'go test -calibrate' runs the calibration to find a plausible 443// cutoff point for linear search of a range list vs. binary search. 444// We create a fake table and then time how long it takes to do a 445// sequence of searches within that table, for all possible inputs 446// relative to the ranges (something before all, in each, between each, after all). 447// This assumes that all possible runes are equally likely. 448// In practice most runes are ASCII so this is a conservative estimate 449// of an effective cutoff value. In practice we could probably set it higher 450// than what this function recommends. 451 452var calibrate = flag.Bool("calibrate", false, "compute crossover for linear vs. binary search") 453 454func TestCalibrate(t *testing.T) { 455 if !*calibrate { 456 return 457 } 458 459 if runtime.GOARCH == "amd64" { 460 fmt.Printf("warning: running calibration on %s\n", runtime.GOARCH) 461 } 462 463 // Find the point where binary search wins by more than 10%. 464 // The 10% bias gives linear search an edge when they're close, 465 // because on predominantly ASCII inputs linear search is even 466 // better than our benchmarks measure. 467 n := sort.Search(64, func(n int) bool { 468 tab := fakeTable(n) 469 blinear := func(b *testing.B) { 470 tab := tab 471 max := n*5 + 20 472 for i := 0; i < b.N; i++ { 473 for j := 0; j <= max; j++ { 474 linear(tab, uint16(j)) 475 } 476 } 477 } 478 bbinary := func(b *testing.B) { 479 tab := tab 480 max := n*5 + 20 481 for i := 0; i < b.N; i++ { 482 for j := 0; j <= max; j++ { 483 binary(tab, uint16(j)) 484 } 485 } 486 } 487 bmlinear := testing.Benchmark(blinear) 488 bmbinary := testing.Benchmark(bbinary) 489 fmt.Printf("n=%d: linear=%d binary=%d\n", n, bmlinear.NsPerOp(), bmbinary.NsPerOp()) 490 return bmlinear.NsPerOp()*100 > bmbinary.NsPerOp()*110 491 }) 492 fmt.Printf("calibration: linear cutoff = %d\n", n) 493} 494 495func fakeTable(n int) []Range16 { 496 var r16 []Range16 497 for i := 0; i < n; i++ { 498 r16 = append(r16, Range16{uint16(i*5 + 10), uint16(i*5 + 12), 1}) 499 } 500 return r16 501} 502 503func linear(ranges []Range16, r uint16) bool { 504 for i := range ranges { 505 range_ := &ranges[i] 506 if r < range_.Lo { 507 return false 508 } 509 if r <= range_.Hi { 510 return (r-range_.Lo)%range_.Stride == 0 511 } 512 } 513 return false 514} 515 516func binary(ranges []Range16, r uint16) bool { 517 // binary search over ranges 518 lo := 0 519 hi := len(ranges) 520 for lo < hi { 521 m := int(uint(lo+hi) >> 1) 522 range_ := &ranges[m] 523 if range_.Lo <= r && r <= range_.Hi { 524 return (r-range_.Lo)%range_.Stride == 0 525 } 526 if r < range_.Lo { 527 hi = m 528 } else { 529 lo = m + 1 530 } 531 } 532 return false 533} 534 535func TestLatinOffset(t *testing.T) { 536 var maps = []map[string]*RangeTable{ 537 Categories, 538 FoldCategory, 539 FoldScript, 540 Properties, 541 Scripts, 542 } 543 for _, m := range maps { 544 for name, tab := range m { 545 i := 0 546 for i < len(tab.R16) && tab.R16[i].Hi <= MaxLatin1 { 547 i++ 548 } 549 if tab.LatinOffset != i { 550 t.Errorf("%s: LatinOffset=%d, want %d", name, tab.LatinOffset, i) 551 } 552 } 553 } 554} 555 556func TestSpecialCaseNoMapping(t *testing.T) { 557 // Issue 25636 558 // no change for rune 'A', zero delta, under upper/lower/title case change. 559 var noChangeForCapitalA = CaseRange{'A', 'A', [MaxCase]rune{0, 0, 0}} 560 got := strings.ToLowerSpecial(SpecialCase([]CaseRange{noChangeForCapitalA}), "ABC") 561 want := "Abc" 562 if got != want { 563 t.Errorf("got %q; want %q", got, want) 564 } 565} 566 567func TestNegativeRune(t *testing.T) { 568 // Issue 43254 569 // These tests cover negative rune handling by testing values which, 570 // when cast to uint8 or uint16, look like a particular valid rune. 571 // This package has Latin-1-specific optimizations, so we test all of 572 // Latin-1 and representative non-Latin-1 values in the character 573 // categories covered by IsGraphic, etc. 574 nonLatin1 := []uint32{ 575 // Lu: LATIN CAPITAL LETTER A WITH MACRON 576 0x0100, 577 // Ll: LATIN SMALL LETTER A WITH MACRON 578 0x0101, 579 // Lt: LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON 580 0x01C5, 581 // M: COMBINING GRAVE ACCENT 582 0x0300, 583 // Nd: ARABIC-INDIC DIGIT ZERO 584 0x0660, 585 // P: GREEK QUESTION MARK 586 0x037E, 587 // S: MODIFIER LETTER LEFT ARROWHEAD 588 0x02C2, 589 // Z: OGHAM SPACE MARK 590 0x1680, 591 } 592 for i := 0; i < MaxLatin1+len(nonLatin1); i++ { 593 base := uint32(i) 594 if i >= MaxLatin1 { 595 base = nonLatin1[i-MaxLatin1] 596 } 597 598 // Note r is negative, but uint8(r) == uint8(base) and 599 // uint16(r) == uint16(base). 600 r := rune(base - 1<<31) 601 if Is(Letter, r) { 602 t.Errorf("Is(Letter, 0x%x - 1<<31) = true, want false", base) 603 } 604 if IsControl(r) { 605 t.Errorf("IsControl(0x%x - 1<<31) = true, want false", base) 606 } 607 if IsDigit(r) { 608 t.Errorf("IsDigit(0x%x - 1<<31) = true, want false", base) 609 } 610 if IsGraphic(r) { 611 t.Errorf("IsGraphic(0x%x - 1<<31) = true, want false", base) 612 } 613 if IsLetter(r) { 614 t.Errorf("IsLetter(0x%x - 1<<31) = true, want false", base) 615 } 616 if IsLower(r) { 617 t.Errorf("IsLower(0x%x - 1<<31) = true, want false", base) 618 } 619 if IsMark(r) { 620 t.Errorf("IsMark(0x%x - 1<<31) = true, want false", base) 621 } 622 if IsNumber(r) { 623 t.Errorf("IsNumber(0x%x - 1<<31) = true, want false", base) 624 } 625 if IsPrint(r) { 626 t.Errorf("IsPrint(0x%x - 1<<31) = true, want false", base) 627 } 628 if IsPunct(r) { 629 t.Errorf("IsPunct(0x%x - 1<<31) = true, want false", base) 630 } 631 if IsSpace(r) { 632 t.Errorf("IsSpace(0x%x - 1<<31) = true, want false", base) 633 } 634 if IsSymbol(r) { 635 t.Errorf("IsSymbol(0x%x - 1<<31) = true, want false", base) 636 } 637 if IsTitle(r) { 638 t.Errorf("IsTitle(0x%x - 1<<31) = true, want false", base) 639 } 640 if IsUpper(r) { 641 t.Errorf("IsUpper(0x%x - 1<<31) = true, want false", base) 642 } 643 } 644} 645