1// Copyright 2009 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5package unicode_test
6
7import (
8	"flag"
9	"fmt"
10	"runtime"
11	"sort"
12	"strings"
13	"testing"
14	. "unicode"
15)
16
// upperTest lists runes for which both IsUpper (see TestIsUpper) and
// IsLetter (see TestIsLetter) are expected to report true.
var upperTest = []rune{
	0x0041, 0x00C0, 0x00D8,
	0x0100, 0x0139, 0x014A, 0x0178, 0x0181,
	0x0376, 0x03CF,
	0x13BD, 0x1F2A,
	0x2102, 0x2C00, 0x2C10, 0x2C20,
	0xA650, 0xA722, 0xFF3A,
	0x10400, 0x1D400, 0x1D7CA,
}
41
// notupperTest lists runes for which IsUpper is expected to report false
// (see TestIsUpper).
var notupperTest = []rune{
	0x0040, 0x005B, 0x0061,
	0x0185, 0x01B0,
	0x0377, 0x0387,
	0x2150, 0xAB7D, 0xFFFF,
	0x10000,
}
55
// letterTest lists runes for which IsLetter is expected to report true
// (see TestIsLetter). TestIsSpace reuses it as a set of non-space runes.
var letterTest = []rune{
	0x0041, 0x0061, 0x00AA, 0x00BA, 0x00C8, 0x00DB, 0x00F9,
	0x02EC, 0x0535, 0x0620, 0x06E6, 0x093D,
	0x0A15, 0x0B99, 0x0DC0, 0x0EDD,
	0x1000, 0x1200, 0x1312, 0x1401,
	0x2C00, 0xA800, 0xF900, 0xFA30, 0xFFDA, 0xFFDC,
	0x10000, 0x10300, 0x10400,
	0x20000, 0x2F800, 0x2FA1D,
}
90
// notletterTest lists runes for which IsLetter is expected to report false
// (see TestIsLetter). TestIsUpper also requires IsUpper to reject them.
var notletterTest = []rune{
	0x0020, 0x0035,
	0x0375, 0x0619, 0x0700,
	0x1885, 0xFFFE,
	0x1FFFF, 0x10FFFF,
}
102
// spaceTest lists runes for which IsSpace is expected to report true
// (see TestIsSpace). It contains all the special cased Latin-1 chars.
var spaceTest = []rune{
	'\t', '\n', '\v', '\f', '\r', ' ',
	0x85, 0xA0,
	0x2000, 0x3000,
}
116
// caseT is a single case-mapping expectation: mapping in to the case
// selected by cas must yield out.
type caseT struct {
	cas int  // case selector handed to To (may be deliberately invalid)
	in  rune // rune to map
	out rune // expected result
}
121
122var caseTest = []caseT{
123	// errors
124	{-1, '\n', 0xFFFD},
125	{UpperCase, -1, -1},
126	{UpperCase, 1 << 30, 1 << 30},
127
128	// ASCII (special-cased so test carefully)
129	{UpperCase, '\n', '\n'},
130	{UpperCase, 'a', 'A'},
131	{UpperCase, 'A', 'A'},
132	{UpperCase, '7', '7'},
133	{LowerCase, '\n', '\n'},
134	{LowerCase, 'a', 'a'},
135	{LowerCase, 'A', 'a'},
136	{LowerCase, '7', '7'},
137	{TitleCase, '\n', '\n'},
138	{TitleCase, 'a', 'A'},
139	{TitleCase, 'A', 'A'},
140	{TitleCase, '7', '7'},
141
142	// Latin-1: easy to read the tests!
143	{UpperCase, 0x80, 0x80},
144	{UpperCase, 'Å', 'Å'},
145	{UpperCase, 'å', 'Å'},
146	{LowerCase, 0x80, 0x80},
147	{LowerCase, 'Å', 'å'},
148	{LowerCase, 'å', 'å'},
149	{TitleCase, 0x80, 0x80},
150	{TitleCase, 'Å', 'Å'},
151	{TitleCase, 'å', 'Å'},
152
153	// 0131;LATIN SMALL LETTER DOTLESS I;Ll;0;L;;;;;N;;;0049;;0049
154	{UpperCase, 0x0131, 'I'},
155	{LowerCase, 0x0131, 0x0131},
156	{TitleCase, 0x0131, 'I'},
157
158	// 0133;LATIN SMALL LIGATURE IJ;Ll;0;L;<compat> 0069 006A;;;;N;LATIN SMALL LETTER I J;;0132;;0132
159	{UpperCase, 0x0133, 0x0132},
160	{LowerCase, 0x0133, 0x0133},
161	{TitleCase, 0x0133, 0x0132},
162
163	// 212A;KELVIN SIGN;Lu;0;L;004B;;;;N;DEGREES KELVIN;;;006B;
164	{UpperCase, 0x212A, 0x212A},
165	{LowerCase, 0x212A, 'k'},
166	{TitleCase, 0x212A, 0x212A},
167
168	// From an UpperLower sequence
169	// A640;CYRILLIC CAPITAL LETTER ZEMLYA;Lu;0;L;;;;;N;;;;A641;
170	{UpperCase, 0xA640, 0xA640},
171	{LowerCase, 0xA640, 0xA641},
172	{TitleCase, 0xA640, 0xA640},
173	// A641;CYRILLIC SMALL LETTER ZEMLYA;Ll;0;L;;;;;N;;;A640;;A640
174	{UpperCase, 0xA641, 0xA640},
175	{LowerCase, 0xA641, 0xA641},
176	{TitleCase, 0xA641, 0xA640},
177	// A64E;CYRILLIC CAPITAL LETTER NEUTRAL YER;Lu;0;L;;;;;N;;;;A64F;
178	{UpperCase, 0xA64E, 0xA64E},
179	{LowerCase, 0xA64E, 0xA64F},
180	{TitleCase, 0xA64E, 0xA64E},
181	// A65F;CYRILLIC SMALL LETTER YN;Ll;0;L;;;;;N;;;A65E;;A65E
182	{UpperCase, 0xA65F, 0xA65E},
183	{LowerCase, 0xA65F, 0xA65F},
184	{TitleCase, 0xA65F, 0xA65E},
185
186	// From another UpperLower sequence
187	// 0139;LATIN CAPITAL LETTER L WITH ACUTE;Lu;0;L;004C 0301;;;;N;LATIN CAPITAL LETTER L ACUTE;;;013A;
188	{UpperCase, 0x0139, 0x0139},
189	{LowerCase, 0x0139, 0x013A},
190	{TitleCase, 0x0139, 0x0139},
191	// 013F;LATIN CAPITAL LETTER L WITH MIDDLE DOT;Lu;0;L;<compat> 004C 00B7;;;;N;;;;0140;
192	{UpperCase, 0x013f, 0x013f},
193	{LowerCase, 0x013f, 0x0140},
194	{TitleCase, 0x013f, 0x013f},
195	// 0148;LATIN SMALL LETTER N WITH CARON;Ll;0;L;006E 030C;;;;N;LATIN SMALL LETTER N HACEK;;0147;;0147
196	{UpperCase, 0x0148, 0x0147},
197	{LowerCase, 0x0148, 0x0148},
198	{TitleCase, 0x0148, 0x0147},
199
200	// Lowercase lower than uppercase.
201	// AB78;CHEROKEE SMALL LETTER GE;Ll;0;L;;;;;N;;;13A8;;13A8
202	{UpperCase, 0xab78, 0x13a8},
203	{LowerCase, 0xab78, 0xab78},
204	{TitleCase, 0xab78, 0x13a8},
205	{UpperCase, 0x13a8, 0x13a8},
206	{LowerCase, 0x13a8, 0xab78},
207	{TitleCase, 0x13a8, 0x13a8},
208
209	// Last block in the 5.1.0 table
210	// 10400;DESERET CAPITAL LETTER LONG I;Lu;0;L;;;;;N;;;;10428;
211	{UpperCase, 0x10400, 0x10400},
212	{LowerCase, 0x10400, 0x10428},
213	{TitleCase, 0x10400, 0x10400},
214	// 10427;DESERET CAPITAL LETTER EW;Lu;0;L;;;;;N;;;;1044F;
215	{UpperCase, 0x10427, 0x10427},
216	{LowerCase, 0x10427, 0x1044F},
217	{TitleCase, 0x10427, 0x10427},
218	// 10428;DESERET SMALL LETTER LONG I;Ll;0;L;;;;;N;;;10400;;10400
219	{UpperCase, 0x10428, 0x10400},
220	{LowerCase, 0x10428, 0x10428},
221	{TitleCase, 0x10428, 0x10400},
222	// 1044F;DESERET SMALL LETTER EW;Ll;0;L;;;;;N;;;10427;;10427
223	{UpperCase, 0x1044F, 0x10427},
224	{LowerCase, 0x1044F, 0x1044F},
225	{TitleCase, 0x1044F, 0x10427},
226
227	// First one not in the 5.1.0 table
228	// 10450;SHAVIAN LETTER PEEP;Lo;0;L;;;;;N;;;;;
229	{UpperCase, 0x10450, 0x10450},
230	{LowerCase, 0x10450, 0x10450},
231	{TitleCase, 0x10450, 0x10450},
232
233	// Non-letters with case.
234	{LowerCase, 0x2161, 0x2171},
235	{UpperCase, 0x0345, 0x0399},
236}
237
238func TestIsLetter(t *testing.T) {
239	for _, r := range upperTest {
240		if !IsLetter(r) {
241			t.Errorf("IsLetter(U+%04X) = false, want true", r)
242		}
243	}
244	for _, r := range letterTest {
245		if !IsLetter(r) {
246			t.Errorf("IsLetter(U+%04X) = false, want true", r)
247		}
248	}
249	for _, r := range notletterTest {
250		if IsLetter(r) {
251			t.Errorf("IsLetter(U+%04X) = true, want false", r)
252		}
253	}
254}
255
256func TestIsUpper(t *testing.T) {
257	for _, r := range upperTest {
258		if !IsUpper(r) {
259			t.Errorf("IsUpper(U+%04X) = false, want true", r)
260		}
261	}
262	for _, r := range notupperTest {
263		if IsUpper(r) {
264			t.Errorf("IsUpper(U+%04X) = true, want false", r)
265		}
266	}
267	for _, r := range notletterTest {
268		if IsUpper(r) {
269			t.Errorf("IsUpper(U+%04X) = true, want false", r)
270		}
271	}
272}
273
// caseString returns the name of the case selector c for use in failure
// messages, or "ErrorCase" when c is not UpperCase, LowerCase or TitleCase.
func caseString(c int) string {
	names := [MaxCase]string{
		UpperCase: "UpperCase",
		LowerCase: "LowerCase",
		TitleCase: "TitleCase",
	}
	if c < 0 || c >= MaxCase {
		return "ErrorCase"
	}
	return names[c]
}
285
286func TestTo(t *testing.T) {
287	for _, c := range caseTest {
288		r := To(c.cas, c.in)
289		if c.out != r {
290			t.Errorf("To(U+%04X, %s) = U+%04X want U+%04X", c.in, caseString(c.cas), r, c.out)
291		}
292	}
293}
294
295func TestToUpperCase(t *testing.T) {
296	for _, c := range caseTest {
297		if c.cas != UpperCase {
298			continue
299		}
300		r := ToUpper(c.in)
301		if c.out != r {
302			t.Errorf("ToUpper(U+%04X) = U+%04X want U+%04X", c.in, r, c.out)
303		}
304	}
305}
306
307func TestToLowerCase(t *testing.T) {
308	for _, c := range caseTest {
309		if c.cas != LowerCase {
310			continue
311		}
312		r := ToLower(c.in)
313		if c.out != r {
314			t.Errorf("ToLower(U+%04X) = U+%04X want U+%04X", c.in, r, c.out)
315		}
316	}
317}
318
319func TestToTitleCase(t *testing.T) {
320	for _, c := range caseTest {
321		if c.cas != TitleCase {
322			continue
323		}
324		r := ToTitle(c.in)
325		if c.out != r {
326			t.Errorf("ToTitle(U+%04X) = U+%04X want U+%04X", c.in, r, c.out)
327		}
328	}
329}
330
331func TestIsSpace(t *testing.T) {
332	for _, c := range spaceTest {
333		if !IsSpace(c) {
334			t.Errorf("IsSpace(U+%04X) = false; want true", c)
335		}
336	}
337	for _, c := range letterTest {
338		if IsSpace(c) {
339			t.Errorf("IsSpace(U+%04X) = true; want false", c)
340		}
341	}
342}
343
// TestLetterOptimizations verifies that the dedicated predicates and case
// conversions (IsLetter, ToUpper, ...) give the same answer as the generic
// table-driven Is and To. Only runes up to MaxLatin1 are compared.
func TestLetterOptimizations(t *testing.T) {
	for r := rune(0); r <= MaxLatin1; r++ {
		if fast, table := IsLetter(r), Is(Letter, r); fast != table {
			t.Errorf("IsLetter(U+%04X) disagrees with Is(Letter)", r)
		}
		if fast, table := IsUpper(r), Is(Upper, r); fast != table {
			t.Errorf("IsUpper(U+%04X) disagrees with Is(Upper)", r)
		}
		if fast, table := IsLower(r), Is(Lower, r); fast != table {
			t.Errorf("IsLower(U+%04X) disagrees with Is(Lower)", r)
		}
		if fast, table := IsTitle(r), Is(Title, r); fast != table {
			t.Errorf("IsTitle(U+%04X) disagrees with Is(Title)", r)
		}
		if fast, table := IsSpace(r), Is(White_Space, r); fast != table {
			t.Errorf("IsSpace(U+%04X) disagrees with Is(White_Space)", r)
		}
		if fast, generic := ToUpper(r), To(UpperCase, r); fast != generic {
			t.Errorf("ToUpper(U+%04X) disagrees with To(Upper)", r)
		}
		if fast, generic := ToLower(r), To(LowerCase, r); fast != generic {
			t.Errorf("ToLower(U+%04X) disagrees with To(Lower)", r)
		}
		if fast, generic := ToTitle(r), To(TitleCase, r); fast != generic {
			t.Errorf("ToTitle(U+%04X) disagrees with To(Title)", r)
		}
	}
}
374
// TestTurkishCase checks the TurkishCase special mapping over the Turkish
// alphabet: lower and upper hold the same letters in the same order, so
// lower[i] and upper[i] must map to each other, and each must be a fixed
// point of its own case conversion.
func TestTurkishCase(t *testing.T) {
	lower := []rune("abcçdefgğhıijklmnoöprsştuüvyz")
	upper := []rune("ABCÇDEFGĞHIİJKLMNOÖPRSŞTUÜVYZ")
	for i, l := range lower {
		u := upper[i]
		// Each result is captured once so the value reported on failure is
		// exactly the value that was compared.
		if got := TurkishCase.ToLower(l); got != l {
			t.Errorf("lower(U+%04X) is U+%04X not U+%04X", l, got, l)
		}
		if got := TurkishCase.ToUpper(u); got != u {
			t.Errorf("upper(U+%04X) is U+%04X not U+%04X", u, got, u)
		}
		if got := TurkishCase.ToUpper(l); got != u {
			t.Errorf("upper(U+%04X) is U+%04X not U+%04X", l, got, u)
		}
		if got := TurkishCase.ToLower(u); got != l {
			t.Errorf("lower(U+%04X) is U+%04X not U+%04X", u, got, l)
		}
		if got := TurkishCase.ToTitle(u); got != u {
			t.Errorf("title(U+%04X) is U+%04X not U+%04X", u, got, u)
		}
		if got := TurkishCase.ToTitle(l); got != u {
			t.Errorf("title(U+%04X) is U+%04X not U+%04X", l, got, u)
		}
	}
}
400
// simpleFoldTests lists SimpleFold orbits for TestSimpleFold.
//
// SimpleFold(x) returns the next equivalent rune > x or wraps around to
// smaller values, so within each string SimpleFold of a rune must yield the
// rune that follows it, and SimpleFold of the last rune must yield the first.
//
// Non-ASCII runes are written as \u escapes with the glyphs in a trailing
// comment: several of them (KELVIN SIGN U+212A, GREEK PROSGEGRAMMENI U+1FBE)
// render identically to another rune in the same orbit and would be silently
// replaced by that rune if the file were ever Unicode-normalized.
var simpleFoldTests = []string{
	// Easy cases.
	"Aa",
	"\u03b4\u0394", // δ Δ

	// ASCII special cases.
	"Kk\u212a", // K k KELVIN SIGN
	"Ss\u017f", // S s LATIN SMALL LETTER LONG S

	// Non-ASCII special cases.
	"\u03c1\u03f1\u03a1",       // ρ ϱ Ρ
	"\u0345\u0399\u03b9\u1fbe", // COMBINING GREEK YPOGEGRAMMENI, Ι, ι, GREEK PROSGEGRAMMENI

	// Extra special cases: has lower/upper but no case fold.
	"\u0130", // İ
	"\u0131", // ı

	// Upper comes before lower (Cherokee).
	"\u13b0\uab80",
}
424
425func TestSimpleFold(t *testing.T) {
426	for _, tt := range simpleFoldTests {
427		cycle := []rune(tt)
428		r := cycle[len(cycle)-1]
429		for _, out := range cycle {
430			if r := SimpleFold(r); r != out {
431				t.Errorf("SimpleFold(%#U) = %#U, want %#U", r, r, out)
432			}
433			r = out
434		}
435	}
436
437	if r := SimpleFold(-42); r != -42 {
438		t.Errorf("SimpleFold(-42) = %v, want -42", r)
439	}
440}
441
442// Running 'go test -calibrate' runs the calibration to find a plausible
443// cutoff point for linear search of a range list vs. binary search.
444// We create a fake table and then time how long it takes to do a
445// sequence of searches within that table, for all possible inputs
446// relative to the ranges (something before all, in each, between each, after all).
447// This assumes that all possible runes are equally likely.
448// In practice most runes are ASCII so this is a conservative estimate
449// of an effective cutoff value. In practice we could probably set it higher
450// than what this function recommends.
451
452var calibrate = flag.Bool("calibrate", false, "compute crossover for linear vs. binary search")
453
454func TestCalibrate(t *testing.T) {
455	if !*calibrate {
456		return
457	}
458
459	if runtime.GOARCH == "amd64" {
460		fmt.Printf("warning: running calibration on %s\n", runtime.GOARCH)
461	}
462
463	// Find the point where binary search wins by more than 10%.
464	// The 10% bias gives linear search an edge when they're close,
465	// because on predominantly ASCII inputs linear search is even
466	// better than our benchmarks measure.
467	n := sort.Search(64, func(n int) bool {
468		tab := fakeTable(n)
469		blinear := func(b *testing.B) {
470			tab := tab
471			max := n*5 + 20
472			for i := 0; i < b.N; i++ {
473				for j := 0; j <= max; j++ {
474					linear(tab, uint16(j))
475				}
476			}
477		}
478		bbinary := func(b *testing.B) {
479			tab := tab
480			max := n*5 + 20
481			for i := 0; i < b.N; i++ {
482				for j := 0; j <= max; j++ {
483					binary(tab, uint16(j))
484				}
485			}
486		}
487		bmlinear := testing.Benchmark(blinear)
488		bmbinary := testing.Benchmark(bbinary)
489		fmt.Printf("n=%d: linear=%d binary=%d\n", n, bmlinear.NsPerOp(), bmbinary.NsPerOp())
490		return bmlinear.NsPerOp()*100 > bmbinary.NsPerOp()*110
491	})
492	fmt.Printf("calibration: linear cutoff = %d\n", n)
493}
494
// fakeTable builds the synthetic table used by TestCalibrate: n ranges with
// stride 1, the i'th covering [5*i+10, 5*i+12]. It returns nil when n <= 0.
func fakeTable(n int) []Range16 {
	var table []Range16
	for lo := 10; len(table) < n; lo += 5 {
		table = append(table, Range16{Lo: uint16(lo), Hi: uint16(lo + 2), Stride: 1})
	}
	return table
}
502
// linear reports whether r is in ranges, scanning from the front. It stops at
// the first range that starts above r, so ranges must be sorted by Lo. Within
// a matching range, r counts as present only if it lies on the range's stride.
func linear(ranges []Range16, r uint16) bool {
	for i := range ranges {
		switch rg := &ranges[i]; {
		case r < rg.Lo:
			return false
		case r <= rg.Hi:
			return (r-rg.Lo)%rg.Stride == 0
		}
	}
	return false
}
515
// binary reports whether r is in ranges using binary search, so ranges must
// be sorted. Within a matching range, r counts as present only if it lies on
// the range's stride.
func binary(ranges []Range16, r uint16) bool {
	lo, hi := 0, len(ranges)
	for lo < hi {
		mid := int(uint(lo+hi) >> 1)
		switch rg := &ranges[mid]; {
		case r < rg.Lo:
			// r can only be in the lower half.
			hi = mid
		case r > rg.Hi:
			// r can only be in the upper half.
			lo = mid + 1
		default:
			// rg.Lo <= r <= rg.Hi
			return (r-rg.Lo)%rg.Stride == 0
		}
	}
	return false
}
534
// TestLatinOffset checks, for every table in Categories, FoldCategory,
// FoldScript, Properties and Scripts, that LatinOffset equals the number of
// leading R16 entries whose Hi does not exceed MaxLatin1.
func TestLatinOffset(t *testing.T) {
	tableSets := []map[string]*RangeTable{
		Categories,
		FoldCategory,
		FoldScript,
		Properties,
		Scripts,
	}
	for _, set := range tableSets {
		for name, tab := range set {
			want := 0
			for _, r16 := range tab.R16 {
				if r16.Hi > MaxLatin1 {
					break
				}
				want++
			}
			if got := tab.LatinOffset; got != want {
				t.Errorf("%s: LatinOffset=%d, want %d", name, got, want)
			}
		}
	}
}
555
// TestSpecialCaseNoMapping is the regression test for issue 25636: a
// SpecialCase entry with an all-zero delta must leave its rune unchanged
// instead of falling through to the default case mapping.
func TestSpecialCaseNoMapping(t *testing.T) {
	// 'A' maps to itself for upper, lower and title case.
	identityA := CaseRange{Lo: 'A', Hi: 'A', Delta: [MaxCase]rune{0, 0, 0}}
	special := SpecialCase{identityA}

	const want = "Abc"
	if got := strings.ToLowerSpecial(special, "ABC"); got != want {
		t.Errorf("got %q; want %q", got, want)
	}
}
566
// TestNegativeRune is the regression test for issue 43254: every
// classification function must report false for a negative rune, even one
// whose low 8 or 16 bits look like a valid rune.
//
// This package has Latin-1-specific optimizations, so the test covers all of
// Latin-1 (0 through MaxLatin1 inclusive) plus representative non-Latin-1
// values in the character categories covered by IsGraphic, etc.
func TestNegativeRune(t *testing.T) {
	nonLatin1 := []uint32{
		// Lu: LATIN CAPITAL LETTER A WITH MACRON
		0x0100,
		// Ll: LATIN SMALL LETTER A WITH MACRON
		0x0101,
		// Lt: LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON
		0x01C5,
		// M: COMBINING GRAVE ACCENT
		0x0300,
		// Nd: ARABIC-INDIC DIGIT ZERO
		0x0660,
		// P: GREEK QUESTION MARK
		0x037E,
		// S: MODIFIER LETTER LEFT ARROWHEAD
		0x02C2,
		// Z: OGHAM SPACE MARK
		0x1680,
	}

	// Each predicate is paired with the text of its call up to the argument,
	// so one format string reproduces a distinct message per function.
	checks := []struct {
		call string
		fn   func(rune) bool
	}{
		{"Is(Letter, ", func(r rune) bool { return Is(Letter, r) }},
		{"IsControl(", IsControl},
		{"IsDigit(", IsDigit},
		{"IsGraphic(", IsGraphic},
		{"IsLetter(", IsLetter},
		{"IsLower(", IsLower},
		{"IsMark(", IsMark},
		{"IsNumber(", IsNumber},
		{"IsPrint(", IsPrint},
		{"IsPunct(", IsPunct},
		{"IsSpace(", IsSpace},
		{"IsSymbol(", IsSymbol},
		{"IsTitle(", IsTitle},
		{"IsUpper(", IsUpper},
	}

	check := func(base uint32) {
		// Note r is negative, but uint8(r) == uint8(base) and
		// uint16(r) == uint16(base).
		r := rune(base - 1<<31)
		for _, c := range checks {
			if c.fn(r) {
				t.Errorf("%s0x%x - 1<<31) = true, want false", c.call, base)
			}
		}
	}

	// The bound is inclusive: MaxLatin1 (0xFF) is itself a Latin-1 rune and
	// must be exercised too.
	for base := uint32(0); base <= MaxLatin1; base++ {
		check(base)
	}
	for _, base := range nonLatin1 {
		check(base)
	}
}
645