1// Copyright 2009 The Go Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style 3// license that can be found in the LICENSE file. 4 5// Package unicode provides data and functions to test some properties of 6// Unicode code points. 7package unicode 8 9const ( 10 MaxRune = '\U0010FFFF' // Maximum valid Unicode code point. 11 ReplacementChar = '\uFFFD' // Represents invalid code points. 12 MaxASCII = '\u007F' // maximum ASCII value. 13 MaxLatin1 = '\u00FF' // maximum Latin-1 value. 14) 15 16// RangeTable defines a set of Unicode code points by listing the ranges of 17// code points within the set. The ranges are listed in two slices 18// to save space: a slice of 16-bit ranges and a slice of 32-bit ranges. 19// The two slices must be in sorted order and non-overlapping. 20// Also, R32 should contain only values >= 0x10000 (1<<16). 21type RangeTable struct { 22 R16 []Range16 23 R32 []Range32 24 LatinOffset int // number of entries in R16 with Hi <= MaxLatin1 25} 26 27// Range16 represents of a range of 16-bit Unicode code points. The range runs from Lo to Hi 28// inclusive and has the specified stride. 29type Range16 struct { 30 Lo uint16 31 Hi uint16 32 Stride uint16 33} 34 35// Range32 represents of a range of Unicode code points and is used when one or 36// more of the values will not fit in 16 bits. The range runs from Lo to Hi 37// inclusive and has the specified stride. Lo and Hi must always be >= 1<<16. 38type Range32 struct { 39 Lo uint32 40 Hi uint32 41 Stride uint32 42} 43 44// CaseRange represents a range of Unicode code points for simple (one 45// code point to one code point) case conversion. 46// The range runs from Lo to Hi inclusive, with a fixed stride of 1. Deltas 47// are the number to add to the code point to reach the code point for a 48// different case for that character. They may be negative. If zero, it 49// means the character is in the corresponding case. There is a special 50// case representing sequences of alternating corresponding Upper and Lower 51// pairs. It appears with a fixed Delta of 52// 53// {UpperLower, UpperLower, UpperLower} 54// 55// The constant UpperLower has an otherwise impossible delta value. 56type CaseRange struct { 57 Lo uint32 58 Hi uint32 59 Delta d 60} 61 62// SpecialCase represents language-specific case mappings such as Turkish. 63// Methods of SpecialCase customize (by overriding) the standard mappings. 64type SpecialCase []CaseRange 65 66// BUG(r): There is no mechanism for full case folding, that is, for 67// characters that involve multiple runes in the input or output. 68 69// Indices into the Delta arrays inside CaseRanges for case mapping. 70const ( 71 UpperCase = iota 72 LowerCase 73 TitleCase 74 MaxCase 75) 76 77type d [MaxCase]rune // to make the CaseRanges text shorter 78 79// If the Delta field of a [CaseRange] is UpperLower, it means 80// this CaseRange represents a sequence of the form (say) 81// [Upper] [Lower] [Upper] [Lower]. 82const ( 83 UpperLower = MaxRune + 1 // (Cannot be a valid delta.) 84) 85 86// linearMax is the maximum size table for linear search for non-Latin1 rune. 87// Derived by running 'go test -calibrate'. 88const linearMax = 18 89 90// is16 reports whether r is in the sorted slice of 16-bit ranges. 91func is16(ranges []Range16, r uint16) bool { 92 if len(ranges) <= linearMax || r <= MaxLatin1 { 93 for i := range ranges { 94 range_ := &ranges[i] 95 if r < range_.Lo { 96 return false 97 } 98 if r <= range_.Hi { 99 return range_.Stride == 1 || (r-range_.Lo)%range_.Stride == 0 100 } 101 } 102 return false 103 } 104 105 // binary search over ranges 106 lo := 0 107 hi := len(ranges) 108 for lo < hi { 109 m := int(uint(lo+hi) >> 1) 110 range_ := &ranges[m] 111 if range_.Lo <= r && r <= range_.Hi { 112 return range_.Stride == 1 || (r-range_.Lo)%range_.Stride == 0 113 } 114 if r < range_.Lo { 115 hi = m 116 } else { 117 lo = m + 1 118 } 119 } 120 return false 121} 122 123// is32 reports whether r is in the sorted slice of 32-bit ranges. 124func is32(ranges []Range32, r uint32) bool { 125 if len(ranges) <= linearMax { 126 for i := range ranges { 127 range_ := &ranges[i] 128 if r < range_.Lo { 129 return false 130 } 131 if r <= range_.Hi { 132 return range_.Stride == 1 || (r-range_.Lo)%range_.Stride == 0 133 } 134 } 135 return false 136 } 137 138 // binary search over ranges 139 lo := 0 140 hi := len(ranges) 141 for lo < hi { 142 m := int(uint(lo+hi) >> 1) 143 range_ := ranges[m] 144 if range_.Lo <= r && r <= range_.Hi { 145 return range_.Stride == 1 || (r-range_.Lo)%range_.Stride == 0 146 } 147 if r < range_.Lo { 148 hi = m 149 } else { 150 lo = m + 1 151 } 152 } 153 return false 154} 155 156// Is reports whether the rune is in the specified table of ranges. 157func Is(rangeTab *RangeTable, r rune) bool { 158 r16 := rangeTab.R16 159 // Compare as uint32 to correctly handle negative runes. 160 if len(r16) > 0 && uint32(r) <= uint32(r16[len(r16)-1].Hi) { 161 return is16(r16, uint16(r)) 162 } 163 r32 := rangeTab.R32 164 if len(r32) > 0 && r >= rune(r32[0].Lo) { 165 return is32(r32, uint32(r)) 166 } 167 return false 168} 169 170func isExcludingLatin(rangeTab *RangeTable, r rune) bool { 171 r16 := rangeTab.R16 172 // Compare as uint32 to correctly handle negative runes. 173 if off := rangeTab.LatinOffset; len(r16) > off && uint32(r) <= uint32(r16[len(r16)-1].Hi) { 174 return is16(r16[off:], uint16(r)) 175 } 176 r32 := rangeTab.R32 177 if len(r32) > 0 && r >= rune(r32[0].Lo) { 178 return is32(r32, uint32(r)) 179 } 180 return false 181} 182 183// IsUpper reports whether the rune is an upper case letter. 184func IsUpper(r rune) bool { 185 // See comment in IsGraphic. 186 if uint32(r) <= MaxLatin1 { 187 return properties[uint8(r)]&pLmask == pLu 188 } 189 return isExcludingLatin(Upper, r) 190} 191 192// IsLower reports whether the rune is a lower case letter. 193func IsLower(r rune) bool { 194 // See comment in IsGraphic. 195 if uint32(r) <= MaxLatin1 { 196 return properties[uint8(r)]&pLmask == pLl 197 } 198 return isExcludingLatin(Lower, r) 199} 200 201// IsTitle reports whether the rune is a title case letter. 202func IsTitle(r rune) bool { 203 if r <= MaxLatin1 { 204 return false 205 } 206 return isExcludingLatin(Title, r) 207} 208 209// to maps the rune using the specified case mapping. 210// It additionally reports whether caseRange contained a mapping for r. 211func to(_case int, r rune, caseRange []CaseRange) (mappedRune rune, foundMapping bool) { 212 if _case < 0 || MaxCase <= _case { 213 return ReplacementChar, false // as reasonable an error as any 214 } 215 // binary search over ranges 216 lo := 0 217 hi := len(caseRange) 218 for lo < hi { 219 m := int(uint(lo+hi) >> 1) 220 cr := caseRange[m] 221 if rune(cr.Lo) <= r && r <= rune(cr.Hi) { 222 delta := cr.Delta[_case] 223 if delta > MaxRune { 224 // In an Upper-Lower sequence, which always starts with 225 // an UpperCase letter, the real deltas always look like: 226 // {0, 1, 0} UpperCase (Lower is next) 227 // {-1, 0, -1} LowerCase (Upper, Title are previous) 228 // The characters at even offsets from the beginning of the 229 // sequence are upper case; the ones at odd offsets are lower. 230 // The correct mapping can be done by clearing or setting the low 231 // bit in the sequence offset. 232 // The constants UpperCase and TitleCase are even while LowerCase 233 // is odd so we take the low bit from _case. 234 return rune(cr.Lo) + ((r-rune(cr.Lo))&^1 | rune(_case&1)), true 235 } 236 return r + delta, true 237 } 238 if r < rune(cr.Lo) { 239 hi = m 240 } else { 241 lo = m + 1 242 } 243 } 244 return r, false 245} 246 247// To maps the rune to the specified case: [UpperCase], [LowerCase], or [TitleCase]. 248func To(_case int, r rune) rune { 249 r, _ = to(_case, r, CaseRanges) 250 return r 251} 252 253// ToUpper maps the rune to upper case. 254func ToUpper(r rune) rune { 255 if r <= MaxASCII { 256 if 'a' <= r && r <= 'z' { 257 r -= 'a' - 'A' 258 } 259 return r 260 } 261 return To(UpperCase, r) 262} 263 264// ToLower maps the rune to lower case. 265func ToLower(r rune) rune { 266 if r <= MaxASCII { 267 if 'A' <= r && r <= 'Z' { 268 r += 'a' - 'A' 269 } 270 return r 271 } 272 return To(LowerCase, r) 273} 274 275// ToTitle maps the rune to title case. 276func ToTitle(r rune) rune { 277 if r <= MaxASCII { 278 if 'a' <= r && r <= 'z' { // title case is upper case for ASCII 279 r -= 'a' - 'A' 280 } 281 return r 282 } 283 return To(TitleCase, r) 284} 285 286// ToUpper maps the rune to upper case giving priority to the special mapping. 287func (special SpecialCase) ToUpper(r rune) rune { 288 r1, hadMapping := to(UpperCase, r, []CaseRange(special)) 289 if r1 == r && !hadMapping { 290 r1 = ToUpper(r) 291 } 292 return r1 293} 294 295// ToTitle maps the rune to title case giving priority to the special mapping. 296func (special SpecialCase) ToTitle(r rune) rune { 297 r1, hadMapping := to(TitleCase, r, []CaseRange(special)) 298 if r1 == r && !hadMapping { 299 r1 = ToTitle(r) 300 } 301 return r1 302} 303 304// ToLower maps the rune to lower case giving priority to the special mapping. 305func (special SpecialCase) ToLower(r rune) rune { 306 r1, hadMapping := to(LowerCase, r, []CaseRange(special)) 307 if r1 == r && !hadMapping { 308 r1 = ToLower(r) 309 } 310 return r1 311} 312 313// caseOrbit is defined in tables.go as []foldPair. Right now all the 314// entries fit in uint16, so use uint16. If that changes, compilation 315// will fail (the constants in the composite literal will not fit in uint16) 316// and the types here can change to uint32. 317type foldPair struct { 318 From uint16 319 To uint16 320} 321 322// SimpleFold iterates over Unicode code points equivalent under 323// the Unicode-defined simple case folding. Among the code points 324// equivalent to rune (including rune itself), SimpleFold returns the 325// smallest rune > r if one exists, or else the smallest rune >= 0. 326// If r is not a valid Unicode code point, SimpleFold(r) returns r. 327// 328// For example: 329// 330// SimpleFold('A') = 'a' 331// SimpleFold('a') = 'A' 332// 333// SimpleFold('K') = 'k' 334// SimpleFold('k') = '\u212A' (Kelvin symbol, K) 335// SimpleFold('\u212A') = 'K' 336// 337// SimpleFold('1') = '1' 338// 339// SimpleFold(-2) = -2 340func SimpleFold(r rune) rune { 341 if r < 0 || r > MaxRune { 342 return r 343 } 344 345 if int(r) < len(asciiFold) { 346 return rune(asciiFold[r]) 347 } 348 349 // Consult caseOrbit table for special cases. 350 lo := 0 351 hi := len(caseOrbit) 352 for lo < hi { 353 m := int(uint(lo+hi) >> 1) 354 if rune(caseOrbit[m].From) < r { 355 lo = m + 1 356 } else { 357 hi = m 358 } 359 } 360 if lo < len(caseOrbit) && rune(caseOrbit[lo].From) == r { 361 return rune(caseOrbit[lo].To) 362 } 363 364 // No folding specified. This is a one- or two-element 365 // equivalence class containing rune and ToLower(rune) 366 // and ToUpper(rune) if they are different from rune. 367 if l := ToLower(r); l != r { 368 return l 369 } 370 return ToUpper(r) 371} 372