1 // Scintilla source code edit control 2 // Encoding: UTF-8 3 /** @file CaseConvert.cxx 4 ** Case fold characters and convert them to upper or lower case. 5 ** Tables automatically regenerated by scripts/GenerateCaseConvert.py 6 ** Should only be rarely regenerated for new versions of Unicode. 7 **/ 8 // Copyright 2013 by Neil Hodgson <[email protected]> 9 // The License.txt file describes the conditions under which this software may be distributed. 10 11 #include <cassert> 12 #include <cstring> 13 14 #include <stdexcept> 15 #include <string> 16 #include <string_view> 17 #include <vector> 18 #include <algorithm> 19 20 #include "CaseConvert.h" 21 #include "UniConversion.h" 22 23 using namespace Scintilla; 24 25 namespace { 26 // Use an unnamed namespace to protect the declarations from name conflicts 27 28 // Unicode code points are ordered by groups and follow patterns. 29 // Most characters (pitch==1) are in ranges for a particular alphabet and their 30 // upper case forms are a fixed distance away. 31 // Another pattern (pitch==2) is where each lower case letter is preceded by 32 // the upper case form. These are also grouped into ranges. 

// Table of ranges of symmetric case conversions, four values per row:
//   lower  - first lower case code point of the range
//   upper  - first upper case code point of the range
//   length - number of lower/upper pairs in the range
//   pitch  - distance between successive code points of the same case
// Pair k (0 <= k < length) is (lower + k*pitch, upper + k*pitch).
// The table is only ever read, so it is declared const.
const int symmetricCaseConversionRanges[] = {
//lower, upper, range length, range pitch
//++Autogenerated -- start of section automatically generated
//**\(\*\n\)
97,65,26,1,
224,192,23,1,
248,216,7,1,
257,256,24,2,
314,313,8,2,
331,330,23,2,
462,461,8,2,
479,478,9,2,
505,504,20,2,
547,546,9,2,
583,582,5,2,
945,913,17,1,
963,931,9,1,
985,984,12,2,
1072,1040,32,1,
1104,1024,16,1,
1121,1120,17,2,
1163,1162,27,2,
1218,1217,7,2,
1233,1232,48,2,
1377,1329,38,1,
4304,7312,43,1,
7681,7680,75,2,
7841,7840,48,2,
7936,7944,8,1,
7952,7960,6,1,
7968,7976,8,1,
7984,7992,8,1,
8000,8008,6,1,
8032,8040,8,1,
8560,8544,16,1,
9424,9398,26,1,
11312,11264,47,1,
11393,11392,50,2,
11520,4256,38,1,
42561,42560,23,2,
42625,42624,14,2,
42787,42786,7,2,
42803,42802,31,2,
42879,42878,5,2,
42903,42902,10,2,
42933,42932,6,2,
65345,65313,26,1,
66600,66560,40,1,
66776,66736,36,1,
68800,68736,51,1,
71872,71840,32,1,
93792,93760,32,1,
125218,125184,34,1,

//--Autogenerated -- end of section automatically generated
};

// Code points that are symmetric but don't fit into a range of similar characters
// are listed here.
93 94 int symmetricCaseConversions[] = { 95 //lower, upper 96 //++Autogenerated -- start of section automatically generated 97 //**1 \(\*\n\) 98 255,376, 99 307,306, 100 309,308, 101 311,310, 102 378,377, 103 380,379, 104 382,381, 105 384,579, 106 387,386, 107 389,388, 108 392,391, 109 396,395, 110 402,401, 111 405,502, 112 409,408, 113 410,573, 114 414,544, 115 417,416, 116 419,418, 117 421,420, 118 424,423, 119 429,428, 120 432,431, 121 436,435, 122 438,437, 123 441,440, 124 445,444, 125 447,503, 126 454,452, 127 457,455, 128 460,458, 129 477,398, 130 499,497, 131 501,500, 132 572,571, 133 575,11390, 134 576,11391, 135 578,577, 136 592,11375, 137 593,11373, 138 594,11376, 139 595,385, 140 596,390, 141 598,393, 142 599,394, 143 601,399, 144 603,400, 145 604,42923, 146 608,403, 147 609,42924, 148 611,404, 149 613,42893, 150 614,42922, 151 616,407, 152 617,406, 153 618,42926, 154 619,11362, 155 620,42925, 156 623,412, 157 625,11374, 158 626,413, 159 629,415, 160 637,11364, 161 640,422, 162 642,42949, 163 643,425, 164 647,42929, 165 648,430, 166 649,580, 167 650,433, 168 651,434, 169 652,581, 170 658,439, 171 669,42930, 172 670,42928, 173 881,880, 174 883,882, 175 887,886, 176 891,1021, 177 892,1022, 178 893,1023, 179 940,902, 180 941,904, 181 942,905, 182 943,906, 183 972,908, 184 973,910, 185 974,911, 186 983,975, 187 1010,1017, 188 1011,895, 189 1016,1015, 190 1019,1018, 191 1231,1216, 192 4349,7357, 193 4350,7358, 194 4351,7359, 195 7545,42877, 196 7549,11363, 197 7566,42950, 198 8017,8025, 199 8019,8027, 200 8021,8029, 201 8023,8031, 202 8048,8122, 203 8049,8123, 204 8050,8136, 205 8051,8137, 206 8052,8138, 207 8053,8139, 208 8054,8154, 209 8055,8155, 210 8056,8184, 211 8057,8185, 212 8058,8170, 213 8059,8171, 214 8060,8186, 215 8061,8187, 216 8112,8120, 217 8113,8121, 218 8144,8152, 219 8145,8153, 220 8160,8168, 221 8161,8169, 222 8165,8172, 223 8526,8498, 224 8580,8579, 225 11361,11360, 226 11365,570, 227 11366,574, 228 11368,11367, 229 11370,11369, 230 
11372,11371, 231 11379,11378, 232 11382,11381, 233 11500,11499, 234 11502,11501, 235 11507,11506, 236 11559,4295, 237 11565,4301, 238 42874,42873, 239 42876,42875, 240 42892,42891, 241 42897,42896, 242 42899,42898, 243 42900,42948, 244 42947,42946, 245 43859,42931, 246 247 //--Autogenerated -- end of section automatically generated 248 }; 249 250 // Characters that have complex case conversions are listed here. 251 // This includes cases where more than one character is needed for a conversion, 252 // folding is different to lowering, or (as appropriate) upper(lower(x)) != x or 253 // lower(upper(x)) != x. 254 255 const char *complexCaseConversions = 256 // Original | Folded | Upper | Lower | 257 //++Autogenerated -- start of section automatically generated 258 //**2 \(\*\n\) 259 "\xc2\xb5|\xce\xbc|\xce\x9c||" 260 "\xc3\x9f|ss|SS||" 261 "\xc4\xb0|i\xcc\x87||i\xcc\x87|" 262 "\xc4\xb1||I||" 263 "\xc5\x89|\xca\xbcn|\xca\xbcN||" 264 "\xc5\xbf|s|S||" 265 "\xc7\x85|\xc7\x86|\xc7\x84|\xc7\x86|" 266 "\xc7\x88|\xc7\x89|\xc7\x87|\xc7\x89|" 267 "\xc7\x8b|\xc7\x8c|\xc7\x8a|\xc7\x8c|" 268 "\xc7\xb0|j\xcc\x8c|J\xcc\x8c||" 269 "\xc7\xb2|\xc7\xb3|\xc7\xb1|\xc7\xb3|" 270 "\xcd\x85|\xce\xb9|\xce\x99||" 271 "\xce\x90|\xce\xb9\xcc\x88\xcc\x81|\xce\x99\xcc\x88\xcc\x81||" 272 "\xce\xb0|\xcf\x85\xcc\x88\xcc\x81|\xce\xa5\xcc\x88\xcc\x81||" 273 "\xcf\x82|\xcf\x83|\xce\xa3||" 274 "\xcf\x90|\xce\xb2|\xce\x92||" 275 "\xcf\x91|\xce\xb8|\xce\x98||" 276 "\xcf\x95|\xcf\x86|\xce\xa6||" 277 "\xcf\x96|\xcf\x80|\xce\xa0||" 278 "\xcf\xb0|\xce\xba|\xce\x9a||" 279 "\xcf\xb1|\xcf\x81|\xce\xa1||" 280 "\xcf\xb4|\xce\xb8||\xce\xb8|" 281 "\xcf\xb5|\xce\xb5|\xce\x95||" 282 "\xd6\x87|\xd5\xa5\xd6\x82|\xd4\xb5\xd5\x92||" 283 "\xe1\x8e\xa0|||\xea\xad\xb0|" 284 "\xe1\x8e\xa1|||\xea\xad\xb1|" 285 "\xe1\x8e\xa2|||\xea\xad\xb2|" 286 "\xe1\x8e\xa3|||\xea\xad\xb3|" 287 "\xe1\x8e\xa4|||\xea\xad\xb4|" 288 "\xe1\x8e\xa5|||\xea\xad\xb5|" 289 "\xe1\x8e\xa6|||\xea\xad\xb6|" 290 "\xe1\x8e\xa7|||\xea\xad\xb7|" 291 
"\xe1\x8e\xa8|||\xea\xad\xb8|" 292 "\xe1\x8e\xa9|||\xea\xad\xb9|" 293 "\xe1\x8e\xaa|||\xea\xad\xba|" 294 "\xe1\x8e\xab|||\xea\xad\xbb|" 295 "\xe1\x8e\xac|||\xea\xad\xbc|" 296 "\xe1\x8e\xad|||\xea\xad\xbd|" 297 "\xe1\x8e\xae|||\xea\xad\xbe|" 298 "\xe1\x8e\xaf|||\xea\xad\xbf|" 299 "\xe1\x8e\xb0|||\xea\xae\x80|" 300 "\xe1\x8e\xb1|||\xea\xae\x81|" 301 "\xe1\x8e\xb2|||\xea\xae\x82|" 302 "\xe1\x8e\xb3|||\xea\xae\x83|" 303 "\xe1\x8e\xb4|||\xea\xae\x84|" 304 "\xe1\x8e\xb5|||\xea\xae\x85|" 305 "\xe1\x8e\xb6|||\xea\xae\x86|" 306 "\xe1\x8e\xb7|||\xea\xae\x87|" 307 "\xe1\x8e\xb8|||\xea\xae\x88|" 308 "\xe1\x8e\xb9|||\xea\xae\x89|" 309 "\xe1\x8e\xba|||\xea\xae\x8a|" 310 "\xe1\x8e\xbb|||\xea\xae\x8b|" 311 "\xe1\x8e\xbc|||\xea\xae\x8c|" 312 "\xe1\x8e\xbd|||\xea\xae\x8d|" 313 "\xe1\x8e\xbe|||\xea\xae\x8e|" 314 "\xe1\x8e\xbf|||\xea\xae\x8f|" 315 "\xe1\x8f\x80|||\xea\xae\x90|" 316 "\xe1\x8f\x81|||\xea\xae\x91|" 317 "\xe1\x8f\x82|||\xea\xae\x92|" 318 "\xe1\x8f\x83|||\xea\xae\x93|" 319 "\xe1\x8f\x84|||\xea\xae\x94|" 320 "\xe1\x8f\x85|||\xea\xae\x95|" 321 "\xe1\x8f\x86|||\xea\xae\x96|" 322 "\xe1\x8f\x87|||\xea\xae\x97|" 323 "\xe1\x8f\x88|||\xea\xae\x98|" 324 "\xe1\x8f\x89|||\xea\xae\x99|" 325 "\xe1\x8f\x8a|||\xea\xae\x9a|" 326 "\xe1\x8f\x8b|||\xea\xae\x9b|" 327 "\xe1\x8f\x8c|||\xea\xae\x9c|" 328 "\xe1\x8f\x8d|||\xea\xae\x9d|" 329 "\xe1\x8f\x8e|||\xea\xae\x9e|" 330 "\xe1\x8f\x8f|||\xea\xae\x9f|" 331 "\xe1\x8f\x90|||\xea\xae\xa0|" 332 "\xe1\x8f\x91|||\xea\xae\xa1|" 333 "\xe1\x8f\x92|||\xea\xae\xa2|" 334 "\xe1\x8f\x93|||\xea\xae\xa3|" 335 "\xe1\x8f\x94|||\xea\xae\xa4|" 336 "\xe1\x8f\x95|||\xea\xae\xa5|" 337 "\xe1\x8f\x96|||\xea\xae\xa6|" 338 "\xe1\x8f\x97|||\xea\xae\xa7|" 339 "\xe1\x8f\x98|||\xea\xae\xa8|" 340 "\xe1\x8f\x99|||\xea\xae\xa9|" 341 "\xe1\x8f\x9a|||\xea\xae\xaa|" 342 "\xe1\x8f\x9b|||\xea\xae\xab|" 343 "\xe1\x8f\x9c|||\xea\xae\xac|" 344 "\xe1\x8f\x9d|||\xea\xae\xad|" 345 "\xe1\x8f\x9e|||\xea\xae\xae|" 346 "\xe1\x8f\x9f|||\xea\xae\xaf|" 347 "\xe1\x8f\xa0|||\xea\xae\xb0|" 348 
"\xe1\x8f\xa1|||\xea\xae\xb1|" 349 "\xe1\x8f\xa2|||\xea\xae\xb2|" 350 "\xe1\x8f\xa3|||\xea\xae\xb3|" 351 "\xe1\x8f\xa4|||\xea\xae\xb4|" 352 "\xe1\x8f\xa5|||\xea\xae\xb5|" 353 "\xe1\x8f\xa6|||\xea\xae\xb6|" 354 "\xe1\x8f\xa7|||\xea\xae\xb7|" 355 "\xe1\x8f\xa8|||\xea\xae\xb8|" 356 "\xe1\x8f\xa9|||\xea\xae\xb9|" 357 "\xe1\x8f\xaa|||\xea\xae\xba|" 358 "\xe1\x8f\xab|||\xea\xae\xbb|" 359 "\xe1\x8f\xac|||\xea\xae\xbc|" 360 "\xe1\x8f\xad|||\xea\xae\xbd|" 361 "\xe1\x8f\xae|||\xea\xae\xbe|" 362 "\xe1\x8f\xaf|||\xea\xae\xbf|" 363 "\xe1\x8f\xb0|||\xe1\x8f\xb8|" 364 "\xe1\x8f\xb1|||\xe1\x8f\xb9|" 365 "\xe1\x8f\xb2|||\xe1\x8f\xba|" 366 "\xe1\x8f\xb3|||\xe1\x8f\xbb|" 367 "\xe1\x8f\xb4|||\xe1\x8f\xbc|" 368 "\xe1\x8f\xb5|||\xe1\x8f\xbd|" 369 "\xe1\x8f\xb8|\xe1\x8f\xb0|\xe1\x8f\xb0||" 370 "\xe1\x8f\xb9|\xe1\x8f\xb1|\xe1\x8f\xb1||" 371 "\xe1\x8f\xba|\xe1\x8f\xb2|\xe1\x8f\xb2||" 372 "\xe1\x8f\xbb|\xe1\x8f\xb3|\xe1\x8f\xb3||" 373 "\xe1\x8f\xbc|\xe1\x8f\xb4|\xe1\x8f\xb4||" 374 "\xe1\x8f\xbd|\xe1\x8f\xb5|\xe1\x8f\xb5||" 375 "\xe1\xb2\x80|\xd0\xb2|\xd0\x92||" 376 "\xe1\xb2\x81|\xd0\xb4|\xd0\x94||" 377 "\xe1\xb2\x82|\xd0\xbe|\xd0\x9e||" 378 "\xe1\xb2\x83|\xd1\x81|\xd0\xa1||" 379 "\xe1\xb2\x84|\xd1\x82|\xd0\xa2||" 380 "\xe1\xb2\x85|\xd1\x82|\xd0\xa2||" 381 "\xe1\xb2\x86|\xd1\x8a|\xd0\xaa||" 382 "\xe1\xb2\x87|\xd1\xa3|\xd1\xa2||" 383 "\xe1\xb2\x88|\xea\x99\x8b|\xea\x99\x8a||" 384 "\xe1\xba\x96|h\xcc\xb1|H\xcc\xb1||" 385 "\xe1\xba\x97|t\xcc\x88|T\xcc\x88||" 386 "\xe1\xba\x98|w\xcc\x8a|W\xcc\x8a||" 387 "\xe1\xba\x99|y\xcc\x8a|Y\xcc\x8a||" 388 "\xe1\xba\x9a|a\xca\xbe|A\xca\xbe||" 389 "\xe1\xba\x9b|\xe1\xb9\xa1|\xe1\xb9\xa0||" 390 "\xe1\xba\x9e|ss||\xc3\x9f|" 391 "\xe1\xbd\x90|\xcf\x85\xcc\x93|\xce\xa5\xcc\x93||" 392 "\xe1\xbd\x92|\xcf\x85\xcc\x93\xcc\x80|\xce\xa5\xcc\x93\xcc\x80||" 393 "\xe1\xbd\x94|\xcf\x85\xcc\x93\xcc\x81|\xce\xa5\xcc\x93\xcc\x81||" 394 "\xe1\xbd\x96|\xcf\x85\xcc\x93\xcd\x82|\xce\xa5\xcc\x93\xcd\x82||" 395 "\xe1\xbe\x80|\xe1\xbc\x80\xce\xb9|\xe1\xbc\x88\xce\x99||" 396 
"\xe1\xbe\x81|\xe1\xbc\x81\xce\xb9|\xe1\xbc\x89\xce\x99||" 397 "\xe1\xbe\x82|\xe1\xbc\x82\xce\xb9|\xe1\xbc\x8a\xce\x99||" 398 "\xe1\xbe\x83|\xe1\xbc\x83\xce\xb9|\xe1\xbc\x8b\xce\x99||" 399 "\xe1\xbe\x84|\xe1\xbc\x84\xce\xb9|\xe1\xbc\x8c\xce\x99||" 400 "\xe1\xbe\x85|\xe1\xbc\x85\xce\xb9|\xe1\xbc\x8d\xce\x99||" 401 "\xe1\xbe\x86|\xe1\xbc\x86\xce\xb9|\xe1\xbc\x8e\xce\x99||" 402 "\xe1\xbe\x87|\xe1\xbc\x87\xce\xb9|\xe1\xbc\x8f\xce\x99||" 403 "\xe1\xbe\x88|\xe1\xbc\x80\xce\xb9|\xe1\xbc\x88\xce\x99|\xe1\xbe\x80|" 404 "\xe1\xbe\x89|\xe1\xbc\x81\xce\xb9|\xe1\xbc\x89\xce\x99|\xe1\xbe\x81|" 405 "\xe1\xbe\x8a|\xe1\xbc\x82\xce\xb9|\xe1\xbc\x8a\xce\x99|\xe1\xbe\x82|" 406 "\xe1\xbe\x8b|\xe1\xbc\x83\xce\xb9|\xe1\xbc\x8b\xce\x99|\xe1\xbe\x83|" 407 "\xe1\xbe\x8c|\xe1\xbc\x84\xce\xb9|\xe1\xbc\x8c\xce\x99|\xe1\xbe\x84|" 408 "\xe1\xbe\x8d|\xe1\xbc\x85\xce\xb9|\xe1\xbc\x8d\xce\x99|\xe1\xbe\x85|" 409 "\xe1\xbe\x8e|\xe1\xbc\x86\xce\xb9|\xe1\xbc\x8e\xce\x99|\xe1\xbe\x86|" 410 "\xe1\xbe\x8f|\xe1\xbc\x87\xce\xb9|\xe1\xbc\x8f\xce\x99|\xe1\xbe\x87|" 411 "\xe1\xbe\x90|\xe1\xbc\xa0\xce\xb9|\xe1\xbc\xa8\xce\x99||" 412 "\xe1\xbe\x91|\xe1\xbc\xa1\xce\xb9|\xe1\xbc\xa9\xce\x99||" 413 "\xe1\xbe\x92|\xe1\xbc\xa2\xce\xb9|\xe1\xbc\xaa\xce\x99||" 414 "\xe1\xbe\x93|\xe1\xbc\xa3\xce\xb9|\xe1\xbc\xab\xce\x99||" 415 "\xe1\xbe\x94|\xe1\xbc\xa4\xce\xb9|\xe1\xbc\xac\xce\x99||" 416 "\xe1\xbe\x95|\xe1\xbc\xa5\xce\xb9|\xe1\xbc\xad\xce\x99||" 417 "\xe1\xbe\x96|\xe1\xbc\xa6\xce\xb9|\xe1\xbc\xae\xce\x99||" 418 "\xe1\xbe\x97|\xe1\xbc\xa7\xce\xb9|\xe1\xbc\xaf\xce\x99||" 419 "\xe1\xbe\x98|\xe1\xbc\xa0\xce\xb9|\xe1\xbc\xa8\xce\x99|\xe1\xbe\x90|" 420 "\xe1\xbe\x99|\xe1\xbc\xa1\xce\xb9|\xe1\xbc\xa9\xce\x99|\xe1\xbe\x91|" 421 "\xe1\xbe\x9a|\xe1\xbc\xa2\xce\xb9|\xe1\xbc\xaa\xce\x99|\xe1\xbe\x92|" 422 "\xe1\xbe\x9b|\xe1\xbc\xa3\xce\xb9|\xe1\xbc\xab\xce\x99|\xe1\xbe\x93|" 423 "\xe1\xbe\x9c|\xe1\xbc\xa4\xce\xb9|\xe1\xbc\xac\xce\x99|\xe1\xbe\x94|" 424 "\xe1\xbe\x9d|\xe1\xbc\xa5\xce\xb9|\xe1\xbc\xad\xce\x99|\xe1\xbe\x95|" 425 
"\xe1\xbe\x9e|\xe1\xbc\xa6\xce\xb9|\xe1\xbc\xae\xce\x99|\xe1\xbe\x96|" 426 "\xe1\xbe\x9f|\xe1\xbc\xa7\xce\xb9|\xe1\xbc\xaf\xce\x99|\xe1\xbe\x97|" 427 "\xe1\xbe\xa0|\xe1\xbd\xa0\xce\xb9|\xe1\xbd\xa8\xce\x99||" 428 "\xe1\xbe\xa1|\xe1\xbd\xa1\xce\xb9|\xe1\xbd\xa9\xce\x99||" 429 "\xe1\xbe\xa2|\xe1\xbd\xa2\xce\xb9|\xe1\xbd\xaa\xce\x99||" 430 "\xe1\xbe\xa3|\xe1\xbd\xa3\xce\xb9|\xe1\xbd\xab\xce\x99||" 431 "\xe1\xbe\xa4|\xe1\xbd\xa4\xce\xb9|\xe1\xbd\xac\xce\x99||" 432 "\xe1\xbe\xa5|\xe1\xbd\xa5\xce\xb9|\xe1\xbd\xad\xce\x99||" 433 "\xe1\xbe\xa6|\xe1\xbd\xa6\xce\xb9|\xe1\xbd\xae\xce\x99||" 434 "\xe1\xbe\xa7|\xe1\xbd\xa7\xce\xb9|\xe1\xbd\xaf\xce\x99||" 435 "\xe1\xbe\xa8|\xe1\xbd\xa0\xce\xb9|\xe1\xbd\xa8\xce\x99|\xe1\xbe\xa0|" 436 "\xe1\xbe\xa9|\xe1\xbd\xa1\xce\xb9|\xe1\xbd\xa9\xce\x99|\xe1\xbe\xa1|" 437 "\xe1\xbe\xaa|\xe1\xbd\xa2\xce\xb9|\xe1\xbd\xaa\xce\x99|\xe1\xbe\xa2|" 438 "\xe1\xbe\xab|\xe1\xbd\xa3\xce\xb9|\xe1\xbd\xab\xce\x99|\xe1\xbe\xa3|" 439 "\xe1\xbe\xac|\xe1\xbd\xa4\xce\xb9|\xe1\xbd\xac\xce\x99|\xe1\xbe\xa4|" 440 "\xe1\xbe\xad|\xe1\xbd\xa5\xce\xb9|\xe1\xbd\xad\xce\x99|\xe1\xbe\xa5|" 441 "\xe1\xbe\xae|\xe1\xbd\xa6\xce\xb9|\xe1\xbd\xae\xce\x99|\xe1\xbe\xa6|" 442 "\xe1\xbe\xaf|\xe1\xbd\xa7\xce\xb9|\xe1\xbd\xaf\xce\x99|\xe1\xbe\xa7|" 443 "\xe1\xbe\xb2|\xe1\xbd\xb0\xce\xb9|\xe1\xbe\xba\xce\x99||" 444 "\xe1\xbe\xb3|\xce\xb1\xce\xb9|\xce\x91\xce\x99||" 445 "\xe1\xbe\xb4|\xce\xac\xce\xb9|\xce\x86\xce\x99||" 446 "\xe1\xbe\xb6|\xce\xb1\xcd\x82|\xce\x91\xcd\x82||" 447 "\xe1\xbe\xb7|\xce\xb1\xcd\x82\xce\xb9|\xce\x91\xcd\x82\xce\x99||" 448 "\xe1\xbe\xbc|\xce\xb1\xce\xb9|\xce\x91\xce\x99|\xe1\xbe\xb3|" 449 "\xe1\xbe\xbe|\xce\xb9|\xce\x99||" 450 "\xe1\xbf\x82|\xe1\xbd\xb4\xce\xb9|\xe1\xbf\x8a\xce\x99||" 451 "\xe1\xbf\x83|\xce\xb7\xce\xb9|\xce\x97\xce\x99||" 452 "\xe1\xbf\x84|\xce\xae\xce\xb9|\xce\x89\xce\x99||" 453 "\xe1\xbf\x86|\xce\xb7\xcd\x82|\xce\x97\xcd\x82||" 454 "\xe1\xbf\x87|\xce\xb7\xcd\x82\xce\xb9|\xce\x97\xcd\x82\xce\x99||" 455 
"\xe1\xbf\x8c|\xce\xb7\xce\xb9|\xce\x97\xce\x99|\xe1\xbf\x83|" 456 "\xe1\xbf\x92|\xce\xb9\xcc\x88\xcc\x80|\xce\x99\xcc\x88\xcc\x80||" 457 "\xe1\xbf\x93|\xce\xb9\xcc\x88\xcc\x81|\xce\x99\xcc\x88\xcc\x81||" 458 "\xe1\xbf\x96|\xce\xb9\xcd\x82|\xce\x99\xcd\x82||" 459 "\xe1\xbf\x97|\xce\xb9\xcc\x88\xcd\x82|\xce\x99\xcc\x88\xcd\x82||" 460 "\xe1\xbf\xa2|\xcf\x85\xcc\x88\xcc\x80|\xce\xa5\xcc\x88\xcc\x80||" 461 "\xe1\xbf\xa3|\xcf\x85\xcc\x88\xcc\x81|\xce\xa5\xcc\x88\xcc\x81||" 462 "\xe1\xbf\xa4|\xcf\x81\xcc\x93|\xce\xa1\xcc\x93||" 463 "\xe1\xbf\xa6|\xcf\x85\xcd\x82|\xce\xa5\xcd\x82||" 464 "\xe1\xbf\xa7|\xcf\x85\xcc\x88\xcd\x82|\xce\xa5\xcc\x88\xcd\x82||" 465 "\xe1\xbf\xb2|\xe1\xbd\xbc\xce\xb9|\xe1\xbf\xba\xce\x99||" 466 "\xe1\xbf\xb3|\xcf\x89\xce\xb9|\xce\xa9\xce\x99||" 467 "\xe1\xbf\xb4|\xcf\x8e\xce\xb9|\xce\x8f\xce\x99||" 468 "\xe1\xbf\xb6|\xcf\x89\xcd\x82|\xce\xa9\xcd\x82||" 469 "\xe1\xbf\xb7|\xcf\x89\xcd\x82\xce\xb9|\xce\xa9\xcd\x82\xce\x99||" 470 "\xe1\xbf\xbc|\xcf\x89\xce\xb9|\xce\xa9\xce\x99|\xe1\xbf\xb3|" 471 "\xe2\x84\xa6|\xcf\x89||\xcf\x89|" 472 "\xe2\x84\xaa|k||k|" 473 "\xe2\x84\xab|\xc3\xa5||\xc3\xa5|" 474 "\xea\xad\xb0|\xe1\x8e\xa0|\xe1\x8e\xa0||" 475 "\xea\xad\xb1|\xe1\x8e\xa1|\xe1\x8e\xa1||" 476 "\xea\xad\xb2|\xe1\x8e\xa2|\xe1\x8e\xa2||" 477 "\xea\xad\xb3|\xe1\x8e\xa3|\xe1\x8e\xa3||" 478 "\xea\xad\xb4|\xe1\x8e\xa4|\xe1\x8e\xa4||" 479 "\xea\xad\xb5|\xe1\x8e\xa5|\xe1\x8e\xa5||" 480 "\xea\xad\xb6|\xe1\x8e\xa6|\xe1\x8e\xa6||" 481 "\xea\xad\xb7|\xe1\x8e\xa7|\xe1\x8e\xa7||" 482 "\xea\xad\xb8|\xe1\x8e\xa8|\xe1\x8e\xa8||" 483 "\xea\xad\xb9|\xe1\x8e\xa9|\xe1\x8e\xa9||" 484 "\xea\xad\xba|\xe1\x8e\xaa|\xe1\x8e\xaa||" 485 "\xea\xad\xbb|\xe1\x8e\xab|\xe1\x8e\xab||" 486 "\xea\xad\xbc|\xe1\x8e\xac|\xe1\x8e\xac||" 487 "\xea\xad\xbd|\xe1\x8e\xad|\xe1\x8e\xad||" 488 "\xea\xad\xbe|\xe1\x8e\xae|\xe1\x8e\xae||" 489 "\xea\xad\xbf|\xe1\x8e\xaf|\xe1\x8e\xaf||" 490 "\xea\xae\x80|\xe1\x8e\xb0|\xe1\x8e\xb0||" 491 "\xea\xae\x81|\xe1\x8e\xb1|\xe1\x8e\xb1||" 492 
"\xea\xae\x82|\xe1\x8e\xb2|\xe1\x8e\xb2||" 493 "\xea\xae\x83|\xe1\x8e\xb3|\xe1\x8e\xb3||" 494 "\xea\xae\x84|\xe1\x8e\xb4|\xe1\x8e\xb4||" 495 "\xea\xae\x85|\xe1\x8e\xb5|\xe1\x8e\xb5||" 496 "\xea\xae\x86|\xe1\x8e\xb6|\xe1\x8e\xb6||" 497 "\xea\xae\x87|\xe1\x8e\xb7|\xe1\x8e\xb7||" 498 "\xea\xae\x88|\xe1\x8e\xb8|\xe1\x8e\xb8||" 499 "\xea\xae\x89|\xe1\x8e\xb9|\xe1\x8e\xb9||" 500 "\xea\xae\x8a|\xe1\x8e\xba|\xe1\x8e\xba||" 501 "\xea\xae\x8b|\xe1\x8e\xbb|\xe1\x8e\xbb||" 502 "\xea\xae\x8c|\xe1\x8e\xbc|\xe1\x8e\xbc||" 503 "\xea\xae\x8d|\xe1\x8e\xbd|\xe1\x8e\xbd||" 504 "\xea\xae\x8e|\xe1\x8e\xbe|\xe1\x8e\xbe||" 505 "\xea\xae\x8f|\xe1\x8e\xbf|\xe1\x8e\xbf||" 506 "\xea\xae\x90|\xe1\x8f\x80|\xe1\x8f\x80||" 507 "\xea\xae\x91|\xe1\x8f\x81|\xe1\x8f\x81||" 508 "\xea\xae\x92|\xe1\x8f\x82|\xe1\x8f\x82||" 509 "\xea\xae\x93|\xe1\x8f\x83|\xe1\x8f\x83||" 510 "\xea\xae\x94|\xe1\x8f\x84|\xe1\x8f\x84||" 511 "\xea\xae\x95|\xe1\x8f\x85|\xe1\x8f\x85||" 512 "\xea\xae\x96|\xe1\x8f\x86|\xe1\x8f\x86||" 513 "\xea\xae\x97|\xe1\x8f\x87|\xe1\x8f\x87||" 514 "\xea\xae\x98|\xe1\x8f\x88|\xe1\x8f\x88||" 515 "\xea\xae\x99|\xe1\x8f\x89|\xe1\x8f\x89||" 516 "\xea\xae\x9a|\xe1\x8f\x8a|\xe1\x8f\x8a||" 517 "\xea\xae\x9b|\xe1\x8f\x8b|\xe1\x8f\x8b||" 518 "\xea\xae\x9c|\xe1\x8f\x8c|\xe1\x8f\x8c||" 519 "\xea\xae\x9d|\xe1\x8f\x8d|\xe1\x8f\x8d||" 520 "\xea\xae\x9e|\xe1\x8f\x8e|\xe1\x8f\x8e||" 521 "\xea\xae\x9f|\xe1\x8f\x8f|\xe1\x8f\x8f||" 522 "\xea\xae\xa0|\xe1\x8f\x90|\xe1\x8f\x90||" 523 "\xea\xae\xa1|\xe1\x8f\x91|\xe1\x8f\x91||" 524 "\xea\xae\xa2|\xe1\x8f\x92|\xe1\x8f\x92||" 525 "\xea\xae\xa3|\xe1\x8f\x93|\xe1\x8f\x93||" 526 "\xea\xae\xa4|\xe1\x8f\x94|\xe1\x8f\x94||" 527 "\xea\xae\xa5|\xe1\x8f\x95|\xe1\x8f\x95||" 528 "\xea\xae\xa6|\xe1\x8f\x96|\xe1\x8f\x96||" 529 "\xea\xae\xa7|\xe1\x8f\x97|\xe1\x8f\x97||" 530 "\xea\xae\xa8|\xe1\x8f\x98|\xe1\x8f\x98||" 531 "\xea\xae\xa9|\xe1\x8f\x99|\xe1\x8f\x99||" 532 "\xea\xae\xaa|\xe1\x8f\x9a|\xe1\x8f\x9a||" 533 "\xea\xae\xab|\xe1\x8f\x9b|\xe1\x8f\x9b||" 534 
"\xea\xae\xac|\xe1\x8f\x9c|\xe1\x8f\x9c||" 535 "\xea\xae\xad|\xe1\x8f\x9d|\xe1\x8f\x9d||" 536 "\xea\xae\xae|\xe1\x8f\x9e|\xe1\x8f\x9e||" 537 "\xea\xae\xaf|\xe1\x8f\x9f|\xe1\x8f\x9f||" 538 "\xea\xae\xb0|\xe1\x8f\xa0|\xe1\x8f\xa0||" 539 "\xea\xae\xb1|\xe1\x8f\xa1|\xe1\x8f\xa1||" 540 "\xea\xae\xb2|\xe1\x8f\xa2|\xe1\x8f\xa2||" 541 "\xea\xae\xb3|\xe1\x8f\xa3|\xe1\x8f\xa3||" 542 "\xea\xae\xb4|\xe1\x8f\xa4|\xe1\x8f\xa4||" 543 "\xea\xae\xb5|\xe1\x8f\xa5|\xe1\x8f\xa5||" 544 "\xea\xae\xb6|\xe1\x8f\xa6|\xe1\x8f\xa6||" 545 "\xea\xae\xb7|\xe1\x8f\xa7|\xe1\x8f\xa7||" 546 "\xea\xae\xb8|\xe1\x8f\xa8|\xe1\x8f\xa8||" 547 "\xea\xae\xb9|\xe1\x8f\xa9|\xe1\x8f\xa9||" 548 "\xea\xae\xba|\xe1\x8f\xaa|\xe1\x8f\xaa||" 549 "\xea\xae\xbb|\xe1\x8f\xab|\xe1\x8f\xab||" 550 "\xea\xae\xbc|\xe1\x8f\xac|\xe1\x8f\xac||" 551 "\xea\xae\xbd|\xe1\x8f\xad|\xe1\x8f\xad||" 552 "\xea\xae\xbe|\xe1\x8f\xae|\xe1\x8f\xae||" 553 "\xea\xae\xbf|\xe1\x8f\xaf|\xe1\x8f\xaf||" 554 "\xef\xac\x80|ff|FF||" 555 "\xef\xac\x81|fi|FI||" 556 "\xef\xac\x82|fl|FL||" 557 "\xef\xac\x83|ffi|FFI||" 558 "\xef\xac\x84|ffl|FFL||" 559 "\xef\xac\x85|st|ST||" 560 "\xef\xac\x86|st|ST||" 561 "\xef\xac\x93|\xd5\xb4\xd5\xb6|\xd5\x84\xd5\x86||" 562 "\xef\xac\x94|\xd5\xb4\xd5\xa5|\xd5\x84\xd4\xb5||" 563 "\xef\xac\x95|\xd5\xb4\xd5\xab|\xd5\x84\xd4\xbb||" 564 "\xef\xac\x96|\xd5\xbe\xd5\xb6|\xd5\x8e\xd5\x86||" 565 "\xef\xac\x97|\xd5\xb4\xd5\xad|\xd5\x84\xd4\xbd||" 566 567 //--Autogenerated -- end of section automatically generated 568 ; 569 570 class CaseConverter : public ICaseConverter { 571 // Maximum length of a case conversion result is 6 bytes in UTF-8 572 enum { maxConversionLength=6 }; 573 struct ConversionString { 574 char conversion[maxConversionLength+1]; 575 ConversionString() noexcept : conversion{} { 576 } 577 }; 578 // Conversions are initially store in a vector of structs but then decomposed into 579 // parallel arrays as that is about 10% faster to search. 
580 struct CharacterConversion { 581 int character; 582 ConversionString conversion; 583 CharacterConversion() noexcept : character(0) { 584 // Empty case: NUL -> "". 585 } 586 CharacterConversion(int character_, std::string_view conversion_) noexcept : character(character_) { 587 assert(conversion_.length() <= maxConversionLength); 588 conversion_.copy(conversion.conversion, conversion_.length()); 589 } 590 bool operator<(const CharacterConversion &other) const noexcept { 591 return character < other.character; 592 } 593 }; 594 typedef std::vector<CharacterConversion> CharacterToConversion; 595 CharacterToConversion characterToConversion; 596 // The parallel arrays 597 std::vector<int> characters; 598 std::vector<ConversionString> conversions; 599 600 public: 601 CaseConverter() noexcept { 602 } 603 virtual ~CaseConverter() = default; 604 bool Initialised() const noexcept { 605 return !characters.empty(); 606 } 607 void Add(int character, const char *conversion) { 608 characterToConversion.emplace_back(character, conversion); 609 } 610 const char *Find(int character) { 611 const std::vector<int>::iterator it = std::lower_bound(characters.begin(), characters.end(), character); 612 if (it == characters.end()) 613 return nullptr; 614 else if (*it == character) 615 return conversions[it - characters.begin()].conversion; 616 else 617 return nullptr; 618 } 619 size_t CaseConvertString(char *converted, size_t sizeConverted, const char *mixed, size_t lenMixed) override { 620 size_t lenConverted = 0; 621 size_t mixedPos = 0; 622 unsigned char bytes[UTF8MaxBytes + 1]{}; 623 while (mixedPos < lenMixed) { 624 const unsigned char leadByte = mixed[mixedPos]; 625 const char *caseConverted = nullptr; 626 size_t lenMixedChar = 1; 627 if (UTF8IsAscii(leadByte)) { 628 caseConverted = Find(leadByte); 629 } else { 630 bytes[0] = leadByte; 631 const int widthCharBytes = UTF8BytesOfLead[leadByte]; 632 for (int b=1; b<widthCharBytes; b++) { 633 bytes[b] = (mixedPos+b < lenMixed) ? 
mixed[mixedPos+b] : 0; 634 } 635 const int classified = UTF8Classify(bytes, widthCharBytes); 636 if (!(classified & UTF8MaskInvalid)) { 637 // valid UTF-8 638 lenMixedChar = classified & UTF8MaskWidth; 639 const int character = UnicodeFromUTF8(bytes); 640 caseConverted = Find(character); 641 } 642 } 643 if (caseConverted) { 644 // Character has a conversion so copy that conversion in 645 while (*caseConverted) { 646 converted[lenConverted++] = *caseConverted++; 647 if (lenConverted >= sizeConverted) 648 return 0; 649 } 650 } else { 651 // Character has no conversion so copy the input to output 652 for (size_t i=0; i<lenMixedChar; i++) { 653 converted[lenConverted++] = mixed[mixedPos+i]; 654 if (lenConverted >= sizeConverted) 655 return 0; 656 } 657 } 658 mixedPos += lenMixedChar; 659 } 660 return lenConverted; 661 } 662 void FinishedAdding() { 663 std::sort(characterToConversion.begin(), characterToConversion.end()); 664 characters.reserve(characterToConversion.size()); 665 conversions.reserve(characterToConversion.size()); 666 for (const CharacterConversion &chConv : characterToConversion) { 667 characters.push_back(chConv.character); 668 conversions.push_back(chConv.conversion); 669 } 670 // Empty the original calculated data completely 671 CharacterToConversion().swap(characterToConversion); 672 } 673 }; 674 675 CaseConverter caseConvFold; 676 CaseConverter caseConvUp; 677 CaseConverter caseConvLow; 678 679 void AddSymmetric(enum CaseConversion conversion, int lower,int upper) { 680 char lowerUTF8[UTF8MaxBytes+1]; 681 UTF8FromUTF32Character(lower, lowerUTF8); 682 char upperUTF8[UTF8MaxBytes+1]; 683 UTF8FromUTF32Character(upper, upperUTF8); 684 685 switch (conversion) { 686 case CaseConversionFold: 687 caseConvFold.Add(upper, lowerUTF8); 688 break; 689 case CaseConversionUpper: 690 caseConvUp.Add(lower, upperUTF8); 691 break; 692 case CaseConversionLower: 693 caseConvLow.Add(upper, lowerUTF8); 694 break; 695 } 696 } 697 698 void SetupConversions(enum 
CaseConversion conversion) { 699 // First initialize for the symmetric ranges 700 for (size_t i=0; i<std::size(symmetricCaseConversionRanges);) { 701 const int lower = symmetricCaseConversionRanges[i++]; 702 const int upper = symmetricCaseConversionRanges[i++]; 703 const int length = symmetricCaseConversionRanges[i++]; 704 const int pitch = symmetricCaseConversionRanges[i++]; 705 for (int j=0; j<length*pitch; j+=pitch) { 706 AddSymmetric(conversion, lower+j, upper+j); 707 } 708 } 709 // Add the symmetric singletons 710 for (size_t i=0; i<std::size(symmetricCaseConversions);) { 711 const int lower = symmetricCaseConversions[i++]; 712 const int upper = symmetricCaseConversions[i++]; 713 AddSymmetric(conversion, lower, upper); 714 } 715 // Add the complex cases 716 const char *sComplex = complexCaseConversions; 717 while (*sComplex) { 718 // Longest ligature is 3 character so 5 for safety 719 constexpr size_t lenUTF8 = 5*UTF8MaxBytes+1; 720 unsigned char originUTF8[lenUTF8]{}; 721 char foldedUTF8[lenUTF8]{}; 722 char lowerUTF8[lenUTF8]{}; 723 char upperUTF8[lenUTF8]{}; 724 size_t i = 0; 725 while (*sComplex && *sComplex != '|') { 726 originUTF8[i++] = *sComplex; 727 sComplex++; 728 } 729 sComplex++; 730 originUTF8[i] = 0; 731 i = 0; 732 while (*sComplex && *sComplex != '|') { 733 foldedUTF8[i++] = *sComplex; 734 sComplex++; 735 } 736 sComplex++; 737 foldedUTF8[i] = 0; 738 i = 0; 739 while (*sComplex && *sComplex != '|') { 740 upperUTF8[i++] = *sComplex; 741 sComplex++; 742 } 743 sComplex++; 744 upperUTF8[i] = 0; 745 i = 0; 746 while (*sComplex && *sComplex != '|') { 747 lowerUTF8[i++] = *sComplex; 748 sComplex++; 749 } 750 sComplex++; 751 lowerUTF8[i] = 0; 752 753 const int character = UnicodeFromUTF8(originUTF8); 754 755 if (conversion == CaseConversionFold && foldedUTF8[0]) { 756 caseConvFold.Add(character, foldedUTF8); 757 } 758 759 if (conversion == CaseConversionUpper && upperUTF8[0]) { 760 caseConvUp.Add(character, upperUTF8); 761 } 762 763 if (conversion == 
CaseConversionLower && lowerUTF8[0]) { 764 caseConvLow.Add(character, lowerUTF8); 765 } 766 } 767 768 switch (conversion) { 769 case CaseConversionFold: 770 caseConvFold.FinishedAdding(); 771 break; 772 case CaseConversionUpper: 773 caseConvUp.FinishedAdding(); 774 break; 775 case CaseConversionLower: 776 caseConvLow.FinishedAdding(); 777 break; 778 } 779 } 780 781 CaseConverter *ConverterForConversion(enum CaseConversion conversion) noexcept { 782 switch (conversion) { 783 case CaseConversionFold: 784 return &caseConvFold; 785 case CaseConversionUpper: 786 return &caseConvUp; 787 case CaseConversionLower: 788 return &caseConvLow; 789 } 790 return nullptr; 791 } 792 793 } 794 795 namespace Scintilla { 796 797 ICaseConverter *ConverterFor(enum CaseConversion conversion) { 798 CaseConverter *pCaseConv = ConverterForConversion(conversion); 799 if (!pCaseConv->Initialised()) 800 SetupConversions(conversion); 801 return pCaseConv; 802 } 803 804 const char *CaseConvert(int character, enum CaseConversion conversion) { 805 CaseConverter *pCaseConv = ConverterForConversion(conversion); 806 if (!pCaseConv->Initialised()) 807 SetupConversions(conversion); 808 return pCaseConv->Find(character); 809 } 810 811 size_t CaseConvertString(char *converted, size_t sizeConverted, const char *mixed, size_t lenMixed, enum CaseConversion conversion) { 812 CaseConverter *pCaseConv = ConverterForConversion(conversion); 813 if (!pCaseConv->Initialised()) 814 SetupConversions(conversion); 815 return pCaseConv->CaseConvertString(converted, sizeConverted, mixed, lenMixed); 816 } 817 818 std::string CaseConvertString(const std::string &s, enum CaseConversion conversion) { 819 std::string retMapped(s.length() * maxExpansionCaseConversion, 0); 820 const size_t lenMapped = CaseConvertString(&retMapped[0], retMapped.length(), s.c_str(), s.length(), 821 conversion); 822 retMapped.resize(lenMapped); 823 return retMapped; 824 } 825 826 } 827