1 // Scintilla source code edit control 2 /** @file RESearch.cxx 3 ** Regular expression search library. 4 **/ 5 6 /* 7 * regex - Regular expression pattern matching and replacement 8 * 9 * By: Ozan S. Yigit (oz) 10 * Dept. of Computer Science 11 * York University 12 * 13 * Original code available from http://www.cs.yorku.ca/~oz/ 14 * Translation to C++ by Neil Hodgson [email protected] 15 * Removed all use of register. 16 * Converted to modern function prototypes. 17 * Put all global/static variables into an object so this code can be 18 * used from multiple threads, etc. 19 * Some extensions by Philippe Lhoste PhiLho(a)GMX.net 20 * '?' extensions by Michael Mullin [email protected] 21 * 22 * These routines are the PUBLIC DOMAIN equivalents of regex 23 * routines as found in 4.nBSD UN*X, with minor extensions. 24 * 25 * These routines are derived from various implementations found 26 * in software tools books, and Conroy's grep. They are NOT derived 27 * from licensed/restricted software. 28 * For more interesting/academic/complicated implementations, 29 * see Henry Spencer's regexp routines, or GNU Emacs pattern 30 * matching module. 31 * 32 * Modification history removed. 33 * 34 * Interfaces: 35 * RESearch::Compile: compile a regular expression into a NFA. 36 * 37 * const char *RESearch::Compile(const char *pattern, int length, 38 * bool caseSensitive, bool posix) 39 * 40 * Returns a short error string if they fail. 41 * 42 * RESearch::Execute: execute the NFA to match a pattern. 43 * 44 * int RESearch::Execute(characterIndexer &ci, int lp, int endp) 45 * 46 * re_fail: failure routine for RESearch::Execute. (no longer used) 47 * 48 * void re_fail(char *msg, char op) 49 * 50 * Regular Expressions: 51 * 52 * [1] char matches itself, unless it is a special 53 * character (metachar): . \ [ ] * + ? ^ $ 54 * and ( ) if posix option. 55 * 56 * [2] . matches any character. 57 * 58 * [3] \ matches the character following it, except: 59 * - \a, \b, \f, \n, \r, \t, \v match the corresponding C 60 * escape char, respectively BEL, BS, FF, LF, CR, TAB and VT; 61 * Note that \r and \n are never matched because Scintilla 62 * regex searches are made line per line 63 * (stripped of end-of-line chars). 64 * - if not in posix mode, when followed by a 65 * left or right round bracket (see [8]); 66 * - when followed by a digit 1 to 9 (see [9]); 67 * - when followed by a left or right angle bracket 68 * (see [10]); 69 * - when followed by d, D, s, S, w or W (see [11]); 70 * - when followed by x and two hexa digits (see [12]. 71 * Backslash is used as an escape character for all 72 * other meta-characters, and itself. 73 * 74 * [4] [set] matches one of the characters in the set. 75 * If the first character in the set is "^", 76 * it matches the characters NOT in the set, i.e. 77 * complements the set. A shorthand S-E (start dash end) 78 * is used to specify a set of characters S up to 79 * E, inclusive. S and E must be characters, otherwise 80 * the dash is taken literally (eg. in expression [\d-a]). 81 * The special characters "]" and "-" have no special 82 * meaning if they appear as the first chars in the set. 83 * To include both, put - first: [-]A-Z] 84 * (or just backslash them). 85 * examples: match: 86 * 87 * [-]|] matches these 3 chars, 88 * 89 * []-|] matches from ] to | chars 90 * 91 * [a-z] any lowercase alpha 92 * 93 * [^-]] any char except - and ] 94 * 95 * [^A-Z] any char except uppercase 96 * alpha 97 * 98 * [a-zA-Z] any alpha 99 * 100 * [5] * any regular expression form [1] to [4] 101 * (except [8], [9] and [10] forms of [3]), 102 * followed by closure char (*) 103 * matches zero or more matches of that form. 104 * 105 * [6] + same as [5], except it matches one or more. 106 * 107 * [5-6] Both [5] and [6] are greedy (they match as much as possible). 108 * Unless they are followed by the 'lazy' quantifier (?) 109 * In which case both [5] and [6] try to match as little as possible 110 * 111 * [7] ? same as [5] except it matches zero or one. 112 * 113 * [8] a regular expression in the form [1] to [13], enclosed 114 * as \(form\) (or (form) with posix flag) matches what 115 * form matches. The enclosure creates a set of tags, 116 * used for [9] and for pattern substitution. 117 * The tagged forms are numbered starting from 1. 118 * 119 * [9] a \ followed by a digit 1 to 9 matches whatever a 120 * previously tagged regular expression ([8]) matched. 121 * 122 * [10] \< a regular expression starting with a \< construct 123 * \> and/or ending with a \> construct, restricts the 124 * pattern matching to the beginning of a word, and/or 125 * the end of a word. A word is defined to be a character 126 * string beginning and/or ending with the characters 127 * A-Z a-z 0-9 and _. Scintilla extends this definition 128 * by user setting. The word must also be preceded and/or 129 * followed by any character outside those mentioned. 130 * 131 * [11] \l a backslash followed by d, D, s, S, w or W, 132 * becomes a character class (both inside and 133 * outside sets []). 134 * d: decimal digits 135 * D: any char except decimal digits 136 * s: whitespace (space, \t \n \r \f \v) 137 * S: any char except whitespace (see above) 138 * w: alphanumeric & underscore (changed by user setting) 139 * W: any char except alphanumeric & underscore (see above) 140 * 141 * [12] \xHH a backslash followed by x and two hexa digits, 142 * becomes the character whose ASCII code is equal 143 * to these digits. If not followed by two digits, 144 * it is 'x' char itself. 145 * 146 * [13] a composite regular expression xy where x and y 147 * are in the form [1] to [12] matches the longest 148 * match of x followed by a match for y. 149 * 150 * [14] ^ a regular expression starting with a ^ character 151 * $ and/or ending with a $ character, restricts the 152 * pattern matching to the beginning of the line, 153 * or the end of line. [anchors] Elsewhere in the 154 * pattern, ^ and $ are treated as ordinary characters. 155 * 156 * 157 * Acknowledgements: 158 * 159 * HCR's Hugh Redelmeier has been most helpful in various 160 * stages of development. He convinced me to include BOW 161 * and EOW constructs, originally invented by Rob Pike at 162 * the University of Toronto. 163 * 164 * References: 165 * Software tools Kernighan & Plauger 166 * Software tools in Pascal Kernighan & Plauger 167 * Grep [rsx-11 C dist] David Conroy 168 * ed - text editor Un*x Programmer's Manual 169 * Advanced editing on Un*x B. W. Kernighan 170 * RegExp routines Henry Spencer 171 * 172 * Notes: 173 * 174 * This implementation uses a bit-set representation for character 175 * classes for speed and compactness. Each character is represented 176 * by one bit in a 256-bit block. Thus, CCL always takes a 177 * constant 32 bytes in the internal nfa, and RESearch::Execute does a single 178 * bit comparison to locate the character in the set. 179 * 180 * Examples: 181 * 182 * pattern: foo*.* 183 * compile: CHR f CHR o CLO CHR o END CLO ANY END END 184 * matches: fo foo fooo foobar fobar foxx ... 185 * 186 * pattern: fo[ob]a[rz] 187 * compile: CHR f CHR o CCL bitset CHR a CCL bitset END 188 * matches: fobar fooar fobaz fooaz 189 * 190 * pattern: foo\\+ 191 * compile: CHR f CHR o CHR o CHR \ CLO CHR \ END END 192 * matches: foo\ foo\\ foo\\\ ... 193 * 194 * pattern: \(foo\)[1-3]\1 (same as foo[1-3]foo) 195 * compile: BOT 1 CHR f CHR o CHR o EOT 1 CCL bitset REF 1 END 196 * matches: foo1foo foo2foo foo3foo 197 * 198 * pattern: \(fo.*\)-\1 199 * compile: BOT 1 CHR f CHR o CLO ANY END EOT 1 CHR - REF 1 END 200 * matches: foo-foo fo-fo fob-fob foobar-foobar ... 201 */ 202 203 #include <cstddef> 204 #include <cstdlib> 205 206 #include <stdexcept> 207 #include <string> 208 #include <algorithm> 209 #include <iterator> 210 211 #include "Position.h" 212 #include "CharClassify.h" 213 #include "RESearch.h" 214 215 using namespace Scintilla; 216 217 #define OKP 1 218 #define NOP 0 219 220 #define CHR 1 221 #define ANY 2 222 #define CCL 3 223 #define BOL 4 224 #define EOL 5 225 #define BOT 6 226 #define EOT 7 227 #define BOW 8 228 #define EOW 9 229 #define REF 10 230 #define CLO 11 231 #define CLQ 12 /* 0 to 1 closure */ 232 #define LCLO 13 /* lazy closure */ 233 234 #define END 0 235 236 /* 237 * The following defines are not meant to be changeable. 238 * They are for readability only. 239 */ 240 #define BLKIND 0370 241 #define BITIND 07 242 243 static const char bitarr[] = { 1, 2, 4, 8, 16, 32, 64, '\200' }; 244 245 #define badpat(x) (*nfa = END, x) 246 247 /* 248 * Character classification table for word boundary operators BOW 249 * and EOW is passed in by the creator of this object (Scintilla 250 * Document). The Document default state is that word chars are: 251 * 0-9, a-z, A-Z and _ 252 */ 253 254 RESearch::RESearch(CharClassify *charClassTable) { 255 failure = 0; 256 charClass = charClassTable; 257 sta = NOP; /* status of lastpat */ 258 bol = 0; 259 constexpr unsigned char nul = 0; 260 std::fill(bittab, std::end(bittab), nul); 261 std::fill(tagstk, std::end(tagstk), 0); 262 std::fill(nfa, std::end(nfa), '\0'); 263 Clear(); 264 } 265 266 RESearch::~RESearch() { 267 Clear(); 268 } 269 270 void RESearch::Clear() noexcept { 271 for (int i = 0; i < MAXTAG; i++) { 272 pat[i].clear(); 273 bopat[i] = NOTFOUND; 274 eopat[i] = NOTFOUND; 275 } 276 } 277 278 void RESearch::GrabMatches(const CharacterIndexer &ci) { 279 for (unsigned int i = 0; i < MAXTAG; i++) { 280 if ((bopat[i] != NOTFOUND) && (eopat[i] != NOTFOUND)) { 281 const Sci::Position len = eopat[i] - bopat[i]; 282 pat[i].resize(len); 283 for (Sci::Position j = 0; j < len; j++) 284 pat[i][j] = ci.CharAt(bopat[i] + j); 285 } 286 } 287 } 288 289 void RESearch::ChSet(unsigned char c) noexcept { 290 bittab[((c) & BLKIND) >> 3] |= bitarr[(c) & BITIND]; 291 } 292 293 void RESearch::ChSetWithCase(unsigned char c, bool caseSensitive) noexcept { 294 ChSet(c); 295 if (!caseSensitive) { 296 if ((c >= 'a') && (c <= 'z')) { 297 ChSet(c - 'a' + 'A'); 298 } else if ((c >= 'A') && (c <= 'Z')) { 299 ChSet(c - 'A' + 'a'); 300 } 301 } 302 } 303 304 static unsigned char escapeValue(unsigned char ch) noexcept { 305 switch (ch) { 306 case 'a': return '\a'; 307 case 'b': return '\b'; 308 case 'f': return '\f'; 309 case 'n': return '\n'; 310 case 'r': return '\r'; 311 case 't': return '\t'; 312 case 'v': return '\v'; 313 } 314 return 0; 315 } 316 317 static int GetHexaChar(unsigned char hd1, unsigned char hd2) noexcept { 318 int hexValue = 0; 319 if (hd1 >= '0' && hd1 <= '9') { 320 hexValue += 16 * (hd1 - '0'); 321 } else if (hd1 >= 'A' && hd1 <= 'F') { 322 hexValue += 16 * (hd1 - 'A' + 10); 323 } else if (hd1 >= 'a' && hd1 <= 'f') { 324 hexValue += 16 * (hd1 - 'a' + 10); 325 } else { 326 return -1; 327 } 328 if (hd2 >= '0' && hd2 <= '9') { 329 hexValue += hd2 - '0'; 330 } else if (hd2 >= 'A' && hd2 <= 'F') { 331 hexValue += hd2 - 'A' + 10; 332 } else if (hd2 >= 'a' && hd2 <= 'f') { 333 hexValue += hd2 - 'a' + 10; 334 } else { 335 return -1; 336 } 337 return hexValue; 338 } 339 340 /** 341 * Called when the parser finds a backslash not followed 342 * by a valid expression (like \( in non-Posix mode). 343 * @param pattern : pointer on the char after the backslash. 344 * @param incr : (out) number of chars to skip after expression evaluation. 345 * @return the char if it resolves to a simple char, 346 * or -1 for a char class. In this case, bittab is changed. 347 */ 348 int RESearch::GetBackslashExpression( 349 const char *pattern, 350 int &incr) noexcept { 351 // Since error reporting is primitive and messages are not used anyway, 352 // I choose to interpret unexpected syntax in a logical way instead 353 // of reporting errors. Otherwise, we can stick on, eg., PCRE behaviour. 354 incr = 0; // Most of the time, will skip the char "naturally". 355 int c; 356 int result = -1; 357 const unsigned char bsc = *pattern; 358 if (!bsc) { 359 // Avoid overrun 360 result = '\\'; // \ at end of pattern, take it literally 361 return result; 362 } 363 364 switch (bsc) { 365 case 'a': 366 case 'b': 367 case 'n': 368 case 'f': 369 case 'r': 370 case 't': 371 case 'v': 372 result = escapeValue(bsc); 373 break; 374 case 'x': { 375 const unsigned char hd1 = *(pattern + 1); 376 const unsigned char hd2 = *(pattern + 2); 377 const int hexValue = GetHexaChar(hd1, hd2); 378 if (hexValue >= 0) { 379 result = hexValue; 380 incr = 2; // Must skip the digits 381 } else { 382 result = 'x'; // \x without 2 digits: see it as 'x' 383 } 384 } 385 break; 386 case 'd': 387 for (c = '0'; c <= '9'; c++) { 388 ChSet(static_cast<unsigned char>(c)); 389 } 390 break; 391 case 'D': 392 for (c = 0; c < MAXCHR; c++) { 393 if (c < '0' || c > '9') { 394 ChSet(static_cast<unsigned char>(c)); 395 } 396 } 397 break; 398 case 's': 399 ChSet(' '); 400 ChSet('\t'); 401 ChSet('\n'); 402 ChSet('\r'); 403 ChSet('\f'); 404 ChSet('\v'); 405 break; 406 case 'S': 407 for (c = 0; c < MAXCHR; c++) { 408 if (c != ' ' && !(c >= 0x09 && c <= 0x0D)) { 409 ChSet(static_cast<unsigned char>(c)); 410 } 411 } 412 break; 413 case 'w': 414 for (c = 0; c < MAXCHR; c++) { 415 if (iswordc(static_cast<unsigned char>(c))) { 416 ChSet(static_cast<unsigned char>(c)); 417 } 418 } 419 break; 420 case 'W': 421 for (c = 0; c < MAXCHR; c++) { 422 if (!iswordc(static_cast<unsigned char>(c))) { 423 ChSet(static_cast<unsigned char>(c)); 424 } 425 } 426 break; 427 default: 428 result = bsc; 429 } 430 return result; 431 } 432 433 const char *RESearch::Compile(const char *pattern, Sci::Position length, bool caseSensitive, bool posix) noexcept { 434 char *mp=nfa; /* nfa pointer */ 435 char *lp; /* saved pointer */ 436 char *sp=nfa; /* another one */ 437 char *mpMax = mp + MAXNFA - BITBLK - 10; 438 439 int tagi = 0; /* tag stack index */ 440 int tagc = 1; /* actual tag count */ 441 442 int n; 443 char mask; /* xor mask -CCL/NCL */ 444 int c1, c2, prevChar; 445 446 if (!pattern || !length) { 447 if (sta) 448 return nullptr; 449 else 450 return badpat("No previous regular expression"); 451 } 452 sta = NOP; 453 454 const char *p=pattern; /* pattern pointer */ 455 for (int i=0; i<length; i++, p++) { 456 if (mp > mpMax) 457 return badpat("Pattern too long"); 458 lp = mp; 459 switch (*p) { 460 461 case '.': /* match any char */ 462 *mp++ = ANY; 463 break; 464 465 case '^': /* match beginning */ 466 if (p == pattern) { 467 *mp++ = BOL; 468 } else { 469 *mp++ = CHR; 470 *mp++ = *p; 471 } 472 break; 473 474 case '$': /* match endofline */ 475 if (!*(p+1)) { 476 *mp++ = EOL; 477 } else { 478 *mp++ = CHR; 479 *mp++ = *p; 480 } 481 break; 482 483 case '[': /* match char class */ 484 *mp++ = CCL; 485 prevChar = 0; 486 487 i++; 488 if (*++p == '^') { 489 mask = '\377'; 490 i++; 491 p++; 492 } else { 493 mask = 0; 494 } 495 496 if (*p == '-') { /* real dash */ 497 i++; 498 prevChar = *p; 499 ChSet(*p++); 500 } 501 if (*p == ']') { /* real brace */ 502 i++; 503 prevChar = *p; 504 ChSet(*p++); 505 } 506 while (*p && *p != ']') { 507 if (*p == '-') { 508 if (prevChar < 0) { 509 // Previous def. was a char class like \d, take dash literally 510 prevChar = *p; 511 ChSet(*p); 512 } else if (*(p+1)) { 513 if (*(p+1) != ']') { 514 c1 = prevChar + 1; 515 i++; 516 c2 = static_cast<unsigned char>(*++p); 517 if (c2 == '\\') { 518 if (!*(p+1)) { // End of RE 519 return badpat("Missing ]"); 520 } else { 521 i++; 522 p++; 523 int incr; 524 c2 = GetBackslashExpression(p, incr); 525 i += incr; 526 p += incr; 527 if (c2 >= 0) { 528 // Convention: \c (c is any char) is case sensitive, whatever the option 529 ChSet(static_cast<unsigned char>(c2)); 530 prevChar = c2; 531 } else { 532 // bittab is already changed 533 prevChar = -1; 534 } 535 } 536 } 537 if (prevChar < 0) { 538 // Char after dash is char class like \d, take dash literally 539 prevChar = '-'; 540 ChSet('-'); 541 } else { 542 // Put all chars between c1 and c2 included in the char set 543 while (c1 <= c2) { 544 ChSetWithCase(static_cast<unsigned char>(c1++), caseSensitive); 545 } 546 } 547 } else { 548 // Dash before the ], take it literally 549 prevChar = *p; 550 ChSet(*p); 551 } 552 } else { 553 return badpat("Missing ]"); 554 } 555 } else if (*p == '\\' && *(p+1)) { 556 i++; 557 p++; 558 int incr; 559 const int c = GetBackslashExpression(p, incr); 560 i += incr; 561 p += incr; 562 if (c >= 0) { 563 // Convention: \c (c is any char) is case sensitive, whatever the option 564 ChSet(static_cast<unsigned char>(c)); 565 prevChar = c; 566 } else { 567 // bittab is already changed 568 prevChar = -1; 569 } 570 } else { 571 prevChar = static_cast<unsigned char>(*p); 572 ChSetWithCase(*p, caseSensitive); 573 } 574 i++; 575 p++; 576 } 577 if (!*p) 578 return badpat("Missing ]"); 579 580 for (n = 0; n < BITBLK; bittab[n++] = 0) 581 *mp++ = static_cast<char>(mask ^ bittab[n]); 582 583 break; 584 585 case '*': /* match 0 or more... */ 586 case '+': /* match 1 or more... */ 587 case '?': 588 if (p == pattern) 589 return badpat("Empty closure"); 590 lp = sp; /* previous opcode */ 591 if (*lp == CLO || *lp == LCLO) /* equivalence... */ 592 break; 593 switch (*lp) { 594 595 case BOL: 596 case BOT: 597 case EOT: 598 case BOW: 599 case EOW: 600 case REF: 601 return badpat("Illegal closure"); 602 default: 603 break; 604 } 605 606 if (*p == '+') 607 for (sp = mp; lp < sp; lp++) 608 *mp++ = *lp; 609 610 *mp++ = END; 611 *mp++ = END; 612 sp = mp; 613 614 while (--mp > lp) 615 *mp = mp[-1]; 616 if (*p == '?') *mp = CLQ; 617 else if (*(p+1) == '?') *mp = LCLO; 618 else *mp = CLO; 619 620 mp = sp; 621 break; 622 623 case '\\': /* tags, backrefs... */ 624 i++; 625 switch (*++p) { 626 case '<': 627 *mp++ = BOW; 628 break; 629 case '>': 630 if (*sp == BOW) 631 return badpat("Null pattern inside \\<\\>"); 632 *mp++ = EOW; 633 break; 634 case '1': 635 case '2': 636 case '3': 637 case '4': 638 case '5': 639 case '6': 640 case '7': 641 case '8': 642 case '9': 643 n = *p-'0'; 644 if (tagi > 0 && tagstk[tagi] == n) 645 return badpat("Cyclical reference"); 646 if (tagc > n) { 647 *mp++ = REF; 648 *mp++ = static_cast<char>(n); 649 } else { 650 return badpat("Undetermined reference"); 651 } 652 break; 653 default: 654 if (!posix && *p == '(') { 655 if (tagc < MAXTAG) { 656 tagstk[++tagi] = tagc; 657 *mp++ = BOT; 658 *mp++ = static_cast<char>(tagc++); 659 } else { 660 return badpat("Too many \\(\\) pairs"); 661 } 662 } else if (!posix && *p == ')') { 663 if (*sp == BOT) 664 return badpat("Null pattern inside \\(\\)"); 665 if (tagi > 0) { 666 *mp++ = EOT; 667 *mp++ = static_cast<char>(tagstk[tagi--]); 668 } else { 669 return badpat("Unmatched \\)"); 670 } 671 } else { 672 int incr; 673 int c = GetBackslashExpression(p, incr); 674 i += incr; 675 p += incr; 676 if (c >= 0) { 677 *mp++ = CHR; 678 *mp++ = static_cast<unsigned char>(c); 679 } else { 680 *mp++ = CCL; 681 mask = 0; 682 for (n = 0; n < BITBLK; bittab[n++] = 0) 683 *mp++ = static_cast<char>(mask ^ bittab[n]); 684 } 685 } 686 } 687 break; 688 689 default : /* an ordinary char */ 690 if (posix && *p == '(') { 691 if (tagc < MAXTAG) { 692 tagstk[++tagi] = tagc; 693 *mp++ = BOT; 694 *mp++ = static_cast<char>(tagc++); 695 } else { 696 return badpat("Too many () pairs"); 697 } 698 } else if (posix && *p == ')') { 699 if (*sp == BOT) 700 return badpat("Null pattern inside ()"); 701 if (tagi > 0) { 702 *mp++ = EOT; 703 *mp++ = static_cast<char>(tagstk[tagi--]); 704 } else { 705 return badpat("Unmatched )"); 706 } 707 } else { 708 unsigned char c = *p; 709 if (!c) // End of RE 710 c = '\\'; // We take it as raw backslash 711 if (caseSensitive || !iswordc(c)) { 712 *mp++ = CHR; 713 *mp++ = c; 714 } else { 715 *mp++ = CCL; 716 mask = 0; 717 ChSetWithCase(c, false); 718 for (n = 0; n < BITBLK; bittab[n++] = 0) 719 *mp++ = static_cast<char>(mask ^ bittab[n]); 720 } 721 } 722 break; 723 } 724 sp = lp; 725 } 726 if (tagi > 0) 727 return badpat((posix ? "Unmatched (" : "Unmatched \\(")); 728 *mp = END; 729 sta = OKP; 730 return nullptr; 731 } 732 733 /* 734 * RESearch::Execute: 735 * execute nfa to find a match. 736 * 737 * special cases: (nfa[0]) 738 * BOL 739 * Match only once, starting from the 740 * beginning. 741 * CHR 742 * First locate the character without 743 * calling PMatch, and if found, call 744 * PMatch for the remaining string. 745 * END 746 * RESearch::Compile failed, poor luser did not 747 * check for it. Fail fast. 748 * 749 * If a match is found, bopat[0] and eopat[0] are set 750 * to the beginning and the end of the matched fragment, 751 * respectively. 752 * 753 */ 754 int RESearch::Execute(const CharacterIndexer &ci, Sci::Position lp, Sci::Position endp) { 755 unsigned char c; 756 Sci::Position ep = NOTFOUND; 757 char *ap = nfa; 758 759 bol = lp; 760 failure = 0; 761 762 Clear(); 763 764 switch (*ap) { 765 766 case BOL: /* anchored: match from BOL only */ 767 ep = PMatch(ci, lp, endp, ap); 768 break; 769 case EOL: /* just searching for end of line normal path doesn't work */ 770 if (*(ap+1) == END) { 771 lp = endp; 772 ep = lp; 773 break; 774 } else { 775 return 0; 776 } 777 case CHR: /* ordinary char: locate it fast */ 778 c = *(ap+1); 779 while ((lp < endp) && (static_cast<unsigned char>(ci.CharAt(lp)) != c)) 780 lp++; 781 if (lp >= endp) /* if EOS, fail, else fall through. */ 782 return 0; 783 [[fallthrough]]; 784 default: /* regular matching all the way. */ 785 while (lp < endp) { 786 ep = PMatch(ci, lp, endp, ap); 787 if (ep != NOTFOUND) 788 break; 789 lp++; 790 } 791 break; 792 case END: /* munged automaton. fail always */ 793 return 0; 794 } 795 if (ep == NOTFOUND) 796 return 0; 797 798 bopat[0] = lp; 799 eopat[0] = ep; 800 return 1; 801 } 802 803 /* 804 * PMatch: internal routine for the hard part 805 * 806 * This code is partly snarfed from an early grep written by 807 * David Conroy. The backref and tag stuff, and various other 808 * innovations are by oz. 809 * 810 * special case optimizations: (nfa[n], nfa[n+1]) 811 * CLO ANY 812 * We KNOW .* will match everything up to the 813 * end of line. Thus, directly go to the end of 814 * line, without recursive PMatch calls. As in 815 * the other closure cases, the remaining pattern 816 * must be matched by moving backwards on the 817 * string recursively, to find a match for xy 818 * (x is ".*" and y is the remaining pattern) 819 * where the match satisfies the LONGEST match for 820 * x followed by a match for y. 821 * CLO CHR 822 * We can again scan the string forward for the 823 * single char and at the point of failure, we 824 * execute the remaining nfa recursively, same as 825 * above. 826 * 827 * At the end of a successful match, bopat[n] and eopat[n] 828 * are set to the beginning and end of subpatterns matched 829 * by tagged expressions (n = 1 to 9). 830 */ 831 832 extern void re_fail(char *,char); 833 834 static inline int isinset(const char *ap, unsigned char c) noexcept { 835 return ap[(c & BLKIND) >> 3] & bitarr[c & BITIND]; 836 } 837 838 /* 839 * skip values for CLO XXX to skip past the closure 840 */ 841 842 #define ANYSKIP 2 /* [CLO] ANY END */ 843 #define CHRSKIP 3 /* [CLO] CHR chr END */ 844 #define CCLSKIP 34 /* [CLO] CCL 32 bytes END */ 845 846 Sci::Position RESearch::PMatch(const CharacterIndexer &ci, Sci::Position lp, Sci::Position endp, char *ap) { 847 int op, c, n; 848 Sci::Position e; /* extra pointer for CLO */ 849 Sci::Position bp; /* beginning of subpat... */ 850 Sci::Position ep; /* ending of subpat... */ 851 Sci::Position are; /* to save the line ptr. */ 852 Sci::Position llp; /* lazy lp for LCLO */ 853 854 while ((op = *ap++) != END) 855 switch (op) { 856 857 case CHR: 858 if (ci.CharAt(lp++) != *ap++) 859 return NOTFOUND; 860 break; 861 case ANY: 862 if (lp++ >= endp) 863 return NOTFOUND; 864 break; 865 case CCL: 866 if (lp >= endp) 867 return NOTFOUND; 868 if (!isinset(ap, ci.CharAt(lp++))) 869 return NOTFOUND; 870 ap += BITBLK; 871 break; 872 case BOL: 873 if (lp != bol) 874 return NOTFOUND; 875 break; 876 case EOL: 877 if (lp < endp) 878 return NOTFOUND; 879 break; 880 case BOT: 881 bopat[static_cast<int>(*ap++)] = lp; 882 break; 883 case EOT: 884 eopat[static_cast<int>(*ap++)] = lp; 885 break; 886 case BOW: 887 if ((lp!=bol && iswordc(ci.CharAt(lp-1))) || !iswordc(ci.CharAt(lp))) 888 return NOTFOUND; 889 break; 890 case EOW: 891 if (lp==bol || !iswordc(ci.CharAt(lp-1)) || iswordc(ci.CharAt(lp))) 892 return NOTFOUND; 893 break; 894 case REF: 895 n = *ap++; 896 bp = bopat[n]; 897 ep = eopat[n]; 898 while (bp < ep) 899 if (ci.CharAt(bp++) != ci.CharAt(lp++)) 900 return NOTFOUND; 901 break; 902 case LCLO: 903 case CLQ: 904 case CLO: 905 are = lp; 906 switch (*ap) { 907 908 case ANY: 909 if (op == CLO || op == LCLO) 910 while (lp < endp) 911 lp++; 912 else if (lp < endp) 913 lp++; 914 915 n = ANYSKIP; 916 break; 917 case CHR: 918 c = *(ap+1); 919 if (op == CLO || op == LCLO) 920 while ((lp < endp) && (c == ci.CharAt(lp))) 921 lp++; 922 else if ((lp < endp) && (c == ci.CharAt(lp))) 923 lp++; 924 n = CHRSKIP; 925 break; 926 case CCL: 927 while ((lp < endp) && isinset(ap+1, ci.CharAt(lp))) 928 lp++; 929 n = CCLSKIP; 930 break; 931 default: 932 failure = true; 933 //re_fail("closure: bad nfa.", *ap); 934 return NOTFOUND; 935 } 936 ap += n; 937 938 llp = lp; 939 e = NOTFOUND; 940 while (llp >= are) { 941 Sci::Position q; 942 if ((q = PMatch(ci, llp, endp, ap)) != NOTFOUND) { 943 e = q; 944 lp = llp; 945 if (op != LCLO) return e; 946 } 947 if (*ap == END) return e; 948 --llp; 949 } 950 if (*ap == EOT) 951 PMatch(ci, lp, endp, ap); 952 return e; 953 default: 954 //re_fail("RESearch::Execute: bad nfa.", static_cast<char>(op)); 955 return NOTFOUND; 956 } 957 return lp; 958 } 959 960 961