1// Copyright 2011 The Go Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style 3// license that can be found in the LICENSE file. 4 5package regexp 6 7import ( 8 "io" 9 "regexp/syntax" 10 "sync" 11) 12 13// A queue is a 'sparse array' holding pending threads of execution. 14// See https://research.swtch.com/2008/03/using-uninitialized-memory-for-fun-and.html 15type queue struct { 16 sparse []uint32 17 dense []entry 18} 19 20// An entry is an entry on a queue. 21// It holds both the instruction pc and the actual thread. 22// Some queue entries are just place holders so that the machine 23// knows it has considered that pc. Such entries have t == nil. 24type entry struct { 25 pc uint32 26 t *thread 27} 28 29// A thread is the state of a single path through the machine: 30// an instruction and a corresponding capture array. 31// See https://swtch.com/~rsc/regexp/regexp2.html 32type thread struct { 33 inst *syntax.Inst 34 cap []int 35} 36 37// A machine holds all the state during an NFA simulation for p. 38type machine struct { 39 re *Regexp // corresponding Regexp 40 p *syntax.Prog // compiled program 41 q0, q1 queue // two queues for runq, nextq 42 pool []*thread // pool of available threads 43 matched bool // whether a match was found 44 matchcap []int // capture information for the match 45 46 inputs inputs 47} 48 49type inputs struct { 50 // cached inputs, to avoid allocation 51 bytes inputBytes 52 string inputString 53 reader inputReader 54} 55 56func (i *inputs) newBytes(b []byte) input { 57 i.bytes.str = b 58 return &i.bytes 59} 60 61func (i *inputs) newString(s string) input { 62 i.string.str = s 63 return &i.string 64} 65 66func (i *inputs) newReader(r io.RuneReader) input { 67 i.reader.r = r 68 i.reader.atEOT = false 69 i.reader.pos = 0 70 return &i.reader 71} 72 73func (i *inputs) clear() { 74 // We need to clear 1 of these. 75 // Avoid the expense of clearing the others (pointer write barrier). 76 if i.bytes.str != nil { 77 i.bytes.str = nil 78 } else if i.reader.r != nil { 79 i.reader.r = nil 80 } else { 81 i.string.str = "" 82 } 83} 84 85func (i *inputs) init(r io.RuneReader, b []byte, s string) (input, int) { 86 if r != nil { 87 return i.newReader(r), 0 88 } 89 if b != nil { 90 return i.newBytes(b), len(b) 91 } 92 return i.newString(s), len(s) 93} 94 95func (m *machine) init(ncap int) { 96 for _, t := range m.pool { 97 t.cap = t.cap[:ncap] 98 } 99 m.matchcap = m.matchcap[:ncap] 100} 101 102// alloc allocates a new thread with the given instruction. 103// It uses the free pool if possible. 104func (m *machine) alloc(i *syntax.Inst) *thread { 105 var t *thread 106 if n := len(m.pool); n > 0 { 107 t = m.pool[n-1] 108 m.pool = m.pool[:n-1] 109 } else { 110 t = new(thread) 111 t.cap = make([]int, len(m.matchcap), cap(m.matchcap)) 112 } 113 t.inst = i 114 return t 115} 116 117// A lazyFlag is a lazily-evaluated syntax.EmptyOp, 118// for checking zero-width flags like ^ $ \A \z \B \b. 119// It records the pair of relevant runes and does not 120// determine the implied flags until absolutely necessary 121// (most of the time, that means never). 122type lazyFlag uint64 123 124func newLazyFlag(r1, r2 rune) lazyFlag { 125 return lazyFlag(uint64(r1)<<32 | uint64(uint32(r2))) 126} 127 128func (f lazyFlag) match(op syntax.EmptyOp) bool { 129 if op == 0 { 130 return true 131 } 132 r1 := rune(f >> 32) 133 if op&syntax.EmptyBeginLine != 0 { 134 if r1 != '\n' && r1 >= 0 { 135 return false 136 } 137 op &^= syntax.EmptyBeginLine 138 } 139 if op&syntax.EmptyBeginText != 0 { 140 if r1 >= 0 { 141 return false 142 } 143 op &^= syntax.EmptyBeginText 144 } 145 if op == 0 { 146 return true 147 } 148 r2 := rune(f) 149 if op&syntax.EmptyEndLine != 0 { 150 if r2 != '\n' && r2 >= 0 { 151 return false 152 } 153 op &^= syntax.EmptyEndLine 154 } 155 if op&syntax.EmptyEndText != 0 { 156 if r2 >= 0 { 157 return false 158 } 159 op &^= syntax.EmptyEndText 160 } 161 if op == 0 { 162 return true 163 } 164 if syntax.IsWordChar(r1) != syntax.IsWordChar(r2) { 165 op &^= syntax.EmptyWordBoundary 166 } else { 167 op &^= syntax.EmptyNoWordBoundary 168 } 169 return op == 0 170} 171 172// match runs the machine over the input starting at pos. 173// It reports whether a match was found. 174// If so, m.matchcap holds the submatch information. 175func (m *machine) match(i input, pos int) bool { 176 startCond := m.re.cond 177 if startCond == ^syntax.EmptyOp(0) { // impossible 178 return false 179 } 180 m.matched = false 181 for i := range m.matchcap { 182 m.matchcap[i] = -1 183 } 184 runq, nextq := &m.q0, &m.q1 185 r, r1 := endOfText, endOfText 186 width, width1 := 0, 0 187 r, width = i.step(pos) 188 if r != endOfText { 189 r1, width1 = i.step(pos + width) 190 } 191 var flag lazyFlag 192 if pos == 0 { 193 flag = newLazyFlag(-1, r) 194 } else { 195 flag = i.context(pos) 196 } 197 for { 198 if len(runq.dense) == 0 { 199 if startCond&syntax.EmptyBeginText != 0 && pos != 0 { 200 // Anchored match, past beginning of text. 201 break 202 } 203 if m.matched { 204 // Have match; finished exploring alternatives. 205 break 206 } 207 if len(m.re.prefix) > 0 && r1 != m.re.prefixRune && i.canCheckPrefix() { 208 // Match requires literal prefix; fast search for it. 209 advance := i.index(m.re, pos) 210 if advance < 0 { 211 break 212 } 213 pos += advance 214 r, width = i.step(pos) 215 r1, width1 = i.step(pos + width) 216 } 217 } 218 if !m.matched { 219 if len(m.matchcap) > 0 { 220 m.matchcap[0] = pos 221 } 222 m.add(runq, uint32(m.p.Start), pos, m.matchcap, &flag, nil) 223 } 224 flag = newLazyFlag(r, r1) 225 m.step(runq, nextq, pos, pos+width, r, &flag) 226 if width == 0 { 227 break 228 } 229 if len(m.matchcap) == 0 && m.matched { 230 // Found a match and not paying attention 231 // to where it is, so any match will do. 232 break 233 } 234 pos += width 235 r, width = r1, width1 236 if r != endOfText { 237 r1, width1 = i.step(pos + width) 238 } 239 runq, nextq = nextq, runq 240 } 241 m.clear(nextq) 242 return m.matched 243} 244 245// clear frees all threads on the thread queue. 246func (m *machine) clear(q *queue) { 247 for _, d := range q.dense { 248 if d.t != nil { 249 m.pool = append(m.pool, d.t) 250 } 251 } 252 q.dense = q.dense[:0] 253} 254 255// step executes one step of the machine, running each of the threads 256// on runq and appending new threads to nextq. 257// The step processes the rune c (which may be endOfText), 258// which starts at position pos and ends at nextPos. 259// nextCond gives the setting for the empty-width flags after c. 260func (m *machine) step(runq, nextq *queue, pos, nextPos int, c rune, nextCond *lazyFlag) { 261 longest := m.re.longest 262 for j := 0; j < len(runq.dense); j++ { 263 d := &runq.dense[j] 264 t := d.t 265 if t == nil { 266 continue 267 } 268 if longest && m.matched && len(t.cap) > 0 && m.matchcap[0] < t.cap[0] { 269 m.pool = append(m.pool, t) 270 continue 271 } 272 i := t.inst 273 add := false 274 switch i.Op { 275 default: 276 panic("bad inst") 277 278 case syntax.InstMatch: 279 if len(t.cap) > 0 && (!longest || !m.matched || m.matchcap[1] < pos) { 280 t.cap[1] = pos 281 copy(m.matchcap, t.cap) 282 } 283 if !longest { 284 // First-match mode: cut off all lower-priority threads. 285 for _, d := range runq.dense[j+1:] { 286 if d.t != nil { 287 m.pool = append(m.pool, d.t) 288 } 289 } 290 runq.dense = runq.dense[:0] 291 } 292 m.matched = true 293 294 case syntax.InstRune: 295 add = i.MatchRune(c) 296 case syntax.InstRune1: 297 add = c == i.Rune[0] 298 case syntax.InstRuneAny: 299 add = true 300 case syntax.InstRuneAnyNotNL: 301 add = c != '\n' 302 } 303 if add { 304 t = m.add(nextq, i.Out, nextPos, t.cap, nextCond, t) 305 } 306 if t != nil { 307 m.pool = append(m.pool, t) 308 } 309 } 310 runq.dense = runq.dense[:0] 311} 312 313// add adds an entry to q for pc, unless the q already has such an entry. 314// It also recursively adds an entry for all instructions reachable from pc by following 315// empty-width conditions satisfied by cond. pos gives the current position 316// in the input. 317func (m *machine) add(q *queue, pc uint32, pos int, cap []int, cond *lazyFlag, t *thread) *thread { 318Again: 319 if pc == 0 { 320 return t 321 } 322 if j := q.sparse[pc]; j < uint32(len(q.dense)) && q.dense[j].pc == pc { 323 return t 324 } 325 326 j := len(q.dense) 327 q.dense = q.dense[:j+1] 328 d := &q.dense[j] 329 d.t = nil 330 d.pc = pc 331 q.sparse[pc] = uint32(j) 332 333 i := &m.p.Inst[pc] 334 switch i.Op { 335 default: 336 panic("unhandled") 337 case syntax.InstFail: 338 // nothing 339 case syntax.InstAlt, syntax.InstAltMatch: 340 t = m.add(q, i.Out, pos, cap, cond, t) 341 pc = i.Arg 342 goto Again 343 case syntax.InstEmptyWidth: 344 if cond.match(syntax.EmptyOp(i.Arg)) { 345 pc = i.Out 346 goto Again 347 } 348 case syntax.InstNop: 349 pc = i.Out 350 goto Again 351 case syntax.InstCapture: 352 if int(i.Arg) < len(cap) { 353 opos := cap[i.Arg] 354 cap[i.Arg] = pos 355 m.add(q, i.Out, pos, cap, cond, nil) 356 cap[i.Arg] = opos 357 } else { 358 pc = i.Out 359 goto Again 360 } 361 case syntax.InstMatch, syntax.InstRune, syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL: 362 if t == nil { 363 t = m.alloc(i) 364 } else { 365 t.inst = i 366 } 367 if len(cap) > 0 && &t.cap[0] != &cap[0] { 368 copy(t.cap, cap) 369 } 370 d.t = t 371 t = nil 372 } 373 return t 374} 375 376type onePassMachine struct { 377 inputs inputs 378 matchcap []int 379} 380 381var onePassPool sync.Pool 382 383func newOnePassMachine() *onePassMachine { 384 m, ok := onePassPool.Get().(*onePassMachine) 385 if !ok { 386 m = new(onePassMachine) 387 } 388 return m 389} 390 391func freeOnePassMachine(m *onePassMachine) { 392 m.inputs.clear() 393 onePassPool.Put(m) 394} 395 396// doOnePass implements r.doExecute using the one-pass execution engine. 397func (re *Regexp) doOnePass(ir io.RuneReader, ib []byte, is string, pos, ncap int, dstCap []int) []int { 398 startCond := re.cond 399 if startCond == ^syntax.EmptyOp(0) { // impossible 400 return nil 401 } 402 403 m := newOnePassMachine() 404 if cap(m.matchcap) < ncap { 405 m.matchcap = make([]int, ncap) 406 } else { 407 m.matchcap = m.matchcap[:ncap] 408 } 409 410 matched := false 411 for i := range m.matchcap { 412 m.matchcap[i] = -1 413 } 414 415 i, _ := m.inputs.init(ir, ib, is) 416 417 r, r1 := endOfText, endOfText 418 width, width1 := 0, 0 419 r, width = i.step(pos) 420 if r != endOfText { 421 r1, width1 = i.step(pos + width) 422 } 423 var flag lazyFlag 424 if pos == 0 { 425 flag = newLazyFlag(-1, r) 426 } else { 427 flag = i.context(pos) 428 } 429 pc := re.onepass.Start 430 inst := &re.onepass.Inst[pc] 431 // If there is a simple literal prefix, skip over it. 432 if pos == 0 && flag.match(syntax.EmptyOp(inst.Arg)) && 433 len(re.prefix) > 0 && i.canCheckPrefix() { 434 // Match requires literal prefix; fast search for it. 435 if !i.hasPrefix(re) { 436 goto Return 437 } 438 pos += len(re.prefix) 439 r, width = i.step(pos) 440 r1, width1 = i.step(pos + width) 441 flag = i.context(pos) 442 pc = int(re.prefixEnd) 443 } 444 for { 445 inst = &re.onepass.Inst[pc] 446 pc = int(inst.Out) 447 switch inst.Op { 448 default: 449 panic("bad inst") 450 case syntax.InstMatch: 451 matched = true 452 if len(m.matchcap) > 0 { 453 m.matchcap[0] = 0 454 m.matchcap[1] = pos 455 } 456 goto Return 457 case syntax.InstRune: 458 if !inst.MatchRune(r) { 459 goto Return 460 } 461 case syntax.InstRune1: 462 if r != inst.Rune[0] { 463 goto Return 464 } 465 case syntax.InstRuneAny: 466 // Nothing 467 case syntax.InstRuneAnyNotNL: 468 if r == '\n' { 469 goto Return 470 } 471 // peek at the input rune to see which branch of the Alt to take 472 case syntax.InstAlt, syntax.InstAltMatch: 473 pc = int(onePassNext(inst, r)) 474 continue 475 case syntax.InstFail: 476 goto Return 477 case syntax.InstNop: 478 continue 479 case syntax.InstEmptyWidth: 480 if !flag.match(syntax.EmptyOp(inst.Arg)) { 481 goto Return 482 } 483 continue 484 case syntax.InstCapture: 485 if int(inst.Arg) < len(m.matchcap) { 486 m.matchcap[inst.Arg] = pos 487 } 488 continue 489 } 490 if width == 0 { 491 break 492 } 493 flag = newLazyFlag(r, r1) 494 pos += width 495 r, width = r1, width1 496 if r != endOfText { 497 r1, width1 = i.step(pos + width) 498 } 499 } 500 501Return: 502 if !matched { 503 freeOnePassMachine(m) 504 return nil 505 } 506 507 dstCap = append(dstCap, m.matchcap...) 508 freeOnePassMachine(m) 509 return dstCap 510} 511 512// doMatch reports whether either r, b or s match the regexp. 513func (re *Regexp) doMatch(r io.RuneReader, b []byte, s string) bool { 514 return re.doExecute(r, b, s, 0, 0, nil) != nil 515} 516 517// doExecute finds the leftmost match in the input, appends the position 518// of its subexpressions to dstCap and returns dstCap. 519// 520// nil is returned if no matches are found and non-nil if matches are found. 521func (re *Regexp) doExecute(r io.RuneReader, b []byte, s string, pos int, ncap int, dstCap []int) []int { 522 if dstCap == nil { 523 // Make sure 'return dstCap' is non-nil. 524 dstCap = arrayNoInts[:0:0] 525 } 526 527 if r == nil && len(b)+len(s) < re.minInputLen { 528 return nil 529 } 530 531 if re.onepass != nil { 532 return re.doOnePass(r, b, s, pos, ncap, dstCap) 533 } 534 if r == nil && len(b)+len(s) < re.maxBitStateLen { 535 return re.backtrack(b, s, pos, ncap, dstCap) 536 } 537 538 m := re.get() 539 i, _ := m.inputs.init(r, b, s) 540 541 m.init(ncap) 542 if !m.match(i, pos) { 543 re.put(m) 544 return nil 545 } 546 547 dstCap = append(dstCap, m.matchcap...) 548 re.put(m) 549 return dstCap 550} 551 552// arrayNoInts is returned by doExecute match if nil dstCap is passed 553// to it with ncap=0. 554var arrayNoInts [0]int 555