1// Copyright 2011 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5package syntax
6
7import (
8	"sort"
9	"strings"
10	"unicode"
11	"unicode/utf8"
12)
13
14// An Error describes a failure to parse a regular expression
15// and gives the offending expression.
16type Error struct {
17	Code ErrorCode
18	Expr string
19}
20
21func (e *Error) Error() string {
22	return "error parsing regexp: " + e.Code.String() + ": `" + e.Expr + "`"
23}
24
25// An ErrorCode describes a failure to parse a regular expression.
26type ErrorCode string
27
28const (
29	// Unexpected error
30	ErrInternalError ErrorCode = "regexp/syntax: internal error"
31
32	// Parse errors
33	ErrInvalidCharClass      ErrorCode = "invalid character class"
34	ErrInvalidCharRange      ErrorCode = "invalid character class range"
35	ErrInvalidEscape         ErrorCode = "invalid escape sequence"
36	ErrInvalidNamedCapture   ErrorCode = "invalid named capture"
37	ErrInvalidPerlOp         ErrorCode = "invalid or unsupported Perl syntax"
38	ErrInvalidRepeatOp       ErrorCode = "invalid nested repetition operator"
39	ErrInvalidRepeatSize     ErrorCode = "invalid repeat count"
40	ErrInvalidUTF8           ErrorCode = "invalid UTF-8"
41	ErrMissingBracket        ErrorCode = "missing closing ]"
42	ErrMissingParen          ErrorCode = "missing closing )"
43	ErrMissingRepeatArgument ErrorCode = "missing argument to repetition operator"
44	ErrTrailingBackslash     ErrorCode = "trailing backslash at end of expression"
45	ErrUnexpectedParen       ErrorCode = "unexpected )"
46	ErrNestingDepth          ErrorCode = "expression nests too deeply"
47	ErrLarge                 ErrorCode = "expression too large"
48)
49
50func (e ErrorCode) String() string {
51	return string(e)
52}
53
54// Flags control the behavior of the parser and record information about regexp context.
55type Flags uint16
56
57const (
58	FoldCase      Flags = 1 << iota // case-insensitive match
59	Literal                         // treat pattern as literal string
60	ClassNL                         // allow character classes like [^a-z] and [[:space:]] to match newline
61	DotNL                           // allow . to match newline
62	OneLine                         // treat ^ and $ as only matching at beginning and end of text
63	NonGreedy                       // make repetition operators default to non-greedy
64	PerlX                           // allow Perl extensions
65	UnicodeGroups                   // allow \p{Han}, \P{Han} for Unicode group and negation
66	WasDollar                       // regexp OpEndText was $, not \z
67	Simple                          // regexp contains no counted repetition
68
69	MatchNL = ClassNL | DotNL
70
71	Perl        = ClassNL | OneLine | PerlX | UnicodeGroups // as close to Perl as possible
72	POSIX Flags = 0                                         // POSIX syntax
73)
74
75// Pseudo-ops for parsing stack.
76const (
77	opLeftParen = opPseudo + iota
78	opVerticalBar
79)
80
81// maxHeight is the maximum height of a regexp parse tree.
82// It is somewhat arbitrarily chosen, but the idea is to be large enough
83// that no one will actually hit in real use but at the same time small enough
84// that recursion on the Regexp tree will not hit the 1GB Go stack limit.
85// The maximum amount of stack for a single recursive frame is probably
86// closer to 1kB, so this could potentially be raised, but it seems unlikely
87// that people have regexps nested even this deeply.
88// We ran a test on Google's C++ code base and turned up only
89// a single use case with depth > 100; it had depth 128.
90// Using depth 1000 should be plenty of margin.
91// As an optimization, we don't even bother calculating heights
92// until we've allocated at least maxHeight Regexp structures.
93const maxHeight = 1000
94
95// maxSize is the maximum size of a compiled regexp in Insts.
96// It too is somewhat arbitrarily chosen, but the idea is to be large enough
97// to allow significant regexps while at the same time small enough that
98// the compiled form will not take up too much memory.
99// 128 MB is enough for a 3.3 million Inst structures, which roughly
100// corresponds to a 3.3 MB regexp.
101const (
102	maxSize  = 128 << 20 / instSize
103	instSize = 5 * 8 // byte, 2 uint32, slice is 5 64-bit words
104)
105
106// maxRunes is the maximum number of runes allowed in a regexp tree
107// counting the runes in all the nodes.
108// Ignoring character classes p.numRunes is always less than the length of the regexp.
109// Character classes can make it much larger: each \pL adds 1292 runes.
110// 128 MB is enough for 32M runes, which is over 26k \pL instances.
111// Note that repetitions do not make copies of the rune slices,
112// so \pL{1000} is only one rune slice, not 1000.
113// We could keep a cache of character classes we've seen,
114// so that all the \pL we see use the same rune list,
115// but that doesn't remove the problem entirely:
116// consider something like [\pL01234][\pL01235][\pL01236]...[\pL^&*()].
117// And because the Rune slice is exposed directly in the Regexp,
118// there is not an opportunity to change the representation to allow
119// partial sharing between different character classes.
120// So the limit is the best we can do.
121const (
122	maxRunes = 128 << 20 / runeSize
123	runeSize = 4 // rune is int32
124)
125
126type parser struct {
127	flags       Flags     // parse mode flags
128	stack       []*Regexp // stack of parsed expressions
129	free        *Regexp
130	numCap      int // number of capturing groups seen
131	wholeRegexp string
132	tmpClass    []rune            // temporary char class work space
133	numRegexp   int               // number of regexps allocated
134	numRunes    int               // number of runes in char classes
135	repeats     int64             // product of all repetitions seen
136	height      map[*Regexp]int   // regexp height, for height limit check
137	size        map[*Regexp]int64 // regexp compiled size, for size limit check
138}
139
140func (p *parser) newRegexp(op Op) *Regexp {
141	re := p.free
142	if re != nil {
143		p.free = re.Sub0[0]
144		*re = Regexp{}
145	} else {
146		re = new(Regexp)
147		p.numRegexp++
148	}
149	re.Op = op
150	return re
151}
152
153func (p *parser) reuse(re *Regexp) {
154	if p.height != nil {
155		delete(p.height, re)
156	}
157	re.Sub0[0] = p.free
158	p.free = re
159}
160
161func (p *parser) checkLimits(re *Regexp) {
162	if p.numRunes > maxRunes {
163		panic(ErrLarge)
164	}
165	p.checkSize(re)
166	p.checkHeight(re)
167}
168
169func (p *parser) checkSize(re *Regexp) {
170	if p.size == nil {
171		// We haven't started tracking size yet.
172		// Do a relatively cheap check to see if we need to start.
173		// Maintain the product of all the repeats we've seen
174		// and don't track if the total number of regexp nodes
175		// we've seen times the repeat product is in budget.
176		if p.repeats == 0 {
177			p.repeats = 1
178		}
179		if re.Op == OpRepeat {
180			n := re.Max
181			if n == -1 {
182				n = re.Min
183			}
184			if n <= 0 {
185				n = 1
186			}
187			if int64(n) > maxSize/p.repeats {
188				p.repeats = maxSize
189			} else {
190				p.repeats *= int64(n)
191			}
192		}
193		if int64(p.numRegexp) < maxSize/p.repeats {
194			return
195		}
196
197		// We need to start tracking size.
198		// Make the map and belatedly populate it
199		// with info about everything we've constructed so far.
200		p.size = make(map[*Regexp]int64)
201		for _, re := range p.stack {
202			p.checkSize(re)
203		}
204	}
205
206	if p.calcSize(re, true) > maxSize {
207		panic(ErrLarge)
208	}
209}
210
211func (p *parser) calcSize(re *Regexp, force bool) int64 {
212	if !force {
213		if size, ok := p.size[re]; ok {
214			return size
215		}
216	}
217
218	var size int64
219	switch re.Op {
220	case OpLiteral:
221		size = int64(len(re.Rune))
222	case OpCapture, OpStar:
223		// star can be 1+ or 2+; assume 2 pessimistically
224		size = 2 + p.calcSize(re.Sub[0], false)
225	case OpPlus, OpQuest:
226		size = 1 + p.calcSize(re.Sub[0], false)
227	case OpConcat:
228		for _, sub := range re.Sub {
229			size += p.calcSize(sub, false)
230		}
231	case OpAlternate:
232		for _, sub := range re.Sub {
233			size += p.calcSize(sub, false)
234		}
235		if len(re.Sub) > 1 {
236			size += int64(len(re.Sub)) - 1
237		}
238	case OpRepeat:
239		sub := p.calcSize(re.Sub[0], false)
240		if re.Max == -1 {
241			if re.Min == 0 {
242				size = 2 + sub // x*
243			} else {
244				size = 1 + int64(re.Min)*sub // xxx+
245			}
246			break
247		}
248		// x{2,5} = xx(x(x(x)?)?)?
249		size = int64(re.Max)*sub + int64(re.Max-re.Min)
250	}
251
252	size = max(1, size)
253	p.size[re] = size
254	return size
255}
256
257func (p *parser) checkHeight(re *Regexp) {
258	if p.numRegexp < maxHeight {
259		return
260	}
261	if p.height == nil {
262		p.height = make(map[*Regexp]int)
263		for _, re := range p.stack {
264			p.checkHeight(re)
265		}
266	}
267	if p.calcHeight(re, true) > maxHeight {
268		panic(ErrNestingDepth)
269	}
270}
271
272func (p *parser) calcHeight(re *Regexp, force bool) int {
273	if !force {
274		if h, ok := p.height[re]; ok {
275			return h
276		}
277	}
278	h := 1
279	for _, sub := range re.Sub {
280		hsub := p.calcHeight(sub, false)
281		if h < 1+hsub {
282			h = 1 + hsub
283		}
284	}
285	p.height[re] = h
286	return h
287}
288
289// Parse stack manipulation.
290
291// push pushes the regexp re onto the parse stack and returns the regexp.
292func (p *parser) push(re *Regexp) *Regexp {
293	p.numRunes += len(re.Rune)
294	if re.Op == OpCharClass && len(re.Rune) == 2 && re.Rune[0] == re.Rune[1] {
295		// Single rune.
296		if p.maybeConcat(re.Rune[0], p.flags&^FoldCase) {
297			return nil
298		}
299		re.Op = OpLiteral
300		re.Rune = re.Rune[:1]
301		re.Flags = p.flags &^ FoldCase
302	} else if re.Op == OpCharClass && len(re.Rune) == 4 &&
303		re.Rune[0] == re.Rune[1] && re.Rune[2] == re.Rune[3] &&
304		unicode.SimpleFold(re.Rune[0]) == re.Rune[2] &&
305		unicode.SimpleFold(re.Rune[2]) == re.Rune[0] ||
306		re.Op == OpCharClass && len(re.Rune) == 2 &&
307			re.Rune[0]+1 == re.Rune[1] &&
308			unicode.SimpleFold(re.Rune[0]) == re.Rune[1] &&
309			unicode.SimpleFold(re.Rune[1]) == re.Rune[0] {
310		// Case-insensitive rune like [Aa] or [Δδ].
311		if p.maybeConcat(re.Rune[0], p.flags|FoldCase) {
312			return nil
313		}
314
315		// Rewrite as (case-insensitive) literal.
316		re.Op = OpLiteral
317		re.Rune = re.Rune[:1]
318		re.Flags = p.flags | FoldCase
319	} else {
320		// Incremental concatenation.
321		p.maybeConcat(-1, 0)
322	}
323
324	p.stack = append(p.stack, re)
325	p.checkLimits(re)
326	return re
327}
328
329// maybeConcat implements incremental concatenation
330// of literal runes into string nodes. The parser calls this
331// before each push, so only the top fragment of the stack
332// might need processing. Since this is called before a push,
333// the topmost literal is no longer subject to operators like *
334// (Otherwise ab* would turn into (ab)*.)
335// If r >= 0 and there's a node left over, maybeConcat uses it
336// to push r with the given flags.
337// maybeConcat reports whether r was pushed.
338func (p *parser) maybeConcat(r rune, flags Flags) bool {
339	n := len(p.stack)
340	if n < 2 {
341		return false
342	}
343
344	re1 := p.stack[n-1]
345	re2 := p.stack[n-2]
346	if re1.Op != OpLiteral || re2.Op != OpLiteral || re1.Flags&FoldCase != re2.Flags&FoldCase {
347		return false
348	}
349
350	// Push re1 into re2.
351	re2.Rune = append(re2.Rune, re1.Rune...)
352
353	// Reuse re1 if possible.
354	if r >= 0 {
355		re1.Rune = re1.Rune0[:1]
356		re1.Rune[0] = r
357		re1.Flags = flags
358		return true
359	}
360
361	p.stack = p.stack[:n-1]
362	p.reuse(re1)
363	return false // did not push r
364}
365
366// literal pushes a literal regexp for the rune r on the stack.
367func (p *parser) literal(r rune) {
368	re := p.newRegexp(OpLiteral)
369	re.Flags = p.flags
370	if p.flags&FoldCase != 0 {
371		r = minFoldRune(r)
372	}
373	re.Rune0[0] = r
374	re.Rune = re.Rune0[:1]
375	p.push(re)
376}
377
378// minFoldRune returns the minimum rune fold-equivalent to r.
379func minFoldRune(r rune) rune {
380	if r < minFold || r > maxFold {
381		return r
382	}
383	m := r
384	r0 := r
385	for r = unicode.SimpleFold(r); r != r0; r = unicode.SimpleFold(r) {
386		m = min(m, r)
387	}
388	return m
389}
390
391// op pushes a regexp with the given op onto the stack
392// and returns that regexp.
393func (p *parser) op(op Op) *Regexp {
394	re := p.newRegexp(op)
395	re.Flags = p.flags
396	return p.push(re)
397}
398
399// repeat replaces the top stack element with itself repeated according to op, min, max.
400// before is the regexp suffix starting at the repetition operator.
401// after is the regexp suffix following after the repetition operator.
402// repeat returns an updated 'after' and an error, if any.
403func (p *parser) repeat(op Op, min, max int, before, after, lastRepeat string) (string, error) {
404	flags := p.flags
405	if p.flags&PerlX != 0 {
406		if len(after) > 0 && after[0] == '?' {
407			after = after[1:]
408			flags ^= NonGreedy
409		}
410		if lastRepeat != "" {
411			// In Perl it is not allowed to stack repetition operators:
412			// a** is a syntax error, not a doubled star, and a++ means
413			// something else entirely, which we don't support!
414			return "", &Error{ErrInvalidRepeatOp, lastRepeat[:len(lastRepeat)-len(after)]}
415		}
416	}
417	n := len(p.stack)
418	if n == 0 {
419		return "", &Error{ErrMissingRepeatArgument, before[:len(before)-len(after)]}
420	}
421	sub := p.stack[n-1]
422	if sub.Op >= opPseudo {
423		return "", &Error{ErrMissingRepeatArgument, before[:len(before)-len(after)]}
424	}
425
426	re := p.newRegexp(op)
427	re.Min = min
428	re.Max = max
429	re.Flags = flags
430	re.Sub = re.Sub0[:1]
431	re.Sub[0] = sub
432	p.stack[n-1] = re
433	p.checkLimits(re)
434
435	if op == OpRepeat && (min >= 2 || max >= 2) && !repeatIsValid(re, 1000) {
436		return "", &Error{ErrInvalidRepeatSize, before[:len(before)-len(after)]}
437	}
438
439	return after, nil
440}
441
442// repeatIsValid reports whether the repetition re is valid.
443// Valid means that the combination of the top-level repetition
444// and any inner repetitions does not exceed n copies of the
445// innermost thing.
446// This function rewalks the regexp tree and is called for every repetition,
447// so we have to worry about inducing quadratic behavior in the parser.
448// We avoid this by only calling repeatIsValid when min or max >= 2.
449// In that case the depth of any >= 2 nesting can only get to 9 without
450// triggering a parse error, so each subtree can only be rewalked 9 times.
451func repeatIsValid(re *Regexp, n int) bool {
452	if re.Op == OpRepeat {
453		m := re.Max
454		if m == 0 {
455			return true
456		}
457		if m < 0 {
458			m = re.Min
459		}
460		if m > n {
461			return false
462		}
463		if m > 0 {
464			n /= m
465		}
466	}
467	for _, sub := range re.Sub {
468		if !repeatIsValid(sub, n) {
469			return false
470		}
471	}
472	return true
473}
474
475// concat replaces the top of the stack (above the topmost '|' or '(') with its concatenation.
476func (p *parser) concat() *Regexp {
477	p.maybeConcat(-1, 0)
478
479	// Scan down to find pseudo-operator | or (.
480	i := len(p.stack)
481	for i > 0 && p.stack[i-1].Op < opPseudo {
482		i--
483	}
484	subs := p.stack[i:]
485	p.stack = p.stack[:i]
486
487	// Empty concatenation is special case.
488	if len(subs) == 0 {
489		return p.push(p.newRegexp(OpEmptyMatch))
490	}
491
492	return p.push(p.collapse(subs, OpConcat))
493}
494
495// alternate replaces the top of the stack (above the topmost '(') with its alternation.
496func (p *parser) alternate() *Regexp {
497	// Scan down to find pseudo-operator (.
498	// There are no | above (.
499	i := len(p.stack)
500	for i > 0 && p.stack[i-1].Op < opPseudo {
501		i--
502	}
503	subs := p.stack[i:]
504	p.stack = p.stack[:i]
505
506	// Make sure top class is clean.
507	// All the others already are (see swapVerticalBar).
508	if len(subs) > 0 {
509		cleanAlt(subs[len(subs)-1])
510	}
511
512	// Empty alternate is special case
513	// (shouldn't happen but easy to handle).
514	if len(subs) == 0 {
515		return p.push(p.newRegexp(OpNoMatch))
516	}
517
518	return p.push(p.collapse(subs, OpAlternate))
519}
520
521// cleanAlt cleans re for eventual inclusion in an alternation.
522func cleanAlt(re *Regexp) {
523	switch re.Op {
524	case OpCharClass:
525		re.Rune = cleanClass(&re.Rune)
526		if len(re.Rune) == 2 && re.Rune[0] == 0 && re.Rune[1] == unicode.MaxRune {
527			re.Rune = nil
528			re.Op = OpAnyChar
529			return
530		}
531		if len(re.Rune) == 4 && re.Rune[0] == 0 && re.Rune[1] == '\n'-1 && re.Rune[2] == '\n'+1 && re.Rune[3] == unicode.MaxRune {
532			re.Rune = nil
533			re.Op = OpAnyCharNotNL
534			return
535		}
536		if cap(re.Rune)-len(re.Rune) > 100 {
537			// re.Rune will not grow any more.
538			// Make a copy or inline to reclaim storage.
539			re.Rune = append(re.Rune0[:0], re.Rune...)
540		}
541	}
542}
543
544// collapse returns the result of applying op to sub.
545// If sub contains op nodes, they all get hoisted up
546// so that there is never a concat of a concat or an
547// alternate of an alternate.
548func (p *parser) collapse(subs []*Regexp, op Op) *Regexp {
549	if len(subs) == 1 {
550		return subs[0]
551	}
552	re := p.newRegexp(op)
553	re.Sub = re.Sub0[:0]
554	for _, sub := range subs {
555		if sub.Op == op {
556			re.Sub = append(re.Sub, sub.Sub...)
557			p.reuse(sub)
558		} else {
559			re.Sub = append(re.Sub, sub)
560		}
561	}
562	if op == OpAlternate {
563		re.Sub = p.factor(re.Sub)
564		if len(re.Sub) == 1 {
565			old := re
566			re = re.Sub[0]
567			p.reuse(old)
568		}
569	}
570	return re
571}
572
573// factor factors common prefixes from the alternation list sub.
574// It returns a replacement list that reuses the same storage and
575// frees (passes to p.reuse) any removed *Regexps.
576//
577// For example,
578//
579//	ABC|ABD|AEF|BCX|BCY
580//
581// simplifies by literal prefix extraction to
582//
583//	A(B(C|D)|EF)|BC(X|Y)
584//
585// which simplifies by character class introduction to
586//
587//	A(B[CD]|EF)|BC[XY]
588func (p *parser) factor(sub []*Regexp) []*Regexp {
589	if len(sub) < 2 {
590		return sub
591	}
592
593	// Round 1: Factor out common literal prefixes.
594	var str []rune
595	var strflags Flags
596	start := 0
597	out := sub[:0]
598	for i := 0; i <= len(sub); i++ {
599		// Invariant: the Regexps that were in sub[0:start] have been
600		// used or marked for reuse, and the slice space has been reused
601		// for out (len(out) <= start).
602		//
603		// Invariant: sub[start:i] consists of regexps that all begin
604		// with str as modified by strflags.
605		var istr []rune
606		var iflags Flags
607		if i < len(sub) {
608			istr, iflags = p.leadingString(sub[i])
609			if iflags == strflags {
610				same := 0
611				for same < len(str) && same < len(istr) && str[same] == istr[same] {
612					same++
613				}
614				if same > 0 {
615					// Matches at least one rune in current range.
616					// Keep going around.
617					str = str[:same]
618					continue
619				}
620			}
621		}
622
623		// Found end of a run with common leading literal string:
624		// sub[start:i] all begin with str[0:len(str)], but sub[i]
625		// does not even begin with str[0].
626		//
627		// Factor out common string and append factored expression to out.
628		if i == start {
629			// Nothing to do - run of length 0.
630		} else if i == start+1 {
631			// Just one: don't bother factoring.
632			out = append(out, sub[start])
633		} else {
634			// Construct factored form: prefix(suffix1|suffix2|...)
635			prefix := p.newRegexp(OpLiteral)
636			prefix.Flags = strflags
637			prefix.Rune = append(prefix.Rune[:0], str...)
638
639			for j := start; j < i; j++ {
640				sub[j] = p.removeLeadingString(sub[j], len(str))
641				p.checkLimits(sub[j])
642			}
643			suffix := p.collapse(sub[start:i], OpAlternate) // recurse
644
645			re := p.newRegexp(OpConcat)
646			re.Sub = append(re.Sub[:0], prefix, suffix)
647			out = append(out, re)
648		}
649
650		// Prepare for next iteration.
651		start = i
652		str = istr
653		strflags = iflags
654	}
655	sub = out
656
657	// Round 2: Factor out common simple prefixes,
658	// just the first piece of each concatenation.
659	// This will be good enough a lot of the time.
660	//
661	// Complex subexpressions (e.g. involving quantifiers)
662	// are not safe to factor because that collapses their
663	// distinct paths through the automaton, which affects
664	// correctness in some cases.
665	start = 0
666	out = sub[:0]
667	var first *Regexp
668	for i := 0; i <= len(sub); i++ {
669		// Invariant: the Regexps that were in sub[0:start] have been
670		// used or marked for reuse, and the slice space has been reused
671		// for out (len(out) <= start).
672		//
673		// Invariant: sub[start:i] consists of regexps that all begin with ifirst.
674		var ifirst *Regexp
675		if i < len(sub) {
676			ifirst = p.leadingRegexp(sub[i])
677			if first != nil && first.Equal(ifirst) &&
678				// first must be a character class OR a fixed repeat of a character class.
679				(isCharClass(first) || (first.Op == OpRepeat && first.Min == first.Max && isCharClass(first.Sub[0]))) {
680				continue
681			}
682		}
683
684		// Found end of a run with common leading regexp:
685		// sub[start:i] all begin with first but sub[i] does not.
686		//
687		// Factor out common regexp and append factored expression to out.
688		if i == start {
689			// Nothing to do - run of length 0.
690		} else if i == start+1 {
691			// Just one: don't bother factoring.
692			out = append(out, sub[start])
693		} else {
694			// Construct factored form: prefix(suffix1|suffix2|...)
695			prefix := first
696			for j := start; j < i; j++ {
697				reuse := j != start // prefix came from sub[start]
698				sub[j] = p.removeLeadingRegexp(sub[j], reuse)
699				p.checkLimits(sub[j])
700			}
701			suffix := p.collapse(sub[start:i], OpAlternate) // recurse
702
703			re := p.newRegexp(OpConcat)
704			re.Sub = append(re.Sub[:0], prefix, suffix)
705			out = append(out, re)
706		}
707
708		// Prepare for next iteration.
709		start = i
710		first = ifirst
711	}
712	sub = out
713
714	// Round 3: Collapse runs of single literals into character classes.
715	start = 0
716	out = sub[:0]
717	for i := 0; i <= len(sub); i++ {
718		// Invariant: the Regexps that were in sub[0:start] have been
719		// used or marked for reuse, and the slice space has been reused
720		// for out (len(out) <= start).
721		//
722		// Invariant: sub[start:i] consists of regexps that are either
723		// literal runes or character classes.
724		if i < len(sub) && isCharClass(sub[i]) {
725			continue
726		}
727
728		// sub[i] is not a char or char class;
729		// emit char class for sub[start:i]...
730		if i == start {
731			// Nothing to do - run of length 0.
732		} else if i == start+1 {
733			out = append(out, sub[start])
734		} else {
735			// Make new char class.
736			// Start with most complex regexp in sub[start].
737			max := start
738			for j := start + 1; j < i; j++ {
739				if sub[max].Op < sub[j].Op || sub[max].Op == sub[j].Op && len(sub[max].Rune) < len(sub[j].Rune) {
740					max = j
741				}
742			}
743			sub[start], sub[max] = sub[max], sub[start]
744
745			for j := start + 1; j < i; j++ {
746				mergeCharClass(sub[start], sub[j])
747				p.reuse(sub[j])
748			}
749			cleanAlt(sub[start])
750			out = append(out, sub[start])
751		}
752
753		// ... and then emit sub[i].
754		if i < len(sub) {
755			out = append(out, sub[i])
756		}
757		start = i + 1
758	}
759	sub = out
760
761	// Round 4: Collapse runs of empty matches into a single empty match.
762	start = 0
763	out = sub[:0]
764	for i := range sub {
765		if i+1 < len(sub) && sub[i].Op == OpEmptyMatch && sub[i+1].Op == OpEmptyMatch {
766			continue
767		}
768		out = append(out, sub[i])
769	}
770	sub = out
771
772	return sub
773}
774
775// leadingString returns the leading literal string that re begins with.
776// The string refers to storage in re or its children.
777func (p *parser) leadingString(re *Regexp) ([]rune, Flags) {
778	if re.Op == OpConcat && len(re.Sub) > 0 {
779		re = re.Sub[0]
780	}
781	if re.Op != OpLiteral {
782		return nil, 0
783	}
784	return re.Rune, re.Flags & FoldCase
785}
786
787// removeLeadingString removes the first n leading runes
788// from the beginning of re. It returns the replacement for re.
789func (p *parser) removeLeadingString(re *Regexp, n int) *Regexp {
790	if re.Op == OpConcat && len(re.Sub) > 0 {
791		// Removing a leading string in a concatenation
792		// might simplify the concatenation.
793		sub := re.Sub[0]
794		sub = p.removeLeadingString(sub, n)
795		re.Sub[0] = sub
796		if sub.Op == OpEmptyMatch {
797			p.reuse(sub)
798			switch len(re.Sub) {
799			case 0, 1:
800				// Impossible but handle.
801				re.Op = OpEmptyMatch
802				re.Sub = nil
803			case 2:
804				old := re
805				re = re.Sub[1]
806				p.reuse(old)
807			default:
808				copy(re.Sub, re.Sub[1:])
809				re.Sub = re.Sub[:len(re.Sub)-1]
810			}
811		}
812		return re
813	}
814
815	if re.Op == OpLiteral {
816		re.Rune = re.Rune[:copy(re.Rune, re.Rune[n:])]
817		if len(re.Rune) == 0 {
818			re.Op = OpEmptyMatch
819		}
820	}
821	return re
822}
823
824// leadingRegexp returns the leading regexp that re begins with.
825// The regexp refers to storage in re or its children.
826func (p *parser) leadingRegexp(re *Regexp) *Regexp {
827	if re.Op == OpEmptyMatch {
828		return nil
829	}
830	if re.Op == OpConcat && len(re.Sub) > 0 {
831		sub := re.Sub[0]
832		if sub.Op == OpEmptyMatch {
833			return nil
834		}
835		return sub
836	}
837	return re
838}
839
840// removeLeadingRegexp removes the leading regexp in re.
841// It returns the replacement for re.
842// If reuse is true, it passes the removed regexp (if no longer needed) to p.reuse.
843func (p *parser) removeLeadingRegexp(re *Regexp, reuse bool) *Regexp {
844	if re.Op == OpConcat && len(re.Sub) > 0 {
845		if reuse {
846			p.reuse(re.Sub[0])
847		}
848		re.Sub = re.Sub[:copy(re.Sub, re.Sub[1:])]
849		switch len(re.Sub) {
850		case 0:
851			re.Op = OpEmptyMatch
852			re.Sub = nil
853		case 1:
854			old := re
855			re = re.Sub[0]
856			p.reuse(old)
857		}
858		return re
859	}
860	if reuse {
861		p.reuse(re)
862	}
863	return p.newRegexp(OpEmptyMatch)
864}
865
866func literalRegexp(s string, flags Flags) *Regexp {
867	re := &Regexp{Op: OpLiteral}
868	re.Flags = flags
869	re.Rune = re.Rune0[:0] // use local storage for small strings
870	for _, c := range s {
871		if len(re.Rune) >= cap(re.Rune) {
872			// string is too long to fit in Rune0.  let Go handle it
873			re.Rune = []rune(s)
874			break
875		}
876		re.Rune = append(re.Rune, c)
877	}
878	return re
879}
880
881// Parsing.
882
883// Parse parses a regular expression string s, controlled by the specified
884// Flags, and returns a regular expression parse tree. The syntax is
885// described in the top-level comment.
886func Parse(s string, flags Flags) (*Regexp, error) {
887	return parse(s, flags)
888}
889
890func parse(s string, flags Flags) (_ *Regexp, err error) {
891	defer func() {
892		switch r := recover(); r {
893		default:
894			panic(r)
895		case nil:
896			// ok
897		case ErrLarge: // too big
898			err = &Error{Code: ErrLarge, Expr: s}
899		case ErrNestingDepth:
900			err = &Error{Code: ErrNestingDepth, Expr: s}
901		}
902	}()
903
904	if flags&Literal != 0 {
905		// Trivial parser for literal string.
906		if err := checkUTF8(s); err != nil {
907			return nil, err
908		}
909		return literalRegexp(s, flags), nil
910	}
911
912	// Otherwise, must do real work.
913	var (
914		p          parser
915		c          rune
916		op         Op
917		lastRepeat string
918	)
919	p.flags = flags
920	p.wholeRegexp = s
921	t := s
922	for t != "" {
923		repeat := ""
924	BigSwitch:
925		switch t[0] {
926		default:
927			if c, t, err = nextRune(t); err != nil {
928				return nil, err
929			}
930			p.literal(c)
931
932		case '(':
933			if p.flags&PerlX != 0 && len(t) >= 2 && t[1] == '?' {
934				// Flag changes and non-capturing groups.
935				if t, err = p.parsePerlFlags(t); err != nil {
936					return nil, err
937				}
938				break
939			}
940			p.numCap++
941			p.op(opLeftParen).Cap = p.numCap
942			t = t[1:]
943		case '|':
944			p.parseVerticalBar()
945			t = t[1:]
946		case ')':
947			if err = p.parseRightParen(); err != nil {
948				return nil, err
949			}
950			t = t[1:]
951		case '^':
952			if p.flags&OneLine != 0 {
953				p.op(OpBeginText)
954			} else {
955				p.op(OpBeginLine)
956			}
957			t = t[1:]
958		case '$':
959			if p.flags&OneLine != 0 {
960				p.op(OpEndText).Flags |= WasDollar
961			} else {
962				p.op(OpEndLine)
963			}
964			t = t[1:]
965		case '.':
966			if p.flags&DotNL != 0 {
967				p.op(OpAnyChar)
968			} else {
969				p.op(OpAnyCharNotNL)
970			}
971			t = t[1:]
972		case '[':
973			if t, err = p.parseClass(t); err != nil {
974				return nil, err
975			}
976		case '*', '+', '?':
977			before := t
978			switch t[0] {
979			case '*':
980				op = OpStar
981			case '+':
982				op = OpPlus
983			case '?':
984				op = OpQuest
985			}
986			after := t[1:]
987			if after, err = p.repeat(op, 0, 0, before, after, lastRepeat); err != nil {
988				return nil, err
989			}
990			repeat = before
991			t = after
992		case '{':
993			op = OpRepeat
994			before := t
995			min, max, after, ok := p.parseRepeat(t)
996			if !ok {
997				// If the repeat cannot be parsed, { is a literal.
998				p.literal('{')
999				t = t[1:]
1000				break
1001			}
1002			if min < 0 || min > 1000 || max > 1000 || max >= 0 && min > max {
1003				// Numbers were too big, or max is present and min > max.
1004				return nil, &Error{ErrInvalidRepeatSize, before[:len(before)-len(after)]}
1005			}
1006			if after, err = p.repeat(op, min, max, before, after, lastRepeat); err != nil {
1007				return nil, err
1008			}
1009			repeat = before
1010			t = after
1011		case '\\':
1012			if p.flags&PerlX != 0 && len(t) >= 2 {
1013				switch t[1] {
1014				case 'A':
1015					p.op(OpBeginText)
1016					t = t[2:]
1017					break BigSwitch
1018				case 'b':
1019					p.op(OpWordBoundary)
1020					t = t[2:]
1021					break BigSwitch
1022				case 'B':
1023					p.op(OpNoWordBoundary)
1024					t = t[2:]
1025					break BigSwitch
1026				case 'C':
1027					// any byte; not supported
1028					return nil, &Error{ErrInvalidEscape, t[:2]}
1029				case 'Q':
1030					// \Q ... \E: the ... is always literals
1031					var lit string
1032					lit, t, _ = strings.Cut(t[2:], `\E`)
1033					for lit != "" {
1034						c, rest, err := nextRune(lit)
1035						if err != nil {
1036							return nil, err
1037						}
1038						p.literal(c)
1039						lit = rest
1040					}
1041					break BigSwitch
1042				case 'z':
1043					p.op(OpEndText)
1044					t = t[2:]
1045					break BigSwitch
1046				}
1047			}
1048
1049			re := p.newRegexp(OpCharClass)
1050			re.Flags = p.flags
1051
1052			// Look for Unicode character group like \p{Han}
1053			if len(t) >= 2 && (t[1] == 'p' || t[1] == 'P') {
1054				r, rest, err := p.parseUnicodeClass(t, re.Rune0[:0])
1055				if err != nil {
1056					return nil, err
1057				}
1058				if r != nil {
1059					re.Rune = r
1060					t = rest
1061					p.push(re)
1062					break BigSwitch
1063				}
1064			}
1065
1066			// Perl character class escape.
1067			if r, rest := p.parsePerlClassEscape(t, re.Rune0[:0]); r != nil {
1068				re.Rune = r
1069				t = rest
1070				p.push(re)
1071				break BigSwitch
1072			}
1073			p.reuse(re)
1074
1075			// Ordinary single-character escape.
1076			if c, t, err = p.parseEscape(t); err != nil {
1077				return nil, err
1078			}
1079			p.literal(c)
1080		}
1081		lastRepeat = repeat
1082	}
1083
1084	p.concat()
1085	if p.swapVerticalBar() {
1086		// pop vertical bar
1087		p.stack = p.stack[:len(p.stack)-1]
1088	}
1089	p.alternate()
1090
1091	n := len(p.stack)
1092	if n != 1 {
1093		return nil, &Error{ErrMissingParen, s}
1094	}
1095	return p.stack[0], nil
1096}
1097
1098// parseRepeat parses {min} (max=min) or {min,} (max=-1) or {min,max}.
1099// If s is not of that form, it returns ok == false.
1100// If s has the right form but the values are too big, it returns min == -1, ok == true.
1101func (p *parser) parseRepeat(s string) (min, max int, rest string, ok bool) {
1102	if s == "" || s[0] != '{' {
1103		return
1104	}
1105	s = s[1:]
1106	var ok1 bool
1107	if min, s, ok1 = p.parseInt(s); !ok1 {
1108		return
1109	}
1110	if s == "" {
1111		return
1112	}
1113	if s[0] != ',' {
1114		max = min
1115	} else {
1116		s = s[1:]
1117		if s == "" {
1118			return
1119		}
1120		if s[0] == '}' {
1121			max = -1
1122		} else if max, s, ok1 = p.parseInt(s); !ok1 {
1123			return
1124		} else if max < 0 {
1125			// parseInt found too big a number
1126			min = -1
1127		}
1128	}
1129	if s == "" || s[0] != '}' {
1130		return
1131	}
1132	rest = s[1:]
1133	ok = true
1134	return
1135}
1136
1137// parsePerlFlags parses a Perl flag setting or non-capturing group or both,
1138// like (?i) or (?: or (?i:.  It removes the prefix from s and updates the parse state.
1139// The caller must have ensured that s begins with "(?".
1140func (p *parser) parsePerlFlags(s string) (rest string, err error) {
1141	t := s
1142
1143	// Check for named captures, first introduced in Python's regexp library.
1144	// As usual, there are three slightly different syntaxes:
1145	//
1146	//   (?P<name>expr)   the original, introduced by Python
1147	//   (?<name>expr)    the .NET alteration, adopted by Perl 5.10
1148	//   (?'name'expr)    another .NET alteration, adopted by Perl 5.10
1149	//
1150	// Perl 5.10 gave in and implemented the Python version too,
1151	// but they claim that the last two are the preferred forms.
1152	// PCRE and languages based on it (specifically, PHP and Ruby)
1153	// support all three as well. EcmaScript 4 uses only the Python form.
1154	//
1155	// In both the open source world (via Code Search) and the
1156	// Google source tree, (?P<expr>name) and (?<expr>name) are the
1157	// dominant forms of named captures and both are supported.
1158	startsWithP := len(t) > 4 && t[2] == 'P' && t[3] == '<'
1159	startsWithName := len(t) > 3 && t[2] == '<'
1160
1161	if startsWithP || startsWithName {
1162		// position of expr start
1163		exprStartPos := 4
1164		if startsWithName {
1165			exprStartPos = 3
1166		}
1167
1168		// Pull out name.
1169		end := strings.IndexRune(t, '>')
1170		if end < 0 {
1171			if err = checkUTF8(t); err != nil {
1172				return "", err
1173			}
1174			return "", &Error{ErrInvalidNamedCapture, s}
1175		}
1176
1177		capture := t[:end+1]        // "(?P<name>" or "(?<name>"
1178		name := t[exprStartPos:end] // "name"
1179		if err = checkUTF8(name); err != nil {
1180			return "", err
1181		}
1182		if !isValidCaptureName(name) {
1183			return "", &Error{ErrInvalidNamedCapture, capture}
1184		}
1185
1186		// Like ordinary capture, but named.
1187		p.numCap++
1188		re := p.op(opLeftParen)
1189		re.Cap = p.numCap
1190		re.Name = name
1191		return t[end+1:], nil
1192	}
1193
1194	// Non-capturing group. Might also twiddle Perl flags.
1195	var c rune
1196	t = t[2:] // skip (?
1197	flags := p.flags
1198	sign := +1
1199	sawFlag := false
1200Loop:
1201	for t != "" {
1202		if c, t, err = nextRune(t); err != nil {
1203			return "", err
1204		}
1205		switch c {
1206		default:
1207			break Loop
1208
1209		// Flags.
1210		case 'i':
1211			flags |= FoldCase
1212			sawFlag = true
1213		case 'm':
1214			flags &^= OneLine
1215			sawFlag = true
1216		case 's':
1217			flags |= DotNL
1218			sawFlag = true
1219		case 'U':
1220			flags |= NonGreedy
1221			sawFlag = true
1222
1223		// Switch to negation.
1224		case '-':
1225			if sign < 0 {
1226				break Loop
1227			}
1228			sign = -1
1229			// Invert flags so that | above turn into &^ and vice versa.
1230			// We'll invert flags again before using it below.
1231			flags = ^flags
1232			sawFlag = false
1233
1234		// End of flags, starting group or not.
1235		case ':', ')':
1236			if sign < 0 {
1237				if !sawFlag {
1238					break Loop
1239				}
1240				flags = ^flags
1241			}
1242			if c == ':' {
1243				// Open new group
1244				p.op(opLeftParen)
1245			}
1246			p.flags = flags
1247			return t, nil
1248		}
1249	}
1250
1251	return "", &Error{ErrInvalidPerlOp, s[:len(s)-len(t)]}
1252}
1253
1254// isValidCaptureName reports whether name
1255// is a valid capture name: [A-Za-z0-9_]+.
1256// PCRE limits names to 32 bytes.
1257// Python rejects names starting with digits.
1258// We don't enforce either of those.
1259func isValidCaptureName(name string) bool {
1260	if name == "" {
1261		return false
1262	}
1263	for _, c := range name {
1264		if c != '_' && !isalnum(c) {
1265			return false
1266		}
1267	}
1268	return true
1269}
1270
1271// parseInt parses a decimal integer.
1272func (p *parser) parseInt(s string) (n int, rest string, ok bool) {
1273	if s == "" || s[0] < '0' || '9' < s[0] {
1274		return
1275	}
1276	// Disallow leading zeros.
1277	if len(s) >= 2 && s[0] == '0' && '0' <= s[1] && s[1] <= '9' {
1278		return
1279	}
1280	t := s
1281	for s != "" && '0' <= s[0] && s[0] <= '9' {
1282		s = s[1:]
1283	}
1284	rest = s
1285	ok = true
1286	// Have digits, compute value.
1287	t = t[:len(t)-len(s)]
1288	for i := 0; i < len(t); i++ {
1289		// Avoid overflow.
1290		if n >= 1e8 {
1291			n = -1
1292			break
1293		}
1294		n = n*10 + int(t[i]) - '0'
1295	}
1296	return
1297}
1298
1299// can this be represented as a character class?
1300// single-rune literal string, char class, ., and .|\n.
1301func isCharClass(re *Regexp) bool {
1302	return re.Op == OpLiteral && len(re.Rune) == 1 ||
1303		re.Op == OpCharClass ||
1304		re.Op == OpAnyCharNotNL ||
1305		re.Op == OpAnyChar
1306}
1307
1308// does re match r?
1309func matchRune(re *Regexp, r rune) bool {
1310	switch re.Op {
1311	case OpLiteral:
1312		return len(re.Rune) == 1 && re.Rune[0] == r
1313	case OpCharClass:
1314		for i := 0; i < len(re.Rune); i += 2 {
1315			if re.Rune[i] <= r && r <= re.Rune[i+1] {
1316				return true
1317			}
1318		}
1319		return false
1320	case OpAnyCharNotNL:
1321		return r != '\n'
1322	case OpAnyChar:
1323		return true
1324	}
1325	return false
1326}
1327
1328// parseVerticalBar handles a | in the input.
1329func (p *parser) parseVerticalBar() {
1330	p.concat()
1331
1332	// The concatenation we just parsed is on top of the stack.
1333	// If it sits above an opVerticalBar, swap it below
1334	// (things below an opVerticalBar become an alternation).
1335	// Otherwise, push a new vertical bar.
1336	if !p.swapVerticalBar() {
1337		p.op(opVerticalBar)
1338	}
1339}
1340
1341// mergeCharClass makes dst = dst|src.
1342// The caller must ensure that dst.Op >= src.Op,
1343// to reduce the amount of copying.
1344func mergeCharClass(dst, src *Regexp) {
1345	switch dst.Op {
1346	case OpAnyChar:
1347		// src doesn't add anything.
1348	case OpAnyCharNotNL:
1349		// src might add \n
1350		if matchRune(src, '\n') {
1351			dst.Op = OpAnyChar
1352		}
1353	case OpCharClass:
1354		// src is simpler, so either literal or char class
1355		if src.Op == OpLiteral {
1356			dst.Rune = appendLiteral(dst.Rune, src.Rune[0], src.Flags)
1357		} else {
1358			dst.Rune = appendClass(dst.Rune, src.Rune)
1359		}
1360	case OpLiteral:
1361		// both literal
1362		if src.Rune[0] == dst.Rune[0] && src.Flags == dst.Flags {
1363			break
1364		}
1365		dst.Op = OpCharClass
1366		dst.Rune = appendLiteral(dst.Rune[:0], dst.Rune[0], dst.Flags)
1367		dst.Rune = appendLiteral(dst.Rune, src.Rune[0], src.Flags)
1368	}
1369}
1370
1371// If the top of the stack is an element followed by an opVerticalBar
1372// swapVerticalBar swaps the two and returns true.
1373// Otherwise it returns false.
1374func (p *parser) swapVerticalBar() bool {
1375	// If above and below vertical bar are literal or char class,
1376	// can merge into a single char class.
1377	n := len(p.stack)
1378	if n >= 3 && p.stack[n-2].Op == opVerticalBar && isCharClass(p.stack[n-1]) && isCharClass(p.stack[n-3]) {
1379		re1 := p.stack[n-1]
1380		re3 := p.stack[n-3]
1381		// Make re3 the more complex of the two.
1382		if re1.Op > re3.Op {
1383			re1, re3 = re3, re1
1384			p.stack[n-3] = re3
1385		}
1386		mergeCharClass(re3, re1)
1387		p.reuse(re1)
1388		p.stack = p.stack[:n-1]
1389		return true
1390	}
1391
1392	if n >= 2 {
1393		re1 := p.stack[n-1]
1394		re2 := p.stack[n-2]
1395		if re2.Op == opVerticalBar {
1396			if n >= 3 {
1397				// Now out of reach.
1398				// Clean opportunistically.
1399				cleanAlt(p.stack[n-3])
1400			}
1401			p.stack[n-2] = re1
1402			p.stack[n-1] = re2
1403			return true
1404		}
1405	}
1406	return false
1407}
1408
1409// parseRightParen handles a ) in the input.
1410func (p *parser) parseRightParen() error {
1411	p.concat()
1412	if p.swapVerticalBar() {
1413		// pop vertical bar
1414		p.stack = p.stack[:len(p.stack)-1]
1415	}
1416	p.alternate()
1417
1418	n := len(p.stack)
1419	if n < 2 {
1420		return &Error{ErrUnexpectedParen, p.wholeRegexp}
1421	}
1422	re1 := p.stack[n-1]
1423	re2 := p.stack[n-2]
1424	p.stack = p.stack[:n-2]
1425	if re2.Op != opLeftParen {
1426		return &Error{ErrUnexpectedParen, p.wholeRegexp}
1427	}
1428	// Restore flags at time of paren.
1429	p.flags = re2.Flags
1430	if re2.Cap == 0 {
1431		// Just for grouping.
1432		p.push(re1)
1433	} else {
1434		re2.Op = OpCapture
1435		re2.Sub = re2.Sub0[:1]
1436		re2.Sub[0] = re1
1437		p.push(re2)
1438	}
1439	return nil
1440}
1441
1442// parseEscape parses an escape sequence at the beginning of s
1443// and returns the rune.
1444func (p *parser) parseEscape(s string) (r rune, rest string, err error) {
1445	t := s[1:]
1446	if t == "" {
1447		return 0, "", &Error{ErrTrailingBackslash, ""}
1448	}
1449	c, t, err := nextRune(t)
1450	if err != nil {
1451		return 0, "", err
1452	}
1453
1454Switch:
1455	switch c {
1456	default:
1457		if c < utf8.RuneSelf && !isalnum(c) {
1458			// Escaped non-word characters are always themselves.
1459			// PCRE is not quite so rigorous: it accepts things like
1460			// \q, but we don't. We once rejected \_, but too many
1461			// programs and people insist on using it, so allow \_.
1462			return c, t, nil
1463		}
1464
1465	// Octal escapes.
1466	case '1', '2', '3', '4', '5', '6', '7':
1467		// Single non-zero digit is a backreference; not supported
1468		if t == "" || t[0] < '0' || t[0] > '7' {
1469			break
1470		}
1471		fallthrough
1472	case '0':
1473		// Consume up to three octal digits; already have one.
1474		r = c - '0'
1475		for i := 1; i < 3; i++ {
1476			if t == "" || t[0] < '0' || t[0] > '7' {
1477				break
1478			}
1479			r = r*8 + rune(t[0]) - '0'
1480			t = t[1:]
1481		}
1482		return r, t, nil
1483
1484	// Hexadecimal escapes.
1485	case 'x':
1486		if t == "" {
1487			break
1488		}
1489		if c, t, err = nextRune(t); err != nil {
1490			return 0, "", err
1491		}
1492		if c == '{' {
1493			// Any number of digits in braces.
1494			// Perl accepts any text at all; it ignores all text
1495			// after the first non-hex digit. We require only hex digits,
1496			// and at least one.
1497			nhex := 0
1498			r = 0
1499			for {
1500				if t == "" {
1501					break Switch
1502				}
1503				if c, t, err = nextRune(t); err != nil {
1504					return 0, "", err
1505				}
1506				if c == '}' {
1507					break
1508				}
1509				v := unhex(c)
1510				if v < 0 {
1511					break Switch
1512				}
1513				r = r*16 + v
1514				if r > unicode.MaxRune {
1515					break Switch
1516				}
1517				nhex++
1518			}
1519			if nhex == 0 {
1520				break Switch
1521			}
1522			return r, t, nil
1523		}
1524
1525		// Easy case: two hex digits.
1526		x := unhex(c)
1527		if c, t, err = nextRune(t); err != nil {
1528			return 0, "", err
1529		}
1530		y := unhex(c)
1531		if x < 0 || y < 0 {
1532			break
1533		}
1534		return x*16 + y, t, nil
1535
1536	// C escapes. There is no case 'b', to avoid misparsing
1537	// the Perl word-boundary \b as the C backspace \b
1538	// when in POSIX mode. In Perl, /\b/ means word-boundary
1539	// but /[\b]/ means backspace. We don't support that.
1540	// If you want a backspace, embed a literal backspace
1541	// character or use \x08.
1542	case 'a':
1543		return '\a', t, err
1544	case 'f':
1545		return '\f', t, err
1546	case 'n':
1547		return '\n', t, err
1548	case 'r':
1549		return '\r', t, err
1550	case 't':
1551		return '\t', t, err
1552	case 'v':
1553		return '\v', t, err
1554	}
1555	return 0, "", &Error{ErrInvalidEscape, s[:len(s)-len(t)]}
1556}
1557
1558// parseClassChar parses a character class character at the beginning of s
1559// and returns it.
1560func (p *parser) parseClassChar(s, wholeClass string) (r rune, rest string, err error) {
1561	if s == "" {
1562		return 0, "", &Error{Code: ErrMissingBracket, Expr: wholeClass}
1563	}
1564
1565	// Allow regular escape sequences even though
1566	// many need not be escaped in this context.
1567	if s[0] == '\\' {
1568		return p.parseEscape(s)
1569	}
1570
1571	return nextRune(s)
1572}
1573
1574type charGroup struct {
1575	sign  int
1576	class []rune
1577}
1578
1579//go:generate perl make_perl_groups.pl perl_groups.go
1580
1581// parsePerlClassEscape parses a leading Perl character class escape like \d
1582// from the beginning of s. If one is present, it appends the characters to r
1583// and returns the new slice r and the remainder of the string.
1584func (p *parser) parsePerlClassEscape(s string, r []rune) (out []rune, rest string) {
1585	if p.flags&PerlX == 0 || len(s) < 2 || s[0] != '\\' {
1586		return
1587	}
1588	g := perlGroup[s[0:2]]
1589	if g.sign == 0 {
1590		return
1591	}
1592	return p.appendGroup(r, g), s[2:]
1593}
1594
1595// parseNamedClass parses a leading POSIX named character class like [:alnum:]
1596// from the beginning of s. If one is present, it appends the characters to r
1597// and returns the new slice r and the remainder of the string.
1598func (p *parser) parseNamedClass(s string, r []rune) (out []rune, rest string, err error) {
1599	if len(s) < 2 || s[0] != '[' || s[1] != ':' {
1600		return
1601	}
1602
1603	i := strings.Index(s[2:], ":]")
1604	if i < 0 {
1605		return
1606	}
1607	i += 2
1608	name, s := s[0:i+2], s[i+2:]
1609	g := posixGroup[name]
1610	if g.sign == 0 {
1611		return nil, "", &Error{ErrInvalidCharRange, name}
1612	}
1613	return p.appendGroup(r, g), s, nil
1614}
1615
1616func (p *parser) appendGroup(r []rune, g charGroup) []rune {
1617	if p.flags&FoldCase == 0 {
1618		if g.sign < 0 {
1619			r = appendNegatedClass(r, g.class)
1620		} else {
1621			r = appendClass(r, g.class)
1622		}
1623	} else {
1624		tmp := p.tmpClass[:0]
1625		tmp = appendFoldedClass(tmp, g.class)
1626		p.tmpClass = tmp
1627		tmp = cleanClass(&p.tmpClass)
1628		if g.sign < 0 {
1629			r = appendNegatedClass(r, tmp)
1630		} else {
1631			r = appendClass(r, tmp)
1632		}
1633	}
1634	return r
1635}
1636
1637var anyTable = &unicode.RangeTable{
1638	R16: []unicode.Range16{{Lo: 0, Hi: 1<<16 - 1, Stride: 1}},
1639	R32: []unicode.Range32{{Lo: 1 << 16, Hi: unicode.MaxRune, Stride: 1}},
1640}
1641
1642// unicodeTable returns the unicode.RangeTable identified by name
1643// and the table of additional fold-equivalent code points.
1644func unicodeTable(name string) (*unicode.RangeTable, *unicode.RangeTable) {
1645	// Special case: "Any" means any.
1646	if name == "Any" {
1647		return anyTable, anyTable
1648	}
1649	if t := unicode.Categories[name]; t != nil {
1650		return t, unicode.FoldCategory[name]
1651	}
1652	if t := unicode.Scripts[name]; t != nil {
1653		return t, unicode.FoldScript[name]
1654	}
1655	return nil, nil
1656}
1657
1658// parseUnicodeClass parses a leading Unicode character class like \p{Han}
1659// from the beginning of s. If one is present, it appends the characters to r
1660// and returns the new slice r and the remainder of the string.
1661func (p *parser) parseUnicodeClass(s string, r []rune) (out []rune, rest string, err error) {
1662	if p.flags&UnicodeGroups == 0 || len(s) < 2 || s[0] != '\\' || s[1] != 'p' && s[1] != 'P' {
1663		return
1664	}
1665
1666	// Committed to parse or return error.
1667	sign := +1
1668	if s[1] == 'P' {
1669		sign = -1
1670	}
1671	t := s[2:]
1672	c, t, err := nextRune(t)
1673	if err != nil {
1674		return
1675	}
1676	var seq, name string
1677	if c != '{' {
1678		// Single-letter name.
1679		seq = s[:len(s)-len(t)]
1680		name = seq[2:]
1681	} else {
1682		// Name is in braces.
1683		end := strings.IndexRune(s, '}')
1684		if end < 0 {
1685			if err = checkUTF8(s); err != nil {
1686				return
1687			}
1688			return nil, "", &Error{ErrInvalidCharRange, s}
1689		}
1690		seq, t = s[:end+1], s[end+1:]
1691		name = s[3:end]
1692		if err = checkUTF8(name); err != nil {
1693			return
1694		}
1695	}
1696
1697	// Group can have leading negation too.  \p{^Han} == \P{Han}, \P{^Han} == \p{Han}.
1698	if name != "" && name[0] == '^' {
1699		sign = -sign
1700		name = name[1:]
1701	}
1702
1703	tab, fold := unicodeTable(name)
1704	if tab == nil {
1705		return nil, "", &Error{ErrInvalidCharRange, seq}
1706	}
1707
1708	if p.flags&FoldCase == 0 || fold == nil {
1709		if sign > 0 {
1710			r = appendTable(r, tab)
1711		} else {
1712			r = appendNegatedTable(r, tab)
1713		}
1714	} else {
1715		// Merge and clean tab and fold in a temporary buffer.
1716		// This is necessary for the negative case and just tidy
1717		// for the positive case.
1718		tmp := p.tmpClass[:0]
1719		tmp = appendTable(tmp, tab)
1720		tmp = appendTable(tmp, fold)
1721		p.tmpClass = tmp
1722		tmp = cleanClass(&p.tmpClass)
1723		if sign > 0 {
1724			r = appendClass(r, tmp)
1725		} else {
1726			r = appendNegatedClass(r, tmp)
1727		}
1728	}
1729	return r, t, nil
1730}
1731
1732// parseClass parses a character class at the beginning of s
1733// and pushes it onto the parse stack.
1734func (p *parser) parseClass(s string) (rest string, err error) {
1735	t := s[1:] // chop [
1736	re := p.newRegexp(OpCharClass)
1737	re.Flags = p.flags
1738	re.Rune = re.Rune0[:0]
1739
1740	sign := +1
1741	if t != "" && t[0] == '^' {
1742		sign = -1
1743		t = t[1:]
1744
1745		// If character class does not match \n, add it here,
1746		// so that negation later will do the right thing.
1747		if p.flags&ClassNL == 0 {
1748			re.Rune = append(re.Rune, '\n', '\n')
1749		}
1750	}
1751
1752	class := re.Rune
1753	first := true // ] and - are okay as first char in class
1754	for t == "" || t[0] != ']' || first {
1755		// POSIX: - is only okay unescaped as first or last in class.
1756		// Perl: - is okay anywhere.
1757		if t != "" && t[0] == '-' && p.flags&PerlX == 0 && !first && (len(t) == 1 || t[1] != ']') {
1758			_, size := utf8.DecodeRuneInString(t[1:])
1759			return "", &Error{Code: ErrInvalidCharRange, Expr: t[:1+size]}
1760		}
1761		first = false
1762
1763		// Look for POSIX [:alnum:] etc.
1764		if len(t) > 2 && t[0] == '[' && t[1] == ':' {
1765			nclass, nt, err := p.parseNamedClass(t, class)
1766			if err != nil {
1767				return "", err
1768			}
1769			if nclass != nil {
1770				class, t = nclass, nt
1771				continue
1772			}
1773		}
1774
1775		// Look for Unicode character group like \p{Han}.
1776		nclass, nt, err := p.parseUnicodeClass(t, class)
1777		if err != nil {
1778			return "", err
1779		}
1780		if nclass != nil {
1781			class, t = nclass, nt
1782			continue
1783		}
1784
1785		// Look for Perl character class symbols (extension).
1786		if nclass, nt := p.parsePerlClassEscape(t, class); nclass != nil {
1787			class, t = nclass, nt
1788			continue
1789		}
1790
1791		// Single character or simple range.
1792		rng := t
1793		var lo, hi rune
1794		if lo, t, err = p.parseClassChar(t, s); err != nil {
1795			return "", err
1796		}
1797		hi = lo
1798		// [a-] means (a|-) so check for final ].
1799		if len(t) >= 2 && t[0] == '-' && t[1] != ']' {
1800			t = t[1:]
1801			if hi, t, err = p.parseClassChar(t, s); err != nil {
1802				return "", err
1803			}
1804			if hi < lo {
1805				rng = rng[:len(rng)-len(t)]
1806				return "", &Error{Code: ErrInvalidCharRange, Expr: rng}
1807			}
1808		}
1809		if p.flags&FoldCase == 0 {
1810			class = appendRange(class, lo, hi)
1811		} else {
1812			class = appendFoldedRange(class, lo, hi)
1813		}
1814	}
1815	t = t[1:] // chop ]
1816
1817	// Use &re.Rune instead of &class to avoid allocation.
1818	re.Rune = class
1819	class = cleanClass(&re.Rune)
1820	if sign < 0 {
1821		class = negateClass(class)
1822	}
1823	re.Rune = class
1824	p.push(re)
1825	return t, nil
1826}
1827
1828// cleanClass sorts the ranges (pairs of elements of r),
1829// merges them, and eliminates duplicates.
1830func cleanClass(rp *[]rune) []rune {
1831
1832	// Sort by lo increasing, hi decreasing to break ties.
1833	sort.Sort(ranges{rp})
1834
1835	r := *rp
1836	if len(r) < 2 {
1837		return r
1838	}
1839
1840	// Merge abutting, overlapping.
1841	w := 2 // write index
1842	for i := 2; i < len(r); i += 2 {
1843		lo, hi := r[i], r[i+1]
1844		if lo <= r[w-1]+1 {
1845			// merge with previous range
1846			if hi > r[w-1] {
1847				r[w-1] = hi
1848			}
1849			continue
1850		}
1851		// new disjoint range
1852		r[w] = lo
1853		r[w+1] = hi
1854		w += 2
1855	}
1856
1857	return r[:w]
1858}
1859
1860// inCharClass reports whether r is in the class.
1861// It assumes the class has been cleaned by cleanClass.
1862func inCharClass(r rune, class []rune) bool {
1863	_, ok := sort.Find(len(class)/2, func(i int) int {
1864		lo, hi := class[2*i], class[2*i+1]
1865		if r > hi {
1866			return +1
1867		}
1868		if r < lo {
1869			return -1
1870		}
1871		return 0
1872	})
1873	return ok
1874}
1875
1876// appendLiteral returns the result of appending the literal x to the class r.
1877func appendLiteral(r []rune, x rune, flags Flags) []rune {
1878	if flags&FoldCase != 0 {
1879		return appendFoldedRange(r, x, x)
1880	}
1881	return appendRange(r, x, x)
1882}
1883
1884// appendRange returns the result of appending the range lo-hi to the class r.
1885func appendRange(r []rune, lo, hi rune) []rune {
1886	// Expand last range or next to last range if it overlaps or abuts.
1887	// Checking two ranges helps when appending case-folded
1888	// alphabets, so that one range can be expanding A-Z and the
1889	// other expanding a-z.
1890	n := len(r)
1891	for i := 2; i <= 4; i += 2 { // twice, using i=2, i=4
1892		if n >= i {
1893			rlo, rhi := r[n-i], r[n-i+1]
1894			if lo <= rhi+1 && rlo <= hi+1 {
1895				if lo < rlo {
1896					r[n-i] = lo
1897				}
1898				if hi > rhi {
1899					r[n-i+1] = hi
1900				}
1901				return r
1902			}
1903		}
1904	}
1905
1906	return append(r, lo, hi)
1907}
1908
1909const (
1910	// minimum and maximum runes involved in folding.
1911	// checked during test.
1912	minFold = 0x0041
1913	maxFold = 0x1e943
1914)
1915
1916// appendFoldedRange returns the result of appending the range lo-hi
1917// and its case folding-equivalent runes to the class r.
1918func appendFoldedRange(r []rune, lo, hi rune) []rune {
1919	// Optimizations.
1920	if lo <= minFold && hi >= maxFold {
1921		// Range is full: folding can't add more.
1922		return appendRange(r, lo, hi)
1923	}
1924	if hi < minFold || lo > maxFold {
1925		// Range is outside folding possibilities.
1926		return appendRange(r, lo, hi)
1927	}
1928	if lo < minFold {
1929		// [lo, minFold-1] needs no folding.
1930		r = appendRange(r, lo, minFold-1)
1931		lo = minFold
1932	}
1933	if hi > maxFold {
1934		// [maxFold+1, hi] needs no folding.
1935		r = appendRange(r, maxFold+1, hi)
1936		hi = maxFold
1937	}
1938
1939	// Brute force. Depend on appendRange to coalesce ranges on the fly.
1940	for c := lo; c <= hi; c++ {
1941		r = appendRange(r, c, c)
1942		f := unicode.SimpleFold(c)
1943		for f != c {
1944			r = appendRange(r, f, f)
1945			f = unicode.SimpleFold(f)
1946		}
1947	}
1948	return r
1949}
1950
1951// appendClass returns the result of appending the class x to the class r.
1952// It assume x is clean.
1953func appendClass(r []rune, x []rune) []rune {
1954	for i := 0; i < len(x); i += 2 {
1955		r = appendRange(r, x[i], x[i+1])
1956	}
1957	return r
1958}
1959
1960// appendFoldedClass returns the result of appending the case folding of the class x to the class r.
1961func appendFoldedClass(r []rune, x []rune) []rune {
1962	for i := 0; i < len(x); i += 2 {
1963		r = appendFoldedRange(r, x[i], x[i+1])
1964	}
1965	return r
1966}
1967
1968// appendNegatedClass returns the result of appending the negation of the class x to the class r.
1969// It assumes x is clean.
1970func appendNegatedClass(r []rune, x []rune) []rune {
1971	nextLo := '\u0000'
1972	for i := 0; i < len(x); i += 2 {
1973		lo, hi := x[i], x[i+1]
1974		if nextLo <= lo-1 {
1975			r = appendRange(r, nextLo, lo-1)
1976		}
1977		nextLo = hi + 1
1978	}
1979	if nextLo <= unicode.MaxRune {
1980		r = appendRange(r, nextLo, unicode.MaxRune)
1981	}
1982	return r
1983}
1984
1985// appendTable returns the result of appending x to the class r.
1986func appendTable(r []rune, x *unicode.RangeTable) []rune {
1987	for _, xr := range x.R16 {
1988		lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride)
1989		if stride == 1 {
1990			r = appendRange(r, lo, hi)
1991			continue
1992		}
1993		for c := lo; c <= hi; c += stride {
1994			r = appendRange(r, c, c)
1995		}
1996	}
1997	for _, xr := range x.R32 {
1998		lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride)
1999		if stride == 1 {
2000			r = appendRange(r, lo, hi)
2001			continue
2002		}
2003		for c := lo; c <= hi; c += stride {
2004			r = appendRange(r, c, c)
2005		}
2006	}
2007	return r
2008}
2009
2010// appendNegatedTable returns the result of appending the negation of x to the class r.
2011func appendNegatedTable(r []rune, x *unicode.RangeTable) []rune {
2012	nextLo := '\u0000' // lo end of next class to add
2013	for _, xr := range x.R16 {
2014		lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride)
2015		if stride == 1 {
2016			if nextLo <= lo-1 {
2017				r = appendRange(r, nextLo, lo-1)
2018			}
2019			nextLo = hi + 1
2020			continue
2021		}
2022		for c := lo; c <= hi; c += stride {
2023			if nextLo <= c-1 {
2024				r = appendRange(r, nextLo, c-1)
2025			}
2026			nextLo = c + 1
2027		}
2028	}
2029	for _, xr := range x.R32 {
2030		lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride)
2031		if stride == 1 {
2032			if nextLo <= lo-1 {
2033				r = appendRange(r, nextLo, lo-1)
2034			}
2035			nextLo = hi + 1
2036			continue
2037		}
2038		for c := lo; c <= hi; c += stride {
2039			if nextLo <= c-1 {
2040				r = appendRange(r, nextLo, c-1)
2041			}
2042			nextLo = c + 1
2043		}
2044	}
2045	if nextLo <= unicode.MaxRune {
2046		r = appendRange(r, nextLo, unicode.MaxRune)
2047	}
2048	return r
2049}
2050
2051// negateClass overwrites r and returns r's negation.
2052// It assumes the class r is already clean.
2053func negateClass(r []rune) []rune {
2054	nextLo := '\u0000' // lo end of next class to add
2055	w := 0             // write index
2056	for i := 0; i < len(r); i += 2 {
2057		lo, hi := r[i], r[i+1]
2058		if nextLo <= lo-1 {
2059			r[w] = nextLo
2060			r[w+1] = lo - 1
2061			w += 2
2062		}
2063		nextLo = hi + 1
2064	}
2065	r = r[:w]
2066	if nextLo <= unicode.MaxRune {
2067		// It's possible for the negation to have one more
2068		// range - this one - than the original class, so use append.
2069		r = append(r, nextLo, unicode.MaxRune)
2070	}
2071	return r
2072}
2073
2074// ranges implements sort.Interface on a []rune.
2075// The choice of receiver type definition is strange
2076// but avoids an allocation since we already have
2077// a *[]rune.
2078type ranges struct {
2079	p *[]rune
2080}
2081
2082func (ra ranges) Less(i, j int) bool {
2083	p := *ra.p
2084	i *= 2
2085	j *= 2
2086	return p[i] < p[j] || p[i] == p[j] && p[i+1] > p[j+1]
2087}
2088
2089func (ra ranges) Len() int {
2090	return len(*ra.p) / 2
2091}
2092
2093func (ra ranges) Swap(i, j int) {
2094	p := *ra.p
2095	i *= 2
2096	j *= 2
2097	p[i], p[i+1], p[j], p[j+1] = p[j], p[j+1], p[i], p[i+1]
2098}
2099
2100func checkUTF8(s string) error {
2101	for s != "" {
2102		rune, size := utf8.DecodeRuneInString(s)
2103		if rune == utf8.RuneError && size == 1 {
2104			return &Error{Code: ErrInvalidUTF8, Expr: s}
2105		}
2106		s = s[size:]
2107	}
2108	return nil
2109}
2110
2111func nextRune(s string) (c rune, t string, err error) {
2112	c, size := utf8.DecodeRuneInString(s)
2113	if c == utf8.RuneError && size == 1 {
2114		return 0, "", &Error{Code: ErrInvalidUTF8, Expr: s}
2115	}
2116	return c, s[size:], nil
2117}
2118
2119func isalnum(c rune) bool {
2120	return '0' <= c && c <= '9' || 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z'
2121}
2122
2123func unhex(c rune) rune {
2124	if '0' <= c && c <= '9' {
2125		return c - '0'
2126	}
2127	if 'a' <= c && c <= 'f' {
2128		return c - 'a' + 10
2129	}
2130	if 'A' <= c && c <= 'F' {
2131		return c - 'A' + 10
2132	}
2133	return -1
2134}
2135