1 /*!
2 Defines a translator that converts an `Ast` to an `Hir`.
3 */
4 
5 use core::cell::{Cell, RefCell};
6 
7 use alloc::{boxed::Box, string::ToString, vec, vec::Vec};
8 
9 use crate::{
10     ast::{self, Ast, Span, Visitor},
11     either::Either,
12     hir::{self, Error, ErrorKind, Hir, HirKind},
13     unicode::{self, ClassQuery},
14 };
15 
16 type Result<T> = core::result::Result<T, Error>;
17 
/// A builder for constructing an AST->HIR translator.
///
/// Each setter returns `&mut TranslatorBuilder` so that calls can be chained
/// before finishing with `build`.
#[derive(Clone, Debug)]
pub struct TranslatorBuilder {
    /// Whether translation must reject expressions that may match invalid
    /// UTF-8. Set to `true` by `new`.
    utf8: bool,
    /// The byte that `.` does not match when `s` mode is off. Set to `b'\n'`
    /// by `new`.
    line_terminator: u8,
    /// The initial flag settings copied into every translator that is built.
    flags: Flags,
}
25 
26 impl Default for TranslatorBuilder {
default() -> TranslatorBuilder27     fn default() -> TranslatorBuilder {
28         TranslatorBuilder::new()
29     }
30 }
31 
32 impl TranslatorBuilder {
33     /// Create a new translator builder with a default c onfiguration.
new() -> TranslatorBuilder34     pub fn new() -> TranslatorBuilder {
35         TranslatorBuilder {
36             utf8: true,
37             line_terminator: b'\n',
38             flags: Flags::default(),
39         }
40     }
41 
42     /// Build a translator using the current configuration.
build(&self) -> Translator43     pub fn build(&self) -> Translator {
44         Translator {
45             stack: RefCell::new(vec![]),
46             flags: Cell::new(self.flags),
47             utf8: self.utf8,
48             line_terminator: self.line_terminator,
49         }
50     }
51 
52     /// When disabled, translation will permit the construction of a regular
53     /// expression that may match invalid UTF-8.
54     ///
55     /// When enabled (the default), the translator is guaranteed to produce an
56     /// expression that, for non-empty matches, will only ever produce spans
57     /// that are entirely valid UTF-8 (otherwise, the translator will return an
58     /// error).
59     ///
60     /// Perhaps surprisingly, when UTF-8 is enabled, an empty regex or even
61     /// a negated ASCII word boundary (uttered as `(?-u:\B)` in the concrete
62     /// syntax) will be allowed even though they can produce matches that split
63     /// a UTF-8 encoded codepoint. This only applies to zero-width or "empty"
64     /// matches, and it is expected that the regex engine itself must handle
65     /// these cases if necessary (perhaps by suppressing any zero-width matches
66     /// that split a codepoint).
utf8(&mut self, yes: bool) -> &mut TranslatorBuilder67     pub fn utf8(&mut self, yes: bool) -> &mut TranslatorBuilder {
68         self.utf8 = yes;
69         self
70     }
71 
72     /// Sets the line terminator for use with `(?u-s:.)` and `(?-us:.)`.
73     ///
74     /// Namely, instead of `.` (by default) matching everything except for `\n`,
75     /// this will cause `.` to match everything except for the byte given.
76     ///
77     /// If `.` is used in a context where Unicode mode is enabled and this byte
78     /// isn't ASCII, then an error will be returned. When Unicode mode is
79     /// disabled, then any byte is permitted, but will return an error if UTF-8
80     /// mode is enabled and it is a non-ASCII byte.
81     ///
82     /// In short, any ASCII value for a line terminator is always okay. But a
83     /// non-ASCII byte might result in an error depending on whether Unicode
84     /// mode or UTF-8 mode are enabled.
85     ///
86     /// Note that if `R` mode is enabled then it always takes precedence and
87     /// the line terminator will be treated as `\r` and `\n` simultaneously.
88     ///
89     /// Note also that this *doesn't* impact the look-around assertions
90     /// `(?m:^)` and `(?m:$)`. That's usually controlled by additional
91     /// configuration in the regex engine itself.
line_terminator(&mut self, byte: u8) -> &mut TranslatorBuilder92     pub fn line_terminator(&mut self, byte: u8) -> &mut TranslatorBuilder {
93         self.line_terminator = byte;
94         self
95     }
96 
97     /// Enable or disable the case insensitive flag (`i`) by default.
case_insensitive(&mut self, yes: bool) -> &mut TranslatorBuilder98     pub fn case_insensitive(&mut self, yes: bool) -> &mut TranslatorBuilder {
99         self.flags.case_insensitive = if yes { Some(true) } else { None };
100         self
101     }
102 
103     /// Enable or disable the multi-line matching flag (`m`) by default.
multi_line(&mut self, yes: bool) -> &mut TranslatorBuilder104     pub fn multi_line(&mut self, yes: bool) -> &mut TranslatorBuilder {
105         self.flags.multi_line = if yes { Some(true) } else { None };
106         self
107     }
108 
109     /// Enable or disable the "dot matches any character" flag (`s`) by
110     /// default.
dot_matches_new_line( &mut self, yes: bool, ) -> &mut TranslatorBuilder111     pub fn dot_matches_new_line(
112         &mut self,
113         yes: bool,
114     ) -> &mut TranslatorBuilder {
115         self.flags.dot_matches_new_line = if yes { Some(true) } else { None };
116         self
117     }
118 
119     /// Enable or disable the CRLF mode flag (`R`) by default.
crlf(&mut self, yes: bool) -> &mut TranslatorBuilder120     pub fn crlf(&mut self, yes: bool) -> &mut TranslatorBuilder {
121         self.flags.crlf = if yes { Some(true) } else { None };
122         self
123     }
124 
125     /// Enable or disable the "swap greed" flag (`U`) by default.
swap_greed(&mut self, yes: bool) -> &mut TranslatorBuilder126     pub fn swap_greed(&mut self, yes: bool) -> &mut TranslatorBuilder {
127         self.flags.swap_greed = if yes { Some(true) } else { None };
128         self
129     }
130 
131     /// Enable or disable the Unicode flag (`u`) by default.
unicode(&mut self, yes: bool) -> &mut TranslatorBuilder132     pub fn unicode(&mut self, yes: bool) -> &mut TranslatorBuilder {
133         self.flags.unicode = if yes { None } else { Some(false) };
134         self
135     }
136 }
137 
/// A translator maps abstract syntax to a high level intermediate
/// representation.
///
/// A translator may benefit from reuse. That is, a translator can translate
/// many abstract syntax trees.
///
/// A `Translator` can be configured in more detail via a
/// [`TranslatorBuilder`].
#[derive(Clone, Debug)]
pub struct Translator {
    /// Our call stack, but on the heap.
    ///
    /// Wrapped in a `RefCell` because frames are pushed and popped through a
    /// shared reference to the translator.
    stack: RefCell<Vec<HirFrame>>,
    /// The current flag settings.
    ///
    /// Wrapped in a `Cell` because the active flags are replaced through a
    /// shared reference while a translation is in progress.
    flags: Cell<Flags>,
    /// Whether we're allowed to produce HIR that can match arbitrary bytes.
    utf8: bool,
    /// The line terminator to use for `.`.
    line_terminator: u8,
}
157 
158 impl Translator {
159     /// Create a new translator using the default configuration.
new() -> Translator160     pub fn new() -> Translator {
161         TranslatorBuilder::new().build()
162     }
163 
164     /// Translate the given abstract syntax tree (AST) into a high level
165     /// intermediate representation (HIR).
166     ///
167     /// If there was a problem doing the translation, then an HIR-specific
168     /// error is returned.
169     ///
170     /// The original pattern string used to produce the `Ast` *must* also be
171     /// provided. The translator does not use the pattern string during any
172     /// correct translation, but is used for error reporting.
translate(&mut self, pattern: &str, ast: &Ast) -> Result<Hir>173     pub fn translate(&mut self, pattern: &str, ast: &Ast) -> Result<Hir> {
174         ast::visit(ast, TranslatorI::new(self, pattern))
175     }
176 }
177 
/// An HirFrame is a single stack frame, represented explicitly, which is
/// created for each item in the Ast that we traverse.
///
/// Note that technically, this type doesn't represent our entire stack
/// frame. In particular, the Ast visitor represents any state associated with
/// traversing the Ast itself.
///
/// Frames fall into two groups: frames that carry a (partial) result (`Expr`,
/// `Literal`, `ClassUnicode`, `ClassBytes`) and marker frames that only
/// delimit where the results of a sub-tree begin (`Repetition`, `Group`,
/// `Concat`, `Alternation`, `AlternationBranch`).
#[derive(Clone, Debug)]
enum HirFrame {
    /// An arbitrary HIR expression. These get pushed whenever we hit a base
    /// case in the Ast. They get popped after an inductive (i.e., recursive)
    /// step is complete.
    Expr(Hir),
    /// A literal that is being constructed, character by character, from the
    /// AST. We need this because the AST gives each individual character its
    /// own node. So as we see characters, we peek at the top-most HirFrame.
    /// If it's a literal, then we add to it. Otherwise, we push a new literal.
    /// When it comes time to pop it, we convert it to an Hir via Hir::literal.
    ///
    /// The buffer holds raw bytes: characters are appended in their UTF-8
    /// encoding and individual bytes are appended as-is.
    Literal(Vec<u8>),
    /// A Unicode character class. This frame is mutated as we descend into
    /// the Ast of a character class (which is itself its own mini recursive
    /// structure).
    ClassUnicode(hir::ClassUnicode),
    /// A byte-oriented character class. This frame is mutated as we descend
    /// into the Ast of a character class (which is itself its own mini
    /// recursive structure).
    ///
    /// Byte character classes are created when Unicode mode (`u`) is disabled.
    /// If `utf8` is enabled (the default), then a byte character is only
    /// permitted to match ASCII text.
    ClassBytes(hir::ClassBytes),
    /// This is pushed whenever a repetition is observed. After visiting every
    /// sub-expression in the repetition, the translator's stack is expected to
    /// have this sentinel at the top.
    ///
    /// This sentinel only exists to stop other things (like flattening
    /// literals) from reaching across repetition operators.
    Repetition,
    /// This is pushed on to the stack upon first seeing any kind of capture,
    /// indicated by parentheses (including non-capturing groups). It is popped
    /// upon leaving a group.
    Group {
        /// The old active flags when this group was opened.
        ///
        /// If this group sets flags, then the new active flags are set to the
        /// result of merging the old flags with the flags introduced by this
        /// group. If the group doesn't set any flags, then this is simply
        /// equivalent to whatever flags were set when the group was opened.
        ///
        /// When this group is popped, the active flags should be restored to
        /// the flags set here.
        ///
        /// The "active" flags correspond to whatever flags are set in the
        /// Translator.
        old_flags: Flags,
    },
    /// This is pushed whenever a concatenation is observed. After visiting
    /// every sub-expression in the concatenation, the translator's stack is
    /// popped until it sees a Concat frame.
    Concat,
    /// This is pushed whenever an alternation is observed. After visiting
    /// every sub-expression in the alternation, the translator's stack is
    /// popped until it sees an Alternation frame.
    Alternation,
    /// This is pushed immediately before each sub-expression in an
    /// alternation. This separates the branches of an alternation on the
    /// stack and prevents literal flattening from reaching across alternation
    /// branches.
    ///
    /// It is popped after each expression in a branch until an 'Alternation'
    /// frame is observed when doing a post visit on an alternation.
    AlternationBranch,
}
250 
251 impl HirFrame {
252     /// Assert that the current stack frame is an Hir expression and return it.
unwrap_expr(self) -> Hir253     fn unwrap_expr(self) -> Hir {
254         match self {
255             HirFrame::Expr(expr) => expr,
256             HirFrame::Literal(lit) => Hir::literal(lit),
257             _ => panic!("tried to unwrap expr from HirFrame, got: {:?}", self),
258         }
259     }
260 
261     /// Assert that the current stack frame is a Unicode class expression and
262     /// return it.
unwrap_class_unicode(self) -> hir::ClassUnicode263     fn unwrap_class_unicode(self) -> hir::ClassUnicode {
264         match self {
265             HirFrame::ClassUnicode(cls) => cls,
266             _ => panic!(
267                 "tried to unwrap Unicode class \
268                  from HirFrame, got: {:?}",
269                 self
270             ),
271         }
272     }
273 
274     /// Assert that the current stack frame is a byte class expression and
275     /// return it.
unwrap_class_bytes(self) -> hir::ClassBytes276     fn unwrap_class_bytes(self) -> hir::ClassBytes {
277         match self {
278             HirFrame::ClassBytes(cls) => cls,
279             _ => panic!(
280                 "tried to unwrap byte class \
281                  from HirFrame, got: {:?}",
282                 self
283             ),
284         }
285     }
286 
287     /// Assert that the current stack frame is a repetition sentinel. If it
288     /// isn't, then panic.
unwrap_repetition(self)289     fn unwrap_repetition(self) {
290         match self {
291             HirFrame::Repetition => {}
292             _ => {
293                 panic!(
294                     "tried to unwrap repetition from HirFrame, got: {:?}",
295                     self
296                 )
297             }
298         }
299     }
300 
301     /// Assert that the current stack frame is a group indicator and return
302     /// its corresponding flags (the flags that were active at the time the
303     /// group was entered).
unwrap_group(self) -> Flags304     fn unwrap_group(self) -> Flags {
305         match self {
306             HirFrame::Group { old_flags } => old_flags,
307             _ => {
308                 panic!("tried to unwrap group from HirFrame, got: {:?}", self)
309             }
310         }
311     }
312 
313     /// Assert that the current stack frame is an alternation pipe sentinel. If
314     /// it isn't, then panic.
unwrap_alternation_pipe(self)315     fn unwrap_alternation_pipe(self) {
316         match self {
317             HirFrame::AlternationBranch => {}
318             _ => {
319                 panic!(
320                     "tried to unwrap alt pipe from HirFrame, got: {:?}",
321                     self
322                 )
323             }
324         }
325     }
326 }
327 
328 impl<'t, 'p> Visitor for TranslatorI<'t, 'p> {
329     type Output = Hir;
330     type Err = Error;
331 
finish(self) -> Result<Hir>332     fn finish(self) -> Result<Hir> {
333         // ... otherwise, we should have exactly one HIR on the stack.
334         assert_eq!(self.trans().stack.borrow().len(), 1);
335         Ok(self.pop().unwrap().unwrap_expr())
336     }
337 
visit_pre(&mut self, ast: &Ast) -> Result<()>338     fn visit_pre(&mut self, ast: &Ast) -> Result<()> {
339         match *ast {
340             Ast::ClassBracketed(_) => {
341                 if self.flags().unicode() {
342                     let cls = hir::ClassUnicode::empty();
343                     self.push(HirFrame::ClassUnicode(cls));
344                 } else {
345                     let cls = hir::ClassBytes::empty();
346                     self.push(HirFrame::ClassBytes(cls));
347                 }
348             }
349             Ast::Repetition(_) => self.push(HirFrame::Repetition),
350             Ast::Group(ref x) => {
351                 let old_flags = x
352                     .flags()
353                     .map(|ast| self.set_flags(ast))
354                     .unwrap_or_else(|| self.flags());
355                 self.push(HirFrame::Group { old_flags });
356             }
357             Ast::Concat(_) => {
358                 self.push(HirFrame::Concat);
359             }
360             Ast::Alternation(ref x) => {
361                 self.push(HirFrame::Alternation);
362                 if !x.asts.is_empty() {
363                     self.push(HirFrame::AlternationBranch);
364                 }
365             }
366             _ => {}
367         }
368         Ok(())
369     }
370 
    /// Translate `ast` after all of its children have been visited.
    ///
    /// Leaf nodes push a freshly built frame. Inductive nodes pop the frames
    /// their children left behind, together with the frame pushed for them in
    /// `visit_pre`, and push a single `Expr` frame in their place.
    ///
    /// Returns an error as soon as any of the fallible helpers fails; frames
    /// already on the stack at that point are left there.
    fn visit_post(&mut self, ast: &Ast) -> Result<()> {
        match *ast {
            Ast::Empty(_) => {
                self.push(HirFrame::Expr(Hir::empty()));
            }
            Ast::Flags(ref x) => {
                // The previously active flags returned by `set_flags` are
                // deliberately not kept: a bare flag directive has no closing
                // point at which they would be restored.
                self.set_flags(&x.flags);
                // Flags in the AST are generally considered directives and
                // not actual sub-expressions. However, they can be used in
                // the concrete syntax like `((?i))`, and we need some kind of
                // indication of an expression there, and Empty is the correct
                // choice.
                //
                // There can also be things like `(?i)+`, but we rule those out
                // in the parser. In the future, we might allow them for
                // consistency sake.
                self.push(HirFrame::Expr(Hir::empty()));
            }
            Ast::Literal(ref x) => match self.ast_literal_to_scalar(x)? {
                // A raw byte extends (or starts) the literal on top of the
                // stack.
                Either::Right(byte) => self.push_byte(byte),
                // A char does the same unless `case_fold_char` produced a
                // replacement expression for it, in which case that expression
                // is pushed as its own frame.
                Either::Left(ch) => match self.case_fold_char(x.span, ch)? {
                    None => self.push_char(ch),
                    Some(expr) => self.push(HirFrame::Expr(expr)),
                },
            },
            Ast::Dot(ref span) => {
                self.push(HirFrame::Expr(self.hir_dot(**span)?));
            }
            Ast::Assertion(ref x) => {
                self.push(HirFrame::Expr(self.hir_assertion(x)?));
            }
            Ast::ClassPerl(ref x) => {
                // Perl classes outside of brackets become a stand-alone class
                // expression of the kind selected by the Unicode flag.
                if self.flags().unicode() {
                    let cls = self.hir_perl_unicode_class(x)?;
                    let hcls = hir::Class::Unicode(cls);
                    self.push(HirFrame::Expr(Hir::class(hcls)));
                } else {
                    let cls = self.hir_perl_byte_class(x)?;
                    let hcls = hir::Class::Bytes(cls);
                    self.push(HirFrame::Expr(Hir::class(hcls)));
                }
            }
            Ast::ClassUnicode(ref x) => {
                let cls = hir::Class::Unicode(self.hir_unicode_class(x)?);
                self.push(HirFrame::Expr(Hir::class(cls)));
            }
            Ast::ClassBracketed(ref ast) => {
                // The accumulator pushed in `visit_pre` now holds every item
                // of the class. Finish it (case folding and negation) and turn
                // it into an expression frame.
                if self.flags().unicode() {
                    let mut cls = self.pop().unwrap().unwrap_class_unicode();
                    self.unicode_fold_and_negate(
                        &ast.span,
                        ast.negated,
                        &mut cls,
                    )?;
                    let expr = Hir::class(hir::Class::Unicode(cls));
                    self.push(HirFrame::Expr(expr));
                } else {
                    let mut cls = self.pop().unwrap().unwrap_class_bytes();
                    self.bytes_fold_and_negate(
                        &ast.span,
                        ast.negated,
                        &mut cls,
                    )?;
                    let expr = Hir::class(hir::Class::Bytes(cls));
                    self.push(HirFrame::Expr(expr));
                }
            }
            Ast::Repetition(ref x) => {
                // Stack layout is [.., Repetition, <sub-expression>], so the
                // expression comes off first and the sentinel second.
                let expr = self.pop().unwrap().unwrap_expr();
                self.pop().unwrap().unwrap_repetition();
                self.push(HirFrame::Expr(self.hir_repetition(x, expr)));
            }
            Ast::Group(ref x) => {
                // Stack layout is [.., Group, <sub-expression>]. Leaving the
                // group reinstates the flags that were active when it opened.
                let expr = self.pop().unwrap().unwrap_expr();
                let old_flags = self.pop().unwrap().unwrap_group();
                self.trans().flags.set(old_flags);
                self.push(HirFrame::Expr(self.hir_capture(x, expr)));
            }
            Ast::Concat(_) => {
                // Collect everything above the Concat marker, dropping empty
                // expressions since they contribute nothing to a
                // concatenation.
                let mut exprs = vec![];
                while let Some(expr) = self.pop_concat_expr() {
                    if !matches!(*expr.kind(), HirKind::Empty) {
                        exprs.push(expr);
                    }
                }
                // Popping yields the sub-expressions last-to-first.
                exprs.reverse();
                self.push(HirFrame::Expr(Hir::concat(exprs)));
            }
            Ast::Alternation(_) => {
                // Every branch expression sits directly above its
                // AlternationBranch marker, so each successful pop of an
                // expression is followed by popping that marker.
                let mut exprs = vec![];
                while let Some(expr) = self.pop_alt_expr() {
                    self.pop().unwrap().unwrap_alternation_pipe();
                    exprs.push(expr);
                }
                // Popping yields the branches last-to-first.
                exprs.reverse();
                self.push(HirFrame::Expr(Hir::alternation(exprs)));
            }
        }
        Ok(())
    }
471 
visit_alternation_in(&mut self) -> Result<()>472     fn visit_alternation_in(&mut self) -> Result<()> {
473         self.push(HirFrame::AlternationBranch);
474         Ok(())
475     }
476 
visit_class_set_item_pre( &mut self, ast: &ast::ClassSetItem, ) -> Result<()>477     fn visit_class_set_item_pre(
478         &mut self,
479         ast: &ast::ClassSetItem,
480     ) -> Result<()> {
481         match *ast {
482             ast::ClassSetItem::Bracketed(_) => {
483                 if self.flags().unicode() {
484                     let cls = hir::ClassUnicode::empty();
485                     self.push(HirFrame::ClassUnicode(cls));
486                 } else {
487                     let cls = hir::ClassBytes::empty();
488                     self.push(HirFrame::ClassBytes(cls));
489                 }
490             }
491             // We needn't handle the Union case here since the visitor will
492             // do it for us.
493             _ => {}
494         }
495         Ok(())
496     }
497 
    /// Fold a fully visited class set item into the class accumulator on top
    /// of the stack.
    ///
    /// Each arm pops the accumulator, adds the item's ranges to it and pushes
    /// it back. Which kind of accumulator is expected (Unicode or bytes)
    /// follows the Unicode flag, except for `\p{..}` items which always use a
    /// Unicode class.
    fn visit_class_set_item_post(
        &mut self,
        ast: &ast::ClassSetItem,
    ) -> Result<()> {
        match *ast {
            ast::ClassSetItem::Empty(_) => {}
            ast::ClassSetItem::Literal(ref x) => {
                if self.flags().unicode() {
                    let mut cls = self.pop().unwrap().unwrap_class_unicode();
                    // A single char is the degenerate range `c-c`.
                    cls.push(hir::ClassUnicodeRange::new(x.c, x.c));
                    self.push(HirFrame::ClassUnicode(cls));
                } else {
                    let mut cls = self.pop().unwrap().unwrap_class_bytes();
                    // Note that the accumulator has already been popped if
                    // the conversion to a byte fails.
                    let byte = self.class_literal_byte(x)?;
                    cls.push(hir::ClassBytesRange::new(byte, byte));
                    self.push(HirFrame::ClassBytes(cls));
                }
            }
            ast::ClassSetItem::Range(ref x) => {
                if self.flags().unicode() {
                    let mut cls = self.pop().unwrap().unwrap_class_unicode();
                    cls.push(hir::ClassUnicodeRange::new(x.start.c, x.end.c));
                    self.push(HirFrame::ClassUnicode(cls));
                } else {
                    let mut cls = self.pop().unwrap().unwrap_class_bytes();
                    let start = self.class_literal_byte(&x.start)?;
                    let end = self.class_literal_byte(&x.end)?;
                    cls.push(hir::ClassBytesRange::new(start, end));
                    self.push(HirFrame::ClassBytes(cls));
                }
            }
            ast::ClassSetItem::Ascii(ref x) => {
                // The item's class is built before the accumulator is popped,
                // so a failure here leaves the stack untouched.
                if self.flags().unicode() {
                    let xcls = self.hir_ascii_unicode_class(x)?;
                    let mut cls = self.pop().unwrap().unwrap_class_unicode();
                    cls.union(&xcls);
                    self.push(HirFrame::ClassUnicode(cls));
                } else {
                    let xcls = self.hir_ascii_byte_class(x)?;
                    let mut cls = self.pop().unwrap().unwrap_class_bytes();
                    cls.union(&xcls);
                    self.push(HirFrame::ClassBytes(cls));
                }
            }
            ast::ClassSetItem::Unicode(ref x) => {
                // No flag check here: the accumulator is unconditionally
                // expected to be a Unicode class.
                // NOTE(review): presumably `hir_unicode_class` fails when
                // Unicode mode is off, so the pop below is never reached with
                // a byte class on the stack -- confirm.
                let xcls = self.hir_unicode_class(x)?;
                let mut cls = self.pop().unwrap().unwrap_class_unicode();
                cls.union(&xcls);
                self.push(HirFrame::ClassUnicode(cls));
            }
            ast::ClassSetItem::Perl(ref x) => {
                if self.flags().unicode() {
                    let xcls = self.hir_perl_unicode_class(x)?;
                    let mut cls = self.pop().unwrap().unwrap_class_unicode();
                    cls.union(&xcls);
                    self.push(HirFrame::ClassUnicode(cls));
                } else {
                    let xcls = self.hir_perl_byte_class(x)?;
                    let mut cls = self.pop().unwrap().unwrap_class_bytes();
                    cls.union(&xcls);
                    self.push(HirFrame::ClassBytes(cls));
                }
            }
            ast::ClassSetItem::Bracketed(ref ast) => {
                // Two accumulators are on the stack: the nested class (pushed
                // in `visit_class_set_item_pre`) on top of the enclosing one.
                // The nested class is finished first (case folding and
                // negation) and only then merged into the enclosing class.
                if self.flags().unicode() {
                    let mut cls1 = self.pop().unwrap().unwrap_class_unicode();
                    self.unicode_fold_and_negate(
                        &ast.span,
                        ast.negated,
                        &mut cls1,
                    )?;

                    let mut cls2 = self.pop().unwrap().unwrap_class_unicode();
                    cls2.union(&cls1);
                    self.push(HirFrame::ClassUnicode(cls2));
                } else {
                    let mut cls1 = self.pop().unwrap().unwrap_class_bytes();
                    self.bytes_fold_and_negate(
                        &ast.span,
                        ast.negated,
                        &mut cls1,
                    )?;

                    let mut cls2 = self.pop().unwrap().unwrap_class_bytes();
                    cls2.union(&cls1);
                    self.push(HirFrame::ClassBytes(cls2));
                }
            }
            // This is handled automatically by the visitor.
            ast::ClassSetItem::Union(_) => {}
        }
        Ok(())
    }
591 
visit_class_set_binary_op_pre( &mut self, _op: &ast::ClassSetBinaryOp, ) -> Result<()>592     fn visit_class_set_binary_op_pre(
593         &mut self,
594         _op: &ast::ClassSetBinaryOp,
595     ) -> Result<()> {
596         if self.flags().unicode() {
597             let cls = hir::ClassUnicode::empty();
598             self.push(HirFrame::ClassUnicode(cls));
599         } else {
600             let cls = hir::ClassBytes::empty();
601             self.push(HirFrame::ClassBytes(cls));
602         }
603         Ok(())
604     }
605 
visit_class_set_binary_op_in( &mut self, _op: &ast::ClassSetBinaryOp, ) -> Result<()>606     fn visit_class_set_binary_op_in(
607         &mut self,
608         _op: &ast::ClassSetBinaryOp,
609     ) -> Result<()> {
610         if self.flags().unicode() {
611             let cls = hir::ClassUnicode::empty();
612             self.push(HirFrame::ClassUnicode(cls));
613         } else {
614             let cls = hir::ClassBytes::empty();
615             self.push(HirFrame::ClassBytes(cls));
616         }
617         Ok(())
618     }
619 
visit_class_set_binary_op_post( &mut self, op: &ast::ClassSetBinaryOp, ) -> Result<()>620     fn visit_class_set_binary_op_post(
621         &mut self,
622         op: &ast::ClassSetBinaryOp,
623     ) -> Result<()> {
624         use crate::ast::ClassSetBinaryOpKind::*;
625 
626         if self.flags().unicode() {
627             let mut rhs = self.pop().unwrap().unwrap_class_unicode();
628             let mut lhs = self.pop().unwrap().unwrap_class_unicode();
629             let mut cls = self.pop().unwrap().unwrap_class_unicode();
630             if self.flags().case_insensitive() {
631                 rhs.try_case_fold_simple().map_err(|_| {
632                     self.error(
633                         op.rhs.span().clone(),
634                         ErrorKind::UnicodeCaseUnavailable,
635                     )
636                 })?;
637                 lhs.try_case_fold_simple().map_err(|_| {
638                     self.error(
639                         op.lhs.span().clone(),
640                         ErrorKind::UnicodeCaseUnavailable,
641                     )
642                 })?;
643             }
644             match op.kind {
645                 Intersection => lhs.intersect(&rhs),
646                 Difference => lhs.difference(&rhs),
647                 SymmetricDifference => lhs.symmetric_difference(&rhs),
648             }
649             cls.union(&lhs);
650             self.push(HirFrame::ClassUnicode(cls));
651         } else {
652             let mut rhs = self.pop().unwrap().unwrap_class_bytes();
653             let mut lhs = self.pop().unwrap().unwrap_class_bytes();
654             let mut cls = self.pop().unwrap().unwrap_class_bytes();
655             if self.flags().case_insensitive() {
656                 rhs.case_fold_simple();
657                 lhs.case_fold_simple();
658             }
659             match op.kind {
660                 Intersection => lhs.intersect(&rhs),
661                 Difference => lhs.difference(&rhs),
662                 SymmetricDifference => lhs.symmetric_difference(&rhs),
663             }
664             cls.union(&lhs);
665             self.push(HirFrame::ClassBytes(cls));
666         }
667         Ok(())
668     }
669 }
670 
/// The internal implementation of a translator.
///
/// This type is responsible for carrying around the original pattern string,
/// which is not tied to the internal state of a translator.
///
/// A TranslatorI exists for the time it takes to translate a single Ast.
#[derive(Clone, Debug)]
struct TranslatorI<'t, 'p> {
    /// The translator whose frame stack and active flags this traversal
    /// reads and updates (through the translator's interior mutability).
    trans: &'t Translator,
    /// The pattern string that the Ast being translated was parsed from.
    pattern: &'p str,
}
682 
683 impl<'t, 'p> TranslatorI<'t, 'p> {
684     /// Build a new internal translator.
new(trans: &'t Translator, pattern: &'p str) -> TranslatorI<'t, 'p>685     fn new(trans: &'t Translator, pattern: &'p str) -> TranslatorI<'t, 'p> {
686         TranslatorI { trans, pattern }
687     }
688 
689     /// Return a reference to the underlying translator.
trans(&self) -> &Translator690     fn trans(&self) -> &Translator {
691         &self.trans
692     }
693 
694     /// Push the given frame on to the call stack.
push(&self, frame: HirFrame)695     fn push(&self, frame: HirFrame) {
696         self.trans().stack.borrow_mut().push(frame);
697     }
698 
699     /// Push the given literal char on to the call stack.
700     ///
701     /// If the top-most element of the stack is a literal, then the char
702     /// is appended to the end of that literal. Otherwise, a new literal
703     /// containing just the given char is pushed to the top of the stack.
push_char(&self, ch: char)704     fn push_char(&self, ch: char) {
705         let mut buf = [0; 4];
706         let bytes = ch.encode_utf8(&mut buf).as_bytes();
707         let mut stack = self.trans().stack.borrow_mut();
708         if let Some(HirFrame::Literal(ref mut literal)) = stack.last_mut() {
709             literal.extend_from_slice(bytes);
710         } else {
711             stack.push(HirFrame::Literal(bytes.to_vec()));
712         }
713     }
714 
715     /// Push the given literal byte on to the call stack.
716     ///
717     /// If the top-most element of the stack is a literal, then the byte
718     /// is appended to the end of that literal. Otherwise, a new literal
719     /// containing just the given byte is pushed to the top of the stack.
push_byte(&self, byte: u8)720     fn push_byte(&self, byte: u8) {
721         let mut stack = self.trans().stack.borrow_mut();
722         if let Some(HirFrame::Literal(ref mut literal)) = stack.last_mut() {
723             literal.push(byte);
724         } else {
725             stack.push(HirFrame::Literal(vec![byte]));
726         }
727     }
728 
729     /// Pop the top of the call stack. If the call stack is empty, return None.
pop(&self) -> Option<HirFrame>730     fn pop(&self) -> Option<HirFrame> {
731         self.trans().stack.borrow_mut().pop()
732     }
733 
734     /// Pop an HIR expression from the top of the stack for a concatenation.
735     ///
736     /// This returns None if the stack is empty or when a concat frame is seen.
737     /// Otherwise, it panics if it could not find an HIR expression.
pop_concat_expr(&self) -> Option<Hir>738     fn pop_concat_expr(&self) -> Option<Hir> {
739         let frame = self.pop()?;
740         match frame {
741             HirFrame::Concat => None,
742             HirFrame::Expr(expr) => Some(expr),
743             HirFrame::Literal(lit) => Some(Hir::literal(lit)),
744             HirFrame::ClassUnicode(_) => {
745                 unreachable!("expected expr or concat, got Unicode class")
746             }
747             HirFrame::ClassBytes(_) => {
748                 unreachable!("expected expr or concat, got byte class")
749             }
750             HirFrame::Repetition => {
751                 unreachable!("expected expr or concat, got repetition")
752             }
753             HirFrame::Group { .. } => {
754                 unreachable!("expected expr or concat, got group")
755             }
756             HirFrame::Alternation => {
757                 unreachable!("expected expr or concat, got alt marker")
758             }
759             HirFrame::AlternationBranch => {
760                 unreachable!("expected expr or concat, got alt branch marker")
761             }
762         }
763     }
764 
765     /// Pop an HIR expression from the top of the stack for an alternation.
766     ///
767     /// This returns None if the stack is empty or when an alternation frame is
768     /// seen. Otherwise, it panics if it could not find an HIR expression.
pop_alt_expr(&self) -> Option<Hir>769     fn pop_alt_expr(&self) -> Option<Hir> {
770         let frame = self.pop()?;
771         match frame {
772             HirFrame::Alternation => None,
773             HirFrame::Expr(expr) => Some(expr),
774             HirFrame::Literal(lit) => Some(Hir::literal(lit)),
775             HirFrame::ClassUnicode(_) => {
776                 unreachable!("expected expr or alt, got Unicode class")
777             }
778             HirFrame::ClassBytes(_) => {
779                 unreachable!("expected expr or alt, got byte class")
780             }
781             HirFrame::Repetition => {
782                 unreachable!("expected expr or alt, got repetition")
783             }
784             HirFrame::Group { .. } => {
785                 unreachable!("expected expr or alt, got group")
786             }
787             HirFrame::Concat => {
788                 unreachable!("expected expr or alt, got concat marker")
789             }
790             HirFrame::AlternationBranch => {
791                 unreachable!("expected expr or alt, got alt branch marker")
792             }
793         }
794     }
795 
796     /// Create a new error with the given span and error type.
error(&self, span: Span, kind: ErrorKind) -> Error797     fn error(&self, span: Span, kind: ErrorKind) -> Error {
798         Error { kind, pattern: self.pattern.to_string(), span }
799     }
800 
801     /// Return a copy of the active flags.
flags(&self) -> Flags802     fn flags(&self) -> Flags {
803         self.trans().flags.get()
804     }
805 
806     /// Set the flags of this translator from the flags set in the given AST.
807     /// Then, return the old flags.
set_flags(&self, ast_flags: &ast::Flags) -> Flags808     fn set_flags(&self, ast_flags: &ast::Flags) -> Flags {
809         let old_flags = self.flags();
810         let mut new_flags = Flags::from_ast(ast_flags);
811         new_flags.merge(&old_flags);
812         self.trans().flags.set(new_flags);
813         old_flags
814     }
815 
816     /// Convert an Ast literal to its scalar representation.
817     ///
818     /// When Unicode mode is enabled, then this always succeeds and returns a
819     /// `char` (Unicode scalar value).
820     ///
821     /// When Unicode mode is disabled, then a `char` will still be returned
822     /// whenever possible. A byte is returned only when invalid UTF-8 is
823     /// allowed and when the byte is not ASCII. Otherwise, a non-ASCII byte
824     /// will result in an error when invalid UTF-8 is not allowed.
ast_literal_to_scalar( &self, lit: &ast::Literal, ) -> Result<Either<char, u8>>825     fn ast_literal_to_scalar(
826         &self,
827         lit: &ast::Literal,
828     ) -> Result<Either<char, u8>> {
829         if self.flags().unicode() {
830             return Ok(Either::Left(lit.c));
831         }
832         let byte = match lit.byte() {
833             None => return Ok(Either::Left(lit.c)),
834             Some(byte) => byte,
835         };
836         if byte <= 0x7F {
837             return Ok(Either::Left(char::try_from(byte).unwrap()));
838         }
839         if self.trans().utf8 {
840             return Err(self.error(lit.span, ErrorKind::InvalidUtf8));
841         }
842         Ok(Either::Right(byte))
843     }
844 
case_fold_char(&self, span: Span, c: char) -> Result<Option<Hir>>845     fn case_fold_char(&self, span: Span, c: char) -> Result<Option<Hir>> {
846         if !self.flags().case_insensitive() {
847             return Ok(None);
848         }
849         if self.flags().unicode() {
850             // If case folding won't do anything, then don't bother trying.
851             let map = unicode::SimpleCaseFolder::new()
852                 .map(|f| f.overlaps(c, c))
853                 .map_err(|_| {
854                     self.error(span, ErrorKind::UnicodeCaseUnavailable)
855                 })?;
856             if !map {
857                 return Ok(None);
858             }
859             let mut cls =
860                 hir::ClassUnicode::new(vec![hir::ClassUnicodeRange::new(
861                     c, c,
862                 )]);
863             cls.try_case_fold_simple().map_err(|_| {
864                 self.error(span, ErrorKind::UnicodeCaseUnavailable)
865             })?;
866             Ok(Some(Hir::class(hir::Class::Unicode(cls))))
867         } else {
868             if !c.is_ascii() {
869                 return Ok(None);
870             }
871             // If case folding won't do anything, then don't bother trying.
872             match c {
873                 'A'..='Z' | 'a'..='z' => {}
874                 _ => return Ok(None),
875             }
876             let mut cls =
877                 hir::ClassBytes::new(vec![hir::ClassBytesRange::new(
878                     // OK because 'c.len_utf8() == 1' which in turn implies
879                     // that 'c' is ASCII.
880                     u8::try_from(c).unwrap(),
881                     u8::try_from(c).unwrap(),
882                 )]);
883             cls.case_fold_simple();
884             Ok(Some(Hir::class(hir::Class::Bytes(cls))))
885         }
886     }
887 
hir_dot(&self, span: Span) -> Result<Hir>888     fn hir_dot(&self, span: Span) -> Result<Hir> {
889         let (utf8, lineterm, flags) =
890             (self.trans().utf8, self.trans().line_terminator, self.flags());
891         if utf8 && (!flags.unicode() || !lineterm.is_ascii()) {
892             return Err(self.error(span, ErrorKind::InvalidUtf8));
893         }
894         let dot = if flags.dot_matches_new_line() {
895             if flags.unicode() {
896                 hir::Dot::AnyChar
897             } else {
898                 hir::Dot::AnyByte
899             }
900         } else {
901             if flags.unicode() {
902                 if flags.crlf() {
903                     hir::Dot::AnyCharExceptCRLF
904                 } else {
905                     if !lineterm.is_ascii() {
906                         return Err(
907                             self.error(span, ErrorKind::InvalidLineTerminator)
908                         );
909                     }
910                     hir::Dot::AnyCharExcept(char::from(lineterm))
911                 }
912             } else {
913                 if flags.crlf() {
914                     hir::Dot::AnyByteExceptCRLF
915                 } else {
916                     hir::Dot::AnyByteExcept(lineterm)
917                 }
918             }
919         };
920         Ok(Hir::dot(dot))
921     }
922 
hir_assertion(&self, asst: &ast::Assertion) -> Result<Hir>923     fn hir_assertion(&self, asst: &ast::Assertion) -> Result<Hir> {
924         let unicode = self.flags().unicode();
925         let multi_line = self.flags().multi_line();
926         let crlf = self.flags().crlf();
927         Ok(match asst.kind {
928             ast::AssertionKind::StartLine => Hir::look(if multi_line {
929                 if crlf {
930                     hir::Look::StartCRLF
931                 } else {
932                     hir::Look::StartLF
933                 }
934             } else {
935                 hir::Look::Start
936             }),
937             ast::AssertionKind::EndLine => Hir::look(if multi_line {
938                 if crlf {
939                     hir::Look::EndCRLF
940                 } else {
941                     hir::Look::EndLF
942                 }
943             } else {
944                 hir::Look::End
945             }),
946             ast::AssertionKind::StartText => Hir::look(hir::Look::Start),
947             ast::AssertionKind::EndText => Hir::look(hir::Look::End),
948             ast::AssertionKind::WordBoundary => Hir::look(if unicode {
949                 hir::Look::WordUnicode
950             } else {
951                 hir::Look::WordAscii
952             }),
953             ast::AssertionKind::NotWordBoundary => Hir::look(if unicode {
954                 hir::Look::WordUnicodeNegate
955             } else {
956                 hir::Look::WordAsciiNegate
957             }),
958             ast::AssertionKind::WordBoundaryStart
959             | ast::AssertionKind::WordBoundaryStartAngle => {
960                 Hir::look(if unicode {
961                     hir::Look::WordStartUnicode
962                 } else {
963                     hir::Look::WordStartAscii
964                 })
965             }
966             ast::AssertionKind::WordBoundaryEnd
967             | ast::AssertionKind::WordBoundaryEndAngle => {
968                 Hir::look(if unicode {
969                     hir::Look::WordEndUnicode
970                 } else {
971                     hir::Look::WordEndAscii
972                 })
973             }
974             ast::AssertionKind::WordBoundaryStartHalf => {
975                 Hir::look(if unicode {
976                     hir::Look::WordStartHalfUnicode
977                 } else {
978                     hir::Look::WordStartHalfAscii
979                 })
980             }
981             ast::AssertionKind::WordBoundaryEndHalf => Hir::look(if unicode {
982                 hir::Look::WordEndHalfUnicode
983             } else {
984                 hir::Look::WordEndHalfAscii
985             }),
986         })
987     }
988 
hir_capture(&self, group: &ast::Group, expr: Hir) -> Hir989     fn hir_capture(&self, group: &ast::Group, expr: Hir) -> Hir {
990         let (index, name) = match group.kind {
991             ast::GroupKind::CaptureIndex(index) => (index, None),
992             ast::GroupKind::CaptureName { ref name, .. } => {
993                 (name.index, Some(name.name.clone().into_boxed_str()))
994             }
995             // The HIR doesn't need to use non-capturing groups, since the way
996             // in which the data type is defined handles this automatically.
997             ast::GroupKind::NonCapturing(_) => return expr,
998         };
999         Hir::capture(hir::Capture { index, name, sub: Box::new(expr) })
1000     }
1001 
hir_repetition(&self, rep: &ast::Repetition, expr: Hir) -> Hir1002     fn hir_repetition(&self, rep: &ast::Repetition, expr: Hir) -> Hir {
1003         let (min, max) = match rep.op.kind {
1004             ast::RepetitionKind::ZeroOrOne => (0, Some(1)),
1005             ast::RepetitionKind::ZeroOrMore => (0, None),
1006             ast::RepetitionKind::OneOrMore => (1, None),
1007             ast::RepetitionKind::Range(ast::RepetitionRange::Exactly(m)) => {
1008                 (m, Some(m))
1009             }
1010             ast::RepetitionKind::Range(ast::RepetitionRange::AtLeast(m)) => {
1011                 (m, None)
1012             }
1013             ast::RepetitionKind::Range(ast::RepetitionRange::Bounded(
1014                 m,
1015                 n,
1016             )) => (m, Some(n)),
1017         };
1018         let greedy =
1019             if self.flags().swap_greed() { !rep.greedy } else { rep.greedy };
1020         Hir::repetition(hir::Repetition {
1021             min,
1022             max,
1023             greedy,
1024             sub: Box::new(expr),
1025         })
1026     }
1027 
hir_unicode_class( &self, ast_class: &ast::ClassUnicode, ) -> Result<hir::ClassUnicode>1028     fn hir_unicode_class(
1029         &self,
1030         ast_class: &ast::ClassUnicode,
1031     ) -> Result<hir::ClassUnicode> {
1032         use crate::ast::ClassUnicodeKind::*;
1033 
1034         if !self.flags().unicode() {
1035             return Err(
1036                 self.error(ast_class.span, ErrorKind::UnicodeNotAllowed)
1037             );
1038         }
1039         let query = match ast_class.kind {
1040             OneLetter(name) => ClassQuery::OneLetter(name),
1041             Named(ref name) => ClassQuery::Binary(name),
1042             NamedValue { ref name, ref value, .. } => ClassQuery::ByValue {
1043                 property_name: name,
1044                 property_value: value,
1045             },
1046         };
1047         let mut result = self.convert_unicode_class_error(
1048             &ast_class.span,
1049             unicode::class(query),
1050         );
1051         if let Ok(ref mut class) = result {
1052             self.unicode_fold_and_negate(
1053                 &ast_class.span,
1054                 ast_class.negated,
1055                 class,
1056             )?;
1057         }
1058         result
1059     }
1060 
hir_ascii_unicode_class( &self, ast: &ast::ClassAscii, ) -> Result<hir::ClassUnicode>1061     fn hir_ascii_unicode_class(
1062         &self,
1063         ast: &ast::ClassAscii,
1064     ) -> Result<hir::ClassUnicode> {
1065         let mut cls = hir::ClassUnicode::new(
1066             ascii_class_as_chars(&ast.kind)
1067                 .map(|(s, e)| hir::ClassUnicodeRange::new(s, e)),
1068         );
1069         self.unicode_fold_and_negate(&ast.span, ast.negated, &mut cls)?;
1070         Ok(cls)
1071     }
1072 
hir_ascii_byte_class( &self, ast: &ast::ClassAscii, ) -> Result<hir::ClassBytes>1073     fn hir_ascii_byte_class(
1074         &self,
1075         ast: &ast::ClassAscii,
1076     ) -> Result<hir::ClassBytes> {
1077         let mut cls = hir::ClassBytes::new(
1078             ascii_class(&ast.kind)
1079                 .map(|(s, e)| hir::ClassBytesRange::new(s, e)),
1080         );
1081         self.bytes_fold_and_negate(&ast.span, ast.negated, &mut cls)?;
1082         Ok(cls)
1083     }
1084 
hir_perl_unicode_class( &self, ast_class: &ast::ClassPerl, ) -> Result<hir::ClassUnicode>1085     fn hir_perl_unicode_class(
1086         &self,
1087         ast_class: &ast::ClassPerl,
1088     ) -> Result<hir::ClassUnicode> {
1089         use crate::ast::ClassPerlKind::*;
1090 
1091         assert!(self.flags().unicode());
1092         let result = match ast_class.kind {
1093             Digit => unicode::perl_digit(),
1094             Space => unicode::perl_space(),
1095             Word => unicode::perl_word(),
1096         };
1097         let mut class =
1098             self.convert_unicode_class_error(&ast_class.span, result)?;
1099         // We needn't apply case folding here because the Perl Unicode classes
1100         // are already closed under Unicode simple case folding.
1101         if ast_class.negated {
1102             class.negate();
1103         }
1104         Ok(class)
1105     }
1106 
hir_perl_byte_class( &self, ast_class: &ast::ClassPerl, ) -> Result<hir::ClassBytes>1107     fn hir_perl_byte_class(
1108         &self,
1109         ast_class: &ast::ClassPerl,
1110     ) -> Result<hir::ClassBytes> {
1111         use crate::ast::ClassPerlKind::*;
1112 
1113         assert!(!self.flags().unicode());
1114         let mut class = match ast_class.kind {
1115             Digit => hir_ascii_class_bytes(&ast::ClassAsciiKind::Digit),
1116             Space => hir_ascii_class_bytes(&ast::ClassAsciiKind::Space),
1117             Word => hir_ascii_class_bytes(&ast::ClassAsciiKind::Word),
1118         };
1119         // We needn't apply case folding here because the Perl ASCII classes
1120         // are already closed (under ASCII case folding).
1121         if ast_class.negated {
1122             class.negate();
1123         }
1124         // Negating a Perl byte class is likely to cause it to match invalid
1125         // UTF-8. That's only OK if the translator is configured to allow such
1126         // things.
1127         if self.trans().utf8 && !class.is_ascii() {
1128             return Err(self.error(ast_class.span, ErrorKind::InvalidUtf8));
1129         }
1130         Ok(class)
1131     }
1132 
1133     /// Converts the given Unicode specific error to an HIR translation error.
1134     ///
1135     /// The span given should approximate the position at which an error would
1136     /// occur.
convert_unicode_class_error( &self, span: &Span, result: core::result::Result<hir::ClassUnicode, unicode::Error>, ) -> Result<hir::ClassUnicode>1137     fn convert_unicode_class_error(
1138         &self,
1139         span: &Span,
1140         result: core::result::Result<hir::ClassUnicode, unicode::Error>,
1141     ) -> Result<hir::ClassUnicode> {
1142         result.map_err(|err| {
1143             let sp = span.clone();
1144             match err {
1145                 unicode::Error::PropertyNotFound => {
1146                     self.error(sp, ErrorKind::UnicodePropertyNotFound)
1147                 }
1148                 unicode::Error::PropertyValueNotFound => {
1149                     self.error(sp, ErrorKind::UnicodePropertyValueNotFound)
1150                 }
1151                 unicode::Error::PerlClassNotFound => {
1152                     self.error(sp, ErrorKind::UnicodePerlClassNotFound)
1153                 }
1154             }
1155         })
1156     }
1157 
unicode_fold_and_negate( &self, span: &Span, negated: bool, class: &mut hir::ClassUnicode, ) -> Result<()>1158     fn unicode_fold_and_negate(
1159         &self,
1160         span: &Span,
1161         negated: bool,
1162         class: &mut hir::ClassUnicode,
1163     ) -> Result<()> {
1164         // Note that we must apply case folding before negation!
1165         // Consider `(?i)[^x]`. If we applied negation first, then
1166         // the result would be the character class that matched any
1167         // Unicode scalar value.
1168         if self.flags().case_insensitive() {
1169             class.try_case_fold_simple().map_err(|_| {
1170                 self.error(span.clone(), ErrorKind::UnicodeCaseUnavailable)
1171             })?;
1172         }
1173         if negated {
1174             class.negate();
1175         }
1176         Ok(())
1177     }
1178 
bytes_fold_and_negate( &self, span: &Span, negated: bool, class: &mut hir::ClassBytes, ) -> Result<()>1179     fn bytes_fold_and_negate(
1180         &self,
1181         span: &Span,
1182         negated: bool,
1183         class: &mut hir::ClassBytes,
1184     ) -> Result<()> {
1185         // Note that we must apply case folding before negation!
1186         // Consider `(?i)[^x]`. If we applied negation first, then
1187         // the result would be the character class that matched any
1188         // Unicode scalar value.
1189         if self.flags().case_insensitive() {
1190             class.case_fold_simple();
1191         }
1192         if negated {
1193             class.negate();
1194         }
1195         if self.trans().utf8 && !class.is_ascii() {
1196             return Err(self.error(span.clone(), ErrorKind::InvalidUtf8));
1197         }
1198         Ok(())
1199     }
1200 
1201     /// Return a scalar byte value suitable for use as a literal in a byte
1202     /// character class.
class_literal_byte(&self, ast: &ast::Literal) -> Result<u8>1203     fn class_literal_byte(&self, ast: &ast::Literal) -> Result<u8> {
1204         match self.ast_literal_to_scalar(ast)? {
1205             Either::Right(byte) => Ok(byte),
1206             Either::Left(ch) => {
1207                 if ch.is_ascii() {
1208                     Ok(u8::try_from(ch).unwrap())
1209                 } else {
1210                     // We can't feasibly support Unicode in
1211                     // byte oriented classes. Byte classes don't
1212                     // do Unicode case folding.
1213                     Err(self.error(ast.span, ErrorKind::UnicodeNotAllowed))
1214                 }
1215             }
1216         }
1217     }
1218 }
1219 
/// A translator's representation of a regular expression's flags at any given
/// moment in time.
///
/// Each flag can be in one of three states: absent, present but disabled or
/// present but enabled. These are stored as `None`, `Some(false)` and
/// `Some(true)` respectively.
#[derive(Clone, Copy, Debug, Default)]
struct Flags {
    /// Read as disabled when absent.
    case_insensitive: Option<bool>,
    /// Read as disabled when absent.
    multi_line: Option<bool>,
    /// Read as disabled when absent.
    dot_matches_new_line: Option<bool>,
    /// Read as disabled when absent.
    swap_greed: Option<bool>,
    /// Read as enabled when absent, unlike every other flag here.
    unicode: Option<bool>,
    /// Read as disabled when absent.
    crlf: Option<bool>,
    // Note that `ignore_whitespace` is omitted here because it is handled
    // entirely in the parser.
}
1236 
1237 impl Flags {
from_ast(ast: &ast::Flags) -> Flags1238     fn from_ast(ast: &ast::Flags) -> Flags {
1239         let mut flags = Flags::default();
1240         let mut enable = true;
1241         for item in &ast.items {
1242             match item.kind {
1243                 ast::FlagsItemKind::Negation => {
1244                     enable = false;
1245                 }
1246                 ast::FlagsItemKind::Flag(ast::Flag::CaseInsensitive) => {
1247                     flags.case_insensitive = Some(enable);
1248                 }
1249                 ast::FlagsItemKind::Flag(ast::Flag::MultiLine) => {
1250                     flags.multi_line = Some(enable);
1251                 }
1252                 ast::FlagsItemKind::Flag(ast::Flag::DotMatchesNewLine) => {
1253                     flags.dot_matches_new_line = Some(enable);
1254                 }
1255                 ast::FlagsItemKind::Flag(ast::Flag::SwapGreed) => {
1256                     flags.swap_greed = Some(enable);
1257                 }
1258                 ast::FlagsItemKind::Flag(ast::Flag::Unicode) => {
1259                     flags.unicode = Some(enable);
1260                 }
1261                 ast::FlagsItemKind::Flag(ast::Flag::CRLF) => {
1262                     flags.crlf = Some(enable);
1263                 }
1264                 ast::FlagsItemKind::Flag(ast::Flag::IgnoreWhitespace) => {}
1265             }
1266         }
1267         flags
1268     }
1269 
merge(&mut self, previous: &Flags)1270     fn merge(&mut self, previous: &Flags) {
1271         if self.case_insensitive.is_none() {
1272             self.case_insensitive = previous.case_insensitive;
1273         }
1274         if self.multi_line.is_none() {
1275             self.multi_line = previous.multi_line;
1276         }
1277         if self.dot_matches_new_line.is_none() {
1278             self.dot_matches_new_line = previous.dot_matches_new_line;
1279         }
1280         if self.swap_greed.is_none() {
1281             self.swap_greed = previous.swap_greed;
1282         }
1283         if self.unicode.is_none() {
1284             self.unicode = previous.unicode;
1285         }
1286         if self.crlf.is_none() {
1287             self.crlf = previous.crlf;
1288         }
1289     }
1290 
case_insensitive(&self) -> bool1291     fn case_insensitive(&self) -> bool {
1292         self.case_insensitive.unwrap_or(false)
1293     }
1294 
multi_line(&self) -> bool1295     fn multi_line(&self) -> bool {
1296         self.multi_line.unwrap_or(false)
1297     }
1298 
dot_matches_new_line(&self) -> bool1299     fn dot_matches_new_line(&self) -> bool {
1300         self.dot_matches_new_line.unwrap_or(false)
1301     }
1302 
swap_greed(&self) -> bool1303     fn swap_greed(&self) -> bool {
1304         self.swap_greed.unwrap_or(false)
1305     }
1306 
unicode(&self) -> bool1307     fn unicode(&self) -> bool {
1308         self.unicode.unwrap_or(true)
1309     }
1310 
crlf(&self) -> bool1311     fn crlf(&self) -> bool {
1312         self.crlf.unwrap_or(false)
1313     }
1314 }
1315 
hir_ascii_class_bytes(kind: &ast::ClassAsciiKind) -> hir::ClassBytes1316 fn hir_ascii_class_bytes(kind: &ast::ClassAsciiKind) -> hir::ClassBytes {
1317     let ranges: Vec<_> = ascii_class(kind)
1318         .map(|(s, e)| hir::ClassBytesRange::new(s, e))
1319         .collect();
1320     hir::ClassBytes::new(ranges)
1321 }
1322 
ascii_class(kind: &ast::ClassAsciiKind) -> impl Iterator<Item = (u8, u8)>1323 fn ascii_class(kind: &ast::ClassAsciiKind) -> impl Iterator<Item = (u8, u8)> {
1324     use crate::ast::ClassAsciiKind::*;
1325 
1326     let slice: &'static [(u8, u8)] = match *kind {
1327         Alnum => &[(b'0', b'9'), (b'A', b'Z'), (b'a', b'z')],
1328         Alpha => &[(b'A', b'Z'), (b'a', b'z')],
1329         Ascii => &[(b'\x00', b'\x7F')],
1330         Blank => &[(b'\t', b'\t'), (b' ', b' ')],
1331         Cntrl => &[(b'\x00', b'\x1F'), (b'\x7F', b'\x7F')],
1332         Digit => &[(b'0', b'9')],
1333         Graph => &[(b'!', b'~')],
1334         Lower => &[(b'a', b'z')],
1335         Print => &[(b' ', b'~')],
1336         Punct => &[(b'!', b'/'), (b':', b'@'), (b'[', b'`'), (b'{', b'~')],
1337         Space => &[
1338             (b'\t', b'\t'),
1339             (b'\n', b'\n'),
1340             (b'\x0B', b'\x0B'),
1341             (b'\x0C', b'\x0C'),
1342             (b'\r', b'\r'),
1343             (b' ', b' '),
1344         ],
1345         Upper => &[(b'A', b'Z')],
1346         Word => &[(b'0', b'9'), (b'A', b'Z'), (b'_', b'_'), (b'a', b'z')],
1347         Xdigit => &[(b'0', b'9'), (b'A', b'F'), (b'a', b'f')],
1348     };
1349     slice.iter().copied()
1350 }
1351 
ascii_class_as_chars( kind: &ast::ClassAsciiKind, ) -> impl Iterator<Item = (char, char)>1352 fn ascii_class_as_chars(
1353     kind: &ast::ClassAsciiKind,
1354 ) -> impl Iterator<Item = (char, char)> {
1355     ascii_class(kind).map(|(s, e)| (char::from(s), char::from(e)))
1356 }
1357 
1358 #[cfg(test)]
1359 mod tests {
1360     use crate::{
1361         ast::{parse::ParserBuilder, Position},
1362         hir::{Look, Properties},
1363     };
1364 
1365     use super::*;
1366 
    // We create these errors to compare with real hir::Errors in the tests.
    // We define equality between TestError and hir::Error to disregard the
    // pattern string in hir::Error, which is annoying to provide in tests.
    #[derive(Clone, Debug)]
    struct TestError {
        // Compared against the `span` field of an hir::Error.
        span: Span,
        // Compared against the `kind` field of an hir::Error.
        kind: hir::ErrorKind,
    }
1375 
1376     impl PartialEq<hir::Error> for TestError {
eq(&self, other: &hir::Error) -> bool1377         fn eq(&self, other: &hir::Error) -> bool {
1378             self.span == other.span && self.kind == other.kind
1379         }
1380     }
1381 
1382     impl PartialEq<TestError> for hir::Error {
eq(&self, other: &TestError) -> bool1383         fn eq(&self, other: &TestError) -> bool {
1384             self.span == other.span && self.kind == other.kind
1385         }
1386     }
1387 
parse(pattern: &str) -> Ast1388     fn parse(pattern: &str) -> Ast {
1389         ParserBuilder::new().octal(true).build().parse(pattern).unwrap()
1390     }
1391 
t(pattern: &str) -> Hir1392     fn t(pattern: &str) -> Hir {
1393         TranslatorBuilder::new()
1394             .utf8(true)
1395             .build()
1396             .translate(pattern, &parse(pattern))
1397             .unwrap()
1398     }
1399 
t_err(pattern: &str) -> hir::Error1400     fn t_err(pattern: &str) -> hir::Error {
1401         TranslatorBuilder::new()
1402             .utf8(true)
1403             .build()
1404             .translate(pattern, &parse(pattern))
1405             .unwrap_err()
1406     }
1407 
t_bytes(pattern: &str) -> Hir1408     fn t_bytes(pattern: &str) -> Hir {
1409         TranslatorBuilder::new()
1410             .utf8(false)
1411             .build()
1412             .translate(pattern, &parse(pattern))
1413             .unwrap()
1414     }
1415 
props(pattern: &str) -> Properties1416     fn props(pattern: &str) -> Properties {
1417         t(pattern).properties().clone()
1418     }
1419 
props_bytes(pattern: &str) -> Properties1420     fn props_bytes(pattern: &str) -> Properties {
1421         t_bytes(pattern).properties().clone()
1422     }
1423 
hir_lit(s: &str) -> Hir1424     fn hir_lit(s: &str) -> Hir {
1425         hir_blit(s.as_bytes())
1426     }
1427 
hir_blit(s: &[u8]) -> Hir1428     fn hir_blit(s: &[u8]) -> Hir {
1429         Hir::literal(s)
1430     }
1431 
hir_capture(index: u32, expr: Hir) -> Hir1432     fn hir_capture(index: u32, expr: Hir) -> Hir {
1433         Hir::capture(hir::Capture { index, name: None, sub: Box::new(expr) })
1434     }
1435 
hir_capture_name(index: u32, name: &str, expr: Hir) -> Hir1436     fn hir_capture_name(index: u32, name: &str, expr: Hir) -> Hir {
1437         Hir::capture(hir::Capture {
1438             index,
1439             name: Some(name.into()),
1440             sub: Box::new(expr),
1441         })
1442     }
1443 
hir_quest(greedy: bool, expr: Hir) -> Hir1444     fn hir_quest(greedy: bool, expr: Hir) -> Hir {
1445         Hir::repetition(hir::Repetition {
1446             min: 0,
1447             max: Some(1),
1448             greedy,
1449             sub: Box::new(expr),
1450         })
1451     }
1452 
hir_star(greedy: bool, expr: Hir) -> Hir1453     fn hir_star(greedy: bool, expr: Hir) -> Hir {
1454         Hir::repetition(hir::Repetition {
1455             min: 0,
1456             max: None,
1457             greedy,
1458             sub: Box::new(expr),
1459         })
1460     }
1461 
hir_plus(greedy: bool, expr: Hir) -> Hir1462     fn hir_plus(greedy: bool, expr: Hir) -> Hir {
1463         Hir::repetition(hir::Repetition {
1464             min: 1,
1465             max: None,
1466             greedy,
1467             sub: Box::new(expr),
1468         })
1469     }
1470 
hir_range(greedy: bool, min: u32, max: Option<u32>, expr: Hir) -> Hir1471     fn hir_range(greedy: bool, min: u32, max: Option<u32>, expr: Hir) -> Hir {
1472         Hir::repetition(hir::Repetition {
1473             min,
1474             max,
1475             greedy,
1476             sub: Box::new(expr),
1477         })
1478     }
1479 
    // Test shorthand: pass `alts` to `Hir::alternation`.
    fn hir_alt(alts: Vec<Hir>) -> Hir {
        Hir::alternation(alts)
    }
1483 
    // Test shorthand: pass `exprs` to `Hir::concat`.
    fn hir_cat(exprs: Vec<Hir>) -> Hir {
        Hir::concat(exprs)
    }
1487 
1488     #[allow(dead_code)]
hir_uclass_query(query: ClassQuery<'_>) -> Hir1489     fn hir_uclass_query(query: ClassQuery<'_>) -> Hir {
1490         Hir::class(hir::Class::Unicode(unicode::class(query).unwrap()))
1491     }
1492 
1493     #[allow(dead_code)]
hir_uclass_perl_word() -> Hir1494     fn hir_uclass_perl_word() -> Hir {
1495         Hir::class(hir::Class::Unicode(unicode::perl_word().unwrap()))
1496     }
1497 
hir_ascii_uclass(kind: &ast::ClassAsciiKind) -> Hir1498     fn hir_ascii_uclass(kind: &ast::ClassAsciiKind) -> Hir {
1499         Hir::class(hir::Class::Unicode(hir::ClassUnicode::new(
1500             ascii_class_as_chars(kind)
1501                 .map(|(s, e)| hir::ClassUnicodeRange::new(s, e)),
1502         )))
1503     }
1504 
hir_ascii_bclass(kind: &ast::ClassAsciiKind) -> Hir1505     fn hir_ascii_bclass(kind: &ast::ClassAsciiKind) -> Hir {
1506         Hir::class(hir::Class::Bytes(hir::ClassBytes::new(
1507             ascii_class(kind).map(|(s, e)| hir::ClassBytesRange::new(s, e)),
1508         )))
1509     }
1510 
hir_uclass(ranges: &[(char, char)]) -> Hir1511     fn hir_uclass(ranges: &[(char, char)]) -> Hir {
1512         Hir::class(uclass(ranges))
1513     }
1514 
hir_bclass(ranges: &[(u8, u8)]) -> Hir1515     fn hir_bclass(ranges: &[(u8, u8)]) -> Hir {
1516         Hir::class(bclass(ranges))
1517     }
1518 
hir_case_fold(expr: Hir) -> Hir1519     fn hir_case_fold(expr: Hir) -> Hir {
1520         match expr.into_kind() {
1521             HirKind::Class(mut cls) => {
1522                 cls.case_fold_simple();
1523                 Hir::class(cls)
1524             }
1525             _ => panic!("cannot case fold non-class Hir expr"),
1526         }
1527     }
1528 
hir_negate(expr: Hir) -> Hir1529     fn hir_negate(expr: Hir) -> Hir {
1530         match expr.into_kind() {
1531             HirKind::Class(mut cls) => {
1532                 cls.negate();
1533                 Hir::class(cls)
1534             }
1535             _ => panic!("cannot negate non-class Hir expr"),
1536         }
1537     }
1538 
uclass(ranges: &[(char, char)]) -> hir::Class1539     fn uclass(ranges: &[(char, char)]) -> hir::Class {
1540         let ranges: Vec<hir::ClassUnicodeRange> = ranges
1541             .iter()
1542             .map(|&(s, e)| hir::ClassUnicodeRange::new(s, e))
1543             .collect();
1544         hir::Class::Unicode(hir::ClassUnicode::new(ranges))
1545     }
1546 
bclass(ranges: &[(u8, u8)]) -> hir::Class1547     fn bclass(ranges: &[(u8, u8)]) -> hir::Class {
1548         let ranges: Vec<hir::ClassBytesRange> = ranges
1549             .iter()
1550             .map(|&(s, e)| hir::ClassBytesRange::new(s, e))
1551             .collect();
1552         hir::Class::Bytes(hir::ClassBytes::new(ranges))
1553     }
1554 
1555     #[cfg(feature = "unicode-case")]
class_case_fold(mut cls: hir::Class) -> Hir1556     fn class_case_fold(mut cls: hir::Class) -> Hir {
1557         cls.case_fold_simple();
1558         Hir::class(cls)
1559     }
1560 
class_negate(mut cls: hir::Class) -> Hir1561     fn class_negate(mut cls: hir::Class) -> Hir {
1562         cls.negate();
1563         Hir::class(cls)
1564     }
1565 
1566     #[allow(dead_code)]
hir_union(expr1: Hir, expr2: Hir) -> Hir1567     fn hir_union(expr1: Hir, expr2: Hir) -> Hir {
1568         use crate::hir::Class::{Bytes, Unicode};
1569 
1570         match (expr1.into_kind(), expr2.into_kind()) {
1571             (HirKind::Class(Unicode(mut c1)), HirKind::Class(Unicode(c2))) => {
1572                 c1.union(&c2);
1573                 Hir::class(hir::Class::Unicode(c1))
1574             }
1575             (HirKind::Class(Bytes(mut c1)), HirKind::Class(Bytes(c2))) => {
1576                 c1.union(&c2);
1577                 Hir::class(hir::Class::Bytes(c1))
1578             }
1579             _ => panic!("cannot union non-class Hir exprs"),
1580         }
1581     }
1582 
1583     #[allow(dead_code)]
hir_difference(expr1: Hir, expr2: Hir) -> Hir1584     fn hir_difference(expr1: Hir, expr2: Hir) -> Hir {
1585         use crate::hir::Class::{Bytes, Unicode};
1586 
1587         match (expr1.into_kind(), expr2.into_kind()) {
1588             (HirKind::Class(Unicode(mut c1)), HirKind::Class(Unicode(c2))) => {
1589                 c1.difference(&c2);
1590                 Hir::class(hir::Class::Unicode(c1))
1591             }
1592             (HirKind::Class(Bytes(mut c1)), HirKind::Class(Bytes(c2))) => {
1593                 c1.difference(&c2);
1594                 Hir::class(hir::Class::Bytes(c1))
1595             }
1596             _ => panic!("cannot difference non-class Hir exprs"),
1597         }
1598     }
1599 
    /// Shorthand for `Hir::look`: builds the expected HIR for the
    /// look-around assertion `look`.
    fn hir_look(look: hir::Look) -> Hir {
        Hir::look(look)
    }
1603 
1604     #[test]
empty()1605     fn empty() {
1606         assert_eq!(t(""), Hir::empty());
1607         assert_eq!(t("(?i)"), Hir::empty());
1608         assert_eq!(t("()"), hir_capture(1, Hir::empty()));
1609         assert_eq!(t("(?:)"), Hir::empty());
1610         assert_eq!(t("(?P<wat>)"), hir_capture_name(1, "wat", Hir::empty()));
1611         assert_eq!(t("|"), hir_alt(vec![Hir::empty(), Hir::empty()]));
1612         assert_eq!(
1613             t("()|()"),
1614             hir_alt(vec![
1615                 hir_capture(1, Hir::empty()),
1616                 hir_capture(2, Hir::empty()),
1617             ])
1618         );
1619         assert_eq!(
1620             t("(|b)"),
1621             hir_capture(1, hir_alt(vec![Hir::empty(), hir_lit("b"),]))
1622         );
1623         assert_eq!(
1624             t("(a|)"),
1625             hir_capture(1, hir_alt(vec![hir_lit("a"), Hir::empty(),]))
1626         );
1627         assert_eq!(
1628             t("(a||c)"),
1629             hir_capture(
1630                 1,
1631                 hir_alt(vec![hir_lit("a"), Hir::empty(), hir_lit("c"),])
1632             )
1633         );
1634         assert_eq!(
1635             t("(||)"),
1636             hir_capture(
1637                 1,
1638                 hir_alt(vec![Hir::empty(), Hir::empty(), Hir::empty(),])
1639             )
1640         );
1641     }
1642 
1643     #[test]
literal()1644     fn literal() {
1645         assert_eq!(t("a"), hir_lit("a"));
1646         assert_eq!(t("(?-u)a"), hir_lit("a"));
1647         assert_eq!(t("☃"), hir_lit("☃"));
1648         assert_eq!(t("abcd"), hir_lit("abcd"));
1649 
1650         assert_eq!(t_bytes("(?-u)a"), hir_lit("a"));
1651         assert_eq!(t_bytes("(?-u)\x61"), hir_lit("a"));
1652         assert_eq!(t_bytes(r"(?-u)\x61"), hir_lit("a"));
1653         assert_eq!(t_bytes(r"(?-u)\xFF"), hir_blit(b"\xFF"));
1654 
1655         assert_eq!(t("(?-u)☃"), hir_lit("☃"));
1656         assert_eq!(
1657             t_err(r"(?-u)\xFF"),
1658             TestError {
1659                 kind: hir::ErrorKind::InvalidUtf8,
1660                 span: Span::new(
1661                     Position::new(5, 1, 6),
1662                     Position::new(9, 1, 10)
1663                 ),
1664             }
1665         );
1666     }
1667 
1668     #[test]
literal_case_insensitive()1669     fn literal_case_insensitive() {
1670         #[cfg(feature = "unicode-case")]
1671         assert_eq!(t("(?i)a"), hir_uclass(&[('A', 'A'), ('a', 'a'),]));
1672         #[cfg(feature = "unicode-case")]
1673         assert_eq!(t("(?i:a)"), hir_uclass(&[('A', 'A'), ('a', 'a')]));
1674         #[cfg(feature = "unicode-case")]
1675         assert_eq!(
1676             t("a(?i)a(?-i)a"),
1677             hir_cat(vec![
1678                 hir_lit("a"),
1679                 hir_uclass(&[('A', 'A'), ('a', 'a')]),
1680                 hir_lit("a"),
1681             ])
1682         );
1683         #[cfg(feature = "unicode-case")]
1684         assert_eq!(
1685             t("(?i)ab@c"),
1686             hir_cat(vec![
1687                 hir_uclass(&[('A', 'A'), ('a', 'a')]),
1688                 hir_uclass(&[('B', 'B'), ('b', 'b')]),
1689                 hir_lit("@"),
1690                 hir_uclass(&[('C', 'C'), ('c', 'c')]),
1691             ])
1692         );
1693         #[cfg(feature = "unicode-case")]
1694         assert_eq!(
1695             t("(?i)β"),
1696             hir_uclass(&[('Β', 'Β'), ('β', 'β'), ('ϐ', 'ϐ'),])
1697         );
1698 
1699         assert_eq!(t("(?i-u)a"), hir_bclass(&[(b'A', b'A'), (b'a', b'a'),]));
1700         #[cfg(feature = "unicode-case")]
1701         assert_eq!(
1702             t("(?-u)a(?i)a(?-i)a"),
1703             hir_cat(vec![
1704                 hir_lit("a"),
1705                 hir_bclass(&[(b'A', b'A'), (b'a', b'a')]),
1706                 hir_lit("a"),
1707             ])
1708         );
1709         assert_eq!(
1710             t("(?i-u)ab@c"),
1711             hir_cat(vec![
1712                 hir_bclass(&[(b'A', b'A'), (b'a', b'a')]),
1713                 hir_bclass(&[(b'B', b'B'), (b'b', b'b')]),
1714                 hir_lit("@"),
1715                 hir_bclass(&[(b'C', b'C'), (b'c', b'c')]),
1716             ])
1717         );
1718 
1719         assert_eq!(
1720             t_bytes("(?i-u)a"),
1721             hir_bclass(&[(b'A', b'A'), (b'a', b'a'),])
1722         );
1723         assert_eq!(
1724             t_bytes("(?i-u)\x61"),
1725             hir_bclass(&[(b'A', b'A'), (b'a', b'a'),])
1726         );
1727         assert_eq!(
1728             t_bytes(r"(?i-u)\x61"),
1729             hir_bclass(&[(b'A', b'A'), (b'a', b'a'),])
1730         );
1731         assert_eq!(t_bytes(r"(?i-u)\xFF"), hir_blit(b"\xFF"));
1732 
1733         assert_eq!(t("(?i-u)β"), hir_lit("β"),);
1734     }
1735 
1736     #[test]
dot()1737     fn dot() {
1738         assert_eq!(
1739             t("."),
1740             hir_uclass(&[('\0', '\t'), ('\x0B', '\u{10FFFF}')])
1741         );
1742         assert_eq!(
1743             t("(?R)."),
1744             hir_uclass(&[
1745                 ('\0', '\t'),
1746                 ('\x0B', '\x0C'),
1747                 ('\x0E', '\u{10FFFF}'),
1748             ])
1749         );
1750         assert_eq!(t("(?s)."), hir_uclass(&[('\0', '\u{10FFFF}')]));
1751         assert_eq!(t("(?Rs)."), hir_uclass(&[('\0', '\u{10FFFF}')]));
1752         assert_eq!(
1753             t_bytes("(?-u)."),
1754             hir_bclass(&[(b'\0', b'\t'), (b'\x0B', b'\xFF')])
1755         );
1756         assert_eq!(
1757             t_bytes("(?R-u)."),
1758             hir_bclass(&[
1759                 (b'\0', b'\t'),
1760                 (b'\x0B', b'\x0C'),
1761                 (b'\x0E', b'\xFF'),
1762             ])
1763         );
1764         assert_eq!(t_bytes("(?s-u)."), hir_bclass(&[(b'\0', b'\xFF'),]));
1765         assert_eq!(t_bytes("(?Rs-u)."), hir_bclass(&[(b'\0', b'\xFF'),]));
1766 
1767         // If invalid UTF-8 isn't allowed, then non-Unicode `.` isn't allowed.
1768         assert_eq!(
1769             t_err("(?-u)."),
1770             TestError {
1771                 kind: hir::ErrorKind::InvalidUtf8,
1772                 span: Span::new(
1773                     Position::new(5, 1, 6),
1774                     Position::new(6, 1, 7)
1775                 ),
1776             }
1777         );
1778         assert_eq!(
1779             t_err("(?R-u)."),
1780             TestError {
1781                 kind: hir::ErrorKind::InvalidUtf8,
1782                 span: Span::new(
1783                     Position::new(6, 1, 7),
1784                     Position::new(7, 1, 8)
1785                 ),
1786             }
1787         );
1788         assert_eq!(
1789             t_err("(?s-u)."),
1790             TestError {
1791                 kind: hir::ErrorKind::InvalidUtf8,
1792                 span: Span::new(
1793                     Position::new(6, 1, 7),
1794                     Position::new(7, 1, 8)
1795                 ),
1796             }
1797         );
1798         assert_eq!(
1799             t_err("(?Rs-u)."),
1800             TestError {
1801                 kind: hir::ErrorKind::InvalidUtf8,
1802                 span: Span::new(
1803                     Position::new(7, 1, 8),
1804                     Position::new(8, 1, 9)
1805                 ),
1806             }
1807         );
1808     }
1809 
1810     #[test]
assertions()1811     fn assertions() {
1812         assert_eq!(t("^"), hir_look(hir::Look::Start));
1813         assert_eq!(t("$"), hir_look(hir::Look::End));
1814         assert_eq!(t(r"\A"), hir_look(hir::Look::Start));
1815         assert_eq!(t(r"\z"), hir_look(hir::Look::End));
1816         assert_eq!(t("(?m)^"), hir_look(hir::Look::StartLF));
1817         assert_eq!(t("(?m)$"), hir_look(hir::Look::EndLF));
1818         assert_eq!(t(r"(?m)\A"), hir_look(hir::Look::Start));
1819         assert_eq!(t(r"(?m)\z"), hir_look(hir::Look::End));
1820 
1821         assert_eq!(t(r"\b"), hir_look(hir::Look::WordUnicode));
1822         assert_eq!(t(r"\B"), hir_look(hir::Look::WordUnicodeNegate));
1823         assert_eq!(t(r"(?-u)\b"), hir_look(hir::Look::WordAscii));
1824         assert_eq!(t(r"(?-u)\B"), hir_look(hir::Look::WordAsciiNegate));
1825     }
1826 
1827     #[test]
group()1828     fn group() {
1829         assert_eq!(t("(a)"), hir_capture(1, hir_lit("a")));
1830         assert_eq!(
1831             t("(a)(b)"),
1832             hir_cat(vec![
1833                 hir_capture(1, hir_lit("a")),
1834                 hir_capture(2, hir_lit("b")),
1835             ])
1836         );
1837         assert_eq!(
1838             t("(a)|(b)"),
1839             hir_alt(vec![
1840                 hir_capture(1, hir_lit("a")),
1841                 hir_capture(2, hir_lit("b")),
1842             ])
1843         );
1844         assert_eq!(t("(?P<foo>)"), hir_capture_name(1, "foo", Hir::empty()));
1845         assert_eq!(t("(?P<foo>a)"), hir_capture_name(1, "foo", hir_lit("a")));
1846         assert_eq!(
1847             t("(?P<foo>a)(?P<bar>b)"),
1848             hir_cat(vec![
1849                 hir_capture_name(1, "foo", hir_lit("a")),
1850                 hir_capture_name(2, "bar", hir_lit("b")),
1851             ])
1852         );
1853         assert_eq!(t("(?:)"), Hir::empty());
1854         assert_eq!(t("(?:a)"), hir_lit("a"));
1855         assert_eq!(
1856             t("(?:a)(b)"),
1857             hir_cat(vec![hir_lit("a"), hir_capture(1, hir_lit("b")),])
1858         );
1859         assert_eq!(
1860             t("(a)(?:b)(c)"),
1861             hir_cat(vec![
1862                 hir_capture(1, hir_lit("a")),
1863                 hir_lit("b"),
1864                 hir_capture(2, hir_lit("c")),
1865             ])
1866         );
1867         assert_eq!(
1868             t("(a)(?P<foo>b)(c)"),
1869             hir_cat(vec![
1870                 hir_capture(1, hir_lit("a")),
1871                 hir_capture_name(2, "foo", hir_lit("b")),
1872                 hir_capture(3, hir_lit("c")),
1873             ])
1874         );
1875         assert_eq!(t("()"), hir_capture(1, Hir::empty()));
1876         assert_eq!(t("((?i))"), hir_capture(1, Hir::empty()));
1877         assert_eq!(t("((?x))"), hir_capture(1, Hir::empty()));
1878         assert_eq!(
1879             t("(((?x)))"),
1880             hir_capture(1, hir_capture(2, Hir::empty()))
1881         );
1882     }
1883 
1884     #[test]
line_anchors()1885     fn line_anchors() {
1886         assert_eq!(t("^"), hir_look(hir::Look::Start));
1887         assert_eq!(t("$"), hir_look(hir::Look::End));
1888         assert_eq!(t(r"\A"), hir_look(hir::Look::Start));
1889         assert_eq!(t(r"\z"), hir_look(hir::Look::End));
1890 
1891         assert_eq!(t(r"(?m)\A"), hir_look(hir::Look::Start));
1892         assert_eq!(t(r"(?m)\z"), hir_look(hir::Look::End));
1893         assert_eq!(t("(?m)^"), hir_look(hir::Look::StartLF));
1894         assert_eq!(t("(?m)$"), hir_look(hir::Look::EndLF));
1895 
1896         assert_eq!(t(r"(?R)\A"), hir_look(hir::Look::Start));
1897         assert_eq!(t(r"(?R)\z"), hir_look(hir::Look::End));
1898         assert_eq!(t("(?R)^"), hir_look(hir::Look::Start));
1899         assert_eq!(t("(?R)$"), hir_look(hir::Look::End));
1900 
1901         assert_eq!(t(r"(?Rm)\A"), hir_look(hir::Look::Start));
1902         assert_eq!(t(r"(?Rm)\z"), hir_look(hir::Look::End));
1903         assert_eq!(t("(?Rm)^"), hir_look(hir::Look::StartCRLF));
1904         assert_eq!(t("(?Rm)$"), hir_look(hir::Look::EndCRLF));
1905     }
1906 
1907     #[test]
flags()1908     fn flags() {
1909         #[cfg(feature = "unicode-case")]
1910         assert_eq!(
1911             t("(?i:a)a"),
1912             hir_cat(
1913                 vec![hir_uclass(&[('A', 'A'), ('a', 'a')]), hir_lit("a"),]
1914             )
1915         );
1916         assert_eq!(
1917             t("(?i-u:a)β"),
1918             hir_cat(vec![
1919                 hir_bclass(&[(b'A', b'A'), (b'a', b'a')]),
1920                 hir_lit("β"),
1921             ])
1922         );
1923         assert_eq!(
1924             t("(?:(?i-u)a)b"),
1925             hir_cat(vec![
1926                 hir_bclass(&[(b'A', b'A'), (b'a', b'a')]),
1927                 hir_lit("b"),
1928             ])
1929         );
1930         assert_eq!(
1931             t("((?i-u)a)b"),
1932             hir_cat(vec![
1933                 hir_capture(1, hir_bclass(&[(b'A', b'A'), (b'a', b'a')])),
1934                 hir_lit("b"),
1935             ])
1936         );
1937         #[cfg(feature = "unicode-case")]
1938         assert_eq!(
1939             t("(?i)(?-i:a)a"),
1940             hir_cat(
1941                 vec![hir_lit("a"), hir_uclass(&[('A', 'A'), ('a', 'a')]),]
1942             )
1943         );
1944         #[cfg(feature = "unicode-case")]
1945         assert_eq!(
1946             t("(?im)a^"),
1947             hir_cat(vec![
1948                 hir_uclass(&[('A', 'A'), ('a', 'a')]),
1949                 hir_look(hir::Look::StartLF),
1950             ])
1951         );
1952         #[cfg(feature = "unicode-case")]
1953         assert_eq!(
1954             t("(?im)a^(?i-m)a^"),
1955             hir_cat(vec![
1956                 hir_uclass(&[('A', 'A'), ('a', 'a')]),
1957                 hir_look(hir::Look::StartLF),
1958                 hir_uclass(&[('A', 'A'), ('a', 'a')]),
1959                 hir_look(hir::Look::Start),
1960             ])
1961         );
1962         assert_eq!(
1963             t("(?U)a*a*?(?-U)a*a*?"),
1964             hir_cat(vec![
1965                 hir_star(false, hir_lit("a")),
1966                 hir_star(true, hir_lit("a")),
1967                 hir_star(true, hir_lit("a")),
1968                 hir_star(false, hir_lit("a")),
1969             ])
1970         );
1971         #[cfg(feature = "unicode-case")]
1972         assert_eq!(
1973             t("(?:a(?i)a)a"),
1974             hir_cat(vec![
1975                 hir_cat(vec![
1976                     hir_lit("a"),
1977                     hir_uclass(&[('A', 'A'), ('a', 'a')]),
1978                 ]),
1979                 hir_lit("a"),
1980             ])
1981         );
1982         #[cfg(feature = "unicode-case")]
1983         assert_eq!(
1984             t("(?i)(?:a(?-i)a)a"),
1985             hir_cat(vec![
1986                 hir_cat(vec![
1987                     hir_uclass(&[('A', 'A'), ('a', 'a')]),
1988                     hir_lit("a"),
1989                 ]),
1990                 hir_uclass(&[('A', 'A'), ('a', 'a')]),
1991             ])
1992         );
1993     }
1994 
1995     #[test]
escape()1996     fn escape() {
1997         assert_eq!(
1998             t(r"\\\.\+\*\?\(\)\|\[\]\{\}\^\$\#"),
1999             hir_lit(r"\.+*?()|[]{}^$#")
2000         );
2001     }
2002 
2003     #[test]
repetition()2004     fn repetition() {
2005         assert_eq!(t("a?"), hir_quest(true, hir_lit("a")));
2006         assert_eq!(t("a*"), hir_star(true, hir_lit("a")));
2007         assert_eq!(t("a+"), hir_plus(true, hir_lit("a")));
2008         assert_eq!(t("a??"), hir_quest(false, hir_lit("a")));
2009         assert_eq!(t("a*?"), hir_star(false, hir_lit("a")));
2010         assert_eq!(t("a+?"), hir_plus(false, hir_lit("a")));
2011 
2012         assert_eq!(t("a{1}"), hir_range(true, 1, Some(1), hir_lit("a"),));
2013         assert_eq!(t("a{1,}"), hir_range(true, 1, None, hir_lit("a"),));
2014         assert_eq!(t("a{1,2}"), hir_range(true, 1, Some(2), hir_lit("a"),));
2015         assert_eq!(t("a{1}?"), hir_range(false, 1, Some(1), hir_lit("a"),));
2016         assert_eq!(t("a{1,}?"), hir_range(false, 1, None, hir_lit("a"),));
2017         assert_eq!(t("a{1,2}?"), hir_range(false, 1, Some(2), hir_lit("a"),));
2018 
2019         assert_eq!(
2020             t("ab?"),
2021             hir_cat(vec![hir_lit("a"), hir_quest(true, hir_lit("b")),])
2022         );
2023         assert_eq!(t("(ab)?"), hir_quest(true, hir_capture(1, hir_lit("ab"))));
2024         assert_eq!(
2025             t("a|b?"),
2026             hir_alt(vec![hir_lit("a"), hir_quest(true, hir_lit("b")),])
2027         );
2028     }
2029 
2030     #[test]
cat_alt()2031     fn cat_alt() {
2032         let a = || hir_look(hir::Look::Start);
2033         let b = || hir_look(hir::Look::End);
2034         let c = || hir_look(hir::Look::WordUnicode);
2035         let d = || hir_look(hir::Look::WordUnicodeNegate);
2036 
2037         assert_eq!(t("(^$)"), hir_capture(1, hir_cat(vec![a(), b()])));
2038         assert_eq!(t("^|$"), hir_alt(vec![a(), b()]));
2039         assert_eq!(t(r"^|$|\b"), hir_alt(vec![a(), b(), c()]));
2040         assert_eq!(
2041             t(r"^$|$\b|\b\B"),
2042             hir_alt(vec![
2043                 hir_cat(vec![a(), b()]),
2044                 hir_cat(vec![b(), c()]),
2045                 hir_cat(vec![c(), d()]),
2046             ])
2047         );
2048         assert_eq!(t("(^|$)"), hir_capture(1, hir_alt(vec![a(), b()])));
2049         assert_eq!(
2050             t(r"(^|$|\b)"),
2051             hir_capture(1, hir_alt(vec![a(), b(), c()]))
2052         );
2053         assert_eq!(
2054             t(r"(^$|$\b|\b\B)"),
2055             hir_capture(
2056                 1,
2057                 hir_alt(vec![
2058                     hir_cat(vec![a(), b()]),
2059                     hir_cat(vec![b(), c()]),
2060                     hir_cat(vec![c(), d()]),
2061                 ])
2062             )
2063         );
2064         assert_eq!(
2065             t(r"(^$|($\b|(\b\B)))"),
2066             hir_capture(
2067                 1,
2068                 hir_alt(vec![
2069                     hir_cat(vec![a(), b()]),
2070                     hir_capture(
2071                         2,
2072                         hir_alt(vec![
2073                             hir_cat(vec![b(), c()]),
2074                             hir_capture(3, hir_cat(vec![c(), d()])),
2075                         ])
2076                     ),
2077                 ])
2078             )
2079         );
2080     }
2081 
2082     // Tests the HIR transformation of things like '[a-z]|[A-Z]' into
2083     // '[A-Za-z]'. In other words, an alternation of just classes is always
2084     // equivalent to a single class corresponding to the union of the branches
2085     // in that class. (Unless some branches match invalid UTF-8 and others
2086     // match non-ASCII Unicode.)
2087     #[test]
cat_class_flattened()2088     fn cat_class_flattened() {
2089         assert_eq!(t(r"[a-z]|[A-Z]"), hir_uclass(&[('A', 'Z'), ('a', 'z')]));
2090         // Combining all of the letter properties should give us the one giant
2091         // letter property.
2092         #[cfg(feature = "unicode-gencat")]
2093         assert_eq!(
2094             t(r"(?x)
2095                 \p{Lowercase_Letter}
2096                 |\p{Uppercase_Letter}
2097                 |\p{Titlecase_Letter}
2098                 |\p{Modifier_Letter}
2099                 |\p{Other_Letter}
2100             "),
2101             hir_uclass_query(ClassQuery::Binary("letter"))
2102         );
2103         // Byte classes that can truly match invalid UTF-8 cannot be combined
2104         // with Unicode classes.
2105         assert_eq!(
2106             t_bytes(r"[Δδ]|(?-u:[\x90-\xFF])|[Λλ]"),
2107             hir_alt(vec![
2108                 hir_uclass(&[('Δ', 'Δ'), ('δ', 'δ')]),
2109                 hir_bclass(&[(b'\x90', b'\xFF')]),
2110                 hir_uclass(&[('Λ', 'Λ'), ('λ', 'λ')]),
2111             ])
2112         );
2113         // Byte classes on their own can be combined, even if some are ASCII
2114         // and others are invalid UTF-8.
2115         assert_eq!(
2116             t_bytes(r"[a-z]|(?-u:[\x90-\xFF])|[A-Z]"),
2117             hir_bclass(&[(b'A', b'Z'), (b'a', b'z'), (b'\x90', b'\xFF')]),
2118         );
2119     }
2120 
2121     #[test]
class_ascii()2122     fn class_ascii() {
2123         assert_eq!(
2124             t("[[:alnum:]]"),
2125             hir_ascii_uclass(&ast::ClassAsciiKind::Alnum)
2126         );
2127         assert_eq!(
2128             t("[[:alpha:]]"),
2129             hir_ascii_uclass(&ast::ClassAsciiKind::Alpha)
2130         );
2131         assert_eq!(
2132             t("[[:ascii:]]"),
2133             hir_ascii_uclass(&ast::ClassAsciiKind::Ascii)
2134         );
2135         assert_eq!(
2136             t("[[:blank:]]"),
2137             hir_ascii_uclass(&ast::ClassAsciiKind::Blank)
2138         );
2139         assert_eq!(
2140             t("[[:cntrl:]]"),
2141             hir_ascii_uclass(&ast::ClassAsciiKind::Cntrl)
2142         );
2143         assert_eq!(
2144             t("[[:digit:]]"),
2145             hir_ascii_uclass(&ast::ClassAsciiKind::Digit)
2146         );
2147         assert_eq!(
2148             t("[[:graph:]]"),
2149             hir_ascii_uclass(&ast::ClassAsciiKind::Graph)
2150         );
2151         assert_eq!(
2152             t("[[:lower:]]"),
2153             hir_ascii_uclass(&ast::ClassAsciiKind::Lower)
2154         );
2155         assert_eq!(
2156             t("[[:print:]]"),
2157             hir_ascii_uclass(&ast::ClassAsciiKind::Print)
2158         );
2159         assert_eq!(
2160             t("[[:punct:]]"),
2161             hir_ascii_uclass(&ast::ClassAsciiKind::Punct)
2162         );
2163         assert_eq!(
2164             t("[[:space:]]"),
2165             hir_ascii_uclass(&ast::ClassAsciiKind::Space)
2166         );
2167         assert_eq!(
2168             t("[[:upper:]]"),
2169             hir_ascii_uclass(&ast::ClassAsciiKind::Upper)
2170         );
2171         assert_eq!(
2172             t("[[:word:]]"),
2173             hir_ascii_uclass(&ast::ClassAsciiKind::Word)
2174         );
2175         assert_eq!(
2176             t("[[:xdigit:]]"),
2177             hir_ascii_uclass(&ast::ClassAsciiKind::Xdigit)
2178         );
2179 
2180         assert_eq!(
2181             t("[[:^lower:]]"),
2182             hir_negate(hir_ascii_uclass(&ast::ClassAsciiKind::Lower))
2183         );
2184         #[cfg(feature = "unicode-case")]
2185         assert_eq!(
2186             t("(?i)[[:lower:]]"),
2187             hir_uclass(&[
2188                 ('A', 'Z'),
2189                 ('a', 'z'),
2190                 ('\u{17F}', '\u{17F}'),
2191                 ('\u{212A}', '\u{212A}'),
2192             ])
2193         );
2194 
2195         assert_eq!(
2196             t("(?-u)[[:lower:]]"),
2197             hir_ascii_bclass(&ast::ClassAsciiKind::Lower)
2198         );
2199         assert_eq!(
2200             t("(?i-u)[[:lower:]]"),
2201             hir_case_fold(hir_ascii_bclass(&ast::ClassAsciiKind::Lower))
2202         );
2203 
2204         assert_eq!(
2205             t_err("(?-u)[[:^lower:]]"),
2206             TestError {
2207                 kind: hir::ErrorKind::InvalidUtf8,
2208                 span: Span::new(
2209                     Position::new(6, 1, 7),
2210                     Position::new(16, 1, 17)
2211                 ),
2212             }
2213         );
2214         assert_eq!(
2215             t_err("(?i-u)[[:^lower:]]"),
2216             TestError {
2217                 kind: hir::ErrorKind::InvalidUtf8,
2218                 span: Span::new(
2219                     Position::new(7, 1, 8),
2220                     Position::new(17, 1, 18)
2221                 ),
2222             }
2223         );
2224     }
2225 
2226     #[test]
class_ascii_multiple()2227     fn class_ascii_multiple() {
2228         // See: https://github.com/rust-lang/regex/issues/680
2229         assert_eq!(
2230             t("[[:alnum:][:^ascii:]]"),
2231             hir_union(
2232                 hir_ascii_uclass(&ast::ClassAsciiKind::Alnum),
2233                 hir_uclass(&[('\u{80}', '\u{10FFFF}')]),
2234             ),
2235         );
2236         assert_eq!(
2237             t_bytes("(?-u)[[:alnum:][:^ascii:]]"),
2238             hir_union(
2239                 hir_ascii_bclass(&ast::ClassAsciiKind::Alnum),
2240                 hir_bclass(&[(0x80, 0xFF)]),
2241             ),
2242         );
2243     }
2244 
2245     #[test]
2246     #[cfg(feature = "unicode-perl")]
class_perl_unicode()2247     fn class_perl_unicode() {
2248         // Unicode
2249         assert_eq!(t(r"\d"), hir_uclass_query(ClassQuery::Binary("digit")));
2250         assert_eq!(t(r"\s"), hir_uclass_query(ClassQuery::Binary("space")));
2251         assert_eq!(t(r"\w"), hir_uclass_perl_word());
2252         #[cfg(feature = "unicode-case")]
2253         assert_eq!(
2254             t(r"(?i)\d"),
2255             hir_uclass_query(ClassQuery::Binary("digit"))
2256         );
2257         #[cfg(feature = "unicode-case")]
2258         assert_eq!(
2259             t(r"(?i)\s"),
2260             hir_uclass_query(ClassQuery::Binary("space"))
2261         );
2262         #[cfg(feature = "unicode-case")]
2263         assert_eq!(t(r"(?i)\w"), hir_uclass_perl_word());
2264 
2265         // Unicode, negated
2266         assert_eq!(
2267             t(r"\D"),
2268             hir_negate(hir_uclass_query(ClassQuery::Binary("digit")))
2269         );
2270         assert_eq!(
2271             t(r"\S"),
2272             hir_negate(hir_uclass_query(ClassQuery::Binary("space")))
2273         );
2274         assert_eq!(t(r"\W"), hir_negate(hir_uclass_perl_word()));
2275         #[cfg(feature = "unicode-case")]
2276         assert_eq!(
2277             t(r"(?i)\D"),
2278             hir_negate(hir_uclass_query(ClassQuery::Binary("digit")))
2279         );
2280         #[cfg(feature = "unicode-case")]
2281         assert_eq!(
2282             t(r"(?i)\S"),
2283             hir_negate(hir_uclass_query(ClassQuery::Binary("space")))
2284         );
2285         #[cfg(feature = "unicode-case")]
2286         assert_eq!(t(r"(?i)\W"), hir_negate(hir_uclass_perl_word()));
2287     }
2288 
2289     #[test]
class_perl_ascii()2290     fn class_perl_ascii() {
2291         // ASCII only
2292         assert_eq!(
2293             t(r"(?-u)\d"),
2294             hir_ascii_bclass(&ast::ClassAsciiKind::Digit)
2295         );
2296         assert_eq!(
2297             t(r"(?-u)\s"),
2298             hir_ascii_bclass(&ast::ClassAsciiKind::Space)
2299         );
2300         assert_eq!(
2301             t(r"(?-u)\w"),
2302             hir_ascii_bclass(&ast::ClassAsciiKind::Word)
2303         );
2304         assert_eq!(
2305             t(r"(?i-u)\d"),
2306             hir_ascii_bclass(&ast::ClassAsciiKind::Digit)
2307         );
2308         assert_eq!(
2309             t(r"(?i-u)\s"),
2310             hir_ascii_bclass(&ast::ClassAsciiKind::Space)
2311         );
2312         assert_eq!(
2313             t(r"(?i-u)\w"),
2314             hir_ascii_bclass(&ast::ClassAsciiKind::Word)
2315         );
2316 
2317         // ASCII only, negated
2318         assert_eq!(
2319             t_bytes(r"(?-u)\D"),
2320             hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Digit))
2321         );
2322         assert_eq!(
2323             t_bytes(r"(?-u)\S"),
2324             hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Space))
2325         );
2326         assert_eq!(
2327             t_bytes(r"(?-u)\W"),
2328             hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Word))
2329         );
2330         assert_eq!(
2331             t_bytes(r"(?i-u)\D"),
2332             hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Digit))
2333         );
2334         assert_eq!(
2335             t_bytes(r"(?i-u)\S"),
2336             hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Space))
2337         );
2338         assert_eq!(
2339             t_bytes(r"(?i-u)\W"),
2340             hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Word))
2341         );
2342 
2343         // ASCII only, negated, with UTF-8 mode enabled.
2344         // In this case, negating any Perl class results in an error because
2345         // all such classes can match invalid UTF-8.
2346         assert_eq!(
2347             t_err(r"(?-u)\D"),
2348             TestError {
2349                 kind: hir::ErrorKind::InvalidUtf8,
2350                 span: Span::new(
2351                     Position::new(5, 1, 6),
2352                     Position::new(7, 1, 8),
2353                 ),
2354             },
2355         );
2356         assert_eq!(
2357             t_err(r"(?-u)\S"),
2358             TestError {
2359                 kind: hir::ErrorKind::InvalidUtf8,
2360                 span: Span::new(
2361                     Position::new(5, 1, 6),
2362                     Position::new(7, 1, 8),
2363                 ),
2364             },
2365         );
2366         assert_eq!(
2367             t_err(r"(?-u)\W"),
2368             TestError {
2369                 kind: hir::ErrorKind::InvalidUtf8,
2370                 span: Span::new(
2371                     Position::new(5, 1, 6),
2372                     Position::new(7, 1, 8),
2373                 ),
2374             },
2375         );
2376         assert_eq!(
2377             t_err(r"(?i-u)\D"),
2378             TestError {
2379                 kind: hir::ErrorKind::InvalidUtf8,
2380                 span: Span::new(
2381                     Position::new(6, 1, 7),
2382                     Position::new(8, 1, 9),
2383                 ),
2384             },
2385         );
2386         assert_eq!(
2387             t_err(r"(?i-u)\S"),
2388             TestError {
2389                 kind: hir::ErrorKind::InvalidUtf8,
2390                 span: Span::new(
2391                     Position::new(6, 1, 7),
2392                     Position::new(8, 1, 9),
2393                 ),
2394             },
2395         );
2396         assert_eq!(
2397             t_err(r"(?i-u)\W"),
2398             TestError {
2399                 kind: hir::ErrorKind::InvalidUtf8,
2400                 span: Span::new(
2401                     Position::new(6, 1, 7),
2402                     Position::new(8, 1, 9),
2403                 ),
2404             },
2405         );
2406     }
2407 
2408     #[test]
2409     #[cfg(not(feature = "unicode-perl"))]
class_perl_word_disabled()2410     fn class_perl_word_disabled() {
2411         assert_eq!(
2412             t_err(r"\w"),
2413             TestError {
2414                 kind: hir::ErrorKind::UnicodePerlClassNotFound,
2415                 span: Span::new(
2416                     Position::new(0, 1, 1),
2417                     Position::new(2, 1, 3)
2418                 ),
2419             }
2420         );
2421     }
2422 
2423     #[test]
2424     #[cfg(all(not(feature = "unicode-perl"), not(feature = "unicode-bool")))]
class_perl_space_disabled()2425     fn class_perl_space_disabled() {
2426         assert_eq!(
2427             t_err(r"\s"),
2428             TestError {
2429                 kind: hir::ErrorKind::UnicodePerlClassNotFound,
2430                 span: Span::new(
2431                     Position::new(0, 1, 1),
2432                     Position::new(2, 1, 3)
2433                 ),
2434             }
2435         );
2436     }
2437 
2438     #[test]
2439     #[cfg(all(
2440         not(feature = "unicode-perl"),
2441         not(feature = "unicode-gencat")
2442     ))]
class_perl_digit_disabled()2443     fn class_perl_digit_disabled() {
2444         assert_eq!(
2445             t_err(r"\d"),
2446             TestError {
2447                 kind: hir::ErrorKind::UnicodePerlClassNotFound,
2448                 span: Span::new(
2449                     Position::new(0, 1, 1),
2450                     Position::new(2, 1, 3)
2451                 ),
2452             }
2453         );
2454     }
2455 
2456     #[test]
2457     #[cfg(feature = "unicode-gencat")]
class_unicode_gencat()2458     fn class_unicode_gencat() {
2459         assert_eq!(t(r"\pZ"), hir_uclass_query(ClassQuery::Binary("Z")));
2460         assert_eq!(t(r"\pz"), hir_uclass_query(ClassQuery::Binary("Z")));
2461         assert_eq!(
2462             t(r"\p{Separator}"),
2463             hir_uclass_query(ClassQuery::Binary("Z"))
2464         );
2465         assert_eq!(
2466             t(r"\p{se      PaRa ToR}"),
2467             hir_uclass_query(ClassQuery::Binary("Z"))
2468         );
2469         assert_eq!(
2470             t(r"\p{gc:Separator}"),
2471             hir_uclass_query(ClassQuery::Binary("Z"))
2472         );
2473         assert_eq!(
2474             t(r"\p{gc=Separator}"),
2475             hir_uclass_query(ClassQuery::Binary("Z"))
2476         );
2477         assert_eq!(
2478             t(r"\p{Other}"),
2479             hir_uclass_query(ClassQuery::Binary("Other"))
2480         );
2481         assert_eq!(t(r"\pC"), hir_uclass_query(ClassQuery::Binary("Other")));
2482 
2483         assert_eq!(
2484             t(r"\PZ"),
2485             hir_negate(hir_uclass_query(ClassQuery::Binary("Z")))
2486         );
2487         assert_eq!(
2488             t(r"\P{separator}"),
2489             hir_negate(hir_uclass_query(ClassQuery::Binary("Z")))
2490         );
2491         assert_eq!(
2492             t(r"\P{gc!=separator}"),
2493             hir_negate(hir_uclass_query(ClassQuery::Binary("Z")))
2494         );
2495 
2496         assert_eq!(t(r"\p{any}"), hir_uclass_query(ClassQuery::Binary("Any")));
2497         assert_eq!(
2498             t(r"\p{assigned}"),
2499             hir_uclass_query(ClassQuery::Binary("Assigned"))
2500         );
2501         assert_eq!(
2502             t(r"\p{ascii}"),
2503             hir_uclass_query(ClassQuery::Binary("ASCII"))
2504         );
2505         assert_eq!(
2506             t(r"\p{gc:any}"),
2507             hir_uclass_query(ClassQuery::Binary("Any"))
2508         );
2509         assert_eq!(
2510             t(r"\p{gc:assigned}"),
2511             hir_uclass_query(ClassQuery::Binary("Assigned"))
2512         );
2513         assert_eq!(
2514             t(r"\p{gc:ascii}"),
2515             hir_uclass_query(ClassQuery::Binary("ASCII"))
2516         );
2517 
2518         assert_eq!(
2519             t_err(r"(?-u)\pZ"),
2520             TestError {
2521                 kind: hir::ErrorKind::UnicodeNotAllowed,
2522                 span: Span::new(
2523                     Position::new(5, 1, 6),
2524                     Position::new(8, 1, 9)
2525                 ),
2526             }
2527         );
2528         assert_eq!(
2529             t_err(r"(?-u)\p{Separator}"),
2530             TestError {
2531                 kind: hir::ErrorKind::UnicodeNotAllowed,
2532                 span: Span::new(
2533                     Position::new(5, 1, 6),
2534                     Position::new(18, 1, 19)
2535                 ),
2536             }
2537         );
2538         assert_eq!(
2539             t_err(r"\pE"),
2540             TestError {
2541                 kind: hir::ErrorKind::UnicodePropertyNotFound,
2542                 span: Span::new(
2543                     Position::new(0, 1, 1),
2544                     Position::new(3, 1, 4)
2545                 ),
2546             }
2547         );
2548         assert_eq!(
2549             t_err(r"\p{Foo}"),
2550             TestError {
2551                 kind: hir::ErrorKind::UnicodePropertyNotFound,
2552                 span: Span::new(
2553                     Position::new(0, 1, 1),
2554                     Position::new(7, 1, 8)
2555                 ),
2556             }
2557         );
2558         assert_eq!(
2559             t_err(r"\p{gc:Foo}"),
2560             TestError {
2561                 kind: hir::ErrorKind::UnicodePropertyValueNotFound,
2562                 span: Span::new(
2563                     Position::new(0, 1, 1),
2564                     Position::new(10, 1, 11)
2565                 ),
2566             }
2567         );
2568     }
2569 
2570     #[test]
2571     #[cfg(not(feature = "unicode-gencat"))]
class_unicode_gencat_disabled()2572     fn class_unicode_gencat_disabled() {
2573         assert_eq!(
2574             t_err(r"\p{Separator}"),
2575             TestError {
2576                 kind: hir::ErrorKind::UnicodePropertyNotFound,
2577                 span: Span::new(
2578                     Position::new(0, 1, 1),
2579                     Position::new(13, 1, 14)
2580                 ),
2581             }
2582         );
2583 
2584         assert_eq!(
2585             t_err(r"\p{Any}"),
2586             TestError {
2587                 kind: hir::ErrorKind::UnicodePropertyNotFound,
2588                 span: Span::new(
2589                     Position::new(0, 1, 1),
2590                     Position::new(7, 1, 8)
2591                 ),
2592             }
2593         );
2594     }
2595 
2596     #[test]
2597     #[cfg(feature = "unicode-script")]
class_unicode_script()2598     fn class_unicode_script() {
2599         assert_eq!(
2600             t(r"\p{Greek}"),
2601             hir_uclass_query(ClassQuery::Binary("Greek"))
2602         );
2603         #[cfg(feature = "unicode-case")]
2604         assert_eq!(
2605             t(r"(?i)\p{Greek}"),
2606             hir_case_fold(hir_uclass_query(ClassQuery::Binary("Greek")))
2607         );
2608         #[cfg(feature = "unicode-case")]
2609         assert_eq!(
2610             t(r"(?i)\P{Greek}"),
2611             hir_negate(hir_case_fold(hir_uclass_query(ClassQuery::Binary(
2612                 "Greek"
2613             ))))
2614         );
2615 
2616         assert_eq!(
2617             t_err(r"\p{sc:Foo}"),
2618             TestError {
2619                 kind: hir::ErrorKind::UnicodePropertyValueNotFound,
2620                 span: Span::new(
2621                     Position::new(0, 1, 1),
2622                     Position::new(10, 1, 11)
2623                 ),
2624             }
2625         );
2626         assert_eq!(
2627             t_err(r"\p{scx:Foo}"),
2628             TestError {
2629                 kind: hir::ErrorKind::UnicodePropertyValueNotFound,
2630                 span: Span::new(
2631                     Position::new(0, 1, 1),
2632                     Position::new(11, 1, 12)
2633                 ),
2634             }
2635         );
2636     }
2637 
2638     #[test]
2639     #[cfg(not(feature = "unicode-script"))]
class_unicode_script_disabled()2640     fn class_unicode_script_disabled() {
2641         assert_eq!(
2642             t_err(r"\p{Greek}"),
2643             TestError {
2644                 kind: hir::ErrorKind::UnicodePropertyNotFound,
2645                 span: Span::new(
2646                     Position::new(0, 1, 1),
2647                     Position::new(9, 1, 10)
2648                 ),
2649             }
2650         );
2651 
2652         assert_eq!(
2653             t_err(r"\p{scx:Greek}"),
2654             TestError {
2655                 kind: hir::ErrorKind::UnicodePropertyNotFound,
2656                 span: Span::new(
2657                     Position::new(0, 1, 1),
2658                     Position::new(13, 1, 14)
2659                 ),
2660             }
2661         );
2662     }
2663 
2664     #[test]
2665     #[cfg(feature = "unicode-age")]
class_unicode_age()2666     fn class_unicode_age() {
2667         assert_eq!(
2668             t_err(r"\p{age:Foo}"),
2669             TestError {
2670                 kind: hir::ErrorKind::UnicodePropertyValueNotFound,
2671                 span: Span::new(
2672                     Position::new(0, 1, 1),
2673                     Position::new(11, 1, 12)
2674                 ),
2675             }
2676         );
2677     }
2678 
2679     #[test]
2680     #[cfg(feature = "unicode-gencat")]
class_unicode_any_empty()2681     fn class_unicode_any_empty() {
2682         assert_eq!(t(r"\P{any}"), hir_uclass(&[]),);
2683     }
2684 
2685     #[test]
2686     #[cfg(not(feature = "unicode-age"))]
class_unicode_age_disabled()2687     fn class_unicode_age_disabled() {
2688         assert_eq!(
2689             t_err(r"\p{age:3.0}"),
2690             TestError {
2691                 kind: hir::ErrorKind::UnicodePropertyNotFound,
2692                 span: Span::new(
2693                     Position::new(0, 1, 1),
2694                     Position::new(11, 1, 12)
2695                 ),
2696             }
2697         );
2698     }
2699 
2700     #[test]
class_bracketed()2701     fn class_bracketed() {
2702         assert_eq!(t("[a]"), hir_lit("a"));
2703         assert_eq!(t("[ab]"), hir_uclass(&[('a', 'b')]));
2704         assert_eq!(t("[^[a]]"), class_negate(uclass(&[('a', 'a')])));
2705         assert_eq!(t("[a-z]"), hir_uclass(&[('a', 'z')]));
2706         assert_eq!(t("[a-fd-h]"), hir_uclass(&[('a', 'h')]));
2707         assert_eq!(t("[a-fg-m]"), hir_uclass(&[('a', 'm')]));
2708         assert_eq!(t(r"[\x00]"), hir_uclass(&[('\0', '\0')]));
2709         assert_eq!(t(r"[\n]"), hir_uclass(&[('\n', '\n')]));
2710         assert_eq!(t("[\n]"), hir_uclass(&[('\n', '\n')]));
2711         #[cfg(any(feature = "unicode-perl", feature = "unicode-gencat"))]
2712         assert_eq!(t(r"[\d]"), hir_uclass_query(ClassQuery::Binary("digit")));
2713         #[cfg(feature = "unicode-gencat")]
2714         assert_eq!(
2715             t(r"[\pZ]"),
2716             hir_uclass_query(ClassQuery::Binary("separator"))
2717         );
2718         #[cfg(feature = "unicode-gencat")]
2719         assert_eq!(
2720             t(r"[\p{separator}]"),
2721             hir_uclass_query(ClassQuery::Binary("separator"))
2722         );
2723         #[cfg(any(feature = "unicode-perl", feature = "unicode-gencat"))]
2724         assert_eq!(t(r"[^\D]"), hir_uclass_query(ClassQuery::Binary("digit")));
2725         #[cfg(feature = "unicode-gencat")]
2726         assert_eq!(
2727             t(r"[^\PZ]"),
2728             hir_uclass_query(ClassQuery::Binary("separator"))
2729         );
2730         #[cfg(feature = "unicode-gencat")]
2731         assert_eq!(
2732             t(r"[^\P{separator}]"),
2733             hir_uclass_query(ClassQuery::Binary("separator"))
2734         );
2735         #[cfg(all(
2736             feature = "unicode-case",
2737             any(feature = "unicode-perl", feature = "unicode-gencat")
2738         ))]
2739         assert_eq!(
2740             t(r"(?i)[^\D]"),
2741             hir_uclass_query(ClassQuery::Binary("digit"))
2742         );
2743         #[cfg(all(feature = "unicode-case", feature = "unicode-script"))]
2744         assert_eq!(
2745             t(r"(?i)[^\P{greek}]"),
2746             hir_case_fold(hir_uclass_query(ClassQuery::Binary("greek")))
2747         );
2748 
2749         assert_eq!(t("(?-u)[a]"), hir_bclass(&[(b'a', b'a')]));
2750         assert_eq!(t(r"(?-u)[\x00]"), hir_bclass(&[(b'\0', b'\0')]));
2751         assert_eq!(t_bytes(r"(?-u)[\xFF]"), hir_bclass(&[(b'\xFF', b'\xFF')]));
2752 
2753         #[cfg(feature = "unicode-case")]
2754         assert_eq!(t("(?i)[a]"), hir_uclass(&[('A', 'A'), ('a', 'a')]));
2755         #[cfg(feature = "unicode-case")]
2756         assert_eq!(
2757             t("(?i)[k]"),
2758             hir_uclass(&[('K', 'K'), ('k', 'k'), ('\u{212A}', '\u{212A}'),])
2759         );
2760         #[cfg(feature = "unicode-case")]
2761         assert_eq!(
2762             t("(?i)[β]"),
2763             hir_uclass(&[('Β', 'Β'), ('β', 'β'), ('ϐ', 'ϐ'),])
2764         );
2765         assert_eq!(t("(?i-u)[k]"), hir_bclass(&[(b'K', b'K'), (b'k', b'k'),]));
2766 
2767         assert_eq!(t("[^a]"), class_negate(uclass(&[('a', 'a')])));
2768         assert_eq!(t(r"[^\x00]"), class_negate(uclass(&[('\0', '\0')])));
2769         assert_eq!(
2770             t_bytes("(?-u)[^a]"),
2771             class_negate(bclass(&[(b'a', b'a')]))
2772         );
2773         #[cfg(any(feature = "unicode-perl", feature = "unicode-gencat"))]
2774         assert_eq!(
2775             t(r"[^\d]"),
2776             hir_negate(hir_uclass_query(ClassQuery::Binary("digit")))
2777         );
2778         #[cfg(feature = "unicode-gencat")]
2779         assert_eq!(
2780             t(r"[^\pZ]"),
2781             hir_negate(hir_uclass_query(ClassQuery::Binary("separator")))
2782         );
2783         #[cfg(feature = "unicode-gencat")]
2784         assert_eq!(
2785             t(r"[^\p{separator}]"),
2786             hir_negate(hir_uclass_query(ClassQuery::Binary("separator")))
2787         );
2788         #[cfg(all(feature = "unicode-case", feature = "unicode-script"))]
2789         assert_eq!(
2790             t(r"(?i)[^\p{greek}]"),
2791             hir_negate(hir_case_fold(hir_uclass_query(ClassQuery::Binary(
2792                 "greek"
2793             ))))
2794         );
2795         #[cfg(all(feature = "unicode-case", feature = "unicode-script"))]
2796         assert_eq!(
2797             t(r"(?i)[\P{greek}]"),
2798             hir_negate(hir_case_fold(hir_uclass_query(ClassQuery::Binary(
2799                 "greek"
2800             ))))
2801         );
2802 
2803         // Test some weird cases.
2804         assert_eq!(t(r"[\[]"), hir_uclass(&[('[', '[')]));
2805 
2806         assert_eq!(t(r"[&]"), hir_uclass(&[('&', '&')]));
2807         assert_eq!(t(r"[\&]"), hir_uclass(&[('&', '&')]));
2808         assert_eq!(t(r"[\&\&]"), hir_uclass(&[('&', '&')]));
2809         assert_eq!(t(r"[\x00-&]"), hir_uclass(&[('\0', '&')]));
2810         assert_eq!(t(r"[&-\xFF]"), hir_uclass(&[('&', '\u{FF}')]));
2811 
2812         assert_eq!(t(r"[~]"), hir_uclass(&[('~', '~')]));
2813         assert_eq!(t(r"[\~]"), hir_uclass(&[('~', '~')]));
2814         assert_eq!(t(r"[\~\~]"), hir_uclass(&[('~', '~')]));
2815         assert_eq!(t(r"[\x00-~]"), hir_uclass(&[('\0', '~')]));
2816         assert_eq!(t(r"[~-\xFF]"), hir_uclass(&[('~', '\u{FF}')]));
2817 
2818         assert_eq!(t(r"[-]"), hir_uclass(&[('-', '-')]));
2819         assert_eq!(t(r"[\-]"), hir_uclass(&[('-', '-')]));
2820         assert_eq!(t(r"[\-\-]"), hir_uclass(&[('-', '-')]));
2821         assert_eq!(t(r"[\x00-\-]"), hir_uclass(&[('\0', '-')]));
2822         assert_eq!(t(r"[\--\xFF]"), hir_uclass(&[('-', '\u{FF}')]));
2823 
2824         assert_eq!(
2825             t_err("(?-u)[^a]"),
2826             TestError {
2827                 kind: hir::ErrorKind::InvalidUtf8,
2828                 span: Span::new(
2829                     Position::new(5, 1, 6),
2830                     Position::new(9, 1, 10)
2831                 ),
2832             }
2833         );
2834         #[cfg(any(feature = "unicode-perl", feature = "unicode-bool"))]
2835         assert_eq!(t(r"[^\s\S]"), hir_uclass(&[]),);
2836         #[cfg(any(feature = "unicode-perl", feature = "unicode-bool"))]
2837         assert_eq!(t_bytes(r"(?-u)[^\s\S]"), hir_bclass(&[]),);
2838     }
2839 
2840     #[test]
class_bracketed_union()2841     fn class_bracketed_union() {
2842         assert_eq!(t("[a-zA-Z]"), hir_uclass(&[('A', 'Z'), ('a', 'z')]));
2843         #[cfg(feature = "unicode-gencat")]
2844         assert_eq!(
2845             t(r"[a\pZb]"),
2846             hir_union(
2847                 hir_uclass(&[('a', 'b')]),
2848                 hir_uclass_query(ClassQuery::Binary("separator"))
2849             )
2850         );
2851         #[cfg(all(feature = "unicode-gencat", feature = "unicode-script"))]
2852         assert_eq!(
2853             t(r"[\pZ\p{Greek}]"),
2854             hir_union(
2855                 hir_uclass_query(ClassQuery::Binary("greek")),
2856                 hir_uclass_query(ClassQuery::Binary("separator"))
2857             )
2858         );
2859         #[cfg(all(
2860             feature = "unicode-age",
2861             feature = "unicode-gencat",
2862             feature = "unicode-script"
2863         ))]
2864         assert_eq!(
2865             t(r"[\p{age:3.0}\pZ\p{Greek}]"),
2866             hir_union(
2867                 hir_uclass_query(ClassQuery::ByValue {
2868                     property_name: "age",
2869                     property_value: "3.0",
2870                 }),
2871                 hir_union(
2872                     hir_uclass_query(ClassQuery::Binary("greek")),
2873                     hir_uclass_query(ClassQuery::Binary("separator"))
2874                 )
2875             )
2876         );
2877         #[cfg(all(
2878             feature = "unicode-age",
2879             feature = "unicode-gencat",
2880             feature = "unicode-script"
2881         ))]
2882         assert_eq!(
2883             t(r"[[[\p{age:3.0}\pZ]\p{Greek}][\p{Cyrillic}]]"),
2884             hir_union(
2885                 hir_uclass_query(ClassQuery::ByValue {
2886                     property_name: "age",
2887                     property_value: "3.0",
2888                 }),
2889                 hir_union(
2890                     hir_uclass_query(ClassQuery::Binary("cyrillic")),
2891                     hir_union(
2892                         hir_uclass_query(ClassQuery::Binary("greek")),
2893                         hir_uclass_query(ClassQuery::Binary("separator"))
2894                     )
2895                 )
2896             )
2897         );
2898 
2899         #[cfg(all(
2900             feature = "unicode-age",
2901             feature = "unicode-case",
2902             feature = "unicode-gencat",
2903             feature = "unicode-script"
2904         ))]
2905         assert_eq!(
2906             t(r"(?i)[\p{age:3.0}\pZ\p{Greek}]"),
2907             hir_case_fold(hir_union(
2908                 hir_uclass_query(ClassQuery::ByValue {
2909                     property_name: "age",
2910                     property_value: "3.0",
2911                 }),
2912                 hir_union(
2913                     hir_uclass_query(ClassQuery::Binary("greek")),
2914                     hir_uclass_query(ClassQuery::Binary("separator"))
2915                 )
2916             ))
2917         );
2918         #[cfg(all(
2919             feature = "unicode-age",
2920             feature = "unicode-gencat",
2921             feature = "unicode-script"
2922         ))]
2923         assert_eq!(
2924             t(r"[^\p{age:3.0}\pZ\p{Greek}]"),
2925             hir_negate(hir_union(
2926                 hir_uclass_query(ClassQuery::ByValue {
2927                     property_name: "age",
2928                     property_value: "3.0",
2929                 }),
2930                 hir_union(
2931                     hir_uclass_query(ClassQuery::Binary("greek")),
2932                     hir_uclass_query(ClassQuery::Binary("separator"))
2933                 )
2934             ))
2935         );
2936         #[cfg(all(
2937             feature = "unicode-age",
2938             feature = "unicode-case",
2939             feature = "unicode-gencat",
2940             feature = "unicode-script"
2941         ))]
2942         assert_eq!(
2943             t(r"(?i)[^\p{age:3.0}\pZ\p{Greek}]"),
2944             hir_negate(hir_case_fold(hir_union(
2945                 hir_uclass_query(ClassQuery::ByValue {
2946                     property_name: "age",
2947                     property_value: "3.0",
2948                 }),
2949                 hir_union(
2950                     hir_uclass_query(ClassQuery::Binary("greek")),
2951                     hir_uclass_query(ClassQuery::Binary("separator"))
2952                 )
2953             )))
2954         );
2955     }
2956 
2957     #[test]
class_bracketed_nested()2958     fn class_bracketed_nested() {
2959         assert_eq!(t(r"[a[^c]]"), class_negate(uclass(&[('c', 'c')])));
2960         assert_eq!(t(r"[a-b[^c]]"), class_negate(uclass(&[('c', 'c')])));
2961         assert_eq!(t(r"[a-c[^c]]"), class_negate(uclass(&[])));
2962 
2963         assert_eq!(t(r"[^a[^c]]"), hir_uclass(&[('c', 'c')]));
2964         assert_eq!(t(r"[^a-b[^c]]"), hir_uclass(&[('c', 'c')]));
2965 
2966         #[cfg(feature = "unicode-case")]
2967         assert_eq!(
2968             t(r"(?i)[a[^c]]"),
2969             hir_negate(class_case_fold(uclass(&[('c', 'c')])))
2970         );
2971         #[cfg(feature = "unicode-case")]
2972         assert_eq!(
2973             t(r"(?i)[a-b[^c]]"),
2974             hir_negate(class_case_fold(uclass(&[('c', 'c')])))
2975         );
2976 
2977         #[cfg(feature = "unicode-case")]
2978         assert_eq!(t(r"(?i)[^a[^c]]"), hir_uclass(&[('C', 'C'), ('c', 'c')]));
2979         #[cfg(feature = "unicode-case")]
2980         assert_eq!(
2981             t(r"(?i)[^a-b[^c]]"),
2982             hir_uclass(&[('C', 'C'), ('c', 'c')])
2983         );
2984 
2985         assert_eq!(t(r"[^a-c[^c]]"), hir_uclass(&[]),);
2986         #[cfg(feature = "unicode-case")]
2987         assert_eq!(t(r"(?i)[^a-c[^c]]"), hir_uclass(&[]),);
2988     }
2989 
2990     #[test]
class_bracketed_intersect()2991     fn class_bracketed_intersect() {
2992         assert_eq!(t("[abc&&b-c]"), hir_uclass(&[('b', 'c')]));
2993         assert_eq!(t("[abc&&[b-c]]"), hir_uclass(&[('b', 'c')]));
2994         assert_eq!(t("[[abc]&&[b-c]]"), hir_uclass(&[('b', 'c')]));
2995         assert_eq!(t("[a-z&&b-y&&c-x]"), hir_uclass(&[('c', 'x')]));
2996         assert_eq!(t("[c-da-b&&a-d]"), hir_uclass(&[('a', 'd')]));
2997         assert_eq!(t("[a-d&&c-da-b]"), hir_uclass(&[('a', 'd')]));
2998         assert_eq!(t(r"[a-z&&a-c]"), hir_uclass(&[('a', 'c')]));
2999         assert_eq!(t(r"[[a-z&&a-c]]"), hir_uclass(&[('a', 'c')]));
3000         assert_eq!(t(r"[^[a-z&&a-c]]"), hir_negate(hir_uclass(&[('a', 'c')])));
3001 
3002         assert_eq!(t("(?-u)[abc&&b-c]"), hir_bclass(&[(b'b', b'c')]));
3003         assert_eq!(t("(?-u)[abc&&[b-c]]"), hir_bclass(&[(b'b', b'c')]));
3004         assert_eq!(t("(?-u)[[abc]&&[b-c]]"), hir_bclass(&[(b'b', b'c')]));
3005         assert_eq!(t("(?-u)[a-z&&b-y&&c-x]"), hir_bclass(&[(b'c', b'x')]));
3006         assert_eq!(t("(?-u)[c-da-b&&a-d]"), hir_bclass(&[(b'a', b'd')]));
3007         assert_eq!(t("(?-u)[a-d&&c-da-b]"), hir_bclass(&[(b'a', b'd')]));
3008 
3009         #[cfg(feature = "unicode-case")]
3010         assert_eq!(
3011             t("(?i)[abc&&b-c]"),
3012             hir_case_fold(hir_uclass(&[('b', 'c')]))
3013         );
3014         #[cfg(feature = "unicode-case")]
3015         assert_eq!(
3016             t("(?i)[abc&&[b-c]]"),
3017             hir_case_fold(hir_uclass(&[('b', 'c')]))
3018         );
3019         #[cfg(feature = "unicode-case")]
3020         assert_eq!(
3021             t("(?i)[[abc]&&[b-c]]"),
3022             hir_case_fold(hir_uclass(&[('b', 'c')]))
3023         );
3024         #[cfg(feature = "unicode-case")]
3025         assert_eq!(
3026             t("(?i)[a-z&&b-y&&c-x]"),
3027             hir_case_fold(hir_uclass(&[('c', 'x')]))
3028         );
3029         #[cfg(feature = "unicode-case")]
3030         assert_eq!(
3031             t("(?i)[c-da-b&&a-d]"),
3032             hir_case_fold(hir_uclass(&[('a', 'd')]))
3033         );
3034         #[cfg(feature = "unicode-case")]
3035         assert_eq!(
3036             t("(?i)[a-d&&c-da-b]"),
3037             hir_case_fold(hir_uclass(&[('a', 'd')]))
3038         );
3039 
3040         assert_eq!(
3041             t("(?i-u)[abc&&b-c]"),
3042             hir_case_fold(hir_bclass(&[(b'b', b'c')]))
3043         );
3044         assert_eq!(
3045             t("(?i-u)[abc&&[b-c]]"),
3046             hir_case_fold(hir_bclass(&[(b'b', b'c')]))
3047         );
3048         assert_eq!(
3049             t("(?i-u)[[abc]&&[b-c]]"),
3050             hir_case_fold(hir_bclass(&[(b'b', b'c')]))
3051         );
3052         assert_eq!(
3053             t("(?i-u)[a-z&&b-y&&c-x]"),
3054             hir_case_fold(hir_bclass(&[(b'c', b'x')]))
3055         );
3056         assert_eq!(
3057             t("(?i-u)[c-da-b&&a-d]"),
3058             hir_case_fold(hir_bclass(&[(b'a', b'd')]))
3059         );
3060         assert_eq!(
3061             t("(?i-u)[a-d&&c-da-b]"),
3062             hir_case_fold(hir_bclass(&[(b'a', b'd')]))
3063         );
3064 
3065         // In `[a^]`, `^` does not need to be escaped, so it makes sense that
3066         // `^` is also allowed to be unescaped after `&&`.
3067         assert_eq!(t(r"[\^&&^]"), hir_uclass(&[('^', '^')]));
3068         // `]` needs to be escaped after `&&` since it's not at start of class.
3069         assert_eq!(t(r"[]&&\]]"), hir_uclass(&[(']', ']')]));
3070         assert_eq!(t(r"[-&&-]"), hir_uclass(&[('-', '-')]));
3071         assert_eq!(t(r"[\&&&&]"), hir_uclass(&[('&', '&')]));
3072         assert_eq!(t(r"[\&&&\&]"), hir_uclass(&[('&', '&')]));
3073         // Test precedence.
3074         assert_eq!(
3075             t(r"[a-w&&[^c-g]z]"),
3076             hir_uclass(&[('a', 'b'), ('h', 'w')])
3077         );
3078     }
3079 
3080     #[test]
class_bracketed_intersect_negate()3081     fn class_bracketed_intersect_negate() {
3082         #[cfg(feature = "unicode-perl")]
3083         assert_eq!(
3084             t(r"[^\w&&\d]"),
3085             hir_negate(hir_uclass_query(ClassQuery::Binary("digit")))
3086         );
3087         assert_eq!(t(r"[^[a-z&&a-c]]"), hir_negate(hir_uclass(&[('a', 'c')])));
3088         #[cfg(feature = "unicode-perl")]
3089         assert_eq!(
3090             t(r"[^[\w&&\d]]"),
3091             hir_negate(hir_uclass_query(ClassQuery::Binary("digit")))
3092         );
3093         #[cfg(feature = "unicode-perl")]
3094         assert_eq!(
3095             t(r"[^[^\w&&\d]]"),
3096             hir_uclass_query(ClassQuery::Binary("digit"))
3097         );
3098         #[cfg(feature = "unicode-perl")]
3099         assert_eq!(t(r"[[[^\w]&&[^\d]]]"), hir_negate(hir_uclass_perl_word()));
3100 
3101         #[cfg(feature = "unicode-perl")]
3102         assert_eq!(
3103             t_bytes(r"(?-u)[^\w&&\d]"),
3104             hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Digit))
3105         );
3106         assert_eq!(
3107             t_bytes(r"(?-u)[^[a-z&&a-c]]"),
3108             hir_negate(hir_bclass(&[(b'a', b'c')]))
3109         );
3110         assert_eq!(
3111             t_bytes(r"(?-u)[^[\w&&\d]]"),
3112             hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Digit))
3113         );
3114         assert_eq!(
3115             t_bytes(r"(?-u)[^[^\w&&\d]]"),
3116             hir_ascii_bclass(&ast::ClassAsciiKind::Digit)
3117         );
3118         assert_eq!(
3119             t_bytes(r"(?-u)[[[^\w]&&[^\d]]]"),
3120             hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Word))
3121         );
3122     }
3123 
3124     #[test]
class_bracketed_difference()3125     fn class_bracketed_difference() {
3126         #[cfg(feature = "unicode-gencat")]
3127         assert_eq!(
3128             t(r"[\pL--[:ascii:]]"),
3129             hir_difference(
3130                 hir_uclass_query(ClassQuery::Binary("letter")),
3131                 hir_uclass(&[('\0', '\x7F')])
3132             )
3133         );
3134 
3135         assert_eq!(
3136             t(r"(?-u)[[:alpha:]--[:lower:]]"),
3137             hir_bclass(&[(b'A', b'Z')])
3138         );
3139     }
3140 
3141     #[test]
class_bracketed_symmetric_difference()3142     fn class_bracketed_symmetric_difference() {
3143         #[cfg(feature = "unicode-script")]
3144         assert_eq!(
3145             t(r"[\p{sc:Greek}~~\p{scx:Greek}]"),
3146             hir_uclass(&[
3147                 ('\u{0342}', '\u{0342}'),
3148                 ('\u{0345}', '\u{0345}'),
3149                 ('\u{1DC0}', '\u{1DC1}'),
3150             ])
3151         );
3152         assert_eq!(t(r"[a-g~~c-j]"), hir_uclass(&[('a', 'b'), ('h', 'j')]));
3153 
3154         assert_eq!(
3155             t(r"(?-u)[a-g~~c-j]"),
3156             hir_bclass(&[(b'a', b'b'), (b'h', b'j')])
3157         );
3158     }
3159 
    // Checks translation of patterns written with the `x` flag, where
    // whitespace and `# ...` comments are interleaved with the syntax. The
    // raw strings below deliberately span several lines; their exact layout
    // is part of the test input.
    #[test]
    fn ignore_whitespace() {
        // An escape followed by a space: `\12` is expected to become a
        // newline and the trailing `3` a separate literal character.
        assert_eq!(t(r"(?x)\12 3"), hir_lit("\n3"));
        // Braced hex escape with whitespace (and, below, comments) inside
        // and around the braces.
        assert_eq!(t(r"(?x)\x { 53 }"), hir_lit("S"));
        assert_eq!(
            t(r"(?x)\x # comment
{ # comment
    53 # comment
} #comment"),
            hir_lit("S")
        );

        // Unbraced hex escape with whitespace/comments between `\x` and the
        // digits, and between the two digits.
        assert_eq!(t(r"(?x)\x 53"), hir_lit("S"));
        assert_eq!(
            t(r"(?x)\x # comment
        53 # comment"),
            hir_lit("S")
        );
        assert_eq!(t(r"(?x)\x5 3"), hir_lit("S"));

        // Unicode class name spread over several commented lines.
        #[cfg(feature = "unicode-gencat")]
        assert_eq!(
            t(r"(?x)\p # comment
{ # comment
    Separator # comment
} # comment"),
            hir_uclass_query(ClassQuery::Binary("separator"))
        );

        // Counted repetition `{5,10}` spread over several commented lines.
        assert_eq!(
            t(r"(?x)a # comment
{ # comment
    5 # comment
    , # comment
    10 # comment
} # comment"),
            hir_range(true, 5, Some(10), hir_lit("a"))
        );

        // A backslash-escaped space is expected to survive as a literal
        // space, while the unescaped space and the comment are dropped.
        assert_eq!(t(r"(?x)a\  # hi there"), hir_lit("a "));
    }
3201 
3202     #[test]
analysis_is_utf8()3203     fn analysis_is_utf8() {
3204         // Positive examples.
3205         assert!(props_bytes(r"a").is_utf8());
3206         assert!(props_bytes(r"ab").is_utf8());
3207         assert!(props_bytes(r"(?-u)a").is_utf8());
3208         assert!(props_bytes(r"(?-u)ab").is_utf8());
3209         assert!(props_bytes(r"\xFF").is_utf8());
3210         assert!(props_bytes(r"\xFF\xFF").is_utf8());
3211         assert!(props_bytes(r"[^a]").is_utf8());
3212         assert!(props_bytes(r"[^a][^a]").is_utf8());
3213         assert!(props_bytes(r"\b").is_utf8());
3214         assert!(props_bytes(r"\B").is_utf8());
3215         assert!(props_bytes(r"(?-u)\b").is_utf8());
3216         assert!(props_bytes(r"(?-u)\B").is_utf8());
3217 
3218         // Negative examples.
3219         assert!(!props_bytes(r"(?-u)\xFF").is_utf8());
3220         assert!(!props_bytes(r"(?-u)\xFF\xFF").is_utf8());
3221         assert!(!props_bytes(r"(?-u)[^a]").is_utf8());
3222         assert!(!props_bytes(r"(?-u)[^a][^a]").is_utf8());
3223     }
3224 
3225     #[test]
analysis_captures_len()3226     fn analysis_captures_len() {
3227         assert_eq!(0, props(r"a").explicit_captures_len());
3228         assert_eq!(0, props(r"(?:a)").explicit_captures_len());
3229         assert_eq!(0, props(r"(?i-u:a)").explicit_captures_len());
3230         assert_eq!(0, props(r"(?i-u)a").explicit_captures_len());
3231         assert_eq!(1, props(r"(a)").explicit_captures_len());
3232         assert_eq!(1, props(r"(?P<foo>a)").explicit_captures_len());
3233         assert_eq!(1, props(r"()").explicit_captures_len());
3234         assert_eq!(1, props(r"()a").explicit_captures_len());
3235         assert_eq!(1, props(r"(a)+").explicit_captures_len());
3236         assert_eq!(2, props(r"(a)(b)").explicit_captures_len());
3237         assert_eq!(2, props(r"(a)|(b)").explicit_captures_len());
3238         assert_eq!(2, props(r"((a))").explicit_captures_len());
3239         assert_eq!(1, props(r"([a&&b])").explicit_captures_len());
3240     }
3241 
3242     #[test]
analysis_static_captures_len()3243     fn analysis_static_captures_len() {
3244         let len = |pattern| props(pattern).static_explicit_captures_len();
3245         assert_eq!(Some(0), len(r""));
3246         assert_eq!(Some(0), len(r"foo|bar"));
3247         assert_eq!(None, len(r"(foo)|bar"));
3248         assert_eq!(None, len(r"foo|(bar)"));
3249         assert_eq!(Some(1), len(r"(foo|bar)"));
3250         assert_eq!(Some(1), len(r"(a|b|c|d|e|f)"));
3251         assert_eq!(Some(1), len(r"(a)|(b)|(c)|(d)|(e)|(f)"));
3252         assert_eq!(Some(2), len(r"(a)(b)|(c)(d)|(e)(f)"));
3253         assert_eq!(Some(6), len(r"(a)(b)(c)(d)(e)(f)"));
3254         assert_eq!(Some(3), len(r"(a)(b)(extra)|(a)(b)()"));
3255         assert_eq!(Some(3), len(r"(a)(b)((?:extra)?)"));
3256         assert_eq!(None, len(r"(a)(b)(extra)?"));
3257         assert_eq!(Some(1), len(r"(foo)|(bar)"));
3258         assert_eq!(Some(2), len(r"(foo)(bar)"));
3259         assert_eq!(Some(2), len(r"(foo)+(bar)"));
3260         assert_eq!(None, len(r"(foo)*(bar)"));
3261         assert_eq!(Some(0), len(r"(foo)?{0}"));
3262         assert_eq!(None, len(r"(foo)?{1}"));
3263         assert_eq!(Some(1), len(r"(foo){1}"));
3264         assert_eq!(Some(1), len(r"(foo){1,}"));
3265         assert_eq!(Some(1), len(r"(foo){1,}?"));
3266         assert_eq!(None, len(r"(foo){1,}??"));
3267         assert_eq!(None, len(r"(foo){0,}"));
3268         assert_eq!(Some(1), len(r"(foo)(?:bar)"));
3269         assert_eq!(Some(2), len(r"(foo(?:bar)+)(?:baz(boo))"));
3270         assert_eq!(Some(2), len(r"(?P<bar>foo)(?:bar)(bal|loon)"));
3271         assert_eq!(
3272             Some(2),
3273             len(r#"<(a)[^>]+href="([^"]+)"|<(img)[^>]+src="([^"]+)""#)
3274         );
3275     }
3276 
3277     #[test]
analysis_is_all_assertions()3278     fn analysis_is_all_assertions() {
3279         // Positive examples.
3280         let p = props(r"\b");
3281         assert!(!p.look_set().is_empty());
3282         assert_eq!(p.minimum_len(), Some(0));
3283 
3284         let p = props(r"\B");
3285         assert!(!p.look_set().is_empty());
3286         assert_eq!(p.minimum_len(), Some(0));
3287 
3288         let p = props(r"^");
3289         assert!(!p.look_set().is_empty());
3290         assert_eq!(p.minimum_len(), Some(0));
3291 
3292         let p = props(r"$");
3293         assert!(!p.look_set().is_empty());
3294         assert_eq!(p.minimum_len(), Some(0));
3295 
3296         let p = props(r"\A");
3297         assert!(!p.look_set().is_empty());
3298         assert_eq!(p.minimum_len(), Some(0));
3299 
3300         let p = props(r"\z");
3301         assert!(!p.look_set().is_empty());
3302         assert_eq!(p.minimum_len(), Some(0));
3303 
3304         let p = props(r"$^\z\A\b\B");
3305         assert!(!p.look_set().is_empty());
3306         assert_eq!(p.minimum_len(), Some(0));
3307 
3308         let p = props(r"$|^|\z|\A|\b|\B");
3309         assert!(!p.look_set().is_empty());
3310         assert_eq!(p.minimum_len(), Some(0));
3311 
3312         let p = props(r"^$|$^");
3313         assert!(!p.look_set().is_empty());
3314         assert_eq!(p.minimum_len(), Some(0));
3315 
3316         let p = props(r"((\b)+())*^");
3317         assert!(!p.look_set().is_empty());
3318         assert_eq!(p.minimum_len(), Some(0));
3319 
3320         // Negative examples.
3321         let p = props(r"^a");
3322         assert!(!p.look_set().is_empty());
3323         assert_eq!(p.minimum_len(), Some(1));
3324     }
3325 
3326     #[test]
analysis_look_set_prefix_any()3327     fn analysis_look_set_prefix_any() {
3328         let p = props(r"(?-u)(?i:(?:\b|_)win(?:32|64|dows)?(?:\b|_))");
3329         assert!(p.look_set_prefix_any().contains(Look::WordAscii));
3330     }
3331 
3332     #[test]
analysis_is_anchored()3333     fn analysis_is_anchored() {
3334         let is_start = |p| props(p).look_set_prefix().contains(Look::Start);
3335         let is_end = |p| props(p).look_set_suffix().contains(Look::End);
3336 
3337         // Positive examples.
3338         assert!(is_start(r"^"));
3339         assert!(is_end(r"$"));
3340 
3341         assert!(is_start(r"^^"));
3342         assert!(props(r"$$").look_set_suffix().contains(Look::End));
3343 
3344         assert!(is_start(r"^$"));
3345         assert!(is_end(r"^$"));
3346 
3347         assert!(is_start(r"^foo"));
3348         assert!(is_end(r"foo$"));
3349 
3350         assert!(is_start(r"^foo|^bar"));
3351         assert!(is_end(r"foo$|bar$"));
3352 
3353         assert!(is_start(r"^(foo|bar)"));
3354         assert!(is_end(r"(foo|bar)$"));
3355 
3356         assert!(is_start(r"^+"));
3357         assert!(is_end(r"$+"));
3358         assert!(is_start(r"^++"));
3359         assert!(is_end(r"$++"));
3360         assert!(is_start(r"(^)+"));
3361         assert!(is_end(r"($)+"));
3362 
3363         assert!(is_start(r"$^"));
3364         assert!(is_start(r"$^"));
3365         assert!(is_start(r"$^|^$"));
3366         assert!(is_end(r"$^|^$"));
3367 
3368         assert!(is_start(r"\b^"));
3369         assert!(is_end(r"$\b"));
3370         assert!(is_start(r"^(?m:^)"));
3371         assert!(is_end(r"(?m:$)$"));
3372         assert!(is_start(r"(?m:^)^"));
3373         assert!(is_end(r"$(?m:$)"));
3374 
3375         // Negative examples.
3376         assert!(!is_start(r"(?m)^"));
3377         assert!(!is_end(r"(?m)$"));
3378         assert!(!is_start(r"(?m:^$)|$^"));
3379         assert!(!is_end(r"(?m:^$)|$^"));
3380         assert!(!is_start(r"$^|(?m:^$)"));
3381         assert!(!is_end(r"$^|(?m:^$)"));
3382 
3383         assert!(!is_start(r"a^"));
3384         assert!(!is_start(r"$a"));
3385 
3386         assert!(!is_end(r"a^"));
3387         assert!(!is_end(r"$a"));
3388 
3389         assert!(!is_start(r"^foo|bar"));
3390         assert!(!is_end(r"foo|bar$"));
3391 
3392         assert!(!is_start(r"^*"));
3393         assert!(!is_end(r"$*"));
3394         assert!(!is_start(r"^*+"));
3395         assert!(!is_end(r"$*+"));
3396         assert!(!is_start(r"^+*"));
3397         assert!(!is_end(r"$+*"));
3398         assert!(!is_start(r"(^)*"));
3399         assert!(!is_end(r"($)*"));
3400     }
3401 
3402     #[test]
analysis_is_any_anchored()3403     fn analysis_is_any_anchored() {
3404         let is_start = |p| props(p).look_set().contains(Look::Start);
3405         let is_end = |p| props(p).look_set().contains(Look::End);
3406 
3407         // Positive examples.
3408         assert!(is_start(r"^"));
3409         assert!(is_end(r"$"));
3410         assert!(is_start(r"\A"));
3411         assert!(is_end(r"\z"));
3412 
3413         // Negative examples.
3414         assert!(!is_start(r"(?m)^"));
3415         assert!(!is_end(r"(?m)$"));
3416         assert!(!is_start(r"$"));
3417         assert!(!is_end(r"^"));
3418     }
3419 
3420     #[test]
analysis_can_empty()3421     fn analysis_can_empty() {
3422         // Positive examples.
3423         let assert_empty =
3424             |p| assert_eq!(Some(0), props_bytes(p).minimum_len());
3425         assert_empty(r"");
3426         assert_empty(r"()");
3427         assert_empty(r"()*");
3428         assert_empty(r"()+");
3429         assert_empty(r"()?");
3430         assert_empty(r"a*");
3431         assert_empty(r"a?");
3432         assert_empty(r"a{0}");
3433         assert_empty(r"a{0,}");
3434         assert_empty(r"a{0,1}");
3435         assert_empty(r"a{0,10}");
3436         #[cfg(feature = "unicode-gencat")]
3437         assert_empty(r"\pL*");
3438         assert_empty(r"a*|b");
3439         assert_empty(r"b|a*");
3440         assert_empty(r"a|");
3441         assert_empty(r"|a");
3442         assert_empty(r"a||b");
3443         assert_empty(r"a*a?(abcd)*");
3444         assert_empty(r"^");
3445         assert_empty(r"$");
3446         assert_empty(r"(?m)^");
3447         assert_empty(r"(?m)$");
3448         assert_empty(r"\A");
3449         assert_empty(r"\z");
3450         assert_empty(r"\B");
3451         assert_empty(r"(?-u)\B");
3452         assert_empty(r"\b");
3453         assert_empty(r"(?-u)\b");
3454 
3455         // Negative examples.
3456         let assert_non_empty =
3457             |p| assert_ne!(Some(0), props_bytes(p).minimum_len());
3458         assert_non_empty(r"a+");
3459         assert_non_empty(r"a{1}");
3460         assert_non_empty(r"a{1,}");
3461         assert_non_empty(r"a{1,2}");
3462         assert_non_empty(r"a{1,10}");
3463         assert_non_empty(r"b|a");
3464         assert_non_empty(r"a*a+(abcd)*");
3465         #[cfg(feature = "unicode-gencat")]
3466         assert_non_empty(r"\P{any}");
3467         assert_non_empty(r"[a--a]");
3468         assert_non_empty(r"[a&&b]");
3469     }
3470 
3471     #[test]
analysis_is_literal()3472     fn analysis_is_literal() {
3473         // Positive examples.
3474         assert!(props(r"a").is_literal());
3475         assert!(props(r"ab").is_literal());
3476         assert!(props(r"abc").is_literal());
3477         assert!(props(r"(?m)abc").is_literal());
3478         assert!(props(r"(?:a)").is_literal());
3479         assert!(props(r"foo(?:a)").is_literal());
3480         assert!(props(r"(?:a)foo").is_literal());
3481         assert!(props(r"[a]").is_literal());
3482 
3483         // Negative examples.
3484         assert!(!props(r"").is_literal());
3485         assert!(!props(r"^").is_literal());
3486         assert!(!props(r"a|b").is_literal());
3487         assert!(!props(r"(a)").is_literal());
3488         assert!(!props(r"a+").is_literal());
3489         assert!(!props(r"foo(a)").is_literal());
3490         assert!(!props(r"(a)foo").is_literal());
3491         assert!(!props(r"[ab]").is_literal());
3492     }
3493 
3494     #[test]
analysis_is_alternation_literal()3495     fn analysis_is_alternation_literal() {
3496         // Positive examples.
3497         assert!(props(r"a").is_alternation_literal());
3498         assert!(props(r"ab").is_alternation_literal());
3499         assert!(props(r"abc").is_alternation_literal());
3500         assert!(props(r"(?m)abc").is_alternation_literal());
3501         assert!(props(r"foo|bar").is_alternation_literal());
3502         assert!(props(r"foo|bar|baz").is_alternation_literal());
3503         assert!(props(r"[a]").is_alternation_literal());
3504         assert!(props(r"(?:ab)|cd").is_alternation_literal());
3505         assert!(props(r"ab|(?:cd)").is_alternation_literal());
3506 
3507         // Negative examples.
3508         assert!(!props(r"").is_alternation_literal());
3509         assert!(!props(r"^").is_alternation_literal());
3510         assert!(!props(r"(a)").is_alternation_literal());
3511         assert!(!props(r"a+").is_alternation_literal());
3512         assert!(!props(r"foo(a)").is_alternation_literal());
3513         assert!(!props(r"(a)foo").is_alternation_literal());
3514         assert!(!props(r"[ab]").is_alternation_literal());
3515         assert!(!props(r"[ab]|b").is_alternation_literal());
3516         assert!(!props(r"a|[ab]").is_alternation_literal());
3517         assert!(!props(r"(a)|b").is_alternation_literal());
3518         assert!(!props(r"a|(b)").is_alternation_literal());
3519         assert!(!props(r"a|b").is_alternation_literal());
3520         assert!(!props(r"a|b|c").is_alternation_literal());
3521         assert!(!props(r"[a]|b").is_alternation_literal());
3522         assert!(!props(r"a|[b]").is_alternation_literal());
3523         assert!(!props(r"(?:a)|b").is_alternation_literal());
3524         assert!(!props(r"a|(?:b)").is_alternation_literal());
3525         assert!(!props(r"(?:z|xx)@|xx").is_alternation_literal());
3526     }
3527 
3528     // This tests that the smart Hir::repetition constructors does some basic
3529     // simplifications.
3530     #[test]
smart_repetition()3531     fn smart_repetition() {
3532         assert_eq!(t(r"a{0}"), Hir::empty());
3533         assert_eq!(t(r"a{1}"), hir_lit("a"));
3534         assert_eq!(t(r"\B{32111}"), hir_look(hir::Look::WordUnicodeNegate));
3535     }
3536 
3537     // This tests that the smart Hir::concat constructor simplifies the given
3538     // exprs in a way we expect.
3539     #[test]
smart_concat()3540     fn smart_concat() {
3541         assert_eq!(t(""), Hir::empty());
3542         assert_eq!(t("(?:)"), Hir::empty());
3543         assert_eq!(t("abc"), hir_lit("abc"));
3544         assert_eq!(t("(?:foo)(?:bar)"), hir_lit("foobar"));
3545         assert_eq!(t("quux(?:foo)(?:bar)baz"), hir_lit("quuxfoobarbaz"));
3546         assert_eq!(
3547             t("foo(?:bar^baz)quux"),
3548             hir_cat(vec![
3549                 hir_lit("foobar"),
3550                 hir_look(hir::Look::Start),
3551                 hir_lit("bazquux"),
3552             ])
3553         );
3554         assert_eq!(
3555             t("foo(?:ba(?:r^b)az)quux"),
3556             hir_cat(vec![
3557                 hir_lit("foobar"),
3558                 hir_look(hir::Look::Start),
3559                 hir_lit("bazquux"),
3560             ])
3561         );
3562     }
3563 
3564     // This tests that the smart Hir::alternation constructor simplifies the
3565     // given exprs in a way we expect.
3566     #[test]
smart_alternation()3567     fn smart_alternation() {
3568         assert_eq!(
3569             t("(?:foo)|(?:bar)"),
3570             hir_alt(vec![hir_lit("foo"), hir_lit("bar")])
3571         );
3572         assert_eq!(
3573             t("quux|(?:abc|def|xyz)|baz"),
3574             hir_alt(vec![
3575                 hir_lit("quux"),
3576                 hir_lit("abc"),
3577                 hir_lit("def"),
3578                 hir_lit("xyz"),
3579                 hir_lit("baz"),
3580             ])
3581         );
3582         assert_eq!(
3583             t("quux|(?:abc|(?:def|mno)|xyz)|baz"),
3584             hir_alt(vec![
3585                 hir_lit("quux"),
3586                 hir_lit("abc"),
3587                 hir_lit("def"),
3588                 hir_lit("mno"),
3589                 hir_lit("xyz"),
3590                 hir_lit("baz"),
3591             ])
3592         );
3593         assert_eq!(
3594             t("a|b|c|d|e|f|x|y|z"),
3595             hir_uclass(&[('a', 'f'), ('x', 'z')]),
3596         );
3597         // Tests that we lift common prefixes out of an alternation.
3598         assert_eq!(
3599             t("[A-Z]foo|[A-Z]quux"),
3600             hir_cat(vec![
3601                 hir_uclass(&[('A', 'Z')]),
3602                 hir_alt(vec![hir_lit("foo"), hir_lit("quux")]),
3603             ]),
3604         );
3605         assert_eq!(
3606             t("[A-Z][A-Z]|[A-Z]quux"),
3607             hir_cat(vec![
3608                 hir_uclass(&[('A', 'Z')]),
3609                 hir_alt(vec![hir_uclass(&[('A', 'Z')]), hir_lit("quux")]),
3610             ]),
3611         );
3612         assert_eq!(
3613             t("[A-Z][A-Z]|[A-Z][A-Z]quux"),
3614             hir_cat(vec![
3615                 hir_uclass(&[('A', 'Z')]),
3616                 hir_uclass(&[('A', 'Z')]),
3617                 hir_alt(vec![Hir::empty(), hir_lit("quux")]),
3618             ]),
3619         );
3620         assert_eq!(
3621             t("[A-Z]foo|[A-Z]foobar"),
3622             hir_cat(vec![
3623                 hir_uclass(&[('A', 'Z')]),
3624                 hir_alt(vec![hir_lit("foo"), hir_lit("foobar")]),
3625             ]),
3626         );
3627     }
3628 
3629     #[test]
regression_alt_empty_concat()3630     fn regression_alt_empty_concat() {
3631         use crate::ast::{self, Ast};
3632 
3633         let span = Span::splat(Position::new(0, 0, 0));
3634         let ast = Ast::alternation(ast::Alternation {
3635             span,
3636             asts: vec![Ast::concat(ast::Concat { span, asts: vec![] })],
3637         });
3638 
3639         let mut t = Translator::new();
3640         assert_eq!(Ok(Hir::empty()), t.translate("", &ast));
3641     }
3642 
3643     #[test]
regression_empty_alt()3644     fn regression_empty_alt() {
3645         use crate::ast::{self, Ast};
3646 
3647         let span = Span::splat(Position::new(0, 0, 0));
3648         let ast = Ast::concat(ast::Concat {
3649             span,
3650             asts: vec![Ast::alternation(ast::Alternation {
3651                 span,
3652                 asts: vec![],
3653             })],
3654         });
3655 
3656         let mut t = Translator::new();
3657         assert_eq!(Ok(Hir::fail()), t.translate("", &ast));
3658     }
3659 
3660     #[test]
regression_singleton_alt()3661     fn regression_singleton_alt() {
3662         use crate::{
3663             ast::{self, Ast},
3664             hir::Dot,
3665         };
3666 
3667         let span = Span::splat(Position::new(0, 0, 0));
3668         let ast = Ast::concat(ast::Concat {
3669             span,
3670             asts: vec![Ast::alternation(ast::Alternation {
3671                 span,
3672                 asts: vec![Ast::dot(span)],
3673             })],
3674         });
3675 
3676         let mut t = Translator::new();
3677         assert_eq!(Ok(Hir::dot(Dot::AnyCharExceptLF)), t.translate("", &ast));
3678     }
3679 
3680     // See: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=63168
3681     #[test]
regression_fuzz_match()3682     fn regression_fuzz_match() {
3683         let pat = "[(\u{6} \0-\u{afdf5}]  \0 ";
3684         let ast = ParserBuilder::new()
3685             .octal(false)
3686             .ignore_whitespace(true)
3687             .build()
3688             .parse(pat)
3689             .unwrap();
3690         let hir = TranslatorBuilder::new()
3691             .utf8(true)
3692             .case_insensitive(false)
3693             .multi_line(false)
3694             .dot_matches_new_line(false)
3695             .swap_greed(true)
3696             .unicode(true)
3697             .build()
3698             .translate(pat, &ast)
3699             .unwrap();
3700         assert_eq!(
3701             hir,
3702             Hir::concat(vec![
3703                 hir_uclass(&[('\0', '\u{afdf5}')]),
3704                 hir_lit("\0"),
3705             ])
3706         );
3707     }
3708 
3709     // See: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=63155
3710     #[cfg(feature = "unicode")]
3711     #[test]
regression_fuzz_difference1()3712     fn regression_fuzz_difference1() {
3713         let pat = r"\W\W|\W[^\v--\W\W\P{Script_Extensions:Pau_Cin_Hau}\u10A1A1-\U{3E3E3}--~~~~--~~~~~~~~------~~~~~~--~~~~~~]*";
3714         let _ = t(pat); // shouldn't panic
3715     }
3716 
    // Regression test for a fuzzer-found input.
    //
    // See: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=63153
    #[test]
    fn regression_fuzz_char_decrement1() {
        // The pattern is kept exactly as recorded here; it is not meant to
        // be readable.
        let pat = "w[w[^w?\rw\rw[^w?\rw[^w?\rw[^w?\rw[^w?\rw[^w?\rw[^w?\r\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0w?\rw[^w?\rw[^w?\rw[^w\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\u{1}\0]\0\0\0\0\0\0\0\0\0*\0\0\u{1}\0]\0\0-*\0][^w?\rw[^w?\rw[^w?\rw[^w?\rw[^w?\rw[^w?\rw[^w\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\u{1}\0]\0\0\0\0\0\0\0\0\0x\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\0\0\0\0*??\0\u{7f}{2}\u{10}??\0\0\0\0\0\0\0\0\0\u{3}\0\0\0}\0-*\0]\0\0\0\0\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\0\0\u{1}\0]\0\u{1}\u{1}H-i]-]\0\0\0\0\u{1}\0]\0\0\0\u{1}\0]\0\0-*\0\0\0\0\u{1}9-\u{7f}]\0'|-\u{7f}]\0'|(?i-ux)[-\u{7f}]\0'\u{3}\0\0\0}\0-*\0]<D\0\0\0\0\0\0\u{1}]\0\0\0\0]\0\0-*\0]\0\0 ";
        // Only the absence of a panic is checked; the result is discarded.
        let _ = t(pat); // shouldn't panic
    }
3723 }
3724