xref: /aosp_15_r20/external/cronet/third_party/rust/chromium_crates_io/vendor/regex-syntax-0.8.3/src/hir/mod.rs (revision 6777b5387eb2ff775bb5750e3f5d96f37fb7352b)
1 /*!
2 Defines a high-level intermediate (HIR) representation for regular expressions.
3 
4 The HIR is represented by the [`Hir`] type, and it principally constructed via
5 [translation](translate) from an [`Ast`](crate::ast::Ast). Alternatively, users
6 may use the smart constructors defined on `Hir` to build their own by hand. The
7 smart constructors simultaneously simplify and "optimize" the HIR, and are also
8 the same routines used by translation.
9 
10 Most regex engines only have an HIR like this, and usually construct it
11 directly from the concrete syntax. This crate however first parses the
12 concrete syntax into an `Ast`, and only then creates the HIR from the `Ast`,
13 as mentioned above. It's done this way to facilitate better error reporting,
14 and to have a structured representation of a regex that faithfully represents
15 its concrete syntax. Namely, while an `Hir` value can be converted back to an
16 equivalent regex pattern string, it is unlikely to look like the original due
17 to its simplified structure.
18 */
19 
20 use core::{char, cmp};
21 
22 use alloc::{
23     boxed::Box,
24     format,
25     string::{String, ToString},
26     vec,
27     vec::Vec,
28 };
29 
30 use crate::{
31     ast::Span,
32     hir::interval::{Interval, IntervalSet, IntervalSetIter},
33     unicode,
34 };
35 
36 pub use crate::{
37     hir::visitor::{visit, Visitor},
38     unicode::CaseFoldError,
39 };
40 
41 mod interval;
42 pub mod literal;
43 pub mod print;
44 pub mod translate;
45 mod visitor;
46 
47 /// An error that can occur while translating an `Ast` to a `Hir`.
48 #[derive(Clone, Debug, Eq, PartialEq)]
49 pub struct Error {
50     /// The kind of error.
51     kind: ErrorKind,
52     /// The original pattern that the translator's Ast was parsed from. Every
53     /// span in an error is a valid range into this string.
54     pattern: String,
55     /// The span of this error, derived from the Ast given to the translator.
56     span: Span,
57 }
58 
59 impl Error {
60     /// Return the type of this error.
kind(&self) -> &ErrorKind61     pub fn kind(&self) -> &ErrorKind {
62         &self.kind
63     }
64 
65     /// The original pattern string in which this error occurred.
66     ///
67     /// Every span reported by this error is reported in terms of this string.
pattern(&self) -> &str68     pub fn pattern(&self) -> &str {
69         &self.pattern
70     }
71 
72     /// Return the span at which this error occurred.
span(&self) -> &Span73     pub fn span(&self) -> &Span {
74         &self.span
75     }
76 }
77 
78 /// The type of an error that occurred while building an `Hir`.
79 ///
80 /// This error type is marked as `non_exhaustive`. This means that adding a
81 /// new variant is not considered a breaking change.
82 #[non_exhaustive]
83 #[derive(Clone, Debug, Eq, PartialEq)]
84 pub enum ErrorKind {
85     /// This error occurs when a Unicode feature is used when Unicode
86     /// support is disabled. For example `(?-u:\pL)` would trigger this error.
87     UnicodeNotAllowed,
88     /// This error occurs when translating a pattern that could match a byte
89     /// sequence that isn't UTF-8 and `utf8` was enabled.
90     InvalidUtf8,
91     /// This error occurs when one uses a non-ASCII byte for a line terminator,
92     /// but where Unicode mode is enabled and UTF-8 mode is disabled.
93     InvalidLineTerminator,
94     /// This occurs when an unrecognized Unicode property name could not
95     /// be found.
96     UnicodePropertyNotFound,
97     /// This occurs when an unrecognized Unicode property value could not
98     /// be found.
99     UnicodePropertyValueNotFound,
100     /// This occurs when a Unicode-aware Perl character class (`\w`, `\s` or
101     /// `\d`) could not be found. This can occur when the `unicode-perl`
102     /// crate feature is not enabled.
103     UnicodePerlClassNotFound,
104     /// This occurs when the Unicode simple case mapping tables are not
105     /// available, and the regular expression required Unicode aware case
106     /// insensitivity.
107     UnicodeCaseUnavailable,
108 }
109 
110 #[cfg(feature = "std")]
111 impl std::error::Error for Error {}
112 
113 impl core::fmt::Display for Error {
fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result114     fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
115         crate::error::Formatter::from(self).fmt(f)
116     }
117 }
118 
119 impl core::fmt::Display for ErrorKind {
fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result120     fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
121         use self::ErrorKind::*;
122 
123         let msg = match *self {
124             UnicodeNotAllowed => "Unicode not allowed here",
125             InvalidUtf8 => "pattern can match invalid UTF-8",
126             InvalidLineTerminator => "invalid line terminator, must be ASCII",
127             UnicodePropertyNotFound => "Unicode property not found",
128             UnicodePropertyValueNotFound => "Unicode property value not found",
129             UnicodePerlClassNotFound => {
130                 "Unicode-aware Perl class not found \
131                  (make sure the unicode-perl feature is enabled)"
132             }
133             UnicodeCaseUnavailable => {
134                 "Unicode-aware case insensitivity matching is not available \
135                  (make sure the unicode-case feature is enabled)"
136             }
137         };
138         f.write_str(msg)
139     }
140 }
141 
142 /// A high-level intermediate representation (HIR) for a regular expression.
143 ///
144 /// An HIR value is a combination of a [`HirKind`] and a set of [`Properties`].
145 /// An `HirKind` indicates what kind of regular expression it is (a literal,
146 /// a repetition, a look-around assertion, etc.), where as a `Properties`
147 /// describes various facts about the regular expression. For example, whether
148 /// it matches UTF-8 or if it matches the empty string.
149 ///
150 /// The HIR of a regular expression represents an intermediate step between
151 /// its abstract syntax (a structured description of the concrete syntax) and
152 /// an actual regex matcher. The purpose of HIR is to make regular expressions
153 /// easier to analyze. In particular, the AST is much more complex than the
154 /// HIR. For example, while an AST supports arbitrarily nested character
155 /// classes, the HIR will flatten all nested classes into a single set. The HIR
156 /// will also "compile away" every flag present in the concrete syntax. For
157 /// example, users of HIR expressions never need to worry about case folding;
158 /// it is handled automatically by the translator (e.g., by translating
159 /// `(?i:A)` to `[aA]`).
160 ///
161 /// The specific type of an HIR expression can be accessed via its `kind`
162 /// or `into_kind` methods. This extra level of indirection exists for two
163 /// reasons:
164 ///
165 /// 1. Construction of an HIR expression *must* use the constructor methods on
166 /// this `Hir` type instead of building the `HirKind` values directly. This
167 /// permits construction to enforce invariants like "concatenations always
168 /// consist of two or more sub-expressions."
169 /// 2. Every HIR expression contains attributes that are defined inductively,
170 /// and can be computed cheaply during the construction process. For example,
171 /// one such attribute is whether the expression must match at the beginning of
172 /// the haystack.
173 ///
174 /// In particular, if you have an `HirKind` value, then there is intentionally
175 /// no way to build an `Hir` value from it. You instead need to do case
176 /// analysis on the `HirKind` value and build the `Hir` value using its smart
177 /// constructors.
178 ///
179 /// # UTF-8
180 ///
181 /// If the HIR was produced by a translator with
182 /// [`TranslatorBuilder::utf8`](translate::TranslatorBuilder::utf8) enabled,
183 /// then the HIR is guaranteed to match UTF-8 exclusively for all non-empty
184 /// matches.
185 ///
186 /// For empty matches, those can occur at any position. It is the
187 /// responsibility of the regex engine to determine whether empty matches are
188 /// permitted between the code units of a single codepoint.
189 ///
190 /// # Stack space
191 ///
192 /// This type defines its own destructor that uses constant stack space and
193 /// heap space proportional to the size of the HIR.
194 ///
195 /// Also, an `Hir`'s `fmt::Display` implementation prints an HIR as a regular
196 /// expression pattern string, and uses constant stack space and heap space
197 /// proportional to the size of the `Hir`. The regex it prints is guaranteed to
198 /// be _semantically_ equivalent to the original concrete syntax, but it may
199 /// look very different. (And potentially not practically readable by a human.)
200 ///
201 /// An `Hir`'s `fmt::Debug` implementation currently does not use constant
202 /// stack space. The implementation will also suppress some details (such as
203 /// the `Properties` inlined into every `Hir` value to make it less noisy).
204 #[derive(Clone, Eq, PartialEq)]
205 pub struct Hir {
206     /// The underlying HIR kind.
207     kind: HirKind,
208     /// Analysis info about this HIR, computed during construction.
209     props: Properties,
210 }
211 
212 /// Methods for accessing the underlying `HirKind` and `Properties`.
213 impl Hir {
214     /// Returns a reference to the underlying HIR kind.
kind(&self) -> &HirKind215     pub fn kind(&self) -> &HirKind {
216         &self.kind
217     }
218 
219     /// Consumes ownership of this HIR expression and returns its underlying
220     /// `HirKind`.
into_kind(mut self) -> HirKind221     pub fn into_kind(mut self) -> HirKind {
222         core::mem::replace(&mut self.kind, HirKind::Empty)
223     }
224 
225     /// Returns the properties computed for this `Hir`.
properties(&self) -> &Properties226     pub fn properties(&self) -> &Properties {
227         &self.props
228     }
229 
230     /// Splits this HIR into its constituent parts.
231     ///
232     /// This is useful because `let Hir { kind, props } = hir;` does not work
233     /// because of `Hir`'s custom `Drop` implementation.
into_parts(mut self) -> (HirKind, Properties)234     fn into_parts(mut self) -> (HirKind, Properties) {
235         (
236             core::mem::replace(&mut self.kind, HirKind::Empty),
237             core::mem::replace(&mut self.props, Properties::empty()),
238         )
239     }
240 }
241 
242 /// Smart constructors for HIR values.
243 ///
244 /// These constructors are called "smart" because they do inductive work or
245 /// simplifications. For example, calling `Hir::repetition` with a repetition
246 /// like `a{0}` will actually return a `Hir` with a `HirKind::Empty` kind
247 /// since it is equivalent to an empty regex. Another example is calling
248 /// `Hir::concat(vec![expr])`. Instead of getting a `HirKind::Concat`, you'll
249 /// just get back the original `expr` since it's precisely equivalent.
250 ///
251 /// Smart constructors enable maintaining invariants about the HIR data type
252 /// while also simulanteously keeping the representation as simple as possible.
253 impl Hir {
254     /// Returns an empty HIR expression.
255     ///
256     /// An empty HIR expression always matches, including the empty string.
257     #[inline]
empty() -> Hir258     pub fn empty() -> Hir {
259         let props = Properties::empty();
260         Hir { kind: HirKind::Empty, props }
261     }
262 
263     /// Returns an HIR expression that can never match anything. That is,
264     /// the size of the set of strings in the language described by the HIR
265     /// returned is `0`.
266     ///
267     /// This is distinct from [`Hir::empty`] in that the empty string matches
268     /// the HIR returned by `Hir::empty`. That is, the set of strings in the
269     /// language describe described by `Hir::empty` is non-empty.
270     ///
271     /// Note that currently, the HIR returned uses an empty character class to
272     /// indicate that nothing can match. An equivalent expression that cannot
273     /// match is an empty alternation, but all such "fail" expressions are
274     /// normalized (via smart constructors) to empty character classes. This is
275     /// because empty character classes can be spelled in the concrete syntax
276     /// of a regex (e.g., `\P{any}` or `(?-u:[^\x00-\xFF])` or `[a&&b]`), but
277     /// empty alternations cannot.
278     #[inline]
fail() -> Hir279     pub fn fail() -> Hir {
280         let class = Class::Bytes(ClassBytes::empty());
281         let props = Properties::class(&class);
282         // We can't just call Hir::class here because it defers to Hir::fail
283         // in order to canonicalize the Hir value used to represent "cannot
284         // match."
285         Hir { kind: HirKind::Class(class), props }
286     }
287 
288     /// Creates a literal HIR expression.
289     ///
290     /// This accepts anything that can be converted into a `Box<[u8]>`.
291     ///
292     /// Note that there is no mechanism for storing a `char` or a `Box<str>`
293     /// in an HIR. Everything is "just bytes." Whether a `Literal` (or
294     /// any HIR node) matches valid UTF-8 exclusively can be queried via
295     /// [`Properties::is_utf8`].
296     ///
297     /// # Example
298     ///
299     /// This example shows that concatenations of `Literal` HIR values will
300     /// automatically get flattened and combined together. So for example, even
301     /// if you concat multiple `Literal` values that are themselves not valid
302     /// UTF-8, they might add up to valid UTF-8. This also demonstrates just
303     /// how "smart" Hir's smart constructors are.
304     ///
305     /// ```
306     /// use regex_syntax::hir::{Hir, HirKind, Literal};
307     ///
308     /// let literals = vec![
309     ///     Hir::literal([0xE2]),
310     ///     Hir::literal([0x98]),
311     ///     Hir::literal([0x83]),
312     /// ];
313     /// // Each literal, on its own, is invalid UTF-8.
314     /// assert!(literals.iter().all(|hir| !hir.properties().is_utf8()));
315     ///
316     /// let concat = Hir::concat(literals);
317     /// // But the concatenation is valid UTF-8!
318     /// assert!(concat.properties().is_utf8());
319     ///
320     /// // And also notice that the literals have been concatenated into a
321     /// // single `Literal`, to the point where there is no explicit `Concat`!
322     /// let expected = HirKind::Literal(Literal(Box::from("☃".as_bytes())));
323     /// assert_eq!(&expected, concat.kind());
324     /// ```
325     ///
326     /// # Example: building a literal from a `char`
327     ///
328     /// This example shows how to build a single `Hir` literal from a `char`
329     /// value. Since a [`Literal`] is just bytes, we just need to UTF-8
330     /// encode a `char` value:
331     ///
332     /// ```
333     /// use regex_syntax::hir::{Hir, HirKind, Literal};
334     ///
335     /// let ch = '☃';
336     /// let got = Hir::literal(ch.encode_utf8(&mut [0; 4]).as_bytes());
337     ///
338     /// let expected = HirKind::Literal(Literal(Box::from("☃".as_bytes())));
339     /// assert_eq!(&expected, got.kind());
340     /// ```
341     #[inline]
literal<B: Into<Box<[u8]>>>(lit: B) -> Hir342     pub fn literal<B: Into<Box<[u8]>>>(lit: B) -> Hir {
343         let bytes = lit.into();
344         if bytes.is_empty() {
345             return Hir::empty();
346         }
347 
348         let lit = Literal(bytes);
349         let props = Properties::literal(&lit);
350         Hir { kind: HirKind::Literal(lit), props }
351     }
352 
353     /// Creates a class HIR expression. The class may either be defined over
354     /// ranges of Unicode codepoints or ranges of raw byte values.
355     ///
356     /// Note that an empty class is permitted. An empty class is equivalent to
357     /// `Hir::fail()`.
358     #[inline]
class(class: Class) -> Hir359     pub fn class(class: Class) -> Hir {
360         if class.is_empty() {
361             return Hir::fail();
362         } else if let Some(bytes) = class.literal() {
363             return Hir::literal(bytes);
364         }
365         let props = Properties::class(&class);
366         Hir { kind: HirKind::Class(class), props }
367     }
368 
369     /// Creates a look-around assertion HIR expression.
370     #[inline]
look(look: Look) -> Hir371     pub fn look(look: Look) -> Hir {
372         let props = Properties::look(look);
373         Hir { kind: HirKind::Look(look), props }
374     }
375 
376     /// Creates a repetition HIR expression.
377     #[inline]
repetition(mut rep: Repetition) -> Hir378     pub fn repetition(mut rep: Repetition) -> Hir {
379         // If the sub-expression of a repetition can only match the empty
380         // string, then we force its maximum to be at most 1.
381         if rep.sub.properties().maximum_len() == Some(0) {
382             rep.min = cmp::min(rep.min, 1);
383             rep.max = rep.max.map(|n| cmp::min(n, 1)).or(Some(1));
384         }
385         // The regex 'a{0}' is always equivalent to the empty regex. This is
386         // true even when 'a' is an expression that never matches anything
387         // (like '\P{any}').
388         //
389         // Additionally, the regex 'a{1}' is always equivalent to 'a'.
390         if rep.min == 0 && rep.max == Some(0) {
391             return Hir::empty();
392         } else if rep.min == 1 && rep.max == Some(1) {
393             return *rep.sub;
394         }
395         let props = Properties::repetition(&rep);
396         Hir { kind: HirKind::Repetition(rep), props }
397     }
398 
399     /// Creates a capture HIR expression.
400     ///
401     /// Note that there is no explicit HIR value for a non-capturing group.
402     /// Since a non-capturing group only exists to override precedence in the
403     /// concrete syntax and since an HIR already does its own grouping based on
404     /// what is parsed, there is no need to explicitly represent non-capturing
405     /// groups in the HIR.
406     #[inline]
capture(capture: Capture) -> Hir407     pub fn capture(capture: Capture) -> Hir {
408         let props = Properties::capture(&capture);
409         Hir { kind: HirKind::Capture(capture), props }
410     }
411 
412     /// Returns the concatenation of the given expressions.
413     ///
414     /// This attempts to flatten and simplify the concatenation as appropriate.
415     ///
416     /// # Example
417     ///
418     /// This shows a simple example of basic flattening of both concatenations
419     /// and literals.
420     ///
421     /// ```
422     /// use regex_syntax::hir::Hir;
423     ///
424     /// let hir = Hir::concat(vec![
425     ///     Hir::concat(vec![
426     ///         Hir::literal([b'a']),
427     ///         Hir::literal([b'b']),
428     ///         Hir::literal([b'c']),
429     ///     ]),
430     ///     Hir::concat(vec![
431     ///         Hir::literal([b'x']),
432     ///         Hir::literal([b'y']),
433     ///         Hir::literal([b'z']),
434     ///     ]),
435     /// ]);
436     /// let expected = Hir::literal("abcxyz".as_bytes());
437     /// assert_eq!(expected, hir);
438     /// ```
concat(subs: Vec<Hir>) -> Hir439     pub fn concat(subs: Vec<Hir>) -> Hir {
440         // We rebuild the concatenation by simplifying it. Would be nice to do
441         // it in place, but that seems a little tricky?
442         let mut new = vec![];
443         // This gobbles up any adjacent literals in a concatenation and smushes
444         // them together. Basically, when we see a literal, we add its bytes
445         // to 'prior_lit', and whenever we see anything else, we first take
446         // any bytes in 'prior_lit' and add it to the 'new' concatenation.
447         let mut prior_lit: Option<Vec<u8>> = None;
448         for sub in subs {
449             let (kind, props) = sub.into_parts();
450             match kind {
451                 HirKind::Literal(Literal(bytes)) => {
452                     if let Some(ref mut prior_bytes) = prior_lit {
453                         prior_bytes.extend_from_slice(&bytes);
454                     } else {
455                         prior_lit = Some(bytes.to_vec());
456                     }
457                 }
458                 // We also flatten concats that are direct children of another
459                 // concat. We only need to do this one level deep since
460                 // Hir::concat is the only way to build concatenations, and so
461                 // flattening happens inductively.
462                 HirKind::Concat(subs2) => {
463                     for sub2 in subs2 {
464                         let (kind2, props2) = sub2.into_parts();
465                         match kind2 {
466                             HirKind::Literal(Literal(bytes)) => {
467                                 if let Some(ref mut prior_bytes) = prior_lit {
468                                     prior_bytes.extend_from_slice(&bytes);
469                                 } else {
470                                     prior_lit = Some(bytes.to_vec());
471                                 }
472                             }
473                             kind2 => {
474                                 if let Some(prior_bytes) = prior_lit.take() {
475                                     new.push(Hir::literal(prior_bytes));
476                                 }
477                                 new.push(Hir { kind: kind2, props: props2 });
478                             }
479                         }
480                     }
481                 }
482                 // We can just skip empty HIRs.
483                 HirKind::Empty => {}
484                 kind => {
485                     if let Some(prior_bytes) = prior_lit.take() {
486                         new.push(Hir::literal(prior_bytes));
487                     }
488                     new.push(Hir { kind, props });
489                 }
490             }
491         }
492         if let Some(prior_bytes) = prior_lit.take() {
493             new.push(Hir::literal(prior_bytes));
494         }
495         if new.is_empty() {
496             return Hir::empty();
497         } else if new.len() == 1 {
498             return new.pop().unwrap();
499         }
500         let props = Properties::concat(&new);
501         Hir { kind: HirKind::Concat(new), props }
502     }
503 
504     /// Returns the alternation of the given expressions.
505     ///
506     /// This flattens and simplifies the alternation as appropriate. This may
507     /// include factoring out common prefixes or even rewriting the alternation
508     /// as a character class.
509     ///
510     /// Note that an empty alternation is equivalent to `Hir::fail()`. (It
511     /// is not possible for one to write an empty alternation, or even an
512     /// alternation with a single sub-expression, in the concrete syntax of a
513     /// regex.)
514     ///
515     /// # Example
516     ///
517     /// This is a simple example showing how an alternation might get
518     /// simplified.
519     ///
520     /// ```
521     /// use regex_syntax::hir::{Hir, Class, ClassUnicode, ClassUnicodeRange};
522     ///
523     /// let hir = Hir::alternation(vec![
524     ///     Hir::literal([b'a']),
525     ///     Hir::literal([b'b']),
526     ///     Hir::literal([b'c']),
527     ///     Hir::literal([b'd']),
528     ///     Hir::literal([b'e']),
529     ///     Hir::literal([b'f']),
530     /// ]);
531     /// let expected = Hir::class(Class::Unicode(ClassUnicode::new([
532     ///     ClassUnicodeRange::new('a', 'f'),
533     /// ])));
534     /// assert_eq!(expected, hir);
535     /// ```
536     ///
537     /// And another example showing how common prefixes might get factored
538     /// out.
539     ///
540     /// ```
541     /// use regex_syntax::hir::{Hir, Class, ClassUnicode, ClassUnicodeRange};
542     ///
543     /// let hir = Hir::alternation(vec![
544     ///     Hir::concat(vec![
545     ///         Hir::literal("abc".as_bytes()),
546     ///         Hir::class(Class::Unicode(ClassUnicode::new([
547     ///             ClassUnicodeRange::new('A', 'Z'),
548     ///         ]))),
549     ///     ]),
550     ///     Hir::concat(vec![
551     ///         Hir::literal("abc".as_bytes()),
552     ///         Hir::class(Class::Unicode(ClassUnicode::new([
553     ///             ClassUnicodeRange::new('a', 'z'),
554     ///         ]))),
555     ///     ]),
556     /// ]);
557     /// let expected = Hir::concat(vec![
558     ///     Hir::literal("abc".as_bytes()),
559     ///     Hir::alternation(vec![
560     ///         Hir::class(Class::Unicode(ClassUnicode::new([
561     ///             ClassUnicodeRange::new('A', 'Z'),
562     ///         ]))),
563     ///         Hir::class(Class::Unicode(ClassUnicode::new([
564     ///             ClassUnicodeRange::new('a', 'z'),
565     ///         ]))),
566     ///     ]),
567     /// ]);
568     /// assert_eq!(expected, hir);
569     /// ```
570     ///
571     /// Note that these sorts of simplifications are not guaranteed.
alternation(subs: Vec<Hir>) -> Hir572     pub fn alternation(subs: Vec<Hir>) -> Hir {
573         // We rebuild the alternation by simplifying it. We proceed similarly
574         // as the concatenation case. But in this case, there's no literal
575         // simplification happening. We're just flattening alternations.
576         let mut new = Vec::with_capacity(subs.len());
577         for sub in subs {
578             let (kind, props) = sub.into_parts();
579             match kind {
580                 HirKind::Alternation(subs2) => {
581                     new.extend(subs2);
582                 }
583                 kind => {
584                     new.push(Hir { kind, props });
585                 }
586             }
587         }
588         if new.is_empty() {
589             return Hir::fail();
590         } else if new.len() == 1 {
591             return new.pop().unwrap();
592         }
593         // Now that it's completely flattened, look for the special case of
594         // 'char1|char2|...|charN' and collapse that into a class. Note that
595         // we look for 'char' first and then bytes. The issue here is that if
596         // we find both non-ASCII codepoints and non-ASCII singleton bytes,
597         // then it isn't actually possible to smush them into a single class.
598         // (Because classes are either "all codepoints" or "all bytes." You
599         // can have a class that both matches non-ASCII but valid UTF-8 and
600         // invalid UTF-8.) So we look for all chars and then all bytes, and
601         // don't handle anything else.
602         if let Some(singletons) = singleton_chars(&new) {
603             let it = singletons
604                 .into_iter()
605                 .map(|ch| ClassUnicodeRange { start: ch, end: ch });
606             return Hir::class(Class::Unicode(ClassUnicode::new(it)));
607         }
608         if let Some(singletons) = singleton_bytes(&new) {
609             let it = singletons
610                 .into_iter()
611                 .map(|b| ClassBytesRange { start: b, end: b });
612             return Hir::class(Class::Bytes(ClassBytes::new(it)));
613         }
614         // Similar to singleton chars, we can also look for alternations of
615         // classes. Those can be smushed into a single class.
616         if let Some(cls) = class_chars(&new) {
617             return Hir::class(cls);
618         }
619         if let Some(cls) = class_bytes(&new) {
620             return Hir::class(cls);
621         }
622         // Factor out a common prefix if we can, which might potentially
623         // simplify the expression and unlock other optimizations downstream.
624         // It also might generally make NFA matching and DFA construction
625         // faster by reducing the scope of branching in the regex.
626         new = match lift_common_prefix(new) {
627             Ok(hir) => return hir,
628             Err(unchanged) => unchanged,
629         };
630         let props = Properties::alternation(&new);
631         Hir { kind: HirKind::Alternation(new), props }
632     }
633 
634     /// Returns an HIR expression for `.`.
635     ///
636     /// * [`Dot::AnyChar`] maps to `(?su-R:.)`.
637     /// * [`Dot::AnyByte`] maps to `(?s-Ru:.)`.
638     /// * [`Dot::AnyCharExceptLF`] maps to `(?u-Rs:.)`.
639     /// * [`Dot::AnyCharExceptCRLF`] maps to `(?Ru-s:.)`.
640     /// * [`Dot::AnyByteExceptLF`] maps to `(?-Rsu:.)`.
641     /// * [`Dot::AnyByteExceptCRLF`] maps to `(?R-su:.)`.
642     ///
643     /// # Example
644     ///
645     /// Note that this is a convenience routine for constructing the correct
646     /// character class based on the value of `Dot`. There is no explicit "dot"
647     /// HIR value. It is just an abbreviation for a common character class.
648     ///
649     /// ```
650     /// use regex_syntax::hir::{Hir, Dot, Class, ClassBytes, ClassBytesRange};
651     ///
652     /// let hir = Hir::dot(Dot::AnyByte);
653     /// let expected = Hir::class(Class::Bytes(ClassBytes::new([
654     ///     ClassBytesRange::new(0x00, 0xFF),
655     /// ])));
656     /// assert_eq!(expected, hir);
657     /// ```
658     #[inline]
dot(dot: Dot) -> Hir659     pub fn dot(dot: Dot) -> Hir {
660         match dot {
661             Dot::AnyChar => {
662                 let mut cls = ClassUnicode::empty();
663                 cls.push(ClassUnicodeRange::new('\0', '\u{10FFFF}'));
664                 Hir::class(Class::Unicode(cls))
665             }
666             Dot::AnyByte => {
667                 let mut cls = ClassBytes::empty();
668                 cls.push(ClassBytesRange::new(b'\0', b'\xFF'));
669                 Hir::class(Class::Bytes(cls))
670             }
671             Dot::AnyCharExcept(ch) => {
672                 let mut cls =
673                     ClassUnicode::new([ClassUnicodeRange::new(ch, ch)]);
674                 cls.negate();
675                 Hir::class(Class::Unicode(cls))
676             }
677             Dot::AnyCharExceptLF => {
678                 let mut cls = ClassUnicode::empty();
679                 cls.push(ClassUnicodeRange::new('\0', '\x09'));
680                 cls.push(ClassUnicodeRange::new('\x0B', '\u{10FFFF}'));
681                 Hir::class(Class::Unicode(cls))
682             }
683             Dot::AnyCharExceptCRLF => {
684                 let mut cls = ClassUnicode::empty();
685                 cls.push(ClassUnicodeRange::new('\0', '\x09'));
686                 cls.push(ClassUnicodeRange::new('\x0B', '\x0C'));
687                 cls.push(ClassUnicodeRange::new('\x0E', '\u{10FFFF}'));
688                 Hir::class(Class::Unicode(cls))
689             }
690             Dot::AnyByteExcept(byte) => {
691                 let mut cls =
692                     ClassBytes::new([ClassBytesRange::new(byte, byte)]);
693                 cls.negate();
694                 Hir::class(Class::Bytes(cls))
695             }
696             Dot::AnyByteExceptLF => {
697                 let mut cls = ClassBytes::empty();
698                 cls.push(ClassBytesRange::new(b'\0', b'\x09'));
699                 cls.push(ClassBytesRange::new(b'\x0B', b'\xFF'));
700                 Hir::class(Class::Bytes(cls))
701             }
702             Dot::AnyByteExceptCRLF => {
703                 let mut cls = ClassBytes::empty();
704                 cls.push(ClassBytesRange::new(b'\0', b'\x09'));
705                 cls.push(ClassBytesRange::new(b'\x0B', b'\x0C'));
706                 cls.push(ClassBytesRange::new(b'\x0E', b'\xFF'));
707                 Hir::class(Class::Bytes(cls))
708             }
709         }
710     }
711 }
712 
713 /// The underlying kind of an arbitrary [`Hir`] expression.
714 ///
715 /// An `HirKind` is principally useful for doing case analysis on the type
716 /// of a regular expression. If you're looking to build new `Hir` values,
717 /// then you _must_ use the smart constructors defined on `Hir`, like
718 /// [`Hir::repetition`], to build new `Hir` values. The API intentionally does
719 /// not expose any way of building an `Hir` directly from an `HirKind`.
720 #[derive(Clone, Debug, Eq, PartialEq)]
721 pub enum HirKind {
722     /// The empty regular expression, which matches everything, including the
723     /// empty string.
724     Empty,
725     /// A literalstring that matches exactly these bytes.
726     Literal(Literal),
727     /// A single character class that matches any of the characters in the
728     /// class. A class can either consist of Unicode scalar values as
729     /// characters, or it can use bytes.
730     ///
731     /// A class may be empty. In which case, it matches nothing.
732     Class(Class),
733     /// A look-around assertion. A look-around match always has zero length.
734     Look(Look),
735     /// A repetition operation applied to a sub-expression.
736     Repetition(Repetition),
737     /// A capturing group, which contains a sub-expression.
738     Capture(Capture),
739     /// A concatenation of expressions.
740     ///
741     /// A concatenation matches only if each of its sub-expressions match one
742     /// after the other.
743     ///
744     /// Concatenations are guaranteed by `Hir`'s smart constructors to always
745     /// have at least two sub-expressions.
746     Concat(Vec<Hir>),
747     /// An alternation of expressions.
748     ///
749     /// An alternation matches only if at least one of its sub-expressions
750     /// match. If multiple sub-expressions match, then the leftmost is
751     /// preferred.
752     ///
753     /// Alternations are guaranteed by `Hir`'s smart constructors to always
754     /// have at least two sub-expressions.
755     Alternation(Vec<Hir>),
756 }
757 
758 impl HirKind {
759     /// Returns a slice of this kind's sub-expressions, if any.
subs(&self) -> &[Hir]760     pub fn subs(&self) -> &[Hir] {
761         use core::slice::from_ref;
762 
763         match *self {
764             HirKind::Empty
765             | HirKind::Literal(_)
766             | HirKind::Class(_)
767             | HirKind::Look(_) => &[],
768             HirKind::Repetition(Repetition { ref sub, .. }) => from_ref(sub),
769             HirKind::Capture(Capture { ref sub, .. }) => from_ref(sub),
770             HirKind::Concat(ref subs) => subs,
771             HirKind::Alternation(ref subs) => subs,
772         }
773     }
774 }
775 
776 impl core::fmt::Debug for Hir {
fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result777     fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
778         self.kind.fmt(f)
779     }
780 }
781 
782 /// Print a display representation of this Hir.
783 ///
784 /// The result of this is a valid regular expression pattern string.
785 ///
786 /// This implementation uses constant stack space and heap space proportional
787 /// to the size of the `Hir`.
788 impl core::fmt::Display for Hir {
fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result789     fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
790         crate::hir::print::Printer::new().print(self, f)
791     }
792 }
793 
794 /// The high-level intermediate representation of a literal.
795 ///
796 /// A literal corresponds to `0` or more bytes that should be matched
797 /// literally. The smart constructors defined on `Hir` will automatically
798 /// concatenate adjacent literals into one literal, and will even automatically
799 /// replace empty literals with `Hir::empty()`.
800 ///
801 /// Note that despite a literal being represented by a sequence of bytes, its
802 /// `Debug` implementation will attempt to print it as a normal string. (That
803 /// is, not a sequence of decimal numbers.)
804 #[derive(Clone, Eq, PartialEq)]
805 pub struct Literal(pub Box<[u8]>);
806 
807 impl core::fmt::Debug for Literal {
fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result808     fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
809         crate::debug::Bytes(&self.0).fmt(f)
810     }
811 }
812 
813 /// The high-level intermediate representation of a character class.
814 ///
815 /// A character class corresponds to a set of characters. A character is either
816 /// defined by a Unicode scalar value or a byte.
817 ///
818 /// A character class, regardless of its character type, is represented by a
819 /// sequence of non-overlapping non-adjacent ranges of characters.
820 ///
821 /// There are no guarantees about which class variant is used. Generally
822 /// speaking, the Unicode variat is used whenever a class needs to contain
823 /// non-ASCII Unicode scalar values. But the Unicode variant can be used even
824 /// when Unicode mode is disabled. For example, at the time of writing, the
825 /// regex `(?-u:a|\xc2\xa0)` will compile down to HIR for the Unicode class
826 /// `[a\u00A0]` due to optimizations.
827 ///
828 /// Note that `Bytes` variant may be produced even when it exclusively matches
829 /// valid UTF-8. This is because a `Bytes` variant represents an intention by
830 /// the author of the regular expression to disable Unicode mode, which in turn
831 /// impacts the semantics of case insensitive matching. For example, `(?i)k`
832 /// and `(?i-u)k` will not match the same set of strings.
833 #[derive(Clone, Eq, PartialEq)]
834 pub enum Class {
835     /// A set of characters represented by Unicode scalar values.
836     Unicode(ClassUnicode),
837     /// A set of characters represented by arbitrary bytes (one byte per
838     /// character).
839     Bytes(ClassBytes),
840 }
841 
842 impl Class {
843     /// Apply Unicode simple case folding to this character class, in place.
844     /// The character class will be expanded to include all simple case folded
845     /// character variants.
846     ///
847     /// If this is a byte oriented character class, then this will be limited
848     /// to the ASCII ranges `A-Z` and `a-z`.
849     ///
850     /// # Panics
851     ///
852     /// This routine panics when the case mapping data necessary for this
853     /// routine to complete is unavailable. This occurs when the `unicode-case`
854     /// feature is not enabled and the underlying class is Unicode oriented.
855     ///
856     /// Callers should prefer using `try_case_fold_simple` instead, which will
857     /// return an error instead of panicking.
case_fold_simple(&mut self)858     pub fn case_fold_simple(&mut self) {
859         match *self {
860             Class::Unicode(ref mut x) => x.case_fold_simple(),
861             Class::Bytes(ref mut x) => x.case_fold_simple(),
862         }
863     }
864 
865     /// Apply Unicode simple case folding to this character class, in place.
866     /// The character class will be expanded to include all simple case folded
867     /// character variants.
868     ///
869     /// If this is a byte oriented character class, then this will be limited
870     /// to the ASCII ranges `A-Z` and `a-z`.
871     ///
872     /// # Error
873     ///
874     /// This routine returns an error when the case mapping data necessary
875     /// for this routine to complete is unavailable. This occurs when the
876     /// `unicode-case` feature is not enabled and the underlying class is
877     /// Unicode oriented.
try_case_fold_simple( &mut self, ) -> core::result::Result<(), CaseFoldError>878     pub fn try_case_fold_simple(
879         &mut self,
880     ) -> core::result::Result<(), CaseFoldError> {
881         match *self {
882             Class::Unicode(ref mut x) => x.try_case_fold_simple()?,
883             Class::Bytes(ref mut x) => x.case_fold_simple(),
884         }
885         Ok(())
886     }
887 
888     /// Negate this character class in place.
889     ///
890     /// After completion, this character class will contain precisely the
891     /// characters that weren't previously in the class.
negate(&mut self)892     pub fn negate(&mut self) {
893         match *self {
894             Class::Unicode(ref mut x) => x.negate(),
895             Class::Bytes(ref mut x) => x.negate(),
896         }
897     }
898 
899     /// Returns true if and only if this character class will only ever match
900     /// valid UTF-8.
901     ///
902     /// A character class can match invalid UTF-8 only when the following
903     /// conditions are met:
904     ///
905     /// 1. The translator was configured to permit generating an expression
906     ///    that can match invalid UTF-8. (By default, this is disabled.)
907     /// 2. Unicode mode (via the `u` flag) was disabled either in the concrete
908     ///    syntax or in the parser builder. By default, Unicode mode is
909     ///    enabled.
is_utf8(&self) -> bool910     pub fn is_utf8(&self) -> bool {
911         match *self {
912             Class::Unicode(_) => true,
913             Class::Bytes(ref x) => x.is_ascii(),
914         }
915     }
916 
917     /// Returns the length, in bytes, of the smallest string matched by this
918     /// character class.
919     ///
920     /// For non-empty byte oriented classes, this always returns `1`. For
921     /// non-empty Unicode oriented classes, this can return `1`, `2`, `3` or
922     /// `4`. For empty classes, `None` is returned. It is impossible for `0` to
923     /// be returned.
924     ///
925     /// # Example
926     ///
927     /// This example shows some examples of regexes and their corresponding
928     /// minimum length, if any.
929     ///
930     /// ```
931     /// use regex_syntax::{hir::Properties, parse};
932     ///
933     /// // The empty string has a min length of 0.
934     /// let hir = parse(r"")?;
935     /// assert_eq!(Some(0), hir.properties().minimum_len());
936     /// // As do other types of regexes that only match the empty string.
937     /// let hir = parse(r"^$\b\B")?;
938     /// assert_eq!(Some(0), hir.properties().minimum_len());
939     /// // A regex that can match the empty string but match more is still 0.
940     /// let hir = parse(r"a*")?;
941     /// assert_eq!(Some(0), hir.properties().minimum_len());
942     /// // A regex that matches nothing has no minimum defined.
943     /// let hir = parse(r"[a&&b]")?;
944     /// assert_eq!(None, hir.properties().minimum_len());
945     /// // Character classes usually have a minimum length of 1.
946     /// let hir = parse(r"\w")?;
947     /// assert_eq!(Some(1), hir.properties().minimum_len());
948     /// // But sometimes Unicode classes might be bigger!
949     /// let hir = parse(r"\p{Cyrillic}")?;
950     /// assert_eq!(Some(2), hir.properties().minimum_len());
951     ///
952     /// # Ok::<(), Box<dyn std::error::Error>>(())
953     /// ```
minimum_len(&self) -> Option<usize>954     pub fn minimum_len(&self) -> Option<usize> {
955         match *self {
956             Class::Unicode(ref x) => x.minimum_len(),
957             Class::Bytes(ref x) => x.minimum_len(),
958         }
959     }
960 
961     /// Returns the length, in bytes, of the longest string matched by this
962     /// character class.
963     ///
964     /// For non-empty byte oriented classes, this always returns `1`. For
965     /// non-empty Unicode oriented classes, this can return `1`, `2`, `3` or
966     /// `4`. For empty classes, `None` is returned. It is impossible for `0` to
967     /// be returned.
968     ///
969     /// # Example
970     ///
971     /// This example shows some examples of regexes and their corresponding
972     /// maximum length, if any.
973     ///
974     /// ```
975     /// use regex_syntax::{hir::Properties, parse};
976     ///
977     /// // The empty string has a max length of 0.
978     /// let hir = parse(r"")?;
979     /// assert_eq!(Some(0), hir.properties().maximum_len());
980     /// // As do other types of regexes that only match the empty string.
981     /// let hir = parse(r"^$\b\B")?;
982     /// assert_eq!(Some(0), hir.properties().maximum_len());
983     /// // A regex that matches nothing has no maximum defined.
984     /// let hir = parse(r"[a&&b]")?;
985     /// assert_eq!(None, hir.properties().maximum_len());
986     /// // Bounded repeats work as you expect.
987     /// let hir = parse(r"x{2,10}")?;
988     /// assert_eq!(Some(10), hir.properties().maximum_len());
989     /// // An unbounded repeat means there is no maximum.
990     /// let hir = parse(r"x{2,}")?;
991     /// assert_eq!(None, hir.properties().maximum_len());
992     /// // With Unicode enabled, \w can match up to 4 bytes!
993     /// let hir = parse(r"\w")?;
994     /// assert_eq!(Some(4), hir.properties().maximum_len());
995     /// // Without Unicode enabled, \w matches at most 1 byte.
996     /// let hir = parse(r"(?-u)\w")?;
997     /// assert_eq!(Some(1), hir.properties().maximum_len());
998     ///
999     /// # Ok::<(), Box<dyn std::error::Error>>(())
1000     /// ```
maximum_len(&self) -> Option<usize>1001     pub fn maximum_len(&self) -> Option<usize> {
1002         match *self {
1003             Class::Unicode(ref x) => x.maximum_len(),
1004             Class::Bytes(ref x) => x.maximum_len(),
1005         }
1006     }
1007 
1008     /// Returns true if and only if this character class is empty. That is,
1009     /// it has no elements.
1010     ///
1011     /// An empty character can never match anything, including an empty string.
is_empty(&self) -> bool1012     pub fn is_empty(&self) -> bool {
1013         match *self {
1014             Class::Unicode(ref x) => x.ranges().is_empty(),
1015             Class::Bytes(ref x) => x.ranges().is_empty(),
1016         }
1017     }
1018 
1019     /// If this class consists of exactly one element (whether a codepoint or a
1020     /// byte), then return it as a literal byte string.
1021     ///
1022     /// If this class is empty or contains more than one element, then `None`
1023     /// is returned.
literal(&self) -> Option<Vec<u8>>1024     pub fn literal(&self) -> Option<Vec<u8>> {
1025         match *self {
1026             Class::Unicode(ref x) => x.literal(),
1027             Class::Bytes(ref x) => x.literal(),
1028         }
1029     }
1030 }
1031 
1032 impl core::fmt::Debug for Class {
fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result1033     fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
1034         use crate::debug::Byte;
1035 
1036         let mut fmter = f.debug_set();
1037         match *self {
1038             Class::Unicode(ref cls) => {
1039                 for r in cls.ranges().iter() {
1040                     fmter.entry(&(r.start..=r.end));
1041                 }
1042             }
1043             Class::Bytes(ref cls) => {
1044                 for r in cls.ranges().iter() {
1045                     fmter.entry(&(Byte(r.start)..=Byte(r.end)));
1046                 }
1047             }
1048         }
1049         fmter.finish()
1050     }
1051 }
1052 
1053 /// A set of characters represented by Unicode scalar values.
1054 #[derive(Clone, Debug, Eq, PartialEq)]
1055 pub struct ClassUnicode {
1056     set: IntervalSet<ClassUnicodeRange>,
1057 }
1058 
1059 impl ClassUnicode {
1060     /// Create a new class from a sequence of ranges.
1061     ///
1062     /// The given ranges do not need to be in any specific order, and ranges
1063     /// may overlap. Ranges will automatically be sorted into a canonical
1064     /// non-overlapping order.
new<I>(ranges: I) -> ClassUnicode where I: IntoIterator<Item = ClassUnicodeRange>,1065     pub fn new<I>(ranges: I) -> ClassUnicode
1066     where
1067         I: IntoIterator<Item = ClassUnicodeRange>,
1068     {
1069         ClassUnicode { set: IntervalSet::new(ranges) }
1070     }
1071 
1072     /// Create a new class with no ranges.
1073     ///
1074     /// An empty class matches nothing. That is, it is equivalent to
1075     /// [`Hir::fail`].
empty() -> ClassUnicode1076     pub fn empty() -> ClassUnicode {
1077         ClassUnicode::new(vec![])
1078     }
1079 
1080     /// Add a new range to this set.
push(&mut self, range: ClassUnicodeRange)1081     pub fn push(&mut self, range: ClassUnicodeRange) {
1082         self.set.push(range);
1083     }
1084 
1085     /// Return an iterator over all ranges in this class.
1086     ///
1087     /// The iterator yields ranges in ascending order.
iter(&self) -> ClassUnicodeIter<'_>1088     pub fn iter(&self) -> ClassUnicodeIter<'_> {
1089         ClassUnicodeIter(self.set.iter())
1090     }
1091 
1092     /// Return the underlying ranges as a slice.
ranges(&self) -> &[ClassUnicodeRange]1093     pub fn ranges(&self) -> &[ClassUnicodeRange] {
1094         self.set.intervals()
1095     }
1096 
1097     /// Expand this character class such that it contains all case folded
1098     /// characters, according to Unicode's "simple" mapping. For example, if
1099     /// this class consists of the range `a-z`, then applying case folding will
1100     /// result in the class containing both the ranges `a-z` and `A-Z`.
1101     ///
1102     /// # Panics
1103     ///
1104     /// This routine panics when the case mapping data necessary for this
1105     /// routine to complete is unavailable. This occurs when the `unicode-case`
1106     /// feature is not enabled.
1107     ///
1108     /// Callers should prefer using `try_case_fold_simple` instead, which will
1109     /// return an error instead of panicking.
case_fold_simple(&mut self)1110     pub fn case_fold_simple(&mut self) {
1111         self.set
1112             .case_fold_simple()
1113             .expect("unicode-case feature must be enabled");
1114     }
1115 
1116     /// Expand this character class such that it contains all case folded
1117     /// characters, according to Unicode's "simple" mapping. For example, if
1118     /// this class consists of the range `a-z`, then applying case folding will
1119     /// result in the class containing both the ranges `a-z` and `A-Z`.
1120     ///
1121     /// # Error
1122     ///
1123     /// This routine returns an error when the case mapping data necessary
1124     /// for this routine to complete is unavailable. This occurs when the
1125     /// `unicode-case` feature is not enabled.
try_case_fold_simple( &mut self, ) -> core::result::Result<(), CaseFoldError>1126     pub fn try_case_fold_simple(
1127         &mut self,
1128     ) -> core::result::Result<(), CaseFoldError> {
1129         self.set.case_fold_simple()
1130     }
1131 
1132     /// Negate this character class.
1133     ///
1134     /// For all `c` where `c` is a Unicode scalar value, if `c` was in this
1135     /// set, then it will not be in this set after negation.
negate(&mut self)1136     pub fn negate(&mut self) {
1137         self.set.negate();
1138     }
1139 
1140     /// Union this character class with the given character class, in place.
union(&mut self, other: &ClassUnicode)1141     pub fn union(&mut self, other: &ClassUnicode) {
1142         self.set.union(&other.set);
1143     }
1144 
1145     /// Intersect this character class with the given character class, in
1146     /// place.
intersect(&mut self, other: &ClassUnicode)1147     pub fn intersect(&mut self, other: &ClassUnicode) {
1148         self.set.intersect(&other.set);
1149     }
1150 
1151     /// Subtract the given character class from this character class, in place.
difference(&mut self, other: &ClassUnicode)1152     pub fn difference(&mut self, other: &ClassUnicode) {
1153         self.set.difference(&other.set);
1154     }
1155 
1156     /// Compute the symmetric difference of the given character classes, in
1157     /// place.
1158     ///
1159     /// This computes the symmetric difference of two character classes. This
1160     /// removes all elements in this class that are also in the given class,
1161     /// but all adds all elements from the given class that aren't in this
1162     /// class. That is, the class will contain all elements in either class,
1163     /// but will not contain any elements that are in both classes.
symmetric_difference(&mut self, other: &ClassUnicode)1164     pub fn symmetric_difference(&mut self, other: &ClassUnicode) {
1165         self.set.symmetric_difference(&other.set);
1166     }
1167 
1168     /// Returns true if and only if this character class will either match
1169     /// nothing or only ASCII bytes. Stated differently, this returns false
1170     /// if and only if this class contains a non-ASCII codepoint.
is_ascii(&self) -> bool1171     pub fn is_ascii(&self) -> bool {
1172         self.set.intervals().last().map_or(true, |r| r.end <= '\x7F')
1173     }
1174 
1175     /// Returns the length, in bytes, of the smallest string matched by this
1176     /// character class.
1177     ///
1178     /// Returns `None` when the class is empty.
minimum_len(&self) -> Option<usize>1179     pub fn minimum_len(&self) -> Option<usize> {
1180         let first = self.ranges().get(0)?;
1181         // Correct because c1 < c2 implies c1.len_utf8() < c2.len_utf8().
1182         Some(first.start.len_utf8())
1183     }
1184 
1185     /// Returns the length, in bytes, of the longest string matched by this
1186     /// character class.
1187     ///
1188     /// Returns `None` when the class is empty.
maximum_len(&self) -> Option<usize>1189     pub fn maximum_len(&self) -> Option<usize> {
1190         let last = self.ranges().last()?;
1191         // Correct because c1 < c2 implies c1.len_utf8() < c2.len_utf8().
1192         Some(last.end.len_utf8())
1193     }
1194 
1195     /// If this class consists of exactly one codepoint, then return it as
1196     /// a literal byte string.
1197     ///
1198     /// If this class is empty or contains more than one codepoint, then `None`
1199     /// is returned.
literal(&self) -> Option<Vec<u8>>1200     pub fn literal(&self) -> Option<Vec<u8>> {
1201         let rs = self.ranges();
1202         if rs.len() == 1 && rs[0].start == rs[0].end {
1203             Some(rs[0].start.encode_utf8(&mut [0; 4]).to_string().into_bytes())
1204         } else {
1205             None
1206         }
1207     }
1208 
1209     /// If this class consists of only ASCII ranges, then return its
1210     /// corresponding and equivalent byte class.
to_byte_class(&self) -> Option<ClassBytes>1211     pub fn to_byte_class(&self) -> Option<ClassBytes> {
1212         if !self.is_ascii() {
1213             return None;
1214         }
1215         Some(ClassBytes::new(self.ranges().iter().map(|r| {
1216             // Since we are guaranteed that our codepoint range is ASCII, the
1217             // 'u8::try_from' calls below are guaranteed to be correct.
1218             ClassBytesRange {
1219                 start: u8::try_from(r.start).unwrap(),
1220                 end: u8::try_from(r.end).unwrap(),
1221             }
1222         })))
1223     }
1224 }
1225 
1226 /// An iterator over all ranges in a Unicode character class.
1227 ///
1228 /// The lifetime `'a` refers to the lifetime of the underlying class.
1229 #[derive(Debug)]
1230 pub struct ClassUnicodeIter<'a>(IntervalSetIter<'a, ClassUnicodeRange>);
1231 
1232 impl<'a> Iterator for ClassUnicodeIter<'a> {
1233     type Item = &'a ClassUnicodeRange;
1234 
next(&mut self) -> Option<&'a ClassUnicodeRange>1235     fn next(&mut self) -> Option<&'a ClassUnicodeRange> {
1236         self.0.next()
1237     }
1238 }
1239 
1240 /// A single range of characters represented by Unicode scalar values.
1241 ///
1242 /// The range is closed. That is, the start and end of the range are included
1243 /// in the range.
1244 #[derive(Clone, Copy, Default, Eq, PartialEq, PartialOrd, Ord)]
1245 pub struct ClassUnicodeRange {
1246     start: char,
1247     end: char,
1248 }
1249 
1250 impl core::fmt::Debug for ClassUnicodeRange {
fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result1251     fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
1252         let start = if !self.start.is_whitespace() && !self.start.is_control()
1253         {
1254             self.start.to_string()
1255         } else {
1256             format!("0x{:X}", u32::from(self.start))
1257         };
1258         let end = if !self.end.is_whitespace() && !self.end.is_control() {
1259             self.end.to_string()
1260         } else {
1261             format!("0x{:X}", u32::from(self.end))
1262         };
1263         f.debug_struct("ClassUnicodeRange")
1264             .field("start", &start)
1265             .field("end", &end)
1266             .finish()
1267     }
1268 }
1269 
1270 impl Interval for ClassUnicodeRange {
1271     type Bound = char;
1272 
1273     #[inline]
lower(&self) -> char1274     fn lower(&self) -> char {
1275         self.start
1276     }
1277     #[inline]
upper(&self) -> char1278     fn upper(&self) -> char {
1279         self.end
1280     }
1281     #[inline]
set_lower(&mut self, bound: char)1282     fn set_lower(&mut self, bound: char) {
1283         self.start = bound;
1284     }
1285     #[inline]
set_upper(&mut self, bound: char)1286     fn set_upper(&mut self, bound: char) {
1287         self.end = bound;
1288     }
1289 
1290     /// Apply simple case folding to this Unicode scalar value range.
1291     ///
1292     /// Additional ranges are appended to the given vector. Canonical ordering
1293     /// is *not* maintained in the given vector.
case_fold_simple( &self, ranges: &mut Vec<ClassUnicodeRange>, ) -> Result<(), unicode::CaseFoldError>1294     fn case_fold_simple(
1295         &self,
1296         ranges: &mut Vec<ClassUnicodeRange>,
1297     ) -> Result<(), unicode::CaseFoldError> {
1298         let mut folder = unicode::SimpleCaseFolder::new()?;
1299         if !folder.overlaps(self.start, self.end) {
1300             return Ok(());
1301         }
1302         let (start, end) = (u32::from(self.start), u32::from(self.end));
1303         for cp in (start..=end).filter_map(char::from_u32) {
1304             for &cp_folded in folder.mapping(cp) {
1305                 ranges.push(ClassUnicodeRange::new(cp_folded, cp_folded));
1306             }
1307         }
1308         Ok(())
1309     }
1310 }
1311 
1312 impl ClassUnicodeRange {
1313     /// Create a new Unicode scalar value range for a character class.
1314     ///
1315     /// The returned range is always in a canonical form. That is, the range
1316     /// returned always satisfies the invariant that `start <= end`.
new(start: char, end: char) -> ClassUnicodeRange1317     pub fn new(start: char, end: char) -> ClassUnicodeRange {
1318         ClassUnicodeRange::create(start, end)
1319     }
1320 
1321     /// Return the start of this range.
1322     ///
1323     /// The start of a range is always less than or equal to the end of the
1324     /// range.
start(&self) -> char1325     pub fn start(&self) -> char {
1326         self.start
1327     }
1328 
1329     /// Return the end of this range.
1330     ///
1331     /// The end of a range is always greater than or equal to the start of the
1332     /// range.
end(&self) -> char1333     pub fn end(&self) -> char {
1334         self.end
1335     }
1336 
1337     /// Returns the number of codepoints in this range.
len(&self) -> usize1338     pub fn len(&self) -> usize {
1339         let diff = 1 + u32::from(self.end) - u32::from(self.start);
1340         // This is likely to panic in 16-bit targets since a usize can only fit
1341         // 2^16. It's not clear what to do here, other than to return an error
1342         // when building a Unicode class that contains a range whose length
1343         // overflows usize. (Which, to be honest, is probably quite common on
1344         // 16-bit targets. For example, this would imply that '.' and '\p{any}'
1345         // would be impossible to build.)
1346         usize::try_from(diff).expect("char class len fits in usize")
1347     }
1348 }
1349 
1350 /// A set of characters represented by arbitrary bytes.
1351 ///
1352 /// Each byte corresponds to one character.
1353 #[derive(Clone, Debug, Eq, PartialEq)]
1354 pub struct ClassBytes {
1355     set: IntervalSet<ClassBytesRange>,
1356 }
1357 
1358 impl ClassBytes {
1359     /// Create a new class from a sequence of ranges.
1360     ///
1361     /// The given ranges do not need to be in any specific order, and ranges
1362     /// may overlap. Ranges will automatically be sorted into a canonical
1363     /// non-overlapping order.
new<I>(ranges: I) -> ClassBytes where I: IntoIterator<Item = ClassBytesRange>,1364     pub fn new<I>(ranges: I) -> ClassBytes
1365     where
1366         I: IntoIterator<Item = ClassBytesRange>,
1367     {
1368         ClassBytes { set: IntervalSet::new(ranges) }
1369     }
1370 
1371     /// Create a new class with no ranges.
1372     ///
1373     /// An empty class matches nothing. That is, it is equivalent to
1374     /// [`Hir::fail`].
empty() -> ClassBytes1375     pub fn empty() -> ClassBytes {
1376         ClassBytes::new(vec![])
1377     }
1378 
1379     /// Add a new range to this set.
push(&mut self, range: ClassBytesRange)1380     pub fn push(&mut self, range: ClassBytesRange) {
1381         self.set.push(range);
1382     }
1383 
1384     /// Return an iterator over all ranges in this class.
1385     ///
1386     /// The iterator yields ranges in ascending order.
iter(&self) -> ClassBytesIter<'_>1387     pub fn iter(&self) -> ClassBytesIter<'_> {
1388         ClassBytesIter(self.set.iter())
1389     }
1390 
1391     /// Return the underlying ranges as a slice.
ranges(&self) -> &[ClassBytesRange]1392     pub fn ranges(&self) -> &[ClassBytesRange] {
1393         self.set.intervals()
1394     }
1395 
1396     /// Expand this character class such that it contains all case folded
1397     /// characters. For example, if this class consists of the range `a-z`,
1398     /// then applying case folding will result in the class containing both the
1399     /// ranges `a-z` and `A-Z`.
1400     ///
1401     /// Note that this only applies ASCII case folding, which is limited to the
1402     /// characters `a-z` and `A-Z`.
case_fold_simple(&mut self)1403     pub fn case_fold_simple(&mut self) {
1404         self.set.case_fold_simple().expect("ASCII case folding never fails");
1405     }
1406 
1407     /// Negate this byte class.
1408     ///
1409     /// For all `b` where `b` is a any byte, if `b` was in this set, then it
1410     /// will not be in this set after negation.
negate(&mut self)1411     pub fn negate(&mut self) {
1412         self.set.negate();
1413     }
1414 
1415     /// Union this byte class with the given byte class, in place.
union(&mut self, other: &ClassBytes)1416     pub fn union(&mut self, other: &ClassBytes) {
1417         self.set.union(&other.set);
1418     }
1419 
1420     /// Intersect this byte class with the given byte class, in place.
intersect(&mut self, other: &ClassBytes)1421     pub fn intersect(&mut self, other: &ClassBytes) {
1422         self.set.intersect(&other.set);
1423     }
1424 
1425     /// Subtract the given byte class from this byte class, in place.
difference(&mut self, other: &ClassBytes)1426     pub fn difference(&mut self, other: &ClassBytes) {
1427         self.set.difference(&other.set);
1428     }
1429 
1430     /// Compute the symmetric difference of the given byte classes, in place.
1431     ///
1432     /// This computes the symmetric difference of two byte classes. This
1433     /// removes all elements in this class that are also in the given class,
1434     /// but all adds all elements from the given class that aren't in this
1435     /// class. That is, the class will contain all elements in either class,
1436     /// but will not contain any elements that are in both classes.
symmetric_difference(&mut self, other: &ClassBytes)1437     pub fn symmetric_difference(&mut self, other: &ClassBytes) {
1438         self.set.symmetric_difference(&other.set);
1439     }
1440 
1441     /// Returns true if and only if this character class will either match
1442     /// nothing or only ASCII bytes. Stated differently, this returns false
1443     /// if and only if this class contains a non-ASCII byte.
is_ascii(&self) -> bool1444     pub fn is_ascii(&self) -> bool {
1445         self.set.intervals().last().map_or(true, |r| r.end <= 0x7F)
1446     }
1447 
1448     /// Returns the length, in bytes, of the smallest string matched by this
1449     /// character class.
1450     ///
1451     /// Returns `None` when the class is empty.
minimum_len(&self) -> Option<usize>1452     pub fn minimum_len(&self) -> Option<usize> {
1453         if self.ranges().is_empty() {
1454             None
1455         } else {
1456             Some(1)
1457         }
1458     }
1459 
1460     /// Returns the length, in bytes, of the longest string matched by this
1461     /// character class.
1462     ///
1463     /// Returns `None` when the class is empty.
maximum_len(&self) -> Option<usize>1464     pub fn maximum_len(&self) -> Option<usize> {
1465         if self.ranges().is_empty() {
1466             None
1467         } else {
1468             Some(1)
1469         }
1470     }
1471 
1472     /// If this class consists of exactly one byte, then return it as
1473     /// a literal byte string.
1474     ///
1475     /// If this class is empty or contains more than one byte, then `None`
1476     /// is returned.
literal(&self) -> Option<Vec<u8>>1477     pub fn literal(&self) -> Option<Vec<u8>> {
1478         let rs = self.ranges();
1479         if rs.len() == 1 && rs[0].start == rs[0].end {
1480             Some(vec![rs[0].start])
1481         } else {
1482             None
1483         }
1484     }
1485 
1486     /// If this class consists of only ASCII ranges, then return its
1487     /// corresponding and equivalent Unicode class.
to_unicode_class(&self) -> Option<ClassUnicode>1488     pub fn to_unicode_class(&self) -> Option<ClassUnicode> {
1489         if !self.is_ascii() {
1490             return None;
1491         }
1492         Some(ClassUnicode::new(self.ranges().iter().map(|r| {
1493             // Since we are guaranteed that our byte range is ASCII, the
1494             // 'char::from' calls below are correct and will not erroneously
1495             // convert a raw byte value into its corresponding codepoint.
1496             ClassUnicodeRange {
1497                 start: char::from(r.start),
1498                 end: char::from(r.end),
1499             }
1500         })))
1501     }
1502 }
1503 
1504 /// An iterator over all ranges in a byte character class.
1505 ///
1506 /// The lifetime `'a` refers to the lifetime of the underlying class.
1507 #[derive(Debug)]
1508 pub struct ClassBytesIter<'a>(IntervalSetIter<'a, ClassBytesRange>);
1509 
1510 impl<'a> Iterator for ClassBytesIter<'a> {
1511     type Item = &'a ClassBytesRange;
1512 
next(&mut self) -> Option<&'a ClassBytesRange>1513     fn next(&mut self) -> Option<&'a ClassBytesRange> {
1514         self.0.next()
1515     }
1516 }
1517 
1518 /// A single range of characters represented by arbitrary bytes.
1519 ///
1520 /// The range is closed. That is, the start and end of the range are included
1521 /// in the range.
1522 #[derive(Clone, Copy, Default, Eq, PartialEq, PartialOrd, Ord)]
1523 pub struct ClassBytesRange {
1524     start: u8,
1525     end: u8,
1526 }
1527 
1528 impl Interval for ClassBytesRange {
1529     type Bound = u8;
1530 
1531     #[inline]
lower(&self) -> u81532     fn lower(&self) -> u8 {
1533         self.start
1534     }
1535     #[inline]
upper(&self) -> u81536     fn upper(&self) -> u8 {
1537         self.end
1538     }
1539     #[inline]
set_lower(&mut self, bound: u8)1540     fn set_lower(&mut self, bound: u8) {
1541         self.start = bound;
1542     }
1543     #[inline]
set_upper(&mut self, bound: u8)1544     fn set_upper(&mut self, bound: u8) {
1545         self.end = bound;
1546     }
1547 
1548     /// Apply simple case folding to this byte range. Only ASCII case mappings
1549     /// (for a-z) are applied.
1550     ///
1551     /// Additional ranges are appended to the given vector. Canonical ordering
1552     /// is *not* maintained in the given vector.
case_fold_simple( &self, ranges: &mut Vec<ClassBytesRange>, ) -> Result<(), unicode::CaseFoldError>1553     fn case_fold_simple(
1554         &self,
1555         ranges: &mut Vec<ClassBytesRange>,
1556     ) -> Result<(), unicode::CaseFoldError> {
1557         if !ClassBytesRange::new(b'a', b'z').is_intersection_empty(self) {
1558             let lower = cmp::max(self.start, b'a');
1559             let upper = cmp::min(self.end, b'z');
1560             ranges.push(ClassBytesRange::new(lower - 32, upper - 32));
1561         }
1562         if !ClassBytesRange::new(b'A', b'Z').is_intersection_empty(self) {
1563             let lower = cmp::max(self.start, b'A');
1564             let upper = cmp::min(self.end, b'Z');
1565             ranges.push(ClassBytesRange::new(lower + 32, upper + 32));
1566         }
1567         Ok(())
1568     }
1569 }
1570 
1571 impl ClassBytesRange {
1572     /// Create a new byte range for a character class.
1573     ///
1574     /// The returned range is always in a canonical form. That is, the range
1575     /// returned always satisfies the invariant that `start <= end`.
new(start: u8, end: u8) -> ClassBytesRange1576     pub fn new(start: u8, end: u8) -> ClassBytesRange {
1577         ClassBytesRange::create(start, end)
1578     }
1579 
1580     /// Return the start of this range.
1581     ///
1582     /// The start of a range is always less than or equal to the end of the
1583     /// range.
start(&self) -> u81584     pub fn start(&self) -> u8 {
1585         self.start
1586     }
1587 
1588     /// Return the end of this range.
1589     ///
1590     /// The end of a range is always greater than or equal to the start of the
1591     /// range.
end(&self) -> u81592     pub fn end(&self) -> u8 {
1593         self.end
1594     }
1595 
1596     /// Returns the number of bytes in this range.
len(&self) -> usize1597     pub fn len(&self) -> usize {
1598         usize::from(self.end.checked_sub(self.start).unwrap())
1599             .checked_add(1)
1600             .unwrap()
1601     }
1602 }
1603 
1604 impl core::fmt::Debug for ClassBytesRange {
fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result1605     fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
1606         f.debug_struct("ClassBytesRange")
1607             .field("start", &crate::debug::Byte(self.start))
1608             .field("end", &crate::debug::Byte(self.end))
1609             .finish()
1610     }
1611 }
1612 
1613 /// The high-level intermediate representation for a look-around assertion.
1614 ///
1615 /// An assertion match is always zero-length. Also called an "empty match."
1616 #[derive(Clone, Copy, Debug, Eq, PartialEq)]
1617 pub enum Look {
1618     /// Match the beginning of text. Specifically, this matches at the starting
1619     /// position of the input.
1620     Start = 1 << 0,
1621     /// Match the end of text. Specifically, this matches at the ending
1622     /// position of the input.
1623     End = 1 << 1,
1624     /// Match the beginning of a line or the beginning of text. Specifically,
1625     /// this matches at the starting position of the input, or at the position
1626     /// immediately following a `\n` character.
1627     StartLF = 1 << 2,
1628     /// Match the end of a line or the end of text. Specifically, this matches
1629     /// at the end position of the input, or at the position immediately
1630     /// preceding a `\n` character.
1631     EndLF = 1 << 3,
1632     /// Match the beginning of a line or the beginning of text. Specifically,
1633     /// this matches at the starting position of the input, or at the position
1634     /// immediately following either a `\r` or `\n` character, but never after
1635     /// a `\r` when a `\n` follows.
1636     StartCRLF = 1 << 4,
1637     /// Match the end of a line or the end of text. Specifically, this matches
1638     /// at the end position of the input, or at the position immediately
1639     /// preceding a `\r` or `\n` character, but never before a `\n` when a `\r`
1640     /// precedes it.
1641     EndCRLF = 1 << 5,
1642     /// Match an ASCII-only word boundary. That is, this matches a position
1643     /// where the left adjacent character and right adjacent character
1644     /// correspond to a word and non-word or a non-word and word character.
1645     WordAscii = 1 << 6,
1646     /// Match an ASCII-only negation of a word boundary.
1647     WordAsciiNegate = 1 << 7,
1648     /// Match a Unicode-aware word boundary. That is, this matches a position
1649     /// where the left adjacent character and right adjacent character
1650     /// correspond to a word and non-word or a non-word and word character.
1651     WordUnicode = 1 << 8,
1652     /// Match a Unicode-aware negation of a word boundary.
1653     WordUnicodeNegate = 1 << 9,
1654     /// Match the start of an ASCII-only word boundary. That is, this matches a
1655     /// position at either the beginning of the haystack or where the previous
1656     /// character is not a word character and the following character is a word
1657     /// character.
1658     WordStartAscii = 1 << 10,
1659     /// Match the end of an ASCII-only word boundary. That is, this matches
1660     /// a position at either the end of the haystack or where the previous
1661     /// character is a word character and the following character is not a word
1662     /// character.
1663     WordEndAscii = 1 << 11,
1664     /// Match the start of a Unicode word boundary. That is, this matches a
1665     /// position at either the beginning of the haystack or where the previous
1666     /// character is not a word character and the following character is a word
1667     /// character.
1668     WordStartUnicode = 1 << 12,
1669     /// Match the end of a Unicode word boundary. That is, this matches a
1670     /// position at either the end of the haystack or where the previous
1671     /// character is a word character and the following character is not a word
1672     /// character.
1673     WordEndUnicode = 1 << 13,
1674     /// Match the start half of an ASCII-only word boundary. That is, this
1675     /// matches a position at either the beginning of the haystack or where the
1676     /// previous character is not a word character.
1677     WordStartHalfAscii = 1 << 14,
1678     /// Match the end half of an ASCII-only word boundary. That is, this
1679     /// matches a position at either the end of the haystack or where the
1680     /// following character is not a word character.
1681     WordEndHalfAscii = 1 << 15,
1682     /// Match the start half of a Unicode word boundary. That is, this matches
1683     /// a position at either the beginning of the haystack or where the
1684     /// previous character is not a word character.
1685     WordStartHalfUnicode = 1 << 16,
1686     /// Match the end half of a Unicode word boundary. That is, this matches
1687     /// a position at either the end of the haystack or where the following
1688     /// character is not a word character.
1689     WordEndHalfUnicode = 1 << 17,
1690 }
1691 
1692 impl Look {
1693     /// Flip the look-around assertion to its equivalent for reverse searches.
1694     /// For example, `StartLF` gets translated to `EndLF`.
1695     ///
1696     /// Some assertions, such as `WordUnicode`, remain the same since they
1697     /// match the same positions regardless of the direction of the search.
1698     #[inline]
reversed(self) -> Look1699     pub const fn reversed(self) -> Look {
1700         match self {
1701             Look::Start => Look::End,
1702             Look::End => Look::Start,
1703             Look::StartLF => Look::EndLF,
1704             Look::EndLF => Look::StartLF,
1705             Look::StartCRLF => Look::EndCRLF,
1706             Look::EndCRLF => Look::StartCRLF,
1707             Look::WordAscii => Look::WordAscii,
1708             Look::WordAsciiNegate => Look::WordAsciiNegate,
1709             Look::WordUnicode => Look::WordUnicode,
1710             Look::WordUnicodeNegate => Look::WordUnicodeNegate,
1711             Look::WordStartAscii => Look::WordEndAscii,
1712             Look::WordEndAscii => Look::WordStartAscii,
1713             Look::WordStartUnicode => Look::WordEndUnicode,
1714             Look::WordEndUnicode => Look::WordStartUnicode,
1715             Look::WordStartHalfAscii => Look::WordEndHalfAscii,
1716             Look::WordEndHalfAscii => Look::WordStartHalfAscii,
1717             Look::WordStartHalfUnicode => Look::WordEndHalfUnicode,
1718             Look::WordEndHalfUnicode => Look::WordStartHalfUnicode,
1719         }
1720     }
1721 
1722     /// Return the underlying representation of this look-around enumeration
1723     /// as an integer. Giving the return value to the [`Look::from_repr`]
1724     /// constructor is guaranteed to return the same look-around variant that
1725     /// one started with within a semver compatible release of this crate.
1726     #[inline]
as_repr(self) -> u321727     pub const fn as_repr(self) -> u32 {
1728         // AFAIK, 'as' is the only way to zero-cost convert an int enum to an
1729         // actual int.
1730         self as u32
1731     }
1732 
1733     /// Given the underlying representation of a `Look` value, return the
1734     /// corresponding `Look` value if the representation is valid. Otherwise
1735     /// `None` is returned.
1736     #[inline]
from_repr(repr: u32) -> Option<Look>1737     pub const fn from_repr(repr: u32) -> Option<Look> {
1738         match repr {
1739             0b00_0000_0000_0000_0001 => Some(Look::Start),
1740             0b00_0000_0000_0000_0010 => Some(Look::End),
1741             0b00_0000_0000_0000_0100 => Some(Look::StartLF),
1742             0b00_0000_0000_0000_1000 => Some(Look::EndLF),
1743             0b00_0000_0000_0001_0000 => Some(Look::StartCRLF),
1744             0b00_0000_0000_0010_0000 => Some(Look::EndCRLF),
1745             0b00_0000_0000_0100_0000 => Some(Look::WordAscii),
1746             0b00_0000_0000_1000_0000 => Some(Look::WordAsciiNegate),
1747             0b00_0000_0001_0000_0000 => Some(Look::WordUnicode),
1748             0b00_0000_0010_0000_0000 => Some(Look::WordUnicodeNegate),
1749             0b00_0000_0100_0000_0000 => Some(Look::WordStartAscii),
1750             0b00_0000_1000_0000_0000 => Some(Look::WordEndAscii),
1751             0b00_0001_0000_0000_0000 => Some(Look::WordStartUnicode),
1752             0b00_0010_0000_0000_0000 => Some(Look::WordEndUnicode),
1753             0b00_0100_0000_0000_0000 => Some(Look::WordStartHalfAscii),
1754             0b00_1000_0000_0000_0000 => Some(Look::WordEndHalfAscii),
1755             0b01_0000_0000_0000_0000 => Some(Look::WordStartHalfUnicode),
1756             0b10_0000_0000_0000_0000 => Some(Look::WordEndHalfUnicode),
1757             _ => None,
1758         }
1759     }
1760 
1761     /// Returns a convenient single codepoint representation of this
1762     /// look-around assertion. Each assertion is guaranteed to be represented
1763     /// by a distinct character.
1764     ///
1765     /// This is useful for succinctly representing a look-around assertion in
1766     /// human friendly but succinct output intended for a programmer working on
1767     /// regex internals.
1768     #[inline]
as_char(self) -> char1769     pub const fn as_char(self) -> char {
1770         match self {
1771             Look::Start => 'A',
1772             Look::End => 'z',
1773             Look::StartLF => '^',
1774             Look::EndLF => '$',
1775             Look::StartCRLF => 'r',
1776             Look::EndCRLF => 'R',
1777             Look::WordAscii => 'b',
1778             Look::WordAsciiNegate => 'B',
1779             Look::WordUnicode => '��',
1780             Look::WordUnicodeNegate => '��',
1781             Look::WordStartAscii => '<',
1782             Look::WordEndAscii => '>',
1783             Look::WordStartUnicode => '〈',
1784             Look::WordEndUnicode => '〉',
1785             Look::WordStartHalfAscii => '◁',
1786             Look::WordEndHalfAscii => '▷',
1787             Look::WordStartHalfUnicode => '◀',
1788             Look::WordEndHalfUnicode => '▶',
1789         }
1790     }
1791 }
1792 
1793 /// The high-level intermediate representation for a capturing group.
1794 ///
1795 /// A capturing group always has an index and a child expression. It may
1796 /// also have a name associated with it (e.g., `(?P<foo>\w)`), but it's not
1797 /// necessary.
1798 ///
1799 /// Note that there is no explicit representation of a non-capturing group
1800 /// in a `Hir`. Instead, non-capturing grouping is handled automatically by
1801 /// the recursive structure of the `Hir` itself.
1802 #[derive(Clone, Debug, Eq, PartialEq)]
1803 pub struct Capture {
1804     /// The capture index of the capture.
1805     pub index: u32,
1806     /// The name of the capture, if it exists.
1807     pub name: Option<Box<str>>,
1808     /// The expression inside the capturing group, which may be empty.
1809     pub sub: Box<Hir>,
1810 }
1811 
1812 /// The high-level intermediate representation of a repetition operator.
1813 ///
1814 /// A repetition operator permits the repetition of an arbitrary
1815 /// sub-expression.
1816 #[derive(Clone, Debug, Eq, PartialEq)]
1817 pub struct Repetition {
1818     /// The minimum range of the repetition.
1819     ///
1820     /// Note that special cases like `?`, `+` and `*` all get translated into
1821     /// the ranges `{0,1}`, `{1,}` and `{0,}`, respectively.
1822     ///
1823     /// When `min` is zero, this expression can match the empty string
1824     /// regardless of what its sub-expression is.
1825     pub min: u32,
1826     /// The maximum range of the repetition.
1827     ///
1828     /// Note that when `max` is `None`, `min` acts as a lower bound but where
1829     /// there is no upper bound. For something like `x{5}` where the min and
1830     /// max are equivalent, `min` will be set to `5` and `max` will be set to
1831     /// `Some(5)`.
1832     pub max: Option<u32>,
1833     /// Whether this repetition operator is greedy or not. A greedy operator
1834     /// will match as much as it can. A non-greedy operator will match as
1835     /// little as it can.
1836     ///
1837     /// Typically, operators are greedy by default and are only non-greedy when
1838     /// a `?` suffix is used, e.g., `(expr)*` is greedy while `(expr)*?` is
1839     /// not. However, this can be inverted via the `U` "ungreedy" flag.
1840     pub greedy: bool,
1841     /// The expression being repeated.
1842     pub sub: Box<Hir>,
1843 }
1844 
1845 impl Repetition {
1846     /// Returns a new repetition with the same `min`, `max` and `greedy`
1847     /// values, but with its sub-expression replaced with the one given.
with(&self, sub: Hir) -> Repetition1848     pub fn with(&self, sub: Hir) -> Repetition {
1849         Repetition {
1850             min: self.min,
1851             max: self.max,
1852             greedy: self.greedy,
1853             sub: Box::new(sub),
1854         }
1855     }
1856 }
1857 
1858 /// A type describing the different flavors of `.`.
1859 ///
1860 /// This type is meant to be used with [`Hir::dot`], which is a convenience
1861 /// routine for building HIR values derived from the `.` regex.
1862 #[non_exhaustive]
1863 #[derive(Clone, Copy, Debug, Eq, PartialEq)]
1864 pub enum Dot {
1865     /// Matches the UTF-8 encoding of any Unicode scalar value.
1866     ///
1867     /// This is equivalent to `(?su:.)` and also `\p{any}`.
1868     AnyChar,
1869     /// Matches any byte value.
1870     ///
1871     /// This is equivalent to `(?s-u:.)` and also `(?-u:[\x00-\xFF])`.
1872     AnyByte,
1873     /// Matches the UTF-8 encoding of any Unicode scalar value except for the
1874     /// `char` given.
1875     ///
1876     /// This is equivalent to using `(?u-s:.)` with the line terminator set
1877     /// to a particular ASCII byte. (Because of peculiarities in the regex
1878     /// engines, a line terminator must be a single byte. It follows that when
1879     /// UTF-8 mode is enabled, this single byte must also be a Unicode scalar
1880     /// value. That is, ti must be ASCII.)
1881     ///
1882     /// (This and `AnyCharExceptLF` both exist because of legacy reasons.
1883     /// `AnyCharExceptLF` will be dropped in the next breaking change release.)
1884     AnyCharExcept(char),
1885     /// Matches the UTF-8 encoding of any Unicode scalar value except for `\n`.
1886     ///
1887     /// This is equivalent to `(?u-s:.)` and also `[\p{any}--\n]`.
1888     AnyCharExceptLF,
1889     /// Matches the UTF-8 encoding of any Unicode scalar value except for `\r`
1890     /// and `\n`.
1891     ///
1892     /// This is equivalent to `(?uR-s:.)` and also `[\p{any}--\r\n]`.
1893     AnyCharExceptCRLF,
1894     /// Matches any byte value except for the `u8` given.
1895     ///
1896     /// This is equivalent to using `(?-us:.)` with the line terminator set
1897     /// to a particular ASCII byte. (Because of peculiarities in the regex
1898     /// engines, a line terminator must be a single byte. It follows that when
1899     /// UTF-8 mode is enabled, this single byte must also be a Unicode scalar
1900     /// value. That is, ti must be ASCII.)
1901     ///
1902     /// (This and `AnyByteExceptLF` both exist because of legacy reasons.
1903     /// `AnyByteExceptLF` will be dropped in the next breaking change release.)
1904     AnyByteExcept(u8),
1905     /// Matches any byte value except for `\n`.
1906     ///
1907     /// This is equivalent to `(?-su:.)` and also `(?-u:[[\x00-\xFF]--\n])`.
1908     AnyByteExceptLF,
1909     /// Matches any byte value except for `\r` and `\n`.
1910     ///
1911     /// This is equivalent to `(?R-su:.)` and also `(?-u:[[\x00-\xFF]--\r\n])`.
1912     AnyByteExceptCRLF,
1913 }
1914 
1915 /// A custom `Drop` impl is used for `HirKind` such that it uses constant stack
1916 /// space but heap space proportional to the depth of the total `Hir`.
1917 impl Drop for Hir {
drop(&mut self)1918     fn drop(&mut self) {
1919         use core::mem;
1920 
1921         match *self.kind() {
1922             HirKind::Empty
1923             | HirKind::Literal(_)
1924             | HirKind::Class(_)
1925             | HirKind::Look(_) => return,
1926             HirKind::Capture(ref x) if x.sub.kind.subs().is_empty() => return,
1927             HirKind::Repetition(ref x) if x.sub.kind.subs().is_empty() => {
1928                 return
1929             }
1930             HirKind::Concat(ref x) if x.is_empty() => return,
1931             HirKind::Alternation(ref x) if x.is_empty() => return,
1932             _ => {}
1933         }
1934 
1935         let mut stack = vec![mem::replace(self, Hir::empty())];
1936         while let Some(mut expr) = stack.pop() {
1937             match expr.kind {
1938                 HirKind::Empty
1939                 | HirKind::Literal(_)
1940                 | HirKind::Class(_)
1941                 | HirKind::Look(_) => {}
1942                 HirKind::Capture(ref mut x) => {
1943                     stack.push(mem::replace(&mut x.sub, Hir::empty()));
1944                 }
1945                 HirKind::Repetition(ref mut x) => {
1946                     stack.push(mem::replace(&mut x.sub, Hir::empty()));
1947                 }
1948                 HirKind::Concat(ref mut x) => {
1949                     stack.extend(x.drain(..));
1950                 }
1951                 HirKind::Alternation(ref mut x) => {
1952                     stack.extend(x.drain(..));
1953                 }
1954             }
1955         }
1956     }
1957 }
1958 
1959 /// A type that collects various properties of an HIR value.
1960 ///
1961 /// Properties are always scalar values and represent meta data that is
1962 /// computed inductively on an HIR value. Properties are defined for all
1963 /// HIR values.
1964 ///
1965 /// All methods on a `Properties` value take constant time and are meant to
1966 /// be cheap to call.
1967 #[derive(Clone, Debug, Eq, PartialEq)]
1968 pub struct Properties(Box<PropertiesI>);
1969 
1970 /// The property definition. It is split out so that we can box it, and
1971 /// there by make `Properties` use less stack size. This is kind-of important
1972 /// because every HIR value has a `Properties` attached to it.
1973 ///
1974 /// This does have the unfortunate consequence that creating any HIR value
1975 /// always leads to at least one alloc for properties, but this is generally
1976 /// true anyway (for pretty much all HirKinds except for look-arounds).
1977 #[derive(Clone, Debug, Eq, PartialEq)]
1978 struct PropertiesI {
1979     minimum_len: Option<usize>,
1980     maximum_len: Option<usize>,
1981     look_set: LookSet,
1982     look_set_prefix: LookSet,
1983     look_set_suffix: LookSet,
1984     look_set_prefix_any: LookSet,
1985     look_set_suffix_any: LookSet,
1986     utf8: bool,
1987     explicit_captures_len: usize,
1988     static_explicit_captures_len: Option<usize>,
1989     literal: bool,
1990     alternation_literal: bool,
1991 }
1992 
1993 impl Properties {
1994     /// Returns the length (in bytes) of the smallest string matched by this
1995     /// HIR.
1996     ///
1997     /// A return value of `0` is possible and occurs when the HIR can match an
1998     /// empty string.
1999     ///
2000     /// `None` is returned when there is no minimum length. This occurs in
2001     /// precisely the cases where the HIR matches nothing. i.e., The language
2002     /// the regex matches is empty. An example of such a regex is `\P{any}`.
2003     #[inline]
minimum_len(&self) -> Option<usize>2004     pub fn minimum_len(&self) -> Option<usize> {
2005         self.0.minimum_len
2006     }
2007 
2008     /// Returns the length (in bytes) of the longest string matched by this
2009     /// HIR.
2010     ///
2011     /// A return value of `0` is possible and occurs when nothing longer than
2012     /// the empty string is in the language described by this HIR.
2013     ///
2014     /// `None` is returned when there is no longest matching string. This
2015     /// occurs when the HIR matches nothing or when there is no upper bound on
2016     /// the length of matching strings. Example of such regexes are `\P{any}`
2017     /// (matches nothing) and `a+` (has no upper bound).
2018     #[inline]
maximum_len(&self) -> Option<usize>2019     pub fn maximum_len(&self) -> Option<usize> {
2020         self.0.maximum_len
2021     }
2022 
2023     /// Returns a set of all look-around assertions that appear at least once
2024     /// in this HIR value.
2025     #[inline]
look_set(&self) -> LookSet2026     pub fn look_set(&self) -> LookSet {
2027         self.0.look_set
2028     }
2029 
2030     /// Returns a set of all look-around assertions that appear as a prefix for
2031     /// this HIR value. That is, the set returned corresponds to the set of
2032     /// assertions that must be passed before matching any bytes in a haystack.
2033     ///
2034     /// For example, `hir.look_set_prefix().contains(Look::Start)` returns true
2035     /// if and only if the HIR is fully anchored at the start.
2036     #[inline]
look_set_prefix(&self) -> LookSet2037     pub fn look_set_prefix(&self) -> LookSet {
2038         self.0.look_set_prefix
2039     }
2040 
2041     /// Returns a set of all look-around assertions that appear as a _possible_
2042     /// prefix for this HIR value. That is, the set returned corresponds to the
2043     /// set of assertions that _may_ be passed before matching any bytes in a
2044     /// haystack.
2045     ///
2046     /// For example, `hir.look_set_prefix_any().contains(Look::Start)` returns
2047     /// true if and only if it's possible for the regex to match through a
2048     /// anchored assertion before consuming any input.
2049     #[inline]
look_set_prefix_any(&self) -> LookSet2050     pub fn look_set_prefix_any(&self) -> LookSet {
2051         self.0.look_set_prefix_any
2052     }
2053 
2054     /// Returns a set of all look-around assertions that appear as a suffix for
2055     /// this HIR value. That is, the set returned corresponds to the set of
2056     /// assertions that must be passed in order to be considered a match after
2057     /// all other consuming HIR expressions.
2058     ///
2059     /// For example, `hir.look_set_suffix().contains(Look::End)` returns true
2060     /// if and only if the HIR is fully anchored at the end.
2061     #[inline]
look_set_suffix(&self) -> LookSet2062     pub fn look_set_suffix(&self) -> LookSet {
2063         self.0.look_set_suffix
2064     }
2065 
2066     /// Returns a set of all look-around assertions that appear as a _possible_
2067     /// suffix for this HIR value. That is, the set returned corresponds to the
2068     /// set of assertions that _may_ be passed before matching any bytes in a
2069     /// haystack.
2070     ///
2071     /// For example, `hir.look_set_suffix_any().contains(Look::End)` returns
2072     /// true if and only if it's possible for the regex to match through a
2073     /// anchored assertion at the end of a match without consuming any input.
2074     #[inline]
look_set_suffix_any(&self) -> LookSet2075     pub fn look_set_suffix_any(&self) -> LookSet {
2076         self.0.look_set_suffix_any
2077     }
2078 
2079     /// Return true if and only if the corresponding HIR will always match
2080     /// valid UTF-8.
2081     ///
2082     /// When this returns false, then it is possible for this HIR expression to
2083     /// match invalid UTF-8, including by matching between the code units of
2084     /// a single UTF-8 encoded codepoint.
2085     ///
2086     /// Note that this returns true even when the corresponding HIR can match
2087     /// the empty string. Since an empty string can technically appear between
2088     /// UTF-8 code units, it is possible for a match to be reported that splits
2089     /// a codepoint which could in turn be considered matching invalid UTF-8.
2090     /// However, it is generally assumed that such empty matches are handled
2091     /// specially by the search routine if it is absolutely required that
2092     /// matches not split a codepoint.
2093     ///
2094     /// # Example
2095     ///
2096     /// This code example shows the UTF-8 property of a variety of patterns.
2097     ///
2098     /// ```
2099     /// use regex_syntax::{ParserBuilder, parse};
2100     ///
2101     /// // Examples of 'is_utf8() == true'.
2102     /// assert!(parse(r"a")?.properties().is_utf8());
2103     /// assert!(parse(r"[^a]")?.properties().is_utf8());
2104     /// assert!(parse(r".")?.properties().is_utf8());
2105     /// assert!(parse(r"\W")?.properties().is_utf8());
2106     /// assert!(parse(r"\b")?.properties().is_utf8());
2107     /// assert!(parse(r"\B")?.properties().is_utf8());
2108     /// assert!(parse(r"(?-u)\b")?.properties().is_utf8());
2109     /// assert!(parse(r"(?-u)\B")?.properties().is_utf8());
2110     /// // Unicode mode is enabled by default, and in
2111     /// // that mode, all \x hex escapes are treated as
2112     /// // codepoints. So this actually matches the UTF-8
2113     /// // encoding of U+00FF.
2114     /// assert!(parse(r"\xFF")?.properties().is_utf8());
2115     ///
2116     /// // Now we show examples of 'is_utf8() == false'.
2117     /// // The only way to do this is to force the parser
2118     /// // to permit invalid UTF-8, otherwise all of these
2119     /// // would fail to parse!
2120     /// let parse = |pattern| {
2121     ///     ParserBuilder::new().utf8(false).build().parse(pattern)
2122     /// };
2123     /// assert!(!parse(r"(?-u)[^a]")?.properties().is_utf8());
2124     /// assert!(!parse(r"(?-u).")?.properties().is_utf8());
2125     /// assert!(!parse(r"(?-u)\W")?.properties().is_utf8());
2126     /// // Conversely to the equivalent example above,
2127     /// // when Unicode mode is disabled, \x hex escapes
2128     /// // are treated as their raw byte values.
2129     /// assert!(!parse(r"(?-u)\xFF")?.properties().is_utf8());
2130     /// // Note that just because we disabled UTF-8 in the
2131     /// // parser doesn't mean we still can't use Unicode.
2132     /// // It is enabled by default, so \xFF is still
2133     /// // equivalent to matching the UTF-8 encoding of
2134     /// // U+00FF by default.
2135     /// assert!(parse(r"\xFF")?.properties().is_utf8());
2136     /// // Even though we use raw bytes that individually
2137     /// // are not valid UTF-8, when combined together, the
2138     /// // overall expression *does* match valid UTF-8!
2139     /// assert!(parse(r"(?-u)\xE2\x98\x83")?.properties().is_utf8());
2140     ///
2141     /// # Ok::<(), Box<dyn std::error::Error>>(())
2142     /// ```
2143     #[inline]
is_utf8(&self) -> bool2144     pub fn is_utf8(&self) -> bool {
2145         self.0.utf8
2146     }
2147 
2148     /// Returns the total number of explicit capturing groups in the
2149     /// corresponding HIR.
2150     ///
2151     /// Note that this does not include the implicit capturing group
2152     /// corresponding to the entire match that is typically included by regex
2153     /// engines.
2154     ///
2155     /// # Example
2156     ///
2157     /// This method will return `0` for `a` and `1` for `(a)`:
2158     ///
2159     /// ```
2160     /// use regex_syntax::parse;
2161     ///
2162     /// assert_eq!(0, parse("a")?.properties().explicit_captures_len());
2163     /// assert_eq!(1, parse("(a)")?.properties().explicit_captures_len());
2164     ///
2165     /// # Ok::<(), Box<dyn std::error::Error>>(())
2166     /// ```
2167     #[inline]
explicit_captures_len(&self) -> usize2168     pub fn explicit_captures_len(&self) -> usize {
2169         self.0.explicit_captures_len
2170     }
2171 
2172     /// Returns the total number of explicit capturing groups that appear in
2173     /// every possible match.
2174     ///
2175     /// If the number of capture groups can vary depending on the match, then
2176     /// this returns `None`. That is, a value is only returned when the number
2177     /// of matching groups is invariant or "static."
2178     ///
2179     /// Note that this does not include the implicit capturing group
2180     /// corresponding to the entire match.
2181     ///
2182     /// # Example
2183     ///
2184     /// This shows a few cases where a static number of capture groups is
2185     /// available and a few cases where it is not.
2186     ///
2187     /// ```
2188     /// use regex_syntax::parse;
2189     ///
2190     /// let len = |pattern| {
2191     ///     parse(pattern).map(|h| {
2192     ///         h.properties().static_explicit_captures_len()
2193     ///     })
2194     /// };
2195     ///
2196     /// assert_eq!(Some(0), len("a")?);
2197     /// assert_eq!(Some(1), len("(a)")?);
2198     /// assert_eq!(Some(1), len("(a)|(b)")?);
2199     /// assert_eq!(Some(2), len("(a)(b)|(c)(d)")?);
2200     /// assert_eq!(None, len("(a)|b")?);
2201     /// assert_eq!(None, len("a|(b)")?);
2202     /// assert_eq!(None, len("(b)*")?);
2203     /// assert_eq!(Some(1), len("(b)+")?);
2204     ///
2205     /// # Ok::<(), Box<dyn std::error::Error>>(())
2206     /// ```
2207     #[inline]
static_explicit_captures_len(&self) -> Option<usize>2208     pub fn static_explicit_captures_len(&self) -> Option<usize> {
2209         self.0.static_explicit_captures_len
2210     }
2211 
2212     /// Return true if and only if this HIR is a simple literal. This is
2213     /// only true when this HIR expression is either itself a `Literal` or a
2214     /// concatenation of only `Literal`s.
2215     ///
2216     /// For example, `f` and `foo` are literals, but `f+`, `(foo)`, `foo()` and
2217     /// the empty string are not (even though they contain sub-expressions that
2218     /// are literals).
2219     #[inline]
is_literal(&self) -> bool2220     pub fn is_literal(&self) -> bool {
2221         self.0.literal
2222     }
2223 
2224     /// Return true if and only if this HIR is either a simple literal or an
2225     /// alternation of simple literals. This is only
2226     /// true when this HIR expression is either itself a `Literal` or a
2227     /// concatenation of only `Literal`s or an alternation of only `Literal`s.
2228     ///
2229     /// For example, `f`, `foo`, `a|b|c`, and `foo|bar|baz` are alternation
2230     /// literals, but `f+`, `(foo)`, `foo()`, and the empty pattern are not
2231     /// (even though that contain sub-expressions that are literals).
2232     #[inline]
is_alternation_literal(&self) -> bool2233     pub fn is_alternation_literal(&self) -> bool {
2234         self.0.alternation_literal
2235     }
2236 
2237     /// Returns the total amount of heap memory usage, in bytes, used by this
2238     /// `Properties` value.
2239     #[inline]
memory_usage(&self) -> usize2240     pub fn memory_usage(&self) -> usize {
2241         core::mem::size_of::<PropertiesI>()
2242     }
2243 
2244     /// Returns a new set of properties that corresponds to the union of the
2245     /// iterator of properties given.
2246     ///
2247     /// This is useful when one has multiple `Hir` expressions and wants
2248     /// to combine them into a single alternation without constructing the
2249     /// corresponding `Hir`. This routine provides a way of combining the
2250     /// properties of each `Hir` expression into one set of properties
2251     /// representing the union of those expressions.
2252     ///
2253     /// # Example: union with HIRs that never match
2254     ///
2255     /// This example shows that unioning properties together with one that
2256     /// represents a regex that never matches will "poison" certain attributes,
2257     /// like the minimum and maximum lengths.
2258     ///
2259     /// ```
2260     /// use regex_syntax::{hir::Properties, parse};
2261     ///
2262     /// let hir1 = parse("ab?c?")?;
2263     /// assert_eq!(Some(1), hir1.properties().minimum_len());
2264     /// assert_eq!(Some(3), hir1.properties().maximum_len());
2265     ///
2266     /// let hir2 = parse(r"[a&&b]")?;
2267     /// assert_eq!(None, hir2.properties().minimum_len());
2268     /// assert_eq!(None, hir2.properties().maximum_len());
2269     ///
2270     /// let hir3 = parse(r"wxy?z?")?;
2271     /// assert_eq!(Some(2), hir3.properties().minimum_len());
2272     /// assert_eq!(Some(4), hir3.properties().maximum_len());
2273     ///
2274     /// let unioned = Properties::union([
2275     ///		hir1.properties(),
2276     ///		hir2.properties(),
2277     ///		hir3.properties(),
2278     ///	]);
2279     /// assert_eq!(None, unioned.minimum_len());
2280     /// assert_eq!(None, unioned.maximum_len());
2281     ///
2282     /// # Ok::<(), Box<dyn std::error::Error>>(())
2283     /// ```
2284     ///
2285     /// The maximum length can also be "poisoned" by a pattern that has no
2286     /// upper bound on the length of a match. The minimum length remains
2287     /// unaffected:
2288     ///
2289     /// ```
2290     /// use regex_syntax::{hir::Properties, parse};
2291     ///
2292     /// let hir1 = parse("ab?c?")?;
2293     /// assert_eq!(Some(1), hir1.properties().minimum_len());
2294     /// assert_eq!(Some(3), hir1.properties().maximum_len());
2295     ///
2296     /// let hir2 = parse(r"a+")?;
2297     /// assert_eq!(Some(1), hir2.properties().minimum_len());
2298     /// assert_eq!(None, hir2.properties().maximum_len());
2299     ///
2300     /// let hir3 = parse(r"wxy?z?")?;
2301     /// assert_eq!(Some(2), hir3.properties().minimum_len());
2302     /// assert_eq!(Some(4), hir3.properties().maximum_len());
2303     ///
2304     /// let unioned = Properties::union([
2305     ///		hir1.properties(),
2306     ///		hir2.properties(),
2307     ///		hir3.properties(),
2308     ///	]);
2309     /// assert_eq!(Some(1), unioned.minimum_len());
2310     /// assert_eq!(None, unioned.maximum_len());
2311     ///
2312     /// # Ok::<(), Box<dyn std::error::Error>>(())
2313     /// ```
union<I, P>(props: I) -> Properties where I: IntoIterator<Item = P>, P: core::borrow::Borrow<Properties>,2314     pub fn union<I, P>(props: I) -> Properties
2315     where
2316         I: IntoIterator<Item = P>,
2317         P: core::borrow::Borrow<Properties>,
2318     {
2319         let mut it = props.into_iter().peekable();
2320         // While empty alternations aren't possible, we still behave as if they
2321         // are. When we have an empty alternate, then clearly the look-around
2322         // prefix and suffix is empty. Otherwise, it is the intersection of all
2323         // prefixes and suffixes (respectively) of the branches.
2324         let fix = if it.peek().is_none() {
2325             LookSet::empty()
2326         } else {
2327             LookSet::full()
2328         };
2329         // And also, an empty alternate means we have 0 static capture groups,
2330         // but we otherwise start with the number corresponding to the first
2331         // alternate. If any subsequent alternate has a different number of
2332         // static capture groups, then we overall have a variation and not a
2333         // static number of groups.
2334         let static_explicit_captures_len =
2335             it.peek().and_then(|p| p.borrow().static_explicit_captures_len());
2336         // The base case is an empty alternation, which matches nothing.
2337         // Note though that empty alternations aren't possible, because the
2338         // Hir::alternation smart constructor rewrites those as empty character
2339         // classes.
2340         let mut props = PropertiesI {
2341             minimum_len: None,
2342             maximum_len: None,
2343             look_set: LookSet::empty(),
2344             look_set_prefix: fix,
2345             look_set_suffix: fix,
2346             look_set_prefix_any: LookSet::empty(),
2347             look_set_suffix_any: LookSet::empty(),
2348             utf8: true,
2349             explicit_captures_len: 0,
2350             static_explicit_captures_len,
2351             literal: false,
2352             alternation_literal: true,
2353         };
2354         let (mut min_poisoned, mut max_poisoned) = (false, false);
2355         // Handle properties that need to visit every child hir.
2356         for prop in it {
2357             let p = prop.borrow();
2358             props.look_set.set_union(p.look_set());
2359             props.look_set_prefix.set_intersect(p.look_set_prefix());
2360             props.look_set_suffix.set_intersect(p.look_set_suffix());
2361             props.look_set_prefix_any.set_union(p.look_set_prefix_any());
2362             props.look_set_suffix_any.set_union(p.look_set_suffix_any());
2363             props.utf8 = props.utf8 && p.is_utf8();
2364             props.explicit_captures_len = props
2365                 .explicit_captures_len
2366                 .saturating_add(p.explicit_captures_len());
2367             if props.static_explicit_captures_len
2368                 != p.static_explicit_captures_len()
2369             {
2370                 props.static_explicit_captures_len = None;
2371             }
2372             props.alternation_literal =
2373                 props.alternation_literal && p.is_literal();
2374             if !min_poisoned {
2375                 if let Some(xmin) = p.minimum_len() {
2376                     if props.minimum_len.map_or(true, |pmin| xmin < pmin) {
2377                         props.minimum_len = Some(xmin);
2378                     }
2379                 } else {
2380                     props.minimum_len = None;
2381                     min_poisoned = true;
2382                 }
2383             }
2384             if !max_poisoned {
2385                 if let Some(xmax) = p.maximum_len() {
2386                     if props.maximum_len.map_or(true, |pmax| xmax > pmax) {
2387                         props.maximum_len = Some(xmax);
2388                     }
2389                 } else {
2390                     props.maximum_len = None;
2391                     max_poisoned = true;
2392                 }
2393             }
2394         }
2395         Properties(Box::new(props))
2396     }
2397 }
2398 
2399 impl Properties {
2400     /// Create a new set of HIR properties for an empty regex.
empty() -> Properties2401     fn empty() -> Properties {
2402         let inner = PropertiesI {
2403             minimum_len: Some(0),
2404             maximum_len: Some(0),
2405             look_set: LookSet::empty(),
2406             look_set_prefix: LookSet::empty(),
2407             look_set_suffix: LookSet::empty(),
2408             look_set_prefix_any: LookSet::empty(),
2409             look_set_suffix_any: LookSet::empty(),
2410             // It is debatable whether an empty regex always matches at valid
2411             // UTF-8 boundaries. Strictly speaking, at a byte oriented view,
2412             // it is clearly false. There are, for example, many empty strings
2413             // between the bytes encoding a '☃'.
2414             //
2415             // However, when Unicode mode is enabled, the fundamental atom
2416             // of matching is really a codepoint. And in that scenario, an
2417             // empty regex is defined to only match at valid UTF-8 boundaries
2418             // and to never split a codepoint. It just so happens that this
2419             // enforcement is somewhat tricky to do for regexes that match
2420             // the empty string inside regex engines themselves. It usually
2421             // requires some layer above the regex engine to filter out such
2422             // matches.
2423             //
2424             // In any case, 'true' is really the only coherent option. If it
2425             // were false, for example, then 'a*' would also need to be false
2426             // since it too can match the empty string.
2427             utf8: true,
2428             explicit_captures_len: 0,
2429             static_explicit_captures_len: Some(0),
2430             literal: false,
2431             alternation_literal: false,
2432         };
2433         Properties(Box::new(inner))
2434     }
2435 
2436     /// Create a new set of HIR properties for a literal regex.
literal(lit: &Literal) -> Properties2437     fn literal(lit: &Literal) -> Properties {
2438         let inner = PropertiesI {
2439             minimum_len: Some(lit.0.len()),
2440             maximum_len: Some(lit.0.len()),
2441             look_set: LookSet::empty(),
2442             look_set_prefix: LookSet::empty(),
2443             look_set_suffix: LookSet::empty(),
2444             look_set_prefix_any: LookSet::empty(),
2445             look_set_suffix_any: LookSet::empty(),
2446             utf8: core::str::from_utf8(&lit.0).is_ok(),
2447             explicit_captures_len: 0,
2448             static_explicit_captures_len: Some(0),
2449             literal: true,
2450             alternation_literal: true,
2451         };
2452         Properties(Box::new(inner))
2453     }
2454 
2455     /// Create a new set of HIR properties for a character class.
class(class: &Class) -> Properties2456     fn class(class: &Class) -> Properties {
2457         let inner = PropertiesI {
2458             minimum_len: class.minimum_len(),
2459             maximum_len: class.maximum_len(),
2460             look_set: LookSet::empty(),
2461             look_set_prefix: LookSet::empty(),
2462             look_set_suffix: LookSet::empty(),
2463             look_set_prefix_any: LookSet::empty(),
2464             look_set_suffix_any: LookSet::empty(),
2465             utf8: class.is_utf8(),
2466             explicit_captures_len: 0,
2467             static_explicit_captures_len: Some(0),
2468             literal: false,
2469             alternation_literal: false,
2470         };
2471         Properties(Box::new(inner))
2472     }
2473 
2474     /// Create a new set of HIR properties for a look-around assertion.
look(look: Look) -> Properties2475     fn look(look: Look) -> Properties {
2476         let inner = PropertiesI {
2477             minimum_len: Some(0),
2478             maximum_len: Some(0),
2479             look_set: LookSet::singleton(look),
2480             look_set_prefix: LookSet::singleton(look),
2481             look_set_suffix: LookSet::singleton(look),
2482             look_set_prefix_any: LookSet::singleton(look),
2483             look_set_suffix_any: LookSet::singleton(look),
2484             // This requires a little explanation. Basically, we don't consider
2485             // matching an empty string to be equivalent to matching invalid
2486             // UTF-8, even though technically matching every empty string will
2487             // split the UTF-8 encoding of a single codepoint when treating a
2488             // UTF-8 encoded string as a sequence of bytes. Our defense here is
2489             // that in such a case, a codepoint should logically be treated as
2490             // the fundamental atom for matching, and thus the only valid match
2491             // points are between codepoints and not bytes.
2492             //
2493             // More practically, this is true here because it's also true
2494             // for 'Hir::empty()', otherwise something like 'a*' would be
2495             // considered to match invalid UTF-8. That in turn makes this
2496             // property borderline useless.
2497             utf8: true,
2498             explicit_captures_len: 0,
2499             static_explicit_captures_len: Some(0),
2500             literal: false,
2501             alternation_literal: false,
2502         };
2503         Properties(Box::new(inner))
2504     }
2505 
2506     /// Create a new set of HIR properties for a repetition.
repetition(rep: &Repetition) -> Properties2507     fn repetition(rep: &Repetition) -> Properties {
2508         let p = rep.sub.properties();
2509         let minimum_len = p.minimum_len().map(|child_min| {
2510             let rep_min = usize::try_from(rep.min).unwrap_or(usize::MAX);
2511             child_min.saturating_mul(rep_min)
2512         });
2513         let maximum_len = rep.max.and_then(|rep_max| {
2514             let rep_max = usize::try_from(rep_max).ok()?;
2515             let child_max = p.maximum_len()?;
2516             child_max.checked_mul(rep_max)
2517         });
2518 
2519         let mut inner = PropertiesI {
2520             minimum_len,
2521             maximum_len,
2522             look_set: p.look_set(),
2523             look_set_prefix: LookSet::empty(),
2524             look_set_suffix: LookSet::empty(),
2525             look_set_prefix_any: p.look_set_prefix_any(),
2526             look_set_suffix_any: p.look_set_suffix_any(),
2527             utf8: p.is_utf8(),
2528             explicit_captures_len: p.explicit_captures_len(),
2529             static_explicit_captures_len: p.static_explicit_captures_len(),
2530             literal: false,
2531             alternation_literal: false,
2532         };
2533         // If the repetition operator can match the empty string, then its
2534         // lookset prefix and suffixes themselves remain empty since they are
2535         // no longer required to match.
2536         if rep.min > 0 {
2537             inner.look_set_prefix = p.look_set_prefix();
2538             inner.look_set_suffix = p.look_set_suffix();
2539         }
2540         // If the static captures len of the sub-expression is not known or
2541         // is greater than zero, then it automatically propagates to the
2542         // repetition, regardless of the repetition. Otherwise, it might
2543         // change, but only when the repetition can match 0 times.
2544         if rep.min == 0
2545             && inner.static_explicit_captures_len.map_or(false, |len| len > 0)
2546         {
2547             // If we require a match 0 times, then our captures len is
2548             // guaranteed to be zero. Otherwise, if we *can* match the empty
2549             // string, then it's impossible to know how many captures will be
2550             // in the resulting match.
2551             if rep.max == Some(0) {
2552                 inner.static_explicit_captures_len = Some(0);
2553             } else {
2554                 inner.static_explicit_captures_len = None;
2555             }
2556         }
2557         Properties(Box::new(inner))
2558     }
2559 
2560     /// Create a new set of HIR properties for a capture.
capture(capture: &Capture) -> Properties2561     fn capture(capture: &Capture) -> Properties {
2562         let p = capture.sub.properties();
2563         Properties(Box::new(PropertiesI {
2564             explicit_captures_len: p.explicit_captures_len().saturating_add(1),
2565             static_explicit_captures_len: p
2566                 .static_explicit_captures_len()
2567                 .map(|len| len.saturating_add(1)),
2568             literal: false,
2569             alternation_literal: false,
2570             ..*p.0.clone()
2571         }))
2572     }
2573 
2574     /// Create a new set of HIR properties for a concatenation.
concat(concat: &[Hir]) -> Properties2575     fn concat(concat: &[Hir]) -> Properties {
2576         // The base case is an empty concatenation, which matches the empty
2577         // string. Note though that empty concatenations aren't possible,
2578         // because the Hir::concat smart constructor rewrites those as
2579         // Hir::empty.
2580         let mut props = PropertiesI {
2581             minimum_len: Some(0),
2582             maximum_len: Some(0),
2583             look_set: LookSet::empty(),
2584             look_set_prefix: LookSet::empty(),
2585             look_set_suffix: LookSet::empty(),
2586             look_set_prefix_any: LookSet::empty(),
2587             look_set_suffix_any: LookSet::empty(),
2588             utf8: true,
2589             explicit_captures_len: 0,
2590             static_explicit_captures_len: Some(0),
2591             literal: true,
2592             alternation_literal: true,
2593         };
2594         // Handle properties that need to visit every child hir.
2595         for x in concat.iter() {
2596             let p = x.properties();
2597             props.look_set.set_union(p.look_set());
2598             props.utf8 = props.utf8 && p.is_utf8();
2599             props.explicit_captures_len = props
2600                 .explicit_captures_len
2601                 .saturating_add(p.explicit_captures_len());
2602             props.static_explicit_captures_len = p
2603                 .static_explicit_captures_len()
2604                 .and_then(|len1| {
2605                     Some((len1, props.static_explicit_captures_len?))
2606                 })
2607                 .and_then(|(len1, len2)| Some(len1.saturating_add(len2)));
2608             props.literal = props.literal && p.is_literal();
2609             props.alternation_literal =
2610                 props.alternation_literal && p.is_alternation_literal();
2611             if let Some(minimum_len) = props.minimum_len {
2612                 match p.minimum_len() {
2613                     None => props.minimum_len = None,
2614                     Some(len) => {
2615                         // We use saturating arithmetic here because the
2616                         // minimum is just a lower bound. We can't go any
2617                         // higher than what our number types permit.
2618                         props.minimum_len =
2619                             Some(minimum_len.saturating_add(len));
2620                     }
2621                 }
2622             }
2623             if let Some(maximum_len) = props.maximum_len {
2624                 match p.maximum_len() {
2625                     None => props.maximum_len = None,
2626                     Some(len) => {
2627                         props.maximum_len = maximum_len.checked_add(len)
2628                     }
2629                 }
2630             }
2631         }
2632         // Handle the prefix properties, which only requires visiting
2633         // child exprs until one matches more than the empty string.
2634         let mut it = concat.iter();
2635         while let Some(x) = it.next() {
2636             props.look_set_prefix.set_union(x.properties().look_set_prefix());
2637             props
2638                 .look_set_prefix_any
2639                 .set_union(x.properties().look_set_prefix_any());
2640             if x.properties().maximum_len().map_or(true, |x| x > 0) {
2641                 break;
2642             }
2643         }
2644         // Same thing for the suffix properties, but in reverse.
2645         let mut it = concat.iter().rev();
2646         while let Some(x) = it.next() {
2647             props.look_set_suffix.set_union(x.properties().look_set_suffix());
2648             props
2649                 .look_set_suffix_any
2650                 .set_union(x.properties().look_set_suffix_any());
2651             if x.properties().maximum_len().map_or(true, |x| x > 0) {
2652                 break;
2653             }
2654         }
2655         Properties(Box::new(props))
2656     }
2657 
2658     /// Create a new set of HIR properties for a concatenation.
alternation(alts: &[Hir]) -> Properties2659     fn alternation(alts: &[Hir]) -> Properties {
2660         Properties::union(alts.iter().map(|hir| hir.properties()))
2661     }
2662 }
2663 
2664 /// A set of look-around assertions.
2665 ///
2666 /// This is useful for efficiently tracking look-around assertions. For
2667 /// example, an [`Hir`] provides properties that return `LookSet`s.
2668 #[derive(Clone, Copy, Default, Eq, PartialEq)]
2669 pub struct LookSet {
2670     /// The underlying representation this set is exposed to make it possible
2671     /// to store it somewhere efficiently. The representation is that
2672     /// of a bitset, where each assertion occupies bit `i` where `i =
2673     /// Look::as_repr()`.
2674     ///
2675     /// Note that users of this internal representation must permit the full
2676     /// range of `u16` values to be represented. For example, even if the
2677     /// current implementation only makes use of the 10 least significant bits,
2678     /// it may use more bits in a future semver compatible release.
2679     pub bits: u32,
2680 }
2681 
2682 impl LookSet {
2683     /// Create an empty set of look-around assertions.
2684     #[inline]
empty() -> LookSet2685     pub fn empty() -> LookSet {
2686         LookSet { bits: 0 }
2687     }
2688 
2689     /// Create a full set of look-around assertions.
2690     ///
2691     /// This set contains all possible look-around assertions.
2692     #[inline]
full() -> LookSet2693     pub fn full() -> LookSet {
2694         LookSet { bits: !0 }
2695     }
2696 
2697     /// Create a look-around set containing the look-around assertion given.
2698     ///
2699     /// This is a convenience routine for creating an empty set and inserting
2700     /// one look-around assertions.
2701     #[inline]
singleton(look: Look) -> LookSet2702     pub fn singleton(look: Look) -> LookSet {
2703         LookSet::empty().insert(look)
2704     }
2705 
2706     /// Returns the total number of look-around assertions in this set.
2707     #[inline]
len(self) -> usize2708     pub fn len(self) -> usize {
2709         // OK because max value always fits in a u8, which in turn always
2710         // fits in a usize, regardless of target.
2711         usize::try_from(self.bits.count_ones()).unwrap()
2712     }
2713 
2714     /// Returns true if and only if this set is empty.
2715     #[inline]
is_empty(self) -> bool2716     pub fn is_empty(self) -> bool {
2717         self.len() == 0
2718     }
2719 
2720     /// Returns true if and only if the given look-around assertion is in this
2721     /// set.
2722     #[inline]
contains(self, look: Look) -> bool2723     pub fn contains(self, look: Look) -> bool {
2724         self.bits & look.as_repr() != 0
2725     }
2726 
2727     /// Returns true if and only if this set contains any anchor assertions.
2728     /// This includes both "start/end of haystack" and "start/end of line."
2729     #[inline]
contains_anchor(&self) -> bool2730     pub fn contains_anchor(&self) -> bool {
2731         self.contains_anchor_haystack() || self.contains_anchor_line()
2732     }
2733 
2734     /// Returns true if and only if this set contains any "start/end of
2735     /// haystack" anchors. This doesn't include "start/end of line" anchors.
2736     #[inline]
contains_anchor_haystack(&self) -> bool2737     pub fn contains_anchor_haystack(&self) -> bool {
2738         self.contains(Look::Start) || self.contains(Look::End)
2739     }
2740 
2741     /// Returns true if and only if this set contains any "start/end of line"
2742     /// anchors. This doesn't include "start/end of haystack" anchors. This
2743     /// includes both `\n` line anchors and CRLF (`\r\n`) aware line anchors.
2744     #[inline]
contains_anchor_line(&self) -> bool2745     pub fn contains_anchor_line(&self) -> bool {
2746         self.contains(Look::StartLF)
2747             || self.contains(Look::EndLF)
2748             || self.contains(Look::StartCRLF)
2749             || self.contains(Look::EndCRLF)
2750     }
2751 
2752     /// Returns true if and only if this set contains any "start/end of line"
2753     /// anchors that only treat `\n` as line terminators. This does not include
2754     /// haystack anchors or CRLF aware line anchors.
2755     #[inline]
contains_anchor_lf(&self) -> bool2756     pub fn contains_anchor_lf(&self) -> bool {
2757         self.contains(Look::StartLF) || self.contains(Look::EndLF)
2758     }
2759 
2760     /// Returns true if and only if this set contains any "start/end of line"
2761     /// anchors that are CRLF-aware. This doesn't include "start/end of
2762     /// haystack" or "start/end of line-feed" anchors.
2763     #[inline]
contains_anchor_crlf(&self) -> bool2764     pub fn contains_anchor_crlf(&self) -> bool {
2765         self.contains(Look::StartCRLF) || self.contains(Look::EndCRLF)
2766     }
2767 
2768     /// Returns true if and only if this set contains any word boundary or
2769     /// negated word boundary assertions. This include both Unicode and ASCII
2770     /// word boundaries.
2771     #[inline]
contains_word(self) -> bool2772     pub fn contains_word(self) -> bool {
2773         self.contains_word_unicode() || self.contains_word_ascii()
2774     }
2775 
2776     /// Returns true if and only if this set contains any Unicode word boundary
2777     /// or negated Unicode word boundary assertions.
2778     #[inline]
contains_word_unicode(self) -> bool2779     pub fn contains_word_unicode(self) -> bool {
2780         self.contains(Look::WordUnicode)
2781             || self.contains(Look::WordUnicodeNegate)
2782             || self.contains(Look::WordStartUnicode)
2783             || self.contains(Look::WordEndUnicode)
2784             || self.contains(Look::WordStartHalfUnicode)
2785             || self.contains(Look::WordEndHalfUnicode)
2786     }
2787 
2788     /// Returns true if and only if this set contains any ASCII word boundary
2789     /// or negated ASCII word boundary assertions.
2790     #[inline]
contains_word_ascii(self) -> bool2791     pub fn contains_word_ascii(self) -> bool {
2792         self.contains(Look::WordAscii)
2793             || self.contains(Look::WordAsciiNegate)
2794             || self.contains(Look::WordStartAscii)
2795             || self.contains(Look::WordEndAscii)
2796             || self.contains(Look::WordStartHalfAscii)
2797             || self.contains(Look::WordEndHalfAscii)
2798     }
2799 
2800     /// Returns an iterator over all of the look-around assertions in this set.
2801     #[inline]
iter(self) -> LookSetIter2802     pub fn iter(self) -> LookSetIter {
2803         LookSetIter { set: self }
2804     }
2805 
2806     /// Return a new set that is equivalent to the original, but with the given
2807     /// assertion added to it. If the assertion is already in the set, then the
2808     /// returned set is equivalent to the original.
2809     #[inline]
insert(self, look: Look) -> LookSet2810     pub fn insert(self, look: Look) -> LookSet {
2811         LookSet { bits: self.bits | look.as_repr() }
2812     }
2813 
2814     /// Updates this set in place with the result of inserting the given
2815     /// assertion into this set.
2816     #[inline]
set_insert(&mut self, look: Look)2817     pub fn set_insert(&mut self, look: Look) {
2818         *self = self.insert(look);
2819     }
2820 
2821     /// Return a new set that is equivalent to the original, but with the given
2822     /// assertion removed from it. If the assertion is not in the set, then the
2823     /// returned set is equivalent to the original.
2824     #[inline]
remove(self, look: Look) -> LookSet2825     pub fn remove(self, look: Look) -> LookSet {
2826         LookSet { bits: self.bits & !look.as_repr() }
2827     }
2828 
2829     /// Updates this set in place with the result of removing the given
2830     /// assertion from this set.
2831     #[inline]
set_remove(&mut self, look: Look)2832     pub fn set_remove(&mut self, look: Look) {
2833         *self = self.remove(look);
2834     }
2835 
2836     /// Returns a new set that is the result of subtracting the given set from
2837     /// this set.
2838     #[inline]
subtract(self, other: LookSet) -> LookSet2839     pub fn subtract(self, other: LookSet) -> LookSet {
2840         LookSet { bits: self.bits & !other.bits }
2841     }
2842 
2843     /// Updates this set in place with the result of subtracting the given set
2844     /// from this set.
2845     #[inline]
set_subtract(&mut self, other: LookSet)2846     pub fn set_subtract(&mut self, other: LookSet) {
2847         *self = self.subtract(other);
2848     }
2849 
2850     /// Returns a new set that is the union of this and the one given.
2851     #[inline]
union(self, other: LookSet) -> LookSet2852     pub fn union(self, other: LookSet) -> LookSet {
2853         LookSet { bits: self.bits | other.bits }
2854     }
2855 
2856     /// Updates this set in place with the result of unioning it with the one
2857     /// given.
2858     #[inline]
set_union(&mut self, other: LookSet)2859     pub fn set_union(&mut self, other: LookSet) {
2860         *self = self.union(other);
2861     }
2862 
2863     /// Returns a new set that is the intersection of this and the one given.
2864     #[inline]
intersect(self, other: LookSet) -> LookSet2865     pub fn intersect(self, other: LookSet) -> LookSet {
2866         LookSet { bits: self.bits & other.bits }
2867     }
2868 
2869     /// Updates this set in place with the result of intersecting it with the
2870     /// one given.
2871     #[inline]
set_intersect(&mut self, other: LookSet)2872     pub fn set_intersect(&mut self, other: LookSet) {
2873         *self = self.intersect(other);
2874     }
2875 
2876     /// Return a `LookSet` from the slice given as a native endian 32-bit
2877     /// integer.
2878     ///
2879     /// # Panics
2880     ///
2881     /// This panics if `slice.len() < 4`.
2882     #[inline]
read_repr(slice: &[u8]) -> LookSet2883     pub fn read_repr(slice: &[u8]) -> LookSet {
2884         let bits = u32::from_ne_bytes(slice[..4].try_into().unwrap());
2885         LookSet { bits }
2886     }
2887 
2888     /// Write a `LookSet` as a native endian 32-bit integer to the beginning
2889     /// of the slice given.
2890     ///
2891     /// # Panics
2892     ///
2893     /// This panics if `slice.len() < 4`.
2894     #[inline]
write_repr(self, slice: &mut [u8])2895     pub fn write_repr(self, slice: &mut [u8]) {
2896         let raw = self.bits.to_ne_bytes();
2897         slice[0] = raw[0];
2898         slice[1] = raw[1];
2899         slice[2] = raw[2];
2900         slice[3] = raw[3];
2901     }
2902 }
2903 
2904 impl core::fmt::Debug for LookSet {
fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result2905     fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
2906         if self.is_empty() {
2907             return write!(f, "∅");
2908         }
2909         for look in self.iter() {
2910             write!(f, "{}", look.as_char())?;
2911         }
2912         Ok(())
2913     }
2914 }
2915 
2916 /// An iterator over all look-around assertions in a [`LookSet`].
2917 ///
2918 /// This iterator is created by [`LookSet::iter`].
2919 #[derive(Clone, Debug)]
2920 pub struct LookSetIter {
2921     set: LookSet,
2922 }
2923 
2924 impl Iterator for LookSetIter {
2925     type Item = Look;
2926 
2927     #[inline]
next(&mut self) -> Option<Look>2928     fn next(&mut self) -> Option<Look> {
2929         if self.set.is_empty() {
2930             return None;
2931         }
2932         // We'll never have more than u8::MAX distinct look-around assertions,
2933         // so 'bit' will always fit into a u16.
2934         let bit = u16::try_from(self.set.bits.trailing_zeros()).unwrap();
2935         let look = Look::from_repr(1 << bit)?;
2936         self.set = self.set.remove(look);
2937         Some(look)
2938     }
2939 }
2940 
2941 /// Given a sequence of HIR values where each value corresponds to a Unicode
2942 /// class (or an all-ASCII byte class), return a single Unicode class
2943 /// corresponding to the union of the classes found.
class_chars(hirs: &[Hir]) -> Option<Class>2944 fn class_chars(hirs: &[Hir]) -> Option<Class> {
2945     let mut cls = ClassUnicode::new(vec![]);
2946     for hir in hirs.iter() {
2947         match *hir.kind() {
2948             HirKind::Class(Class::Unicode(ref cls2)) => {
2949                 cls.union(cls2);
2950             }
2951             HirKind::Class(Class::Bytes(ref cls2)) => {
2952                 cls.union(&cls2.to_unicode_class()?);
2953             }
2954             _ => return None,
2955         };
2956     }
2957     Some(Class::Unicode(cls))
2958 }
2959 
2960 /// Given a sequence of HIR values where each value corresponds to a byte class
2961 /// (or an all-ASCII Unicode class), return a single byte class corresponding
2962 /// to the union of the classes found.
class_bytes(hirs: &[Hir]) -> Option<Class>2963 fn class_bytes(hirs: &[Hir]) -> Option<Class> {
2964     let mut cls = ClassBytes::new(vec![]);
2965     for hir in hirs.iter() {
2966         match *hir.kind() {
2967             HirKind::Class(Class::Unicode(ref cls2)) => {
2968                 cls.union(&cls2.to_byte_class()?);
2969             }
2970             HirKind::Class(Class::Bytes(ref cls2)) => {
2971                 cls.union(cls2);
2972             }
2973             _ => return None,
2974         };
2975     }
2976     Some(Class::Bytes(cls))
2977 }
2978 
2979 /// Given a sequence of HIR values where each value corresponds to a literal
2980 /// that is a single `char`, return that sequence of `char`s. Otherwise return
2981 /// None. No deduplication is done.
singleton_chars(hirs: &[Hir]) -> Option<Vec<char>>2982 fn singleton_chars(hirs: &[Hir]) -> Option<Vec<char>> {
2983     let mut singletons = vec![];
2984     for hir in hirs.iter() {
2985         let literal = match *hir.kind() {
2986             HirKind::Literal(Literal(ref bytes)) => bytes,
2987             _ => return None,
2988         };
2989         let ch = match crate::debug::utf8_decode(literal) {
2990             None => return None,
2991             Some(Err(_)) => return None,
2992             Some(Ok(ch)) => ch,
2993         };
2994         if literal.len() != ch.len_utf8() {
2995             return None;
2996         }
2997         singletons.push(ch);
2998     }
2999     Some(singletons)
3000 }
3001 
3002 /// Given a sequence of HIR values where each value corresponds to a literal
3003 /// that is a single byte, return that sequence of bytes. Otherwise return
3004 /// None. No deduplication is done.
singleton_bytes(hirs: &[Hir]) -> Option<Vec<u8>>3005 fn singleton_bytes(hirs: &[Hir]) -> Option<Vec<u8>> {
3006     let mut singletons = vec![];
3007     for hir in hirs.iter() {
3008         let literal = match *hir.kind() {
3009             HirKind::Literal(Literal(ref bytes)) => bytes,
3010             _ => return None,
3011         };
3012         if literal.len() != 1 {
3013             return None;
3014         }
3015         singletons.push(literal[0]);
3016     }
3017     Some(singletons)
3018 }
3019 
3020 /// Looks for a common prefix in the list of alternation branches given. If one
3021 /// is found, then an equivalent but (hopefully) simplified Hir is returned.
3022 /// Otherwise, the original given list of branches is returned unmodified.
3023 ///
3024 /// This is not quite as good as it could be. Right now, it requires that
3025 /// all branches are 'Concat' expressions. It also doesn't do well with
3026 /// literals. For example, given 'foofoo|foobar', it will not refactor it to
3027 /// 'foo(?:foo|bar)' because literals are flattened into their own special
3028 /// concatenation. (One wonders if perhaps 'Literal' should be a single atom
3029 /// instead of a string of bytes because of this. Otherwise, handling the
3030 /// current representation in this routine will be pretty gnarly. Sigh.)
lift_common_prefix(hirs: Vec<Hir>) -> Result<Hir, Vec<Hir>>3031 fn lift_common_prefix(hirs: Vec<Hir>) -> Result<Hir, Vec<Hir>> {
3032     if hirs.len() <= 1 {
3033         return Err(hirs);
3034     }
3035     let mut prefix = match hirs[0].kind() {
3036         HirKind::Concat(ref xs) => &**xs,
3037         _ => return Err(hirs),
3038     };
3039     if prefix.is_empty() {
3040         return Err(hirs);
3041     }
3042     for h in hirs.iter().skip(1) {
3043         let concat = match h.kind() {
3044             HirKind::Concat(ref xs) => xs,
3045             _ => return Err(hirs),
3046         };
3047         let common_len = prefix
3048             .iter()
3049             .zip(concat.iter())
3050             .take_while(|(x, y)| x == y)
3051             .count();
3052         prefix = &prefix[..common_len];
3053         if prefix.is_empty() {
3054             return Err(hirs);
3055         }
3056     }
3057     let len = prefix.len();
3058     assert_ne!(0, len);
3059     let mut prefix_concat = vec![];
3060     let mut suffix_alts = vec![];
3061     for h in hirs {
3062         let mut concat = match h.into_kind() {
3063             HirKind::Concat(xs) => xs,
3064             // We required all sub-expressions to be
3065             // concats above, so we're only here if we
3066             // have a concat.
3067             _ => unreachable!(),
3068         };
3069         suffix_alts.push(Hir::concat(concat.split_off(len)));
3070         if prefix_concat.is_empty() {
3071             prefix_concat = concat;
3072         }
3073     }
3074     let mut concat = prefix_concat;
3075     concat.push(Hir::alternation(suffix_alts));
3076     Ok(Hir::concat(concat))
3077 }
3078 
3079 #[cfg(test)]
3080 mod tests {
3081     use super::*;
3082 
uclass(ranges: &[(char, char)]) -> ClassUnicode3083     fn uclass(ranges: &[(char, char)]) -> ClassUnicode {
3084         let ranges: Vec<ClassUnicodeRange> = ranges
3085             .iter()
3086             .map(|&(s, e)| ClassUnicodeRange::new(s, e))
3087             .collect();
3088         ClassUnicode::new(ranges)
3089     }
3090 
bclass(ranges: &[(u8, u8)]) -> ClassBytes3091     fn bclass(ranges: &[(u8, u8)]) -> ClassBytes {
3092         let ranges: Vec<ClassBytesRange> =
3093             ranges.iter().map(|&(s, e)| ClassBytesRange::new(s, e)).collect();
3094         ClassBytes::new(ranges)
3095     }
3096 
uranges(cls: &ClassUnicode) -> Vec<(char, char)>3097     fn uranges(cls: &ClassUnicode) -> Vec<(char, char)> {
3098         cls.iter().map(|x| (x.start(), x.end())).collect()
3099     }
3100 
3101     #[cfg(feature = "unicode-case")]
ucasefold(cls: &ClassUnicode) -> ClassUnicode3102     fn ucasefold(cls: &ClassUnicode) -> ClassUnicode {
3103         let mut cls_ = cls.clone();
3104         cls_.case_fold_simple();
3105         cls_
3106     }
3107 
uunion(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode3108     fn uunion(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode {
3109         let mut cls_ = cls1.clone();
3110         cls_.union(cls2);
3111         cls_
3112     }
3113 
uintersect(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode3114     fn uintersect(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode {
3115         let mut cls_ = cls1.clone();
3116         cls_.intersect(cls2);
3117         cls_
3118     }
3119 
udifference(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode3120     fn udifference(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode {
3121         let mut cls_ = cls1.clone();
3122         cls_.difference(cls2);
3123         cls_
3124     }
3125 
usymdifference( cls1: &ClassUnicode, cls2: &ClassUnicode, ) -> ClassUnicode3126     fn usymdifference(
3127         cls1: &ClassUnicode,
3128         cls2: &ClassUnicode,
3129     ) -> ClassUnicode {
3130         let mut cls_ = cls1.clone();
3131         cls_.symmetric_difference(cls2);
3132         cls_
3133     }
3134 
unegate(cls: &ClassUnicode) -> ClassUnicode3135     fn unegate(cls: &ClassUnicode) -> ClassUnicode {
3136         let mut cls_ = cls.clone();
3137         cls_.negate();
3138         cls_
3139     }
3140 
branges(cls: &ClassBytes) -> Vec<(u8, u8)>3141     fn branges(cls: &ClassBytes) -> Vec<(u8, u8)> {
3142         cls.iter().map(|x| (x.start(), x.end())).collect()
3143     }
3144 
bcasefold(cls: &ClassBytes) -> ClassBytes3145     fn bcasefold(cls: &ClassBytes) -> ClassBytes {
3146         let mut cls_ = cls.clone();
3147         cls_.case_fold_simple();
3148         cls_
3149     }
3150 
bunion(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes3151     fn bunion(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes {
3152         let mut cls_ = cls1.clone();
3153         cls_.union(cls2);
3154         cls_
3155     }
3156 
bintersect(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes3157     fn bintersect(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes {
3158         let mut cls_ = cls1.clone();
3159         cls_.intersect(cls2);
3160         cls_
3161     }
3162 
bdifference(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes3163     fn bdifference(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes {
3164         let mut cls_ = cls1.clone();
3165         cls_.difference(cls2);
3166         cls_
3167     }
3168 
bsymdifference(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes3169     fn bsymdifference(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes {
3170         let mut cls_ = cls1.clone();
3171         cls_.symmetric_difference(cls2);
3172         cls_
3173     }
3174 
bnegate(cls: &ClassBytes) -> ClassBytes3175     fn bnegate(cls: &ClassBytes) -> ClassBytes {
3176         let mut cls_ = cls.clone();
3177         cls_.negate();
3178         cls_
3179     }
3180 
3181     #[test]
class_range_canonical_unicode()3182     fn class_range_canonical_unicode() {
3183         let range = ClassUnicodeRange::new('\u{00FF}', '\0');
3184         assert_eq!('\0', range.start());
3185         assert_eq!('\u{00FF}', range.end());
3186     }
3187 
3188     #[test]
class_range_canonical_bytes()3189     fn class_range_canonical_bytes() {
3190         let range = ClassBytesRange::new(b'\xFF', b'\0');
3191         assert_eq!(b'\0', range.start());
3192         assert_eq!(b'\xFF', range.end());
3193     }
3194 
3195     #[test]
class_canonicalize_unicode()3196     fn class_canonicalize_unicode() {
3197         let cls = uclass(&[('a', 'c'), ('x', 'z')]);
3198         let expected = vec![('a', 'c'), ('x', 'z')];
3199         assert_eq!(expected, uranges(&cls));
3200 
3201         let cls = uclass(&[('x', 'z'), ('a', 'c')]);
3202         let expected = vec![('a', 'c'), ('x', 'z')];
3203         assert_eq!(expected, uranges(&cls));
3204 
3205         let cls = uclass(&[('x', 'z'), ('w', 'y')]);
3206         let expected = vec![('w', 'z')];
3207         assert_eq!(expected, uranges(&cls));
3208 
3209         let cls = uclass(&[
3210             ('c', 'f'),
3211             ('a', 'g'),
3212             ('d', 'j'),
3213             ('a', 'c'),
3214             ('m', 'p'),
3215             ('l', 's'),
3216         ]);
3217         let expected = vec![('a', 'j'), ('l', 's')];
3218         assert_eq!(expected, uranges(&cls));
3219 
3220         let cls = uclass(&[('x', 'z'), ('u', 'w')]);
3221         let expected = vec![('u', 'z')];
3222         assert_eq!(expected, uranges(&cls));
3223 
3224         let cls = uclass(&[('\x00', '\u{10FFFF}'), ('\x00', '\u{10FFFF}')]);
3225         let expected = vec![('\x00', '\u{10FFFF}')];
3226         assert_eq!(expected, uranges(&cls));
3227 
3228         let cls = uclass(&[('a', 'a'), ('b', 'b')]);
3229         let expected = vec![('a', 'b')];
3230         assert_eq!(expected, uranges(&cls));
3231     }
3232 
3233     #[test]
class_canonicalize_bytes()3234     fn class_canonicalize_bytes() {
3235         let cls = bclass(&[(b'a', b'c'), (b'x', b'z')]);
3236         let expected = vec![(b'a', b'c'), (b'x', b'z')];
3237         assert_eq!(expected, branges(&cls));
3238 
3239         let cls = bclass(&[(b'x', b'z'), (b'a', b'c')]);
3240         let expected = vec![(b'a', b'c'), (b'x', b'z')];
3241         assert_eq!(expected, branges(&cls));
3242 
3243         let cls = bclass(&[(b'x', b'z'), (b'w', b'y')]);
3244         let expected = vec![(b'w', b'z')];
3245         assert_eq!(expected, branges(&cls));
3246 
3247         let cls = bclass(&[
3248             (b'c', b'f'),
3249             (b'a', b'g'),
3250             (b'd', b'j'),
3251             (b'a', b'c'),
3252             (b'm', b'p'),
3253             (b'l', b's'),
3254         ]);
3255         let expected = vec![(b'a', b'j'), (b'l', b's')];
3256         assert_eq!(expected, branges(&cls));
3257 
3258         let cls = bclass(&[(b'x', b'z'), (b'u', b'w')]);
3259         let expected = vec![(b'u', b'z')];
3260         assert_eq!(expected, branges(&cls));
3261 
3262         let cls = bclass(&[(b'\x00', b'\xFF'), (b'\x00', b'\xFF')]);
3263         let expected = vec![(b'\x00', b'\xFF')];
3264         assert_eq!(expected, branges(&cls));
3265 
3266         let cls = bclass(&[(b'a', b'a'), (b'b', b'b')]);
3267         let expected = vec![(b'a', b'b')];
3268         assert_eq!(expected, branges(&cls));
3269     }
3270 
3271     #[test]
3272     #[cfg(feature = "unicode-case")]
class_case_fold_unicode()3273     fn class_case_fold_unicode() {
3274         let cls = uclass(&[
3275             ('C', 'F'),
3276             ('A', 'G'),
3277             ('D', 'J'),
3278             ('A', 'C'),
3279             ('M', 'P'),
3280             ('L', 'S'),
3281             ('c', 'f'),
3282         ]);
3283         let expected = uclass(&[
3284             ('A', 'J'),
3285             ('L', 'S'),
3286             ('a', 'j'),
3287             ('l', 's'),
3288             ('\u{17F}', '\u{17F}'),
3289         ]);
3290         assert_eq!(expected, ucasefold(&cls));
3291 
3292         let cls = uclass(&[('A', 'Z')]);
3293         let expected = uclass(&[
3294             ('A', 'Z'),
3295             ('a', 'z'),
3296             ('\u{17F}', '\u{17F}'),
3297             ('\u{212A}', '\u{212A}'),
3298         ]);
3299         assert_eq!(expected, ucasefold(&cls));
3300 
3301         let cls = uclass(&[('a', 'z')]);
3302         let expected = uclass(&[
3303             ('A', 'Z'),
3304             ('a', 'z'),
3305             ('\u{17F}', '\u{17F}'),
3306             ('\u{212A}', '\u{212A}'),
3307         ]);
3308         assert_eq!(expected, ucasefold(&cls));
3309 
3310         let cls = uclass(&[('A', 'A'), ('_', '_')]);
3311         let expected = uclass(&[('A', 'A'), ('_', '_'), ('a', 'a')]);
3312         assert_eq!(expected, ucasefold(&cls));
3313 
3314         let cls = uclass(&[('A', 'A'), ('=', '=')]);
3315         let expected = uclass(&[('=', '='), ('A', 'A'), ('a', 'a')]);
3316         assert_eq!(expected, ucasefold(&cls));
3317 
3318         let cls = uclass(&[('\x00', '\x10')]);
3319         assert_eq!(cls, ucasefold(&cls));
3320 
3321         let cls = uclass(&[('k', 'k')]);
3322         let expected =
3323             uclass(&[('K', 'K'), ('k', 'k'), ('\u{212A}', '\u{212A}')]);
3324         assert_eq!(expected, ucasefold(&cls));
3325 
3326         let cls = uclass(&[('@', '@')]);
3327         assert_eq!(cls, ucasefold(&cls));
3328     }
3329 
3330     #[test]
3331     #[cfg(not(feature = "unicode-case"))]
class_case_fold_unicode_disabled()3332     fn class_case_fold_unicode_disabled() {
3333         let mut cls = uclass(&[
3334             ('C', 'F'),
3335             ('A', 'G'),
3336             ('D', 'J'),
3337             ('A', 'C'),
3338             ('M', 'P'),
3339             ('L', 'S'),
3340             ('c', 'f'),
3341         ]);
3342         assert!(cls.try_case_fold_simple().is_err());
3343     }
3344 
3345     #[test]
3346     #[should_panic]
3347     #[cfg(not(feature = "unicode-case"))]
class_case_fold_unicode_disabled_panics()3348     fn class_case_fold_unicode_disabled_panics() {
3349         let mut cls = uclass(&[
3350             ('C', 'F'),
3351             ('A', 'G'),
3352             ('D', 'J'),
3353             ('A', 'C'),
3354             ('M', 'P'),
3355             ('L', 'S'),
3356             ('c', 'f'),
3357         ]);
3358         cls.case_fold_simple();
3359     }
3360 
3361     #[test]
class_case_fold_bytes()3362     fn class_case_fold_bytes() {
3363         let cls = bclass(&[
3364             (b'C', b'F'),
3365             (b'A', b'G'),
3366             (b'D', b'J'),
3367             (b'A', b'C'),
3368             (b'M', b'P'),
3369             (b'L', b'S'),
3370             (b'c', b'f'),
3371         ]);
3372         let expected =
3373             bclass(&[(b'A', b'J'), (b'L', b'S'), (b'a', b'j'), (b'l', b's')]);
3374         assert_eq!(expected, bcasefold(&cls));
3375 
3376         let cls = bclass(&[(b'A', b'Z')]);
3377         let expected = bclass(&[(b'A', b'Z'), (b'a', b'z')]);
3378         assert_eq!(expected, bcasefold(&cls));
3379 
3380         let cls = bclass(&[(b'a', b'z')]);
3381         let expected = bclass(&[(b'A', b'Z'), (b'a', b'z')]);
3382         assert_eq!(expected, bcasefold(&cls));
3383 
3384         let cls = bclass(&[(b'A', b'A'), (b'_', b'_')]);
3385         let expected = bclass(&[(b'A', b'A'), (b'_', b'_'), (b'a', b'a')]);
3386         assert_eq!(expected, bcasefold(&cls));
3387 
3388         let cls = bclass(&[(b'A', b'A'), (b'=', b'=')]);
3389         let expected = bclass(&[(b'=', b'='), (b'A', b'A'), (b'a', b'a')]);
3390         assert_eq!(expected, bcasefold(&cls));
3391 
3392         let cls = bclass(&[(b'\x00', b'\x10')]);
3393         assert_eq!(cls, bcasefold(&cls));
3394 
3395         let cls = bclass(&[(b'k', b'k')]);
3396         let expected = bclass(&[(b'K', b'K'), (b'k', b'k')]);
3397         assert_eq!(expected, bcasefold(&cls));
3398 
3399         let cls = bclass(&[(b'@', b'@')]);
3400         assert_eq!(cls, bcasefold(&cls));
3401     }
3402 
3403     #[test]
class_negate_unicode()3404     fn class_negate_unicode() {
3405         let cls = uclass(&[('a', 'a')]);
3406         let expected = uclass(&[('\x00', '\x60'), ('\x62', '\u{10FFFF}')]);
3407         assert_eq!(expected, unegate(&cls));
3408 
3409         let cls = uclass(&[('a', 'a'), ('b', 'b')]);
3410         let expected = uclass(&[('\x00', '\x60'), ('\x63', '\u{10FFFF}')]);
3411         assert_eq!(expected, unegate(&cls));
3412 
3413         let cls = uclass(&[('a', 'c'), ('x', 'z')]);
3414         let expected = uclass(&[
3415             ('\x00', '\x60'),
3416             ('\x64', '\x77'),
3417             ('\x7B', '\u{10FFFF}'),
3418         ]);
3419         assert_eq!(expected, unegate(&cls));
3420 
3421         let cls = uclass(&[('\x00', 'a')]);
3422         let expected = uclass(&[('\x62', '\u{10FFFF}')]);
3423         assert_eq!(expected, unegate(&cls));
3424 
3425         let cls = uclass(&[('a', '\u{10FFFF}')]);
3426         let expected = uclass(&[('\x00', '\x60')]);
3427         assert_eq!(expected, unegate(&cls));
3428 
3429         let cls = uclass(&[('\x00', '\u{10FFFF}')]);
3430         let expected = uclass(&[]);
3431         assert_eq!(expected, unegate(&cls));
3432 
3433         let cls = uclass(&[]);
3434         let expected = uclass(&[('\x00', '\u{10FFFF}')]);
3435         assert_eq!(expected, unegate(&cls));
3436 
3437         let cls =
3438             uclass(&[('\x00', '\u{10FFFD}'), ('\u{10FFFF}', '\u{10FFFF}')]);
3439         let expected = uclass(&[('\u{10FFFE}', '\u{10FFFE}')]);
3440         assert_eq!(expected, unegate(&cls));
3441 
3442         let cls = uclass(&[('\x00', '\u{D7FF}')]);
3443         let expected = uclass(&[('\u{E000}', '\u{10FFFF}')]);
3444         assert_eq!(expected, unegate(&cls));
3445 
3446         let cls = uclass(&[('\x00', '\u{D7FE}')]);
3447         let expected = uclass(&[('\u{D7FF}', '\u{10FFFF}')]);
3448         assert_eq!(expected, unegate(&cls));
3449 
3450         let cls = uclass(&[('\u{E000}', '\u{10FFFF}')]);
3451         let expected = uclass(&[('\x00', '\u{D7FF}')]);
3452         assert_eq!(expected, unegate(&cls));
3453 
3454         let cls = uclass(&[('\u{E001}', '\u{10FFFF}')]);
3455         let expected = uclass(&[('\x00', '\u{E000}')]);
3456         assert_eq!(expected, unegate(&cls));
3457     }
3458 
3459     #[test]
class_negate_bytes()3460     fn class_negate_bytes() {
3461         let cls = bclass(&[(b'a', b'a')]);
3462         let expected = bclass(&[(b'\x00', b'\x60'), (b'\x62', b'\xFF')]);
3463         assert_eq!(expected, bnegate(&cls));
3464 
3465         let cls = bclass(&[(b'a', b'a'), (b'b', b'b')]);
3466         let expected = bclass(&[(b'\x00', b'\x60'), (b'\x63', b'\xFF')]);
3467         assert_eq!(expected, bnegate(&cls));
3468 
3469         let cls = bclass(&[(b'a', b'c'), (b'x', b'z')]);
3470         let expected = bclass(&[
3471             (b'\x00', b'\x60'),
3472             (b'\x64', b'\x77'),
3473             (b'\x7B', b'\xFF'),
3474         ]);
3475         assert_eq!(expected, bnegate(&cls));
3476 
3477         let cls = bclass(&[(b'\x00', b'a')]);
3478         let expected = bclass(&[(b'\x62', b'\xFF')]);
3479         assert_eq!(expected, bnegate(&cls));
3480 
3481         let cls = bclass(&[(b'a', b'\xFF')]);
3482         let expected = bclass(&[(b'\x00', b'\x60')]);
3483         assert_eq!(expected, bnegate(&cls));
3484 
3485         let cls = bclass(&[(b'\x00', b'\xFF')]);
3486         let expected = bclass(&[]);
3487         assert_eq!(expected, bnegate(&cls));
3488 
3489         let cls = bclass(&[]);
3490         let expected = bclass(&[(b'\x00', b'\xFF')]);
3491         assert_eq!(expected, bnegate(&cls));
3492 
3493         let cls = bclass(&[(b'\x00', b'\xFD'), (b'\xFF', b'\xFF')]);
3494         let expected = bclass(&[(b'\xFE', b'\xFE')]);
3495         assert_eq!(expected, bnegate(&cls));
3496     }
3497 
3498     #[test]
class_union_unicode()3499     fn class_union_unicode() {
3500         let cls1 = uclass(&[('a', 'g'), ('m', 't'), ('A', 'C')]);
3501         let cls2 = uclass(&[('a', 'z')]);
3502         let expected = uclass(&[('a', 'z'), ('A', 'C')]);
3503         assert_eq!(expected, uunion(&cls1, &cls2));
3504     }
3505 
3506     #[test]
class_union_bytes()3507     fn class_union_bytes() {
3508         let cls1 = bclass(&[(b'a', b'g'), (b'm', b't'), (b'A', b'C')]);
3509         let cls2 = bclass(&[(b'a', b'z')]);
3510         let expected = bclass(&[(b'a', b'z'), (b'A', b'C')]);
3511         assert_eq!(expected, bunion(&cls1, &cls2));
3512     }
3513 
3514     #[test]
class_intersect_unicode()3515     fn class_intersect_unicode() {
3516         let cls1 = uclass(&[]);
3517         let cls2 = uclass(&[('a', 'a')]);
3518         let expected = uclass(&[]);
3519         assert_eq!(expected, uintersect(&cls1, &cls2));
3520 
3521         let cls1 = uclass(&[('a', 'a')]);
3522         let cls2 = uclass(&[('a', 'a')]);
3523         let expected = uclass(&[('a', 'a')]);
3524         assert_eq!(expected, uintersect(&cls1, &cls2));
3525 
3526         let cls1 = uclass(&[('a', 'a')]);
3527         let cls2 = uclass(&[('b', 'b')]);
3528         let expected = uclass(&[]);
3529         assert_eq!(expected, uintersect(&cls1, &cls2));
3530 
3531         let cls1 = uclass(&[('a', 'a')]);
3532         let cls2 = uclass(&[('a', 'c')]);
3533         let expected = uclass(&[('a', 'a')]);
3534         assert_eq!(expected, uintersect(&cls1, &cls2));
3535 
3536         let cls1 = uclass(&[('a', 'b')]);
3537         let cls2 = uclass(&[('a', 'c')]);
3538         let expected = uclass(&[('a', 'b')]);
3539         assert_eq!(expected, uintersect(&cls1, &cls2));
3540 
3541         let cls1 = uclass(&[('a', 'b')]);
3542         let cls2 = uclass(&[('b', 'c')]);
3543         let expected = uclass(&[('b', 'b')]);
3544         assert_eq!(expected, uintersect(&cls1, &cls2));
3545 
3546         let cls1 = uclass(&[('a', 'b')]);
3547         let cls2 = uclass(&[('c', 'd')]);
3548         let expected = uclass(&[]);
3549         assert_eq!(expected, uintersect(&cls1, &cls2));
3550 
3551         let cls1 = uclass(&[('b', 'c')]);
3552         let cls2 = uclass(&[('a', 'd')]);
3553         let expected = uclass(&[('b', 'c')]);
3554         assert_eq!(expected, uintersect(&cls1, &cls2));
3555 
3556         let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
3557         let cls2 = uclass(&[('a', 'h')]);
3558         let expected = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
3559         assert_eq!(expected, uintersect(&cls1, &cls2));
3560 
3561         let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
3562         let cls2 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
3563         let expected = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
3564         assert_eq!(expected, uintersect(&cls1, &cls2));
3565 
3566         let cls1 = uclass(&[('a', 'b'), ('g', 'h')]);
3567         let cls2 = uclass(&[('d', 'e'), ('k', 'l')]);
3568         let expected = uclass(&[]);
3569         assert_eq!(expected, uintersect(&cls1, &cls2));
3570 
3571         let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
3572         let cls2 = uclass(&[('h', 'h')]);
3573         let expected = uclass(&[('h', 'h')]);
3574         assert_eq!(expected, uintersect(&cls1, &cls2));
3575 
3576         let cls1 = uclass(&[('a', 'b'), ('e', 'f'), ('i', 'j')]);
3577         let cls2 = uclass(&[('c', 'd'), ('g', 'h'), ('k', 'l')]);
3578         let expected = uclass(&[]);
3579         assert_eq!(expected, uintersect(&cls1, &cls2));
3580 
3581         let cls1 = uclass(&[('a', 'b'), ('c', 'd'), ('e', 'f')]);
3582         let cls2 = uclass(&[('b', 'c'), ('d', 'e'), ('f', 'g')]);
3583         let expected = uclass(&[('b', 'f')]);
3584         assert_eq!(expected, uintersect(&cls1, &cls2));
3585     }
3586 
3587     #[test]
class_intersect_bytes()3588     fn class_intersect_bytes() {
3589         let cls1 = bclass(&[]);
3590         let cls2 = bclass(&[(b'a', b'a')]);
3591         let expected = bclass(&[]);
3592         assert_eq!(expected, bintersect(&cls1, &cls2));
3593 
3594         let cls1 = bclass(&[(b'a', b'a')]);
3595         let cls2 = bclass(&[(b'a', b'a')]);
3596         let expected = bclass(&[(b'a', b'a')]);
3597         assert_eq!(expected, bintersect(&cls1, &cls2));
3598 
3599         let cls1 = bclass(&[(b'a', b'a')]);
3600         let cls2 = bclass(&[(b'b', b'b')]);
3601         let expected = bclass(&[]);
3602         assert_eq!(expected, bintersect(&cls1, &cls2));
3603 
3604         let cls1 = bclass(&[(b'a', b'a')]);
3605         let cls2 = bclass(&[(b'a', b'c')]);
3606         let expected = bclass(&[(b'a', b'a')]);
3607         assert_eq!(expected, bintersect(&cls1, &cls2));
3608 
3609         let cls1 = bclass(&[(b'a', b'b')]);
3610         let cls2 = bclass(&[(b'a', b'c')]);
3611         let expected = bclass(&[(b'a', b'b')]);
3612         assert_eq!(expected, bintersect(&cls1, &cls2));
3613 
3614         let cls1 = bclass(&[(b'a', b'b')]);
3615         let cls2 = bclass(&[(b'b', b'c')]);
3616         let expected = bclass(&[(b'b', b'b')]);
3617         assert_eq!(expected, bintersect(&cls1, &cls2));
3618 
3619         let cls1 = bclass(&[(b'a', b'b')]);
3620         let cls2 = bclass(&[(b'c', b'd')]);
3621         let expected = bclass(&[]);
3622         assert_eq!(expected, bintersect(&cls1, &cls2));
3623 
3624         let cls1 = bclass(&[(b'b', b'c')]);
3625         let cls2 = bclass(&[(b'a', b'd')]);
3626         let expected = bclass(&[(b'b', b'c')]);
3627         assert_eq!(expected, bintersect(&cls1, &cls2));
3628 
3629         let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
3630         let cls2 = bclass(&[(b'a', b'h')]);
3631         let expected = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
3632         assert_eq!(expected, bintersect(&cls1, &cls2));
3633 
3634         let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
3635         let cls2 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
3636         let expected = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
3637         assert_eq!(expected, bintersect(&cls1, &cls2));
3638 
3639         let cls1 = bclass(&[(b'a', b'b'), (b'g', b'h')]);
3640         let cls2 = bclass(&[(b'd', b'e'), (b'k', b'l')]);
3641         let expected = bclass(&[]);
3642         assert_eq!(expected, bintersect(&cls1, &cls2));
3643 
3644         let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
3645         let cls2 = bclass(&[(b'h', b'h')]);
3646         let expected = bclass(&[(b'h', b'h')]);
3647         assert_eq!(expected, bintersect(&cls1, &cls2));
3648 
3649         let cls1 = bclass(&[(b'a', b'b'), (b'e', b'f'), (b'i', b'j')]);
3650         let cls2 = bclass(&[(b'c', b'd'), (b'g', b'h'), (b'k', b'l')]);
3651         let expected = bclass(&[]);
3652         assert_eq!(expected, bintersect(&cls1, &cls2));
3653 
3654         let cls1 = bclass(&[(b'a', b'b'), (b'c', b'd'), (b'e', b'f')]);
3655         let cls2 = bclass(&[(b'b', b'c'), (b'd', b'e'), (b'f', b'g')]);
3656         let expected = bclass(&[(b'b', b'f')]);
3657         assert_eq!(expected, bintersect(&cls1, &cls2));
3658     }
3659 
3660     #[test]
class_difference_unicode()3661     fn class_difference_unicode() {
3662         let cls1 = uclass(&[('a', 'a')]);
3663         let cls2 = uclass(&[('a', 'a')]);
3664         let expected = uclass(&[]);
3665         assert_eq!(expected, udifference(&cls1, &cls2));
3666 
3667         let cls1 = uclass(&[('a', 'a')]);
3668         let cls2 = uclass(&[]);
3669         let expected = uclass(&[('a', 'a')]);
3670         assert_eq!(expected, udifference(&cls1, &cls2));
3671 
3672         let cls1 = uclass(&[]);
3673         let cls2 = uclass(&[('a', 'a')]);
3674         let expected = uclass(&[]);
3675         assert_eq!(expected, udifference(&cls1, &cls2));
3676 
3677         let cls1 = uclass(&[('a', 'z')]);
3678         let cls2 = uclass(&[('a', 'a')]);
3679         let expected = uclass(&[('b', 'z')]);
3680         assert_eq!(expected, udifference(&cls1, &cls2));
3681 
3682         let cls1 = uclass(&[('a', 'z')]);
3683         let cls2 = uclass(&[('z', 'z')]);
3684         let expected = uclass(&[('a', 'y')]);
3685         assert_eq!(expected, udifference(&cls1, &cls2));
3686 
3687         let cls1 = uclass(&[('a', 'z')]);
3688         let cls2 = uclass(&[('m', 'm')]);
3689         let expected = uclass(&[('a', 'l'), ('n', 'z')]);
3690         assert_eq!(expected, udifference(&cls1, &cls2));
3691 
3692         let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]);
3693         let cls2 = uclass(&[('a', 'z')]);
3694         let expected = uclass(&[]);
3695         assert_eq!(expected, udifference(&cls1, &cls2));
3696 
3697         let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]);
3698         let cls2 = uclass(&[('d', 'v')]);
3699         let expected = uclass(&[('a', 'c')]);
3700         assert_eq!(expected, udifference(&cls1, &cls2));
3701 
3702         let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]);
3703         let cls2 = uclass(&[('b', 'g'), ('s', 'u')]);
3704         let expected = uclass(&[('a', 'a'), ('h', 'i'), ('r', 'r')]);
3705         assert_eq!(expected, udifference(&cls1, &cls2));
3706 
3707         let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]);
3708         let cls2 = uclass(&[('b', 'd'), ('e', 'g'), ('s', 'u')]);
3709         let expected = uclass(&[('a', 'a'), ('h', 'i'), ('r', 'r')]);
3710         assert_eq!(expected, udifference(&cls1, &cls2));
3711 
3712         let cls1 = uclass(&[('x', 'z')]);
3713         let cls2 = uclass(&[('a', 'c'), ('e', 'g'), ('s', 'u')]);
3714         let expected = uclass(&[('x', 'z')]);
3715         assert_eq!(expected, udifference(&cls1, &cls2));
3716 
3717         let cls1 = uclass(&[('a', 'z')]);
3718         let cls2 = uclass(&[('a', 'c'), ('e', 'g'), ('s', 'u')]);
3719         let expected = uclass(&[('d', 'd'), ('h', 'r'), ('v', 'z')]);
3720         assert_eq!(expected, udifference(&cls1, &cls2));
3721     }
3722 
3723     #[test]
class_difference_bytes()3724     fn class_difference_bytes() {
3725         let cls1 = bclass(&[(b'a', b'a')]);
3726         let cls2 = bclass(&[(b'a', b'a')]);
3727         let expected = bclass(&[]);
3728         assert_eq!(expected, bdifference(&cls1, &cls2));
3729 
3730         let cls1 = bclass(&[(b'a', b'a')]);
3731         let cls2 = bclass(&[]);
3732         let expected = bclass(&[(b'a', b'a')]);
3733         assert_eq!(expected, bdifference(&cls1, &cls2));
3734 
3735         let cls1 = bclass(&[]);
3736         let cls2 = bclass(&[(b'a', b'a')]);
3737         let expected = bclass(&[]);
3738         assert_eq!(expected, bdifference(&cls1, &cls2));
3739 
3740         let cls1 = bclass(&[(b'a', b'z')]);
3741         let cls2 = bclass(&[(b'a', b'a')]);
3742         let expected = bclass(&[(b'b', b'z')]);
3743         assert_eq!(expected, bdifference(&cls1, &cls2));
3744 
3745         let cls1 = bclass(&[(b'a', b'z')]);
3746         let cls2 = bclass(&[(b'z', b'z')]);
3747         let expected = bclass(&[(b'a', b'y')]);
3748         assert_eq!(expected, bdifference(&cls1, &cls2));
3749 
3750         let cls1 = bclass(&[(b'a', b'z')]);
3751         let cls2 = bclass(&[(b'm', b'm')]);
3752         let expected = bclass(&[(b'a', b'l'), (b'n', b'z')]);
3753         assert_eq!(expected, bdifference(&cls1, &cls2));
3754 
3755         let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]);
3756         let cls2 = bclass(&[(b'a', b'z')]);
3757         let expected = bclass(&[]);
3758         assert_eq!(expected, bdifference(&cls1, &cls2));
3759 
3760         let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]);
3761         let cls2 = bclass(&[(b'd', b'v')]);
3762         let expected = bclass(&[(b'a', b'c')]);
3763         assert_eq!(expected, bdifference(&cls1, &cls2));
3764 
3765         let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]);
3766         let cls2 = bclass(&[(b'b', b'g'), (b's', b'u')]);
3767         let expected = bclass(&[(b'a', b'a'), (b'h', b'i'), (b'r', b'r')]);
3768         assert_eq!(expected, bdifference(&cls1, &cls2));
3769 
3770         let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]);
3771         let cls2 = bclass(&[(b'b', b'd'), (b'e', b'g'), (b's', b'u')]);
3772         let expected = bclass(&[(b'a', b'a'), (b'h', b'i'), (b'r', b'r')]);
3773         assert_eq!(expected, bdifference(&cls1, &cls2));
3774 
3775         let cls1 = bclass(&[(b'x', b'z')]);
3776         let cls2 = bclass(&[(b'a', b'c'), (b'e', b'g'), (b's', b'u')]);
3777         let expected = bclass(&[(b'x', b'z')]);
3778         assert_eq!(expected, bdifference(&cls1, &cls2));
3779 
3780         let cls1 = bclass(&[(b'a', b'z')]);
3781         let cls2 = bclass(&[(b'a', b'c'), (b'e', b'g'), (b's', b'u')]);
3782         let expected = bclass(&[(b'd', b'd'), (b'h', b'r'), (b'v', b'z')]);
3783         assert_eq!(expected, bdifference(&cls1, &cls2));
3784     }
3785 
3786     #[test]
class_symmetric_difference_unicode()3787     fn class_symmetric_difference_unicode() {
3788         let cls1 = uclass(&[('a', 'm')]);
3789         let cls2 = uclass(&[('g', 't')]);
3790         let expected = uclass(&[('a', 'f'), ('n', 't')]);
3791         assert_eq!(expected, usymdifference(&cls1, &cls2));
3792     }
3793 
3794     #[test]
class_symmetric_difference_bytes()3795     fn class_symmetric_difference_bytes() {
3796         let cls1 = bclass(&[(b'a', b'm')]);
3797         let cls2 = bclass(&[(b'g', b't')]);
3798         let expected = bclass(&[(b'a', b'f'), (b'n', b't')]);
3799         assert_eq!(expected, bsymdifference(&cls1, &cls2));
3800     }
3801 
3802     // We use a thread with an explicit stack size to test that our destructor
3803     // for Hir can handle arbitrarily sized expressions in constant stack
3804     // space. In case we run on a platform without threads (WASM?), we limit
3805     // this test to Windows/Unix.
3806     #[test]
3807     #[cfg(any(unix, windows))]
no_stack_overflow_on_drop()3808     fn no_stack_overflow_on_drop() {
3809         use std::thread;
3810 
3811         let run = || {
3812             let mut expr = Hir::empty();
3813             for _ in 0..100 {
3814                 expr = Hir::capture(Capture {
3815                     index: 1,
3816                     name: None,
3817                     sub: Box::new(expr),
3818                 });
3819                 expr = Hir::repetition(Repetition {
3820                     min: 0,
3821                     max: Some(1),
3822                     greedy: true,
3823                     sub: Box::new(expr),
3824                 });
3825 
3826                 expr = Hir {
3827                     kind: HirKind::Concat(vec![expr]),
3828                     props: Properties::empty(),
3829                 };
3830                 expr = Hir {
3831                     kind: HirKind::Alternation(vec![expr]),
3832                     props: Properties::empty(),
3833                 };
3834             }
3835             assert!(!matches!(*expr.kind(), HirKind::Empty));
3836         };
3837 
3838         // We run our test on a thread with a small stack size so we can
3839         // force the issue more easily.
3840         //
3841         // NOTE(2023-03-21): See the corresponding test in 'crate::ast::tests'
3842         // for context on the specific stack size chosen here.
3843         thread::Builder::new()
3844             .stack_size(16 << 10)
3845             .spawn(run)
3846             .unwrap()
3847             .join()
3848             .unwrap();
3849     }
3850 
3851     #[test]
look_set_iter()3852     fn look_set_iter() {
3853         let set = LookSet::empty();
3854         assert_eq!(0, set.iter().count());
3855 
3856         let set = LookSet::full();
3857         assert_eq!(18, set.iter().count());
3858 
3859         let set =
3860             LookSet::empty().insert(Look::StartLF).insert(Look::WordUnicode);
3861         assert_eq!(2, set.iter().count());
3862 
3863         let set = LookSet::empty().insert(Look::StartLF);
3864         assert_eq!(1, set.iter().count());
3865 
3866         let set = LookSet::empty().insert(Look::WordAsciiNegate);
3867         assert_eq!(1, set.iter().count());
3868     }
3869 
3870     #[test]
look_set_debug()3871     fn look_set_debug() {
3872         let res = format!("{:?}", LookSet::empty());
3873         assert_eq!("∅", res);
3874         let res = format!("{:?}", LookSet::full());
3875         assert_eq!("Az^$rRbB����<>〈〉◁▷◀▶", res);
3876     }
3877 }
3878