1 use core::{fmt, mem};
2 
3 use alloc::{boxed::Box, format, string::String, sync::Arc, vec, vec::Vec};
4 
5 #[cfg(feature = "syntax")]
6 use crate::nfa::thompson::{
7     compiler::{Compiler, Config},
8     error::BuildError,
9 };
10 use crate::{
11     nfa::thompson::builder::Builder,
12     util::{
13         alphabet::{self, ByteClassSet, ByteClasses},
14         captures::{GroupInfo, GroupInfoError},
15         look::{Look, LookMatcher, LookSet},
16         primitives::{
17             IteratorIndexExt, PatternID, PatternIDIter, SmallIndex, StateID,
18         },
19         sparse_set::SparseSet,
20     },
21 };
22 
23 /// A byte oriented Thompson non-deterministic finite automaton (NFA).
24 ///
25 /// A Thompson NFA is a finite state machine that permits unconditional epsilon
26 /// transitions, but guarantees that there exists at most one non-epsilon
27 /// transition for each element in the alphabet for each state.
28 ///
29 /// An NFA may be used directly for searching, for analysis or to build
30 /// a deterministic finite automaton (DFA).
31 ///
32 /// # Cheap clones
33 ///
34 /// Since an NFA is a core data type in this crate that many other regex
35 /// engines are based on top of, it is convenient to give ownership of an NFA
36 /// to said regex engines. Because of this, an NFA uses reference counting
37 /// internally. Therefore, it is cheap to clone and it is encouraged to do so.
38 ///
39 /// # Capabilities
40 ///
41 /// Using an NFA for searching via the
42 /// [`PikeVM`](crate::nfa::thompson::pikevm::PikeVM) provides the most amount
43 /// of "power" of any regex engine in this crate. Namely, it supports the
44 /// following in all cases:
45 ///
46 /// 1. Detection of a match.
47 /// 2. Location of a match, including both the start and end offset, in a
48 /// single pass of the haystack.
49 /// 3. Location of matching capturing groups.
50 /// 4. Handles multiple patterns, including (1)-(3) when multiple patterns are
51 /// present.
52 ///
53 /// # Capturing Groups
54 ///
55 /// Groups refer to parenthesized expressions inside a regex pattern. They look
56 /// like this, where `exp` is an arbitrary regex:
57 ///
58 /// * `(exp)` - An unnamed capturing group.
59 /// * `(?P<name>exp)` or `(?<name>exp)` - A named capturing group.
60 /// * `(?:exp)` - A non-capturing group.
61 /// * `(?i:exp)` - A non-capturing group that sets flags.
62 ///
63 /// Only the first two forms are said to be _capturing_. Capturing
64 /// means that the last position at which they match is reportable. The
65 /// [`Captures`](crate::util::captures::Captures) type provides convenient
66 /// access to the match positions of capturing groups, which includes looking
67 /// up capturing groups by their name.
68 ///
69 /// # Byte oriented
70 ///
71 /// This NFA is byte oriented, which means that all of its transitions are
72 /// defined on bytes. In other words, the alphabet of an NFA consists of the
73 /// 256 different byte values.
74 ///
75 /// While DFAs nearly demand that they be byte oriented for performance
76 /// reasons, an NFA could conceivably be *Unicode codepoint* oriented. Indeed,
77 /// a previous version of this NFA supported both byte and codepoint oriented
78 /// modes. A codepoint oriented mode can work because an NFA fundamentally uses
79 /// a sparse representation of transitions, which works well with the large
80 /// sparse space of Unicode codepoints.
81 ///
82 /// Nevertheless, this NFA is only byte oriented. This choice is primarily
83 /// driven by implementation simplicity, and also in part memory usage. In
84 /// practice, performance between the two is roughly comparable. However,
85 /// building a DFA (including a hybrid DFA) really wants a byte oriented NFA.
86 /// So if we do have a codepoint oriented NFA, then we also need to generate
87 /// byte oriented NFA in order to build an hybrid NFA/DFA. Thus, by only
88 /// generating byte oriented NFAs, we can produce one less NFA. In other words,
89 /// if we made our NFA codepoint oriented, we'd need to *also* make it support
90 /// a byte oriented mode, which is more complicated. But a byte oriented mode
91 /// can support everything.
92 ///
93 /// # Differences with DFAs
94 ///
95 /// At the theoretical level, the precise difference between an NFA and a DFA
96 /// is that, in a DFA, for every state, an input symbol unambiguously refers
97 /// to a single transition _and_ that an input symbol is required for each
98 /// transition. At a practical level, this permits DFA implementations to be
99 /// implemented at their core with a small constant number of CPU instructions
100 /// for each byte of input searched. In practice, this makes them quite a bit
101 /// faster than NFAs _in general_. Namely, in order to execute a search for any
102 /// Thompson NFA, one needs to keep track of a _set_ of states, and execute
103 /// the possible transitions on all of those states for each input symbol.
104 /// Overall, this results in much more overhead. To a first approximation, one
105 /// can expect DFA searches to be about an order of magnitude faster.
106 ///
107 /// So why use an NFA at all? The main advantage of an NFA is that it takes
108 /// linear time (in the size of the pattern string after repetitions have been
109 /// expanded) to build and linear memory usage. A DFA, on the other hand, may
110 /// take exponential time and/or space to build. Even in non-pathological
111 /// cases, DFAs often take quite a bit more memory than their NFA counterparts,
112 /// _especially_ if large Unicode character classes are involved. Of course,
113 /// an NFA also provides additional capabilities. For example, it can match
114 /// Unicode word boundaries on non-ASCII text and resolve the positions of
115 /// capturing groups.
116 ///
117 /// Note that a [`hybrid::regex::Regex`](crate::hybrid::regex::Regex) strikes a
118 /// good balance between an NFA and a DFA. It avoids the exponential build time
119 /// of a DFA while maintaining its fast search time. The downside of a hybrid
120 /// NFA/DFA is that in some cases it can be slower at search time than the NFA.
121 /// (It also has less functionality than a pure NFA. It cannot handle Unicode
122 /// word boundaries on non-ASCII text and cannot resolve capturing groups.)
123 ///
124 /// # Example
125 ///
126 /// This shows how to build an NFA with the default configuration and execute a
127 /// search using the Pike VM.
128 ///
129 /// ```
130 /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match};
131 ///
132 /// let re = PikeVM::new(r"foo[0-9]+")?;
133 /// let mut cache = re.create_cache();
134 /// let mut caps = re.create_captures();
135 ///
136 /// let expected = Some(Match::must(0, 0..8));
137 /// re.captures(&mut cache, b"foo12345", &mut caps);
138 /// assert_eq!(expected, caps.get_match());
139 ///
140 /// # Ok::<(), Box<dyn std::error::Error>>(())
141 /// ```
142 ///
143 /// # Example: resolving capturing groups
144 ///
145 /// This example shows how to parse some simple dates and extract the
146 /// components of each date via capturing groups.
147 ///
148 /// ```
149 /// # if cfg!(miri) { return Ok(()); } // miri takes too long
150 /// use regex_automata::{
151 ///     nfa::thompson::pikevm::PikeVM,
152 ///     util::captures::Captures,
153 /// };
154 ///
155 /// let vm = PikeVM::new(r"(?P<y>\d{4})-(?P<m>\d{2})-(?P<d>\d{2})")?;
156 /// let mut cache = vm.create_cache();
157 ///
158 /// let haystack = "2012-03-14, 2013-01-01 and 2014-07-05";
159 /// let all: Vec<Captures> = vm.captures_iter(
160 ///     &mut cache, haystack.as_bytes()
161 /// ).collect();
162 /// // There should be a total of 3 matches.
163 /// assert_eq!(3, all.len());
164 /// // The year from the second match is '2013'.
165 /// let span = all[1].get_group_by_name("y").unwrap();
166 /// assert_eq!("2013", &haystack[span]);
167 ///
168 /// # Ok::<(), Box<dyn std::error::Error>>(())
169 /// ```
170 ///
171 /// This example shows that only the last match of a capturing group is
172 /// reported, even if it had to match multiple times for an overall match
173 /// to occur.
174 ///
175 /// ```
176 /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Span};
177 ///
178 /// let re = PikeVM::new(r"([a-z]){4}")?;
179 /// let mut cache = re.create_cache();
180 /// let mut caps = re.create_captures();
181 ///
182 /// let haystack = b"quux";
183 /// re.captures(&mut cache, haystack, &mut caps);
184 /// assert!(caps.is_match());
185 /// assert_eq!(Some(Span::from(3..4)), caps.get_group(1));
186 ///
187 /// # Ok::<(), Box<dyn std::error::Error>>(())
188 /// ```
189 #[derive(Clone)]
190 pub struct NFA(
191     // We make NFAs reference counted primarily for two reasons. First is that
192     // the NFA type itself is quite large (at least 0.5KB), and so it makes
193     // sense to put it on the heap by default anyway. Second is that, for Arc
194     // specifically, this enables cheap clones. This tends to be useful because
195     // several structures (the backtracker, the Pike VM, the hybrid NFA/DFA)
196     // all want to hang on to an NFA for use during search time. We could
197     // provide the NFA at search time via a function argument, but this makes
198     // for an unnecessarily annoying API. Instead, we just let each structure
199     // share ownership of the NFA. Using a deep clone would not be smart, since
200     // the NFA can use quite a bit of heap space.
201     Arc<Inner>,
202 );
203 
204 impl NFA {
205     /// Parse the given regular expression using a default configuration and
206     /// build an NFA from it.
207     ///
208     /// If you want a non-default configuration, then use the NFA
209     /// [`Compiler`] with a [`Config`].
210     ///
211     /// # Example
212     ///
213     /// ```
214     /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match};
215     ///
216     /// let re = PikeVM::new(r"foo[0-9]+")?;
217     /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
218     ///
219     /// let expected = Some(Match::must(0, 0..8));
220     /// re.captures(&mut cache, b"foo12345", &mut caps);
221     /// assert_eq!(expected, caps.get_match());
222     ///
223     /// # Ok::<(), Box<dyn std::error::Error>>(())
224     /// ```
225     #[cfg(feature = "syntax")]
new(pattern: &str) -> Result<NFA, BuildError>226     pub fn new(pattern: &str) -> Result<NFA, BuildError> {
227         NFA::compiler().build(pattern)
228     }
229 
230     /// Parse the given regular expressions using a default configuration and
231     /// build a multi-NFA from them.
232     ///
233     /// If you want a non-default configuration, then use the NFA
234     /// [`Compiler`] with a [`Config`].
235     ///
236     /// # Example
237     ///
238     /// ```
239     /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match};
240     ///
241     /// let re = PikeVM::new_many(&["[0-9]+", "[a-z]+"])?;
242     /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
243     ///
244     /// let expected = Some(Match::must(1, 0..3));
245     /// re.captures(&mut cache, b"foo12345bar", &mut caps);
246     /// assert_eq!(expected, caps.get_match());
247     ///
248     /// # Ok::<(), Box<dyn std::error::Error>>(())
249     /// ```
250     #[cfg(feature = "syntax")]
new_many<P: AsRef<str>>(patterns: &[P]) -> Result<NFA, BuildError>251     pub fn new_many<P: AsRef<str>>(patterns: &[P]) -> Result<NFA, BuildError> {
252         NFA::compiler().build_many(patterns)
253     }
254 
255     /// Returns an NFA with a single regex pattern that always matches at every
256     /// position.
257     ///
258     /// # Example
259     ///
260     /// ```
261     /// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match};
262     ///
263     /// let re = PikeVM::new_from_nfa(NFA::always_match())?;
264     /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
265     ///
266     /// let expected = Some(Match::must(0, 0..0));
267     /// re.captures(&mut cache, b"", &mut caps);
268     /// assert_eq!(expected, caps.get_match());
269     /// re.captures(&mut cache, b"foo", &mut caps);
270     /// assert_eq!(expected, caps.get_match());
271     ///
272     /// # Ok::<(), Box<dyn std::error::Error>>(())
273     /// ```
always_match() -> NFA274     pub fn always_match() -> NFA {
275         // We could use NFA::new("") here and we'd get the same semantics, but
276         // hand-assembling the NFA (as below) does the same thing with a fewer
277         // number of states. It also avoids needing the 'syntax' feature
278         // enabled.
279         //
280         // Technically all we need is the "match" state, but we add the
281         // "capture" states so that the PikeVM can use this NFA.
282         //
283         // The unwraps below are OK because we add so few states that they will
284         // never exhaust any default limits in any environment.
285         let mut builder = Builder::new();
286         let pid = builder.start_pattern().unwrap();
287         assert_eq!(pid.as_usize(), 0);
288         let start_id =
289             builder.add_capture_start(StateID::ZERO, 0, None).unwrap();
290         let end_id = builder.add_capture_end(StateID::ZERO, 0).unwrap();
291         let match_id = builder.add_match().unwrap();
292         builder.patch(start_id, end_id).unwrap();
293         builder.patch(end_id, match_id).unwrap();
294         let pid = builder.finish_pattern(start_id).unwrap();
295         assert_eq!(pid.as_usize(), 0);
296         builder.build(start_id, start_id).unwrap()
297     }
298 
299     /// Returns an NFA that never matches at any position.
300     ///
301     /// This is a convenience routine for creating an NFA with zero patterns.
302     ///
303     /// # Example
304     ///
305     /// ```
306     /// use regex_automata::nfa::thompson::{NFA, pikevm::PikeVM};
307     ///
308     /// let re = PikeVM::new_from_nfa(NFA::never_match())?;
309     /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
310     ///
311     /// re.captures(&mut cache, b"", &mut caps);
312     /// assert!(!caps.is_match());
313     /// re.captures(&mut cache, b"foo", &mut caps);
314     /// assert!(!caps.is_match());
315     ///
316     /// # Ok::<(), Box<dyn std::error::Error>>(())
317     /// ```
never_match() -> NFA318     pub fn never_match() -> NFA {
319         // This always succeeds because it only requires one NFA state, which
320         // will never exhaust any (default) limits.
321         let mut builder = Builder::new();
322         let sid = builder.add_fail().unwrap();
323         builder.build(sid, sid).unwrap()
324     }
325 
326     /// Return a default configuration for an `NFA`.
327     ///
328     /// This is a convenience routine to avoid needing to import the `Config`
329     /// type when customizing the construction of an NFA.
330     ///
331     /// # Example
332     ///
333     /// This example shows how to build an NFA with a small size limit that
334     /// results in a compilation error for any regex that tries to use more
335     /// heap memory than the configured limit.
336     ///
337     /// ```
338     /// use regex_automata::nfa::thompson::{NFA, pikevm::PikeVM};
339     ///
340     /// let result = PikeVM::builder()
341     ///     .thompson(NFA::config().nfa_size_limit(Some(1_000)))
342     ///     // Remember, \w is Unicode-aware by default and thus huge.
343     ///     .build(r"\w+");
344     /// assert!(result.is_err());
345     /// ```
346     #[cfg(feature = "syntax")]
config() -> Config347     pub fn config() -> Config {
348         Config::new()
349     }
350 
351     /// Return a compiler for configuring the construction of an `NFA`.
352     ///
353     /// This is a convenience routine to avoid needing to import the
354     /// [`Compiler`] type in common cases.
355     ///
356     /// # Example
357     ///
358     /// This example shows how to build an NFA that is permitted match invalid
359     /// UTF-8. Without the additional syntax configuration here, compilation of
360     /// `(?-u:.)` would fail because it is permitted to match invalid UTF-8.
361     ///
362     /// ```
363     /// use regex_automata::{
364     ///     nfa::thompson::pikevm::PikeVM,
365     ///     util::syntax,
366     ///     Match,
367     /// };
368     ///
369     /// let re = PikeVM::builder()
370     ///     .syntax(syntax::Config::new().utf8(false))
371     ///     .build(r"[a-z]+(?-u:.)")?;
372     /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
373     ///
374     /// let expected = Some(Match::must(0, 1..5));
375     /// re.captures(&mut cache, b"\xFFabc\xFF", &mut caps);
376     /// assert_eq!(expected, caps.get_match());
377     ///
378     /// # Ok::<(), Box<dyn std::error::Error>>(())
379     /// ```
380     #[cfg(feature = "syntax")]
compiler() -> Compiler381     pub fn compiler() -> Compiler {
382         Compiler::new()
383     }
384 
385     /// Returns an iterator over all pattern identifiers in this NFA.
386     ///
387     /// Pattern IDs are allocated in sequential order starting from zero,
388     /// where the order corresponds to the order of patterns provided to the
389     /// [`NFA::new_many`] constructor.
390     ///
391     /// # Example
392     ///
393     /// ```
394     /// use regex_automata::{nfa::thompson::NFA, PatternID};
395     ///
396     /// let nfa = NFA::new_many(&["[0-9]+", "[a-z]+", "[A-Z]+"])?;
397     /// let pids: Vec<PatternID> = nfa.patterns().collect();
398     /// assert_eq!(pids, vec![
399     ///     PatternID::must(0),
400     ///     PatternID::must(1),
401     ///     PatternID::must(2),
402     /// ]);
403     ///
404     /// # Ok::<(), Box<dyn std::error::Error>>(())
405     /// ```
patterns(&self) -> PatternIter<'_>406     pub fn patterns(&self) -> PatternIter<'_> {
407         PatternIter {
408             it: PatternID::iter(self.pattern_len()),
409             _marker: core::marker::PhantomData,
410         }
411     }
412 
413     /// Returns the total number of regex patterns in this NFA.
414     ///
415     /// This may return zero if the NFA was constructed with no patterns. In
416     /// this case, the NFA can never produce a match for any input.
417     ///
418     /// This is guaranteed to be no bigger than [`PatternID::LIMIT`] because
419     /// NFA construction will fail if too many patterns are added.
420     ///
421     /// It is always true that `nfa.patterns().count() == nfa.pattern_len()`.
422     ///
423     /// # Example
424     ///
425     /// ```
426     /// use regex_automata::nfa::thompson::NFA;
427     ///
428     /// let nfa = NFA::new_many(&["[0-9]+", "[a-z]+", "[A-Z]+"])?;
429     /// assert_eq!(3, nfa.pattern_len());
430     ///
431     /// let nfa = NFA::never_match();
432     /// assert_eq!(0, nfa.pattern_len());
433     ///
434     /// let nfa = NFA::always_match();
435     /// assert_eq!(1, nfa.pattern_len());
436     ///
437     /// # Ok::<(), Box<dyn std::error::Error>>(())
438     /// ```
439     #[inline]
pattern_len(&self) -> usize440     pub fn pattern_len(&self) -> usize {
441         self.0.start_pattern.len()
442     }
443 
444     /// Return the state identifier of the initial anchored state of this NFA.
445     ///
446     /// The returned identifier is guaranteed to be a valid index into the
447     /// slice returned by [`NFA::states`], and is also a valid argument to
448     /// [`NFA::state`].
449     ///
450     /// # Example
451     ///
452     /// This example shows a somewhat contrived example where we can easily
453     /// predict the anchored starting state.
454     ///
455     /// ```
456     /// use regex_automata::nfa::thompson::{NFA, State, WhichCaptures};
457     ///
458     /// let nfa = NFA::compiler()
459     ///     .configure(NFA::config().which_captures(WhichCaptures::None))
460     ///     .build("a")?;
461     /// let state = nfa.state(nfa.start_anchored());
462     /// match *state {
463     ///     State::ByteRange { trans } => {
464     ///         assert_eq!(b'a', trans.start);
465     ///         assert_eq!(b'a', trans.end);
466     ///     }
467     ///     _ => unreachable!("unexpected state"),
468     /// }
469     ///
470     /// # Ok::<(), Box<dyn std::error::Error>>(())
471     /// ```
472     #[inline]
start_anchored(&self) -> StateID473     pub fn start_anchored(&self) -> StateID {
474         self.0.start_anchored
475     }
476 
477     /// Return the state identifier of the initial unanchored state of this
478     /// NFA.
479     ///
480     /// This is equivalent to the identifier returned by
481     /// [`NFA::start_anchored`] when the NFA has no unanchored starting state.
482     ///
483     /// The returned identifier is guaranteed to be a valid index into the
484     /// slice returned by [`NFA::states`], and is also a valid argument to
485     /// [`NFA::state`].
486     ///
487     /// # Example
488     ///
489     /// This example shows that the anchored and unanchored starting states
490     /// are equivalent when an anchored NFA is built.
491     ///
492     /// ```
493     /// use regex_automata::nfa::thompson::NFA;
494     ///
495     /// let nfa = NFA::new("^a")?;
496     /// assert_eq!(nfa.start_anchored(), nfa.start_unanchored());
497     ///
498     /// # Ok::<(), Box<dyn std::error::Error>>(())
499     /// ```
500     #[inline]
start_unanchored(&self) -> StateID501     pub fn start_unanchored(&self) -> StateID {
502         self.0.start_unanchored
503     }
504 
505     /// Return the state identifier of the initial anchored state for the given
506     /// pattern, or `None` if there is no pattern corresponding to the given
507     /// identifier.
508     ///
509     /// If one uses the starting state for a particular pattern, then the only
510     /// match that can be returned is for the corresponding pattern.
511     ///
512     /// The returned identifier is guaranteed to be a valid index into the
513     /// slice returned by [`NFA::states`], and is also a valid argument to
514     /// [`NFA::state`].
515     ///
516     /// # Errors
517     ///
518     /// If the pattern doesn't exist in this NFA, then this returns an error.
519     /// This occurs when `pid.as_usize() >= nfa.pattern_len()`.
520     ///
521     /// # Example
522     ///
523     /// This example shows that the anchored and unanchored starting states
524     /// are equivalent when an anchored NFA is built.
525     ///
526     /// ```
527     /// use regex_automata::{nfa::thompson::NFA, PatternID};
528     ///
529     /// let nfa = NFA::new_many(&["^a", "^b"])?;
530     /// // The anchored and unanchored states for the entire NFA are the same,
531     /// // since all of the patterns are anchored.
532     /// assert_eq!(nfa.start_anchored(), nfa.start_unanchored());
533     /// // But the anchored starting states for each pattern are distinct,
534     /// // because these starting states can only lead to matches for the
535     /// // corresponding pattern.
536     /// let anchored = Some(nfa.start_anchored());
537     /// assert_ne!(anchored, nfa.start_pattern(PatternID::must(0)));
538     /// assert_ne!(anchored, nfa.start_pattern(PatternID::must(1)));
539     /// // Requesting a pattern not in the NFA will result in None:
540     /// assert_eq!(None, nfa.start_pattern(PatternID::must(2)));
541     ///
542     /// # Ok::<(), Box<dyn std::error::Error>>(())
543     /// ```
544     #[inline]
start_pattern(&self, pid: PatternID) -> Option<StateID>545     pub fn start_pattern(&self, pid: PatternID) -> Option<StateID> {
546         self.0.start_pattern.get(pid.as_usize()).copied()
547     }
548 
549     /// Get the byte class set for this NFA.
550     ///
551     /// A byte class set is a partitioning of this NFA's alphabet into
552     /// equivalence classes. Any two bytes in the same equivalence class are
553     /// guaranteed to never discriminate between a match or a non-match. (The
554     /// partitioning may not be minimal.)
555     ///
556     /// Byte classes are used internally by this crate when building DFAs.
557     /// Namely, among other optimizations, they enable a space optimization
558     /// where the DFA's internal alphabet is defined over the equivalence
559     /// classes of bytes instead of all possible byte values. The former is
560     /// often quite a bit smaller than the latter, which permits the DFA to use
561     /// less space for its transition table.
562     #[inline]
byte_class_set(&self) -> &ByteClassSet563     pub(crate) fn byte_class_set(&self) -> &ByteClassSet {
564         &self.0.byte_class_set
565     }
566 
567     /// Get the byte classes for this NFA.
568     ///
569     /// Byte classes represent a partitioning of this NFA's alphabet into
570     /// equivalence classes. Any two bytes in the same equivalence class are
571     /// guaranteed to never discriminate between a match or a non-match. (The
572     /// partitioning may not be minimal.)
573     ///
574     /// Byte classes are used internally by this crate when building DFAs.
575     /// Namely, among other optimizations, they enable a space optimization
576     /// where the DFA's internal alphabet is defined over the equivalence
577     /// classes of bytes instead of all possible byte values. The former is
578     /// often quite a bit smaller than the latter, which permits the DFA to use
579     /// less space for its transition table.
580     ///
581     /// # Example
582     ///
583     /// This example shows how to query the class of various bytes.
584     ///
585     /// ```
586     /// use regex_automata::nfa::thompson::NFA;
587     ///
588     /// let nfa = NFA::new("[a-z]+")?;
589     /// let classes = nfa.byte_classes();
590     /// // 'a' and 'z' are in the same class for this regex.
591     /// assert_eq!(classes.get(b'a'), classes.get(b'z'));
592     /// // But 'a' and 'A' are not.
593     /// assert_ne!(classes.get(b'a'), classes.get(b'A'));
594     ///
595     /// # Ok::<(), Box<dyn std::error::Error>>(())
596     /// ```
597     #[inline]
byte_classes(&self) -> &ByteClasses598     pub fn byte_classes(&self) -> &ByteClasses {
599         &self.0.byte_classes
600     }
601 
602     /// Return a reference to the NFA state corresponding to the given ID.
603     ///
604     /// This is a convenience routine for `nfa.states()[id]`.
605     ///
606     /// # Panics
607     ///
608     /// This panics when the given identifier does not reference a valid state.
609     /// That is, when `id.as_usize() >= nfa.states().len()`.
610     ///
611     /// # Example
612     ///
613     /// The anchored state for a pattern will typically correspond to a
614     /// capturing state for that pattern. (Although, this is not an API
615     /// guarantee!)
616     ///
617     /// ```
618     /// use regex_automata::{nfa::thompson::{NFA, State}, PatternID};
619     ///
620     /// let nfa = NFA::new("a")?;
621     /// let state = nfa.state(nfa.start_pattern(PatternID::ZERO).unwrap());
622     /// match *state {
623     ///     State::Capture { slot, .. } => {
624     ///         assert_eq!(0, slot.as_usize());
625     ///     }
626     ///     _ => unreachable!("unexpected state"),
627     /// }
628     ///
629     /// # Ok::<(), Box<dyn std::error::Error>>(())
630     /// ```
631     #[inline]
state(&self, id: StateID) -> &State632     pub fn state(&self, id: StateID) -> &State {
633         &self.states()[id]
634     }
635 
636     /// Returns a slice of all states in this NFA.
637     ///
638     /// The slice returned is indexed by `StateID`. This provides a convenient
639     /// way to access states while following transitions among those states.
640     ///
641     /// # Example
642     ///
643     /// This demonstrates that disabling UTF-8 mode can shrink the size of the
644     /// NFA considerably in some cases, especially when using Unicode character
645     /// classes.
646     ///
647     /// ```
648     /// # if cfg!(miri) { return Ok(()); } // miri takes too long
649     /// use regex_automata::nfa::thompson::NFA;
650     ///
651     /// let nfa_unicode = NFA::new(r"\w")?;
652     /// let nfa_ascii = NFA::new(r"(?-u)\w")?;
653     /// // Yes, a factor of 45 difference. No lie.
654     /// assert!(40 * nfa_ascii.states().len() < nfa_unicode.states().len());
655     ///
656     /// # Ok::<(), Box<dyn std::error::Error>>(())
657     /// ```
658     #[inline]
states(&self) -> &[State]659     pub fn states(&self) -> &[State] {
660         &self.0.states
661     }
662 
663     /// Returns the capturing group info for this NFA.
664     ///
665     /// The [`GroupInfo`] provides a way to map to and from capture index
666     /// and capture name for each pattern. It also provides a mapping from
667     /// each of the capturing groups in every pattern to their corresponding
668     /// slot offsets encoded in [`State::Capture`] states.
669     ///
670     /// Note that `GroupInfo` uses reference counting internally, such that
671     /// cloning a `GroupInfo` is very cheap.
672     ///
673     /// # Example
674     ///
675     /// This example shows how to get a list of all capture group names for
676     /// a particular pattern.
677     ///
678     /// ```
679     /// use regex_automata::{nfa::thompson::NFA, PatternID};
680     ///
681     /// let nfa = NFA::new(r"(a)(?P<foo>b)(c)(d)(?P<bar>e)")?;
682     /// // The first is the implicit group that is always unnammed. The next
683     /// // 5 groups are the explicit groups found in the concrete syntax above.
684     /// let expected = vec![None, None, Some("foo"), None, None, Some("bar")];
685     /// let got: Vec<Option<&str>> =
686     ///     nfa.group_info().pattern_names(PatternID::ZERO).collect();
687     /// assert_eq!(expected, got);
688     ///
689     /// // Using an invalid pattern ID will result in nothing yielded.
690     /// let got = nfa.group_info().pattern_names(PatternID::must(999)).count();
691     /// assert_eq!(0, got);
692     ///
693     /// # Ok::<(), Box<dyn std::error::Error>>(())
694     /// ```
695     #[inline]
group_info(&self) -> &GroupInfo696     pub fn group_info(&self) -> &GroupInfo {
697         &self.0.group_info()
698     }
699 
700     /// Returns true if and only if this NFA has at least one
701     /// [`Capture`](State::Capture) in its sequence of states.
702     ///
703     /// This is useful as a way to perform a quick test before attempting
704     /// something that does or does not require capture states. For example,
705     /// some regex engines (like the PikeVM) require capture states in order to
706     /// work at all.
707     ///
708     /// # Example
709     ///
710     /// This example shows a few different NFAs and whether they have captures
711     /// or not.
712     ///
713     /// ```
714     /// use regex_automata::nfa::thompson::{NFA, WhichCaptures};
715     ///
716     /// // Obviously has capture states.
717     /// let nfa = NFA::new("(a)")?;
718     /// assert!(nfa.has_capture());
719     ///
720     /// // Less obviously has capture states, because every pattern has at
721     /// // least one anonymous capture group corresponding to the match for the
722     /// // entire pattern.
723     /// let nfa = NFA::new("a")?;
724     /// assert!(nfa.has_capture());
725     ///
726     /// // Other than hand building your own NFA, this is the only way to build
727     /// // an NFA without capturing groups. In general, you should only do this
728     /// // if you don't intend to use any of the NFA-oriented regex engines.
729     /// // Overall, capturing groups don't have many downsides. Although they
730     /// // can add a bit of noise to simple NFAs, so it can be nice to disable
731     /// // them for debugging purposes.
732     /// //
733     /// // Notice that 'has_capture' is false here even when we have an
734     /// // explicit capture group in the pattern.
735     /// let nfa = NFA::compiler()
736     ///     .configure(NFA::config().which_captures(WhichCaptures::None))
737     ///     .build("(a)")?;
738     /// assert!(!nfa.has_capture());
739     ///
740     /// # Ok::<(), Box<dyn std::error::Error>>(())
741     /// ```
742     #[inline]
has_capture(&self) -> bool743     pub fn has_capture(&self) -> bool {
744         self.0.has_capture
745     }
746 
747     /// Returns true if and only if this NFA can match the empty string.
748     /// When it returns false, all possible matches are guaranteed to have a
749     /// non-zero length.
750     ///
751     /// This is useful as cheap way to know whether code needs to handle the
752     /// case of a zero length match. This is particularly important when UTF-8
753     /// modes are enabled, as when UTF-8 mode is enabled, empty matches that
754     /// split a codepoint must never be reported. This extra handling can
755     /// sometimes be costly, and since regexes matching an empty string are
756     /// somewhat rare, it can be beneficial to treat such regexes specially.
757     ///
758     /// # Example
759     ///
760     /// This example shows a few different NFAs and whether they match the
761     /// empty string or not. Notice the empty string isn't merely a matter
762     /// of a string of length literally `0`, but rather, whether a match can
763     /// occur between specific pairs of bytes.
764     ///
765     /// ```
766     /// use regex_automata::{nfa::thompson::NFA, util::syntax};
767     ///
768     /// // The empty regex matches the empty string.
769     /// let nfa = NFA::new("")?;
770     /// assert!(nfa.has_empty(), "empty matches empty");
771     /// // The '+' repetition operator requires at least one match, and so
772     /// // does not match the empty string.
773     /// let nfa = NFA::new("a+")?;
774     /// assert!(!nfa.has_empty(), "+ does not match empty");
775     /// // But the '*' repetition operator does.
776     /// let nfa = NFA::new("a*")?;
777     /// assert!(nfa.has_empty(), "* does match empty");
778     /// // And wrapping '+' in an operator that can match an empty string also
779     /// // causes it to match the empty string too.
780     /// let nfa = NFA::new("(a+)*")?;
781     /// assert!(nfa.has_empty(), "+ inside of * matches empty");
782     ///
783     /// // If a regex is just made of a look-around assertion, even if the
784     /// // assertion requires some kind of non-empty string around it (such as
785     /// // \b), then it is still treated as if it matches the empty string.
786     /// // Namely, if a match occurs of just a look-around assertion, then the
787     /// // match returned is empty.
788     /// let nfa = NFA::compiler()
789     ///     .syntax(syntax::Config::new().utf8(false))
790     ///     .build(r"^$\A\z\b\B(?-u:\b\B)")?;
791     /// assert!(nfa.has_empty(), "assertions match empty");
792     /// // Even when an assertion is wrapped in a '+', it still matches the
793     /// // empty string.
794     /// let nfa = NFA::new(r"\b+")?;
795     /// assert!(nfa.has_empty(), "+ of an assertion matches empty");
796     ///
797     /// // An alternation with even one branch that can match the empty string
798     /// // is also said to match the empty string overall.
799     /// let nfa = NFA::new("foo|(bar)?|quux")?;
800     /// assert!(nfa.has_empty(), "alternations can match empty");
801     ///
802     /// // An NFA that matches nothing does not match the empty string.
803     /// let nfa = NFA::new("[a&&b]")?;
804     /// assert!(!nfa.has_empty(), "never matching means not matching empty");
805     /// // But if it's wrapped in something that doesn't require a match at
806     /// // all, then it can match the empty string!
807     /// let nfa = NFA::new("[a&&b]*")?;
808     /// assert!(nfa.has_empty(), "* on never-match still matches empty");
809     /// // Since a '+' requires a match, using it on something that can never
810     /// // match will itself produce a regex that can never match anything,
811     /// // and thus does not match the empty string.
812     /// let nfa = NFA::new("[a&&b]+")?;
813     /// assert!(!nfa.has_empty(), "+ on never-match still matches nothing");
814     ///
815     /// # Ok::<(), Box<dyn std::error::Error>>(())
816     /// ```
817     #[inline]
has_empty(&self) -> bool818     pub fn has_empty(&self) -> bool {
819         self.0.has_empty
820     }
821 
822     /// Whether UTF-8 mode is enabled for this NFA or not.
823     ///
824     /// When UTF-8 mode is enabled, all matches reported by a regex engine
825     /// derived from this NFA are guaranteed to correspond to spans of valid
826     /// UTF-8. This includes zero-width matches. For example, the regex engine
827     /// must guarantee that the empty regex will not match at the positions
828     /// between code units in the UTF-8 encoding of a single codepoint.
829     ///
830     /// See [`Config::utf8`] for more information.
831     ///
832     /// This is enabled by default.
833     ///
834     /// # Example
835     ///
836     /// This example shows how UTF-8 mode can impact the match spans that may
837     /// be reported in certain cases.
838     ///
839     /// ```
840     /// use regex_automata::{
841     ///     nfa::thompson::{self, pikevm::PikeVM},
842     ///     Match, Input,
843     /// };
844     ///
845     /// let re = PikeVM::new("")?;
846     /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
847     ///
848     /// // UTF-8 mode is enabled by default.
849     /// let mut input = Input::new("☃");
850     /// re.search(&mut cache, &input, &mut caps);
851     /// assert_eq!(Some(Match::must(0, 0..0)), caps.get_match());
852     ///
853     /// // Even though an empty regex matches at 1..1, our next match is
854     /// // 3..3 because 1..1 and 2..2 split the snowman codepoint (which is
855     /// // three bytes long).
856     /// input.set_start(1);
857     /// re.search(&mut cache, &input, &mut caps);
858     /// assert_eq!(Some(Match::must(0, 3..3)), caps.get_match());
859     ///
860     /// // But if we disable UTF-8, then we'll get matches at 1..1 and 2..2:
861     /// let re = PikeVM::builder()
862     ///     .thompson(thompson::Config::new().utf8(false))
863     ///     .build("")?;
864     /// re.search(&mut cache, &input, &mut caps);
865     /// assert_eq!(Some(Match::must(0, 1..1)), caps.get_match());
866     ///
867     /// input.set_start(2);
868     /// re.search(&mut cache, &input, &mut caps);
869     /// assert_eq!(Some(Match::must(0, 2..2)), caps.get_match());
870     ///
871     /// input.set_start(3);
872     /// re.search(&mut cache, &input, &mut caps);
873     /// assert_eq!(Some(Match::must(0, 3..3)), caps.get_match());
874     ///
875     /// input.set_start(4);
876     /// re.search(&mut cache, &input, &mut caps);
877     /// assert_eq!(None, caps.get_match());
878     ///
879     /// # Ok::<(), Box<dyn std::error::Error>>(())
880     /// ```
881     #[inline]
is_utf8(&self) -> bool882     pub fn is_utf8(&self) -> bool {
883         self.0.utf8
884     }
885 
886     /// Returns true when this NFA is meant to be matched in reverse.
887     ///
888     /// Generally speaking, when this is true, it means the NFA is supposed to
889     /// be used in conjunction with moving backwards through the haystack. That
890     /// is, from a higher memory address to a lower memory address.
891     ///
892     /// It is often the case that lower level routines dealing with an NFA
893     /// don't need to care about whether it is "meant" to be matched in reverse
894     /// or not. However, there are some specific cases where it matters. For
895     /// example, the implementation of CRLF-aware `^` and `$` line anchors
896     /// needs to know whether the search is in the forward or reverse
897     /// direction. In the forward direction, neither `^` nor `$` should match
898     /// when a `\r` has been seen previously and a `\n` is next. However, in
899     /// the reverse direction, neither `^` nor `$` should match when a `\n`
900     /// has been seen previously and a `\r` is next. This fundamentally changes
901     /// how the state machine is constructed, and thus needs to be altered
902     /// based on the direction of the search.
903     ///
904     /// This is automatically set when using a [`Compiler`] with a configuration
905     /// where [`Config::reverse`] is enabled. If you're building your own NFA
906     /// by hand via a [`Builder`]
907     #[inline]
is_reverse(&self) -> bool908     pub fn is_reverse(&self) -> bool {
909         self.0.reverse
910     }
911 
912     /// Returns true if and only if all starting states for this NFA correspond
913     /// to the beginning of an anchored search.
914     ///
915     /// Typically, an NFA will have both an anchored and an unanchored starting
916     /// state. Namely, because it tends to be useful to have both and the cost
917     /// of having an unanchored starting state is almost zero (for an NFA).
918     /// However, if all patterns in the NFA are themselves anchored, then even
919     /// the unanchored starting state will correspond to an anchored search
920     /// since the pattern doesn't permit anything else.
921     ///
922     /// # Example
923     ///
924     /// This example shows a few different scenarios where this method's
925     /// return value varies.
926     ///
927     /// ```
928     /// use regex_automata::nfa::thompson::NFA;
929     ///
930     /// // The unanchored starting state permits matching this pattern anywhere
931     /// // in a haystack, instead of just at the beginning.
932     /// let nfa = NFA::new("a")?;
933     /// assert!(!nfa.is_always_start_anchored());
934     ///
935     /// // In this case, the pattern is itself anchored, so there is no way
936     /// // to run an unanchored search.
937     /// let nfa = NFA::new("^a")?;
938     /// assert!(nfa.is_always_start_anchored());
939     ///
940     /// // When multiline mode is enabled, '^' can match at the start of a line
941     /// // in addition to the start of a haystack, so an unanchored search is
942     /// // actually possible.
943     /// let nfa = NFA::new("(?m)^a")?;
944     /// assert!(!nfa.is_always_start_anchored());
945     ///
946     /// // Weird cases also work. A pattern is only considered anchored if all
947     /// // matches may only occur at the start of a haystack.
948     /// let nfa = NFA::new("(^a)|a")?;
949     /// assert!(!nfa.is_always_start_anchored());
950     ///
951     /// // When multiple patterns are present, if they are all anchored, then
952     /// // the NFA is always anchored too.
953     /// let nfa = NFA::new_many(&["^a", "^b", "^c"])?;
954     /// assert!(nfa.is_always_start_anchored());
955     ///
956     /// // But if one pattern is unanchored, then the NFA must permit an
957     /// // unanchored search.
958     /// let nfa = NFA::new_many(&["^a", "b", "^c"])?;
959     /// assert!(!nfa.is_always_start_anchored());
960     ///
961     /// # Ok::<(), Box<dyn std::error::Error>>(())
962     /// ```
963     #[inline]
is_always_start_anchored(&self) -> bool964     pub fn is_always_start_anchored(&self) -> bool {
965         self.start_anchored() == self.start_unanchored()
966     }
967 
968     /// Returns the look-around matcher associated with this NFA.
969     ///
970     /// A look-around matcher determines how to match look-around assertions.
971     /// In particular, some assertions are configurable. For example, the
972     /// `(?m:^)` and `(?m:$)` assertions can have their line terminator changed
973     /// from the default of `\n` to any other byte.
974     ///
975     /// If the NFA was built using a [`Compiler`], then this matcher
976     /// can be set via the [`Config::look_matcher`] configuration
977     /// knob. Otherwise, if you've built an NFA by hand, it is set via
978     /// [`Builder::set_look_matcher`].
979     ///
980     /// # Example
981     ///
982     /// This shows how to change the line terminator for multi-line assertions.
983     ///
984     /// ```
985     /// use regex_automata::{
986     ///     nfa::thompson::{self, pikevm::PikeVM},
987     ///     util::look::LookMatcher,
988     ///     Match, Input,
989     /// };
990     ///
991     /// let mut lookm = LookMatcher::new();
992     /// lookm.set_line_terminator(b'\x00');
993     ///
994     /// let re = PikeVM::builder()
995     ///     .thompson(thompson::Config::new().look_matcher(lookm))
996     ///     .build(r"(?m)^[a-z]+$")?;
997     /// let mut cache = re.create_cache();
998     ///
999     /// // Multi-line assertions now use NUL as a terminator.
1000     /// assert_eq!(
1001     ///     Some(Match::must(0, 1..4)),
1002     ///     re.find(&mut cache, b"\x00abc\x00"),
1003     /// );
1004     /// // ... and \n is no longer recognized as a terminator.
1005     /// assert_eq!(
1006     ///     None,
1007     ///     re.find(&mut cache, b"\nabc\n"),
1008     /// );
1009     ///
1010     /// # Ok::<(), Box<dyn std::error::Error>>(())
1011     /// ```
1012     #[inline]
look_matcher(&self) -> &LookMatcher1013     pub fn look_matcher(&self) -> &LookMatcher {
1014         &self.0.look_matcher
1015     }
1016 
1017     /// Returns the union of all look-around assertions used throughout this
1018     /// NFA. When the returned set is empty, it implies that the NFA has no
1019     /// look-around assertions and thus zero conditional epsilon transitions.
1020     ///
1021     /// This is useful in some cases enabling optimizations. It is not
1022     /// unusual, for example, for optimizations to be of the form, "for any
1023     /// regex with zero conditional epsilon transitions, do ..." where "..."
1024     /// is some kind of optimization.
1025     ///
1026     /// This isn't only helpful for optimizations either. Sometimes look-around
1027     /// assertions are difficult to support. For example, many of the DFAs in
1028     /// this crate don't support Unicode word boundaries or handle them using
1029     /// heuristics. Handling that correctly typically requires some kind of
1030     /// cheap check of whether the NFA has a Unicode word boundary in the first
1031     /// place.
1032     ///
1033     /// # Example
1034     ///
1035     /// This example shows how this routine varies based on the regex pattern:
1036     ///
1037     /// ```
1038     /// use regex_automata::{nfa::thompson::NFA, util::look::Look};
1039     ///
1040     /// // No look-around at all.
1041     /// let nfa = NFA::new("a")?;
1042     /// assert!(nfa.look_set_any().is_empty());
1043     ///
1044     /// // When multiple patterns are present, since this returns the union,
1045     /// // it will include look-around assertions that only appear in one
1046     /// // pattern.
1047     /// let nfa = NFA::new_many(&["a", "b", "a^b", "c"])?;
1048     /// assert!(nfa.look_set_any().contains(Look::Start));
1049     ///
1050     /// // Some groups of assertions have various shortcuts. For example:
1051     /// let nfa = NFA::new(r"(?-u:\b)")?;
1052     /// assert!(nfa.look_set_any().contains_word());
1053     /// assert!(!nfa.look_set_any().contains_word_unicode());
1054     /// assert!(nfa.look_set_any().contains_word_ascii());
1055     ///
1056     /// # Ok::<(), Box<dyn std::error::Error>>(())
1057     /// ```
1058     #[inline]
look_set_any(&self) -> LookSet1059     pub fn look_set_any(&self) -> LookSet {
1060         self.0.look_set_any
1061     }
1062 
1063     /// Returns the union of all prefix look-around assertions for every
1064     /// pattern in this NFA. When the returned set is empty, it implies none of
1065     /// the patterns require moving through a conditional epsilon transition
1066     /// before inspecting the first byte in the haystack.
1067     ///
1068     /// This can be useful for determining what kinds of assertions need to be
1069     /// satisfied at the beginning of a search. For example, typically DFAs
1070     /// in this crate will build a distinct starting state for each possible
1071     /// starting configuration that might result in look-around assertions
1072     /// being satisfied differently. However, if the set returned here is
1073     /// empty, then you know that the start state is invariant because there
1074     /// are no conditional epsilon transitions to consider.
1075     ///
1076     /// # Example
1077     ///
1078     /// This example shows how this routine varies based on the regex pattern:
1079     ///
1080     /// ```
1081     /// use regex_automata::{nfa::thompson::NFA, util::look::Look};
1082     ///
1083     /// // No look-around at all.
1084     /// let nfa = NFA::new("a")?;
1085     /// assert!(nfa.look_set_prefix_any().is_empty());
1086     ///
1087     /// // When multiple patterns are present, since this returns the union,
1088     /// // it will include look-around assertions that only appear in one
1089     /// // pattern. But it will only include assertions that are in the prefix
1090     /// // of a pattern. For example, this includes '^' but not '$' even though
1091     /// // '$' does appear.
1092     /// let nfa = NFA::new_many(&["a", "b", "^ab$", "c"])?;
1093     /// assert!(nfa.look_set_prefix_any().contains(Look::Start));
1094     /// assert!(!nfa.look_set_prefix_any().contains(Look::End));
1095     ///
1096     /// # Ok::<(), Box<dyn std::error::Error>>(())
1097     /// ```
1098     #[inline]
look_set_prefix_any(&self) -> LookSet1099     pub fn look_set_prefix_any(&self) -> LookSet {
1100         self.0.look_set_prefix_any
1101     }
1102 
1103     // FIXME: The `look_set_prefix_all` computation was not correct, and it
1104     // seemed a little tricky to fix it. Since I wasn't actually using it for
1105     // anything, I just decided to remove it in the run up to the regex 1.9
1106     // release. If you need this, please file an issue.
1107     /*
1108     /// Returns the intersection of all prefix look-around assertions for every
1109     /// pattern in this NFA. When the returned set is empty, it implies at
1110     /// least one of the patterns does not require moving through a conditional
1111     /// epsilon transition before inspecting the first byte in the haystack.
1112     /// Conversely, when the set contains an assertion, it implies that every
1113     /// pattern in the NFA also contains that assertion in its prefix.
1114     ///
1115     /// This can be useful for determining what kinds of assertions need to be
1116     /// satisfied at the beginning of a search. For example, if you know that
1117     /// [`Look::Start`] is in the prefix intersection set returned here, then
1118     /// you know that all searches, regardless of input configuration, will be
1119     /// anchored.
1120     ///
1121     /// # Example
1122     ///
1123     /// This example shows how this routine varies based on the regex pattern:
1124     ///
1125     /// ```
1126     /// use regex_automata::{nfa::thompson::NFA, util::look::Look};
1127     ///
1128     /// // No look-around at all.
1129     /// let nfa = NFA::new("a")?;
1130     /// assert!(nfa.look_set_prefix_all().is_empty());
1131     ///
1132     /// // When multiple patterns are present, since this returns the
1133     /// // intersection, it will only include assertions present in every
1134     /// // prefix, and only the prefix.
1135     /// let nfa = NFA::new_many(&["^a$", "^b$", "$^ab$", "^c$"])?;
1136     /// assert!(nfa.look_set_prefix_all().contains(Look::Start));
1137     /// assert!(!nfa.look_set_prefix_all().contains(Look::End));
1138     ///
1139     /// # Ok::<(), Box<dyn std::error::Error>>(())
1140     /// ```
1141     #[inline]
1142     pub fn look_set_prefix_all(&self) -> LookSet {
1143         self.0.look_set_prefix_all
1144     }
1145     */
1146 
1147     /// Returns the memory usage, in bytes, of this NFA.
1148     ///
1149     /// This does **not** include the stack size used up by this NFA. To
1150     /// compute that, use `std::mem::size_of::<NFA>()`.
1151     ///
1152     /// # Example
1153     ///
1154     /// This example shows that large Unicode character classes can use quite
1155     /// a bit of memory.
1156     ///
1157     /// ```
1158     /// # if cfg!(miri) { return Ok(()); } // miri takes too long
1159     /// use regex_automata::nfa::thompson::NFA;
1160     ///
1161     /// let nfa_unicode = NFA::new(r"\w")?;
1162     /// let nfa_ascii = NFA::new(r"(?-u:\w)")?;
1163     ///
1164     /// assert!(10 * nfa_ascii.memory_usage() < nfa_unicode.memory_usage());
1165     ///
1166     /// # Ok::<(), Box<dyn std::error::Error>>(())
1167     /// ```
1168     #[inline]
memory_usage(&self) -> usize1169     pub fn memory_usage(&self) -> usize {
1170         use core::mem::size_of;
1171 
1172         size_of::<Inner>() // allocated on the heap via Arc
1173             + self.0.states.len() * size_of::<State>()
1174             + self.0.start_pattern.len() * size_of::<StateID>()
1175             + self.0.group_info.memory_usage()
1176             + self.0.memory_extra
1177     }
1178 }
1179 
1180 impl fmt::Debug for NFA {
fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result1181     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1182         self.0.fmt(f)
1183     }
1184 }
1185 
1186 /// The "inner" part of the NFA. We split this part out so that we can easily
1187 /// wrap it in an `Arc` above in the definition of `NFA`.
1188 ///
1189 /// See builder.rs for the code that actually builds this type. This module
1190 /// does provide (internal) mutable methods for adding things to this
1191 /// NFA before finalizing it, but the high level construction process is
1192 /// controlled by the builder abstraction. (Which is complicated enough to
1193 /// get its own module.)
1194 #[derive(Default)]
1195 pub(super) struct Inner {
1196     /// The state sequence. This sequence is guaranteed to be indexable by all
1197     /// starting state IDs, and it is also guaranteed to contain at most one
1198     /// `Match` state for each pattern compiled into this NFA. (A pattern may
1199     /// not have a corresponding `Match` state if a `Match` state is impossible
1200     /// to reach.)
1201     states: Vec<State>,
1202     /// The anchored starting state of this NFA.
1203     start_anchored: StateID,
1204     /// The unanchored starting state of this NFA.
1205     start_unanchored: StateID,
1206     /// The starting states for each individual pattern. Starting at any
1207     /// of these states will result in only an anchored search for the
1208     /// corresponding pattern. The vec is indexed by pattern ID. When the NFA
1209     /// contains a single regex, then `start_pattern[0]` and `start_anchored`
1210     /// are always equivalent.
1211     start_pattern: Vec<StateID>,
1212     /// Info about the capturing groups in this NFA. This is responsible for
1213     /// mapping groups to slots, mapping groups to names and names to groups.
1214     group_info: GroupInfo,
1215     /// A representation of equivalence classes over the transitions in this
1216     /// NFA. Two bytes in the same equivalence class must not discriminate
1217     /// between a match or a non-match. This map can be used to shrink the
1218     /// total size of a DFA's transition table with a small match-time cost.
1219     ///
1220     /// Note that the NFA's transitions are *not* defined in terms of these
1221     /// equivalence classes. The NFA's transitions are defined on the original
1222     /// byte values. For the most part, this is because they wouldn't really
1223     /// help the NFA much since the NFA already uses a sparse representation
1224     /// to represent transitions. Byte classes are most effective in a dense
1225     /// representation.
1226     byte_class_set: ByteClassSet,
1227     /// This is generated from `byte_class_set`, and essentially represents the
1228     /// same thing but supports different access patterns. Namely, this permits
1229     /// looking up the equivalence class of a byte very cheaply.
1230     ///
1231     /// Ideally we would just store this, but because of annoying code
1232     /// structure reasons, we keep both this and `byte_class_set` around for
1233     /// now. I think I would prefer that `byte_class_set` were computed in the
1234     /// `Builder`, but right now, we compute it as states are added to the
1235     /// `NFA`.
1236     byte_classes: ByteClasses,
1237     /// Whether this NFA has a `Capture` state anywhere.
1238     has_capture: bool,
1239     /// When the empty string is in the language matched by this NFA.
1240     has_empty: bool,
1241     /// Whether UTF-8 mode is enabled for this NFA. Briefly, this means that
1242     /// all non-empty matches produced by this NFA correspond to spans of valid
1243     /// UTF-8, and any empty matches produced by this NFA that split a UTF-8
1244     /// encoded codepoint should be filtered out by the corresponding regex
1245     /// engine.
1246     utf8: bool,
1247     /// Whether this NFA is meant to be matched in reverse or not.
1248     reverse: bool,
1249     /// The matcher to be used for look-around assertions.
1250     look_matcher: LookMatcher,
1251     /// The union of all look-around assertions that occur anywhere within
1252     /// this NFA. If this set is empty, then it means there are precisely zero
1253     /// conditional epsilon transitions in the NFA.
1254     look_set_any: LookSet,
1255     /// The union of all look-around assertions that occur as a zero-length
1256     /// prefix for any of the patterns in this NFA.
1257     look_set_prefix_any: LookSet,
1258     /*
1259     /// The intersection of all look-around assertions that occur as a
1260     /// zero-length prefix for any of the patterns in this NFA.
1261     look_set_prefix_all: LookSet,
1262     */
1263     /// Heap memory used indirectly by NFA states and other things (like the
1264     /// various capturing group representations above). Since each state
1265     /// might use a different amount of heap, we need to keep track of this
1266     /// incrementally.
1267     memory_extra: usize,
1268 }
1269 
1270 impl Inner {
1271     /// Runs any last finalization bits and turns this into a full NFA.
into_nfa(mut self) -> NFA1272     pub(super) fn into_nfa(mut self) -> NFA {
1273         self.byte_classes = self.byte_class_set.byte_classes();
1274         // Do epsilon closure from the start state of every pattern in order
1275         // to compute various properties such as look-around assertions and
1276         // whether the empty string can be matched.
1277         let mut stack = vec![];
1278         let mut seen = SparseSet::new(self.states.len());
1279         for &start_id in self.start_pattern.iter() {
1280             stack.push(start_id);
1281             seen.clear();
1282             // let mut prefix_all = LookSet::full();
1283             let mut prefix_any = LookSet::empty();
1284             while let Some(sid) = stack.pop() {
1285                 if !seen.insert(sid) {
1286                     continue;
1287                 }
1288                 match self.states[sid] {
1289                     State::ByteRange { .. }
1290                     | State::Dense { .. }
1291                     | State::Fail => continue,
1292                     State::Sparse(_) => {
1293                         // This snippet below will rewrite this sparse state
1294                         // as a dense state. By doing it here, we apply this
1295                         // optimization to all hot "sparse" states since these
1296                         // are the states that are reachable from the start
1297                         // state via an epsilon closure.
1298                         //
1299                         // Unfortunately, this optimization did not seem to
1300                         // help much in some very limited ad hoc benchmarking.
1301                         //
1302                         // I left the 'Dense' state type in place in case we
1303                         // want to revisit this, but I suspect the real way
1304                         // to make forward progress is a more fundamental
1305                         // rearchitecting of how data in the NFA is laid out.
1306                         // I think we should consider a single contiguous
1307                         // allocation instead of all this indirection and
1308                         // potential heap allocations for every state. But this
1309                         // is a large re-design and will require API breaking
1310                         // changes.
1311                         // self.memory_extra -= self.states[sid].memory_usage();
1312                         // let trans = DenseTransitions::from_sparse(sparse);
1313                         // self.states[sid] = State::Dense(trans);
1314                         // self.memory_extra += self.states[sid].memory_usage();
1315                         continue;
1316                     }
1317                     State::Match { .. } => self.has_empty = true,
1318                     State::Look { look, next } => {
1319                         prefix_any = prefix_any.insert(look);
1320                         stack.push(next);
1321                     }
1322                     State::Union { ref alternates } => {
1323                         // Order doesn't matter here, since we're just dealing
1324                         // with look-around sets. But if we do richer analysis
1325                         // here that needs to care about preference order, then
1326                         // this should be done in reverse.
1327                         stack.extend(alternates.iter());
1328                     }
1329                     State::BinaryUnion { alt1, alt2 } => {
1330                         stack.push(alt2);
1331                         stack.push(alt1);
1332                     }
1333                     State::Capture { next, .. } => {
1334                         stack.push(next);
1335                     }
1336                 }
1337             }
1338             self.look_set_prefix_any =
1339                 self.look_set_prefix_any.union(prefix_any);
1340         }
1341         NFA(Arc::new(self))
1342     }
1343 
1344     /// Returns the capturing group info for this NFA.
group_info(&self) -> &GroupInfo1345     pub(super) fn group_info(&self) -> &GroupInfo {
1346         &self.group_info
1347     }
1348 
1349     /// Add the given state to this NFA after allocating a fresh identifier for
1350     /// it.
1351     ///
1352     /// This panics if too many states are added such that a fresh identifier
1353     /// could not be created. (Currently, the only caller of this routine is
1354     /// a `Builder`, and it upholds this invariant.)
add(&mut self, state: State) -> StateID1355     pub(super) fn add(&mut self, state: State) -> StateID {
1356         match state {
1357             State::ByteRange { ref trans } => {
1358                 self.byte_class_set.set_range(trans.start, trans.end);
1359             }
1360             State::Sparse(ref sparse) => {
1361                 for trans in sparse.transitions.iter() {
1362                     self.byte_class_set.set_range(trans.start, trans.end);
1363                 }
1364             }
1365             State::Dense { .. } => unreachable!(),
1366             State::Look { look, .. } => {
1367                 self.look_matcher
1368                     .add_to_byteset(look, &mut self.byte_class_set);
1369                 self.look_set_any = self.look_set_any.insert(look);
1370             }
1371             State::Capture { .. } => {
1372                 self.has_capture = true;
1373             }
1374             State::Union { .. }
1375             | State::BinaryUnion { .. }
1376             | State::Fail
1377             | State::Match { .. } => {}
1378         }
1379 
1380         let id = StateID::new(self.states.len()).unwrap();
1381         self.memory_extra += state.memory_usage();
1382         self.states.push(state);
1383         id
1384     }
1385 
1386     /// Set the starting state identifiers for this NFA.
1387     ///
1388     /// `start_anchored` and `start_unanchored` may be equivalent. When they
1389     /// are, then the NFA can only execute anchored searches. This might
1390     /// occur, for example, for patterns that are unconditionally anchored.
1391     /// e.g., `^foo`.
set_starts( &mut self, start_anchored: StateID, start_unanchored: StateID, start_pattern: &[StateID], )1392     pub(super) fn set_starts(
1393         &mut self,
1394         start_anchored: StateID,
1395         start_unanchored: StateID,
1396         start_pattern: &[StateID],
1397     ) {
1398         self.start_anchored = start_anchored;
1399         self.start_unanchored = start_unanchored;
1400         self.start_pattern = start_pattern.to_vec();
1401     }
1402 
1403     /// Sets the UTF-8 mode of this NFA.
set_utf8(&mut self, yes: bool)1404     pub(super) fn set_utf8(&mut self, yes: bool) {
1405         self.utf8 = yes;
1406     }
1407 
1408     /// Sets the reverse mode of this NFA.
set_reverse(&mut self, yes: bool)1409     pub(super) fn set_reverse(&mut self, yes: bool) {
1410         self.reverse = yes;
1411     }
1412 
1413     /// Sets the look-around assertion matcher for this NFA.
set_look_matcher(&mut self, m: LookMatcher)1414     pub(super) fn set_look_matcher(&mut self, m: LookMatcher) {
1415         self.look_matcher = m;
1416     }
1417 
1418     /// Set the capturing groups for this NFA.
1419     ///
1420     /// The given slice should contain the capturing groups for each pattern,
1421     /// The capturing groups in turn should correspond to the total number of
1422     /// capturing groups in the pattern, including the anonymous first capture
1423     /// group for each pattern. If a capturing group does have a name, then it
1424     /// should be provided as a Arc<str>.
1425     ///
1426     /// This returns an error if a corresponding `GroupInfo` could not be
1427     /// built.
set_captures( &mut self, captures: &[Vec<Option<Arc<str>>>], ) -> Result<(), GroupInfoError>1428     pub(super) fn set_captures(
1429         &mut self,
1430         captures: &[Vec<Option<Arc<str>>>],
1431     ) -> Result<(), GroupInfoError> {
1432         self.group_info = GroupInfo::new(
1433             captures.iter().map(|x| x.iter().map(|y| y.as_ref())),
1434         )?;
1435         Ok(())
1436     }
1437 
1438     /// Remap the transitions in every state of this NFA using the given map.
1439     /// The given map should be indexed according to state ID namespace used by
1440     /// the transitions of the states currently in this NFA.
1441     ///
1442     /// This is particularly useful to the NFA builder, since it is convenient
1443     /// to add NFA states in order to produce their final IDs. Then, after all
1444     /// of the intermediate "empty" states (unconditional epsilon transitions)
1445     /// have been removed from the builder's representation, we can re-map all
1446     /// of the transitions in the states already added to their final IDs.
remap(&mut self, old_to_new: &[StateID])1447     pub(super) fn remap(&mut self, old_to_new: &[StateID]) {
1448         for state in &mut self.states {
1449             state.remap(old_to_new);
1450         }
1451         self.start_anchored = old_to_new[self.start_anchored];
1452         self.start_unanchored = old_to_new[self.start_unanchored];
1453         for id in self.start_pattern.iter_mut() {
1454             *id = old_to_new[*id];
1455         }
1456     }
1457 }
1458 
1459 impl fmt::Debug for Inner {
fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result1460     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1461         writeln!(f, "thompson::NFA(")?;
1462         for (sid, state) in self.states.iter().with_state_ids() {
1463             let status = if sid == self.start_anchored {
1464                 '^'
1465             } else if sid == self.start_unanchored {
1466                 '>'
1467             } else {
1468                 ' '
1469             };
1470             writeln!(f, "{}{:06?}: {:?}", status, sid.as_usize(), state)?;
1471         }
1472         let pattern_len = self.start_pattern.len();
1473         if pattern_len > 1 {
1474             writeln!(f, "")?;
1475             for pid in 0..pattern_len {
1476                 let sid = self.start_pattern[pid];
1477                 writeln!(f, "START({:06?}): {:?}", pid, sid.as_usize())?;
1478             }
1479         }
1480         writeln!(f, "")?;
1481         writeln!(
1482             f,
1483             "transition equivalence classes: {:?}",
1484             self.byte_classes,
1485         )?;
1486         writeln!(f, ")")?;
1487         Ok(())
1488     }
1489 }
1490 
1491 /// A state in an NFA.
1492 ///
1493 /// In theory, it can help to conceptualize an `NFA` as a graph consisting of
1494 /// `State`s. Each `State` contains its complete set of outgoing transitions.
1495 ///
1496 /// In practice, it can help to conceptualize an `NFA` as a sequence of
1497 /// instructions for a virtual machine. Each `State` says what to do and where
1498 /// to go next.
1499 ///
1500 /// Strictly speaking, the practical interpretation is the most correct one,
1501 /// because of the [`Capture`](State::Capture) state. Namely, a `Capture`
1502 /// state always forwards execution to another state unconditionally. Its only
1503 /// purpose is to cause a side effect: the recording of the current input
1504 /// position at a particular location in memory. In this sense, an `NFA`
1505 /// has more power than a theoretical non-deterministic finite automaton.
1506 ///
1507 /// For most uses of this crate, it is likely that one may never even need to
1508 /// be aware of this type at all. The main use cases for looking at `State`s
1509 /// directly are if you need to write your own search implementation or if you
1510 /// need to do some kind of analysis on the NFA.
1511 #[derive(Clone, Eq, PartialEq)]
1512 pub enum State {
1513     /// A state with a single transition that can only be taken if the current
1514     /// input symbol is in a particular range of bytes.
1515     ByteRange {
1516         /// The transition from this state to the next.
1517         trans: Transition,
1518     },
1519     /// A state with possibly many transitions represented in a sparse fashion.
1520     /// Transitions are non-overlapping and ordered lexicographically by input
1521     /// range.
1522     ///
1523     /// In practice, this is used for encoding UTF-8 automata. Its presence is
1524     /// primarily an optimization that avoids many additional unconditional
1525     /// epsilon transitions (via [`Union`](State::Union) states), and thus
1526     /// decreases the overhead of traversing the NFA. This can improve both
1527     /// matching time and DFA construction time.
1528     Sparse(SparseTransitions),
1529     /// A dense representation of a state with multiple transitions.
1530     Dense(DenseTransitions),
1531     /// A conditional epsilon transition satisfied via some sort of
1532     /// look-around. Look-around is limited to anchor and word boundary
1533     /// assertions.
1534     ///
1535     /// Look-around states are meant to be evaluated while performing epsilon
1536     /// closure (computing the set of states reachable from a particular state
1537     /// via only epsilon transitions). If the current position in the haystack
1538     /// satisfies the look-around assertion, then you're permitted to follow
1539     /// that epsilon transition.
1540     Look {
1541         /// The look-around assertion that must be satisfied before moving
1542         /// to `next`.
1543         look: Look,
1544         /// The state to transition to if the look-around assertion is
1545         /// satisfied.
1546         next: StateID,
1547     },
1548     /// An alternation such that there exists an epsilon transition to all
1549     /// states in `alternates`, where matches found via earlier transitions
1550     /// are preferred over later transitions.
1551     Union {
1552         /// An ordered sequence of unconditional epsilon transitions to other
1553         /// states. Transitions earlier in the sequence are preferred over
1554         /// transitions later in the sequence.
1555         alternates: Box<[StateID]>,
1556     },
1557     /// An alternation such that there exists precisely two unconditional
1558     /// epsilon transitions, where matches found via `alt1` are preferred over
1559     /// matches found via `alt2`.
1560     ///
1561     /// This state exists as a common special case of Union where there are
1562     /// only two alternates. In this case, we don't need any allocations to
1563     /// represent the state. This saves a bit of memory and also saves an
1564     /// additional memory access when traversing the NFA.
1565     BinaryUnion {
1566         /// An unconditional epsilon transition to another NFA state. This
1567         /// is preferred over `alt2`.
1568         alt1: StateID,
1569         /// An unconditional epsilon transition to another NFA state. Matches
1570         /// reported via this transition should only be reported if no matches
1571         /// were found by following `alt1`.
1572         alt2: StateID,
1573     },
1574     /// An empty state that records a capture location.
1575     ///
1576     /// From the perspective of finite automata, this is precisely equivalent
1577     /// to an unconditional epsilon transition, but serves the purpose of
1578     /// instructing NFA simulations to record additional state when the finite
1579     /// state machine passes through this epsilon transition.
1580     ///
1581     /// `slot` in this context refers to the specific capture group slot
1582     /// offset that is being recorded. Each capturing group has two slots
1583     /// corresponding to the start and end of the matching portion of that
1584     /// group.
1585     ///
1586     /// The pattern ID and capture group index are also included in this state
1587     /// in case they are useful. But mostly, all you'll need is `next` and
1588     /// `slot`.
1589     Capture {
1590         /// The state to transition to, unconditionally.
1591         next: StateID,
1592         /// The pattern ID that this capture belongs to.
1593         pattern_id: PatternID,
1594         /// The capture group index that this capture belongs to. Capture group
1595         /// indices are local to each pattern. For example, when capturing
1596         /// groups are enabled, every pattern has a capture group at index
1597         /// `0`.
1598         group_index: SmallIndex,
1599         /// The slot index for this capture. Every capturing group has two
1600         /// slots: one for the start haystack offset and one for the end
1601         /// haystack offset. Unlike capture group indices, slot indices are
1602         /// global across all patterns in this NFA. That is, each slot belongs
1603         /// to a single pattern, but there is only one slot at index `i`.
1604         slot: SmallIndex,
1605     },
1606     /// A state that cannot be transitioned out of. This is useful for cases
1607     /// where you want to prevent matching from occurring. For example, if your
1608     /// regex parser permits empty character classes, then one could choose
1609     /// a `Fail` state to represent them. (An empty character class can be
1610     /// thought of as an empty set. Since nothing is in an empty set, they can
1611     /// never match anything.)
1612     Fail,
1613     /// A match state. There is at least one such occurrence of this state for
1614     /// each regex that can match that is in this NFA.
1615     Match {
1616         /// The matching pattern ID.
1617         pattern_id: PatternID,
1618     },
1619 }
1620 
1621 impl State {
1622     /// Returns true if and only if this state contains one or more epsilon
1623     /// transitions.
1624     ///
1625     /// In practice, a state has no outgoing transitions (like `Match`), has
1626     /// only non-epsilon transitions (like `ByteRange`) or has only epsilon
1627     /// transitions (like `Union`).
1628     ///
1629     /// # Example
1630     ///
1631     /// ```
1632     /// use regex_automata::{
1633     ///     nfa::thompson::{State, Transition},
1634     ///     util::primitives::{PatternID, StateID, SmallIndex},
1635     /// };
1636     ///
1637     /// // Capture states are epsilon transitions.
1638     /// let state = State::Capture {
1639     ///     next: StateID::ZERO,
1640     ///     pattern_id: PatternID::ZERO,
1641     ///     group_index: SmallIndex::ZERO,
1642     ///     slot: SmallIndex::ZERO,
1643     /// };
1644     /// assert!(state.is_epsilon());
1645     ///
1646     /// // ByteRange states are not.
1647     /// let state = State::ByteRange {
1648     ///     trans: Transition { start: b'a', end: b'z', next: StateID::ZERO },
1649     /// };
1650     /// assert!(!state.is_epsilon());
1651     ///
1652     /// # Ok::<(), Box<dyn std::error::Error>>(())
1653     /// ```
1654     #[inline]
is_epsilon(&self) -> bool1655     pub fn is_epsilon(&self) -> bool {
1656         match *self {
1657             State::ByteRange { .. }
1658             | State::Sparse { .. }
1659             | State::Dense { .. }
1660             | State::Fail
1661             | State::Match { .. } => false,
1662             State::Look { .. }
1663             | State::Union { .. }
1664             | State::BinaryUnion { .. }
1665             | State::Capture { .. } => true,
1666         }
1667     }
1668 
1669     /// Returns the heap memory usage of this NFA state in bytes.
memory_usage(&self) -> usize1670     fn memory_usage(&self) -> usize {
1671         match *self {
1672             State::ByteRange { .. }
1673             | State::Look { .. }
1674             | State::BinaryUnion { .. }
1675             | State::Capture { .. }
1676             | State::Match { .. }
1677             | State::Fail => 0,
1678             State::Sparse(SparseTransitions { ref transitions }) => {
1679                 transitions.len() * mem::size_of::<Transition>()
1680             }
1681             State::Dense { .. } => 256 * mem::size_of::<StateID>(),
1682             State::Union { ref alternates } => {
1683                 alternates.len() * mem::size_of::<StateID>()
1684             }
1685         }
1686     }
1687 
1688     /// Remap the transitions in this state using the given map. Namely, the
1689     /// given map should be indexed according to the transitions currently
1690     /// in this state.
1691     ///
1692     /// This is used during the final phase of the NFA compiler, which turns
1693     /// its intermediate NFA into the final NFA.
remap(&mut self, remap: &[StateID])1694     fn remap(&mut self, remap: &[StateID]) {
1695         match *self {
1696             State::ByteRange { ref mut trans } => {
1697                 trans.next = remap[trans.next]
1698             }
1699             State::Sparse(SparseTransitions { ref mut transitions }) => {
1700                 for t in transitions.iter_mut() {
1701                     t.next = remap[t.next];
1702                 }
1703             }
1704             State::Dense(DenseTransitions { ref mut transitions }) => {
1705                 for sid in transitions.iter_mut() {
1706                     *sid = remap[*sid];
1707                 }
1708             }
1709             State::Look { ref mut next, .. } => *next = remap[*next],
1710             State::Union { ref mut alternates } => {
1711                 for alt in alternates.iter_mut() {
1712                     *alt = remap[*alt];
1713                 }
1714             }
1715             State::BinaryUnion { ref mut alt1, ref mut alt2 } => {
1716                 *alt1 = remap[*alt1];
1717                 *alt2 = remap[*alt2];
1718             }
1719             State::Capture { ref mut next, .. } => *next = remap[*next],
1720             State::Fail => {}
1721             State::Match { .. } => {}
1722         }
1723     }
1724 }
1725 
1726 impl fmt::Debug for State {
fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result1727     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1728         match *self {
1729             State::ByteRange { ref trans } => trans.fmt(f),
1730             State::Sparse(SparseTransitions { ref transitions }) => {
1731                 let rs = transitions
1732                     .iter()
1733                     .map(|t| format!("{:?}", t))
1734                     .collect::<Vec<String>>()
1735                     .join(", ");
1736                 write!(f, "sparse({})", rs)
1737             }
1738             State::Dense(ref dense) => {
1739                 write!(f, "dense(")?;
1740                 for (i, t) in dense.iter().enumerate() {
1741                     if i > 0 {
1742                         write!(f, ", ")?;
1743                     }
1744                     write!(f, "{:?}", t)?;
1745                 }
1746                 write!(f, ")")
1747             }
1748             State::Look { ref look, next } => {
1749                 write!(f, "{:?} => {:?}", look, next.as_usize())
1750             }
1751             State::Union { ref alternates } => {
1752                 let alts = alternates
1753                     .iter()
1754                     .map(|id| format!("{:?}", id.as_usize()))
1755                     .collect::<Vec<String>>()
1756                     .join(", ");
1757                 write!(f, "union({})", alts)
1758             }
1759             State::BinaryUnion { alt1, alt2 } => {
1760                 write!(
1761                     f,
1762                     "binary-union({}, {})",
1763                     alt1.as_usize(),
1764                     alt2.as_usize()
1765                 )
1766             }
1767             State::Capture { next, pattern_id, group_index, slot } => {
1768                 write!(
1769                     f,
1770                     "capture(pid={:?}, group={:?}, slot={:?}) => {:?}",
1771                     pattern_id.as_usize(),
1772                     group_index.as_usize(),
1773                     slot.as_usize(),
1774                     next.as_usize(),
1775                 )
1776             }
1777             State::Fail => write!(f, "FAIL"),
1778             State::Match { pattern_id } => {
1779                 write!(f, "MATCH({:?})", pattern_id.as_usize())
1780             }
1781         }
1782     }
1783 }
1784 
1785 /// A sequence of transitions used to represent a sparse state.
1786 ///
1787 /// This is the primary representation of a [`Sparse`](State::Sparse) state.
1788 /// It corresponds to a sorted sequence of transitions with non-overlapping
1789 /// byte ranges. If the byte at the current position in the haystack matches
1790 /// one of the byte ranges, then the finite state machine should take the
1791 /// corresponding transition.
1792 #[derive(Clone, Debug, Eq, PartialEq)]
1793 pub struct SparseTransitions {
1794     /// The sorted sequence of non-overlapping transitions.
1795     pub transitions: Box<[Transition]>,
1796 }
1797 
1798 impl SparseTransitions {
1799     /// This follows the matching transition for a particular byte.
1800     ///
1801     /// The matching transition is found by looking for a matching byte
1802     /// range (there is at most one) corresponding to the position `at` in
1803     /// `haystack`.
1804     ///
1805     /// If `at >= haystack.len()`, then this returns `None`.
1806     #[inline]
matches(&self, haystack: &[u8], at: usize) -> Option<StateID>1807     pub fn matches(&self, haystack: &[u8], at: usize) -> Option<StateID> {
1808         haystack.get(at).and_then(|&b| self.matches_byte(b))
1809     }
1810 
1811     /// This follows the matching transition for any member of the alphabet.
1812     ///
1813     /// The matching transition is found by looking for a matching byte
1814     /// range (there is at most one) corresponding to the position `at` in
1815     /// `haystack`. If the given alphabet unit is [`EOI`](alphabet::Unit::eoi),
1816     /// then this always returns `None`.
1817     #[inline]
matches_unit( &self, unit: alphabet::Unit, ) -> Option<StateID>1818     pub(crate) fn matches_unit(
1819         &self,
1820         unit: alphabet::Unit,
1821     ) -> Option<StateID> {
1822         unit.as_u8().map_or(None, |byte| self.matches_byte(byte))
1823     }
1824 
1825     /// This follows the matching transition for a particular byte.
1826     ///
1827     /// The matching transition is found by looking for a matching byte range
1828     /// (there is at most one) corresponding to the byte given.
1829     #[inline]
matches_byte(&self, byte: u8) -> Option<StateID>1830     pub fn matches_byte(&self, byte: u8) -> Option<StateID> {
1831         for t in self.transitions.iter() {
1832             if t.start > byte {
1833                 break;
1834             } else if t.matches_byte(byte) {
1835                 return Some(t.next);
1836             }
1837         }
1838         None
1839 
1840         /*
1841         // This is an alternative implementation that uses binary search. In
1842         // some ad hoc experiments, like
1843         //
1844         //   regex-cli find match pikevm -b -p '\b\w+\b' non-ascii-file
1845         //
1846         // I could not observe any improvement, and in fact, things seemed to
1847         // be a bit slower. I can see an improvement in at least one benchmark:
1848         //
1849         //   regex-cli find match pikevm -b -p '\pL{100}' all-codepoints-utf8
1850         //
1851         // Where total search time goes from 3.2s to 2.4s when using binary
1852         // search.
1853         self.transitions
1854             .binary_search_by(|t| {
1855                 if t.end < byte {
1856                     core::cmp::Ordering::Less
1857                 } else if t.start > byte {
1858                     core::cmp::Ordering::Greater
1859                 } else {
1860                     core::cmp::Ordering::Equal
1861                 }
1862             })
1863             .ok()
1864             .map(|i| self.transitions[i].next)
1865         */
1866     }
1867 }
1868 
1869 /// A sequence of transitions used to represent a dense state.
1870 ///
1871 /// This is the primary representation of a [`Dense`](State::Dense) state. It
1872 /// provides constant time matching. That is, given a byte in a haystack and
1873 /// a `DenseTransitions`, one can determine if the state matches in constant
1874 /// time.
1875 ///
1876 /// This is in contrast to `SparseTransitions`, whose time complexity is
1877 /// necessarily bigger than constant time. Also in contrast, `DenseTransitions`
1878 /// usually requires (much) more heap memory.
1879 #[derive(Clone, Debug, Eq, PartialEq)]
1880 pub struct DenseTransitions {
1881     /// A dense representation of this state's transitions on the heap. This
1882     /// always has length 256.
1883     pub transitions: Box<[StateID]>,
1884 }
1885 
1886 impl DenseTransitions {
1887     /// This follows the matching transition for a particular byte.
1888     ///
1889     /// The matching transition is found by looking for a transition that
1890     /// doesn't correspond to `StateID::ZERO` for the byte `at` the given
1891     /// position in `haystack`.
1892     ///
1893     /// If `at >= haystack.len()`, then this returns `None`.
1894     #[inline]
matches(&self, haystack: &[u8], at: usize) -> Option<StateID>1895     pub fn matches(&self, haystack: &[u8], at: usize) -> Option<StateID> {
1896         haystack.get(at).and_then(|&b| self.matches_byte(b))
1897     }
1898 
1899     /// This follows the matching transition for any member of the alphabet.
1900     ///
1901     /// The matching transition is found by looking for a transition that
1902     /// doesn't correspond to `StateID::ZERO` for the byte `at` the given
1903     /// position in `haystack`.
1904     ///
1905     /// If `at >= haystack.len()` or if the given alphabet unit is
1906     /// [`EOI`](alphabet::Unit::eoi), then this returns `None`.
1907     #[inline]
matches_unit( &self, unit: alphabet::Unit, ) -> Option<StateID>1908     pub(crate) fn matches_unit(
1909         &self,
1910         unit: alphabet::Unit,
1911     ) -> Option<StateID> {
1912         unit.as_u8().map_or(None, |byte| self.matches_byte(byte))
1913     }
1914 
1915     /// This follows the matching transition for a particular byte.
1916     ///
1917     /// The matching transition is found by looking for a transition that
1918     /// doesn't correspond to `StateID::ZERO` for the given `byte`.
1919     ///
1920     /// If `at >= haystack.len()`, then this returns `None`.
1921     #[inline]
matches_byte(&self, byte: u8) -> Option<StateID>1922     pub fn matches_byte(&self, byte: u8) -> Option<StateID> {
1923         let next = self.transitions[usize::from(byte)];
1924         if next == StateID::ZERO {
1925             None
1926         } else {
1927             Some(next)
1928         }
1929     }
1930 
1931     /*
1932     /// The dense state optimization isn't currently enabled, so permit a
1933     /// little bit of dead code.
1934     pub(crate) fn from_sparse(sparse: &SparseTransitions) -> DenseTransitions {
1935         let mut dense = vec![StateID::ZERO; 256];
1936         for t in sparse.transitions.iter() {
1937             for b in t.start..=t.end {
1938                 dense[usize::from(b)] = t.next;
1939             }
1940         }
1941         DenseTransitions { transitions: dense.into_boxed_slice() }
1942     }
1943     */
1944 
1945     /// Returns an iterator over all transitions that don't point to
1946     /// `StateID::ZERO`.
iter(&self) -> impl Iterator<Item = Transition> + '_1947     pub(crate) fn iter(&self) -> impl Iterator<Item = Transition> + '_ {
1948         use crate::util::int::Usize;
1949         self.transitions
1950             .iter()
1951             .enumerate()
1952             .filter(|&(_, &sid)| sid != StateID::ZERO)
1953             .map(|(byte, &next)| Transition {
1954                 start: byte.as_u8(),
1955                 end: byte.as_u8(),
1956                 next,
1957             })
1958     }
1959 }
1960 
1961 /// A single transition to another state.
1962 ///
1963 /// This transition may only be followed if the current byte in the haystack
1964 /// falls in the inclusive range of bytes specified.
1965 #[derive(Clone, Copy, Eq, Hash, PartialEq)]
1966 pub struct Transition {
1967     /// The inclusive start of the byte range.
1968     pub start: u8,
1969     /// The inclusive end of the byte range.
1970     pub end: u8,
1971     /// The identifier of the state to transition to.
1972     pub next: StateID,
1973 }
1974 
1975 impl Transition {
1976     /// Returns true if the position `at` in `haystack` falls in this
1977     /// transition's range of bytes.
1978     ///
1979     /// If `at >= haystack.len()`, then this returns `false`.
matches(&self, haystack: &[u8], at: usize) -> bool1980     pub fn matches(&self, haystack: &[u8], at: usize) -> bool {
1981         haystack.get(at).map_or(false, |&b| self.matches_byte(b))
1982     }
1983 
1984     /// Returns true if the given alphabet unit falls in this transition's
1985     /// range of bytes. If the given unit is [`EOI`](alphabet::Unit::eoi), then
1986     /// this returns `false`.
matches_unit(&self, unit: alphabet::Unit) -> bool1987     pub fn matches_unit(&self, unit: alphabet::Unit) -> bool {
1988         unit.as_u8().map_or(false, |byte| self.matches_byte(byte))
1989     }
1990 
1991     /// Returns true if the given byte falls in this transition's range of
1992     /// bytes.
matches_byte(&self, byte: u8) -> bool1993     pub fn matches_byte(&self, byte: u8) -> bool {
1994         self.start <= byte && byte <= self.end
1995     }
1996 }
1997 
1998 impl fmt::Debug for Transition {
fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result1999     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
2000         use crate::util::escape::DebugByte;
2001 
2002         let Transition { start, end, next } = *self;
2003         if self.start == self.end {
2004             write!(f, "{:?} => {:?}", DebugByte(start), next.as_usize())
2005         } else {
2006             write!(
2007                 f,
2008                 "{:?}-{:?} => {:?}",
2009                 DebugByte(start),
2010                 DebugByte(end),
2011                 next.as_usize(),
2012             )
2013         }
2014     }
2015 }
2016 
2017 /// An iterator over all pattern IDs in an NFA.
2018 ///
2019 /// This iterator is created by [`NFA::patterns`].
2020 ///
2021 /// The lifetime parameter `'a` refers to the lifetime of the NFA from which
2022 /// this pattern iterator was created.
2023 #[derive(Debug)]
2024 pub struct PatternIter<'a> {
2025     it: PatternIDIter,
2026     /// We explicitly associate a lifetime with this iterator even though we
2027     /// don't actually borrow anything from the NFA. We do this for backward
2028     /// compatibility purposes. If we ever do need to borrow something from
2029     /// the NFA, then we can and just get rid of this marker without breaking
2030     /// the public API.
2031     _marker: core::marker::PhantomData<&'a ()>,
2032 }
2033 
2034 impl<'a> Iterator for PatternIter<'a> {
2035     type Item = PatternID;
2036 
next(&mut self) -> Option<PatternID>2037     fn next(&mut self) -> Option<PatternID> {
2038         self.it.next()
2039     }
2040 }
2041 
2042 #[cfg(all(test, feature = "nfa-pikevm"))]
2043 mod tests {
2044     use super::*;
2045     use crate::{nfa::thompson::pikevm::PikeVM, Input};
2046 
2047     // This asserts that an NFA state doesn't have its size changed. It is
2048     // *really* easy to accidentally increase the size, and thus potentially
2049     // dramatically increase the memory usage of every NFA.
2050     //
2051     // This assert doesn't mean we absolutely cannot increase the size of an
2052     // NFA state. We can. It's just here to make sure we do it knowingly and
2053     // intentionally.
2054     #[test]
state_has_small_size()2055     fn state_has_small_size() {
2056         #[cfg(target_pointer_width = "64")]
2057         assert_eq!(24, core::mem::size_of::<State>());
2058         #[cfg(target_pointer_width = "32")]
2059         assert_eq!(20, core::mem::size_of::<State>());
2060     }
2061 
2062     #[test]
always_match()2063     fn always_match() {
2064         let re = PikeVM::new_from_nfa(NFA::always_match()).unwrap();
2065         let mut cache = re.create_cache();
2066         let mut caps = re.create_captures();
2067         let mut find = |haystack, start, end| {
2068             let input = Input::new(haystack).range(start..end);
2069             re.search(&mut cache, &input, &mut caps);
2070             caps.get_match().map(|m| m.end())
2071         };
2072 
2073         assert_eq!(Some(0), find("", 0, 0));
2074         assert_eq!(Some(0), find("a", 0, 1));
2075         assert_eq!(Some(1), find("a", 1, 1));
2076         assert_eq!(Some(0), find("ab", 0, 2));
2077         assert_eq!(Some(1), find("ab", 1, 2));
2078         assert_eq!(Some(2), find("ab", 2, 2));
2079     }
2080 
2081     #[test]
never_match()2082     fn never_match() {
2083         let re = PikeVM::new_from_nfa(NFA::never_match()).unwrap();
2084         let mut cache = re.create_cache();
2085         let mut caps = re.create_captures();
2086         let mut find = |haystack, start, end| {
2087             let input = Input::new(haystack).range(start..end);
2088             re.search(&mut cache, &input, &mut caps);
2089             caps.get_match().map(|m| m.end())
2090         };
2091 
2092         assert_eq!(None, find("", 0, 0));
2093         assert_eq!(None, find("a", 0, 1));
2094         assert_eq!(None, find("a", 1, 1));
2095         assert_eq!(None, find("ab", 0, 2));
2096         assert_eq!(None, find("ab", 1, 2));
2097         assert_eq!(None, find("ab", 2, 2));
2098     }
2099 }
2100