1 use core::{fmt, mem}; 2 3 use alloc::{boxed::Box, format, string::String, sync::Arc, vec, vec::Vec}; 4 5 #[cfg(feature = "syntax")] 6 use crate::nfa::thompson::{ 7 compiler::{Compiler, Config}, 8 error::BuildError, 9 }; 10 use crate::{ 11 nfa::thompson::builder::Builder, 12 util::{ 13 alphabet::{self, ByteClassSet, ByteClasses}, 14 captures::{GroupInfo, GroupInfoError}, 15 look::{Look, LookMatcher, LookSet}, 16 primitives::{ 17 IteratorIndexExt, PatternID, PatternIDIter, SmallIndex, StateID, 18 }, 19 sparse_set::SparseSet, 20 }, 21 }; 22 23 /// A byte oriented Thompson non-deterministic finite automaton (NFA). 24 /// 25 /// A Thompson NFA is a finite state machine that permits unconditional epsilon 26 /// transitions, but guarantees that there exists at most one non-epsilon 27 /// transition for each element in the alphabet for each state. 28 /// 29 /// An NFA may be used directly for searching, for analysis or to build 30 /// a deterministic finite automaton (DFA). 31 /// 32 /// # Cheap clones 33 /// 34 /// Since an NFA is a core data type in this crate that many other regex 35 /// engines are based on top of, it is convenient to give ownership of an NFA 36 /// to said regex engines. Because of this, an NFA uses reference counting 37 /// internally. Therefore, it is cheap to clone and it is encouraged to do so. 38 /// 39 /// # Capabilities 40 /// 41 /// Using an NFA for searching via the 42 /// [`PikeVM`](crate::nfa::thompson::pikevm::PikeVM) provides the most amount 43 /// of "power" of any regex engine in this crate. Namely, it supports the 44 /// following in all cases: 45 /// 46 /// 1. Detection of a match. 47 /// 2. Location of a match, including both the start and end offset, in a 48 /// single pass of the haystack. 49 /// 3. Location of matching capturing groups. 50 /// 4. Handles multiple patterns, including (1)-(3) when multiple patterns are 51 /// present. 52 /// 53 /// # Capturing Groups 54 /// 55 /// Groups refer to parenthesized expressions inside a regex pattern. They look 56 /// like this, where `exp` is an arbitrary regex: 57 /// 58 /// * `(exp)` - An unnamed capturing group. 59 /// * `(?P<name>exp)` or `(?<name>exp)` - A named capturing group. 60 /// * `(?:exp)` - A non-capturing group. 61 /// * `(?i:exp)` - A non-capturing group that sets flags. 62 /// 63 /// Only the first two forms are said to be _capturing_. Capturing 64 /// means that the last position at which they match is reportable. The 65 /// [`Captures`](crate::util::captures::Captures) type provides convenient 66 /// access to the match positions of capturing groups, which includes looking 67 /// up capturing groups by their name. 68 /// 69 /// # Byte oriented 70 /// 71 /// This NFA is byte oriented, which means that all of its transitions are 72 /// defined on bytes. In other words, the alphabet of an NFA consists of the 73 /// 256 different byte values. 74 /// 75 /// While DFAs nearly demand that they be byte oriented for performance 76 /// reasons, an NFA could conceivably be *Unicode codepoint* oriented. Indeed, 77 /// a previous version of this NFA supported both byte and codepoint oriented 78 /// modes. A codepoint oriented mode can work because an NFA fundamentally uses 79 /// a sparse representation of transitions, which works well with the large 80 /// sparse space of Unicode codepoints. 81 /// 82 /// Nevertheless, this NFA is only byte oriented. This choice is primarily 83 /// driven by implementation simplicity, and also in part memory usage. 
In 84 /// practice, performance between the two is roughly comparable. However, 85 /// building a DFA (including a hybrid DFA) really wants a byte oriented NFA. 86 /// So if we do have a codepoint oriented NFA, then we also need to generate 87 /// byte oriented NFA in order to build an hybrid NFA/DFA. Thus, by only 88 /// generating byte oriented NFAs, we can produce one less NFA. In other words, 89 /// if we made our NFA codepoint oriented, we'd need to *also* make it support 90 /// a byte oriented mode, which is more complicated. But a byte oriented mode 91 /// can support everything. 92 /// 93 /// # Differences with DFAs 94 /// 95 /// At the theoretical level, the precise difference between an NFA and a DFA 96 /// is that, in a DFA, for every state, an input symbol unambiguously refers 97 /// to a single transition _and_ that an input symbol is required for each 98 /// transition. At a practical level, this permits DFA implementations to be 99 /// implemented at their core with a small constant number of CPU instructions 100 /// for each byte of input searched. In practice, this makes them quite a bit 101 /// faster than NFAs _in general_. Namely, in order to execute a search for any 102 /// Thompson NFA, one needs to keep track of a _set_ of states, and execute 103 /// the possible transitions on all of those states for each input symbol. 104 /// Overall, this results in much more overhead. To a first approximation, one 105 /// can expect DFA searches to be about an order of magnitude faster. 106 /// 107 /// So why use an NFA at all? The main advantage of an NFA is that it takes 108 /// linear time (in the size of the pattern string after repetitions have been 109 /// expanded) to build and linear memory usage. A DFA, on the other hand, may 110 /// take exponential time and/or space to build. Even in non-pathological 111 /// cases, DFAs often take quite a bit more memory than their NFA counterparts, 112 /// _especially_ if large Unicode character classes are involved. Of course, 113 /// an NFA also provides additional capabilities. For example, it can match 114 /// Unicode word boundaries on non-ASCII text and resolve the positions of 115 /// capturing groups. 116 /// 117 /// Note that a [`hybrid::regex::Regex`](crate::hybrid::regex::Regex) strikes a 118 /// good balance between an NFA and a DFA. It avoids the exponential build time 119 /// of a DFA while maintaining its fast search time. The downside of a hybrid 120 /// NFA/DFA is that in some cases it can be slower at search time than the NFA. 121 /// (It also has less functionality than a pure NFA. It cannot handle Unicode 122 /// word boundaries on non-ASCII text and cannot resolve capturing groups.) 123 /// 124 /// # Example 125 /// 126 /// This shows how to build an NFA with the default configuration and execute a 127 /// search using the Pike VM. 128 /// 129 /// ``` 130 /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match}; 131 /// 132 /// let re = PikeVM::new(r"foo[0-9]+")?; 133 /// let mut cache = re.create_cache(); 134 /// let mut caps = re.create_captures(); 135 /// 136 /// let expected = Some(Match::must(0, 0..8)); 137 /// re.captures(&mut cache, b"foo12345", &mut caps); 138 /// assert_eq!(expected, caps.get_match()); 139 /// 140 /// # Ok::<(), Box<dyn std::error::Error>>(()) 141 /// ``` 142 /// 143 /// # Example: resolving capturing groups 144 /// 145 /// This example shows how to parse some simple dates and extract the 146 /// components of each date via capturing groups. 
147 /// 148 /// ``` 149 /// # if cfg!(miri) { return Ok(()); } // miri takes too long 150 /// use regex_automata::{ 151 /// nfa::thompson::pikevm::PikeVM, 152 /// util::captures::Captures, 153 /// }; 154 /// 155 /// let vm = PikeVM::new(r"(?P<y>\d{4})-(?P<m>\d{2})-(?P<d>\d{2})")?; 156 /// let mut cache = vm.create_cache(); 157 /// 158 /// let haystack = "2012-03-14, 2013-01-01 and 2014-07-05"; 159 /// let all: Vec<Captures> = vm.captures_iter( 160 /// &mut cache, haystack.as_bytes() 161 /// ).collect(); 162 /// // There should be a total of 3 matches. 163 /// assert_eq!(3, all.len()); 164 /// // The year from the second match is '2013'. 165 /// let span = all[1].get_group_by_name("y").unwrap(); 166 /// assert_eq!("2013", &haystack[span]); 167 /// 168 /// # Ok::<(), Box<dyn std::error::Error>>(()) 169 /// ``` 170 /// 171 /// This example shows that only the last match of a capturing group is 172 /// reported, even if it had to match multiple times for an overall match 173 /// to occur. 174 /// 175 /// ``` 176 /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Span}; 177 /// 178 /// let re = PikeVM::new(r"([a-z]){4}")?; 179 /// let mut cache = re.create_cache(); 180 /// let mut caps = re.create_captures(); 181 /// 182 /// let haystack = b"quux"; 183 /// re.captures(&mut cache, haystack, &mut caps); 184 /// assert!(caps.is_match()); 185 /// assert_eq!(Some(Span::from(3..4)), caps.get_group(1)); 186 /// 187 /// # Ok::<(), Box<dyn std::error::Error>>(()) 188 /// ``` 189 #[derive(Clone)] 190 pub struct NFA( 191 // We make NFAs reference counted primarily for two reasons. First is that 192 // the NFA type itself is quite large (at least 0.5KB), and so it makes 193 // sense to put it on the heap by default anyway. Second is that, for Arc 194 // specifically, this enables cheap clones. This tends to be useful because 195 // several structures (the backtracker, the Pike VM, the hybrid NFA/DFA) 196 // all want to hang on to an NFA for use during search time. We could 197 // provide the NFA at search time via a function argument, but this makes 198 // for an unnecessarily annoying API. Instead, we just let each structure 199 // share ownership of the NFA. Using a deep clone would not be smart, since 200 // the NFA can use quite a bit of heap space. 201 Arc<Inner>, 202 ); 203 204 impl NFA { 205 /// Parse the given regular expression using a default configuration and 206 /// build an NFA from it. 207 /// 208 /// If you want a non-default configuration, then use the NFA 209 /// [`Compiler`] with a [`Config`]. 210 /// 211 /// # Example 212 /// 213 /// ``` 214 /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match}; 215 /// 216 /// let re = PikeVM::new(r"foo[0-9]+")?; 217 /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); 218 /// 219 /// let expected = Some(Match::must(0, 0..8)); 220 /// re.captures(&mut cache, b"foo12345", &mut caps); 221 /// assert_eq!(expected, caps.get_match()); 222 /// 223 /// # Ok::<(), Box<dyn std::error::Error>>(()) 224 /// ``` 225 #[cfg(feature = "syntax")] new(pattern: &str) -> Result<NFA, BuildError>226 pub fn new(pattern: &str) -> Result<NFA, BuildError> { 227 NFA::compiler().build(pattern) 228 } 229 230 /// Parse the given regular expressions using a default configuration and 231 /// build a multi-NFA from them. 232 /// 233 /// If you want a non-default configuration, then use the NFA 234 /// [`Compiler`] with a [`Config`]. 
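The multi-pattern constructor described above assigns a `PatternID` to each pattern in the order given, and every reported match carries the ID of the pattern that produced it. The following is a minimal sketch of that behavior using only public crate APIs (`PikeVM::new_many`, `find_iter`, `Match::pattern`); the patterns and haystack are arbitrary illustrations.

```
use regex_automata::nfa::thompson::pikevm::PikeVM;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Pattern IDs are assigned in the order the patterns are given:
    // 0 => [0-9]+, 1 => [a-z]+.
    let re = PikeVM::new_many(&[r"[0-9]+", r"[a-z]+"])?;
    let mut cache = re.create_cache();

    // Every match reports which pattern produced it.
    let got: Vec<(usize, std::ops::Range<usize>)> = re
        .find_iter(&mut cache, "foo123bar")
        .map(|m| (m.pattern().as_usize(), m.range()))
        .collect();
    assert_eq!(vec![(1, 0..3), (0, 3..6), (1, 6..9)], got);
    Ok(())
}
```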
235 /// 236 /// # Example 237 /// 238 /// ``` 239 /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match}; 240 /// 241 /// let re = PikeVM::new_many(&["[0-9]+", "[a-z]+"])?; 242 /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); 243 /// 244 /// let expected = Some(Match::must(1, 0..3)); 245 /// re.captures(&mut cache, b"foo12345bar", &mut caps); 246 /// assert_eq!(expected, caps.get_match()); 247 /// 248 /// # Ok::<(), Box<dyn std::error::Error>>(()) 249 /// ``` 250 #[cfg(feature = "syntax")] new_many<P: AsRef<str>>(patterns: &[P]) -> Result<NFA, BuildError>251 pub fn new_many<P: AsRef<str>>(patterns: &[P]) -> Result<NFA, BuildError> { 252 NFA::compiler().build_many(patterns) 253 } 254 255 /// Returns an NFA with a single regex pattern that always matches at every 256 /// position. 257 /// 258 /// # Example 259 /// 260 /// ``` 261 /// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match}; 262 /// 263 /// let re = PikeVM::new_from_nfa(NFA::always_match())?; 264 /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); 265 /// 266 /// let expected = Some(Match::must(0, 0..0)); 267 /// re.captures(&mut cache, b"", &mut caps); 268 /// assert_eq!(expected, caps.get_match()); 269 /// re.captures(&mut cache, b"foo", &mut caps); 270 /// assert_eq!(expected, caps.get_match()); 271 /// 272 /// # Ok::<(), Box<dyn std::error::Error>>(()) 273 /// ``` always_match() -> NFA274 pub fn always_match() -> NFA { 275 // We could use NFA::new("") here and we'd get the same semantics, but 276 // hand-assembling the NFA (as below) does the same thing with a fewer 277 // number of states. It also avoids needing the 'syntax' feature 278 // enabled. 279 // 280 // Technically all we need is the "match" state, but we add the 281 // "capture" states so that the PikeVM can use this NFA. 282 // 283 // The unwraps below are OK because we add so few states that they will 284 // never exhaust any default limits in any environment. 285 let mut builder = Builder::new(); 286 let pid = builder.start_pattern().unwrap(); 287 assert_eq!(pid.as_usize(), 0); 288 let start_id = 289 builder.add_capture_start(StateID::ZERO, 0, None).unwrap(); 290 let end_id = builder.add_capture_end(StateID::ZERO, 0).unwrap(); 291 let match_id = builder.add_match().unwrap(); 292 builder.patch(start_id, end_id).unwrap(); 293 builder.patch(end_id, match_id).unwrap(); 294 let pid = builder.finish_pattern(start_id).unwrap(); 295 assert_eq!(pid.as_usize(), 0); 296 builder.build(start_id, start_id).unwrap() 297 } 298 299 /// Returns an NFA that never matches at any position. 300 /// 301 /// This is a convenience routine for creating an NFA with zero patterns. 302 /// 303 /// # Example 304 /// 305 /// ``` 306 /// use regex_automata::nfa::thompson::{NFA, pikevm::PikeVM}; 307 /// 308 /// let re = PikeVM::new_from_nfa(NFA::never_match())?; 309 /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); 310 /// 311 /// re.captures(&mut cache, b"", &mut caps); 312 /// assert!(!caps.is_match()); 313 /// re.captures(&mut cache, b"foo", &mut caps); 314 /// assert!(!caps.is_match()); 315 /// 316 /// # Ok::<(), Box<dyn std::error::Error>>(()) 317 /// ``` never_match() -> NFA318 pub fn never_match() -> NFA { 319 // This always succeeds because it only requires one NFA state, which 320 // will never exhaust any (default) limits. 
321 let mut builder = Builder::new(); 322 let sid = builder.add_fail().unwrap(); 323 builder.build(sid, sid).unwrap() 324 } 325 326 /// Return a default configuration for an `NFA`. 327 /// 328 /// This is a convenience routine to avoid needing to import the `Config` 329 /// type when customizing the construction of an NFA. 330 /// 331 /// # Example 332 /// 333 /// This example shows how to build an NFA with a small size limit that 334 /// results in a compilation error for any regex that tries to use more 335 /// heap memory than the configured limit. 336 /// 337 /// ``` 338 /// use regex_automata::nfa::thompson::{NFA, pikevm::PikeVM}; 339 /// 340 /// let result = PikeVM::builder() 341 /// .thompson(NFA::config().nfa_size_limit(Some(1_000))) 342 /// // Remember, \w is Unicode-aware by default and thus huge. 343 /// .build(r"\w+"); 344 /// assert!(result.is_err()); 345 /// ``` 346 #[cfg(feature = "syntax")] config() -> Config347 pub fn config() -> Config { 348 Config::new() 349 } 350 351 /// Return a compiler for configuring the construction of an `NFA`. 352 /// 353 /// This is a convenience routine to avoid needing to import the 354 /// [`Compiler`] type in common cases. 355 /// 356 /// # Example 357 /// 358 /// This example shows how to build an NFA that is permitted match invalid 359 /// UTF-8. Without the additional syntax configuration here, compilation of 360 /// `(?-u:.)` would fail because it is permitted to match invalid UTF-8. 361 /// 362 /// ``` 363 /// use regex_automata::{ 364 /// nfa::thompson::pikevm::PikeVM, 365 /// util::syntax, 366 /// Match, 367 /// }; 368 /// 369 /// let re = PikeVM::builder() 370 /// .syntax(syntax::Config::new().utf8(false)) 371 /// .build(r"[a-z]+(?-u:.)")?; 372 /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); 373 /// 374 /// let expected = Some(Match::must(0, 1..5)); 375 /// re.captures(&mut cache, b"\xFFabc\xFF", &mut caps); 376 /// assert_eq!(expected, caps.get_match()); 377 /// 378 /// # Ok::<(), Box<dyn std::error::Error>>(()) 379 /// ``` 380 #[cfg(feature = "syntax")] compiler() -> Compiler381 pub fn compiler() -> Compiler { 382 Compiler::new() 383 } 384 385 /// Returns an iterator over all pattern identifiers in this NFA. 386 /// 387 /// Pattern IDs are allocated in sequential order starting from zero, 388 /// where the order corresponds to the order of patterns provided to the 389 /// [`NFA::new_many`] constructor. 390 /// 391 /// # Example 392 /// 393 /// ``` 394 /// use regex_automata::{nfa::thompson::NFA, PatternID}; 395 /// 396 /// let nfa = NFA::new_many(&["[0-9]+", "[a-z]+", "[A-Z]+"])?; 397 /// let pids: Vec<PatternID> = nfa.patterns().collect(); 398 /// assert_eq!(pids, vec![ 399 /// PatternID::must(0), 400 /// PatternID::must(1), 401 /// PatternID::must(2), 402 /// ]); 403 /// 404 /// # Ok::<(), Box<dyn std::error::Error>>(()) 405 /// ``` patterns(&self) -> PatternIter<'_>406 pub fn patterns(&self) -> PatternIter<'_> { 407 PatternIter { 408 it: PatternID::iter(self.pattern_len()), 409 _marker: core::marker::PhantomData, 410 } 411 } 412 413 /// Returns the total number of regex patterns in this NFA. 414 /// 415 /// This may return zero if the NFA was constructed with no patterns. In 416 /// this case, the NFA can never produce a match for any input. 417 /// 418 /// This is guaranteed to be no bigger than [`PatternID::LIMIT`] because 419 /// NFA construction will fail if too many patterns are added. 420 /// 421 /// It is always true that `nfa.patterns().count() == nfa.pattern_len()`. 
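The invariant stated above, `nfa.patterns().count() == nfa.pattern_len()`, can be checked directly. A small sketch using only the constructors and accessors documented in this module; the patterns are arbitrary.

```
use regex_automata::nfa::thompson::NFA;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let nfa = NFA::new_many(&["[0-9]+", "[a-z]+", "[A-Z]+"])?;

    // The iterator yields exactly `pattern_len()` identifiers, in
    // ascending order starting from zero.
    assert_eq!(nfa.patterns().count(), nfa.pattern_len());
    for (i, pid) in nfa.patterns().enumerate() {
        assert_eq!(i, pid.as_usize());
    }
    Ok(())
}
```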
422 /// 423 /// # Example 424 /// 425 /// ``` 426 /// use regex_automata::nfa::thompson::NFA; 427 /// 428 /// let nfa = NFA::new_many(&["[0-9]+", "[a-z]+", "[A-Z]+"])?; 429 /// assert_eq!(3, nfa.pattern_len()); 430 /// 431 /// let nfa = NFA::never_match(); 432 /// assert_eq!(0, nfa.pattern_len()); 433 /// 434 /// let nfa = NFA::always_match(); 435 /// assert_eq!(1, nfa.pattern_len()); 436 /// 437 /// # Ok::<(), Box<dyn std::error::Error>>(()) 438 /// ``` 439 #[inline] 440 pub fn pattern_len(&self) -> usize { 441 self.0.start_pattern.len() 442 } 443 444 /// Return the state identifier of the initial anchored state of this NFA. 445 /// 446 /// The returned identifier is guaranteed to be a valid index into the 447 /// slice returned by [`NFA::states`], and is also a valid argument to 448 /// [`NFA::state`]. 449 /// 450 /// # Example 451 /// 452 /// This is a somewhat contrived example where we can easily 453 /// predict the anchored starting state. 454 /// 455 /// ``` 456 /// use regex_automata::nfa::thompson::{NFA, State, WhichCaptures}; 457 /// 458 /// let nfa = NFA::compiler() 459 /// .configure(NFA::config().which_captures(WhichCaptures::None)) 460 /// .build("a")?; 461 /// let state = nfa.state(nfa.start_anchored()); 462 /// match *state { 463 /// State::ByteRange { trans } => { 464 /// assert_eq!(b'a', trans.start); 465 /// assert_eq!(b'a', trans.end); 466 /// } 467 /// _ => unreachable!("unexpected state"), 468 /// } 469 /// 470 /// # Ok::<(), Box<dyn std::error::Error>>(()) 471 /// ``` 472 #[inline] 473 pub fn start_anchored(&self) -> StateID { 474 self.0.start_anchored 475 } 476 477 /// Return the state identifier of the initial unanchored state of this 478 /// NFA. 479 /// 480 /// This is equivalent to the identifier returned by 481 /// [`NFA::start_anchored`] when the NFA has no unanchored starting state. 482 /// 483 /// The returned identifier is guaranteed to be a valid index into the 484 /// slice returned by [`NFA::states`], and is also a valid argument to 485 /// [`NFA::state`]. 486 /// 487 /// # Example 488 /// 489 /// This example shows that the anchored and unanchored starting states 490 /// are equivalent when an anchored NFA is built. 491 /// 492 /// ``` 493 /// use regex_automata::nfa::thompson::NFA; 494 /// 495 /// let nfa = NFA::new("^a")?; 496 /// assert_eq!(nfa.start_anchored(), nfa.start_unanchored()); 497 /// 498 /// # Ok::<(), Box<dyn std::error::Error>>(()) 499 /// ``` 500 #[inline] 501 pub fn start_unanchored(&self) -> StateID { 502 self.0.start_unanchored 503 } 504 505 /// Return the state identifier of the initial anchored state for the given 506 /// pattern, or `None` if there is no pattern corresponding to the given 507 /// identifier. 508 /// 509 /// If one uses the starting state for a particular pattern, then the only 510 /// match that can be returned is for the corresponding pattern. 511 /// 512 /// The returned identifier is guaranteed to be a valid index into the 513 /// slice returned by [`NFA::states`], and is also a valid argument to 514 /// [`NFA::state`]. 515 /// 516 /// Note that this does not panic or return an error for an invalid 517 /// pattern ID. 518 /// If the pattern doesn't exist in this NFA, then this simply returns `None`. 519 /// This occurs when `pid.as_usize() >= nfa.pattern_len()`. 520 /// 521 /// # Example 522 /// 523 /// This example shows that while the NFA-wide anchored and unanchored starting 524 /// states are equivalent when every pattern is anchored, each pattern still has its own distinct anchored starting state, and an invalid pattern ID yields `None`.
525 /// 526 /// ``` 527 /// use regex_automata::{nfa::thompson::NFA, PatternID}; 528 /// 529 /// let nfa = NFA::new_many(&["^a", "^b"])?; 530 /// // The anchored and unanchored states for the entire NFA are the same, 531 /// // since all of the patterns are anchored. 532 /// assert_eq!(nfa.start_anchored(), nfa.start_unanchored()); 533 /// // But the anchored starting states for each pattern are distinct, 534 /// // because these starting states can only lead to matches for the 535 /// // corresponding pattern. 536 /// let anchored = Some(nfa.start_anchored()); 537 /// assert_ne!(anchored, nfa.start_pattern(PatternID::must(0))); 538 /// assert_ne!(anchored, nfa.start_pattern(PatternID::must(1))); 539 /// // Requesting a pattern not in the NFA will result in None: 540 /// assert_eq!(None, nfa.start_pattern(PatternID::must(2))); 541 /// 542 /// # Ok::<(), Box<dyn std::error::Error>>(()) 543 /// ``` 544 #[inline] start_pattern(&self, pid: PatternID) -> Option<StateID>545 pub fn start_pattern(&self, pid: PatternID) -> Option<StateID> { 546 self.0.start_pattern.get(pid.as_usize()).copied() 547 } 548 549 /// Get the byte class set for this NFA. 550 /// 551 /// A byte class set is a partitioning of this NFA's alphabet into 552 /// equivalence classes. Any two bytes in the same equivalence class are 553 /// guaranteed to never discriminate between a match or a non-match. (The 554 /// partitioning may not be minimal.) 555 /// 556 /// Byte classes are used internally by this crate when building DFAs. 557 /// Namely, among other optimizations, they enable a space optimization 558 /// where the DFA's internal alphabet is defined over the equivalence 559 /// classes of bytes instead of all possible byte values. The former is 560 /// often quite a bit smaller than the latter, which permits the DFA to use 561 /// less space for its transition table. 562 #[inline] byte_class_set(&self) -> &ByteClassSet563 pub(crate) fn byte_class_set(&self) -> &ByteClassSet { 564 &self.0.byte_class_set 565 } 566 567 /// Get the byte classes for this NFA. 568 /// 569 /// Byte classes represent a partitioning of this NFA's alphabet into 570 /// equivalence classes. Any two bytes in the same equivalence class are 571 /// guaranteed to never discriminate between a match or a non-match. (The 572 /// partitioning may not be minimal.) 573 /// 574 /// Byte classes are used internally by this crate when building DFAs. 575 /// Namely, among other optimizations, they enable a space optimization 576 /// where the DFA's internal alphabet is defined over the equivalence 577 /// classes of bytes instead of all possible byte values. The former is 578 /// often quite a bit smaller than the latter, which permits the DFA to use 579 /// less space for its transition table. 580 /// 581 /// # Example 582 /// 583 /// This example shows how to query the class of various bytes. 584 /// 585 /// ``` 586 /// use regex_automata::nfa::thompson::NFA; 587 /// 588 /// let nfa = NFA::new("[a-z]+")?; 589 /// let classes = nfa.byte_classes(); 590 /// // 'a' and 'z' are in the same class for this regex. 591 /// assert_eq!(classes.get(b'a'), classes.get(b'z')); 592 /// // But 'a' and 'A' are not. 593 /// assert_ne!(classes.get(b'a'), classes.get(b'A')); 594 /// 595 /// # Ok::<(), Box<dyn std::error::Error>>(()) 596 /// ``` 597 #[inline] byte_classes(&self) -> &ByteClasses598 pub fn byte_classes(&self) -> &ByteClasses { 599 &self.0.byte_classes 600 } 601 602 /// Return a reference to the NFA state corresponding to the given ID. 
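The pattern-specific anchored starting states returned by `start_pattern` above are normally reached through the crate's search APIs rather than by indexing states directly. A hedged sketch of that route, assuming the top-level `Anchored::Pattern` and `Input` APIs of regex-automata: restricting a search to one pattern means only that pattern can match, and only from the start of the search span.

```
use regex_automata::{
    nfa::thompson::pikevm::PikeVM,
    Anchored, Input, Match, PatternID,
};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let re = PikeVM::new_many(&[r"[0-9]+", r"[a-z]+"])?;
    let mut cache = re.create_cache();

    // Restricting the search to pattern 1 can only ever report a
    // match for pattern 1, starting at the search's start position.
    let input = Input::new("abc123")
        .anchored(Anchored::Pattern(PatternID::must(1)));
    assert_eq!(Some(Match::must(1, 0..3)), re.find(&mut cache, input));

    // The same restriction applied to pattern 0 finds nothing here,
    // even though "123" appears later in the haystack.
    let input = Input::new("abc123")
        .anchored(Anchored::Pattern(PatternID::must(0)));
    assert_eq!(None, re.find(&mut cache, input));
    Ok(())
}
```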
603 /// 604 /// This is a convenience routine for `nfa.states()[id]`. 605 /// 606 /// # Panics 607 /// 608 /// This panics when the given identifier does not reference a valid state. 609 /// That is, when `id.as_usize() >= nfa.states().len()`. 610 /// 611 /// # Example 612 /// 613 /// The anchored state for a pattern will typically correspond to a 614 /// capturing state for that pattern. (Although, this is not an API 615 /// guarantee!) 616 /// 617 /// ``` 618 /// use regex_automata::{nfa::thompson::{NFA, State}, PatternID}; 619 /// 620 /// let nfa = NFA::new("a")?; 621 /// let state = nfa.state(nfa.start_pattern(PatternID::ZERO).unwrap()); 622 /// match *state { 623 /// State::Capture { slot, .. } => { 624 /// assert_eq!(0, slot.as_usize()); 625 /// } 626 /// _ => unreachable!("unexpected state"), 627 /// } 628 /// 629 /// # Ok::<(), Box<dyn std::error::Error>>(()) 630 /// ``` 631 #[inline] state(&self, id: StateID) -> &State632 pub fn state(&self, id: StateID) -> &State { 633 &self.states()[id] 634 } 635 636 /// Returns a slice of all states in this NFA. 637 /// 638 /// The slice returned is indexed by `StateID`. This provides a convenient 639 /// way to access states while following transitions among those states. 640 /// 641 /// # Example 642 /// 643 /// This demonstrates that disabling UTF-8 mode can shrink the size of the 644 /// NFA considerably in some cases, especially when using Unicode character 645 /// classes. 646 /// 647 /// ``` 648 /// # if cfg!(miri) { return Ok(()); } // miri takes too long 649 /// use regex_automata::nfa::thompson::NFA; 650 /// 651 /// let nfa_unicode = NFA::new(r"\w")?; 652 /// let nfa_ascii = NFA::new(r"(?-u)\w")?; 653 /// // Yes, a factor of 45 difference. No lie. 654 /// assert!(40 * nfa_ascii.states().len() < nfa_unicode.states().len()); 655 /// 656 /// # Ok::<(), Box<dyn std::error::Error>>(()) 657 /// ``` 658 #[inline] states(&self) -> &[State]659 pub fn states(&self) -> &[State] { 660 &self.0.states 661 } 662 663 /// Returns the capturing group info for this NFA. 664 /// 665 /// The [`GroupInfo`] provides a way to map to and from capture index 666 /// and capture name for each pattern. It also provides a mapping from 667 /// each of the capturing groups in every pattern to their corresponding 668 /// slot offsets encoded in [`State::Capture`] states. 669 /// 670 /// Note that `GroupInfo` uses reference counting internally, such that 671 /// cloning a `GroupInfo` is very cheap. 672 /// 673 /// # Example 674 /// 675 /// This example shows how to get a list of all capture group names for 676 /// a particular pattern. 677 /// 678 /// ``` 679 /// use regex_automata::{nfa::thompson::NFA, PatternID}; 680 /// 681 /// let nfa = NFA::new(r"(a)(?P<foo>b)(c)(d)(?P<bar>e)")?; 682 /// // The first is the implicit group that is always unnammed. The next 683 /// // 5 groups are the explicit groups found in the concrete syntax above. 684 /// let expected = vec![None, None, Some("foo"), None, None, Some("bar")]; 685 /// let got: Vec<Option<&str>> = 686 /// nfa.group_info().pattern_names(PatternID::ZERO).collect(); 687 /// assert_eq!(expected, got); 688 /// 689 /// // Using an invalid pattern ID will result in nothing yielded. 
690 /// let got = nfa.group_info().pattern_names(PatternID::must(999)).count(); 691 /// assert_eq!(0, got); 692 /// 693 /// # Ok::<(), Box<dyn std::error::Error>>(()) 694 /// ``` 695 #[inline] group_info(&self) -> &GroupInfo696 pub fn group_info(&self) -> &GroupInfo { 697 &self.0.group_info() 698 } 699 700 /// Returns true if and only if this NFA has at least one 701 /// [`Capture`](State::Capture) in its sequence of states. 702 /// 703 /// This is useful as a way to perform a quick test before attempting 704 /// something that does or does not require capture states. For example, 705 /// some regex engines (like the PikeVM) require capture states in order to 706 /// work at all. 707 /// 708 /// # Example 709 /// 710 /// This example shows a few different NFAs and whether they have captures 711 /// or not. 712 /// 713 /// ``` 714 /// use regex_automata::nfa::thompson::{NFA, WhichCaptures}; 715 /// 716 /// // Obviously has capture states. 717 /// let nfa = NFA::new("(a)")?; 718 /// assert!(nfa.has_capture()); 719 /// 720 /// // Less obviously has capture states, because every pattern has at 721 /// // least one anonymous capture group corresponding to the match for the 722 /// // entire pattern. 723 /// let nfa = NFA::new("a")?; 724 /// assert!(nfa.has_capture()); 725 /// 726 /// // Other than hand building your own NFA, this is the only way to build 727 /// // an NFA without capturing groups. In general, you should only do this 728 /// // if you don't intend to use any of the NFA-oriented regex engines. 729 /// // Overall, capturing groups don't have many downsides. Although they 730 /// // can add a bit of noise to simple NFAs, so it can be nice to disable 731 /// // them for debugging purposes. 732 /// // 733 /// // Notice that 'has_capture' is false here even when we have an 734 /// // explicit capture group in the pattern. 735 /// let nfa = NFA::compiler() 736 /// .configure(NFA::config().which_captures(WhichCaptures::None)) 737 /// .build("(a)")?; 738 /// assert!(!nfa.has_capture()); 739 /// 740 /// # Ok::<(), Box<dyn std::error::Error>>(()) 741 /// ``` 742 #[inline] has_capture(&self) -> bool743 pub fn has_capture(&self) -> bool { 744 self.0.has_capture 745 } 746 747 /// Returns true if and only if this NFA can match the empty string. 748 /// When it returns false, all possible matches are guaranteed to have a 749 /// non-zero length. 750 /// 751 /// This is useful as cheap way to know whether code needs to handle the 752 /// case of a zero length match. This is particularly important when UTF-8 753 /// modes are enabled, as when UTF-8 mode is enabled, empty matches that 754 /// split a codepoint must never be reported. This extra handling can 755 /// sometimes be costly, and since regexes matching an empty string are 756 /// somewhat rare, it can be beneficial to treat such regexes specially. 757 /// 758 /// # Example 759 /// 760 /// This example shows a few different NFAs and whether they match the 761 /// empty string or not. Notice the empty string isn't merely a matter 762 /// of a string of length literally `0`, but rather, whether a match can 763 /// occur between specific pairs of bytes. 764 /// 765 /// ``` 766 /// use regex_automata::{nfa::thompson::NFA, util::syntax}; 767 /// 768 /// // The empty regex matches the empty string. 769 /// let nfa = NFA::new("")?; 770 /// assert!(nfa.has_empty(), "empty matches empty"); 771 /// // The '+' repetition operator requires at least one match, and so 772 /// // does not match the empty string. 
773 /// let nfa = NFA::new("a+")?; 774 /// assert!(!nfa.has_empty(), "+ does not match empty"); 775 /// // But the '*' repetition operator does. 776 /// let nfa = NFA::new("a*")?; 777 /// assert!(nfa.has_empty(), "* does match empty"); 778 /// // And wrapping '+' in an operator that can match an empty string also 779 /// // causes it to match the empty string too. 780 /// let nfa = NFA::new("(a+)*")?; 781 /// assert!(nfa.has_empty(), "+ inside of * matches empty"); 782 /// 783 /// // If a regex is just made of a look-around assertion, even if the 784 /// // assertion requires some kind of non-empty string around it (such as 785 /// // \b), then it is still treated as if it matches the empty string. 786 /// // Namely, if a match occurs of just a look-around assertion, then the 787 /// // match returned is empty. 788 /// let nfa = NFA::compiler() 789 /// .syntax(syntax::Config::new().utf8(false)) 790 /// .build(r"^$\A\z\b\B(?-u:\b\B)")?; 791 /// assert!(nfa.has_empty(), "assertions match empty"); 792 /// // Even when an assertion is wrapped in a '+', it still matches the 793 /// // empty string. 794 /// let nfa = NFA::new(r"\b+")?; 795 /// assert!(nfa.has_empty(), "+ of an assertion matches empty"); 796 /// 797 /// // An alternation with even one branch that can match the empty string 798 /// // is also said to match the empty string overall. 799 /// let nfa = NFA::new("foo|(bar)?|quux")?; 800 /// assert!(nfa.has_empty(), "alternations can match empty"); 801 /// 802 /// // An NFA that matches nothing does not match the empty string. 803 /// let nfa = NFA::new("[a&&b]")?; 804 /// assert!(!nfa.has_empty(), "never matching means not matching empty"); 805 /// // But if it's wrapped in something that doesn't require a match at 806 /// // all, then it can match the empty string! 807 /// let nfa = NFA::new("[a&&b]*")?; 808 /// assert!(nfa.has_empty(), "* on never-match still matches empty"); 809 /// // Since a '+' requires a match, using it on something that can never 810 /// // match will itself produce a regex that can never match anything, 811 /// // and thus does not match the empty string. 812 /// let nfa = NFA::new("[a&&b]+")?; 813 /// assert!(!nfa.has_empty(), "+ on never-match still matches nothing"); 814 /// 815 /// # Ok::<(), Box<dyn std::error::Error>>(()) 816 /// ``` 817 #[inline] has_empty(&self) -> bool818 pub fn has_empty(&self) -> bool { 819 self.0.has_empty 820 } 821 822 /// Whether UTF-8 mode is enabled for this NFA or not. 823 /// 824 /// When UTF-8 mode is enabled, all matches reported by a regex engine 825 /// derived from this NFA are guaranteed to correspond to spans of valid 826 /// UTF-8. This includes zero-width matches. For example, the regex engine 827 /// must guarantee that the empty regex will not match at the positions 828 /// between code units in the UTF-8 encoding of a single codepoint. 829 /// 830 /// See [`Config::utf8`] for more information. 831 /// 832 /// This is enabled by default. 833 /// 834 /// # Example 835 /// 836 /// This example shows how UTF-8 mode can impact the match spans that may 837 /// be reported in certain cases. 838 /// 839 /// ``` 840 /// use regex_automata::{ 841 /// nfa::thompson::{self, pikevm::PikeVM}, 842 /// Match, Input, 843 /// }; 844 /// 845 /// let re = PikeVM::new("")?; 846 /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); 847 /// 848 /// // UTF-8 mode is enabled by default. 
849 /// let mut input = Input::new("☃"); 850 /// re.search(&mut cache, &input, &mut caps); 851 /// assert_eq!(Some(Match::must(0, 0..0)), caps.get_match()); 852 /// 853 /// // Even though an empty regex matches at 1..1, our next match is 854 /// // 3..3 because 1..1 and 2..2 split the snowman codepoint (which is 855 /// // three bytes long). 856 /// input.set_start(1); 857 /// re.search(&mut cache, &input, &mut caps); 858 /// assert_eq!(Some(Match::must(0, 3..3)), caps.get_match()); 859 /// 860 /// // But if we disable UTF-8, then we'll get matches at 1..1 and 2..2: 861 /// let re = PikeVM::builder() 862 /// .thompson(thompson::Config::new().utf8(false)) 863 /// .build("")?; 864 /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); re.search(&mut cache, &input, &mut caps); 865 /// assert_eq!(Some(Match::must(0, 1..1)), caps.get_match()); 866 /// 867 /// input.set_start(2); 868 /// re.search(&mut cache, &input, &mut caps); 869 /// assert_eq!(Some(Match::must(0, 2..2)), caps.get_match()); 870 /// 871 /// input.set_start(3); 872 /// re.search(&mut cache, &input, &mut caps); 873 /// assert_eq!(Some(Match::must(0, 3..3)), caps.get_match()); 874 /// 875 /// input.set_start(4); 876 /// re.search(&mut cache, &input, &mut caps); 877 /// assert_eq!(None, caps.get_match()); 878 /// 879 /// # Ok::<(), Box<dyn std::error::Error>>(()) 880 /// ``` 881 #[inline] 882 pub fn is_utf8(&self) -> bool { 883 self.0.utf8 884 } 885 886 /// Returns true when this NFA is meant to be matched in reverse. 887 /// 888 /// Generally speaking, when this is true, it means the NFA is supposed to 889 /// be used in conjunction with moving backwards through the haystack. That 890 /// is, from a higher memory address to a lower memory address. 891 /// 892 /// It is often the case that lower level routines dealing with an NFA 893 /// don't need to care about whether it is "meant" to be matched in reverse 894 /// or not. However, there are some specific cases where it matters. For 895 /// example, the implementation of CRLF-aware `^` and `$` line anchors 896 /// needs to know whether the search is in the forward or reverse 897 /// direction. In the forward direction, neither `^` nor `$` should match 898 /// when a `\r` has been seen previously and a `\n` is next. However, in 899 /// the reverse direction, neither `^` nor `$` should match when a `\n` 900 /// has been seen previously and a `\r` is next. This fundamentally changes 901 /// how the state machine is constructed, and thus needs to be altered 902 /// based on the direction of the search. 903 /// 904 /// This is automatically set when using a [`Compiler`] with a configuration 905 /// where [`Config::reverse`] is enabled. If you're building your own NFA 906 /// by hand via a [`Builder`], then you are responsible for marking it as reversed yourself as part of construction. 907 #[inline] 908 pub fn is_reverse(&self) -> bool { 909 self.0.reverse 910 } 911 912 /// Returns true if and only if all starting states for this NFA correspond 913 /// to the beginning of an anchored search. 914 /// 915 /// Typically, an NFA will have both an anchored and an unanchored starting 916 /// state. This is because it tends to be useful to have both, and the cost 917 /// of having an unanchored starting state is almost zero (for an NFA). 918 /// However, if all patterns in the NFA are themselves anchored, then even 919 /// the unanchored starting state will correspond to an anchored search 920 /// since the patterns don't permit anything else. 921 /// 922 /// # Example 923 /// 924 /// This example shows a few different scenarios where this method's 925 /// return value varies.
926 /// 927 /// ``` 928 /// use regex_automata::nfa::thompson::NFA; 929 /// 930 /// // The unanchored starting state permits matching this pattern anywhere 931 /// // in a haystack, instead of just at the beginning. 932 /// let nfa = NFA::new("a")?; 933 /// assert!(!nfa.is_always_start_anchored()); 934 /// 935 /// // In this case, the pattern is itself anchored, so there is no way 936 /// // to run an unanchored search. 937 /// let nfa = NFA::new("^a")?; 938 /// assert!(nfa.is_always_start_anchored()); 939 /// 940 /// // When multiline mode is enabled, '^' can match at the start of a line 941 /// // in addition to the start of a haystack, so an unanchored search is 942 /// // actually possible. 943 /// let nfa = NFA::new("(?m)^a")?; 944 /// assert!(!nfa.is_always_start_anchored()); 945 /// 946 /// // Weird cases also work. A pattern is only considered anchored if all 947 /// // matches may only occur at the start of a haystack. 948 /// let nfa = NFA::new("(^a)|a")?; 949 /// assert!(!nfa.is_always_start_anchored()); 950 /// 951 /// // When multiple patterns are present, if they are all anchored, then 952 /// // the NFA is always anchored too. 953 /// let nfa = NFA::new_many(&["^a", "^b", "^c"])?; 954 /// assert!(nfa.is_always_start_anchored()); 955 /// 956 /// // But if one pattern is unanchored, then the NFA must permit an 957 /// // unanchored search. 958 /// let nfa = NFA::new_many(&["^a", "b", "^c"])?; 959 /// assert!(!nfa.is_always_start_anchored()); 960 /// 961 /// # Ok::<(), Box<dyn std::error::Error>>(()) 962 /// ``` 963 #[inline] is_always_start_anchored(&self) -> bool964 pub fn is_always_start_anchored(&self) -> bool { 965 self.start_anchored() == self.start_unanchored() 966 } 967 968 /// Returns the look-around matcher associated with this NFA. 969 /// 970 /// A look-around matcher determines how to match look-around assertions. 971 /// In particular, some assertions are configurable. For example, the 972 /// `(?m:^)` and `(?m:$)` assertions can have their line terminator changed 973 /// from the default of `\n` to any other byte. 974 /// 975 /// If the NFA was built using a [`Compiler`], then this matcher 976 /// can be set via the [`Config::look_matcher`] configuration 977 /// knob. Otherwise, if you've built an NFA by hand, it is set via 978 /// [`Builder::set_look_matcher`]. 979 /// 980 /// # Example 981 /// 982 /// This shows how to change the line terminator for multi-line assertions. 983 /// 984 /// ``` 985 /// use regex_automata::{ 986 /// nfa::thompson::{self, pikevm::PikeVM}, 987 /// util::look::LookMatcher, 988 /// Match, Input, 989 /// }; 990 /// 991 /// let mut lookm = LookMatcher::new(); 992 /// lookm.set_line_terminator(b'\x00'); 993 /// 994 /// let re = PikeVM::builder() 995 /// .thompson(thompson::Config::new().look_matcher(lookm)) 996 /// .build(r"(?m)^[a-z]+$")?; 997 /// let mut cache = re.create_cache(); 998 /// 999 /// // Multi-line assertions now use NUL as a terminator. 1000 /// assert_eq!( 1001 /// Some(Match::must(0, 1..4)), 1002 /// re.find(&mut cache, b"\x00abc\x00"), 1003 /// ); 1004 /// // ... and \n is no longer recognized as a terminator. 1005 /// assert_eq!( 1006 /// None, 1007 /// re.find(&mut cache, b"\nabc\n"), 1008 /// ); 1009 /// 1010 /// # Ok::<(), Box<dyn std::error::Error>>(()) 1011 /// ``` 1012 #[inline] look_matcher(&self) -> &LookMatcher1013 pub fn look_matcher(&self) -> &LookMatcher { 1014 &self.0.look_matcher 1015 } 1016 1017 /// Returns the union of all look-around assertions used throughout this 1018 /// NFA. 
When the returned set is empty, it implies that the NFA has no 1019 /// look-around assertions and thus zero conditional epsilon transitions. 1020 /// 1021 /// This is useful in some cases enabling optimizations. It is not 1022 /// unusual, for example, for optimizations to be of the form, "for any 1023 /// regex with zero conditional epsilon transitions, do ..." where "..." 1024 /// is some kind of optimization. 1025 /// 1026 /// This isn't only helpful for optimizations either. Sometimes look-around 1027 /// assertions are difficult to support. For example, many of the DFAs in 1028 /// this crate don't support Unicode word boundaries or handle them using 1029 /// heuristics. Handling that correctly typically requires some kind of 1030 /// cheap check of whether the NFA has a Unicode word boundary in the first 1031 /// place. 1032 /// 1033 /// # Example 1034 /// 1035 /// This example shows how this routine varies based on the regex pattern: 1036 /// 1037 /// ``` 1038 /// use regex_automata::{nfa::thompson::NFA, util::look::Look}; 1039 /// 1040 /// // No look-around at all. 1041 /// let nfa = NFA::new("a")?; 1042 /// assert!(nfa.look_set_any().is_empty()); 1043 /// 1044 /// // When multiple patterns are present, since this returns the union, 1045 /// // it will include look-around assertions that only appear in one 1046 /// // pattern. 1047 /// let nfa = NFA::new_many(&["a", "b", "a^b", "c"])?; 1048 /// assert!(nfa.look_set_any().contains(Look::Start)); 1049 /// 1050 /// // Some groups of assertions have various shortcuts. For example: 1051 /// let nfa = NFA::new(r"(?-u:\b)")?; 1052 /// assert!(nfa.look_set_any().contains_word()); 1053 /// assert!(!nfa.look_set_any().contains_word_unicode()); 1054 /// assert!(nfa.look_set_any().contains_word_ascii()); 1055 /// 1056 /// # Ok::<(), Box<dyn std::error::Error>>(()) 1057 /// ``` 1058 #[inline] look_set_any(&self) -> LookSet1059 pub fn look_set_any(&self) -> LookSet { 1060 self.0.look_set_any 1061 } 1062 1063 /// Returns the union of all prefix look-around assertions for every 1064 /// pattern in this NFA. When the returned set is empty, it implies none of 1065 /// the patterns require moving through a conditional epsilon transition 1066 /// before inspecting the first byte in the haystack. 1067 /// 1068 /// This can be useful for determining what kinds of assertions need to be 1069 /// satisfied at the beginning of a search. For example, typically DFAs 1070 /// in this crate will build a distinct starting state for each possible 1071 /// starting configuration that might result in look-around assertions 1072 /// being satisfied differently. However, if the set returned here is 1073 /// empty, then you know that the start state is invariant because there 1074 /// are no conditional epsilon transitions to consider. 1075 /// 1076 /// # Example 1077 /// 1078 /// This example shows how this routine varies based on the regex pattern: 1079 /// 1080 /// ``` 1081 /// use regex_automata::{nfa::thompson::NFA, util::look::Look}; 1082 /// 1083 /// // No look-around at all. 1084 /// let nfa = NFA::new("a")?; 1085 /// assert!(nfa.look_set_prefix_any().is_empty()); 1086 /// 1087 /// // When multiple patterns are present, since this returns the union, 1088 /// // it will include look-around assertions that only appear in one 1089 /// // pattern. But it will only include assertions that are in the prefix 1090 /// // of a pattern. For example, this includes '^' but not '$' even though 1091 /// // '$' does appear. 
1092 /// let nfa = NFA::new_many(&["a", "b", "^ab$", "c"])?; 1093 /// assert!(nfa.look_set_prefix_any().contains(Look::Start)); 1094 /// assert!(!nfa.look_set_prefix_any().contains(Look::End)); 1095 /// 1096 /// # Ok::<(), Box<dyn std::error::Error>>(()) 1097 /// ``` 1098 #[inline] look_set_prefix_any(&self) -> LookSet1099 pub fn look_set_prefix_any(&self) -> LookSet { 1100 self.0.look_set_prefix_any 1101 } 1102 1103 // FIXME: The `look_set_prefix_all` computation was not correct, and it 1104 // seemed a little tricky to fix it. Since I wasn't actually using it for 1105 // anything, I just decided to remove it in the run up to the regex 1.9 1106 // release. If you need this, please file an issue. 1107 /* 1108 /// Returns the intersection of all prefix look-around assertions for every 1109 /// pattern in this NFA. When the returned set is empty, it implies at 1110 /// least one of the patterns does not require moving through a conditional 1111 /// epsilon transition before inspecting the first byte in the haystack. 1112 /// Conversely, when the set contains an assertion, it implies that every 1113 /// pattern in the NFA also contains that assertion in its prefix. 1114 /// 1115 /// This can be useful for determining what kinds of assertions need to be 1116 /// satisfied at the beginning of a search. For example, if you know that 1117 /// [`Look::Start`] is in the prefix intersection set returned here, then 1118 /// you know that all searches, regardless of input configuration, will be 1119 /// anchored. 1120 /// 1121 /// # Example 1122 /// 1123 /// This example shows how this routine varies based on the regex pattern: 1124 /// 1125 /// ``` 1126 /// use regex_automata::{nfa::thompson::NFA, util::look::Look}; 1127 /// 1128 /// // No look-around at all. 1129 /// let nfa = NFA::new("a")?; 1130 /// assert!(nfa.look_set_prefix_all().is_empty()); 1131 /// 1132 /// // When multiple patterns are present, since this returns the 1133 /// // intersection, it will only include assertions present in every 1134 /// // prefix, and only the prefix. 1135 /// let nfa = NFA::new_many(&["^a$", "^b$", "$^ab$", "^c$"])?; 1136 /// assert!(nfa.look_set_prefix_all().contains(Look::Start)); 1137 /// assert!(!nfa.look_set_prefix_all().contains(Look::End)); 1138 /// 1139 /// # Ok::<(), Box<dyn std::error::Error>>(()) 1140 /// ``` 1141 #[inline] 1142 pub fn look_set_prefix_all(&self) -> LookSet { 1143 self.0.look_set_prefix_all 1144 } 1145 */ 1146 1147 /// Returns the memory usage, in bytes, of this NFA. 1148 /// 1149 /// This does **not** include the stack size used up by this NFA. To 1150 /// compute that, use `std::mem::size_of::<NFA>()`. 1151 /// 1152 /// # Example 1153 /// 1154 /// This example shows that large Unicode character classes can use quite 1155 /// a bit of memory. 
1156 /// 1157 /// ``` 1158 /// # if cfg!(miri) { return Ok(()); } // miri takes too long 1159 /// use regex_automata::nfa::thompson::NFA; 1160 /// 1161 /// let nfa_unicode = NFA::new(r"\w")?; 1162 /// let nfa_ascii = NFA::new(r"(?-u:\w)")?; 1163 /// 1164 /// assert!(10 * nfa_ascii.memory_usage() < nfa_unicode.memory_usage()); 1165 /// 1166 /// # Ok::<(), Box<dyn std::error::Error>>(()) 1167 /// ``` 1168 #[inline] memory_usage(&self) -> usize1169 pub fn memory_usage(&self) -> usize { 1170 use core::mem::size_of; 1171 1172 size_of::<Inner>() // allocated on the heap via Arc 1173 + self.0.states.len() * size_of::<State>() 1174 + self.0.start_pattern.len() * size_of::<StateID>() 1175 + self.0.group_info.memory_usage() 1176 + self.0.memory_extra 1177 } 1178 } 1179 1180 impl fmt::Debug for NFA { fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result1181 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 1182 self.0.fmt(f) 1183 } 1184 } 1185 1186 /// The "inner" part of the NFA. We split this part out so that we can easily 1187 /// wrap it in an `Arc` above in the definition of `NFA`. 1188 /// 1189 /// See builder.rs for the code that actually builds this type. This module 1190 /// does provide (internal) mutable methods for adding things to this 1191 /// NFA before finalizing it, but the high level construction process is 1192 /// controlled by the builder abstraction. (Which is complicated enough to 1193 /// get its own module.) 1194 #[derive(Default)] 1195 pub(super) struct Inner { 1196 /// The state sequence. This sequence is guaranteed to be indexable by all 1197 /// starting state IDs, and it is also guaranteed to contain at most one 1198 /// `Match` state for each pattern compiled into this NFA. (A pattern may 1199 /// not have a corresponding `Match` state if a `Match` state is impossible 1200 /// to reach.) 1201 states: Vec<State>, 1202 /// The anchored starting state of this NFA. 1203 start_anchored: StateID, 1204 /// The unanchored starting state of this NFA. 1205 start_unanchored: StateID, 1206 /// The starting states for each individual pattern. Starting at any 1207 /// of these states will result in only an anchored search for the 1208 /// corresponding pattern. The vec is indexed by pattern ID. When the NFA 1209 /// contains a single regex, then `start_pattern[0]` and `start_anchored` 1210 /// are always equivalent. 1211 start_pattern: Vec<StateID>, 1212 /// Info about the capturing groups in this NFA. This is responsible for 1213 /// mapping groups to slots, mapping groups to names and names to groups. 1214 group_info: GroupInfo, 1215 /// A representation of equivalence classes over the transitions in this 1216 /// NFA. Two bytes in the same equivalence class must not discriminate 1217 /// between a match or a non-match. This map can be used to shrink the 1218 /// total size of a DFA's transition table with a small match-time cost. 1219 /// 1220 /// Note that the NFA's transitions are *not* defined in terms of these 1221 /// equivalence classes. The NFA's transitions are defined on the original 1222 /// byte values. For the most part, this is because they wouldn't really 1223 /// help the NFA much since the NFA already uses a sparse representation 1224 /// to represent transitions. Byte classes are most effective in a dense 1225 /// representation. 1226 byte_class_set: ByteClassSet, 1227 /// This is generated from `byte_class_set`, and essentially represents the 1228 /// same thing but supports different access patterns. 
Namely, this permits 1229 /// looking up the equivalence class of a byte very cheaply. 1230 /// 1231 /// Ideally we would just store this, but because of annoying code 1232 /// structure reasons, we keep both this and `byte_class_set` around for 1233 /// now. I think I would prefer that `byte_class_set` were computed in the 1234 /// `Builder`, but right now, we compute it as states are added to the 1235 /// `NFA`. 1236 byte_classes: ByteClasses, 1237 /// Whether this NFA has a `Capture` state anywhere. 1238 has_capture: bool, 1239 /// When the empty string is in the language matched by this NFA. 1240 has_empty: bool, 1241 /// Whether UTF-8 mode is enabled for this NFA. Briefly, this means that 1242 /// all non-empty matches produced by this NFA correspond to spans of valid 1243 /// UTF-8, and any empty matches produced by this NFA that split a UTF-8 1244 /// encoded codepoint should be filtered out by the corresponding regex 1245 /// engine. 1246 utf8: bool, 1247 /// Whether this NFA is meant to be matched in reverse or not. 1248 reverse: bool, 1249 /// The matcher to be used for look-around assertions. 1250 look_matcher: LookMatcher, 1251 /// The union of all look-around assertions that occur anywhere within 1252 /// this NFA. If this set is empty, then it means there are precisely zero 1253 /// conditional epsilon transitions in the NFA. 1254 look_set_any: LookSet, 1255 /// The union of all look-around assertions that occur as a zero-length 1256 /// prefix for any of the patterns in this NFA. 1257 look_set_prefix_any: LookSet, 1258 /* 1259 /// The intersection of all look-around assertions that occur as a 1260 /// zero-length prefix for any of the patterns in this NFA. 1261 look_set_prefix_all: LookSet, 1262 */ 1263 /// Heap memory used indirectly by NFA states and other things (like the 1264 /// various capturing group representations above). Since each state 1265 /// might use a different amount of heap, we need to keep track of this 1266 /// incrementally. 1267 memory_extra: usize, 1268 } 1269 1270 impl Inner { 1271 /// Runs any last finalization bits and turns this into a full NFA. into_nfa(mut self) -> NFA1272 pub(super) fn into_nfa(mut self) -> NFA { 1273 self.byte_classes = self.byte_class_set.byte_classes(); 1274 // Do epsilon closure from the start state of every pattern in order 1275 // to compute various properties such as look-around assertions and 1276 // whether the empty string can be matched. 1277 let mut stack = vec![]; 1278 let mut seen = SparseSet::new(self.states.len()); 1279 for &start_id in self.start_pattern.iter() { 1280 stack.push(start_id); 1281 seen.clear(); 1282 // let mut prefix_all = LookSet::full(); 1283 let mut prefix_any = LookSet::empty(); 1284 while let Some(sid) = stack.pop() { 1285 if !seen.insert(sid) { 1286 continue; 1287 } 1288 match self.states[sid] { 1289 State::ByteRange { .. } 1290 | State::Dense { .. } 1291 | State::Fail => continue, 1292 State::Sparse(_) => { 1293 // This snippet below will rewrite this sparse state 1294 // as a dense state. By doing it here, we apply this 1295 // optimization to all hot "sparse" states since these 1296 // are the states that are reachable from the start 1297 // state via an epsilon closure. 1298 // 1299 // Unfortunately, this optimization did not seem to 1300 // help much in some very limited ad hoc benchmarking. 
1301 // 1302 // I left the 'Dense' state type in place in case we 1303 // want to revisit this, but I suspect the real way 1304 // to make forward progress is a more fundamental 1305 // rearchitecting of how data in the NFA is laid out. 1306 // I think we should consider a single contiguous 1307 // allocation instead of all this indirection and 1308 // potential heap allocations for every state. But this 1309 // is a large re-design and will require API breaking 1310 // changes. 1311 // self.memory_extra -= self.states[sid].memory_usage(); 1312 // let trans = DenseTransitions::from_sparse(sparse); 1313 // self.states[sid] = State::Dense(trans); 1314 // self.memory_extra += self.states[sid].memory_usage(); 1315 continue; 1316 } 1317 State::Match { .. } => self.has_empty = true, 1318 State::Look { look, next } => { 1319 prefix_any = prefix_any.insert(look); 1320 stack.push(next); 1321 } 1322 State::Union { ref alternates } => { 1323 // Order doesn't matter here, since we're just dealing 1324 // with look-around sets. But if we do richer analysis 1325 // here that needs to care about preference order, then 1326 // this should be done in reverse. 1327 stack.extend(alternates.iter()); 1328 } 1329 State::BinaryUnion { alt1, alt2 } => { 1330 stack.push(alt2); 1331 stack.push(alt1); 1332 } 1333 State::Capture { next, .. } => { 1334 stack.push(next); 1335 } 1336 } 1337 } 1338 self.look_set_prefix_any = 1339 self.look_set_prefix_any.union(prefix_any); 1340 } 1341 NFA(Arc::new(self)) 1342 } 1343 1344 /// Returns the capturing group info for this NFA. group_info(&self) -> &GroupInfo1345 pub(super) fn group_info(&self) -> &GroupInfo { 1346 &self.group_info 1347 } 1348 1349 /// Add the given state to this NFA after allocating a fresh identifier for 1350 /// it. 1351 /// 1352 /// This panics if too many states are added such that a fresh identifier 1353 /// could not be created. (Currently, the only caller of this routine is 1354 /// a `Builder`, and it upholds this invariant.) add(&mut self, state: State) -> StateID1355 pub(super) fn add(&mut self, state: State) -> StateID { 1356 match state { 1357 State::ByteRange { ref trans } => { 1358 self.byte_class_set.set_range(trans.start, trans.end); 1359 } 1360 State::Sparse(ref sparse) => { 1361 for trans in sparse.transitions.iter() { 1362 self.byte_class_set.set_range(trans.start, trans.end); 1363 } 1364 } 1365 State::Dense { .. } => unreachable!(), 1366 State::Look { look, .. } => { 1367 self.look_matcher 1368 .add_to_byteset(look, &mut self.byte_class_set); 1369 self.look_set_any = self.look_set_any.insert(look); 1370 } 1371 State::Capture { .. } => { 1372 self.has_capture = true; 1373 } 1374 State::Union { .. } 1375 | State::BinaryUnion { .. } 1376 | State::Fail 1377 | State::Match { .. } => {} 1378 } 1379 1380 let id = StateID::new(self.states.len()).unwrap(); 1381 self.memory_extra += state.memory_usage(); 1382 self.states.push(state); 1383 id 1384 } 1385 1386 /// Set the starting state identifiers for this NFA. 1387 /// 1388 /// `start_anchored` and `start_unanchored` may be equivalent. When they 1389 /// are, then the NFA can only execute anchored searches. This might 1390 /// occur, for example, for patterns that are unconditionally anchored. 1391 /// e.g., `^foo`. 
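The epsilon closure performed in `into_nfa` above is what populates `has_empty` and `look_set_prefix_any`, and its results are observable through the public accessors documented earlier in this module. A small sketch, assuming nothing beyond those accessors and the `Look` type from `util::look`; the patterns are arbitrary.

```
use regex_automata::{nfa::thompson::NFA, util::look::Look};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // A look-around assertion sits in this pattern's prefix, and the
    // pattern cannot match the empty string.
    let nfa = NFA::new(r"\bfoo")?;
    assert!(nfa.look_set_prefix_any().contains(Look::WordUnicode));
    assert!(!nfa.has_empty());

    // Here the closure reaches a match state without consuming any
    // input, and there are no look-around assertions at all.
    let nfa = NFA::new("a*")?;
    assert!(nfa.has_empty());
    assert!(nfa.look_set_prefix_any().is_empty());
    Ok(())
}
```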
set_starts( &mut self, start_anchored: StateID, start_unanchored: StateID, start_pattern: &[StateID], )1392 pub(super) fn set_starts( 1393 &mut self, 1394 start_anchored: StateID, 1395 start_unanchored: StateID, 1396 start_pattern: &[StateID], 1397 ) { 1398 self.start_anchored = start_anchored; 1399 self.start_unanchored = start_unanchored; 1400 self.start_pattern = start_pattern.to_vec(); 1401 } 1402 1403 /// Sets the UTF-8 mode of this NFA. set_utf8(&mut self, yes: bool)1404 pub(super) fn set_utf8(&mut self, yes: bool) { 1405 self.utf8 = yes; 1406 } 1407 1408 /// Sets the reverse mode of this NFA. set_reverse(&mut self, yes: bool)1409 pub(super) fn set_reverse(&mut self, yes: bool) { 1410 self.reverse = yes; 1411 } 1412 1413 /// Sets the look-around assertion matcher for this NFA. set_look_matcher(&mut self, m: LookMatcher)1414 pub(super) fn set_look_matcher(&mut self, m: LookMatcher) { 1415 self.look_matcher = m; 1416 } 1417 1418 /// Set the capturing groups for this NFA. 1419 /// 1420 /// The given slice should contain the capturing groups for each pattern. 1421 /// The capturing groups in turn should correspond to the total number of 1422 /// capturing groups in the pattern, including the anonymous first capture 1423 /// group for each pattern. If a capturing group does have a name, then it 1424 /// should be provided as an `Arc<str>`. 1425 /// 1426 /// This returns an error if a corresponding `GroupInfo` could not be 1427 /// built. set_captures( &mut self, captures: &[Vec<Option<Arc<str>>>], ) -> Result<(), GroupInfoError>1428 pub(super) fn set_captures( 1429 &mut self, 1430 captures: &[Vec<Option<Arc<str>>>], 1431 ) -> Result<(), GroupInfoError> { 1432 self.group_info = GroupInfo::new( 1433 captures.iter().map(|x| x.iter().map(|y| y.as_ref())), 1434 )?; 1435 Ok(()) 1436 } 1437 1438 /// Remap the transitions in every state of this NFA using the given map. 1439 /// The given map should be indexed according to the state ID namespace used 1440 /// by the transitions of the states currently in this NFA. 1441 /// 1442 /// This is particularly useful to the NFA builder, since it is convenient 1443 /// to add NFA states in order to produce their final IDs. Then, after all 1444 /// of the intermediate "empty" states (unconditional epsilon transitions) 1445 /// have been removed from the builder's representation, we can re-map all 1446 /// of the transitions in the states already added to their final IDs.
remap(&mut self, old_to_new: &[StateID])1447 pub(super) fn remap(&mut self, old_to_new: &[StateID]) { 1448 for state in &mut self.states { 1449 state.remap(old_to_new); 1450 } 1451 self.start_anchored = old_to_new[self.start_anchored]; 1452 self.start_unanchored = old_to_new[self.start_unanchored]; 1453 for id in self.start_pattern.iter_mut() { 1454 *id = old_to_new[*id]; 1455 } 1456 } 1457 } 1458 1459 impl fmt::Debug for Inner { fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result1460 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 1461 writeln!(f, "thompson::NFA(")?; 1462 for (sid, state) in self.states.iter().with_state_ids() { 1463 let status = if sid == self.start_anchored { 1464 '^' 1465 } else if sid == self.start_unanchored { 1466 '>' 1467 } else { 1468 ' ' 1469 }; 1470 writeln!(f, "{}{:06?}: {:?}", status, sid.as_usize(), state)?; 1471 } 1472 let pattern_len = self.start_pattern.len(); 1473 if pattern_len > 1 { 1474 writeln!(f, "")?; 1475 for pid in 0..pattern_len { 1476 let sid = self.start_pattern[pid]; 1477 writeln!(f, "START({:06?}): {:?}", pid, sid.as_usize())?; 1478 } 1479 } 1480 writeln!(f, "")?; 1481 writeln!( 1482 f, 1483 "transition equivalence classes: {:?}", 1484 self.byte_classes, 1485 )?; 1486 writeln!(f, ")")?; 1487 Ok(()) 1488 } 1489 } 1490 1491 /// A state in an NFA. 1492 /// 1493 /// In theory, it can help to conceptualize an `NFA` as a graph consisting of 1494 /// `State`s. Each `State` contains its complete set of outgoing transitions. 1495 /// 1496 /// In practice, it can help to conceptualize an `NFA` as a sequence of 1497 /// instructions for a virtual machine. Each `State` says what to do and where 1498 /// to go next. 1499 /// 1500 /// Strictly speaking, the practical interpretation is the most correct one, 1501 /// because of the [`Capture`](State::Capture) state. Namely, a `Capture` 1502 /// state always forwards execution to another state unconditionally. Its only 1503 /// purpose is to cause a side effect: the recording of the current input 1504 /// position at a particular location in memory. In this sense, an `NFA` 1505 /// has more power than a theoretical non-deterministic finite automaton. 1506 /// 1507 /// For most uses of this crate, it is likely that one may never even need to 1508 /// be aware of this type at all. The main use cases for looking at `State`s 1509 /// directly are if you need to write your own search implementation or if you 1510 /// need to do some kind of analysis on the NFA. 1511 #[derive(Clone, Eq, PartialEq)] 1512 pub enum State { 1513 /// A state with a single transition that can only be taken if the current 1514 /// input symbol is in a particular range of bytes. 1515 ByteRange { 1516 /// The transition from this state to the next. 1517 trans: Transition, 1518 }, 1519 /// A state with possibly many transitions represented in a sparse fashion. 1520 /// Transitions are non-overlapping and ordered lexicographically by input 1521 /// range. 1522 /// 1523 /// In practice, this is used for encoding UTF-8 automata. Its presence is 1524 /// primarily an optimization that avoids many additional unconditional 1525 /// epsilon transitions (via [`Union`](State::Union) states), and thus 1526 /// decreases the overhead of traversing the NFA. This can improve both 1527 /// matching time and DFA construction time. 1528 Sparse(SparseTransitions), 1529 /// A dense representation of a state with multiple transitions. 
1530 Dense(DenseTransitions), 1531 /// A conditional epsilon transition satisfied via some sort of 1532 /// look-around. Look-around is limited to anchor and word boundary 1533 /// assertions. 1534 /// 1535 /// Look-around states are meant to be evaluated while performing epsilon 1536 /// closure (computing the set of states reachable from a particular state 1537 /// via only epsilon transitions). If the current position in the haystack 1538 /// satisfies the look-around assertion, then you're permitted to follow 1539 /// that epsilon transition. 1540 Look { 1541 /// The look-around assertion that must be satisfied before moving 1542 /// to `next`. 1543 look: Look, 1544 /// The state to transition to if the look-around assertion is 1545 /// satisfied. 1546 next: StateID, 1547 }, 1548 /// An alternation such that there exists an epsilon transition to all 1549 /// states in `alternates`, where matches found via earlier transitions 1550 /// are preferred over later transitions. 1551 Union { 1552 /// An ordered sequence of unconditional epsilon transitions to other 1553 /// states. Transitions earlier in the sequence are preferred over 1554 /// transitions later in the sequence. 1555 alternates: Box<[StateID]>, 1556 }, 1557 /// An alternation such that there exists precisely two unconditional 1558 /// epsilon transitions, where matches found via `alt1` are preferred over 1559 /// matches found via `alt2`. 1560 /// 1561 /// This state exists as a common special case of Union where there are 1562 /// only two alternates. In this case, we don't need any allocations to 1563 /// represent the state. This saves a bit of memory and also saves an 1564 /// additional memory access when traversing the NFA. 1565 BinaryUnion { 1566 /// An unconditional epsilon transition to another NFA state. This 1567 /// is preferred over `alt2`. 1568 alt1: StateID, 1569 /// An unconditional epsilon transition to another NFA state. Matches 1570 /// reported via this transition should only be reported if no matches 1571 /// were found by following `alt1`. 1572 alt2: StateID, 1573 }, 1574 /// An empty state that records a capture location. 1575 /// 1576 /// From the perspective of finite automata, this is precisely equivalent 1577 /// to an unconditional epsilon transition, but serves the purpose of 1578 /// instructing NFA simulations to record additional state when the finite 1579 /// state machine passes through this epsilon transition. 1580 /// 1581 /// `slot` in this context refers to the specific capture group slot 1582 /// offset that is being recorded. Each capturing group has two slots 1583 /// corresponding to the start and end of the matching portion of that 1584 /// group. 1585 /// 1586 /// The pattern ID and capture group index are also included in this state 1587 /// in case they are useful. But mostly, all you'll need is `next` and 1588 /// `slot`. 1589 Capture { 1590 /// The state to transition to, unconditionally. 1591 next: StateID, 1592 /// The pattern ID that this capture belongs to. 1593 pattern_id: PatternID, 1594 /// The capture group index that this capture belongs to. Capture group 1595 /// indices are local to each pattern. For example, when capturing 1596 /// groups are enabled, every pattern has a capture group at index 1597 /// `0`. 1598 group_index: SmallIndex, 1599 /// The slot index for this capture. Every capturing group has two 1600 /// slots: one for the start haystack offset and one for the end 1601 /// haystack offset. 
Unlike capture group indices, slot indices are 1602 /// global across all patterns in this NFA. That is, each slot belongs 1603 /// to a single pattern, but there is only one slot at index `i`. 1604 slot: SmallIndex, 1605 }, 1606 /// A state that cannot be transitioned out of. This is useful for cases 1607 /// where you want to prevent matching from occurring. For example, if your 1608 /// regex parser permits empty character classes, then you could choose 1609 /// a `Fail` state to represent them. (An empty character class can be 1610 /// thought of as an empty set. Since nothing is in an empty set, it can 1611 /// never match anything.) 1612 Fail, 1613 /// A match state. There is at least one such occurrence of this state for 1614 /// each pattern in this NFA that can match. 1615 Match { 1616 /// The matching pattern ID. 1617 pattern_id: PatternID, 1618 }, 1619 } 1620 1621 impl State { 1622 /// Returns true if and only if this state contains one or more epsilon 1623 /// transitions. 1624 /// 1625 /// In practice, a state has no outgoing transitions (like `Match`), has 1626 /// only non-epsilon transitions (like `ByteRange`) or has only epsilon 1627 /// transitions (like `Union`). 1628 /// 1629 /// # Example 1630 /// 1631 /// ``` 1632 /// use regex_automata::{ 1633 /// nfa::thompson::{State, Transition}, 1634 /// util::primitives::{PatternID, StateID, SmallIndex}, 1635 /// }; 1636 /// 1637 /// // Capture states are epsilon transitions. 1638 /// let state = State::Capture { 1639 /// next: StateID::ZERO, 1640 /// pattern_id: PatternID::ZERO, 1641 /// group_index: SmallIndex::ZERO, 1642 /// slot: SmallIndex::ZERO, 1643 /// }; 1644 /// assert!(state.is_epsilon()); 1645 /// 1646 /// // ByteRange states are not. 1647 /// let state = State::ByteRange { 1648 /// trans: Transition { start: b'a', end: b'z', next: StateID::ZERO }, 1649 /// }; 1650 /// assert!(!state.is_epsilon()); 1651 /// 1652 /// # Ok::<(), Box<dyn std::error::Error>>(()) 1653 /// ``` 1654 #[inline] is_epsilon(&self) -> bool1655 pub fn is_epsilon(&self) -> bool { 1656 match *self { 1657 State::ByteRange { .. } 1658 | State::Sparse { .. } 1659 | State::Dense { .. } 1660 | State::Fail 1661 | State::Match { .. } => false, 1662 State::Look { .. } 1663 | State::Union { .. } 1664 | State::BinaryUnion { .. } 1665 | State::Capture { .. } => true, 1666 } 1667 } 1668 1669 /// Returns the heap memory usage of this NFA state in bytes. memory_usage(&self) -> usize1670 fn memory_usage(&self) -> usize { 1671 match *self { 1672 State::ByteRange { .. } 1673 | State::Look { .. } 1674 | State::BinaryUnion { .. } 1675 | State::Capture { .. } 1676 | State::Match { .. } 1677 | State::Fail => 0, 1678 State::Sparse(SparseTransitions { ref transitions }) => { 1679 transitions.len() * mem::size_of::<Transition>() 1680 } 1681 State::Dense { .. } => 256 * mem::size_of::<StateID>(), 1682 State::Union { ref alternates } => { 1683 alternates.len() * mem::size_of::<StateID>() 1684 } 1685 } 1686 } 1687 1688 /// Remap the transitions in this state using the given map. Namely, the 1689 /// given map should be indexed according to the transitions currently 1690 /// in this state. 1691 /// 1692 /// This is used during the final phase of the NFA compiler, which turns 1693 /// its intermediate NFA into the final NFA.
remap(&mut self, remap: &[StateID])1694 fn remap(&mut self, remap: &[StateID]) { 1695 match *self { 1696 State::ByteRange { ref mut trans } => { 1697 trans.next = remap[trans.next] 1698 } 1699 State::Sparse(SparseTransitions { ref mut transitions }) => { 1700 for t in transitions.iter_mut() { 1701 t.next = remap[t.next]; 1702 } 1703 } 1704 State::Dense(DenseTransitions { ref mut transitions }) => { 1705 for sid in transitions.iter_mut() { 1706 *sid = remap[*sid]; 1707 } 1708 } 1709 State::Look { ref mut next, .. } => *next = remap[*next], 1710 State::Union { ref mut alternates } => { 1711 for alt in alternates.iter_mut() { 1712 *alt = remap[*alt]; 1713 } 1714 } 1715 State::BinaryUnion { ref mut alt1, ref mut alt2 } => { 1716 *alt1 = remap[*alt1]; 1717 *alt2 = remap[*alt2]; 1718 } 1719 State::Capture { ref mut next, .. } => *next = remap[*next], 1720 State::Fail => {} 1721 State::Match { .. } => {} 1722 } 1723 } 1724 } 1725 1726 impl fmt::Debug for State { fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result1727 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 1728 match *self { 1729 State::ByteRange { ref trans } => trans.fmt(f), 1730 State::Sparse(SparseTransitions { ref transitions }) => { 1731 let rs = transitions 1732 .iter() 1733 .map(|t| format!("{:?}", t)) 1734 .collect::<Vec<String>>() 1735 .join(", "); 1736 write!(f, "sparse({})", rs) 1737 } 1738 State::Dense(ref dense) => { 1739 write!(f, "dense(")?; 1740 for (i, t) in dense.iter().enumerate() { 1741 if i > 0 { 1742 write!(f, ", ")?; 1743 } 1744 write!(f, "{:?}", t)?; 1745 } 1746 write!(f, ")") 1747 } 1748 State::Look { ref look, next } => { 1749 write!(f, "{:?} => {:?}", look, next.as_usize()) 1750 } 1751 State::Union { ref alternates } => { 1752 let alts = alternates 1753 .iter() 1754 .map(|id| format!("{:?}", id.as_usize())) 1755 .collect::<Vec<String>>() 1756 .join(", "); 1757 write!(f, "union({})", alts) 1758 } 1759 State::BinaryUnion { alt1, alt2 } => { 1760 write!( 1761 f, 1762 "binary-union({}, {})", 1763 alt1.as_usize(), 1764 alt2.as_usize() 1765 ) 1766 } 1767 State::Capture { next, pattern_id, group_index, slot } => { 1768 write!( 1769 f, 1770 "capture(pid={:?}, group={:?}, slot={:?}) => {:?}", 1771 pattern_id.as_usize(), 1772 group_index.as_usize(), 1773 slot.as_usize(), 1774 next.as_usize(), 1775 ) 1776 } 1777 State::Fail => write!(f, "FAIL"), 1778 State::Match { pattern_id } => { 1779 write!(f, "MATCH({:?})", pattern_id.as_usize()) 1780 } 1781 } 1782 } 1783 } 1784 1785 /// A sequence of transitions used to represent a sparse state. 1786 /// 1787 /// This is the primary representation of a [`Sparse`](State::Sparse) state. 1788 /// It corresponds to a sorted sequence of transitions with non-overlapping 1789 /// byte ranges. If the byte at the current position in the haystack matches 1790 /// one of the byte ranges, then the finite state machine should take the 1791 /// corresponding transition. 1792 #[derive(Clone, Debug, Eq, PartialEq)] 1793 pub struct SparseTransitions { 1794 /// The sorted sequence of non-overlapping transitions. 1795 pub transitions: Box<[Transition]>, 1796 } 1797 1798 impl SparseTransitions { 1799 /// This follows the matching transition for a particular byte. 1800 /// 1801 /// The matching transition is found by looking for a matching byte 1802 /// range (there is at most one) corresponding to the position `at` in 1803 /// `haystack`. 1804 /// 1805 /// If `at >= haystack.len()`, then this returns `None`. 
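///
/// # Example
///
/// A small sketch using hand-rolled transitions (the byte ranges and target
/// state IDs here are arbitrary; in practice, sparse states are produced by
/// the NFA compiler):
///
/// ```
/// use regex_automata::{
///     nfa::thompson::{SparseTransitions, Transition},
///     util::primitives::StateID,
/// };
///
/// let sparse = SparseTransitions {
///     transitions: vec![
///         Transition { start: b'a', end: b'f', next: StateID::ZERO },
///         Transition { start: b'x', end: b'z', next: StateID::ZERO },
///     ]
///     .into_boxed_slice(),
/// };
/// // 'c' falls in the first byte range.
/// assert_eq!(Some(StateID::ZERO), sparse.matches(b"cat", 0));
/// // 'q' falls in neither byte range.
/// assert_eq!(None, sparse.matches(b"quux", 0));
/// // An out of bounds position never matches.
/// assert_eq!(None, sparse.matches(b"cat", 3));
/// ```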
#[inline] matches(&self, haystack: &[u8], at: usize) -> Option<StateID>1807 pub fn matches(&self, haystack: &[u8], at: usize) -> Option<StateID> { 1808 haystack.get(at).and_then(|&b| self.matches_byte(b)) 1809 } 1810 1811 /// This follows the matching transition for any member of the alphabet. 1812 /// 1813 /// The matching transition is found by looking for a matching byte 1814 /// range (there is at most one) corresponding to the given alphabet 1815 /// unit. If the given alphabet unit is [`EOI`](alphabet::Unit::eoi), 1816 /// then this always returns `None`. 1817 #[inline] matches_unit( &self, unit: alphabet::Unit, ) -> Option<StateID>1818 pub(crate) fn matches_unit( 1819 &self, 1820 unit: alphabet::Unit, 1821 ) -> Option<StateID> { 1822 unit.as_u8().map_or(None, |byte| self.matches_byte(byte)) 1823 } 1824 1825 /// This follows the matching transition for a particular byte. 1826 /// 1827 /// The matching transition is found by looking for a matching byte range 1828 /// (there is at most one) corresponding to the byte given. 1829 #[inline] matches_byte(&self, byte: u8) -> Option<StateID>1830 pub fn matches_byte(&self, byte: u8) -> Option<StateID> { 1831 for t in self.transitions.iter() { 1832 if t.start > byte { 1833 break; 1834 } else if t.matches_byte(byte) { 1835 return Some(t.next); 1836 } 1837 } 1838 None 1839 1840 /* 1841 // This is an alternative implementation that uses binary search. In 1842 // some ad hoc experiments, like 1843 // 1844 // regex-cli find match pikevm -b -p '\b\w+\b' non-ascii-file 1845 // 1846 // I could not observe any improvement, and in fact, things seemed to 1847 // be a bit slower. I can see an improvement in at least one benchmark: 1848 // 1849 // regex-cli find match pikevm -b -p '\pL{100}' all-codepoints-utf8 1850 // 1851 // Where total search time goes from 3.2s to 2.4s when using binary 1852 // search. 1853 self.transitions 1854 .binary_search_by(|t| { 1855 if t.end < byte { 1856 core::cmp::Ordering::Less 1857 } else if t.start > byte { 1858 core::cmp::Ordering::Greater 1859 } else { 1860 core::cmp::Ordering::Equal 1861 } 1862 }) 1863 .ok() 1864 .map(|i| self.transitions[i].next) 1865 */ 1866 } 1867 } 1868 1869 /// A sequence of transitions used to represent a dense state. 1870 /// 1871 /// This is the primary representation of a [`Dense`](State::Dense) state. It 1872 /// provides constant time matching. That is, given a byte in a haystack and 1873 /// a `DenseTransitions`, one can determine if the state matches in constant 1874 /// time. 1875 /// 1876 /// This is in contrast to `SparseTransitions`, whose time complexity is 1877 /// necessarily worse than constant time. Also in contrast, `DenseTransitions` 1878 /// usually requires (much) more heap memory. 1879 #[derive(Clone, Debug, Eq, PartialEq)] 1880 pub struct DenseTransitions { 1881 /// A dense representation of this state's transitions on the heap. This 1882 /// always has length 256. 1883 pub transitions: Box<[StateID]>, 1884 } 1885 1886 impl DenseTransitions { 1887 /// This follows the matching transition for a particular byte. 1888 /// 1889 /// The matching transition is found by looking for a transition that 1890 /// doesn't correspond to `StateID::ZERO` for the byte at position `at` 1891 /// in `haystack`. 1892 /// 1893 /// If `at >= haystack.len()`, then this returns `None`.
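///
/// # Example
///
/// A small sketch that builds a dense state by hand (in practice, dense
/// states are produced by the NFA compiler, and the target state ID below
/// is arbitrary):
///
/// ```
/// use regex_automata::{
///     nfa::thompson::DenseTransitions,
///     util::primitives::StateID,
/// };
///
/// // `StateID::ZERO` means "no transition," so start with none and then
/// // map the bytes 'a' through 'z' to some (arbitrary) state.
/// let mut transitions = vec![StateID::ZERO; 256];
/// let next = StateID::must(1);
/// for byte in b'a'..=b'z' {
///     transitions[usize::from(byte)] = next;
/// }
/// let dense = DenseTransitions {
///     transitions: transitions.into_boxed_slice(),
/// };
///
/// assert_eq!(Some(next), dense.matches(b"m", 0));
/// // '5' has no transition.
/// assert_eq!(None, dense.matches(b"5", 0));
/// // An out of bounds position never matches.
/// assert_eq!(None, dense.matches(b"m", 1));
/// ```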
#[inline] matches(&self, haystack: &[u8], at: usize) -> Option<StateID>1895 pub fn matches(&self, haystack: &[u8], at: usize) -> Option<StateID> { 1896 haystack.get(at).and_then(|&b| self.matches_byte(b)) 1897 } 1898 1899 /// This follows the matching transition for any member of the alphabet. 1900 /// 1901 /// The matching transition is found by looking for a transition that 1902 /// doesn't correspond to `StateID::ZERO` for the byte represented by the 1903 /// given alphabet unit. 1904 /// 1905 /// If the given alphabet unit is 1906 /// [`EOI`](alphabet::Unit::eoi), then this returns `None`. 1907 #[inline] matches_unit( &self, unit: alphabet::Unit, ) -> Option<StateID>1908 pub(crate) fn matches_unit( 1909 &self, 1910 unit: alphabet::Unit, 1911 ) -> Option<StateID> { 1912 unit.as_u8().map_or(None, |byte| self.matches_byte(byte)) 1913 } 1914 1915 /// This follows the matching transition for a particular byte. 1916 /// 1917 /// The matching transition is found by looking for a transition that 1918 /// doesn't correspond to `StateID::ZERO` for the given `byte`. 1919 /// 1920 /// If the transition for the given `byte` is `StateID::ZERO`, then this returns `None`. 1921 #[inline] matches_byte(&self, byte: u8) -> Option<StateID>1922 pub fn matches_byte(&self, byte: u8) -> Option<StateID> { 1923 let next = self.transitions[usize::from(byte)]; 1924 if next == StateID::ZERO { 1925 None 1926 } else { 1927 Some(next) 1928 } 1929 } 1930 1931 /* 1932 /// The dense state optimization isn't currently enabled, so permit a 1933 /// little bit of dead code. 1934 pub(crate) fn from_sparse(sparse: &SparseTransitions) -> DenseTransitions { 1935 let mut dense = vec![StateID::ZERO; 256]; 1936 for t in sparse.transitions.iter() { 1937 for b in t.start..=t.end { 1938 dense[usize::from(b)] = t.next; 1939 } 1940 } 1941 DenseTransitions { transitions: dense.into_boxed_slice() } 1942 } 1943 */ 1944 1945 /// Returns an iterator over all transitions that don't point to 1946 /// `StateID::ZERO`. iter(&self) -> impl Iterator<Item = Transition> + '_1947 pub(crate) fn iter(&self) -> impl Iterator<Item = Transition> + '_ { 1948 use crate::util::int::Usize; 1949 self.transitions 1950 .iter() 1951 .enumerate() 1952 .filter(|&(_, &sid)| sid != StateID::ZERO) 1953 .map(|(byte, &next)| Transition { 1954 start: byte.as_u8(), 1955 end: byte.as_u8(), 1956 next, 1957 }) 1958 } 1959 } 1960 1961 /// A single transition to another state. 1962 /// 1963 /// This transition may only be followed if the current byte in the haystack 1964 /// falls in the inclusive range of bytes specified. 1965 #[derive(Clone, Copy, Eq, Hash, PartialEq)] 1966 pub struct Transition { 1967 /// The inclusive start of the byte range. 1968 pub start: u8, 1969 /// The inclusive end of the byte range. 1970 pub end: u8, 1971 /// The identifier of the state to transition to. 1972 pub next: StateID, 1973 } 1974 1975 impl Transition { 1976 /// Returns true if the byte at position `at` in `haystack` falls in this 1977 /// transition's range of bytes. 1978 /// 1979 /// If `at >= haystack.len()`, then this returns `false`. matches(&self, haystack: &[u8], at: usize) -> bool1980 pub fn matches(&self, haystack: &[u8], at: usize) -> bool { 1981 haystack.get(at).map_or(false, |&b| self.matches_byte(b)) 1982 } 1983 1984 /// Returns true if the given alphabet unit falls in this transition's 1985 /// range of bytes. If the given unit is [`EOI`](alphabet::Unit::eoi), then 1986 /// this returns `false`.
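///
/// # Example
///
/// A small sketch (the byte range is arbitrary; this assumes the
/// `alphabet::Unit::u8` constructor for building a unit from a byte):
///
/// ```
/// use regex_automata::{
///     nfa::thompson::Transition,
///     util::{alphabet::Unit, primitives::StateID},
/// };
///
/// let t = Transition { start: b'a', end: b'z', next: StateID::ZERO };
/// // A unit holding a byte inside the range matches...
/// assert!(t.matches_unit(Unit::u8(b'm')));
/// // ...while a unit holding a byte outside the range does not.
/// assert!(!t.matches_unit(Unit::u8(b'A')));
/// ```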
matches_unit(&self, unit: alphabet::Unit) -> bool1987 pub fn matches_unit(&self, unit: alphabet::Unit) -> bool { 1988 unit.as_u8().map_or(false, |byte| self.matches_byte(byte)) 1989 } 1990 1991 /// Returns true if the given byte falls in this transition's range of 1992 /// bytes. matches_byte(&self, byte: u8) -> bool1993 pub fn matches_byte(&self, byte: u8) -> bool { 1994 self.start <= byte && byte <= self.end 1995 } 1996 } 1997 1998 impl fmt::Debug for Transition { fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result1999 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 2000 use crate::util::escape::DebugByte; 2001 2002 let Transition { start, end, next } = *self; 2003 if self.start == self.end { 2004 write!(f, "{:?} => {:?}", DebugByte(start), next.as_usize()) 2005 } else { 2006 write!( 2007 f, 2008 "{:?}-{:?} => {:?}", 2009 DebugByte(start), 2010 DebugByte(end), 2011 next.as_usize(), 2012 ) 2013 } 2014 } 2015 } 2016 2017 /// An iterator over all pattern IDs in an NFA. 2018 /// 2019 /// This iterator is created by [`NFA::patterns`]. 2020 /// 2021 /// The lifetime parameter `'a` refers to the lifetime of the NFA from which 2022 /// this pattern iterator was created. 2023 #[derive(Debug)] 2024 pub struct PatternIter<'a> { 2025 it: PatternIDIter, 2026 /// We explicitly associate a lifetime with this iterator even though we 2027 /// don't actually borrow anything from the NFA. We do this for backward 2028 /// compatibility purposes. If we ever do need to borrow something from 2029 /// the NFA, then we can and just get rid of this marker without breaking 2030 /// the public API. 2031 _marker: core::marker::PhantomData<&'a ()>, 2032 } 2033 2034 impl<'a> Iterator for PatternIter<'a> { 2035 type Item = PatternID; 2036 next(&mut self) -> Option<PatternID>2037 fn next(&mut self) -> Option<PatternID> { 2038 self.it.next() 2039 } 2040 } 2041 2042 #[cfg(all(test, feature = "nfa-pikevm"))] 2043 mod tests { 2044 use super::*; 2045 use crate::{nfa::thompson::pikevm::PikeVM, Input}; 2046 2047 // This asserts that an NFA state doesn't have its size changed. It is 2048 // *really* easy to accidentally increase the size, and thus potentially 2049 // dramatically increase the memory usage of every NFA. 2050 // 2051 // This assert doesn't mean we absolutely cannot increase the size of an 2052 // NFA state. We can. It's just here to make sure we do it knowingly and 2053 // intentionally. 
2054 #[test] state_has_small_size()2055 fn state_has_small_size() { 2056 #[cfg(target_pointer_width = "64")] 2057 assert_eq!(24, core::mem::size_of::<State>()); 2058 #[cfg(target_pointer_width = "32")] 2059 assert_eq!(20, core::mem::size_of::<State>()); 2060 } 2061 2062 #[test] always_match()2063 fn always_match() { 2064 let re = PikeVM::new_from_nfa(NFA::always_match()).unwrap(); 2065 let mut cache = re.create_cache(); 2066 let mut caps = re.create_captures(); 2067 let mut find = |haystack, start, end| { 2068 let input = Input::new(haystack).range(start..end); 2069 re.search(&mut cache, &input, &mut caps); 2070 caps.get_match().map(|m| m.end()) 2071 }; 2072 2073 assert_eq!(Some(0), find("", 0, 0)); 2074 assert_eq!(Some(0), find("a", 0, 1)); 2075 assert_eq!(Some(1), find("a", 1, 1)); 2076 assert_eq!(Some(0), find("ab", 0, 2)); 2077 assert_eq!(Some(1), find("ab", 1, 2)); 2078 assert_eq!(Some(2), find("ab", 2, 2)); 2079 } 2080 2081 #[test] never_match()2082 fn never_match() { 2083 let re = PikeVM::new_from_nfa(NFA::never_match()).unwrap(); 2084 let mut cache = re.create_cache(); 2085 let mut caps = re.create_captures(); 2086 let mut find = |haystack, start, end| { 2087 let input = Input::new(haystack).range(start..end); 2088 re.search(&mut cache, &input, &mut caps); 2089 caps.get_match().map(|m| m.end()) 2090 }; 2091 2092 assert_eq!(None, find("", 0, 0)); 2093 assert_eq!(None, find("a", 0, 1)); 2094 assert_eq!(None, find("a", 1, 1)); 2095 assert_eq!(None, find("ab", 0, 2)); 2096 assert_eq!(None, find("ab", 1, 2)); 2097 assert_eq!(None, find("ab", 2, 2)); 2098 } 2099 } 2100