#![allow(warnings)]

// This module defines an internal builder that encapsulates all interaction
// with meta::Regex construction, and then 4 public API builders that wrap
// around it. The docs are essentially repeated on each of the 4 public
// builders, with tweaks to the examples as needed.
//
// The reason why there are so many builders is partially because of a misstep
// in the initial API design: the builder constructor takes in the pattern
// strings instead of using the `build` method to accept the pattern strings.
// This means `new` has a different signature for each builder. It probably
// would have been nicer to use one builder with `fn new()`, and then add
// `build(pat)` and `build_many(pats)` constructors.
//
// The other reason is because I think the `bytes` module should probably
// have its own builder type. That way, it is completely isolated from the
// top-level API.
//
// If I could do it again, I'd probably have a `regex::Builder` and a
// `regex::bytes::Builder`. Each would have `build` and `build_set` (or
// `build_many`) methods for constructing a single pattern `Regex` and a
// multi-pattern `RegexSet`, respectively.

use alloc::{
    string::{String, ToString},
    sync::Arc,
    vec,
    vec::Vec,
};

use regex_automata::{
    meta, nfa::thompson::WhichCaptures, util::syntax, MatchKind,
};

use crate::error::Error;

/// A builder for constructing a `Regex`, `bytes::Regex`, `RegexSet` or a
/// `bytes::RegexSet`.
///
/// This is essentially the implementation of the four different builder types
/// in the public API: `RegexBuilder`, `bytes::RegexBuilder`, `RegexSetBuilder`
/// and `bytes::RegexSetBuilder`.
#[derive(Clone, Debug)]
struct Builder {
    /// The pattern strings to compile, in the order they were given.
    pats: Vec<String>,
    /// The configuration passed to `meta::Builder::configure` at build time.
    metac: meta::Config,
    /// The configuration passed to `meta::Builder::syntax` at build time.
    syntaxc: syntax::Config,
}

impl Default for Builder {
    /// Returns a builder with no patterns, a default syntax configuration,
    /// an NFA size limit of `10 * (1 << 20)` and a hybrid cache capacity of
    /// `2 * (1 << 20)`.
    fn default() -> Builder {
        let metac = meta::Config::new()
            .nfa_size_limit(Some(10 * (1 << 20)))
            .hybrid_cache_capacity(2 * (1 << 20));
        Builder { pats: vec![], metac, syntaxc: syntax::Config::default() }
    }
}

impl Builder {
    /// Creates a builder with the default configuration that holds an owned
    /// copy of every pattern yielded by `patterns`.
    fn new<I, S>(patterns: I) -> Builder
    where
        S: AsRef<str>,
        I: IntoIterator<Item = S>,
    {
        let mut b = Builder::default();
        b.pats.extend(patterns.into_iter().map(|p| p.as_ref().to_string()));
        b
    }

    /// Builds a `crate::Regex` from the single pattern held by this builder.
    ///
    /// The stored configurations are cloned and then adjusted to use
    /// leftmost-first match semantics with `utf8_empty` and the syntax
    /// `utf8` option both enabled.
    ///
    /// # Panics
    ///
    /// Panics if this builder does not hold exactly one pattern.
    ///
    /// # Errors
    ///
    /// Any error reported by `meta::Builder::build` is converted with
    /// `Error::from_meta_build_error` and returned.
    fn build_one_string(&self) -> Result<crate::Regex, Error> {
        assert_eq!(1, self.pats.len());
        let metac = self
            .metac
            .clone()
            .match_kind(MatchKind::LeftmostFirst)
            .utf8_empty(true);
        let syntaxc = self.syntaxc.clone().utf8(true);
        let pattern = Arc::from(self.pats[0].as_str());
        meta::Builder::new()
            .configure(metac)
            .syntax(syntaxc)
            .build(&pattern)
            .map(|meta| crate::Regex { meta, pattern })
            .map_err(Error::from_meta_build_error)
    }

    /// Builds a `crate::bytes::Regex` from the single pattern held by this
    /// builder.
    ///
    /// This is the same as `build_one_string`, except that `utf8_empty` and
    /// the syntax `utf8` option are both disabled.
    ///
    /// # Panics
    ///
    /// Panics if this builder does not hold exactly one pattern.
    ///
    /// # Errors
    ///
    /// Any error reported by `meta::Builder::build` is converted with
    /// `Error::from_meta_build_error` and returned.
    fn build_one_bytes(&self) -> Result<crate::bytes::Regex, Error> {
        assert_eq!(1, self.pats.len());
        let metac = self
            .metac
            .clone()
            .match_kind(MatchKind::LeftmostFirst)
            .utf8_empty(false);
        let syntaxc = self.syntaxc.clone().utf8(false);
        let pattern = Arc::from(self.pats[0].as_str());
        meta::Builder::new()
            .configure(metac)
            .syntax(syntaxc)
            .build(&pattern)
            .map(|meta| crate::bytes::Regex { meta, pattern })
            .map_err(Error::from_meta_build_error)
    }

    /// Builds a `crate::RegexSet` from all of the patterns held by this
    /// builder.
    ///
    /// The stored configurations are cloned and then adjusted to use
    /// `MatchKind::All`, to compile no capture groups
    /// (`WhichCaptures::None`), and to enable both `utf8_empty` and the
    /// syntax `utf8` option.
    ///
    /// # Errors
    ///
    /// Any error reported by `meta::Builder::build_many` is converted with
    /// `Error::from_meta_build_error` and returned.
    fn build_many_string(&self) -> Result<crate::RegexSet, Error> {
        let metac = self
            .metac
            .clone()
            .match_kind(MatchKind::All)
            .utf8_empty(true)
            .which_captures(WhichCaptures::None);
        let syntaxc = self.syntaxc.clone().utf8(true);
        let patterns = Arc::from(self.pats.as_slice());
        meta::Builder::new()
            .configure(metac)
            .syntax(syntaxc)
            .build_many(&patterns)
            .map(|meta| crate::RegexSet { meta, patterns })
            .map_err(Error::from_meta_build_error)
    }

    /// Builds a `crate::bytes::RegexSet` from all of the patterns held by
    /// this builder.
    ///
    /// This is the same as `build_many_string`, except that `utf8_empty` and
    /// the syntax `utf8` option are both disabled.
    ///
    /// # Errors
    ///
    /// Any error reported by `meta::Builder::build_many` is converted with
    /// `Error::from_meta_build_error` and returned.
    fn build_many_bytes(&self) -> Result<crate::bytes::RegexSet, Error> {
        let metac = self
            .metac
            .clone()
            .match_kind(MatchKind::All)
            .utf8_empty(false)
            .which_captures(WhichCaptures::None);
        let syntaxc = self.syntaxc.clone().utf8(false);
        let patterns = Arc::from(self.pats.as_slice());
        meta::Builder::new()
            .configure(metac)
            .syntax(syntaxc)
            .build_many(&patterns)
            .map(|meta| crate::bytes::RegexSet { meta, patterns })
            .map_err(Error::from_meta_build_error)
    }

    /// Sets the `case_insensitive` option on the syntax configuration.
    fn case_insensitive(&mut self, yes: bool) -> &mut Builder {
        self.syntaxc = self.syntaxc.case_insensitive(yes);
        self
    }

    /// Sets the `multi_line` option on the syntax configuration.
    fn multi_line(&mut self, yes: bool) -> &mut Builder {
        self.syntaxc = self.syntaxc.multi_line(yes);
        self
    }

    /// Sets the `dot_matches_new_line` option on the syntax configuration.
    fn dot_matches_new_line(&mut self, yes: bool) -> &mut Builder {
        self.syntaxc = self.syntaxc.dot_matches_new_line(yes);
        self
    }

    /// Sets the `crlf` option on the syntax configuration.
    fn crlf(&mut self, yes: bool) -> &mut Builder {
        self.syntaxc = self.syntaxc.crlf(yes);
        self
    }

    /// Sets the line terminator byte.
    ///
    /// The byte is recorded on both the meta configuration and the syntax
    /// configuration so that the two stay in agreement.
    fn line_terminator(&mut self, byte: u8) -> &mut Builder {
        self.metac = self.metac.clone().line_terminator(byte);
        self.syntaxc = self.syntaxc.line_terminator(byte);
        self
    }

    /// Sets the `swap_greed` option on the syntax configuration.
    fn swap_greed(&mut self, yes: bool) -> &mut Builder {
        self.syntaxc = self.syntaxc.swap_greed(yes);
        self
    }

    /// Sets the `ignore_whitespace` option on the syntax configuration.
    fn ignore_whitespace(&mut self, yes: bool) -> &mut Builder {
        self.syntaxc = self.syntaxc.ignore_whitespace(yes);
        self
    }

    /// Sets the `unicode` option on the syntax configuration.
    fn unicode(&mut self, yes: bool) -> &mut Builder {
        self.syntaxc = self.syntaxc.unicode(yes);
        self
    }

    /// Sets the `octal` option on the syntax configuration.
    fn octal(&mut self, yes: bool) -> &mut Builder {
        self.syntaxc = self.syntaxc.octal(yes);
        self
    }

    /// Sets the NFA size limit on the meta configuration to `Some(limit)`.
    fn size_limit(&mut self, limit: usize) -> &mut Builder {
        self.metac = self.metac.clone().nfa_size_limit(Some(limit));
        self
    }

    /// Sets the hybrid cache capacity on the meta configuration to `limit`.
    fn dfa_size_limit(&mut self, limit: usize) -> &mut Builder {
        self.metac = self.metac.clone().hybrid_cache_capacity(limit);
        self
    }

    /// Sets the `nest_limit` option on the syntax configuration.
    fn nest_limit(&mut self, limit: u32) -> &mut Builder {
        self.syntaxc = self.syntaxc.nest_limit(limit);
        self
    }
}

pub(crate) mod string {
    use crate::{error::Error, Regex, RegexSet};

    use super::Builder;

    /// A configurable builder for a [`Regex`].
    ///
    /// This builder can be used to programmatically set flags such as `i`
    /// (case insensitive) and `x` (for verbose mode). This builder can also be
    /// used to configure things like the line terminator and a size limit on
    /// the compiled regular expression.
    #[derive(Clone, Debug)]
    pub struct RegexBuilder {
        builder: Builder,
    }

    impl RegexBuilder {
        /// Create a new builder with a default configuration for the given
        /// pattern.
219 /// 220 /// If the pattern is invalid or exceeds the configured size limits, 221 /// then an error will be returned when [`RegexBuilder::build`] is 222 /// called. new(pattern: &str) -> RegexBuilder223 pub fn new(pattern: &str) -> RegexBuilder { 224 RegexBuilder { builder: Builder::new([pattern]) } 225 } 226 227 /// Compiles the pattern given to `RegexBuilder::new` with the 228 /// configuration set on this builder. 229 /// 230 /// If the pattern isn't a valid regex or if a configured size limit 231 /// was exceeded, then an error is returned. build(&self) -> Result<Regex, Error>232 pub fn build(&self) -> Result<Regex, Error> { 233 self.builder.build_one_string() 234 } 235 236 /// This configures Unicode mode for the entire pattern. 237 /// 238 /// Enabling Unicode mode does a number of things: 239 /// 240 /// * Most fundamentally, it causes the fundamental atom of matching 241 /// to be a single codepoint. When Unicode mode is disabled, it's a 242 /// single byte. For example, when Unicode mode is enabled, `.` will 243 /// match `` once, where as it will match 4 times when Unicode mode 244 /// is disabled. (Since the UTF-8 encoding of `` is 4 bytes long.) 245 /// * Case insensitive matching uses Unicode simple case folding rules. 246 /// * Unicode character classes like `\p{Letter}` and `\p{Greek}` are 247 /// available. 248 /// * Perl character classes are Unicode aware. That is, `\w`, `\s` and 249 /// `\d`. 250 /// * The word boundary assertions, `\b` and `\B`, use the Unicode 251 /// definition of a word character. 252 /// 253 /// Note that if Unicode mode is disabled, then the regex will fail to 254 /// compile if it could match invalid UTF-8. For example, when Unicode 255 /// mode is disabled, then since `.` matches any byte (except for 256 /// `\n`), then it can match invalid UTF-8 and thus building a regex 257 /// from it will fail. Another example is `\w` and `\W`. 
Since `\w` can 258 /// only match ASCII bytes when Unicode mode is disabled, it's allowed. 259 /// But `\W` can match more than ASCII bytes, including invalid UTF-8, 260 /// and so it is not allowed. This restriction can be lifted only by 261 /// using a [`bytes::Regex`](crate::bytes::Regex). 262 /// 263 /// For more details on the Unicode support in this crate, see the 264 /// [Unicode section](crate#unicode) in this crate's top-level 265 /// documentation. 266 /// 267 /// The default for this is `true`. 268 /// 269 /// # Example 270 /// 271 /// ``` 272 /// use regex::RegexBuilder; 273 /// 274 /// let re = RegexBuilder::new(r"\w") 275 /// .unicode(false) 276 /// .build() 277 /// .unwrap(); 278 /// // Normally greek letters would be included in \w, but since 279 /// // Unicode mode is disabled, it only matches ASCII letters. 280 /// assert!(!re.is_match("δ")); 281 /// 282 /// let re = RegexBuilder::new(r"s") 283 /// .case_insensitive(true) 284 /// .unicode(false) 285 /// .build() 286 /// .unwrap(); 287 /// // Normally 'ſ' is included when searching for 's' case 288 /// // insensitively due to Unicode's simple case folding rules. But 289 /// // when Unicode mode is disabled, only ASCII case insensitive rules 290 /// // are used. 291 /// assert!(!re.is_match("ſ")); 292 /// ``` unicode(&mut self, yes: bool) -> &mut RegexBuilder293 pub fn unicode(&mut self, yes: bool) -> &mut RegexBuilder { 294 self.builder.unicode(yes); 295 self 296 } 297 298 /// This configures whether to enable case insensitive matching for the 299 /// entire pattern. 300 /// 301 /// This setting can also be configured using the inline flag `i` 302 /// in the pattern. For example, `(?i:foo)` matches `foo` case 303 /// insensitively while `(?-i:foo)` matches `foo` case sensitively. 304 /// 305 /// The default for this is `false`. 
306 /// 307 /// # Example 308 /// 309 /// ``` 310 /// use regex::RegexBuilder; 311 /// 312 /// let re = RegexBuilder::new(r"foo(?-i:bar)quux") 313 /// .case_insensitive(true) 314 /// .build() 315 /// .unwrap(); 316 /// assert!(re.is_match("FoObarQuUx")); 317 /// // Even though case insensitive matching is enabled in the builder, 318 /// // it can be locally disabled within the pattern. In this case, 319 /// // `bar` is matched case sensitively. 320 /// assert!(!re.is_match("fooBARquux")); 321 /// ``` case_insensitive(&mut self, yes: bool) -> &mut RegexBuilder322 pub fn case_insensitive(&mut self, yes: bool) -> &mut RegexBuilder { 323 self.builder.case_insensitive(yes); 324 self 325 } 326 327 /// This configures multi-line mode for the entire pattern. 328 /// 329 /// Enabling multi-line mode changes the behavior of the `^` and `$` 330 /// anchor assertions. Instead of only matching at the beginning and 331 /// end of a haystack, respectively, multi-line mode causes them to 332 /// match at the beginning and end of a line *in addition* to the 333 /// beginning and end of a haystack. More precisely, `^` will match at 334 /// the position immediately following a `\n` and `$` will match at the 335 /// position immediately preceding a `\n`. 336 /// 337 /// The behavior of this option can be impacted by other settings too: 338 /// 339 /// * The [`RegexBuilder::line_terminator`] option changes `\n` above 340 /// to any ASCII byte. 341 /// * The [`RegexBuilder::crlf`] option changes the line terminator to 342 /// be either `\r` or `\n`, but never at the position between a `\r` 343 /// and `\n`. 344 /// 345 /// This setting can also be configured using the inline flag `m` in 346 /// the pattern. 347 /// 348 /// The default for this is `false`. 
349 /// 350 /// # Example 351 /// 352 /// ``` 353 /// use regex::RegexBuilder; 354 /// 355 /// let re = RegexBuilder::new(r"^foo$") 356 /// .multi_line(true) 357 /// .build() 358 /// .unwrap(); 359 /// assert_eq!(Some(1..4), re.find("\nfoo\n").map(|m| m.range())); 360 /// ``` multi_line(&mut self, yes: bool) -> &mut RegexBuilder361 pub fn multi_line(&mut self, yes: bool) -> &mut RegexBuilder { 362 self.builder.multi_line(yes); 363 self 364 } 365 366 /// This configures dot-matches-new-line mode for the entire pattern. 367 /// 368 /// Perhaps surprisingly, the default behavior for `.` is not to match 369 /// any character, but rather, to match any character except for the 370 /// line terminator (which is `\n` by default). When this mode is 371 /// enabled, the behavior changes such that `.` truly matches any 372 /// character. 373 /// 374 /// This setting can also be configured using the inline flag `s` in 375 /// the pattern. For example, `(?s:.)` and `\p{any}` are equivalent 376 /// regexes. 377 /// 378 /// The default for this is `false`. 379 /// 380 /// # Example 381 /// 382 /// ``` 383 /// use regex::RegexBuilder; 384 /// 385 /// let re = RegexBuilder::new(r"foo.bar") 386 /// .dot_matches_new_line(true) 387 /// .build() 388 /// .unwrap(); 389 /// let hay = "foo\nbar"; 390 /// assert_eq!(Some("foo\nbar"), re.find(hay).map(|m| m.as_str())); 391 /// ``` dot_matches_new_line( &mut self, yes: bool, ) -> &mut RegexBuilder392 pub fn dot_matches_new_line( 393 &mut self, 394 yes: bool, 395 ) -> &mut RegexBuilder { 396 self.builder.dot_matches_new_line(yes); 397 self 398 } 399 400 /// This configures CRLF mode for the entire pattern. 401 /// 402 /// When CRLF mode is enabled, both `\r` ("carriage return" or CR for 403 /// short) and `\n` ("line feed" or LF for short) are treated as line 404 /// terminators. This results in the following: 405 /// 406 /// * Unless dot-matches-new-line mode is enabled, `.` will now match 407 /// any character except for `\n` and `\r`. 
408 /// * When multi-line mode is enabled, `^` will match immediately 409 /// following a `\n` or a `\r`. Similarly, `$` will match immediately 410 /// preceding a `\n` or a `\r`. Neither `^` nor `$` will ever match 411 /// between `\r` and `\n`. 412 /// 413 /// This setting can also be configured using the inline flag `R` in 414 /// the pattern. 415 /// 416 /// The default for this is `false`. 417 /// 418 /// # Example 419 /// 420 /// ``` 421 /// use regex::RegexBuilder; 422 /// 423 /// let re = RegexBuilder::new(r"^foo$") 424 /// .multi_line(true) 425 /// .crlf(true) 426 /// .build() 427 /// .unwrap(); 428 /// let hay = "\r\nfoo\r\n"; 429 /// // If CRLF mode weren't enabled here, then '$' wouldn't match 430 /// // immediately after 'foo', and thus no match would be found. 431 /// assert_eq!(Some("foo"), re.find(hay).map(|m| m.as_str())); 432 /// ``` 433 /// 434 /// This example demonstrates that `^` will never match at a position 435 /// between `\r` and `\n`. (`$` will similarly not match between a `\r` 436 /// and a `\n`.) 437 /// 438 /// ``` 439 /// use regex::RegexBuilder; 440 /// 441 /// let re = RegexBuilder::new(r"^") 442 /// .multi_line(true) 443 /// .crlf(true) 444 /// .build() 445 /// .unwrap(); 446 /// let hay = "\r\n\r\n"; 447 /// let ranges: Vec<_> = re.find_iter(hay).map(|m| m.range()).collect(); 448 /// assert_eq!(ranges, vec![0..0, 2..2, 4..4]); 449 /// ``` crlf(&mut self, yes: bool) -> &mut RegexBuilder450 pub fn crlf(&mut self, yes: bool) -> &mut RegexBuilder { 451 self.builder.crlf(yes); 452 self 453 } 454 455 /// Configures the line terminator to be used by the regex. 456 /// 457 /// The line terminator is relevant in two ways for a particular regex: 458 /// 459 /// * When dot-matches-new-line mode is *not* enabled (the default), 460 /// then `.` will match any character except for the configured line 461 /// terminator. 
462 /// * When multi-line mode is enabled (not the default), then `^` and 463 /// `$` will match immediately after and before, respectively, a line 464 /// terminator. 465 /// 466 /// In both cases, if CRLF mode is enabled in a particular context, 467 /// then it takes precedence over any configured line terminator. 468 /// 469 /// This option cannot be configured from within the pattern. 470 /// 471 /// The default line terminator is `\n`. 472 /// 473 /// # Example 474 /// 475 /// This shows how to treat the NUL byte as a line terminator. This can 476 /// be a useful heuristic when searching binary data. 477 /// 478 /// ``` 479 /// use regex::RegexBuilder; 480 /// 481 /// let re = RegexBuilder::new(r"^foo$") 482 /// .multi_line(true) 483 /// .line_terminator(b'\x00') 484 /// .build() 485 /// .unwrap(); 486 /// let hay = "\x00foo\x00"; 487 /// assert_eq!(Some(1..4), re.find(hay).map(|m| m.range())); 488 /// ``` 489 /// 490 /// This example shows that the behavior of `.` is impacted by this 491 /// setting as well: 492 /// 493 /// ``` 494 /// use regex::RegexBuilder; 495 /// 496 /// let re = RegexBuilder::new(r".") 497 /// .line_terminator(b'\x00') 498 /// .build() 499 /// .unwrap(); 500 /// assert!(re.is_match("\n")); 501 /// assert!(!re.is_match("\x00")); 502 /// ``` 503 /// 504 /// This shows that building a regex will fail if the byte given 505 /// is not ASCII and the pattern could result in matching invalid 506 /// UTF-8. This is because any singular non-ASCII byte is not valid 507 /// UTF-8, and it is not permitted for a [`Regex`] to match invalid 508 /// UTF-8. (It is permissible to use a non-ASCII byte when building a 509 /// [`bytes::Regex`](crate::bytes::Regex).) 510 /// 511 /// ``` 512 /// use regex::RegexBuilder; 513 /// 514 /// assert!(RegexBuilder::new(r".").line_terminator(0x80).build().is_err()); 515 /// // Note that using a non-ASCII byte isn't enough on its own to 516 /// // cause regex compilation to fail. 
You actually have to make use 517 /// // of it in the regex in a way that leads to matching invalid 518 /// // UTF-8. If you don't, then regex compilation will succeed! 519 /// assert!(RegexBuilder::new(r"a").line_terminator(0x80).build().is_ok()); 520 /// ``` line_terminator(&mut self, byte: u8) -> &mut RegexBuilder521 pub fn line_terminator(&mut self, byte: u8) -> &mut RegexBuilder { 522 self.builder.line_terminator(byte); 523 self 524 } 525 526 /// This configures swap-greed mode for the entire pattern. 527 /// 528 /// When swap-greed mode is enabled, patterns like `a+` will become 529 /// non-greedy and patterns like `a+?` will become greedy. In other 530 /// words, the meanings of `a+` and `a+?` are switched. 531 /// 532 /// This setting can also be configured using the inline flag `U` in 533 /// the pattern. 534 /// 535 /// The default for this is `false`. 536 /// 537 /// # Example 538 /// 539 /// ``` 540 /// use regex::RegexBuilder; 541 /// 542 /// let re = RegexBuilder::new(r"a+") 543 /// .swap_greed(true) 544 /// .build() 545 /// .unwrap(); 546 /// assert_eq!(Some("a"), re.find("aaa").map(|m| m.as_str())); 547 /// ``` swap_greed(&mut self, yes: bool) -> &mut RegexBuilder548 pub fn swap_greed(&mut self, yes: bool) -> &mut RegexBuilder { 549 self.builder.swap_greed(yes); 550 self 551 } 552 553 /// This configures verbose mode for the entire pattern. 554 /// 555 /// When enabled, whitespace will treated as insignifcant in the 556 /// pattern and `#` can be used to start a comment until the next new 557 /// line. 558 /// 559 /// Normally, in most places in a pattern, whitespace is treated 560 /// literally. For example ` +` will match one or more ASCII whitespace 561 /// characters. 562 /// 563 /// When verbose mode is enabled, `\#` can be used to match a literal 564 /// `#` and `\ ` can be used to match a literal ASCII whitespace 565 /// character. 566 /// 567 /// Verbose mode is useful for permitting regexes to be formatted and 568 /// broken up more nicely. 
This may make them more easily readable. 569 /// 570 /// This setting can also be configured using the inline flag `x` in 571 /// the pattern. 572 /// 573 /// The default for this is `false`. 574 /// 575 /// # Example 576 /// 577 /// ``` 578 /// use regex::RegexBuilder; 579 /// 580 /// let pat = r" 581 /// \b 582 /// (?<first>\p{Uppercase}\w*) # always start with uppercase letter 583 /// [\s--\n]+ # whitespace should separate names 584 /// (?: # middle name can be an initial! 585 /// (?:(?<initial>\p{Uppercase})\.|(?<middle>\p{Uppercase}\w*)) 586 /// [\s--\n]+ 587 /// )? 588 /// (?<last>\p{Uppercase}\w*) 589 /// \b 590 /// "; 591 /// let re = RegexBuilder::new(pat) 592 /// .ignore_whitespace(true) 593 /// .build() 594 /// .unwrap(); 595 /// 596 /// let caps = re.captures("Harry Potter").unwrap(); 597 /// assert_eq!("Harry", &caps["first"]); 598 /// assert_eq!("Potter", &caps["last"]); 599 /// 600 /// let caps = re.captures("Harry J. Potter").unwrap(); 601 /// assert_eq!("Harry", &caps["first"]); 602 /// // Since a middle name/initial isn't required for an overall match, 603 /// // we can't assume that 'initial' or 'middle' will be populated! 604 /// assert_eq!(Some("J"), caps.name("initial").map(|m| m.as_str())); 605 /// assert_eq!(None, caps.name("middle").map(|m| m.as_str())); 606 /// assert_eq!("Potter", &caps["last"]); 607 /// 608 /// let caps = re.captures("Harry James Potter").unwrap(); 609 /// assert_eq!("Harry", &caps["first"]); 610 /// // Since a middle name/initial isn't required for an overall match, 611 /// // we can't assume that 'initial' or 'middle' will be populated! 
612 /// assert_eq!(None, caps.name("initial").map(|m| m.as_str())); 613 /// assert_eq!(Some("James"), caps.name("middle").map(|m| m.as_str())); 614 /// assert_eq!("Potter", &caps["last"]); 615 /// ``` ignore_whitespace(&mut self, yes: bool) -> &mut RegexBuilder616 pub fn ignore_whitespace(&mut self, yes: bool) -> &mut RegexBuilder { 617 self.builder.ignore_whitespace(yes); 618 self 619 } 620 621 /// This configures octal mode for the entire pattern. 622 /// 623 /// Octal syntax is a little-known way of uttering Unicode codepoints 624 /// in a pattern. For example, `a`, `\x61`, `\u0061` and `\141` are all 625 /// equivalent patterns, where the last example shows octal syntax. 626 /// 627 /// While supporting octal syntax isn't in and of itself a problem, 628 /// it does make good error messages harder. That is, in PCRE based 629 /// regex engines, syntax like `\1` invokes a backreference, which is 630 /// explicitly unsupported this library. However, many users expect 631 /// backreferences to be supported. Therefore, when octal support 632 /// is disabled, the error message will explicitly mention that 633 /// backreferences aren't supported. 634 /// 635 /// The default for this is `false`. 636 /// 637 /// # Example 638 /// 639 /// ``` 640 /// use regex::RegexBuilder; 641 /// 642 /// // Normally this pattern would not compile, with an error message 643 /// // about backreferences not being supported. But with octal mode 644 /// // enabled, octal escape sequences work. 645 /// let re = RegexBuilder::new(r"\141") 646 /// .octal(true) 647 /// .build() 648 /// .unwrap(); 649 /// assert!(re.is_match("a")); 650 /// ``` octal(&mut self, yes: bool) -> &mut RegexBuilder651 pub fn octal(&mut self, yes: bool) -> &mut RegexBuilder { 652 self.builder.octal(yes); 653 self 654 } 655 656 /// Sets the approximate size limit, in bytes, of the compiled regex. 657 /// 658 /// This roughly corresponds to the number of heap memory, in 659 /// bytes, occupied by a single regex. 
If the regex would otherwise 660 /// approximately exceed this limit, then compiling that regex will 661 /// fail. 662 /// 663 /// The main utility of a method like this is to avoid compiling 664 /// regexes that use an unexpected amount of resources, such as 665 /// time and memory. Even if the memory usage of a large regex is 666 /// acceptable, its search time may not be. Namely, worst case time 667 /// complexity for search is `O(m * n)`, where `m ~ len(pattern)` and 668 /// `n ~ len(haystack)`. That is, search time depends, in part, on the 669 /// size of the compiled regex. This means that putting a limit on the 670 /// size of the regex limits how much a regex can impact search time. 671 /// 672 /// For more information about regex size limits, see the section on 673 /// [untrusted inputs](crate#untrusted-input) in the top-level crate 674 /// documentation. 675 /// 676 /// The default for this is some reasonable number that permits most 677 /// patterns to compile successfully. 678 /// 679 /// # Example 680 /// 681 /// ``` 682 /// # if !cfg!(target_pointer_width = "64") { return; } // see #1041 683 /// use regex::RegexBuilder; 684 /// 685 /// // It may surprise you how big some seemingly small patterns can 686 /// // be! Since \w is Unicode aware, this generates a regex that can 687 /// // match approximately 140,000 distinct codepoints. 688 /// assert!(RegexBuilder::new(r"\w").size_limit(45_000).build().is_err()); 689 /// ``` size_limit(&mut self, bytes: usize) -> &mut RegexBuilder690 pub fn size_limit(&mut self, bytes: usize) -> &mut RegexBuilder { 691 self.builder.size_limit(bytes); 692 self 693 } 694 695 /// Set the approximate capacity, in bytes, of the cache of transitions 696 /// used by the lazy DFA. 697 /// 698 /// While the lazy DFA isn't always used, in tends to be the most 699 /// commonly use regex engine in default configurations. 
It tends to 700 /// adopt the performance profile of a fully build DFA, but without the 701 /// downside of taking worst case exponential time to build. 702 /// 703 /// The downside is that it needs to keep a cache of transitions and 704 /// states that are built while running a search, and this cache 705 /// can fill up. When it fills up, the cache will reset itself. Any 706 /// previously generated states and transitions will then need to be 707 /// re-generated. If this happens too many times, then this library 708 /// will bail out of using the lazy DFA and switch to a different regex 709 /// engine. 710 /// 711 /// If your regex provokes this particular downside of the lazy DFA, 712 /// then it may be beneficial to increase its cache capacity. This will 713 /// potentially reduce the frequency of cache resetting (ideally to 714 /// `0`). While it won't fix all potential performance problems with 715 /// the lazy DFA, increasing the cache capacity does fix some. 716 /// 717 /// There is no easy way to determine, a priori, whether increasing 718 /// this cache capacity will help. In general, the larger your regex, 719 /// the more cache it's likely to use. But that isn't an ironclad rule. 720 /// For example, a regex like `[01]*1[01]{N}` would normally produce a 721 /// fully build DFA that is exponential in size with respect to `N`. 722 /// The lazy DFA will prevent exponential space blow-up, but it cache 723 /// is likely to fill up, even when it's large and even for smallish 724 /// values of `N`. 725 /// 726 /// If you aren't sure whether this helps or not, it is sensible to 727 /// set this to some arbitrarily large number in testing, such as 728 /// `usize::MAX`. Namely, this represents the amount of capacity that 729 /// *may* be used. It's probably not a good idea to use `usize::MAX` in 730 /// production though, since it implies there are no controls on heap 731 /// memory used by this library during a search. 
In effect, set it to 732 /// whatever you're willing to allocate for a single regex search. dfa_size_limit(&mut self, bytes: usize) -> &mut RegexBuilder733 pub fn dfa_size_limit(&mut self, bytes: usize) -> &mut RegexBuilder { 734 self.builder.dfa_size_limit(bytes); 735 self 736 } 737 738 /// Set the nesting limit for this parser. 739 /// 740 /// The nesting limit controls how deep the abstract syntax tree is 741 /// allowed to be. If the AST exceeds the given limit (e.g., with too 742 /// many nested groups), then an error is returned by the parser. 743 /// 744 /// The purpose of this limit is to act as a heuristic to prevent stack 745 /// overflow for consumers that do structural induction on an AST using 746 /// explicit recursion. While this crate never does this (instead using 747 /// constant stack space and moving the call stack to the heap), other 748 /// crates may. 749 /// 750 /// This limit is not checked until the entire AST is parsed. 751 /// Therefore, if callers want to put a limit on the amount of heap 752 /// space used, then they should impose a limit on the length, in 753 /// bytes, of the concrete pattern string. In particular, this is 754 /// viable since this parser implementation will limit itself to heap 755 /// space proportional to the length of the pattern string. See also 756 /// the [untrusted inputs](crate#untrusted-input) section in the 757 /// top-level crate documentation for more information about this. 758 /// 759 /// Note that a nest limit of `0` will return a nest limit error for 760 /// most patterns but not all. For example, a nest limit of `0` permits 761 /// `a` but not `ab`, since `ab` requires an explicit concatenation, 762 /// which results in a nest depth of `1`. In general, a nest limit is 763 /// not something that manifests in an obvious way in the concrete 764 /// syntax, therefore, it should not be used in a granular way. 
765 /// 766 /// # Example 767 /// 768 /// ``` 769 /// use regex::RegexBuilder; 770 /// 771 /// assert!(RegexBuilder::new(r"a").nest_limit(0).build().is_ok()); 772 /// assert!(RegexBuilder::new(r"ab").nest_limit(0).build().is_err()); 773 /// ``` nest_limit(&mut self, limit: u32) -> &mut RegexBuilder774 pub fn nest_limit(&mut self, limit: u32) -> &mut RegexBuilder { 775 self.builder.nest_limit(limit); 776 self 777 } 778 } 779 780 /// A configurable builder for a [`RegexSet`]. 781 /// 782 /// This builder can be used to programmatically set flags such as 783 /// `i` (case insensitive) and `x` (for verbose mode). This builder 784 /// can also be used to configure things like the line terminator 785 /// and a size limit on the compiled regular expression. 786 #[derive(Clone, Debug)] 787 pub struct RegexSetBuilder { 788 builder: Builder, 789 } 790 791 impl RegexSetBuilder { 792 /// Create a new builder with a default configuration for the given 793 /// patterns. 794 /// 795 /// If the patterns are invalid or exceed the configured size limits, 796 /// then an error will be returned when [`RegexSetBuilder::build`] is 797 /// called. new<I, S>(patterns: I) -> RegexSetBuilder where I: IntoIterator<Item = S>, S: AsRef<str>,798 pub fn new<I, S>(patterns: I) -> RegexSetBuilder 799 where 800 I: IntoIterator<Item = S>, 801 S: AsRef<str>, 802 { 803 RegexSetBuilder { builder: Builder::new(patterns) } 804 } 805 806 /// Compiles the patterns given to `RegexSetBuilder::new` with the 807 /// configuration set on this builder. 808 /// 809 /// If the patterns aren't valid regexes or if a configured size limit 810 /// was exceeded, then an error is returned. build(&self) -> Result<RegexSet, Error>811 pub fn build(&self) -> Result<RegexSet, Error> { 812 self.builder.build_many_string() 813 } 814 815 /// This configures Unicode mode for the all of the patterns. 
///
        /// Enabling Unicode mode does a number of things:
        ///
        /// * Most fundamentally, it causes the fundamental atom of matching
        /// to be a single codepoint. When Unicode mode is disabled, it's a
        /// single byte. For example, when Unicode mode is enabled, `.` will
        /// match `💩` once, whereas it will match 4 times when Unicode mode
        /// is disabled. (Since the UTF-8 encoding of `💩` is 4 bytes long.)
        /// * Case insensitive matching uses Unicode simple case folding rules.
        /// * Unicode character classes like `\p{Letter}` and `\p{Greek}` are
        /// available.
        /// * Perl character classes are Unicode aware. That is, `\w`, `\s` and
        /// `\d`.
        /// * The word boundary assertions, `\b` and `\B`, use the Unicode
        /// definition of a word character.
        ///
        /// Note that if Unicode mode is disabled, then the regex will fail to
        /// compile if it could match invalid UTF-8. For example, when Unicode
        /// mode is disabled, then since `.` matches any byte (except for
        /// `\n`), then it can match invalid UTF-8 and thus building a regex
        /// from it will fail. Another example is `\w` and `\W`. Since `\w` can
        /// only match ASCII bytes when Unicode mode is disabled, it's allowed.
        /// But `\W` can match more than ASCII bytes, including invalid UTF-8,
        /// and so it is not allowed. This restriction can be lifted only by
        /// using a [`bytes::RegexSet`](crate::bytes::RegexSet).
        ///
        /// For more details on the Unicode support in this crate, see the
        /// [Unicode section](crate#unicode) in this crate's top-level
        /// documentation.
        ///
        /// The default for this is `true`.
        ///
        /// # Example
        ///
        /// ```
        /// use regex::RegexSetBuilder;
        ///
        /// let re = RegexSetBuilder::new([r"\w"])
        ///     .unicode(false)
        ///     .build()
        ///     .unwrap();
        /// // Normally Greek letters would be included in \w, but since
        /// // Unicode mode is disabled, it only matches ASCII letters.
        /// assert!(!re.is_match("δ"));
        ///
        /// let re = RegexSetBuilder::new([r"s"])
        ///     .case_insensitive(true)
        ///     .unicode(false)
        ///     .build()
        ///     .unwrap();
        /// // Normally 'ſ' is included when searching for 's' case
        /// // insensitively due to Unicode's simple case folding rules. But
        /// // when Unicode mode is disabled, only ASCII case insensitive rules
        /// // are used.
        /// assert!(!re.is_match("ſ"));
        /// ```
        pub fn unicode(&mut self, yes: bool) -> &mut RegexSetBuilder {
            self.builder.unicode(yes);
            self
        }

        /// This configures whether to enable case insensitive matching for all
        /// of the patterns.
        ///
        /// This setting can also be configured using the inline flag `i`
        /// in the pattern. For example, `(?i:foo)` matches `foo` case
        /// insensitively while `(?-i:foo)` matches `foo` case sensitively.
        ///
        /// The default for this is `false`.
        ///
        /// # Example
        ///
        /// ```
        /// use regex::RegexSetBuilder;
        ///
        /// let re = RegexSetBuilder::new([r"foo(?-i:bar)quux"])
        ///     .case_insensitive(true)
        ///     .build()
        ///     .unwrap();
        /// assert!(re.is_match("FoObarQuUx"));
        /// // Even though case insensitive matching is enabled in the builder,
        /// // it can be locally disabled within the pattern. In this case,
        /// // `bar` is matched case sensitively.
/// assert!(!re.is_match("fooBARquux"));
        /// ```
        pub fn case_insensitive(&mut self, yes: bool) -> &mut RegexSetBuilder {
            self.builder.case_insensitive(yes);
            self
        }

        /// This configures multi-line mode for all of the patterns.
        ///
        /// Enabling multi-line mode changes the behavior of the `^` and `$`
        /// anchor assertions. Instead of only matching at the beginning and
        /// end of a haystack, respectively, multi-line mode causes them to
        /// match at the beginning and end of a line *in addition* to the
        /// beginning and end of a haystack. More precisely, `^` will match at
        /// the position immediately following a `\n` and `$` will match at the
        /// position immediately preceding a `\n`.
        ///
        /// The behavior of this option can be impacted by other settings too:
        ///
        /// * The [`RegexSetBuilder::line_terminator`] option changes `\n`
        /// above to any ASCII byte.
        /// * The [`RegexSetBuilder::crlf`] option changes the line terminator
        /// to be either `\r` or `\n`, but never at the position between a `\r`
        /// and `\n`.
        ///
        /// This setting can also be configured using the inline flag `m` in
        /// the pattern.
        ///
        /// The default for this is `false`.
        ///
        /// # Example
        ///
        /// ```
        /// use regex::RegexSetBuilder;
        ///
        /// let re = RegexSetBuilder::new([r"^foo$"])
        ///     .multi_line(true)
        ///     .build()
        ///     .unwrap();
        /// // `^` matches right after the first `\n` and `$` matches right
        /// // before the last `\n`, so `foo` is found in the middle.
        /// assert!(re.is_match("\nfoo\n"));
        /// ```
        pub fn multi_line(&mut self, yes: bool) -> &mut RegexSetBuilder {
            self.builder.multi_line(yes);
            self
        }

        /// This configures dot-matches-new-line mode for all of the patterns.
///
        /// Perhaps surprisingly, the default behavior for `.` is not to match
        /// any character, but rather, to match any character except for the
        /// line terminator (which is `\n` by default). When this mode is
        /// enabled, the behavior changes such that `.` truly matches any
        /// character.
        ///
        /// This setting can also be configured using the inline flag `s` in
        /// the pattern. For example, `(?s:.)` and `\p{any}` are equivalent
        /// regexes.
        ///
        /// The default for this is `false`.
        ///
        /// # Example
        ///
        /// ```
        /// use regex::RegexSetBuilder;
        ///
        /// let re = RegexSetBuilder::new([r"foo.bar"])
        ///     .dot_matches_new_line(true)
        ///     .build()
        ///     .unwrap();
        /// let hay = "foo\nbar";
        /// // With this mode enabled, `.` also matches the `\n` that sits
        /// // between `foo` and `bar`.
        /// assert!(re.is_match(hay));
        /// ```
        pub fn dot_matches_new_line(
            &mut self,
            yes: bool,
        ) -> &mut RegexSetBuilder {
            self.builder.dot_matches_new_line(yes);
            self
        }

        /// This configures CRLF mode for all of the patterns.
        ///
        /// When CRLF mode is enabled, both `\r` ("carriage return" or CR for
        /// short) and `\n` ("line feed" or LF for short) are treated as line
        /// terminators. This results in the following:
        ///
        /// * Unless dot-matches-new-line mode is enabled, `.` will now match
        /// any character except for `\n` and `\r`.
        /// * When multi-line mode is enabled, `^` will match immediately
        /// following a `\n` or a `\r`. Similarly, `$` will match immediately
        /// preceding a `\n` or a `\r`. Neither `^` nor `$` will ever match
        /// between `\r` and `\n`.
        ///
        /// This setting can also be configured using the inline flag `R` in
        /// the pattern.
        ///
        /// The default for this is `false`.
///
        /// # Example
        ///
        /// ```
        /// use regex::RegexSetBuilder;
        ///
        /// let re = RegexSetBuilder::new([r"^foo$"])
        ///     .multi_line(true)
        ///     .crlf(true)
        ///     .build()
        ///     .unwrap();
        /// let hay = "\r\nfoo\r\n";
        /// // If CRLF mode weren't enabled here, then '$' wouldn't match
        /// // immediately after 'foo', and thus no match would be found.
        /// assert!(re.is_match(hay));
        /// ```
        ///
        /// This example demonstrates that `^` will never match at a position
        /// between `\r` and `\n`. (`$` will similarly not match between a `\r`
        /// and a `\n`.)
        ///
        /// ```
        /// use regex::RegexSetBuilder;
        ///
        /// let re = RegexSetBuilder::new([r"^\n"])
        ///     .multi_line(true)
        ///     .crlf(true)
        ///     .build()
        ///     .unwrap();
        /// // The only `\n` is preceded by `\r`, and `^` may not match
        /// // between them, so there is no match.
        /// assert!(!re.is_match("\r\n"));
        /// ```
        pub fn crlf(&mut self, yes: bool) -> &mut RegexSetBuilder {
            self.builder.crlf(yes);
            self
        }

        /// Configures the line terminator to be used by all of the patterns.
        ///
        /// The line terminator is relevant in two ways for a particular regex:
        ///
        /// * When dot-matches-new-line mode is *not* enabled (the default),
        /// then `.` will match any character except for the configured line
        /// terminator.
        /// * When multi-line mode is enabled (not the default), then `^` and
        /// `$` will match immediately after and before, respectively, a line
        /// terminator.
        ///
        /// In both cases, if CRLF mode is enabled in a particular context,
        /// then it takes precedence over any configured line terminator.
        ///
        /// This option cannot be configured from within the pattern.
        ///
        /// The default line terminator is `\n`.
        ///
        /// # Example
        ///
        /// This shows how to treat the NUL byte as a line terminator. This can
        /// be a useful heuristic when searching binary data.
        ///
        /// ```
        /// use regex::RegexSetBuilder;
        ///
        /// let re = RegexSetBuilder::new([r"^foo$"])
        ///     .multi_line(true)
        ///     .line_terminator(b'\x00')
        ///     .build()
        ///     .unwrap();
        /// let hay = "\x00foo\x00";
        /// assert!(re.is_match(hay));
        /// ```
        ///
        /// This example shows that the behavior of `.` is impacted by this
        /// setting as well:
        ///
        /// ```
        /// use regex::RegexSetBuilder;
        ///
        /// let re = RegexSetBuilder::new([r"."])
        ///     .line_terminator(b'\x00')
        ///     .build()
        ///     .unwrap();
        /// // `\n` is no longer the line terminator, so `.` matches it.
        /// assert!(re.is_match("\n"));
        /// // NUL is now the line terminator, so `.` does not match it.
        /// assert!(!re.is_match("\x00"));
        /// ```
        ///
        /// This shows that building a regex will fail if the byte given
        /// is not ASCII and the pattern could result in matching invalid
        /// UTF-8. This is because any singular non-ASCII byte is not valid
        /// UTF-8, and it is not permitted for a [`RegexSet`] to match invalid
        /// UTF-8. (It is permissible to use a non-ASCII byte when building a
        /// [`bytes::RegexSet`](crate::bytes::RegexSet).)
        ///
        /// ```
        /// use regex::RegexSetBuilder;
        ///
        /// assert!(
        ///     RegexSetBuilder::new([r"."])
        ///         .line_terminator(0x80)
        ///         .build()
        ///         .is_err()
        /// );
        /// // Note that using a non-ASCII byte isn't enough on its own to
        /// // cause regex compilation to fail. You actually have to make use
        /// // of it in the regex in a way that leads to matching invalid
        /// // UTF-8. If you don't, then regex compilation will succeed!
/// assert!(
        ///     RegexSetBuilder::new([r"a"])
        ///         .line_terminator(0x80)
        ///         .build()
        ///         .is_ok()
        /// );
        /// ```
        pub fn line_terminator(&mut self, byte: u8) -> &mut RegexSetBuilder {
            self.builder.line_terminator(byte);
            self
        }

        /// This configures swap-greed mode for all of the patterns.
        ///
        /// When swap-greed mode is enabled, patterns like `a+` will become
        /// non-greedy and patterns like `a+?` will become greedy. In other
        /// words, the meanings of `a+` and `a+?` are switched.
        ///
        /// This setting can also be configured using the inline flag `U` in
        /// the pattern.
        ///
        /// Note that this is generally not useful for a `RegexSet` since a
        /// `RegexSet` can only report whether a pattern matches or not. Since
        /// greediness never impacts whether a match is found or not (only the
        /// offsets of the match), it follows that whether parts of a pattern
        /// are greedy or not doesn't matter for a `RegexSet`.
        ///
        /// The default for this is `false`.
        pub fn swap_greed(&mut self, yes: bool) -> &mut RegexSetBuilder {
            self.builder.swap_greed(yes);
            self
        }

        /// This configures verbose mode for all of the patterns.
        ///
        /// When enabled, whitespace will be treated as insignificant in the
        /// pattern and `#` can be used to start a comment until the next new
        /// line.
        ///
        /// Normally, in most places in a pattern, whitespace is treated
        /// literally. For example ` +` will match one or more ASCII whitespace
        /// characters.
        ///
        /// When verbose mode is enabled, `\#` can be used to match a literal
        /// `#` and `\ ` can be used to match a literal ASCII whitespace
        /// character.
///
        /// Verbose mode is useful for permitting regexes to be formatted and
        /// broken up more nicely. This may make them more easily readable.
        ///
        /// This setting can also be configured using the inline flag `x` in
        /// the pattern.
        ///
        /// The default for this is `false`.
        ///
        /// # Example
        ///
        /// ```
        /// use regex::RegexSetBuilder;
        ///
        /// let pat = r"
        ///     \b
        ///     (?<first>\p{Uppercase}\w*) # always start with uppercase letter
        ///     [\s--\n]+ # whitespace should separate names
        ///     (?: # middle name can be an initial!
        ///         (?:(?<initial>\p{Uppercase})\.|(?<middle>\p{Uppercase}\w*))
        ///         [\s--\n]+
        ///     )?
        ///     (?<last>\p{Uppercase}\w*)
        ///     \b
        /// ";
        /// let re = RegexSetBuilder::new([pat])
        ///     .ignore_whitespace(true)
        ///     .build()
        ///     .unwrap();
        /// assert!(re.is_match("Harry Potter"));
        /// assert!(re.is_match("Harry J. Potter"));
        /// assert!(re.is_match("Harry James Potter"));
        /// assert!(!re.is_match("harry J. Potter"));
        /// ```
        pub fn ignore_whitespace(
            &mut self,
            yes: bool,
        ) -> &mut RegexSetBuilder {
            self.builder.ignore_whitespace(yes);
            self
        }

        /// This configures octal mode for all of the patterns.
        ///
        /// Octal syntax is a little-known way of uttering Unicode codepoints
        /// in a pattern. For example, `a`, `\x61`, `\u0061` and `\141` are all
        /// equivalent patterns, where the last example shows octal syntax.
        ///
        /// While supporting octal syntax isn't in and of itself a problem,
        /// it does make good error messages harder. That is, in PCRE based
        /// regex engines, syntax like `\1` invokes a backreference, which is
        /// explicitly unsupported by this library. However, many users expect
        /// backreferences to be supported. Therefore, when octal support
        /// is disabled, the error message will explicitly mention that
        /// backreferences aren't supported.
        ///
        /// The default for this is `false`.
        ///
        /// # Example
        ///
        /// ```
        /// use regex::RegexSetBuilder;
        ///
        /// // Normally this pattern would not compile, with an error message
        /// // about backreferences not being supported. But with octal mode
        /// // enabled, octal escape sequences work.
        /// let re = RegexSetBuilder::new([r"\141"])
        ///     .octal(true)
        ///     .build()
        ///     .unwrap();
        /// assert!(re.is_match("a"));
        /// ```
        pub fn octal(&mut self, yes: bool) -> &mut RegexSetBuilder {
            self.builder.octal(yes);
            self
        }

        /// Sets the approximate size limit, in bytes, of the compiled regex.
        ///
        /// This roughly corresponds to the amount of heap memory, in
        /// bytes, occupied by a single regex. If the regex would otherwise
        /// approximately exceed this limit, then compiling that regex will
        /// fail.
        ///
        /// The main utility of a method like this is to avoid compiling
        /// regexes that use an unexpected amount of resources, such as
        /// time and memory. Even if the memory usage of a large regex is
        /// acceptable, its search time may not be. Namely, worst case time
        /// complexity for search is `O(m * n)`, where `m ~ len(pattern)` and
        /// `n ~ len(haystack)`. That is, search time depends, in part, on the
        /// size of the compiled regex. This means that putting a limit on the
        /// size of the regex limits how much a regex can impact search time.
        ///
        /// For more information about regex size limits, see the section on
        /// [untrusted inputs](crate#untrusted-input) in the top-level crate
        /// documentation.
///
        /// The default for this is some reasonable number that permits most
        /// patterns to compile successfully.
        ///
        /// # Example
        ///
        /// ```
        /// # if !cfg!(target_pointer_width = "64") { return; } // see #1041
        /// use regex::RegexSetBuilder;
        ///
        /// // It may surprise you how big some seemingly small patterns can
        /// // be! Since \w is Unicode aware, this generates a regex that can
        /// // match approximately 140,000 distinct codepoints.
        /// assert!(
        ///     RegexSetBuilder::new([r"\w"])
        ///         .size_limit(45_000)
        ///         .build()
        ///         .is_err()
        /// );
        /// ```
        pub fn size_limit(&mut self, bytes: usize) -> &mut RegexSetBuilder {
            self.builder.size_limit(bytes);
            self
        }

        /// Set the approximate capacity, in bytes, of the cache of transitions
        /// used by the lazy DFA.
        ///
        /// While the lazy DFA isn't always used, it tends to be the most
        /// commonly used regex engine in default configurations. It tends to
        /// adopt the performance profile of a fully built DFA, but without the
        /// downside of taking worst case exponential time to build.
        ///
        /// The downside is that it needs to keep a cache of transitions and
        /// states that are built while running a search, and this cache
        /// can fill up. When it fills up, the cache will reset itself. Any
        /// previously generated states and transitions will then need to be
        /// re-generated. If this happens too many times, then this library
        /// will bail out of using the lazy DFA and switch to a different regex
        /// engine.
        ///
        /// If your regex provokes this particular downside of the lazy DFA,
        /// then it may be beneficial to increase its cache capacity. This will
        /// potentially reduce the frequency of cache resetting (ideally to
        /// `0`). While it won't fix all potential performance problems with
        /// the lazy DFA, increasing the cache capacity does fix some.
        ///
        /// There is no easy way to determine, a priori, whether increasing
        /// this cache capacity will help. In general, the larger your regex,
        /// the more cache it's likely to use. But that isn't an ironclad rule.
        /// For example, a regex like `[01]*1[01]{N}` would normally produce a
        /// fully built DFA that is exponential in size with respect to `N`.
        /// The lazy DFA will prevent exponential space blow-up, but its cache
        /// is likely to fill up, even when it's large and even for smallish
        /// values of `N`.
        ///
        /// If you aren't sure whether this helps or not, it is sensible to
        /// set this to some arbitrarily large number in testing, such as
        /// `usize::MAX`. Namely, this represents the amount of capacity that
        /// *may* be used. It's probably not a good idea to use `usize::MAX` in
        /// production though, since it implies there are no controls on heap
        /// memory used by this library during a search. In effect, set it to
        /// whatever you're willing to allocate for a single regex search.
        pub fn dfa_size_limit(
            &mut self,
            bytes: usize,
        ) -> &mut RegexSetBuilder {
            self.builder.dfa_size_limit(bytes);
            self
        }

        /// Set the nesting limit for this parser.
        ///
        /// The nesting limit controls how deep the abstract syntax tree is
        /// allowed to be. If the AST exceeds the given limit (e.g., with too
        /// many nested groups), then an error is returned by the parser.
        ///
        /// The purpose of this limit is to act as a heuristic to prevent stack
        /// overflow for consumers that do structural induction on an AST using
        /// explicit recursion. While this crate never does this (instead using
        /// constant stack space and moving the call stack to the heap), other
        /// crates may.
        ///
        /// This limit is not checked until the entire AST is parsed.
        /// Therefore, if callers want to put a limit on the amount of heap
        /// space used, then they should impose a limit on the length, in
        /// bytes, of the concrete pattern string. In particular, this is
        /// viable since this parser implementation will limit itself to heap
        /// space proportional to the length of the pattern string. See also
        /// the [untrusted inputs](crate#untrusted-input) section in the
        /// top-level crate documentation for more information about this.
        ///
        /// Note that a nest limit of `0` will return a nest limit error for
        /// most patterns but not all. For example, a nest limit of `0` permits
        /// `a` but not `ab`, since `ab` requires an explicit concatenation,
        /// which results in a nest depth of `1`. In general, a nest limit is
        /// not something that manifests in an obvious way in the concrete
        /// syntax, therefore, it should not be used in a granular way.
        ///
        /// # Example
        ///
        /// ```
        /// use regex::RegexSetBuilder;
        ///
        /// assert!(RegexSetBuilder::new([r"a"]).nest_limit(0).build().is_ok());
        /// assert!(RegexSetBuilder::new([r"ab"]).nest_limit(0).build().is_err());
        /// ```
        pub fn nest_limit(&mut self, limit: u32) -> &mut RegexSetBuilder {
            self.builder.nest_limit(limit);
            self
        }
    }
}

pub(crate) mod bytes {
    use crate::{
        bytes::{Regex, RegexSet},
        error::Error,
    };

    use super::Builder;

    /// A configurable builder for a [`Regex`].
    ///
    /// This builder can be used to programmatically set flags such as `i`
    /// (case insensitive) and `x` (for verbose mode). This builder can also be
    /// used to configure things like the line terminator and a size limit on
    /// the compiled regular expression.
    #[derive(Clone, Debug)]
    pub struct RegexBuilder {
        builder: Builder,
    }

    impl RegexBuilder {
        /// Create a new builder with a default configuration for the given
        /// pattern.
        ///
        /// If the pattern is invalid or exceeds the configured size limits,
        /// then an error will be returned when [`RegexBuilder::build`] is
        /// called.
        pub fn new(pattern: &str) -> RegexBuilder {
            RegexBuilder { builder: Builder::new([pattern]) }
        }

        /// Compiles the pattern given to `RegexBuilder::new` with the
        /// configuration set on this builder.
        ///
        /// If the pattern isn't a valid regex or if a configured size limit
        /// was exceeded, then an error is returned.
        pub fn build(&self) -> Result<Regex, Error> {
            self.builder.build_one_bytes()
        }

        /// This configures Unicode mode for the entire pattern.
        ///
        /// Enabling Unicode mode does a number of things:
        ///
        /// * Most fundamentally, it causes the fundamental atom of matching
        /// to be a single codepoint. When Unicode mode is disabled, it's a
        /// single byte. For example, when Unicode mode is enabled, `.` will
        /// match `💩` once, whereas it will match 4 times when Unicode mode
        /// is disabled. (Since the UTF-8 encoding of `💩` is 4 bytes long.)
        /// * Case insensitive matching uses Unicode simple case folding rules.
        /// * Unicode character classes like `\p{Letter}` and `\p{Greek}` are
        /// available.
        /// * Perl character classes are Unicode aware. That is, `\w`, `\s` and
        /// `\d`.
        /// * The word boundary assertions, `\b` and `\B`, use the Unicode
        /// definition of a word character.
///
        /// Note that unlike the top-level `Regex` for searching `&str`, it
        /// is permitted to disable Unicode mode even if the resulting pattern
        /// could match invalid UTF-8. For example, `(?-u:.)` is not a valid
        /// pattern for a top-level `Regex`, but is valid for a `bytes::Regex`.
        ///
        /// For more details on the Unicode support in this crate, see the
        /// [Unicode section](crate#unicode) in this crate's top-level
        /// documentation.
        ///
        /// The default for this is `true`.
        ///
        /// # Example
        ///
        /// ```
        /// use regex::bytes::RegexBuilder;
        ///
        /// let re = RegexBuilder::new(r"\w")
        ///     .unicode(false)
        ///     .build()
        ///     .unwrap();
        /// // Normally Greek letters would be included in \w, but since
        /// // Unicode mode is disabled, it only matches ASCII letters.
        /// assert!(!re.is_match("δ".as_bytes()));
        ///
        /// let re = RegexBuilder::new(r"s")
        ///     .case_insensitive(true)
        ///     .unicode(false)
        ///     .build()
        ///     .unwrap();
        /// // Normally 'ſ' is included when searching for 's' case
        /// // insensitively due to Unicode's simple case folding rules. But
        /// // when Unicode mode is disabled, only ASCII case insensitive rules
        /// // are used.
        /// assert!(!re.is_match("ſ".as_bytes()));
        /// ```
        ///
        /// Since this builder is for constructing a [`bytes::Regex`](Regex),
        /// one can disable Unicode mode even if it would match invalid UTF-8:
        ///
        /// ```
        /// use regex::bytes::RegexBuilder;
        ///
        /// let re = RegexBuilder::new(r".")
        ///     .unicode(false)
        ///     .build()
        ///     .unwrap();
        /// // With Unicode mode disabled, `.` matches any single byte other
        /// // than `\n`, including bytes like 0xFF that are not valid UTF-8.
        /// assert!(re.is_match(b"\xFF"));
        /// ```
        pub fn unicode(&mut self, yes: bool) -> &mut RegexBuilder {
            self.builder.unicode(yes);
            self
        }

        /// This configures whether to enable case insensitive matching for the
        /// entire pattern.
        ///
        /// This setting can also be configured using the inline flag `i`
        /// in the pattern. For example, `(?i:foo)` matches `foo` case
        /// insensitively while `(?-i:foo)` matches `foo` case sensitively.
        ///
        /// The default for this is `false`.
        ///
        /// # Example
        ///
        /// ```
        /// use regex::bytes::RegexBuilder;
        ///
        /// let re = RegexBuilder::new(r"foo(?-i:bar)quux")
        ///     .case_insensitive(true)
        ///     .build()
        ///     .unwrap();
        /// assert!(re.is_match(b"FoObarQuUx"));
        /// // Even though case insensitive matching is enabled in the builder,
        /// // it can be locally disabled within the pattern. In this case,
        /// // `bar` is matched case sensitively.
        /// assert!(!re.is_match(b"fooBARquux"));
        /// ```
        pub fn case_insensitive(&mut self, yes: bool) -> &mut RegexBuilder {
            self.builder.case_insensitive(yes);
            self
        }

        /// This configures multi-line mode for the entire pattern.
        ///
        /// Enabling multi-line mode changes the behavior of the `^` and `$`
        /// anchor assertions. Instead of only matching at the beginning and
        /// end of a haystack, respectively, multi-line mode causes them to
        /// match at the beginning and end of a line *in addition* to the
        /// beginning and end of a haystack. More precisely, `^` will match at
        /// the position immediately following a `\n` and `$` will match at the
        /// position immediately preceding a `\n`.
///
        /// The behavior of this option can be impacted by other settings too:
        ///
        /// * The [`RegexBuilder::line_terminator`] option changes `\n` above
        /// to any ASCII byte.
        /// * The [`RegexBuilder::crlf`] option changes the line terminator to
        /// be either `\r` or `\n`, but never at the position between a `\r`
        /// and `\n`.
        ///
        /// This setting can also be configured using the inline flag `m` in
        /// the pattern.
        ///
        /// The default for this is `false`.
        ///
        /// # Example
        ///
        /// ```
        /// use regex::bytes::RegexBuilder;
        ///
        /// let re = RegexBuilder::new(r"^foo$")
        ///     .multi_line(true)
        ///     .build()
        ///     .unwrap();
        /// // `^` matches right after the first `\n` and `$` matches right
        /// // before the last `\n`, so the match is `foo` at offsets 1..4.
        /// assert_eq!(Some(1..4), re.find(b"\nfoo\n").map(|m| m.range()));
        /// ```
        pub fn multi_line(&mut self, yes: bool) -> &mut RegexBuilder {
            self.builder.multi_line(yes);
            self
        }

        /// This configures dot-matches-new-line mode for the entire pattern.
        ///
        /// Perhaps surprisingly, the default behavior for `.` is not to match
        /// any character, but rather, to match any character except for the
        /// line terminator (which is `\n` by default). When this mode is
        /// enabled, the behavior changes such that `.` truly matches any
        /// character.
        ///
        /// This setting can also be configured using the inline flag `s` in
        /// the pattern. For example, `(?s:.)` and `\p{any}` are equivalent
        /// regexes.
        ///
        /// The default for this is `false`.
///
        /// # Example
        ///
        /// ```
        /// use regex::bytes::RegexBuilder;
        ///
        /// let re = RegexBuilder::new(r"foo.bar")
        ///     .dot_matches_new_line(true)
        ///     .build()
        ///     .unwrap();
        /// let hay = b"foo\nbar";
        /// // With this mode enabled, `.` also matches the `\n`, so the match
        /// // covers the whole haystack.
        /// assert_eq!(Some(&b"foo\nbar"[..]), re.find(hay).map(|m| m.as_bytes()));
        /// ```
        pub fn dot_matches_new_line(
            &mut self,
            yes: bool,
        ) -> &mut RegexBuilder {
            self.builder.dot_matches_new_line(yes);
            self
        }

        /// This configures CRLF mode for the entire pattern.
        ///
        /// When CRLF mode is enabled, both `\r` ("carriage return" or CR for
        /// short) and `\n` ("line feed" or LF for short) are treated as line
        /// terminators. This results in the following:
        ///
        /// * Unless dot-matches-new-line mode is enabled, `.` will now match
        /// any character except for `\n` and `\r`.
        /// * When multi-line mode is enabled, `^` will match immediately
        /// following a `\n` or a `\r`. Similarly, `$` will match immediately
        /// preceding a `\n` or a `\r`. Neither `^` nor `$` will ever match
        /// between `\r` and `\n`.
        ///
        /// This setting can also be configured using the inline flag `R` in
        /// the pattern.
        ///
        /// The default for this is `false`.
        ///
        /// # Example
        ///
        /// ```
        /// use regex::bytes::RegexBuilder;
        ///
        /// let re = RegexBuilder::new(r"^foo$")
        ///     .multi_line(true)
        ///     .crlf(true)
        ///     .build()
        ///     .unwrap();
        /// let hay = b"\r\nfoo\r\n";
        /// // If CRLF mode weren't enabled here, then '$' wouldn't match
        /// // immediately after 'foo', and thus no match would be found.
/// assert_eq!(Some(&b"foo"[..]), re.find(hay).map(|m| m.as_bytes()));
        /// ```
        ///
        /// This example demonstrates that `^` will never match at a position
        /// between `\r` and `\n`. (`$` will similarly not match between a `\r`
        /// and a `\n`.)
        ///
        /// ```
        /// use regex::bytes::RegexBuilder;
        ///
        /// let re = RegexBuilder::new(r"^")
        ///     .multi_line(true)
        ///     .crlf(true)
        ///     .build()
        ///     .unwrap();
        /// let hay = b"\r\n\r\n";
        /// let ranges: Vec<_> = re.find_iter(hay).map(|m| m.range()).collect();
        /// // `^` matches at offsets 0, 2 and 4, but never at 1 or 3, which
        /// // lie between a `\r` and a `\n`.
        /// assert_eq!(ranges, vec![0..0, 2..2, 4..4]);
        /// ```
        pub fn crlf(&mut self, yes: bool) -> &mut RegexBuilder {
            self.builder.crlf(yes);
            self
        }

        /// Configures the line terminator to be used by the regex.
        ///
        /// The line terminator is relevant in two ways for a particular regex:
        ///
        /// * When dot-matches-new-line mode is *not* enabled (the default),
        /// then `.` will match any character except for the configured line
        /// terminator.
        /// * When multi-line mode is enabled (not the default), then `^` and
        /// `$` will match immediately after and before, respectively, a line
        /// terminator.
        ///
        /// In both cases, if CRLF mode is enabled in a particular context,
        /// then it takes precedence over any configured line terminator.
        ///
        /// This option cannot be configured from within the pattern.
        ///
        /// The default line terminator is `\n`.
        ///
        /// # Example
        ///
        /// This shows how to treat the NUL byte as a line terminator. This can
        /// be a useful heuristic when searching binary data.
///
/// ```
/// use regex::bytes::RegexBuilder;
///
/// let re = RegexBuilder::new(r"^foo$")
///     .multi_line(true)
///     .line_terminator(b'\x00')
///     .build()
///     .unwrap();
/// let hay = b"\x00foo\x00";
/// assert_eq!(Some(1..4), re.find(hay).map(|m| m.range()));
/// ```
///
/// This example shows that the behavior of `.` is impacted by this
/// setting as well:
///
/// ```
/// use regex::bytes::RegexBuilder;
///
/// let re = RegexBuilder::new(r".")
///     .line_terminator(b'\x00')
///     .build()
///     .unwrap();
/// assert!(re.is_match(b"\n"));
/// assert!(!re.is_match(b"\x00"));
/// ```
///
/// This shows that building a regex will work even when the byte
/// given is not ASCII. This is unlike the top-level `Regex` API where
/// matching invalid UTF-8 is not allowed.
///
/// Note though that you must disable Unicode mode. This is required
/// because Unicode mode requires matching one codepoint at a time,
/// and there is no way to match a non-ASCII byte as if it were a
/// codepoint.
///
/// ```
/// use regex::bytes::RegexBuilder;
///
/// assert!(
///     RegexBuilder::new(r".")
///         .unicode(false)
///         .line_terminator(0x80)
///         .build()
///         .is_ok(),
/// );
/// ```
pub fn line_terminator(&mut self, byte: u8) -> &mut RegexBuilder {
    self.builder.line_terminator(byte);
    self
}

/// This configures swap-greed mode for the entire pattern.
///
/// When swap-greed mode is enabled, patterns like `a+` will become
/// non-greedy and patterns like `a+?` will become greedy. In other
/// words, the meanings of `a+` and `a+?` are switched.
///
/// This setting can also be configured using the inline flag `U` in
/// the pattern.
///
/// The default for this is `false`.
///
/// # Example
///
/// ```
/// use regex::bytes::RegexBuilder;
///
/// let re = RegexBuilder::new(r"a+")
///     .swap_greed(true)
///     .build()
///     .unwrap();
/// assert_eq!(Some(&b"a"[..]), re.find(b"aaa").map(|m| m.as_bytes()));
/// ```
pub fn swap_greed(&mut self, yes: bool) -> &mut RegexBuilder {
    self.builder.swap_greed(yes);
    self
}

/// This configures verbose mode for the entire pattern.
///
/// When enabled, whitespace will be treated as insignificant in the
/// pattern and `#` can be used to start a comment until the next new
/// line.
///
/// Normally, in most places in a pattern, whitespace is treated
/// literally. For example ` +` will match one or more ASCII whitespace
/// characters.
///
/// When verbose mode is enabled, `\#` can be used to match a literal
/// `#` and `\ ` can be used to match a literal ASCII whitespace
/// character.
///
/// Verbose mode is useful for permitting regexes to be formatted and
/// broken up more nicely. This may make them more easily readable.
///
/// This setting can also be configured using the inline flag `x` in
/// the pattern.
///
/// The default for this is `false`.
///
/// # Example
///
/// ```
/// use regex::bytes::RegexBuilder;
///
/// let pat = r"
///     \b
///     (?<first>\p{Uppercase}\w*)  # always start with uppercase letter
///     [\s--\n]+                   # whitespace should separate names
///     (?:                         # middle name can be an initial!
///         (?:(?<initial>\p{Uppercase})\.|(?<middle>\p{Uppercase}\w*))
///         [\s--\n]+
///     )?
///     (?<last>\p{Uppercase}\w*)
///     \b
/// ";
/// let re = RegexBuilder::new(pat)
///     .ignore_whitespace(true)
///     .build()
///     .unwrap();
///
/// let caps = re.captures(b"Harry Potter").unwrap();
/// assert_eq!(&b"Harry"[..], &caps["first"]);
/// assert_eq!(&b"Potter"[..], &caps["last"]);
///
/// let caps = re.captures(b"Harry J. Potter").unwrap();
/// assert_eq!(&b"Harry"[..], &caps["first"]);
/// // Since a middle name/initial isn't required for an overall match,
/// // we can't assume that 'initial' or 'middle' will be populated!
/// assert_eq!(
///     Some(&b"J"[..]),
///     caps.name("initial").map(|m| m.as_bytes()),
/// );
/// assert_eq!(None, caps.name("middle").map(|m| m.as_bytes()));
/// assert_eq!(&b"Potter"[..], &caps["last"]);
///
/// let caps = re.captures(b"Harry James Potter").unwrap();
/// assert_eq!(&b"Harry"[..], &caps["first"]);
/// // Since a middle name/initial isn't required for an overall match,
/// // we can't assume that 'initial' or 'middle' will be populated!
/// assert_eq!(None, caps.name("initial").map(|m| m.as_bytes()));
/// assert_eq!(
///     Some(&b"James"[..]),
///     caps.name("middle").map(|m| m.as_bytes()),
/// );
/// assert_eq!(&b"Potter"[..], &caps["last"]);
/// ```
pub fn ignore_whitespace(&mut self, yes: bool) -> &mut RegexBuilder {
    self.builder.ignore_whitespace(yes);
    self
}

/// This configures octal mode for the entire pattern.
///
/// Octal syntax is a little-known way of uttering Unicode codepoints
/// in a pattern. For example, `a`, `\x61`, `\u0061` and `\141` are all
/// equivalent patterns, where the last example shows octal syntax.
///
/// While supporting octal syntax isn't in and of itself a problem,
/// it does make good error messages harder. That is, in PCRE based
/// regex engines, syntax like `\1` invokes a backreference, which is
/// explicitly unsupported by this library. However, many users expect
/// backreferences to be supported. Therefore, when octal support
/// is disabled, the error message will explicitly mention that
/// backreferences aren't supported.
///
/// The default for this is `false`.
///
/// # Example
///
/// ```
/// use regex::bytes::RegexBuilder;
///
/// // Normally this pattern would not compile, with an error message
/// // about backreferences not being supported. But with octal mode
/// // enabled, octal escape sequences work.
/// let re = RegexBuilder::new(r"\141")
///     .octal(true)
///     .build()
///     .unwrap();
/// assert!(re.is_match(b"a"));
/// ```
pub fn octal(&mut self, yes: bool) -> &mut RegexBuilder {
    self.builder.octal(yes);
    self
}

/// Sets the approximate size limit, in bytes, of the compiled regex.
///
/// This roughly corresponds to the amount of heap memory, in
/// bytes, occupied by a single regex. If the regex would otherwise
/// approximately exceed this limit, then compiling that regex will
/// fail.
///
/// The main utility of a method like this is to avoid compiling
/// regexes that use an unexpected amount of resources, such as
/// time and memory. Even if the memory usage of a large regex is
/// acceptable, its search time may not be. Namely, worst case time
/// complexity for search is `O(m * n)`, where `m ~ len(pattern)` and
/// `n ~ len(haystack)`. That is, search time depends, in part, on the
/// size of the compiled regex. This means that putting a limit on the
/// size of the regex limits how much a regex can impact search time.
///
/// For more information about regex size limits, see the section on
/// [untrusted inputs](crate#untrusted-input) in the top-level crate
/// documentation.
///
/// The default for this is some reasonable number that permits most
/// patterns to compile successfully.
///
/// # Example
///
/// ```
/// # if !cfg!(target_pointer_width = "64") { return; } // see #1041
/// use regex::bytes::RegexBuilder;
///
/// // It may surprise you how big some seemingly small patterns can
/// // be! Since \w is Unicode aware, this generates a regex that can
/// // match approximately 140,000 distinct codepoints.
/// assert!(RegexBuilder::new(r"\w").size_limit(45_000).build().is_err());
/// ```
pub fn size_limit(&mut self, bytes: usize) -> &mut RegexBuilder {
    self.builder.size_limit(bytes);
    self
}

/// Set the approximate capacity, in bytes, of the cache of transitions
/// used by the lazy DFA.
///
/// While the lazy DFA isn't always used, it tends to be the most
/// commonly used regex engine in default configurations. It tends to
/// adopt the performance profile of a fully built DFA, but without the
/// downside of taking worst case exponential time to build.
///
/// The downside is that it needs to keep a cache of transitions and
/// states that are built while running a search, and this cache
/// can fill up. When it fills up, the cache will reset itself. Any
/// previously generated states and transitions will then need to be
/// re-generated. If this happens too many times, then this library
/// will bail out of using the lazy DFA and switch to a different regex
/// engine.
///
/// If your regex provokes this particular downside of the lazy DFA,
/// then it may be beneficial to increase its cache capacity. This will
/// potentially reduce the frequency of cache resetting (ideally to
/// `0`). While it won't fix all potential performance problems with
/// the lazy DFA, increasing the cache capacity does fix some.
///
/// There is no easy way to determine, a priori, whether increasing
/// this cache capacity will help. In general, the larger your regex,
/// the more cache it's likely to use. But that isn't an ironclad rule.
/// For example, a regex like `[01]*1[01]{N}` would normally produce a
/// fully built DFA that is exponential in size with respect to `N`.
/// The lazy DFA will prevent exponential space blow-up, but its cache
/// is likely to fill up, even when it's large and even for smallish
/// values of `N`.
///
/// If you aren't sure whether this helps or not, it is sensible to
/// set this to some arbitrarily large number in testing, such as
/// `usize::MAX`. Namely, this represents the amount of capacity that
/// *may* be used. It's probably not a good idea to use `usize::MAX` in
/// production though, since it implies there are no controls on heap
/// memory used by this library during a search. In effect, set it to
/// whatever you're willing to allocate for a single regex search.
pub fn dfa_size_limit(&mut self, bytes: usize) -> &mut RegexBuilder {
    self.builder.dfa_size_limit(bytes);
    self
}

/// Set the nesting limit for this parser.
///
/// The nesting limit controls how deep the abstract syntax tree is
/// allowed to be. If the AST exceeds the given limit (e.g., with too
/// many nested groups), then an error is returned by the parser.
///
/// The purpose of this limit is to act as a heuristic to prevent stack
/// overflow for consumers that do structural induction on an AST using
/// explicit recursion. While this crate never does this (instead using
/// constant stack space and moving the call stack to the heap), other
/// crates may.
///
/// This limit is not checked until the entire AST is parsed.
/// Therefore, if callers want to put a limit on the amount of heap
/// space used, then they should impose a limit on the length, in
/// bytes, of the concrete pattern string. In particular, this is
/// viable since this parser implementation will limit itself to heap
/// space proportional to the length of the pattern string. See also
/// the [untrusted inputs](crate#untrusted-input) section in the
/// top-level crate documentation for more information about this.
///
/// Note that a nest limit of `0` will return a nest limit error for
/// most patterns but not all. For example, a nest limit of `0` permits
/// `a` but not `ab`, since `ab` requires an explicit concatenation,
/// which results in a nest depth of `1`. In general, a nest limit is
/// not something that manifests in an obvious way in the concrete
/// syntax, therefore, it should not be used in a granular way.
///
/// # Example
///
/// ```
/// use regex::bytes::RegexBuilder;
///
/// assert!(RegexBuilder::new(r"a").nest_limit(0).build().is_ok());
/// assert!(RegexBuilder::new(r"ab").nest_limit(0).build().is_err());
/// ```
pub fn nest_limit(&mut self, limit: u32) -> &mut RegexBuilder {
    self.builder.nest_limit(limit);
    self
}
} // closes the enclosing `impl` block

/// A configurable builder for a [`RegexSet`].
///
/// This builder can be used to programmatically set flags such as `i`
/// (case insensitive) and `x` (for verbose mode). This builder can also be
/// used to configure things like the line terminator and a size limit on
/// the compiled regular expression.
#[derive(Clone, Debug)]
pub struct RegexSetBuilder {
    builder: Builder,
}

impl RegexSetBuilder {
    /// Create a new builder with a default configuration for the given
    /// patterns.
    ///
    /// If the patterns are invalid or exceed the configured size limits,
    /// then an error will be returned when [`RegexSetBuilder::build`] is
    /// called.
    pub fn new<I, S>(patterns: I) -> RegexSetBuilder
    where
        I: IntoIterator<Item = S>,
        S: AsRef<str>,
    {
        RegexSetBuilder { builder: Builder::new(patterns) }
    }

    /// Compiles the patterns given to `RegexSetBuilder::new` with the
    /// configuration set on this builder.
    ///
    /// If the patterns aren't valid regexes or if a configured size limit
    /// was exceeded, then an error is returned.
    pub fn build(&self) -> Result<RegexSet, Error> {
        self.builder.build_many_bytes()
    }

    /// This configures Unicode mode for all of the patterns.
    ///
    /// Enabling Unicode mode does a number of things:
    ///
    /// * Most fundamentally, it causes the fundamental atom of matching
    /// to be a single codepoint. When Unicode mode is disabled, it's a
    /// single byte. For example, when Unicode mode is enabled, `.` will
    /// match `💩` once, whereas it will match 4 times when Unicode mode
    /// is disabled. (Since the UTF-8 encoding of `💩` is 4 bytes long.)
    /// * Case insensitive matching uses Unicode simple case folding rules.
/// * Unicode character classes like `\p{Letter}` and `\p{Greek}` are
/// available.
/// * Perl character classes are Unicode aware. That is, `\w`, `\s` and
/// `\d`.
/// * The word boundary assertions, `\b` and `\B`, use the Unicode
/// definition of a word character.
///
/// Note that unlike the top-level `RegexSet` for searching `&str`,
/// it is permitted to disable Unicode mode even if the resulting
/// pattern could match invalid UTF-8. For example, `(?-u:.)` is not
/// a valid pattern for a top-level `RegexSet`, but is valid for a
/// `bytes::RegexSet`.
///
/// For more details on the Unicode support in this crate, see the
/// [Unicode section](crate#unicode) in this crate's top-level
/// documentation.
///
/// The default for this is `true`.
///
/// # Example
///
/// ```
/// use regex::bytes::RegexSetBuilder;
///
/// let re = RegexSetBuilder::new([r"\w"])
///     .unicode(false)
///     .build()
///     .unwrap();
/// // Normally greek letters would be included in \w, but since
/// // Unicode mode is disabled, it only matches ASCII letters.
/// assert!(!re.is_match("δ".as_bytes()));
///
/// let re = RegexSetBuilder::new([r"s"])
///     .case_insensitive(true)
///     .unicode(false)
///     .build()
///     .unwrap();
/// // Normally 'ſ' is included when searching for 's' case
/// // insensitively due to Unicode's simple case folding rules. But
/// // when Unicode mode is disabled, only ASCII case insensitive rules
/// // are used.
/// assert!(!re.is_match("ſ".as_bytes()));
/// ```
///
/// Since this builder is for constructing a
/// [`bytes::RegexSet`](RegexSet), one can disable Unicode mode even if
/// it would match invalid UTF-8:
///
/// ```
/// use regex::bytes::RegexSetBuilder;
///
/// let re = RegexSetBuilder::new([r"."])
///     .unicode(false)
///     .build()
///     .unwrap();
/// // With Unicode mode disabled, `.` matches a single byte, even one
/// // that is not valid UTF-8.
/// assert!(re.is_match(b"\xFF"));
/// ```
pub fn unicode(&mut self, yes: bool) -> &mut RegexSetBuilder {
    self.builder.unicode(yes);
    self
}

/// This configures whether to enable case insensitive matching for all
/// of the patterns.
///
/// This setting can also be configured using the inline flag `i`
/// in the pattern. For example, `(?i:foo)` matches `foo` case
/// insensitively while `(?-i:foo)` matches `foo` case sensitively.
///
/// The default for this is `false`.
///
/// # Example
///
/// ```
/// use regex::bytes::RegexSetBuilder;
///
/// let re = RegexSetBuilder::new([r"foo(?-i:bar)quux"])
///     .case_insensitive(true)
///     .build()
///     .unwrap();
/// assert!(re.is_match(b"FoObarQuUx"));
/// // Even though case insensitive matching is enabled in the builder,
/// // it can be locally disabled within the pattern. In this case,
/// // `bar` is matched case sensitively.
/// assert!(!re.is_match(b"fooBARquux"));
/// ```
pub fn case_insensitive(&mut self, yes: bool) -> &mut RegexSetBuilder {
    self.builder.case_insensitive(yes);
    self
}

/// This configures multi-line mode for all of the patterns.
///
/// Enabling multi-line mode changes the behavior of the `^` and `$`
/// anchor assertions. Instead of only matching at the beginning and
/// end of a haystack, respectively, multi-line mode causes them to
/// match at the beginning and end of a line *in addition* to the
/// beginning and end of a haystack. More precisely, `^` will match at
/// the position immediately following a `\n` and `$` will match at the
/// position immediately preceding a `\n`.
///
/// The behavior of this option can be impacted by other settings too:
///
/// * The [`RegexSetBuilder::line_terminator`] option changes `\n`
/// above to any ASCII byte.
/// * The [`RegexSetBuilder::crlf`] option changes the line terminator
/// to be either `\r` or `\n`, but never at the position between a `\r`
/// and `\n`.
///
/// This setting can also be configured using the inline flag `m` in
/// the pattern.
///
/// The default for this is `false`.
///
/// # Example
///
/// ```
/// use regex::bytes::RegexSetBuilder;
///
/// let re = RegexSetBuilder::new([r"^foo$"])
///     .multi_line(true)
///     .build()
///     .unwrap();
/// assert!(re.is_match(b"\nfoo\n"));
/// ```
pub fn multi_line(&mut self, yes: bool) -> &mut RegexSetBuilder {
    self.builder.multi_line(yes);
    self
}

/// This configures dot-matches-new-line mode for all of the patterns.
///
/// Perhaps surprisingly, the default behavior for `.` is not to match
/// any character, but rather, to match any character except for the
/// line terminator (which is `\n` by default). When this mode is
/// enabled, the behavior changes such that `.` truly matches any
/// character.
///
/// This setting can also be configured using the inline flag `s` in
/// the pattern. For example, `(?s:.)` and `\p{any}` are equivalent
/// regexes.
///
/// The default for this is `false`.
///
/// # Example
///
/// ```
/// use regex::bytes::RegexSetBuilder;
///
/// let re = RegexSetBuilder::new([r"foo.bar"])
///     .dot_matches_new_line(true)
///     .build()
///     .unwrap();
/// let hay = b"foo\nbar";
/// assert!(re.is_match(hay));
/// ```
pub fn dot_matches_new_line(
    &mut self,
    yes: bool,
) -> &mut RegexSetBuilder {
    self.builder.dot_matches_new_line(yes);
    self
}

/// This configures CRLF mode for all of the patterns.
///
/// When CRLF mode is enabled, both `\r` ("carriage return" or CR for
/// short) and `\n` ("line feed" or LF for short) are treated as line
/// terminators. This results in the following:
///
/// * Unless dot-matches-new-line mode is enabled, `.` will now match
/// any character except for `\n` and `\r`.
/// * When multi-line mode is enabled, `^` will match immediately
/// following a `\n` or a `\r`. Similarly, `$` will match immediately
/// preceding a `\n` or a `\r`. Neither `^` nor `$` will ever match
/// between `\r` and `\n`.
///
/// This setting can also be configured using the inline flag `R` in
/// the pattern.
///
/// The default for this is `false`.
///
/// # Example
///
/// ```
/// use regex::bytes::RegexSetBuilder;
///
/// let re = RegexSetBuilder::new([r"^foo$"])
///     .multi_line(true)
///     .crlf(true)
///     .build()
///     .unwrap();
/// let hay = b"\r\nfoo\r\n";
/// // If CRLF mode weren't enabled here, then '$' wouldn't match
/// // immediately after 'foo', and thus no match would be found.
/// assert!(re.is_match(hay));
/// ```
///
/// This example demonstrates that `^` will never match at a position
/// between `\r` and `\n`. (`$` will similarly not match between a `\r`
/// and a `\n`.)
///
/// ```
/// use regex::bytes::RegexSetBuilder;
///
/// let re = RegexSetBuilder::new([r"^\n"])
///     .multi_line(true)
///     .crlf(true)
///     .build()
///     .unwrap();
/// assert!(!re.is_match(b"\r\n"));
/// ```
pub fn crlf(&mut self, yes: bool) -> &mut RegexSetBuilder {
    self.builder.crlf(yes);
    self
}

/// Configures the line terminator to be used by the regex.
///
/// The line terminator is relevant in two ways for a particular regex:
///
/// * When dot-matches-new-line mode is *not* enabled (the default),
/// then `.` will match any character except for the configured line
/// terminator.
/// * When multi-line mode is enabled (not the default), then `^` and
/// `$` will match immediately after and before, respectively, a line
/// terminator.
///
/// In both cases, if CRLF mode is enabled in a particular context,
/// then it takes precedence over any configured line terminator.
///
/// This option cannot be configured from within the pattern.
///
/// The default line terminator is `\n`.
///
/// # Example
///
/// This shows how to treat the NUL byte as a line terminator. This can
/// be a useful heuristic when searching binary data.
///
/// ```
/// use regex::bytes::RegexSetBuilder;
///
/// let re = RegexSetBuilder::new([r"^foo$"])
///     .multi_line(true)
///     .line_terminator(b'\x00')
///     .build()
///     .unwrap();
/// let hay = b"\x00foo\x00";
/// assert!(re.is_match(hay));
/// ```
///
/// This example shows that the behavior of `.` is impacted by this
/// setting as well:
///
/// ```
/// use regex::bytes::RegexSetBuilder;
///
/// let re = RegexSetBuilder::new([r"."])
///     .line_terminator(b'\x00')
///     .build()
///     .unwrap();
/// assert!(re.is_match(b"\n"));
/// assert!(!re.is_match(b"\x00"));
/// ```
///
/// This shows that building a regex will work even when the byte given
/// is not ASCII. This is unlike the top-level `RegexSet` API where
/// matching invalid UTF-8 is not allowed.
///
/// Note though that you must disable Unicode mode. This is required
/// because Unicode mode requires matching one codepoint at a time,
/// and there is no way to match a non-ASCII byte as if it were a
/// codepoint.
///
/// ```
/// use regex::bytes::RegexSetBuilder;
///
/// assert!(
///     RegexSetBuilder::new([r"."])
///         .unicode(false)
///         .line_terminator(0x80)
///         .build()
///         .is_ok(),
/// );
/// ```
pub fn line_terminator(&mut self, byte: u8) -> &mut RegexSetBuilder {
    self.builder.line_terminator(byte);
    self
}

/// This configures swap-greed mode for all of the patterns.
///
/// When swap-greed mode is enabled, patterns like `a+` will become
/// non-greedy and patterns like `a+?` will become greedy. In other
/// words, the meanings of `a+` and `a+?` are switched.
///
/// This setting can also be configured using the inline flag `U` in
/// the pattern.
///
/// Note that this is generally not useful for a `RegexSet` since a
/// `RegexSet` can only report whether a pattern matches or not. Since
/// greediness never impacts whether a match is found or not (only the
/// offsets of the match), it follows that whether parts of a pattern
/// are greedy or not doesn't matter for a `RegexSet`.
///
/// The default for this is `false`.
pub fn swap_greed(&mut self, yes: bool) -> &mut RegexSetBuilder {
    self.builder.swap_greed(yes);
    self
}

/// This configures verbose mode for all of the patterns.
///
/// When enabled, whitespace will be treated as insignificant in the
/// pattern and `#` can be used to start a comment until the next new
/// line.
///
/// Normally, in most places in a pattern, whitespace is treated
/// literally. For example ` +` will match one or more ASCII whitespace
/// characters.
///
/// When verbose mode is enabled, `\#` can be used to match a literal
/// `#` and `\ ` can be used to match a literal ASCII whitespace
/// character.
///
/// Verbose mode is useful for permitting regexes to be formatted and
/// broken up more nicely. This may make them more easily readable.
///
/// This setting can also be configured using the inline flag `x` in
/// the pattern.
///
/// The default for this is `false`.
///
/// # Example
///
/// ```
/// use regex::bytes::RegexSetBuilder;
///
/// let pat = r"
///     \b
///     (?<first>\p{Uppercase}\w*)  # always start with uppercase letter
///     [\s--\n]+                   # whitespace should separate names
///     (?:                         # middle name can be an initial!
///         (?:(?<initial>\p{Uppercase})\.|(?<middle>\p{Uppercase}\w*))
///         [\s--\n]+
///     )?
///     (?<last>\p{Uppercase}\w*)
///     \b
/// ";
/// let re = RegexSetBuilder::new([pat])
///     .ignore_whitespace(true)
///     .build()
///     .unwrap();
/// assert!(re.is_match(b"Harry Potter"));
/// assert!(re.is_match(b"Harry J. Potter"));
/// assert!(re.is_match(b"Harry James Potter"));
/// assert!(!re.is_match(b"harry J. Potter"));
/// ```
pub fn ignore_whitespace(
    &mut self,
    yes: bool,
) -> &mut RegexSetBuilder {
    self.builder.ignore_whitespace(yes);
    self
}

/// This configures octal mode for all of the patterns.
///
/// Octal syntax is a little-known way of uttering Unicode codepoints
/// in a pattern. For example, `a`, `\x61`, `\u0061` and `\141` are all
/// equivalent patterns, where the last example shows octal syntax.
///
/// While supporting octal syntax isn't in and of itself a problem,
/// it does make good error messages harder. That is, in PCRE based
/// regex engines, syntax like `\1` invokes a backreference, which is
/// explicitly unsupported by this library. However, many users expect
/// backreferences to be supported. Therefore, when octal support
/// is disabled, the error message will explicitly mention that
/// backreferences aren't supported.
///
/// The default for this is `false`.
///
/// # Example
///
/// ```
/// use regex::bytes::RegexSetBuilder;
///
/// // Normally this pattern would not compile, with an error message
/// // about backreferences not being supported. But with octal mode
/// // enabled, octal escape sequences work.
/// let re = RegexSetBuilder::new([r"\141"])
///     .octal(true)
///     .build()
///     .unwrap();
/// assert!(re.is_match(b"a"));
/// ```
pub fn octal(&mut self, yes: bool) -> &mut RegexSetBuilder {
    self.builder.octal(yes);
    self
}

/// Sets the approximate size limit, in bytes, of the compiled regex.
///
/// This roughly corresponds to the amount of heap memory, in
/// bytes, occupied by a single regex. If the regex would otherwise
/// approximately exceed this limit, then compiling that regex will
/// fail.
///
/// The main utility of a method like this is to avoid compiling
/// regexes that use an unexpected amount of resources, such as
/// time and memory. Even if the memory usage of a large regex is
/// acceptable, its search time may not be. Namely, worst case time
/// complexity for search is `O(m * n)`, where `m ~ len(pattern)` and
/// `n ~ len(haystack)`. That is, search time depends, in part, on the
/// size of the compiled regex. This means that putting a limit on the
/// size of the regex limits how much a regex can impact search time.
///
/// For more information about regex size limits, see the section on
/// [untrusted inputs](crate#untrusted-input) in the top-level crate
/// documentation.
///
/// The default for this is some reasonable number that permits most
/// patterns to compile successfully.
///
/// # Example
///
/// ```
/// # if !cfg!(target_pointer_width = "64") { return; } // see #1041
/// use regex::bytes::RegexSetBuilder;
///
/// // It may surprise you how big some seemingly small patterns can
/// // be! Since \w is Unicode aware, this generates a regex that can
/// // match approximately 140,000 distinct codepoints.
2440 /// assert!( 2441 /// RegexSetBuilder::new([r"\w"]) 2442 /// .size_limit(45_000) 2443 /// .build() 2444 /// .is_err() 2445 /// ); 2446 /// ``` size_limit(&mut self, bytes: usize) -> &mut RegexSetBuilder2447 pub fn size_limit(&mut self, bytes: usize) -> &mut RegexSetBuilder { 2448 self.builder.size_limit(bytes); 2449 self 2450 } 2451 2452 /// Set the approximate capacity, in bytes, of the cache of transitions 2453 /// used by the lazy DFA. 2454 /// 2455 /// While the lazy DFA isn't always used, in tends to be the most 2456 /// commonly use regex engine in default configurations. It tends to 2457 /// adopt the performance profile of a fully build DFA, but without the 2458 /// downside of taking worst case exponential time to build. 2459 /// 2460 /// The downside is that it needs to keep a cache of transitions and 2461 /// states that are built while running a search, and this cache 2462 /// can fill up. When it fills up, the cache will reset itself. Any 2463 /// previously generated states and transitions will then need to be 2464 /// re-generated. If this happens too many times, then this library 2465 /// will bail out of using the lazy DFA and switch to a different regex 2466 /// engine. 2467 /// 2468 /// If your regex provokes this particular downside of the lazy DFA, 2469 /// then it may be beneficial to increase its cache capacity. This will 2470 /// potentially reduce the frequency of cache resetting (ideally to 2471 /// `0`). While it won't fix all potential performance problems with 2472 /// the lazy DFA, increasing the cache capacity does fix some. 2473 /// 2474 /// There is no easy way to determine, a priori, whether increasing 2475 /// this cache capacity will help. In general, the larger your regex, 2476 /// the more cache it's likely to use. But that isn't an ironclad rule. 2477 /// For example, a regex like `[01]*1[01]{N}` would normally produce a 2478 /// fully build DFA that is exponential in size with respect to `N`. 
2479 /// The lazy DFA will prevent exponential space blow-up, but it cache 2480 /// is likely to fill up, even when it's large and even for smallish 2481 /// values of `N`. 2482 /// 2483 /// If you aren't sure whether this helps or not, it is sensible to 2484 /// set this to some arbitrarily large number in testing, such as 2485 /// `usize::MAX`. Namely, this represents the amount of capacity that 2486 /// *may* be used. It's probably not a good idea to use `usize::MAX` in 2487 /// production though, since it implies there are no controls on heap 2488 /// memory used by this library during a search. In effect, set it to 2489 /// whatever you're willing to allocate for a single regex search. dfa_size_limit( &mut self, bytes: usize, ) -> &mut RegexSetBuilder2490 pub fn dfa_size_limit( 2491 &mut self, 2492 bytes: usize, 2493 ) -> &mut RegexSetBuilder { 2494 self.builder.dfa_size_limit(bytes); 2495 self 2496 } 2497 2498 /// Set the nesting limit for this parser. 2499 /// 2500 /// The nesting limit controls how deep the abstract syntax tree is 2501 /// allowed to be. If the AST exceeds the given limit (e.g., with too 2502 /// many nested groups), then an error is returned by the parser. 2503 /// 2504 /// The purpose of this limit is to act as a heuristic to prevent stack 2505 /// overflow for consumers that do structural induction on an AST using 2506 /// explicit recursion. While this crate never does this (instead using 2507 /// constant stack space and moving the call stack to the heap), other 2508 /// crates may. 2509 /// 2510 /// This limit is not checked until the entire AST is parsed. 2511 /// Therefore, if callers want to put a limit on the amount of heap 2512 /// space used, then they should impose a limit on the length, in 2513 /// bytes, of the concrete pattern string. In particular, this is 2514 /// viable since this parser implementation will limit itself to heap 2515 /// space proportional to the length of the pattern string. 
See also 2516 /// the [untrusted inputs](crate#untrusted-input) section in the 2517 /// top-level crate documentation for more information about this. 2518 /// 2519 /// Note that a nest limit of `0` will return a nest limit error for 2520 /// most patterns but not all. For example, a nest limit of `0` permits 2521 /// `a` but not `ab`, since `ab` requires an explicit concatenation, 2522 /// which results in a nest depth of `1`. In general, a nest limit is 2523 /// not something that manifests in an obvious way in the concrete 2524 /// syntax, therefore, it should not be used in a granular way. 2525 /// 2526 /// # Example 2527 /// 2528 /// ``` 2529 /// use regex::bytes::RegexSetBuilder; 2530 /// 2531 /// assert!(RegexSetBuilder::new([r"a"]).nest_limit(0).build().is_ok()); 2532 /// assert!(RegexSetBuilder::new([r"ab"]).nest_limit(0).build().is_err()); 2533 /// ``` nest_limit(&mut self, limit: u32) -> &mut RegexSetBuilder2534 pub fn nest_limit(&mut self, limit: u32) -> &mut RegexSetBuilder { 2535 self.builder.nest_limit(limit); 2536 self 2537 } 2538 } 2539 } 2540