xref: /aosp_15_r20/external/cronet/third_party/rust/chromium_crates_io/vendor/regex-1.10.4/src/builders.rs (revision 6777b5387eb2ff775bb5750e3f5d96f37fb7352b)
1 #![allow(warnings)]
2 
3 // This module defines an internal builder that encapsulates all interaction
4 // with meta::Regex construction, and then 4 public API builders that wrap
5 // around it. The docs are essentially repeated on each of the 4 public
6 // builders, with tweaks to the examples as needed.
7 //
8 // The reason why there are so many builders is partially because of a misstep
9 // in the initial API design: the builder constructor takes in the pattern
10 // strings instead of using the `build` method to accept the pattern strings.
11 // This means `new` has a different signature for each builder. It probably
// would have been nicer to use one builder with `fn new()`, and then add
13 // `build(pat)` and `build_many(pats)` constructors.
14 //
15 // The other reason is because I think the `bytes` module should probably
16 // have its own builder type. That way, it is completely isolated from the
17 // top-level API.
18 //
19 // If I could do it again, I'd probably have a `regex::Builder` and a
20 // `regex::bytes::Builder`. Each would have `build` and `build_set` (or
21 // `build_many`) methods for constructing a single pattern `Regex` and a
22 // multi-pattern `RegexSet`, respectively.
23 
24 use alloc::{
25     string::{String, ToString},
26     sync::Arc,
27     vec,
28     vec::Vec,
29 };
30 
31 use regex_automata::{
32     meta, nfa::thompson::WhichCaptures, util::syntax, MatchKind,
33 };
34 
35 use crate::error::Error;
36 
/// A builder for constructing a `Regex`, `bytes::Regex`, `RegexSet` or a
/// `bytes::RegexSet`.
///
/// This is essentially the implementation of the four different builder types
/// in the public API: `RegexBuilder`, `bytes::RegexBuilder`, `RegexSetBuilder`
/// and `bytes::RegexSetBuilder`.
#[derive(Clone, Debug)]
struct Builder {
    // The pattern strings given to `Builder::new`, stored as owned copies.
    // The `build_one_*` methods assert that there is exactly one.
    pats: Vec<String>,
    // Configuration handed to `meta::Builder::configure` at build time. The
    // size limits and the line terminator are recorded here.
    metac: meta::Config,
    // Configuration handed to `meta::Builder::syntax` at build time. The
    // flag-like options (case insensitivity, multi-line, etc.) are recorded
    // here.
    syntaxc: syntax::Config,
}
49 
50 impl Default for Builder {
default() -> Builder51     fn default() -> Builder {
52         let metac = meta::Config::new()
53             .nfa_size_limit(Some(10 * (1 << 20)))
54             .hybrid_cache_capacity(2 * (1 << 20));
55         Builder { pats: vec![], metac, syntaxc: syntax::Config::default() }
56     }
57 }
58 
59 impl Builder {
new<I, S>(patterns: I) -> Builder where S: AsRef<str>, I: IntoIterator<Item = S>,60     fn new<I, S>(patterns: I) -> Builder
61     where
62         S: AsRef<str>,
63         I: IntoIterator<Item = S>,
64     {
65         let mut b = Builder::default();
66         b.pats.extend(patterns.into_iter().map(|p| p.as_ref().to_string()));
67         b
68     }
69 
build_one_string(&self) -> Result<crate::Regex, Error>70     fn build_one_string(&self) -> Result<crate::Regex, Error> {
71         assert_eq!(1, self.pats.len());
72         let metac = self
73             .metac
74             .clone()
75             .match_kind(MatchKind::LeftmostFirst)
76             .utf8_empty(true);
77         let syntaxc = self.syntaxc.clone().utf8(true);
78         let pattern = Arc::from(self.pats[0].as_str());
79         meta::Builder::new()
80             .configure(metac)
81             .syntax(syntaxc)
82             .build(&pattern)
83             .map(|meta| crate::Regex { meta, pattern })
84             .map_err(Error::from_meta_build_error)
85     }
86 
build_one_bytes(&self) -> Result<crate::bytes::Regex, Error>87     fn build_one_bytes(&self) -> Result<crate::bytes::Regex, Error> {
88         assert_eq!(1, self.pats.len());
89         let metac = self
90             .metac
91             .clone()
92             .match_kind(MatchKind::LeftmostFirst)
93             .utf8_empty(false);
94         let syntaxc = self.syntaxc.clone().utf8(false);
95         let pattern = Arc::from(self.pats[0].as_str());
96         meta::Builder::new()
97             .configure(metac)
98             .syntax(syntaxc)
99             .build(&pattern)
100             .map(|meta| crate::bytes::Regex { meta, pattern })
101             .map_err(Error::from_meta_build_error)
102     }
103 
build_many_string(&self) -> Result<crate::RegexSet, Error>104     fn build_many_string(&self) -> Result<crate::RegexSet, Error> {
105         let metac = self
106             .metac
107             .clone()
108             .match_kind(MatchKind::All)
109             .utf8_empty(true)
110             .which_captures(WhichCaptures::None);
111         let syntaxc = self.syntaxc.clone().utf8(true);
112         let patterns = Arc::from(self.pats.as_slice());
113         meta::Builder::new()
114             .configure(metac)
115             .syntax(syntaxc)
116             .build_many(&patterns)
117             .map(|meta| crate::RegexSet { meta, patterns })
118             .map_err(Error::from_meta_build_error)
119     }
120 
build_many_bytes(&self) -> Result<crate::bytes::RegexSet, Error>121     fn build_many_bytes(&self) -> Result<crate::bytes::RegexSet, Error> {
122         let metac = self
123             .metac
124             .clone()
125             .match_kind(MatchKind::All)
126             .utf8_empty(false)
127             .which_captures(WhichCaptures::None);
128         let syntaxc = self.syntaxc.clone().utf8(false);
129         let patterns = Arc::from(self.pats.as_slice());
130         meta::Builder::new()
131             .configure(metac)
132             .syntax(syntaxc)
133             .build_many(&patterns)
134             .map(|meta| crate::bytes::RegexSet { meta, patterns })
135             .map_err(Error::from_meta_build_error)
136     }
137 
case_insensitive(&mut self, yes: bool) -> &mut Builder138     fn case_insensitive(&mut self, yes: bool) -> &mut Builder {
139         self.syntaxc = self.syntaxc.case_insensitive(yes);
140         self
141     }
142 
multi_line(&mut self, yes: bool) -> &mut Builder143     fn multi_line(&mut self, yes: bool) -> &mut Builder {
144         self.syntaxc = self.syntaxc.multi_line(yes);
145         self
146     }
147 
dot_matches_new_line(&mut self, yes: bool) -> &mut Builder148     fn dot_matches_new_line(&mut self, yes: bool) -> &mut Builder {
149         self.syntaxc = self.syntaxc.dot_matches_new_line(yes);
150         self
151     }
152 
crlf(&mut self, yes: bool) -> &mut Builder153     fn crlf(&mut self, yes: bool) -> &mut Builder {
154         self.syntaxc = self.syntaxc.crlf(yes);
155         self
156     }
157 
line_terminator(&mut self, byte: u8) -> &mut Builder158     fn line_terminator(&mut self, byte: u8) -> &mut Builder {
159         self.metac = self.metac.clone().line_terminator(byte);
160         self.syntaxc = self.syntaxc.line_terminator(byte);
161         self
162     }
163 
swap_greed(&mut self, yes: bool) -> &mut Builder164     fn swap_greed(&mut self, yes: bool) -> &mut Builder {
165         self.syntaxc = self.syntaxc.swap_greed(yes);
166         self
167     }
168 
ignore_whitespace(&mut self, yes: bool) -> &mut Builder169     fn ignore_whitespace(&mut self, yes: bool) -> &mut Builder {
170         self.syntaxc = self.syntaxc.ignore_whitespace(yes);
171         self
172     }
173 
unicode(&mut self, yes: bool) -> &mut Builder174     fn unicode(&mut self, yes: bool) -> &mut Builder {
175         self.syntaxc = self.syntaxc.unicode(yes);
176         self
177     }
178 
octal(&mut self, yes: bool) -> &mut Builder179     fn octal(&mut self, yes: bool) -> &mut Builder {
180         self.syntaxc = self.syntaxc.octal(yes);
181         self
182     }
183 
size_limit(&mut self, limit: usize) -> &mut Builder184     fn size_limit(&mut self, limit: usize) -> &mut Builder {
185         self.metac = self.metac.clone().nfa_size_limit(Some(limit));
186         self
187     }
188 
dfa_size_limit(&mut self, limit: usize) -> &mut Builder189     fn dfa_size_limit(&mut self, limit: usize) -> &mut Builder {
190         self.metac = self.metac.clone().hybrid_cache_capacity(limit);
191         self
192     }
193 
nest_limit(&mut self, limit: u32) -> &mut Builder194     fn nest_limit(&mut self, limit: u32) -> &mut Builder {
195         self.syntaxc = self.syntaxc.nest_limit(limit);
196         self
197     }
198 }
199 
200 pub(crate) mod string {
201     use crate::{error::Error, Regex, RegexSet};
202 
203     use super::Builder;
204 
    /// A configurable builder for a [`Regex`].
    ///
    /// This builder can be used to programmatically set flags such as `i`
    /// (case insensitive) and `x` (for verbose mode). This builder can also be
    /// used to configure things like the line terminator and a size limit on
    /// the compiled regular expression.
    #[derive(Clone, Debug)]
    pub struct RegexBuilder {
        // Internal builder that holds the pattern and the configuration; the
        // public methods on this type forward to it.
        builder: Builder,
    }
215 
216     impl RegexBuilder {
217         /// Create a new builder with a default configuration for the given
218         /// pattern.
219         ///
220         /// If the pattern is invalid or exceeds the configured size limits,
221         /// then an error will be returned when [`RegexBuilder::build`] is
222         /// called.
new(pattern: &str) -> RegexBuilder223         pub fn new(pattern: &str) -> RegexBuilder {
224             RegexBuilder { builder: Builder::new([pattern]) }
225         }
226 
227         /// Compiles the pattern given to `RegexBuilder::new` with the
228         /// configuration set on this builder.
229         ///
230         /// If the pattern isn't a valid regex or if a configured size limit
231         /// was exceeded, then an error is returned.
build(&self) -> Result<Regex, Error>232         pub fn build(&self) -> Result<Regex, Error> {
233             self.builder.build_one_string()
234         }
235 
236         /// This configures Unicode mode for the entire pattern.
237         ///
238         /// Enabling Unicode mode does a number of things:
239         ///
240         /// * Most fundamentally, it causes the fundamental atom of matching
241         /// to be a single codepoint. When Unicode mode is disabled, it's a
242         /// single byte. For example, when Unicode mode is enabled, `.` will
        /// match `💩` once, whereas it will match 4 times when Unicode mode
        /// is disabled. (Since the UTF-8 encoding of `💩` is 4 bytes long.)
245         /// * Case insensitive matching uses Unicode simple case folding rules.
246         /// * Unicode character classes like `\p{Letter}` and `\p{Greek}` are
247         /// available.
248         /// * Perl character classes are Unicode aware. That is, `\w`, `\s` and
249         /// `\d`.
250         /// * The word boundary assertions, `\b` and `\B`, use the Unicode
251         /// definition of a word character.
252         ///
253         /// Note that if Unicode mode is disabled, then the regex will fail to
254         /// compile if it could match invalid UTF-8. For example, when Unicode
255         /// mode is disabled, then since `.` matches any byte (except for
256         /// `\n`), then it can match invalid UTF-8 and thus building a regex
257         /// from it will fail. Another example is `\w` and `\W`. Since `\w` can
258         /// only match ASCII bytes when Unicode mode is disabled, it's allowed.
259         /// But `\W` can match more than ASCII bytes, including invalid UTF-8,
260         /// and so it is not allowed. This restriction can be lifted only by
261         /// using a [`bytes::Regex`](crate::bytes::Regex).
262         ///
263         /// For more details on the Unicode support in this crate, see the
264         /// [Unicode section](crate#unicode) in this crate's top-level
265         /// documentation.
266         ///
267         /// The default for this is `true`.
268         ///
269         /// # Example
270         ///
271         /// ```
272         /// use regex::RegexBuilder;
273         ///
274         /// let re = RegexBuilder::new(r"\w")
275         ///     .unicode(false)
276         ///     .build()
277         ///     .unwrap();
278         /// // Normally greek letters would be included in \w, but since
279         /// // Unicode mode is disabled, it only matches ASCII letters.
280         /// assert!(!re.is_match("δ"));
281         ///
282         /// let re = RegexBuilder::new(r"s")
283         ///     .case_insensitive(true)
284         ///     .unicode(false)
285         ///     .build()
286         ///     .unwrap();
287         /// // Normally 'ſ' is included when searching for 's' case
288         /// // insensitively due to Unicode's simple case folding rules. But
289         /// // when Unicode mode is disabled, only ASCII case insensitive rules
290         /// // are used.
291         /// assert!(!re.is_match("ſ"));
292         /// ```
unicode(&mut self, yes: bool) -> &mut RegexBuilder293         pub fn unicode(&mut self, yes: bool) -> &mut RegexBuilder {
294             self.builder.unicode(yes);
295             self
296         }
297 
298         /// This configures whether to enable case insensitive matching for the
299         /// entire pattern.
300         ///
301         /// This setting can also be configured using the inline flag `i`
302         /// in the pattern. For example, `(?i:foo)` matches `foo` case
303         /// insensitively while `(?-i:foo)` matches `foo` case sensitively.
304         ///
305         /// The default for this is `false`.
306         ///
307         /// # Example
308         ///
309         /// ```
310         /// use regex::RegexBuilder;
311         ///
312         /// let re = RegexBuilder::new(r"foo(?-i:bar)quux")
313         ///     .case_insensitive(true)
314         ///     .build()
315         ///     .unwrap();
316         /// assert!(re.is_match("FoObarQuUx"));
317         /// // Even though case insensitive matching is enabled in the builder,
318         /// // it can be locally disabled within the pattern. In this case,
319         /// // `bar` is matched case sensitively.
320         /// assert!(!re.is_match("fooBARquux"));
321         /// ```
case_insensitive(&mut self, yes: bool) -> &mut RegexBuilder322         pub fn case_insensitive(&mut self, yes: bool) -> &mut RegexBuilder {
323             self.builder.case_insensitive(yes);
324             self
325         }
326 
327         /// This configures multi-line mode for the entire pattern.
328         ///
329         /// Enabling multi-line mode changes the behavior of the `^` and `$`
330         /// anchor assertions. Instead of only matching at the beginning and
331         /// end of a haystack, respectively, multi-line mode causes them to
332         /// match at the beginning and end of a line *in addition* to the
333         /// beginning and end of a haystack. More precisely, `^` will match at
334         /// the position immediately following a `\n` and `$` will match at the
335         /// position immediately preceding a `\n`.
336         ///
337         /// The behavior of this option can be impacted by other settings too:
338         ///
339         /// * The [`RegexBuilder::line_terminator`] option changes `\n` above
340         /// to any ASCII byte.
341         /// * The [`RegexBuilder::crlf`] option changes the line terminator to
342         /// be either `\r` or `\n`, but never at the position between a `\r`
343         /// and `\n`.
344         ///
345         /// This setting can also be configured using the inline flag `m` in
346         /// the pattern.
347         ///
348         /// The default for this is `false`.
349         ///
350         /// # Example
351         ///
352         /// ```
353         /// use regex::RegexBuilder;
354         ///
355         /// let re = RegexBuilder::new(r"^foo$")
356         ///     .multi_line(true)
357         ///     .build()
358         ///     .unwrap();
359         /// assert_eq!(Some(1..4), re.find("\nfoo\n").map(|m| m.range()));
360         /// ```
multi_line(&mut self, yes: bool) -> &mut RegexBuilder361         pub fn multi_line(&mut self, yes: bool) -> &mut RegexBuilder {
362             self.builder.multi_line(yes);
363             self
364         }
365 
366         /// This configures dot-matches-new-line mode for the entire pattern.
367         ///
368         /// Perhaps surprisingly, the default behavior for `.` is not to match
369         /// any character, but rather, to match any character except for the
370         /// line terminator (which is `\n` by default). When this mode is
371         /// enabled, the behavior changes such that `.` truly matches any
372         /// character.
373         ///
374         /// This setting can also be configured using the inline flag `s` in
375         /// the pattern. For example, `(?s:.)` and `\p{any}` are equivalent
376         /// regexes.
377         ///
378         /// The default for this is `false`.
379         ///
380         /// # Example
381         ///
382         /// ```
383         /// use regex::RegexBuilder;
384         ///
385         /// let re = RegexBuilder::new(r"foo.bar")
386         ///     .dot_matches_new_line(true)
387         ///     .build()
388         ///     .unwrap();
389         /// let hay = "foo\nbar";
390         /// assert_eq!(Some("foo\nbar"), re.find(hay).map(|m| m.as_str()));
391         /// ```
dot_matches_new_line( &mut self, yes: bool, ) -> &mut RegexBuilder392         pub fn dot_matches_new_line(
393             &mut self,
394             yes: bool,
395         ) -> &mut RegexBuilder {
396             self.builder.dot_matches_new_line(yes);
397             self
398         }
399 
400         /// This configures CRLF mode for the entire pattern.
401         ///
402         /// When CRLF mode is enabled, both `\r` ("carriage return" or CR for
403         /// short) and `\n` ("line feed" or LF for short) are treated as line
404         /// terminators. This results in the following:
405         ///
406         /// * Unless dot-matches-new-line mode is enabled, `.` will now match
407         /// any character except for `\n` and `\r`.
408         /// * When multi-line mode is enabled, `^` will match immediately
409         /// following a `\n` or a `\r`. Similarly, `$` will match immediately
410         /// preceding a `\n` or a `\r`. Neither `^` nor `$` will ever match
411         /// between `\r` and `\n`.
412         ///
413         /// This setting can also be configured using the inline flag `R` in
414         /// the pattern.
415         ///
416         /// The default for this is `false`.
417         ///
418         /// # Example
419         ///
420         /// ```
421         /// use regex::RegexBuilder;
422         ///
423         /// let re = RegexBuilder::new(r"^foo$")
424         ///     .multi_line(true)
425         ///     .crlf(true)
426         ///     .build()
427         ///     .unwrap();
428         /// let hay = "\r\nfoo\r\n";
429         /// // If CRLF mode weren't enabled here, then '$' wouldn't match
430         /// // immediately after 'foo', and thus no match would be found.
431         /// assert_eq!(Some("foo"), re.find(hay).map(|m| m.as_str()));
432         /// ```
433         ///
434         /// This example demonstrates that `^` will never match at a position
435         /// between `\r` and `\n`. (`$` will similarly not match between a `\r`
436         /// and a `\n`.)
437         ///
438         /// ```
439         /// use regex::RegexBuilder;
440         ///
441         /// let re = RegexBuilder::new(r"^")
442         ///     .multi_line(true)
443         ///     .crlf(true)
444         ///     .build()
445         ///     .unwrap();
446         /// let hay = "\r\n\r\n";
447         /// let ranges: Vec<_> = re.find_iter(hay).map(|m| m.range()).collect();
448         /// assert_eq!(ranges, vec![0..0, 2..2, 4..4]);
449         /// ```
crlf(&mut self, yes: bool) -> &mut RegexBuilder450         pub fn crlf(&mut self, yes: bool) -> &mut RegexBuilder {
451             self.builder.crlf(yes);
452             self
453         }
454 
455         /// Configures the line terminator to be used by the regex.
456         ///
457         /// The line terminator is relevant in two ways for a particular regex:
458         ///
459         /// * When dot-matches-new-line mode is *not* enabled (the default),
460         /// then `.` will match any character except for the configured line
461         /// terminator.
462         /// * When multi-line mode is enabled (not the default), then `^` and
463         /// `$` will match immediately after and before, respectively, a line
464         /// terminator.
465         ///
466         /// In both cases, if CRLF mode is enabled in a particular context,
467         /// then it takes precedence over any configured line terminator.
468         ///
469         /// This option cannot be configured from within the pattern.
470         ///
471         /// The default line terminator is `\n`.
472         ///
473         /// # Example
474         ///
475         /// This shows how to treat the NUL byte as a line terminator. This can
476         /// be a useful heuristic when searching binary data.
477         ///
478         /// ```
479         /// use regex::RegexBuilder;
480         ///
481         /// let re = RegexBuilder::new(r"^foo$")
482         ///     .multi_line(true)
483         ///     .line_terminator(b'\x00')
484         ///     .build()
485         ///     .unwrap();
486         /// let hay = "\x00foo\x00";
487         /// assert_eq!(Some(1..4), re.find(hay).map(|m| m.range()));
488         /// ```
489         ///
490         /// This example shows that the behavior of `.` is impacted by this
491         /// setting as well:
492         ///
493         /// ```
494         /// use regex::RegexBuilder;
495         ///
496         /// let re = RegexBuilder::new(r".")
497         ///     .line_terminator(b'\x00')
498         ///     .build()
499         ///     .unwrap();
500         /// assert!(re.is_match("\n"));
501         /// assert!(!re.is_match("\x00"));
502         /// ```
503         ///
504         /// This shows that building a regex will fail if the byte given
505         /// is not ASCII and the pattern could result in matching invalid
506         /// UTF-8. This is because any singular non-ASCII byte is not valid
507         /// UTF-8, and it is not permitted for a [`Regex`] to match invalid
508         /// UTF-8. (It is permissible to use a non-ASCII byte when building a
509         /// [`bytes::Regex`](crate::bytes::Regex).)
510         ///
511         /// ```
512         /// use regex::RegexBuilder;
513         ///
514         /// assert!(RegexBuilder::new(r".").line_terminator(0x80).build().is_err());
515         /// // Note that using a non-ASCII byte isn't enough on its own to
516         /// // cause regex compilation to fail. You actually have to make use
517         /// // of it in the regex in a way that leads to matching invalid
518         /// // UTF-8. If you don't, then regex compilation will succeed!
519         /// assert!(RegexBuilder::new(r"a").line_terminator(0x80).build().is_ok());
520         /// ```
line_terminator(&mut self, byte: u8) -> &mut RegexBuilder521         pub fn line_terminator(&mut self, byte: u8) -> &mut RegexBuilder {
522             self.builder.line_terminator(byte);
523             self
524         }
525 
526         /// This configures swap-greed mode for the entire pattern.
527         ///
528         /// When swap-greed mode is enabled, patterns like `a+` will become
529         /// non-greedy and patterns like `a+?` will become greedy. In other
530         /// words, the meanings of `a+` and `a+?` are switched.
531         ///
532         /// This setting can also be configured using the inline flag `U` in
533         /// the pattern.
534         ///
535         /// The default for this is `false`.
536         ///
537         /// # Example
538         ///
539         /// ```
540         /// use regex::RegexBuilder;
541         ///
542         /// let re = RegexBuilder::new(r"a+")
543         ///     .swap_greed(true)
544         ///     .build()
545         ///     .unwrap();
546         /// assert_eq!(Some("a"), re.find("aaa").map(|m| m.as_str()));
547         /// ```
swap_greed(&mut self, yes: bool) -> &mut RegexBuilder548         pub fn swap_greed(&mut self, yes: bool) -> &mut RegexBuilder {
549             self.builder.swap_greed(yes);
550             self
551         }
552 
553         /// This configures verbose mode for the entire pattern.
554         ///
        /// When enabled, whitespace will be treated as insignificant in the
556         /// pattern and `#` can be used to start a comment until the next new
557         /// line.
558         ///
559         /// Normally, in most places in a pattern, whitespace is treated
560         /// literally. For example ` +` will match one or more ASCII whitespace
561         /// characters.
562         ///
563         /// When verbose mode is enabled, `\#` can be used to match a literal
564         /// `#` and `\ ` can be used to match a literal ASCII whitespace
565         /// character.
566         ///
567         /// Verbose mode is useful for permitting regexes to be formatted and
568         /// broken up more nicely. This may make them more easily readable.
569         ///
570         /// This setting can also be configured using the inline flag `x` in
571         /// the pattern.
572         ///
573         /// The default for this is `false`.
574         ///
575         /// # Example
576         ///
577         /// ```
578         /// use regex::RegexBuilder;
579         ///
580         /// let pat = r"
581         ///     \b
582         ///     (?<first>\p{Uppercase}\w*)  # always start with uppercase letter
583         ///     [\s--\n]+                   # whitespace should separate names
584         ///     (?: # middle name can be an initial!
585         ///         (?:(?<initial>\p{Uppercase})\.|(?<middle>\p{Uppercase}\w*))
586         ///         [\s--\n]+
587         ///     )?
588         ///     (?<last>\p{Uppercase}\w*)
589         ///     \b
590         /// ";
591         /// let re = RegexBuilder::new(pat)
592         ///     .ignore_whitespace(true)
593         ///     .build()
594         ///     .unwrap();
595         ///
596         /// let caps = re.captures("Harry Potter").unwrap();
597         /// assert_eq!("Harry", &caps["first"]);
598         /// assert_eq!("Potter", &caps["last"]);
599         ///
600         /// let caps = re.captures("Harry J. Potter").unwrap();
601         /// assert_eq!("Harry", &caps["first"]);
602         /// // Since a middle name/initial isn't required for an overall match,
603         /// // we can't assume that 'initial' or 'middle' will be populated!
604         /// assert_eq!(Some("J"), caps.name("initial").map(|m| m.as_str()));
605         /// assert_eq!(None, caps.name("middle").map(|m| m.as_str()));
606         /// assert_eq!("Potter", &caps["last"]);
607         ///
608         /// let caps = re.captures("Harry James Potter").unwrap();
609         /// assert_eq!("Harry", &caps["first"]);
610         /// // Since a middle name/initial isn't required for an overall match,
611         /// // we can't assume that 'initial' or 'middle' will be populated!
612         /// assert_eq!(None, caps.name("initial").map(|m| m.as_str()));
613         /// assert_eq!(Some("James"), caps.name("middle").map(|m| m.as_str()));
614         /// assert_eq!("Potter", &caps["last"]);
615         /// ```
ignore_whitespace(&mut self, yes: bool) -> &mut RegexBuilder616         pub fn ignore_whitespace(&mut self, yes: bool) -> &mut RegexBuilder {
617             self.builder.ignore_whitespace(yes);
618             self
619         }
620 
621         /// This configures octal mode for the entire pattern.
622         ///
623         /// Octal syntax is a little-known way of uttering Unicode codepoints
624         /// in a pattern. For example, `a`, `\x61`, `\u0061` and `\141` are all
625         /// equivalent patterns, where the last example shows octal syntax.
626         ///
627         /// While supporting octal syntax isn't in and of itself a problem,
628         /// it does make good error messages harder. That is, in PCRE based
629         /// regex engines, syntax like `\1` invokes a backreference, which is
        /// explicitly unsupported by this library. However, many users expect
631         /// backreferences to be supported. Therefore, when octal support
632         /// is disabled, the error message will explicitly mention that
633         /// backreferences aren't supported.
634         ///
635         /// The default for this is `false`.
636         ///
637         /// # Example
638         ///
639         /// ```
640         /// use regex::RegexBuilder;
641         ///
642         /// // Normally this pattern would not compile, with an error message
643         /// // about backreferences not being supported. But with octal mode
644         /// // enabled, octal escape sequences work.
645         /// let re = RegexBuilder::new(r"\141")
646         ///     .octal(true)
647         ///     .build()
648         ///     .unwrap();
649         /// assert!(re.is_match("a"));
650         /// ```
octal(&mut self, yes: bool) -> &mut RegexBuilder651         pub fn octal(&mut self, yes: bool) -> &mut RegexBuilder {
652             self.builder.octal(yes);
653             self
654         }
655 
656         /// Sets the approximate size limit, in bytes, of the compiled regex.
657         ///
        /// This roughly corresponds to the amount of heap memory, in
659         /// bytes, occupied by a single regex. If the regex would otherwise
660         /// approximately exceed this limit, then compiling that regex will
661         /// fail.
662         ///
663         /// The main utility of a method like this is to avoid compiling
664         /// regexes that use an unexpected amount of resources, such as
665         /// time and memory. Even if the memory usage of a large regex is
666         /// acceptable, its search time may not be. Namely, worst case time
667         /// complexity for search is `O(m * n)`, where `m ~ len(pattern)` and
668         /// `n ~ len(haystack)`. That is, search time depends, in part, on the
669         /// size of the compiled regex. This means that putting a limit on the
670         /// size of the regex limits how much a regex can impact search time.
671         ///
672         /// For more information about regex size limits, see the section on
673         /// [untrusted inputs](crate#untrusted-input) in the top-level crate
674         /// documentation.
675         ///
676         /// The default for this is some reasonable number that permits most
677         /// patterns to compile successfully.
678         ///
679         /// # Example
680         ///
681         /// ```
682         /// # if !cfg!(target_pointer_width = "64") { return; } // see #1041
683         /// use regex::RegexBuilder;
684         ///
685         /// // It may surprise you how big some seemingly small patterns can
686         /// // be! Since \w is Unicode aware, this generates a regex that can
687         /// // match approximately 140,000 distinct codepoints.
688         /// assert!(RegexBuilder::new(r"\w").size_limit(45_000).build().is_err());
689         /// ```
size_limit(&mut self, bytes: usize) -> &mut RegexBuilder690         pub fn size_limit(&mut self, bytes: usize) -> &mut RegexBuilder {
691             self.builder.size_limit(bytes);
692             self
693         }
694 
695         /// Set the approximate capacity, in bytes, of the cache of transitions
696         /// used by the lazy DFA.
697         ///
        /// While the lazy DFA isn't always used, it tends to be the most
        /// commonly used regex engine in default configurations. It tends to
        /// adopt the performance profile of a fully built DFA, but without the
701         /// downside of taking worst case exponential time to build.
702         ///
703         /// The downside is that it needs to keep a cache of transitions and
704         /// states that are built while running a search, and this cache
705         /// can fill up. When it fills up, the cache will reset itself. Any
706         /// previously generated states and transitions will then need to be
707         /// re-generated. If this happens too many times, then this library
708         /// will bail out of using the lazy DFA and switch to a different regex
709         /// engine.
710         ///
711         /// If your regex provokes this particular downside of the lazy DFA,
712         /// then it may be beneficial to increase its cache capacity. This will
713         /// potentially reduce the frequency of cache resetting (ideally to
714         /// `0`). While it won't fix all potential performance problems with
715         /// the lazy DFA, increasing the cache capacity does fix some.
716         ///
717         /// There is no easy way to determine, a priori, whether increasing
718         /// this cache capacity will help. In general, the larger your regex,
719         /// the more cache it's likely to use. But that isn't an ironclad rule.
720         /// For example, a regex like `[01]*1[01]{N}` would normally produce a
        /// fully built DFA that is exponential in size with respect to `N`.
        /// The lazy DFA will prevent exponential space blow-up, but its cache
723         /// is likely to fill up, even when it's large and even for smallish
724         /// values of `N`.
725         ///
726         /// If you aren't sure whether this helps or not, it is sensible to
727         /// set this to some arbitrarily large number in testing, such as
728         /// `usize::MAX`. Namely, this represents the amount of capacity that
729         /// *may* be used. It's probably not a good idea to use `usize::MAX` in
730         /// production though, since it implies there are no controls on heap
731         /// memory used by this library during a search. In effect, set it to
732         /// whatever you're willing to allocate for a single regex search.
dfa_size_limit(&mut self, bytes: usize) -> &mut RegexBuilder733         pub fn dfa_size_limit(&mut self, bytes: usize) -> &mut RegexBuilder {
734             self.builder.dfa_size_limit(bytes);
735             self
736         }
737 
738         /// Set the nesting limit for this parser.
739         ///
740         /// The nesting limit controls how deep the abstract syntax tree is
741         /// allowed to be. If the AST exceeds the given limit (e.g., with too
742         /// many nested groups), then an error is returned by the parser.
743         ///
744         /// The purpose of this limit is to act as a heuristic to prevent stack
745         /// overflow for consumers that do structural induction on an AST using
746         /// explicit recursion. While this crate never does this (instead using
747         /// constant stack space and moving the call stack to the heap), other
748         /// crates may.
749         ///
750         /// This limit is not checked until the entire AST is parsed.
751         /// Therefore, if callers want to put a limit on the amount of heap
752         /// space used, then they should impose a limit on the length, in
753         /// bytes, of the concrete pattern string. In particular, this is
754         /// viable since this parser implementation will limit itself to heap
755         /// space proportional to the length of the pattern string. See also
756         /// the [untrusted inputs](crate#untrusted-input) section in the
757         /// top-level crate documentation for more information about this.
758         ///
759         /// Note that a nest limit of `0` will return a nest limit error for
760         /// most patterns but not all. For example, a nest limit of `0` permits
761         /// `a` but not `ab`, since `ab` requires an explicit concatenation,
762         /// which results in a nest depth of `1`. In general, a nest limit is
763         /// not something that manifests in an obvious way in the concrete
764         /// syntax, therefore, it should not be used in a granular way.
765         ///
766         /// # Example
767         ///
768         /// ```
769         /// use regex::RegexBuilder;
770         ///
771         /// assert!(RegexBuilder::new(r"a").nest_limit(0).build().is_ok());
772         /// assert!(RegexBuilder::new(r"ab").nest_limit(0).build().is_err());
773         /// ```
nest_limit(&mut self, limit: u32) -> &mut RegexBuilder774         pub fn nest_limit(&mut self, limit: u32) -> &mut RegexBuilder {
775             self.builder.nest_limit(limit);
776             self
777         }
778     }
779 
780     /// A configurable builder for a [`RegexSet`].
781     ///
782     /// This builder can be used to programmatically set flags such as
783     /// `i` (case insensitive) and `x` (for verbose mode). This builder
784     /// can also be used to configure things like the line terminator
785     /// and a size limit on the compiled regular expression.
786     #[derive(Clone, Debug)]
787     pub struct RegexSetBuilder {
788         builder: Builder,
789     }
790 
791     impl RegexSetBuilder {
792         /// Create a new builder with a default configuration for the given
793         /// patterns.
794         ///
795         /// If the patterns are invalid or exceed the configured size limits,
796         /// then an error will be returned when [`RegexSetBuilder::build`] is
797         /// called.
new<I, S>(patterns: I) -> RegexSetBuilder where I: IntoIterator<Item = S>, S: AsRef<str>,798         pub fn new<I, S>(patterns: I) -> RegexSetBuilder
799         where
800             I: IntoIterator<Item = S>,
801             S: AsRef<str>,
802         {
803             RegexSetBuilder { builder: Builder::new(patterns) }
804         }
805 
806         /// Compiles the patterns given to `RegexSetBuilder::new` with the
807         /// configuration set on this builder.
808         ///
809         /// If the patterns aren't valid regexes or if a configured size limit
810         /// was exceeded, then an error is returned.
build(&self) -> Result<RegexSet, Error>811         pub fn build(&self) -> Result<RegexSet, Error> {
812             self.builder.build_many_string()
813         }
814 
        /// This configures Unicode mode for all of the patterns.
816         ///
817         /// Enabling Unicode mode does a number of things:
818         ///
819         /// * Most fundamentally, it causes the fundamental atom of matching
820         /// to be a single codepoint. When Unicode mode is disabled, it's a
821         /// single byte. For example, when Unicode mode is enabled, `.` will
        /// match `💩` once, whereas it will match 4 times when Unicode mode
        /// is disabled. (Since the UTF-8 encoding of `💩` is 4 bytes long.)
824         /// * Case insensitive matching uses Unicode simple case folding rules.
825         /// * Unicode character classes like `\p{Letter}` and `\p{Greek}` are
826         /// available.
827         /// * Perl character classes are Unicode aware. That is, `\w`, `\s` and
828         /// `\d`.
829         /// * The word boundary assertions, `\b` and `\B`, use the Unicode
830         /// definition of a word character.
831         ///
832         /// Note that if Unicode mode is disabled, then the regex will fail to
833         /// compile if it could match invalid UTF-8. For example, when Unicode
834         /// mode is disabled, then since `.` matches any byte (except for
835         /// `\n`), then it can match invalid UTF-8 and thus building a regex
836         /// from it will fail. Another example is `\w` and `\W`. Since `\w` can
837         /// only match ASCII bytes when Unicode mode is disabled, it's allowed.
838         /// But `\W` can match more than ASCII bytes, including invalid UTF-8,
839         /// and so it is not allowed. This restriction can be lifted only by
840         /// using a [`bytes::RegexSet`](crate::bytes::RegexSet).
841         ///
842         /// For more details on the Unicode support in this crate, see the
843         /// [Unicode section](crate#unicode) in this crate's top-level
844         /// documentation.
845         ///
846         /// The default for this is `true`.
847         ///
848         /// # Example
849         ///
850         /// ```
851         /// use regex::RegexSetBuilder;
852         ///
853         /// let re = RegexSetBuilder::new([r"\w"])
854         ///     .unicode(false)
855         ///     .build()
856         ///     .unwrap();
857         /// // Normally greek letters would be included in \w, but since
858         /// // Unicode mode is disabled, it only matches ASCII letters.
859         /// assert!(!re.is_match("δ"));
860         ///
861         /// let re = RegexSetBuilder::new([r"s"])
862         ///     .case_insensitive(true)
863         ///     .unicode(false)
864         ///     .build()
865         ///     .unwrap();
866         /// // Normally 'ſ' is included when searching for 's' case
867         /// // insensitively due to Unicode's simple case folding rules. But
868         /// // when Unicode mode is disabled, only ASCII case insensitive rules
869         /// // are used.
870         /// assert!(!re.is_match("ſ"));
871         /// ```
unicode(&mut self, yes: bool) -> &mut RegexSetBuilder872         pub fn unicode(&mut self, yes: bool) -> &mut RegexSetBuilder {
873             self.builder.unicode(yes);
874             self
875         }
876 
877         /// This configures whether to enable case insensitive matching for all
878         /// of the patterns.
879         ///
880         /// This setting can also be configured using the inline flag `i`
881         /// in the pattern. For example, `(?i:foo)` matches `foo` case
882         /// insensitively while `(?-i:foo)` matches `foo` case sensitively.
883         ///
884         /// The default for this is `false`.
885         ///
886         /// # Example
887         ///
888         /// ```
889         /// use regex::RegexSetBuilder;
890         ///
891         /// let re = RegexSetBuilder::new([r"foo(?-i:bar)quux"])
892         ///     .case_insensitive(true)
893         ///     .build()
894         ///     .unwrap();
895         /// assert!(re.is_match("FoObarQuUx"));
896         /// // Even though case insensitive matching is enabled in the builder,
897         /// // it can be locally disabled within the pattern. In this case,
898         /// // `bar` is matched case sensitively.
899         /// assert!(!re.is_match("fooBARquux"));
900         /// ```
case_insensitive(&mut self, yes: bool) -> &mut RegexSetBuilder901         pub fn case_insensitive(&mut self, yes: bool) -> &mut RegexSetBuilder {
902             self.builder.case_insensitive(yes);
903             self
904         }
905 
906         /// This configures multi-line mode for all of the patterns.
907         ///
908         /// Enabling multi-line mode changes the behavior of the `^` and `$`
909         /// anchor assertions. Instead of only matching at the beginning and
910         /// end of a haystack, respectively, multi-line mode causes them to
911         /// match at the beginning and end of a line *in addition* to the
912         /// beginning and end of a haystack. More precisely, `^` will match at
913         /// the position immediately following a `\n` and `$` will match at the
914         /// position immediately preceding a `\n`.
915         ///
916         /// The behavior of this option can be impacted by other settings too:
917         ///
918         /// * The [`RegexSetBuilder::line_terminator`] option changes `\n`
919         /// above to any ASCII byte.
920         /// * The [`RegexSetBuilder::crlf`] option changes the line terminator
921         /// to be either `\r` or `\n`, but never at the position between a `\r`
922         /// and `\n`.
923         ///
924         /// This setting can also be configured using the inline flag `m` in
925         /// the pattern.
926         ///
927         /// The default for this is `false`.
928         ///
929         /// # Example
930         ///
931         /// ```
932         /// use regex::RegexSetBuilder;
933         ///
934         /// let re = RegexSetBuilder::new([r"^foo$"])
935         ///     .multi_line(true)
936         ///     .build()
937         ///     .unwrap();
938         /// assert!(re.is_match("\nfoo\n"));
939         /// ```
multi_line(&mut self, yes: bool) -> &mut RegexSetBuilder940         pub fn multi_line(&mut self, yes: bool) -> &mut RegexSetBuilder {
941             self.builder.multi_line(yes);
942             self
943         }
944 
        /// This configures dot-matches-new-line mode for all of the patterns.
946         ///
947         /// Perhaps surprisingly, the default behavior for `.` is not to match
948         /// any character, but rather, to match any character except for the
949         /// line terminator (which is `\n` by default). When this mode is
950         /// enabled, the behavior changes such that `.` truly matches any
951         /// character.
952         ///
953         /// This setting can also be configured using the inline flag `s` in
954         /// the pattern. For example, `(?s:.)` and `\p{any}` are equivalent
955         /// regexes.
956         ///
957         /// The default for this is `false`.
958         ///
959         /// # Example
960         ///
961         /// ```
962         /// use regex::RegexSetBuilder;
963         ///
964         /// let re = RegexSetBuilder::new([r"foo.bar"])
965         ///     .dot_matches_new_line(true)
966         ///     .build()
967         ///     .unwrap();
968         /// let hay = "foo\nbar";
969         /// assert!(re.is_match(hay));
970         /// ```
dot_matches_new_line( &mut self, yes: bool, ) -> &mut RegexSetBuilder971         pub fn dot_matches_new_line(
972             &mut self,
973             yes: bool,
974         ) -> &mut RegexSetBuilder {
975             self.builder.dot_matches_new_line(yes);
976             self
977         }
978 
979         /// This configures CRLF mode for all of the patterns.
980         ///
981         /// When CRLF mode is enabled, both `\r` ("carriage return" or CR for
982         /// short) and `\n` ("line feed" or LF for short) are treated as line
983         /// terminators. This results in the following:
984         ///
985         /// * Unless dot-matches-new-line mode is enabled, `.` will now match
986         /// any character except for `\n` and `\r`.
987         /// * When multi-line mode is enabled, `^` will match immediately
988         /// following a `\n` or a `\r`. Similarly, `$` will match immediately
989         /// preceding a `\n` or a `\r`. Neither `^` nor `$` will ever match
990         /// between `\r` and `\n`.
991         ///
992         /// This setting can also be configured using the inline flag `R` in
993         /// the pattern.
994         ///
995         /// The default for this is `false`.
996         ///
997         /// # Example
998         ///
999         /// ```
1000         /// use regex::RegexSetBuilder;
1001         ///
1002         /// let re = RegexSetBuilder::new([r"^foo$"])
1003         ///     .multi_line(true)
1004         ///     .crlf(true)
1005         ///     .build()
1006         ///     .unwrap();
1007         /// let hay = "\r\nfoo\r\n";
1008         /// // If CRLF mode weren't enabled here, then '$' wouldn't match
1009         /// // immediately after 'foo', and thus no match would be found.
1010         /// assert!(re.is_match(hay));
1011         /// ```
1012         ///
1013         /// This example demonstrates that `^` will never match at a position
1014         /// between `\r` and `\n`. (`$` will similarly not match between a `\r`
1015         /// and a `\n`.)
1016         ///
1017         /// ```
1018         /// use regex::RegexSetBuilder;
1019         ///
1020         /// let re = RegexSetBuilder::new([r"^\n"])
1021         ///     .multi_line(true)
1022         ///     .crlf(true)
1023         ///     .build()
1024         ///     .unwrap();
1025         /// assert!(!re.is_match("\r\n"));
1026         /// ```
crlf(&mut self, yes: bool) -> &mut RegexSetBuilder1027         pub fn crlf(&mut self, yes: bool) -> &mut RegexSetBuilder {
1028             self.builder.crlf(yes);
1029             self
1030         }
1031 
1032         /// Configures the line terminator to be used by the regex.
1033         ///
1034         /// The line terminator is relevant in two ways for a particular regex:
1035         ///
1036         /// * When dot-matches-new-line mode is *not* enabled (the default),
1037         /// then `.` will match any character except for the configured line
1038         /// terminator.
1039         /// * When multi-line mode is enabled (not the default), then `^` and
1040         /// `$` will match immediately after and before, respectively, a line
1041         /// terminator.
1042         ///
1043         /// In both cases, if CRLF mode is enabled in a particular context,
1044         /// then it takes precedence over any configured line terminator.
1045         ///
1046         /// This option cannot be configured from within the pattern.
1047         ///
1048         /// The default line terminator is `\n`.
1049         ///
1050         /// # Example
1051         ///
1052         /// This shows how to treat the NUL byte as a line terminator. This can
1053         /// be a useful heuristic when searching binary data.
1054         ///
1055         /// ```
1056         /// use regex::RegexSetBuilder;
1057         ///
1058         /// let re = RegexSetBuilder::new([r"^foo$"])
1059         ///     .multi_line(true)
1060         ///     .line_terminator(b'\x00')
1061         ///     .build()
1062         ///     .unwrap();
1063         /// let hay = "\x00foo\x00";
1064         /// assert!(re.is_match(hay));
1065         /// ```
1066         ///
1067         /// This example shows that the behavior of `.` is impacted by this
1068         /// setting as well:
1069         ///
1070         /// ```
1071         /// use regex::RegexSetBuilder;
1072         ///
1073         /// let re = RegexSetBuilder::new([r"."])
1074         ///     .line_terminator(b'\x00')
1075         ///     .build()
1076         ///     .unwrap();
1077         /// assert!(re.is_match("\n"));
1078         /// assert!(!re.is_match("\x00"));
1079         /// ```
1080         ///
1081         /// This shows that building a regex will fail if the byte given
1082         /// is not ASCII and the pattern could result in matching invalid
1083         /// UTF-8. This is because any singular non-ASCII byte is not valid
1084         /// UTF-8, and it is not permitted for a [`RegexSet`] to match invalid
1085         /// UTF-8. (It is permissible to use a non-ASCII byte when building a
1086         /// [`bytes::RegexSet`](crate::bytes::RegexSet).)
1087         ///
1088         /// ```
1089         /// use regex::RegexSetBuilder;
1090         ///
1091         /// assert!(
1092         ///     RegexSetBuilder::new([r"."])
1093         ///         .line_terminator(0x80)
1094         ///         .build()
1095         ///         .is_err()
1096         /// );
1097         /// // Note that using a non-ASCII byte isn't enough on its own to
1098         /// // cause regex compilation to fail. You actually have to make use
1099         /// // of it in the regex in a way that leads to matching invalid
1100         /// // UTF-8. If you don't, then regex compilation will succeed!
1101         /// assert!(
1102         ///     RegexSetBuilder::new([r"a"])
1103         ///         .line_terminator(0x80)
1104         ///         .build()
1105         ///         .is_ok()
1106         /// );
1107         /// ```
line_terminator(&mut self, byte: u8) -> &mut RegexSetBuilder1108         pub fn line_terminator(&mut self, byte: u8) -> &mut RegexSetBuilder {
1109             self.builder.line_terminator(byte);
1110             self
1111         }
1112 
1113         /// This configures swap-greed mode for all of the patterns.
1114         ///
1115         /// When swap-greed mode is enabled, patterns like `a+` will become
1116         /// non-greedy and patterns like `a+?` will become greedy. In other
1117         /// words, the meanings of `a+` and `a+?` are switched.
1118         ///
1119         /// This setting can also be configured using the inline flag `U` in
1120         /// the pattern.
1121         ///
1122         /// Note that this is generally not useful for a `RegexSet` since a
1123         /// `RegexSet` can only report whether a pattern matches or not. Since
1124         /// greediness never impacts whether a match is found or not (only the
1125         /// offsets of the match), it follows that whether parts of a pattern
1126         /// are greedy or not doesn't matter for a `RegexSet`.
1127         ///
1128         /// The default for this is `false`.
swap_greed(&mut self, yes: bool) -> &mut RegexSetBuilder1129         pub fn swap_greed(&mut self, yes: bool) -> &mut RegexSetBuilder {
1130             self.builder.swap_greed(yes);
1131             self
1132         }
1133 
1134         /// This configures verbose mode for all of the patterns.
1135         ///
        /// When enabled, whitespace will be treated as insignificant in the
1137         /// pattern and `#` can be used to start a comment until the next new
1138         /// line.
1139         ///
1140         /// Normally, in most places in a pattern, whitespace is treated
1141         /// literally. For example ` +` will match one or more ASCII whitespace
1142         /// characters.
1143         ///
1144         /// When verbose mode is enabled, `\#` can be used to match a literal
1145         /// `#` and `\ ` can be used to match a literal ASCII whitespace
1146         /// character.
1147         ///
1148         /// Verbose mode is useful for permitting regexes to be formatted and
1149         /// broken up more nicely. This may make them more easily readable.
1150         ///
1151         /// This setting can also be configured using the inline flag `x` in
1152         /// the pattern.
1153         ///
1154         /// The default for this is `false`.
1155         ///
1156         /// # Example
1157         ///
1158         /// ```
1159         /// use regex::RegexSetBuilder;
1160         ///
1161         /// let pat = r"
1162         ///     \b
1163         ///     (?<first>\p{Uppercase}\w*)  # always start with uppercase letter
1164         ///     [\s--\n]+                   # whitespace should separate names
1165         ///     (?: # middle name can be an initial!
1166         ///         (?:(?<initial>\p{Uppercase})\.|(?<middle>\p{Uppercase}\w*))
1167         ///         [\s--\n]+
1168         ///     )?
1169         ///     (?<last>\p{Uppercase}\w*)
1170         ///     \b
1171         /// ";
1172         /// let re = RegexSetBuilder::new([pat])
1173         ///     .ignore_whitespace(true)
1174         ///     .build()
1175         ///     .unwrap();
1176         /// assert!(re.is_match("Harry Potter"));
1177         /// assert!(re.is_match("Harry J. Potter"));
1178         /// assert!(re.is_match("Harry James Potter"));
1179         /// assert!(!re.is_match("harry J. Potter"));
1180         /// ```
ignore_whitespace( &mut self, yes: bool, ) -> &mut RegexSetBuilder1181         pub fn ignore_whitespace(
1182             &mut self,
1183             yes: bool,
1184         ) -> &mut RegexSetBuilder {
1185             self.builder.ignore_whitespace(yes);
1186             self
1187         }
1188 
1189         /// This configures octal mode for all of the patterns.
1190         ///
1191         /// Octal syntax is a little-known way of uttering Unicode codepoints
1192         /// in a pattern. For example, `a`, `\x61`, `\u0061` and `\141` are all
1193         /// equivalent patterns, where the last example shows octal syntax.
1194         ///
1195         /// While supporting octal syntax isn't in and of itself a problem,
1196         /// it does make good error messages harder. That is, in PCRE based
1197         /// regex engines, syntax like `\1` invokes a backreference, which is
        /// explicitly unsupported by this library. However, many users expect
1199         /// backreferences to be supported. Therefore, when octal support
1200         /// is disabled, the error message will explicitly mention that
1201         /// backreferences aren't supported.
1202         ///
1203         /// The default for this is `false`.
1204         ///
1205         /// # Example
1206         ///
1207         /// ```
1208         /// use regex::RegexSetBuilder;
1209         ///
1210         /// // Normally this pattern would not compile, with an error message
1211         /// // about backreferences not being supported. But with octal mode
1212         /// // enabled, octal escape sequences work.
1213         /// let re = RegexSetBuilder::new([r"\141"])
1214         ///     .octal(true)
1215         ///     .build()
1216         ///     .unwrap();
1217         /// assert!(re.is_match("a"));
1218         /// ```
octal(&mut self, yes: bool) -> &mut RegexSetBuilder1219         pub fn octal(&mut self, yes: bool) -> &mut RegexSetBuilder {
1220             self.builder.octal(yes);
1221             self
1222         }
1223 
1224         /// Sets the approximate size limit, in bytes, of the compiled regex.
1225         ///
        /// This roughly corresponds to the amount of heap memory, in
1227         /// bytes, occupied by a single regex. If the regex would otherwise
1228         /// approximately exceed this limit, then compiling that regex will
1229         /// fail.
1230         ///
1231         /// The main utility of a method like this is to avoid compiling
1232         /// regexes that use an unexpected amount of resources, such as
1233         /// time and memory. Even if the memory usage of a large regex is
1234         /// acceptable, its search time may not be. Namely, worst case time
1235         /// complexity for search is `O(m * n)`, where `m ~ len(pattern)` and
1236         /// `n ~ len(haystack)`. That is, search time depends, in part, on the
1237         /// size of the compiled regex. This means that putting a limit on the
1238         /// size of the regex limits how much a regex can impact search time.
1239         ///
1240         /// For more information about regex size limits, see the section on
1241         /// [untrusted inputs](crate#untrusted-input) in the top-level crate
1242         /// documentation.
1243         ///
1244         /// The default for this is some reasonable number that permits most
1245         /// patterns to compile successfully.
1246         ///
1247         /// # Example
1248         ///
1249         /// ```
1250         /// # if !cfg!(target_pointer_width = "64") { return; } // see #1041
1251         /// use regex::RegexSetBuilder;
1252         ///
1253         /// // It may surprise you how big some seemingly small patterns can
1254         /// // be! Since \w is Unicode aware, this generates a regex that can
1255         /// // match approximately 140,000 distinct codepoints.
1256         /// assert!(
1257         ///     RegexSetBuilder::new([r"\w"])
1258         ///         .size_limit(45_000)
1259         ///         .build()
1260         ///         .is_err()
1261         /// );
1262         /// ```
size_limit(&mut self, bytes: usize) -> &mut RegexSetBuilder1263         pub fn size_limit(&mut self, bytes: usize) -> &mut RegexSetBuilder {
1264             self.builder.size_limit(bytes);
1265             self
1266         }
1267 
1268         /// Set the approximate capacity, in bytes, of the cache of transitions
1269         /// used by the lazy DFA.
1270         ///
        /// While the lazy DFA isn't always used, it tends to be the most
        /// commonly used regex engine in default configurations. It tends to
        /// adopt the performance profile of a fully built DFA, but without the
1274         /// downside of taking worst case exponential time to build.
1275         ///
1276         /// The downside is that it needs to keep a cache of transitions and
1277         /// states that are built while running a search, and this cache
1278         /// can fill up. When it fills up, the cache will reset itself. Any
1279         /// previously generated states and transitions will then need to be
1280         /// re-generated. If this happens too many times, then this library
1281         /// will bail out of using the lazy DFA and switch to a different regex
1282         /// engine.
1283         ///
1284         /// If your regex provokes this particular downside of the lazy DFA,
1285         /// then it may be beneficial to increase its cache capacity. This will
1286         /// potentially reduce the frequency of cache resetting (ideally to
1287         /// `0`). While it won't fix all potential performance problems with
1288         /// the lazy DFA, increasing the cache capacity does fix some.
1289         ///
1290         /// There is no easy way to determine, a priori, whether increasing
1291         /// this cache capacity will help. In general, the larger your regex,
1292         /// the more cache it's likely to use. But that isn't an ironclad rule.
1293         /// For example, a regex like `[01]*1[01]{N}` would normally produce a
        /// fully built DFA that is exponential in size with respect to `N`.
        /// The lazy DFA will prevent exponential space blow-up, but its cache
1296         /// is likely to fill up, even when it's large and even for smallish
1297         /// values of `N`.
1298         ///
1299         /// If you aren't sure whether this helps or not, it is sensible to
1300         /// set this to some arbitrarily large number in testing, such as
1301         /// `usize::MAX`. Namely, this represents the amount of capacity that
1302         /// *may* be used. It's probably not a good idea to use `usize::MAX` in
1303         /// production though, since it implies there are no controls on heap
1304         /// memory used by this library during a search. In effect, set it to
1305         /// whatever you're willing to allocate for a single regex search.
dfa_size_limit( &mut self, bytes: usize, ) -> &mut RegexSetBuilder1306         pub fn dfa_size_limit(
1307             &mut self,
1308             bytes: usize,
1309         ) -> &mut RegexSetBuilder {
1310             self.builder.dfa_size_limit(bytes);
1311             self
1312         }
1313 
1314         /// Set the nesting limit for this parser.
1315         ///
1316         /// The nesting limit controls how deep the abstract syntax tree is
1317         /// allowed to be. If the AST exceeds the given limit (e.g., with too
1318         /// many nested groups), then an error is returned by the parser.
1319         ///
1320         /// The purpose of this limit is to act as a heuristic to prevent stack
1321         /// overflow for consumers that do structural induction on an AST using
1322         /// explicit recursion. While this crate never does this (instead using
1323         /// constant stack space and moving the call stack to the heap), other
1324         /// crates may.
1325         ///
1326         /// This limit is not checked until the entire AST is parsed.
1327         /// Therefore, if callers want to put a limit on the amount of heap
1328         /// space used, then they should impose a limit on the length, in
1329         /// bytes, of the concrete pattern string. In particular, this is
1330         /// viable since this parser implementation will limit itself to heap
1331         /// space proportional to the length of the pattern string. See also
1332         /// the [untrusted inputs](crate#untrusted-input) section in the
1333         /// top-level crate documentation for more information about this.
1334         ///
1335         /// Note that a nest limit of `0` will return a nest limit error for
1336         /// most patterns but not all. For example, a nest limit of `0` permits
1337         /// `a` but not `ab`, since `ab` requires an explicit concatenation,
1338         /// which results in a nest depth of `1`. In general, a nest limit is
1339         /// not something that manifests in an obvious way in the concrete
1340         /// syntax, therefore, it should not be used in a granular way.
1341         ///
1342         /// # Example
1343         ///
1344         /// ```
1345         /// use regex::RegexSetBuilder;
1346         ///
1347         /// assert!(RegexSetBuilder::new([r"a"]).nest_limit(0).build().is_ok());
1348         /// assert!(RegexSetBuilder::new([r"ab"]).nest_limit(0).build().is_err());
1349         /// ```
nest_limit(&mut self, limit: u32) -> &mut RegexSetBuilder1350         pub fn nest_limit(&mut self, limit: u32) -> &mut RegexSetBuilder {
1351             self.builder.nest_limit(limit);
1352             self
1353         }
1354     }
1355 }
1356 
1357 pub(crate) mod bytes {
1358     use crate::{
1359         bytes::{Regex, RegexSet},
1360         error::Error,
1361     };
1362 
1363     use super::Builder;
1364 
1365     /// A configurable builder for a [`Regex`].
1366     ///
1367     /// This builder can be used to programmatically set flags such as `i`
1368     /// (case insensitive) and `x` (for verbose mode). This builder can also be
1369     /// used to configure things like the line terminator and a size limit on
1370     /// the compiled regular expression.
1371     #[derive(Clone, Debug)]
1372     pub struct RegexBuilder {
1373         builder: Builder,
1374     }
1375 
1376     impl RegexBuilder {
1377         /// Create a new builder with a default configuration for the given
1378         /// pattern.
1379         ///
1380         /// If the pattern is invalid or exceeds the configured size limits,
1381         /// then an error will be returned when [`RegexBuilder::build`] is
1382         /// called.
new(pattern: &str) -> RegexBuilder1383         pub fn new(pattern: &str) -> RegexBuilder {
1384             RegexBuilder { builder: Builder::new([pattern]) }
1385         }
1386 
1387         /// Compiles the pattern given to `RegexBuilder::new` with the
1388         /// configuration set on this builder.
1389         ///
1390         /// If the pattern isn't a valid regex or if a configured size limit
1391         /// was exceeded, then an error is returned.
build(&self) -> Result<Regex, Error>1392         pub fn build(&self) -> Result<Regex, Error> {
1393             self.builder.build_one_bytes()
1394         }
1395 
1396         /// This configures Unicode mode for the entire pattern.
1397         ///
1398         /// Enabling Unicode mode does a number of things:
1399         ///
1400         /// * Most fundamentally, it causes the fundamental atom of matching
1401         /// to be a single codepoint. When Unicode mode is disabled, it's a
1402         /// single byte. For example, when Unicode mode is enabled, `.` will
1403         /// match `��` once, where as it will match 4 times when Unicode mode
1404         /// is disabled. (Since the UTF-8 encoding of `��` is 4 bytes long.)
1405         /// * Case insensitive matching uses Unicode simple case folding rules.
1406         /// * Unicode character classes like `\p{Letter}` and `\p{Greek}` are
1407         /// available.
1408         /// * Perl character classes are Unicode aware. That is, `\w`, `\s` and
1409         /// `\d`.
1410         /// * The word boundary assertions, `\b` and `\B`, use the Unicode
1411         /// definition of a word character.
1412         ///
1413         /// Note that unlike the top-level `Regex` for searching `&str`, it
1414         /// is permitted to disable Unicode mode even if the resulting pattern
1415         /// could match invalid UTF-8. For example, `(?-u:.)` is not a valid
1416         /// pattern for a top-level `Regex`, but is valid for a `bytes::Regex`.
1417         ///
1418         /// For more details on the Unicode support in this crate, see the
1419         /// [Unicode section](crate#unicode) in this crate's top-level
1420         /// documentation.
1421         ///
1422         /// The default for this is `true`.
1423         ///
1424         /// # Example
1425         ///
1426         /// ```
1427         /// use regex::bytes::RegexBuilder;
1428         ///
1429         /// let re = RegexBuilder::new(r"\w")
1430         ///     .unicode(false)
1431         ///     .build()
1432         ///     .unwrap();
1433         /// // Normally greek letters would be included in \w, but since
1434         /// // Unicode mode is disabled, it only matches ASCII letters.
1435         /// assert!(!re.is_match("δ".as_bytes()));
1436         ///
1437         /// let re = RegexBuilder::new(r"s")
1438         ///     .case_insensitive(true)
1439         ///     .unicode(false)
1440         ///     .build()
1441         ///     .unwrap();
1442         /// // Normally 'ſ' is included when searching for 's' case
1443         /// // insensitively due to Unicode's simple case folding rules. But
1444         /// // when Unicode mode is disabled, only ASCII case insensitive rules
1445         /// // are used.
1446         /// assert!(!re.is_match("ſ".as_bytes()));
1447         /// ```
1448         ///
1449         /// Since this builder is for constructing a [`bytes::Regex`](Regex),
1450         /// one can disable Unicode mode even if it would match invalid UTF-8:
1451         ///
1452         /// ```
1453         /// use regex::bytes::RegexBuilder;
1454         ///
1455         /// let re = RegexBuilder::new(r".")
1456         ///     .unicode(false)
1457         ///     .build()
1458         ///     .unwrap();
1459         /// // Normally greek letters would be included in \w, but since
1460         /// // Unicode mode is disabled, it only matches ASCII letters.
1461         /// assert!(re.is_match(b"\xFF"));
1462         /// ```
unicode(&mut self, yes: bool) -> &mut RegexBuilder1463         pub fn unicode(&mut self, yes: bool) -> &mut RegexBuilder {
1464             self.builder.unicode(yes);
1465             self
1466         }
1467 
1468         /// This configures whether to enable case insensitive matching for the
1469         /// entire pattern.
1470         ///
1471         /// This setting can also be configured using the inline flag `i`
1472         /// in the pattern. For example, `(?i:foo)` matches `foo` case
1473         /// insensitively while `(?-i:foo)` matches `foo` case sensitively.
1474         ///
1475         /// The default for this is `false`.
1476         ///
1477         /// # Example
1478         ///
1479         /// ```
1480         /// use regex::bytes::RegexBuilder;
1481         ///
1482         /// let re = RegexBuilder::new(r"foo(?-i:bar)quux")
1483         ///     .case_insensitive(true)
1484         ///     .build()
1485         ///     .unwrap();
1486         /// assert!(re.is_match(b"FoObarQuUx"));
1487         /// // Even though case insensitive matching is enabled in the builder,
1488         /// // it can be locally disabled within the pattern. In this case,
1489         /// // `bar` is matched case sensitively.
1490         /// assert!(!re.is_match(b"fooBARquux"));
1491         /// ```
case_insensitive(&mut self, yes: bool) -> &mut RegexBuilder1492         pub fn case_insensitive(&mut self, yes: bool) -> &mut RegexBuilder {
1493             self.builder.case_insensitive(yes);
1494             self
1495         }
1496 
1497         /// This configures multi-line mode for the entire pattern.
1498         ///
1499         /// Enabling multi-line mode changes the behavior of the `^` and `$`
1500         /// anchor assertions. Instead of only matching at the beginning and
1501         /// end of a haystack, respectively, multi-line mode causes them to
1502         /// match at the beginning and end of a line *in addition* to the
1503         /// beginning and end of a haystack. More precisely, `^` will match at
1504         /// the position immediately following a `\n` and `$` will match at the
1505         /// position immediately preceding a `\n`.
1506         ///
1507         /// The behavior of this option can be impacted by other settings too:
1508         ///
1509         /// * The [`RegexBuilder::line_terminator`] option changes `\n` above
1510         /// to any ASCII byte.
1511         /// * The [`RegexBuilder::crlf`] option changes the line terminator to
1512         /// be either `\r` or `\n`, but never at the position between a `\r`
1513         /// and `\n`.
1514         ///
1515         /// This setting can also be configured using the inline flag `m` in
1516         /// the pattern.
1517         ///
1518         /// The default for this is `false`.
1519         ///
1520         /// # Example
1521         ///
1522         /// ```
1523         /// use regex::bytes::RegexBuilder;
1524         ///
1525         /// let re = RegexBuilder::new(r"^foo$")
1526         ///     .multi_line(true)
1527         ///     .build()
1528         ///     .unwrap();
1529         /// assert_eq!(Some(1..4), re.find(b"\nfoo\n").map(|m| m.range()));
1530         /// ```
multi_line(&mut self, yes: bool) -> &mut RegexBuilder1531         pub fn multi_line(&mut self, yes: bool) -> &mut RegexBuilder {
1532             self.builder.multi_line(yes);
1533             self
1534         }
1535 
1536         /// This configures dot-matches-new-line mode for the entire pattern.
1537         ///
1538         /// Perhaps surprisingly, the default behavior for `.` is not to match
1539         /// any character, but rather, to match any character except for the
1540         /// line terminator (which is `\n` by default). When this mode is
1541         /// enabled, the behavior changes such that `.` truly matches any
1542         /// character.
1543         ///
1544         /// This setting can also be configured using the inline flag `s` in
1545         /// the pattern. For example, `(?s:.)` and `\p{any}` are equivalent
1546         /// regexes.
1547         ///
1548         /// The default for this is `false`.
1549         ///
1550         /// # Example
1551         ///
1552         /// ```
1553         /// use regex::bytes::RegexBuilder;
1554         ///
1555         /// let re = RegexBuilder::new(r"foo.bar")
1556         ///     .dot_matches_new_line(true)
1557         ///     .build()
1558         ///     .unwrap();
1559         /// let hay = b"foo\nbar";
1560         /// assert_eq!(Some(&b"foo\nbar"[..]), re.find(hay).map(|m| m.as_bytes()));
1561         /// ```
dot_matches_new_line( &mut self, yes: bool, ) -> &mut RegexBuilder1562         pub fn dot_matches_new_line(
1563             &mut self,
1564             yes: bool,
1565         ) -> &mut RegexBuilder {
1566             self.builder.dot_matches_new_line(yes);
1567             self
1568         }
1569 
1570         /// This configures CRLF mode for the entire pattern.
1571         ///
1572         /// When CRLF mode is enabled, both `\r` ("carriage return" or CR for
1573         /// short) and `\n` ("line feed" or LF for short) are treated as line
1574         /// terminators. This results in the following:
1575         ///
1576         /// * Unless dot-matches-new-line mode is enabled, `.` will now match
1577         /// any character except for `\n` and `\r`.
1578         /// * When multi-line mode is enabled, `^` will match immediately
1579         /// following a `\n` or a `\r`. Similarly, `$` will match immediately
1580         /// preceding a `\n` or a `\r`. Neither `^` nor `$` will ever match
1581         /// between `\r` and `\n`.
1582         ///
1583         /// This setting can also be configured using the inline flag `R` in
1584         /// the pattern.
1585         ///
1586         /// The default for this is `false`.
1587         ///
1588         /// # Example
1589         ///
1590         /// ```
1591         /// use regex::bytes::RegexBuilder;
1592         ///
1593         /// let re = RegexBuilder::new(r"^foo$")
1594         ///     .multi_line(true)
1595         ///     .crlf(true)
1596         ///     .build()
1597         ///     .unwrap();
1598         /// let hay = b"\r\nfoo\r\n";
1599         /// // If CRLF mode weren't enabled here, then '$' wouldn't match
1600         /// // immediately after 'foo', and thus no match would be found.
1601         /// assert_eq!(Some(&b"foo"[..]), re.find(hay).map(|m| m.as_bytes()));
1602         /// ```
1603         ///
1604         /// This example demonstrates that `^` will never match at a position
1605         /// between `\r` and `\n`. (`$` will similarly not match between a `\r`
1606         /// and a `\n`.)
1607         ///
1608         /// ```
1609         /// use regex::bytes::RegexBuilder;
1610         ///
1611         /// let re = RegexBuilder::new(r"^")
1612         ///     .multi_line(true)
1613         ///     .crlf(true)
1614         ///     .build()
1615         ///     .unwrap();
1616         /// let hay = b"\r\n\r\n";
1617         /// let ranges: Vec<_> = re.find_iter(hay).map(|m| m.range()).collect();
1618         /// assert_eq!(ranges, vec![0..0, 2..2, 4..4]);
1619         /// ```
crlf(&mut self, yes: bool) -> &mut RegexBuilder1620         pub fn crlf(&mut self, yes: bool) -> &mut RegexBuilder {
1621             self.builder.crlf(yes);
1622             self
1623         }
1624 
1625         /// Configures the line terminator to be used by the regex.
1626         ///
1627         /// The line terminator is relevant in two ways for a particular regex:
1628         ///
1629         /// * When dot-matches-new-line mode is *not* enabled (the default),
1630         /// then `.` will match any character except for the configured line
1631         /// terminator.
1632         /// * When multi-line mode is enabled (not the default), then `^` and
1633         /// `$` will match immediately after and before, respectively, a line
1634         /// terminator.
1635         ///
1636         /// In both cases, if CRLF mode is enabled in a particular context,
1637         /// then it takes precedence over any configured line terminator.
1638         ///
1639         /// This option cannot be configured from within the pattern.
1640         ///
1641         /// The default line terminator is `\n`.
1642         ///
1643         /// # Example
1644         ///
1645         /// This shows how to treat the NUL byte as a line terminator. This can
1646         /// be a useful heuristic when searching binary data.
1647         ///
1648         /// ```
1649         /// use regex::bytes::RegexBuilder;
1650         ///
1651         /// let re = RegexBuilder::new(r"^foo$")
1652         ///     .multi_line(true)
1653         ///     .line_terminator(b'\x00')
1654         ///     .build()
1655         ///     .unwrap();
1656         /// let hay = b"\x00foo\x00";
1657         /// assert_eq!(Some(1..4), re.find(hay).map(|m| m.range()));
1658         /// ```
1659         ///
1660         /// This example shows that the behavior of `.` is impacted by this
1661         /// setting as well:
1662         ///
1663         /// ```
1664         /// use regex::bytes::RegexBuilder;
1665         ///
1666         /// let re = RegexBuilder::new(r".")
1667         ///     .line_terminator(b'\x00')
1668         ///     .build()
1669         ///     .unwrap();
1670         /// assert!(re.is_match(b"\n"));
1671         /// assert!(!re.is_match(b"\x00"));
1672         /// ```
1673         ///
1674         /// This shows that building a regex will work even when the byte
1675         /// given is not ASCII. This is unlike the top-level `Regex` API where
1676         /// matching invalid UTF-8 is not allowed.
1677         ///
1678         /// Note though that you must disable Unicode mode. This is required
1679         /// because Unicode mode requires matching one codepoint at a time,
1680         /// and there is no way to match a non-ASCII byte as if it were a
1681         /// codepoint.
1682         ///
1683         /// ```
1684         /// use regex::bytes::RegexBuilder;
1685         ///
1686         /// assert!(
1687         ///     RegexBuilder::new(r".")
1688         ///         .unicode(false)
1689         ///         .line_terminator(0x80)
1690         ///         .build()
1691         ///         .is_ok(),
1692         /// );
1693         /// ```
line_terminator(&mut self, byte: u8) -> &mut RegexBuilder1694         pub fn line_terminator(&mut self, byte: u8) -> &mut RegexBuilder {
1695             self.builder.line_terminator(byte);
1696             self
1697         }
1698 
1699         /// This configures swap-greed mode for the entire pattern.
1700         ///
1701         /// When swap-greed mode is enabled, patterns like `a+` will become
1702         /// non-greedy and patterns like `a+?` will become greedy. In other
1703         /// words, the meanings of `a+` and `a+?` are switched.
1704         ///
1705         /// This setting can also be configured using the inline flag `U` in
1706         /// the pattern.
1707         ///
1708         /// The default for this is `false`.
1709         ///
1710         /// # Example
1711         ///
1712         /// ```
1713         /// use regex::bytes::RegexBuilder;
1714         ///
1715         /// let re = RegexBuilder::new(r"a+")
1716         ///     .swap_greed(true)
1717         ///     .build()
1718         ///     .unwrap();
1719         /// assert_eq!(Some(&b"a"[..]), re.find(b"aaa").map(|m| m.as_bytes()));
1720         /// ```
swap_greed(&mut self, yes: bool) -> &mut RegexBuilder1721         pub fn swap_greed(&mut self, yes: bool) -> &mut RegexBuilder {
1722             self.builder.swap_greed(yes);
1723             self
1724         }
1725 
1726         /// This configures verbose mode for the entire pattern.
1727         ///
1728         /// When enabled, whitespace will treated as insignifcant in the
1729         /// pattern and `#` can be used to start a comment until the next new
1730         /// line.
1731         ///
1732         /// Normally, in most places in a pattern, whitespace is treated
1733         /// literally. For example ` +` will match one or more ASCII whitespace
1734         /// characters.
1735         ///
1736         /// When verbose mode is enabled, `\#` can be used to match a literal
1737         /// `#` and `\ ` can be used to match a literal ASCII whitespace
1738         /// character.
1739         ///
1740         /// Verbose mode is useful for permitting regexes to be formatted and
1741         /// broken up more nicely. This may make them more easily readable.
1742         ///
1743         /// This setting can also be configured using the inline flag `x` in
1744         /// the pattern.
1745         ///
1746         /// The default for this is `false`.
1747         ///
1748         /// # Example
1749         ///
1750         /// ```
1751         /// use regex::bytes::RegexBuilder;
1752         ///
1753         /// let pat = r"
1754         ///     \b
1755         ///     (?<first>\p{Uppercase}\w*)  # always start with uppercase letter
1756         ///     [\s--\n]+                   # whitespace should separate names
1757         ///     (?: # middle name can be an initial!
1758         ///         (?:(?<initial>\p{Uppercase})\.|(?<middle>\p{Uppercase}\w*))
1759         ///         [\s--\n]+
1760         ///     )?
1761         ///     (?<last>\p{Uppercase}\w*)
1762         ///     \b
1763         /// ";
1764         /// let re = RegexBuilder::new(pat)
1765         ///     .ignore_whitespace(true)
1766         ///     .build()
1767         ///     .unwrap();
1768         ///
1769         /// let caps = re.captures(b"Harry Potter").unwrap();
1770         /// assert_eq!(&b"Harry"[..], &caps["first"]);
1771         /// assert_eq!(&b"Potter"[..], &caps["last"]);
1772         ///
1773         /// let caps = re.captures(b"Harry J. Potter").unwrap();
1774         /// assert_eq!(&b"Harry"[..], &caps["first"]);
1775         /// // Since a middle name/initial isn't required for an overall match,
1776         /// // we can't assume that 'initial' or 'middle' will be populated!
1777         /// assert_eq!(
1778         ///     Some(&b"J"[..]),
1779         ///     caps.name("initial").map(|m| m.as_bytes()),
1780         /// );
1781         /// assert_eq!(None, caps.name("middle").map(|m| m.as_bytes()));
1782         /// assert_eq!(&b"Potter"[..], &caps["last"]);
1783         ///
1784         /// let caps = re.captures(b"Harry James Potter").unwrap();
1785         /// assert_eq!(&b"Harry"[..], &caps["first"]);
1786         /// // Since a middle name/initial isn't required for an overall match,
1787         /// // we can't assume that 'initial' or 'middle' will be populated!
1788         /// assert_eq!(None, caps.name("initial").map(|m| m.as_bytes()));
1789         /// assert_eq!(
1790         ///     Some(&b"James"[..]),
1791         ///     caps.name("middle").map(|m| m.as_bytes()),
1792         /// );
1793         /// assert_eq!(&b"Potter"[..], &caps["last"]);
1794         /// ```
ignore_whitespace(&mut self, yes: bool) -> &mut RegexBuilder1795         pub fn ignore_whitespace(&mut self, yes: bool) -> &mut RegexBuilder {
1796             self.builder.ignore_whitespace(yes);
1797             self
1798         }
1799 
1800         /// This configures octal mode for the entire pattern.
1801         ///
1802         /// Octal syntax is a little-known way of uttering Unicode codepoints
1803         /// in a pattern. For example, `a`, `\x61`, `\u0061` and `\141` are all
1804         /// equivalent patterns, where the last example shows octal syntax.
1805         ///
1806         /// While supporting octal syntax isn't in and of itself a problem,
1807         /// it does make good error messages harder. That is, in PCRE based
1808         /// regex engines, syntax like `\1` invokes a backreference, which is
1809         /// explicitly unsupported this library. However, many users expect
1810         /// backreferences to be supported. Therefore, when octal support
1811         /// is disabled, the error message will explicitly mention that
1812         /// backreferences aren't supported.
1813         ///
1814         /// The default for this is `false`.
1815         ///
1816         /// # Example
1817         ///
1818         /// ```
1819         /// use regex::bytes::RegexBuilder;
1820         ///
1821         /// // Normally this pattern would not compile, with an error message
1822         /// // about backreferences not being supported. But with octal mode
1823         /// // enabled, octal escape sequences work.
1824         /// let re = RegexBuilder::new(r"\141")
1825         ///     .octal(true)
1826         ///     .build()
1827         ///     .unwrap();
1828         /// assert!(re.is_match(b"a"));
1829         /// ```
octal(&mut self, yes: bool) -> &mut RegexBuilder1830         pub fn octal(&mut self, yes: bool) -> &mut RegexBuilder {
1831             self.builder.octal(yes);
1832             self
1833         }
1834 
1835         /// Sets the approximate size limit, in bytes, of the compiled regex.
1836         ///
1837         /// This roughly corresponds to the number of heap memory, in
1838         /// bytes, occupied by a single regex. If the regex would otherwise
1839         /// approximately exceed this limit, then compiling that regex will
1840         /// fail.
1841         ///
1842         /// The main utility of a method like this is to avoid compiling
1843         /// regexes that use an unexpected amount of resources, such as
1844         /// time and memory. Even if the memory usage of a large regex is
1845         /// acceptable, its search time may not be. Namely, worst case time
1846         /// complexity for search is `O(m * n)`, where `m ~ len(pattern)` and
1847         /// `n ~ len(haystack)`. That is, search time depends, in part, on the
1848         /// size of the compiled regex. This means that putting a limit on the
1849         /// size of the regex limits how much a regex can impact search time.
1850         ///
1851         /// For more information about regex size limits, see the section on
1852         /// [untrusted inputs](crate#untrusted-input) in the top-level crate
1853         /// documentation.
1854         ///
1855         /// The default for this is some reasonable number that permits most
1856         /// patterns to compile successfully.
1857         ///
1858         /// # Example
1859         ///
1860         /// ```
1861         /// # if !cfg!(target_pointer_width = "64") { return; } // see #1041
1862         /// use regex::bytes::RegexBuilder;
1863         ///
1864         /// // It may surprise you how big some seemingly small patterns can
1865         /// // be! Since \w is Unicode aware, this generates a regex that can
1866         /// // match approximately 140,000 distinct codepoints.
1867         /// assert!(RegexBuilder::new(r"\w").size_limit(45_000).build().is_err());
1868         /// ```
size_limit(&mut self, bytes: usize) -> &mut RegexBuilder1869         pub fn size_limit(&mut self, bytes: usize) -> &mut RegexBuilder {
1870             self.builder.size_limit(bytes);
1871             self
1872         }
1873 
1874         /// Set the approximate capacity, in bytes, of the cache of transitions
1875         /// used by the lazy DFA.
1876         ///
1877         /// While the lazy DFA isn't always used, in tends to be the most
1878         /// commonly use regex engine in default configurations. It tends to
1879         /// adopt the performance profile of a fully build DFA, but without the
1880         /// downside of taking worst case exponential time to build.
1881         ///
1882         /// The downside is that it needs to keep a cache of transitions and
1883         /// states that are built while running a search, and this cache
1884         /// can fill up. When it fills up, the cache will reset itself. Any
1885         /// previously generated states and transitions will then need to be
1886         /// re-generated. If this happens too many times, then this library
1887         /// will bail out of using the lazy DFA and switch to a different regex
1888         /// engine.
1889         ///
1890         /// If your regex provokes this particular downside of the lazy DFA,
1891         /// then it may be beneficial to increase its cache capacity. This will
1892         /// potentially reduce the frequency of cache resetting (ideally to
1893         /// `0`). While it won't fix all potential performance problems with
1894         /// the lazy DFA, increasing the cache capacity does fix some.
1895         ///
1896         /// There is no easy way to determine, a priori, whether increasing
1897         /// this cache capacity will help. In general, the larger your regex,
1898         /// the more cache it's likely to use. But that isn't an ironclad rule.
1899         /// For example, a regex like `[01]*1[01]{N}` would normally produce a
1900         /// fully build DFA that is exponential in size with respect to `N`.
1901         /// The lazy DFA will prevent exponential space blow-up, but it cache
1902         /// is likely to fill up, even when it's large and even for smallish
1903         /// values of `N`.
1904         ///
1905         /// If you aren't sure whether this helps or not, it is sensible to
1906         /// set this to some arbitrarily large number in testing, such as
1907         /// `usize::MAX`. Namely, this represents the amount of capacity that
1908         /// *may* be used. It's probably not a good idea to use `usize::MAX` in
1909         /// production though, since it implies there are no controls on heap
1910         /// memory used by this library during a search. In effect, set it to
1911         /// whatever you're willing to allocate for a single regex search.
dfa_size_limit(&mut self, bytes: usize) -> &mut RegexBuilder1912         pub fn dfa_size_limit(&mut self, bytes: usize) -> &mut RegexBuilder {
1913             self.builder.dfa_size_limit(bytes);
1914             self
1915         }
1916 
1917         /// Set the nesting limit for this parser.
1918         ///
1919         /// The nesting limit controls how deep the abstract syntax tree is
1920         /// allowed to be. If the AST exceeds the given limit (e.g., with too
1921         /// many nested groups), then an error is returned by the parser.
1922         ///
1923         /// The purpose of this limit is to act as a heuristic to prevent stack
1924         /// overflow for consumers that do structural induction on an AST using
1925         /// explicit recursion. While this crate never does this (instead using
1926         /// constant stack space and moving the call stack to the heap), other
1927         /// crates may.
1928         ///
1929         /// This limit is not checked until the entire AST is parsed.
1930         /// Therefore, if callers want to put a limit on the amount of heap
1931         /// space used, then they should impose a limit on the length, in
1932         /// bytes, of the concrete pattern string. In particular, this is
1933         /// viable since this parser implementation will limit itself to heap
1934         /// space proportional to the length of the pattern string. See also
1935         /// the [untrusted inputs](crate#untrusted-input) section in the
1936         /// top-level crate documentation for more information about this.
1937         ///
1938         /// Note that a nest limit of `0` will return a nest limit error for
1939         /// most patterns but not all. For example, a nest limit of `0` permits
1940         /// `a` but not `ab`, since `ab` requires an explicit concatenation,
1941         /// which results in a nest depth of `1`. In general, a nest limit is
1942         /// not something that manifests in an obvious way in the concrete
1943         /// syntax, therefore, it should not be used in a granular way.
1944         ///
1945         /// # Example
1946         ///
1947         /// ```
1948         /// use regex::bytes::RegexBuilder;
1949         ///
1950         /// assert!(RegexBuilder::new(r"a").nest_limit(0).build().is_ok());
1951         /// assert!(RegexBuilder::new(r"ab").nest_limit(0).build().is_err());
1952         /// ```
nest_limit(&mut self, limit: u32) -> &mut RegexBuilder1953         pub fn nest_limit(&mut self, limit: u32) -> &mut RegexBuilder {
1954             self.builder.nest_limit(limit);
1955             self
1956         }
1957     }
1958 
1959     /// A configurable builder for a [`RegexSet`].
1960     ///
1961     /// This builder can be used to programmatically set flags such as `i`
1962     /// (case insensitive) and `x` (for verbose mode). This builder can also be
1963     /// used to configure things like the line terminator and a size limit on
1964     /// the compiled regular expression.
1965     #[derive(Clone, Debug)]
1966     pub struct RegexSetBuilder {
1967         builder: Builder,
1968     }
1969 
1970     impl RegexSetBuilder {
1971         /// Create a new builder with a default configuration for the given
1972         /// patterns.
1973         ///
1974         /// If the patterns are invalid or exceed the configured size limits,
1975         /// then an error will be returned when [`RegexSetBuilder::build`] is
1976         /// called.
new<I, S>(patterns: I) -> RegexSetBuilder where I: IntoIterator<Item = S>, S: AsRef<str>,1977         pub fn new<I, S>(patterns: I) -> RegexSetBuilder
1978         where
1979             I: IntoIterator<Item = S>,
1980             S: AsRef<str>,
1981         {
1982             RegexSetBuilder { builder: Builder::new(patterns) }
1983         }
1984 
1985         /// Compiles the patterns given to `RegexSetBuilder::new` with the
1986         /// configuration set on this builder.
1987         ///
1988         /// If the patterns aren't valid regexes or if a configured size limit
1989         /// was exceeded, then an error is returned.
build(&self) -> Result<RegexSet, Error>1990         pub fn build(&self) -> Result<RegexSet, Error> {
1991             self.builder.build_many_bytes()
1992         }
1993 
1994         /// This configures Unicode mode for the all of the patterns.
1995         ///
1996         /// Enabling Unicode mode does a number of things:
1997         ///
1998         /// * Most fundamentally, it causes the fundamental atom of matching
1999         /// to be a single codepoint. When Unicode mode is disabled, it's a
2000         /// single byte. For example, when Unicode mode is enabled, `.` will
2001         /// match `��` once, where as it will match 4 times when Unicode mode
2002         /// is disabled. (Since the UTF-8 encoding of `��` is 4 bytes long.)
2003         /// * Case insensitive matching uses Unicode simple case folding rules.
2004         /// * Unicode character classes like `\p{Letter}` and `\p{Greek}` are
2005         /// available.
2006         /// * Perl character classes are Unicode aware. That is, `\w`, `\s` and
2007         /// `\d`.
2008         /// * The word boundary assertions, `\b` and `\B`, use the Unicode
2009         /// definition of a word character.
2010         ///
2011         /// Note that unlike the top-level `RegexSet` for searching `&str`,
2012         /// it is permitted to disable Unicode mode even if the resulting
2013         /// pattern could match invalid UTF-8. For example, `(?-u:.)` is not
2014         /// a valid pattern for a top-level `RegexSet`, but is valid for a
2015         /// `bytes::RegexSet`.
2016         ///
2017         /// For more details on the Unicode support in this crate, see the
2018         /// [Unicode section](crate#unicode) in this crate's top-level
2019         /// documentation.
2020         ///
2021         /// The default for this is `true`.
2022         ///
2023         /// # Example
2024         ///
2025         /// ```
2026         /// use regex::bytes::RegexSetBuilder;
2027         ///
2028         /// let re = RegexSetBuilder::new([r"\w"])
2029         ///     .unicode(false)
2030         ///     .build()
2031         ///     .unwrap();
2032         /// // Normally greek letters would be included in \w, but since
2033         /// // Unicode mode is disabled, it only matches ASCII letters.
2034         /// assert!(!re.is_match("δ".as_bytes()));
2035         ///
2036         /// let re = RegexSetBuilder::new([r"s"])
2037         ///     .case_insensitive(true)
2038         ///     .unicode(false)
2039         ///     .build()
2040         ///     .unwrap();
2041         /// // Normally 'ſ' is included when searching for 's' case
2042         /// // insensitively due to Unicode's simple case folding rules. But
2043         /// // when Unicode mode is disabled, only ASCII case insensitive rules
2044         /// // are used.
2045         /// assert!(!re.is_match("ſ".as_bytes()));
2046         /// ```
2047         ///
2048         /// Since this builder is for constructing a
2049         /// [`bytes::RegexSet`](RegexSet), one can disable Unicode mode even if
2050         /// it would match invalid UTF-8:
2051         ///
2052         /// ```
2053         /// use regex::bytes::RegexSetBuilder;
2054         ///
2055         /// let re = RegexSetBuilder::new([r"."])
2056         ///     .unicode(false)
2057         ///     .build()
2058         ///     .unwrap();
2059         /// // Normally greek letters would be included in \w, but since
2060         /// // Unicode mode is disabled, it only matches ASCII letters.
2061         /// assert!(re.is_match(b"\xFF"));
2062         /// ```
unicode(&mut self, yes: bool) -> &mut RegexSetBuilder2063         pub fn unicode(&mut self, yes: bool) -> &mut RegexSetBuilder {
2064             self.builder.unicode(yes);
2065             self
2066         }
2067 
2068         /// This configures whether to enable case insensitive matching for all
2069         /// of the patterns.
2070         ///
2071         /// This setting can also be configured using the inline flag `i`
2072         /// in the pattern. For example, `(?i:foo)` matches `foo` case
2073         /// insensitively while `(?-i:foo)` matches `foo` case sensitively.
2074         ///
2075         /// The default for this is `false`.
2076         ///
2077         /// # Example
2078         ///
2079         /// ```
2080         /// use regex::bytes::RegexSetBuilder;
2081         ///
2082         /// let re = RegexSetBuilder::new([r"foo(?-i:bar)quux"])
2083         ///     .case_insensitive(true)
2084         ///     .build()
2085         ///     .unwrap();
2086         /// assert!(re.is_match(b"FoObarQuUx"));
2087         /// // Even though case insensitive matching is enabled in the builder,
2088         /// // it can be locally disabled within the pattern. In this case,
2089         /// // `bar` is matched case sensitively.
2090         /// assert!(!re.is_match(b"fooBARquux"));
2091         /// ```
case_insensitive(&mut self, yes: bool) -> &mut RegexSetBuilder2092         pub fn case_insensitive(&mut self, yes: bool) -> &mut RegexSetBuilder {
2093             self.builder.case_insensitive(yes);
2094             self
2095         }
2096 
2097         /// This configures multi-line mode for all of the patterns.
2098         ///
2099         /// Enabling multi-line mode changes the behavior of the `^` and `$`
2100         /// anchor assertions. Instead of only matching at the beginning and
2101         /// end of a haystack, respectively, multi-line mode causes them to
2102         /// match at the beginning and end of a line *in addition* to the
2103         /// beginning and end of a haystack. More precisely, `^` will match at
2104         /// the position immediately following a `\n` and `$` will match at the
2105         /// position immediately preceding a `\n`.
2106         ///
2107         /// The behavior of this option can be impacted by other settings too:
2108         ///
2109         /// * The [`RegexSetBuilder::line_terminator`] option changes `\n`
2110         /// above to any ASCII byte.
2111         /// * The [`RegexSetBuilder::crlf`] option changes the line terminator
2112         /// to be either `\r` or `\n`, but never at the position between a `\r`
2113         /// and `\n`.
2114         ///
2115         /// This setting can also be configured using the inline flag `m` in
2116         /// the pattern.
2117         ///
2118         /// The default for this is `false`.
2119         ///
2120         /// # Example
2121         ///
2122         /// ```
2123         /// use regex::bytes::RegexSetBuilder;
2124         ///
2125         /// let re = RegexSetBuilder::new([r"^foo$"])
2126         ///     .multi_line(true)
2127         ///     .build()
2128         ///     .unwrap();
2129         /// assert!(re.is_match(b"\nfoo\n"));
2130         /// ```
multi_line(&mut self, yes: bool) -> &mut RegexSetBuilder2131         pub fn multi_line(&mut self, yes: bool) -> &mut RegexSetBuilder {
2132             self.builder.multi_line(yes);
2133             self
2134         }
2135 
2136         /// This configures dot-matches-new-line mode for the entire pattern.
2137         ///
2138         /// Perhaps surprisingly, the default behavior for `.` is not to match
2139         /// any character, but rather, to match any character except for the
2140         /// line terminator (which is `\n` by default). When this mode is
2141         /// enabled, the behavior changes such that `.` truly matches any
2142         /// character.
2143         ///
2144         /// This setting can also be configured using the inline flag `s` in
2145         /// the pattern. For example, `(?s:.)` and `\p{any}` are equivalent
2146         /// regexes.
2147         ///
2148         /// The default for this is `false`.
2149         ///
2150         /// # Example
2151         ///
2152         /// ```
2153         /// use regex::bytes::RegexSetBuilder;
2154         ///
2155         /// let re = RegexSetBuilder::new([r"foo.bar"])
2156         ///     .dot_matches_new_line(true)
2157         ///     .build()
2158         ///     .unwrap();
2159         /// let hay = b"foo\nbar";
2160         /// assert!(re.is_match(hay));
2161         /// ```
dot_matches_new_line( &mut self, yes: bool, ) -> &mut RegexSetBuilder2162         pub fn dot_matches_new_line(
2163             &mut self,
2164             yes: bool,
2165         ) -> &mut RegexSetBuilder {
2166             self.builder.dot_matches_new_line(yes);
2167             self
2168         }
2169 
2170         /// This configures CRLF mode for all of the patterns.
2171         ///
2172         /// When CRLF mode is enabled, both `\r` ("carriage return" or CR for
2173         /// short) and `\n` ("line feed" or LF for short) are treated as line
2174         /// terminators. This results in the following:
2175         ///
2176         /// * Unless dot-matches-new-line mode is enabled, `.` will now match
2177         /// any character except for `\n` and `\r`.
2178         /// * When multi-line mode is enabled, `^` will match immediately
2179         /// following a `\n` or a `\r`. Similarly, `$` will match immediately
2180         /// preceding a `\n` or a `\r`. Neither `^` nor `$` will ever match
2181         /// between `\r` and `\n`.
2182         ///
2183         /// This setting can also be configured using the inline flag `R` in
2184         /// the pattern.
2185         ///
2186         /// The default for this is `false`.
2187         ///
2188         /// # Example
2189         ///
2190         /// ```
2191         /// use regex::bytes::RegexSetBuilder;
2192         ///
2193         /// let re = RegexSetBuilder::new([r"^foo$"])
2194         ///     .multi_line(true)
2195         ///     .crlf(true)
2196         ///     .build()
2197         ///     .unwrap();
2198         /// let hay = b"\r\nfoo\r\n";
2199         /// // If CRLF mode weren't enabled here, then '$' wouldn't match
2200         /// // immediately after 'foo', and thus no match would be found.
2201         /// assert!(re.is_match(hay));
2202         /// ```
2203         ///
2204         /// This example demonstrates that `^` will never match at a position
2205         /// between `\r` and `\n`. (`$` will similarly not match between a `\r`
2206         /// and a `\n`.)
2207         ///
2208         /// ```
2209         /// use regex::bytes::RegexSetBuilder;
2210         ///
2211         /// let re = RegexSetBuilder::new([r"^\n"])
2212         ///     .multi_line(true)
2213         ///     .crlf(true)
2214         ///     .build()
2215         ///     .unwrap();
2216         /// assert!(!re.is_match(b"\r\n"));
2217         /// ```
crlf(&mut self, yes: bool) -> &mut RegexSetBuilder2218         pub fn crlf(&mut self, yes: bool) -> &mut RegexSetBuilder {
2219             self.builder.crlf(yes);
2220             self
2221         }
2222 
2223         /// Configures the line terminator to be used by the regex.
2224         ///
2225         /// The line terminator is relevant in two ways for a particular regex:
2226         ///
2227         /// * When dot-matches-new-line mode is *not* enabled (the default),
2228         /// then `.` will match any character except for the configured line
2229         /// terminator.
2230         /// * When multi-line mode is enabled (not the default), then `^` and
2231         /// `$` will match immediately after and before, respectively, a line
2232         /// terminator.
2233         ///
2234         /// In both cases, if CRLF mode is enabled in a particular context,
2235         /// then it takes precedence over any configured line terminator.
2236         ///
2237         /// This option cannot be configured from within the pattern.
2238         ///
2239         /// The default line terminator is `\n`.
2240         ///
2241         /// # Example
2242         ///
2243         /// This shows how to treat the NUL byte as a line terminator. This can
2244         /// be a useful heuristic when searching binary data.
2245         ///
2246         /// ```
2247         /// use regex::bytes::RegexSetBuilder;
2248         ///
2249         /// let re = RegexSetBuilder::new([r"^foo$"])
2250         ///     .multi_line(true)
2251         ///     .line_terminator(b'\x00')
2252         ///     .build()
2253         ///     .unwrap();
2254         /// let hay = b"\x00foo\x00";
2255         /// assert!(re.is_match(hay));
2256         /// ```
2257         ///
2258         /// This example shows that the behavior of `.` is impacted by this
2259         /// setting as well:
2260         ///
2261         /// ```
2262         /// use regex::bytes::RegexSetBuilder;
2263         ///
2264         /// let re = RegexSetBuilder::new([r"."])
2265         ///     .line_terminator(b'\x00')
2266         ///     .build()
2267         ///     .unwrap();
2268         /// assert!(re.is_match(b"\n"));
2269         /// assert!(!re.is_match(b"\x00"));
2270         /// ```
2271         ///
2272         /// This shows that building a regex will work even when the byte given
2273         /// is not ASCII. This is unlike the top-level `RegexSet` API where
2274         /// matching invalid UTF-8 is not allowed.
2275         ///
2276         /// Note though that you must disable Unicode mode. This is required
2277         /// because Unicode mode requires matching one codepoint at a time,
2278         /// and there is no way to match a non-ASCII byte as if it were a
2279         /// codepoint.
2280         ///
2281         /// ```
2282         /// use regex::bytes::RegexSetBuilder;
2283         ///
2284         /// assert!(
2285         ///     RegexSetBuilder::new([r"."])
2286         ///         .unicode(false)
2287         ///         .line_terminator(0x80)
2288         ///         .build()
2289         ///         .is_ok(),
2290         /// );
2291         /// ```
line_terminator(&mut self, byte: u8) -> &mut RegexSetBuilder2292         pub fn line_terminator(&mut self, byte: u8) -> &mut RegexSetBuilder {
2293             self.builder.line_terminator(byte);
2294             self
2295         }
2296 
2297         /// This configures swap-greed mode for all of the patterns.
2298         ///
2299         /// When swap-greed mode is enabled, patterns like `a+` will become
2300         /// non-greedy and patterns like `a+?` will become greedy. In other
2301         /// words, the meanings of `a+` and `a+?` are switched.
2302         ///
2303         /// This setting can also be configured using the inline flag `U` in
2304         /// the pattern.
2305         ///
2306         /// Note that this is generally not useful for a `RegexSet` since a
2307         /// `RegexSet` can only report whether a pattern matches or not. Since
2308         /// greediness never impacts whether a match is found or not (only the
2309         /// offsets of the match), it follows that whether parts of a pattern
2310         /// are greedy or not doesn't matter for a `RegexSet`.
2311         ///
2312         /// The default for this is `false`.
swap_greed(&mut self, yes: bool) -> &mut RegexSetBuilder2313         pub fn swap_greed(&mut self, yes: bool) -> &mut RegexSetBuilder {
2314             self.builder.swap_greed(yes);
2315             self
2316         }
2317 
2318         /// This configures verbose mode for all of the patterns.
2319         ///
2320         /// When enabled, whitespace will treated as insignifcant in the
2321         /// pattern and `#` can be used to start a comment until the next new
2322         /// line.
2323         ///
2324         /// Normally, in most places in a pattern, whitespace is treated
2325         /// literally. For example ` +` will match one or more ASCII whitespace
2326         /// characters.
2327         ///
2328         /// When verbose mode is enabled, `\#` can be used to match a literal
2329         /// `#` and `\ ` can be used to match a literal ASCII whitespace
2330         /// character.
2331         ///
2332         /// Verbose mode is useful for permitting regexes to be formatted and
2333         /// broken up more nicely. This may make them more easily readable.
2334         ///
2335         /// This setting can also be configured using the inline flag `x` in
2336         /// the pattern.
2337         ///
2338         /// The default for this is `false`.
2339         ///
2340         /// # Example
2341         ///
2342         /// ```
2343         /// use regex::bytes::RegexSetBuilder;
2344         ///
2345         /// let pat = r"
2346         ///     \b
2347         ///     (?<first>\p{Uppercase}\w*)  # always start with uppercase letter
2348         ///     [\s--\n]+                   # whitespace should separate names
2349         ///     (?: # middle name can be an initial!
2350         ///         (?:(?<initial>\p{Uppercase})\.|(?<middle>\p{Uppercase}\w*))
2351         ///         [\s--\n]+
2352         ///     )?
2353         ///     (?<last>\p{Uppercase}\w*)
2354         ///     \b
2355         /// ";
2356         /// let re = RegexSetBuilder::new([pat])
2357         ///     .ignore_whitespace(true)
2358         ///     .build()
2359         ///     .unwrap();
2360         /// assert!(re.is_match(b"Harry Potter"));
2361         /// assert!(re.is_match(b"Harry J. Potter"));
2362         /// assert!(re.is_match(b"Harry James Potter"));
2363         /// assert!(!re.is_match(b"harry J. Potter"));
2364         /// ```
ignore_whitespace( &mut self, yes: bool, ) -> &mut RegexSetBuilder2365         pub fn ignore_whitespace(
2366             &mut self,
2367             yes: bool,
2368         ) -> &mut RegexSetBuilder {
2369             self.builder.ignore_whitespace(yes);
2370             self
2371         }
2372 
2373         /// This configures octal mode for all of the patterns.
2374         ///
2375         /// Octal syntax is a little-known way of uttering Unicode codepoints
2376         /// in a pattern. For example, `a`, `\x61`, `\u0061` and `\141` are all
2377         /// equivalent patterns, where the last example shows octal syntax.
2378         ///
2379         /// While supporting octal syntax isn't in and of itself a problem,
2380         /// it does make good error messages harder. That is, in PCRE based
2381         /// regex engines, syntax like `\1` invokes a backreference, which is
2382         /// explicitly unsupported this library. However, many users expect
2383         /// backreferences to be supported. Therefore, when octal support
2384         /// is disabled, the error message will explicitly mention that
2385         /// backreferences aren't supported.
2386         ///
2387         /// The default for this is `false`.
2388         ///
2389         /// # Example
2390         ///
2391         /// ```
2392         /// use regex::bytes::RegexSetBuilder;
2393         ///
2394         /// // Normally this pattern would not compile, with an error message
2395         /// // about backreferences not being supported. But with octal mode
2396         /// // enabled, octal escape sequences work.
2397         /// let re = RegexSetBuilder::new([r"\141"])
2398         ///     .octal(true)
2399         ///     .build()
2400         ///     .unwrap();
2401         /// assert!(re.is_match(b"a"));
2402         /// ```
octal(&mut self, yes: bool) -> &mut RegexSetBuilder2403         pub fn octal(&mut self, yes: bool) -> &mut RegexSetBuilder {
2404             self.builder.octal(yes);
2405             self
2406         }
2407 
2408         /// Sets the approximate size limit, in bytes, of the compiled regex.
2409         ///
2410         /// This roughly corresponds to the number of heap memory, in
2411         /// bytes, occupied by a single regex. If the regex would otherwise
2412         /// approximately exceed this limit, then compiling that regex will
2413         /// fail.
2414         ///
2415         /// The main utility of a method like this is to avoid compiling
2416         /// regexes that use an unexpected amount of resources, such as
2417         /// time and memory. Even if the memory usage of a large regex is
2418         /// acceptable, its search time may not be. Namely, worst case time
2419         /// complexity for search is `O(m * n)`, where `m ~ len(pattern)` and
2420         /// `n ~ len(haystack)`. That is, search time depends, in part, on the
2421         /// size of the compiled regex. This means that putting a limit on the
2422         /// size of the regex limits how much a regex can impact search time.
2423         ///
2424         /// For more information about regex size limits, see the section on
2425         /// [untrusted inputs](crate#untrusted-input) in the top-level crate
2426         /// documentation.
2427         ///
2428         /// The default for this is some reasonable number that permits most
2429         /// patterns to compile successfully.
2430         ///
2431         /// # Example
2432         ///
2433         /// ```
2434         /// # if !cfg!(target_pointer_width = "64") { return; } // see #1041
2435         /// use regex::bytes::RegexSetBuilder;
2436         ///
2437         /// // It may surprise you how big some seemingly small patterns can
2438         /// // be! Since \w is Unicode aware, this generates a regex that can
2439         /// // match approximately 140,000 distinct codepoints.
2440         /// assert!(
2441         ///     RegexSetBuilder::new([r"\w"])
2442         ///         .size_limit(45_000)
2443         ///         .build()
2444         ///         .is_err()
2445         /// );
2446         /// ```
size_limit(&mut self, bytes: usize) -> &mut RegexSetBuilder2447         pub fn size_limit(&mut self, bytes: usize) -> &mut RegexSetBuilder {
2448             self.builder.size_limit(bytes);
2449             self
2450         }
2451 
2452         /// Set the approximate capacity, in bytes, of the cache of transitions
2453         /// used by the lazy DFA.
2454         ///
2455         /// While the lazy DFA isn't always used, in tends to be the most
2456         /// commonly use regex engine in default configurations. It tends to
2457         /// adopt the performance profile of a fully build DFA, but without the
2458         /// downside of taking worst case exponential time to build.
2459         ///
2460         /// The downside is that it needs to keep a cache of transitions and
2461         /// states that are built while running a search, and this cache
2462         /// can fill up. When it fills up, the cache will reset itself. Any
2463         /// previously generated states and transitions will then need to be
2464         /// re-generated. If this happens too many times, then this library
2465         /// will bail out of using the lazy DFA and switch to a different regex
2466         /// engine.
2467         ///
2468         /// If your regex provokes this particular downside of the lazy DFA,
2469         /// then it may be beneficial to increase its cache capacity. This will
2470         /// potentially reduce the frequency of cache resetting (ideally to
2471         /// `0`). While it won't fix all potential performance problems with
2472         /// the lazy DFA, increasing the cache capacity does fix some.
2473         ///
2474         /// There is no easy way to determine, a priori, whether increasing
2475         /// this cache capacity will help. In general, the larger your regex,
2476         /// the more cache it's likely to use. But that isn't an ironclad rule.
2477         /// For example, a regex like `[01]*1[01]{N}` would normally produce a
2478         /// fully build DFA that is exponential in size with respect to `N`.
2479         /// The lazy DFA will prevent exponential space blow-up, but it cache
2480         /// is likely to fill up, even when it's large and even for smallish
2481         /// values of `N`.
2482         ///
2483         /// If you aren't sure whether this helps or not, it is sensible to
2484         /// set this to some arbitrarily large number in testing, such as
2485         /// `usize::MAX`. Namely, this represents the amount of capacity that
2486         /// *may* be used. It's probably not a good idea to use `usize::MAX` in
2487         /// production though, since it implies there are no controls on heap
2488         /// memory used by this library during a search. In effect, set it to
2489         /// whatever you're willing to allocate for a single regex search.
dfa_size_limit( &mut self, bytes: usize, ) -> &mut RegexSetBuilder2490         pub fn dfa_size_limit(
2491             &mut self,
2492             bytes: usize,
2493         ) -> &mut RegexSetBuilder {
2494             self.builder.dfa_size_limit(bytes);
2495             self
2496         }
2497 
2498         /// Set the nesting limit for this parser.
2499         ///
2500         /// The nesting limit controls how deep the abstract syntax tree is
2501         /// allowed to be. If the AST exceeds the given limit (e.g., with too
2502         /// many nested groups), then an error is returned by the parser.
2503         ///
2504         /// The purpose of this limit is to act as a heuristic to prevent stack
2505         /// overflow for consumers that do structural induction on an AST using
2506         /// explicit recursion. While this crate never does this (instead using
2507         /// constant stack space and moving the call stack to the heap), other
2508         /// crates may.
2509         ///
2510         /// This limit is not checked until the entire AST is parsed.
2511         /// Therefore, if callers want to put a limit on the amount of heap
2512         /// space used, then they should impose a limit on the length, in
2513         /// bytes, of the concrete pattern string. In particular, this is
2514         /// viable since this parser implementation will limit itself to heap
2515         /// space proportional to the length of the pattern string. See also
2516         /// the [untrusted inputs](crate#untrusted-input) section in the
2517         /// top-level crate documentation for more information about this.
2518         ///
2519         /// Note that a nest limit of `0` will return a nest limit error for
2520         /// most patterns but not all. For example, a nest limit of `0` permits
2521         /// `a` but not `ab`, since `ab` requires an explicit concatenation,
2522         /// which results in a nest depth of `1`. In general, a nest limit is
2523         /// not something that manifests in an obvious way in the concrete
2524         /// syntax, therefore, it should not be used in a granular way.
2525         ///
2526         /// # Example
2527         ///
2528         /// ```
2529         /// use regex::bytes::RegexSetBuilder;
2530         ///
2531         /// assert!(RegexSetBuilder::new([r"a"]).nest_limit(0).build().is_ok());
2532         /// assert!(RegexSetBuilder::new([r"ab"]).nest_limit(0).build().is_err());
2533         /// ```
nest_limit(&mut self, limit: u32) -> &mut RegexSetBuilder2534         pub fn nest_limit(&mut self, limit: u32) -> &mut RegexSetBuilder {
2535             self.builder.nest_limit(limit);
2536             self
2537         }
2538     }
2539 }
2540