1 /*!
2 Provides an architecture independent implementation of the "packed pair"
3 algorithm.
4 
5 The "packed pair" algorithm is based on the [generic SIMD] algorithm. The main
6 difference is that it (by default) uses a background distribution of byte
7 frequencies to heuristically select the pair of bytes to search for. Note that
8 this module provides an architecture independent version that doesn't do as
9 good of a job keeping the search for candidates inside a SIMD hot path. It
10 however can be good enough in many circumstances.
11 
12 [generic SIMD]: http://0x80.pl/articles/simd-strfind.html#first-and-last
13 */
14 
15 use crate::memchr;
16 
17 mod default_rank;
18 
19 /// An architecture independent "packed pair" finder.
20 ///
21 /// This finder picks two bytes that it believes have high predictive power for
22 /// indicating an overall match of a needle. At search time, it reports offsets
23 /// where the needle could match based on whether the pair of bytes it chose
24 /// match.
25 ///
26 /// This is architecture independent because it utilizes `memchr` to find the
27 /// occurrence of one of the bytes in the pair, and then checks whether the
28 /// second byte matches. If it does, in the case of [`Finder::find_prefilter`],
29 /// the location at which the needle could match is returned.
30 ///
31 /// It is generally preferred to use architecture specific routines for a
32 /// "packed pair" prefilter, but this can be a useful fallback when the
33 /// architecture independent routines are unavailable.
#[derive(Clone, Copy, Debug)]
pub struct Finder {
    /// The pair of needle offsets used as the match predicate.
    pair: Pair,
    /// The needle byte at offset `pair.index1()`. Cached here so the search
    /// loop doesn't need to re-index the needle.
    byte1: u8,
    /// The needle byte at offset `pair.index2()`.
    byte2: u8,
}
40 
41 impl Finder {
42     /// Create a new prefilter that reports possible locations where the given
43     /// needle matches.
44     #[inline]
new(needle: &[u8]) -> Option<Finder>45     pub fn new(needle: &[u8]) -> Option<Finder> {
46         Finder::with_pair(needle, Pair::new(needle)?)
47     }
48 
49     /// Create a new prefilter using the pair given.
50     ///
51     /// If the prefilter could not be constructed, then `None` is returned.
52     ///
53     /// This constructor permits callers to control precisely which pair of
54     /// bytes is used as a predicate.
55     #[inline]
with_pair(needle: &[u8], pair: Pair) -> Option<Finder>56     pub fn with_pair(needle: &[u8], pair: Pair) -> Option<Finder> {
57         let byte1 = needle[usize::from(pair.index1())];
58         let byte2 = needle[usize::from(pair.index2())];
59         // Currently this can never fail so we could just return a Finder,
60         // but it's conceivable this could change.
61         Some(Finder { pair, byte1, byte2 })
62     }
63 
64     /// Run this finder on the given haystack as a prefilter.
65     ///
66     /// If a candidate match is found, then an offset where the needle *could*
67     /// begin in the haystack is returned.
68     #[inline]
find_prefilter(&self, haystack: &[u8]) -> Option<usize>69     pub fn find_prefilter(&self, haystack: &[u8]) -> Option<usize> {
70         let mut i = 0;
71         let index1 = usize::from(self.pair.index1());
72         let index2 = usize::from(self.pair.index2());
73         loop {
74             // Use a fast vectorized implementation to skip to the next
75             // occurrence of the rarest byte (heuristically chosen) in the
76             // needle.
77             i += memchr(self.byte1, &haystack[i..])?;
78             let found = i;
79             i += 1;
80 
81             // If we can't align our first byte match with the haystack, then a
82             // match is impossible.
83             let aligned1 = match found.checked_sub(index1) {
84                 None => continue,
85                 Some(aligned1) => aligned1,
86             };
87 
88             // Now align the second byte match with the haystack. A mismatch
89             // means that a match is impossible.
90             let aligned2 = match aligned1.checked_add(index2) {
91                 None => continue,
92                 Some(aligned_index2) => aligned_index2,
93             };
94             if haystack.get(aligned2).map_or(true, |&b| b != self.byte2) {
95                 continue;
96             }
97 
98             // We've done what we can. There might be a match here.
99             return Some(aligned1);
100         }
101     }
102 
103     /// Returns the pair of offsets (into the needle) used to check as a
104     /// predicate before confirming whether a needle exists at a particular
105     /// position.
106     #[inline]
pair(&self) -> &Pair107     pub fn pair(&self) -> &Pair {
108         &self.pair
109     }
110 }
111 
112 /// A pair of byte offsets into a needle to use as a predicate.
113 ///
114 /// This pair is used as a predicate to quickly filter out positions in a
115 /// haystack in which a needle cannot match. In some cases, this pair can even
116 /// be used in vector algorithms such that the vector algorithm only switches
117 /// over to scalar code once this pair has been found.
118 ///
119 /// A pair of offsets can be used in both substring search implementations and
120 /// in prefilters. The former will report matches of a needle in a haystack
121 /// where as the latter will only report possible matches of a needle.
122 ///
123 /// The offsets are limited each to a maximum of 255 to keep memory usage low.
124 /// Moreover, it's rarely advantageous to create a predicate using offsets
125 /// greater than 255 anyway.
126 ///
127 /// The only guarantee enforced on the pair of offsets is that they are not
128 /// equivalent. It is not necessarily the case that `index1 < index2` for
/// example. By convention, `index1` corresponds to the byte in the needle
/// that is believed to be the most predictive. Note also that because of the
131 /// requirement that the indices be both valid for the needle used to build
132 /// the pair and not equal, it follows that a pair can only be constructed for
133 /// needles with length at least 2.
#[derive(Clone, Copy, Debug)]
pub struct Pair {
    /// Offset (into the needle) of the byte believed most predictive of a
    /// match. Stored as a `u8` to keep the type small; offsets never exceed
    /// 255 by construction.
    index1: u8,
    /// Offset of the second predicate byte. Guaranteed distinct from
    /// `index1`, but not necessarily greater than it.
    index2: u8,
}
139 
140 impl Pair {
141     /// Create a new pair of offsets from the given needle.
142     ///
143     /// If a pair could not be created (for example, if the needle is too
144     /// short), then `None` is returned.
145     ///
146     /// This chooses the pair in the needle that is believed to be as
147     /// predictive of an overall match of the needle as possible.
148     #[inline]
new(needle: &[u8]) -> Option<Pair>149     pub fn new(needle: &[u8]) -> Option<Pair> {
150         Pair::with_ranker(needle, DefaultFrequencyRank)
151     }
152 
153     /// Create a new pair of offsets from the given needle and ranker.
154     ///
155     /// This permits the caller to choose a background frequency distribution
156     /// with which bytes are selected. The idea is to select a pair of bytes
157     /// that is believed to strongly predict a match in the haystack. This
158     /// usually means selecting bytes that occur rarely in a haystack.
159     ///
160     /// If a pair could not be created (for example, if the needle is too
161     /// short), then `None` is returned.
162     #[inline]
with_ranker<R: HeuristicFrequencyRank>( needle: &[u8], ranker: R, ) -> Option<Pair>163     pub fn with_ranker<R: HeuristicFrequencyRank>(
164         needle: &[u8],
165         ranker: R,
166     ) -> Option<Pair> {
167         if needle.len() <= 1 {
168             return None;
169         }
170         // Find the rarest two bytes. We make them distinct indices by
171         // construction. (The actual byte value may be the same in degenerate
172         // cases, but that's OK.)
173         let (mut rare1, mut index1) = (needle[0], 0);
174         let (mut rare2, mut index2) = (needle[1], 1);
175         if ranker.rank(rare2) < ranker.rank(rare1) {
176             core::mem::swap(&mut rare1, &mut rare2);
177             core::mem::swap(&mut index1, &mut index2);
178         }
179         let max = usize::from(core::u8::MAX);
180         for (i, &b) in needle.iter().enumerate().take(max).skip(2) {
181             if ranker.rank(b) < ranker.rank(rare1) {
182                 rare2 = rare1;
183                 index2 = index1;
184                 rare1 = b;
185                 index1 = u8::try_from(i).unwrap();
186             } else if b != rare1 && ranker.rank(b) < ranker.rank(rare2) {
187                 rare2 = b;
188                 index2 = u8::try_from(i).unwrap();
189             }
190         }
191         // While not strictly required for how a Pair is normally used, we
192         // really don't want these to be equivalent. If they were, it would
193         // reduce the effectiveness of candidate searching using these rare
194         // bytes by increasing the rate of false positives.
195         assert_ne!(index1, index2);
196         Some(Pair { index1, index2 })
197     }
198 
199     /// Create a new pair using the offsets given for the needle given.
200     ///
201     /// This bypasses any sort of heuristic process for choosing the offsets
202     /// and permits the caller to choose the offsets themselves.
203     ///
204     /// Indices are limited to valid `u8` values so that a `Pair` uses less
205     /// memory. It is not possible to create a `Pair` with offsets bigger than
206     /// `u8::MAX`. It's likely that such a thing is not needed, but if it is,
207     /// it's suggested to build your own bespoke algorithm because you're
208     /// likely working on a very niche case. (File an issue if this suggestion
209     /// does not make sense to you.)
210     ///
211     /// If a pair could not be created (for example, if the needle is too
212     /// short), then `None` is returned.
213     #[inline]
with_indices( needle: &[u8], index1: u8, index2: u8, ) -> Option<Pair>214     pub fn with_indices(
215         needle: &[u8],
216         index1: u8,
217         index2: u8,
218     ) -> Option<Pair> {
219         // While not strictly required for how a Pair is normally used, we
220         // really don't want these to be equivalent. If they were, it would
221         // reduce the effectiveness of candidate searching using these rare
222         // bytes by increasing the rate of false positives.
223         if index1 == index2 {
224             return None;
225         }
226         // Similarly, invalid indices means the Pair is invalid too.
227         if usize::from(index1) >= needle.len() {
228             return None;
229         }
230         if usize::from(index2) >= needle.len() {
231             return None;
232         }
233         Some(Pair { index1, index2 })
234     }
235 
236     /// Returns the first offset of the pair.
237     #[inline]
index1(&self) -> u8238     pub fn index1(&self) -> u8 {
239         self.index1
240     }
241 
242     /// Returns the second offset of the pair.
243     #[inline]
index2(&self) -> u8244     pub fn index2(&self) -> u8 {
245         self.index2
246     }
247 }
248 
249 /// This trait allows the user to customize the heuristic used to determine the
250 /// relative frequency of a given byte in the dataset being searched.
251 ///
252 /// The use of this trait can have a dramatic impact on performance depending
253 /// on the type of data being searched. The details of why are explained in the
254 /// docs of [`crate::memmem::Prefilter`]. To summarize, the core algorithm uses
255 /// a prefilter to quickly identify candidate matches that are later verified
256 /// more slowly. This prefilter is implemented in terms of trying to find
257 /// `rare` bytes at specific offsets that will occur less frequently in the
258 /// dataset. While the concept of a `rare` byte is similar for most datasets,
259 /// there are some specific datasets (like binary executables) that have
260 /// dramatically different byte distributions. For these datasets customizing
261 /// the byte frequency heuristic can have a massive impact on performance, and
262 /// might even need to be done at runtime.
263 ///
264 /// The default implementation of `HeuristicFrequencyRank` reads from the
265 /// static frequency table defined in `src/memmem/byte_frequencies.rs`. This
266 /// is optimal for most inputs, so if you are unsure of the impact of using a
267 /// custom `HeuristicFrequencyRank` you should probably just use the default.
268 ///
269 /// # Example
270 ///
271 /// ```
272 /// use memchr::{
273 ///     arch::all::packedpair::HeuristicFrequencyRank,
274 ///     memmem::FinderBuilder,
275 /// };
276 ///
277 /// /// A byte-frequency table that is good for scanning binary executables.
278 /// struct Binary;
279 ///
280 /// impl HeuristicFrequencyRank for Binary {
281 ///     fn rank(&self, byte: u8) -> u8 {
282 ///         const TABLE: [u8; 256] = [
283 ///             255, 128, 61, 43, 50, 41, 27, 28, 57, 15, 21, 13, 24, 17, 17,
284 ///             89, 58, 16, 11, 7, 14, 23, 7, 6, 24, 9, 6, 5, 9, 4, 7, 16,
285 ///             68, 11, 9, 6, 88, 7, 4, 4, 23, 9, 4, 8, 8, 5, 10, 4, 30, 11,
286 ///             9, 24, 11, 5, 5, 5, 19, 11, 6, 17, 9, 9, 6, 8,
287 ///             48, 58, 11, 14, 53, 40, 9, 9, 254, 35, 3, 6, 52, 23, 6, 6, 27,
288 ///             4, 7, 11, 14, 13, 10, 11, 11, 5, 2, 10, 16, 12, 6, 19,
289 ///             19, 20, 5, 14, 16, 31, 19, 7, 14, 20, 4, 4, 19, 8, 18, 20, 24,
290 ///             1, 25, 19, 58, 29, 10, 5, 15, 20, 2, 2, 9, 4, 3, 5,
291 ///             51, 11, 4, 53, 23, 39, 6, 4, 13, 81, 4, 186, 5, 67, 3, 2, 15,
292 ///             0, 0, 1, 3, 2, 0, 0, 5, 0, 0, 0, 2, 0, 0, 0,
293 ///             12, 2, 1, 1, 3, 1, 1, 1, 6, 1, 2, 1, 3, 1, 1, 2, 9, 1, 1, 0,
294 ///             2, 2, 4, 4, 11, 6, 7, 3, 6, 9, 4, 5,
295 ///             46, 18, 8, 18, 17, 3, 8, 20, 16, 10, 3, 7, 175, 4, 6, 7, 13,
296 ///             3, 7, 3, 3, 1, 3, 3, 10, 3, 1, 5, 2, 0, 1, 2,
297 ///             16, 3, 5, 1, 6, 1, 1, 2, 58, 20, 3, 14, 12, 2, 1, 3, 16, 3, 5,
298 ///             8, 3, 1, 8, 6, 17, 6, 5, 3, 8, 6, 13, 175,
299 ///         ];
300 ///         TABLE[byte as usize]
301 ///     }
302 /// }
303 /// // Create a new finder with the custom heuristic.
304 /// let finder = FinderBuilder::new()
305 ///     .build_forward_with_ranker(Binary, b"\x00\x00\xdd\xdd");
306 /// // Find needle with custom heuristic.
307 /// assert!(finder.find(b"\x00\x00\x00\xdd\xdd").is_some());
308 /// ```
pub trait HeuristicFrequencyRank {
    /// Return the heuristic frequency rank of the given byte. A lower rank
    /// means the byte is believed to occur less frequently in the haystack.
    ///
    /// Implementations should keep this cheap: `Pair::with_ranker` calls it
    /// several times for each byte of the needle it examines.
    ///
    /// Some uses of this heuristic may treat arbitrary absolute rank values as
    /// significant. For example, an implementation detail in this crate may
    /// determine that heuristic prefilters are inappropriate if every byte in
    /// the needle has a "high" rank.
    fn rank(&self, byte: u8) -> u8;
}
319 
320 /// The default byte frequency heuristic that is good for most haystacks.
321 pub(crate) struct DefaultFrequencyRank;
322 
323 impl HeuristicFrequencyRank for DefaultFrequencyRank {
rank(&self, byte: u8) -> u8324     fn rank(&self, byte: u8) -> u8 {
325         self::default_rank::RANK[usize::from(byte)]
326     }
327 }
328 
329 /// This permits passing any implementation of `HeuristicFrequencyRank` as a
330 /// borrowed version of itself.
331 impl<'a, R> HeuristicFrequencyRank for &'a R
332 where
333     R: HeuristicFrequencyRank,
334 {
rank(&self, byte: u8) -> u8335     fn rank(&self, byte: u8) -> u8 {
336         (**self).rank(byte)
337     }
338 }
339 
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn forward_packedpair() {
        fn find(
            haystack: &[u8],
            needle: &[u8],
            _index1: u8,
            _index2: u8,
        ) -> Option<Option<usize>> {
            // The requested index positions are deliberately ignored here;
            // honoring them makes the overall test run too slow.
            Finder::new(needle).map(|f| f.find_prefilter(haystack))
        }
        crate::tests::packedpair::Runner::new().fwd(find).run()
    }
}
360