1 // See the README in this directory for an explanation of the Teddy algorithm. 2 3 use std::cmp; 4 use std::collections::BTreeMap; 5 use std::fmt; 6 7 use crate::packed::pattern::{PatternID, Patterns}; 8 use crate::packed::teddy::Teddy; 9 10 /// A builder for constructing a Teddy matcher. 11 /// 12 /// The builder primarily permits fine grained configuration of the Teddy 13 /// matcher. Most options are made only available for testing/benchmarking 14 /// purposes. In reality, options are automatically determined by the nature 15 /// and number of patterns given to the builder. 16 #[derive(Clone, Debug)] 17 pub struct Builder { 18 /// When none, this is automatically determined. Otherwise, `false` means 19 /// slim Teddy is used (8 buckets) and `true` means fat Teddy is used 20 /// (16 buckets). Fat Teddy requires AVX2, so if that CPU feature isn't 21 /// available and Fat Teddy was requested, no matcher will be built. 22 fat: Option<bool>, 23 /// When none, this is automatically determined. Otherwise, `false` means 24 /// that 128-bit vectors will be used (up to SSSE3 instructions) where as 25 /// `true` means that 256-bit vectors will be used. As with `fat`, if 26 /// 256-bit vectors are requested and they aren't available, then a 27 /// searcher will not be built. 28 avx: Option<bool>, 29 } 30 31 impl Default for Builder { default() -> Builder32 fn default() -> Builder { 33 Builder::new() 34 } 35 } 36 37 impl Builder { 38 /// Create a new builder for configuring a Teddy matcher. new() -> Builder39 pub fn new() -> Builder { 40 Builder { fat: None, avx: None } 41 } 42 43 /// Build a matcher for the set of patterns given. If a matcher could not 44 /// be built, then `None` is returned. 45 /// 46 /// Generally, a matcher isn't built if the necessary CPU features aren't 47 /// available, an unsupported target or if the searcher is believed to be 48 /// slower than standard techniques (i.e., if there are too many literals). build(&self, patterns: &Patterns) -> Option<Teddy>49 pub fn build(&self, patterns: &Patterns) -> Option<Teddy> { 50 self.build_imp(patterns) 51 } 52 53 /// Require the use of Fat (true) or Slim (false) Teddy. Fat Teddy uses 54 /// 16 buckets where as Slim Teddy uses 8 buckets. More buckets are useful 55 /// for a larger set of literals. 56 /// 57 /// `None` is the default, which results in an automatic selection based 58 /// on the number of literals and available CPU features. fat(&mut self, yes: Option<bool>) -> &mut Builder59 pub fn fat(&mut self, yes: Option<bool>) -> &mut Builder { 60 self.fat = yes; 61 self 62 } 63 64 /// Request the use of 256-bit vectors (true) or 128-bit vectors (false). 65 /// Generally, a larger vector size is better since it either permits 66 /// matching more patterns or matching more bytes in the haystack at once. 67 /// 68 /// `None` is the default, which results in an automatic selection based on 69 /// the number of literals and available CPU features. avx(&mut self, yes: Option<bool>) -> &mut Builder70 pub fn avx(&mut self, yes: Option<bool>) -> &mut Builder { 71 self.avx = yes; 72 self 73 } 74 build_imp(&self, patterns: &Patterns) -> Option<Teddy>75 fn build_imp(&self, patterns: &Patterns) -> Option<Teddy> { 76 use crate::packed::teddy::runtime; 77 78 // Most of the logic here is just about selecting the optimal settings, 79 // or perhaps even rejecting construction altogether. The choices 80 // we have are: fat (avx only) or not, ssse3 or avx2, and how many 81 // patterns we allow ourselves to search. Additionally, for testing 82 // and benchmarking, we permit callers to try to "force" a setting, 83 // and if the setting isn't allowed (e.g., forcing AVX when AVX isn't 84 // available), then we bail and return nothing. 85 86 if patterns.len() > 64 { 87 return None; 88 } 89 let has_ssse3 = is_x86_feature_detected!("ssse3"); 90 let has_avx = is_x86_feature_detected!("avx2"); 91 let avx = if self.avx == Some(true) { 92 if !has_avx { 93 return None; 94 } 95 true 96 } else if self.avx == Some(false) { 97 if !has_ssse3 { 98 return None; 99 } 100 false 101 } else if !has_ssse3 && !has_avx { 102 return None; 103 } else { 104 has_avx 105 }; 106 let fat = match self.fat { 107 None => avx && patterns.len() > 32, 108 Some(false) => false, 109 Some(true) if !avx => return None, 110 Some(true) => true, 111 }; 112 113 let mut compiler = Compiler::new(patterns, fat); 114 compiler.compile(); 115 let Compiler { buckets, masks, .. } = compiler; 116 // SAFETY: It is required that the builder only produce Teddy matchers 117 // that are allowed to run on the current CPU, since we later assume 118 // that the presence of (for example) TeddySlim1Mask256 means it is 119 // safe to call functions marked with the `avx2` target feature. 120 match (masks.len(), avx, fat) { 121 (1, false, _) => Some(Teddy { 122 buckets, 123 max_pattern_id: patterns.max_pattern_id(), 124 exec: runtime::Exec::TeddySlim1Mask128( 125 runtime::TeddySlim1Mask128 { 126 mask1: runtime::Mask128::new(masks[0]), 127 }, 128 ), 129 }), 130 (1, true, false) => Some(Teddy { 131 buckets, 132 max_pattern_id: patterns.max_pattern_id(), 133 exec: runtime::Exec::TeddySlim1Mask256( 134 runtime::TeddySlim1Mask256 { 135 mask1: runtime::Mask256::new(masks[0]), 136 }, 137 ), 138 }), 139 (1, true, true) => Some(Teddy { 140 buckets, 141 max_pattern_id: patterns.max_pattern_id(), 142 exec: runtime::Exec::TeddyFat1Mask256( 143 runtime::TeddyFat1Mask256 { 144 mask1: runtime::Mask256::new(masks[0]), 145 }, 146 ), 147 }), 148 (2, false, _) => Some(Teddy { 149 buckets, 150 max_pattern_id: patterns.max_pattern_id(), 151 exec: runtime::Exec::TeddySlim2Mask128( 152 runtime::TeddySlim2Mask128 { 153 mask1: runtime::Mask128::new(masks[0]), 154 mask2: runtime::Mask128::new(masks[1]), 155 }, 156 ), 157 }), 158 (2, true, false) => Some(Teddy { 159 buckets, 160 max_pattern_id: patterns.max_pattern_id(), 161 exec: runtime::Exec::TeddySlim2Mask256( 162 runtime::TeddySlim2Mask256 { 163 mask1: runtime::Mask256::new(masks[0]), 164 mask2: runtime::Mask256::new(masks[1]), 165 }, 166 ), 167 }), 168 (2, true, true) => Some(Teddy { 169 buckets, 170 max_pattern_id: patterns.max_pattern_id(), 171 exec: runtime::Exec::TeddyFat2Mask256( 172 runtime::TeddyFat2Mask256 { 173 mask1: runtime::Mask256::new(masks[0]), 174 mask2: runtime::Mask256::new(masks[1]), 175 }, 176 ), 177 }), 178 (3, false, _) => Some(Teddy { 179 buckets, 180 max_pattern_id: patterns.max_pattern_id(), 181 exec: runtime::Exec::TeddySlim3Mask128( 182 runtime::TeddySlim3Mask128 { 183 mask1: runtime::Mask128::new(masks[0]), 184 mask2: runtime::Mask128::new(masks[1]), 185 mask3: runtime::Mask128::new(masks[2]), 186 }, 187 ), 188 }), 189 (3, true, false) => Some(Teddy { 190 buckets, 191 max_pattern_id: patterns.max_pattern_id(), 192 exec: runtime::Exec::TeddySlim3Mask256( 193 runtime::TeddySlim3Mask256 { 194 mask1: runtime::Mask256::new(masks[0]), 195 mask2: runtime::Mask256::new(masks[1]), 196 mask3: runtime::Mask256::new(masks[2]), 197 }, 198 ), 199 }), 200 (3, true, true) => Some(Teddy { 201 buckets, 202 max_pattern_id: patterns.max_pattern_id(), 203 exec: runtime::Exec::TeddyFat3Mask256( 204 runtime::TeddyFat3Mask256 { 205 mask1: runtime::Mask256::new(masks[0]), 206 mask2: runtime::Mask256::new(masks[1]), 207 mask3: runtime::Mask256::new(masks[2]), 208 }, 209 ), 210 }), 211 _ => unreachable!(), 212 } 213 } 214 } 215 216 /// A compiler is in charge of allocating patterns into buckets and generating 217 /// the masks necessary for searching. 218 #[derive(Clone)] 219 struct Compiler<'p> { 220 patterns: &'p Patterns, 221 buckets: Vec<Vec<PatternID>>, 222 masks: Vec<Mask>, 223 } 224 225 impl<'p> Compiler<'p> { 226 /// Create a new Teddy compiler for the given patterns. If `fat` is true, 227 /// then 16 buckets will be used instead of 8. 228 /// 229 /// This panics if any of the patterns given are empty. new(patterns: &'p Patterns, fat: bool) -> Compiler<'p>230 fn new(patterns: &'p Patterns, fat: bool) -> Compiler<'p> { 231 let mask_len = cmp::min(3, patterns.minimum_len()); 232 assert!(1 <= mask_len && mask_len <= 3); 233 234 Compiler { 235 patterns, 236 buckets: vec![vec![]; if fat { 16 } else { 8 }], 237 masks: vec![Mask::default(); mask_len], 238 } 239 } 240 241 /// Compile the patterns in this compiler into buckets and masks. compile(&mut self)242 fn compile(&mut self) { 243 let mut lonibble_to_bucket: BTreeMap<Vec<u8>, usize> = BTreeMap::new(); 244 for (id, pattern) in self.patterns.iter() { 245 // We try to be slightly clever in how we assign patterns into 246 // buckets. Generally speaking, we want patterns with the same 247 // prefix to be in the same bucket, since it minimizes the amount 248 // of time we spend churning through buckets in the verification 249 // step. 250 // 251 // So we could assign patterns with the same N-prefix (where N 252 // is the size of the mask, which is one of {1, 2, 3}) to the 253 // same bucket. However, case insensitive searches are fairly 254 // common, so we'd for example, ideally want to treat `abc` and 255 // `ABC` as if they shared the same prefix. ASCII has the nice 256 // property that the lower 4 bits of A and a are the same, so we 257 // therefore group patterns with the same low-nybbe-N-prefix into 258 // the same bucket. 259 // 260 // MOREOVER, this is actually necessary for correctness! In 261 // particular, by grouping patterns with the same prefix into the 262 // same bucket, we ensure that we preserve correct leftmost-first 263 // and leftmost-longest match semantics. In addition to the fact 264 // that `patterns.iter()` iterates in the correct order, this 265 // guarantees that all possible ambiguous matches will occur in 266 // the same bucket. The verification routine could be adjusted to 267 // support correct leftmost match semantics regardless of bucket 268 // allocation, but that results in a performance hit. It's much 269 // nicer to be able to just stop as soon as a match is found. 270 let lonybs = pattern.low_nybbles(self.masks.len()); 271 if let Some(&bucket) = lonibble_to_bucket.get(&lonybs) { 272 self.buckets[bucket].push(id); 273 } else { 274 // N.B. We assign buckets in reverse because it shouldn't have 275 // any influence on performance, but it does make it harder to 276 // get leftmost match semantics accidentally correct. 277 let bucket = (self.buckets.len() - 1) 278 - (id as usize % self.buckets.len()); 279 self.buckets[bucket].push(id); 280 lonibble_to_bucket.insert(lonybs, bucket); 281 } 282 } 283 for (bucket_index, bucket) in self.buckets.iter().enumerate() { 284 for &pat_id in bucket { 285 let pat = self.patterns.get(pat_id); 286 for (i, mask) in self.masks.iter_mut().enumerate() { 287 if self.buckets.len() == 8 { 288 mask.add_slim(bucket_index as u8, pat.bytes()[i]); 289 } else { 290 mask.add_fat(bucket_index as u8, pat.bytes()[i]); 291 } 292 } 293 } 294 } 295 } 296 } 297 298 impl<'p> fmt::Debug for Compiler<'p> { fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result299 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 300 let mut buckets = vec![vec![]; self.buckets.len()]; 301 for (i, bucket) in self.buckets.iter().enumerate() { 302 for &patid in bucket { 303 buckets[i].push(self.patterns.get(patid)); 304 } 305 } 306 f.debug_struct("Compiler") 307 .field("buckets", &buckets) 308 .field("masks", &self.masks) 309 .finish() 310 } 311 } 312 313 /// Mask represents the low and high nybble masks that will be used during 314 /// search. Each mask is 32 bytes wide, although only the first 16 bytes are 315 /// used for the SSSE3 runtime. 316 /// 317 /// Each byte in the mask corresponds to a 8-bit bitset, where bit `i` is set 318 /// if and only if the corresponding nybble is in the ith bucket. The index of 319 /// the byte (0-15, inclusive) corresponds to the nybble. 320 /// 321 /// Each mask is used as the target of a shuffle, where the indices for the 322 /// shuffle are taken from the haystack. AND'ing the shuffles for both the 323 /// low and high masks together also results in 8-bit bitsets, but where bit 324 /// `i` is set if and only if the correspond *byte* is in the ith bucket. 325 /// 326 /// During compilation, masks are just arrays. But during search, these masks 327 /// are represented as 128-bit or 256-bit vectors. 328 /// 329 /// (See the README is this directory for more details.) 330 #[derive(Clone, Copy, Default)] 331 pub struct Mask { 332 lo: [u8; 32], 333 hi: [u8; 32], 334 } 335 336 impl Mask { 337 /// Update this mask by adding the given byte to the given bucket. The 338 /// given bucket must be in the range 0-7. 339 /// 340 /// This is for "slim" Teddy, where there are only 8 buckets. add_slim(&mut self, bucket: u8, byte: u8)341 fn add_slim(&mut self, bucket: u8, byte: u8) { 342 assert!(bucket < 8); 343 344 let byte_lo = (byte & 0xF) as usize; 345 let byte_hi = ((byte >> 4) & 0xF) as usize; 346 // When using 256-bit vectors, we need to set this bucket assignment in 347 // the low and high 128-bit portions of the mask. This allows us to 348 // process 32 bytes at a time. Namely, AVX2 shuffles operate on each 349 // of the 128-bit lanes, rather than the full 256-bit vector at once. 350 self.lo[byte_lo] |= 1 << bucket; 351 self.lo[byte_lo + 16] |= 1 << bucket; 352 self.hi[byte_hi] |= 1 << bucket; 353 self.hi[byte_hi + 16] |= 1 << bucket; 354 } 355 356 /// Update this mask by adding the given byte to the given bucket. The 357 /// given bucket must be in the range 0-15. 358 /// 359 /// This is for "fat" Teddy, where there are 16 buckets. add_fat(&mut self, bucket: u8, byte: u8)360 fn add_fat(&mut self, bucket: u8, byte: u8) { 361 assert!(bucket < 16); 362 363 let byte_lo = (byte & 0xF) as usize; 364 let byte_hi = ((byte >> 4) & 0xF) as usize; 365 // Unlike slim teddy, fat teddy only works with AVX2. For fat teddy, 366 // the high 128 bits of our mask correspond to buckets 8-15, while the 367 // low 128 bits correspond to buckets 0-7. 368 if bucket < 8 { 369 self.lo[byte_lo] |= 1 << bucket; 370 self.hi[byte_hi] |= 1 << bucket; 371 } else { 372 self.lo[byte_lo + 16] |= 1 << (bucket % 8); 373 self.hi[byte_hi + 16] |= 1 << (bucket % 8); 374 } 375 } 376 377 /// Return the low 128 bits of the low-nybble mask. lo128(&self) -> [u8; 16]378 pub fn lo128(&self) -> [u8; 16] { 379 let mut tmp = [0; 16]; 380 tmp.copy_from_slice(&self.lo[..16]); 381 tmp 382 } 383 384 /// Return the full low-nybble mask. lo256(&self) -> [u8; 32]385 pub fn lo256(&self) -> [u8; 32] { 386 self.lo 387 } 388 389 /// Return the low 128 bits of the high-nybble mask. hi128(&self) -> [u8; 16]390 pub fn hi128(&self) -> [u8; 16] { 391 let mut tmp = [0; 16]; 392 tmp.copy_from_slice(&self.hi[..16]); 393 tmp 394 } 395 396 /// Return the full high-nybble mask. hi256(&self) -> [u8; 32]397 pub fn hi256(&self) -> [u8; 32] { 398 self.hi 399 } 400 } 401 402 impl fmt::Debug for Mask { fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result403 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 404 let (mut parts_lo, mut parts_hi) = (vec![], vec![]); 405 for i in 0..32 { 406 parts_lo.push(format!("{:02}: {:08b}", i, self.lo[i])); 407 parts_hi.push(format!("{:02}: {:08b}", i, self.hi[i])); 408 } 409 f.debug_struct("Mask") 410 .field("lo", &parts_lo) 411 .field("hi", &parts_hi) 412 .finish() 413 } 414 } 415