1 use crate::UnalignedBuffer; 2 use core::{cmp, hash::Hasher}; 3 4 #[cfg(feature = "serialize")] 5 use serde::{Deserialize, Serialize}; 6 7 const CHUNK_SIZE: usize = 16; 8 9 pub const PRIME_1: u32 = 2_654_435_761; 10 pub const PRIME_2: u32 = 2_246_822_519; 11 pub const PRIME_3: u32 = 3_266_489_917; 12 pub const PRIME_4: u32 = 668_265_263; 13 pub const PRIME_5: u32 = 374_761_393; 14 15 #[cfg_attr(feature = "serialize", derive(Deserialize, Serialize))] 16 #[derive(Copy, Clone, PartialEq)] 17 struct XxCore { 18 v1: u32, 19 v2: u32, 20 v3: u32, 21 v4: u32, 22 } 23 24 /// Calculates the 32-bit hash. Care should be taken when using this 25 /// hash. 26 /// 27 /// Although this struct implements `Hasher`, it only calculates a 28 /// 32-bit number, leaving the upper bits as 0. This means it is 29 /// unlikely to be correct to use this in places like a `HashMap`. 30 #[cfg_attr(feature = "serialize", derive(Deserialize, Serialize))] 31 #[derive(Debug, Copy, Clone, PartialEq)] 32 pub struct XxHash32 { 33 total_len: u64, 34 seed: u32, 35 core: XxCore, 36 #[cfg_attr(feature = "serialize", serde(flatten))] 37 buffer: Buffer, 38 } 39 40 impl XxCore { with_seed(seed: u32) -> XxCore41 fn with_seed(seed: u32) -> XxCore { 42 XxCore { 43 v1: seed.wrapping_add(PRIME_1).wrapping_add(PRIME_2), 44 v2: seed.wrapping_add(PRIME_2), 45 v3: seed, 46 v4: seed.wrapping_sub(PRIME_1), 47 } 48 } 49 50 #[inline(always)] ingest_chunks<I>(&mut self, values: I) where I: IntoIterator<Item = [u32; 4]>,51 fn ingest_chunks<I>(&mut self, values: I) 52 where 53 I: IntoIterator<Item = [u32; 4]>, 54 { 55 #[inline(always)] ingest_one_number(mut current_value: u32, mut value: u32) -> u3256 fn ingest_one_number(mut current_value: u32, mut value: u32) -> u32 { 57 value = value.wrapping_mul(PRIME_2); 58 current_value = current_value.wrapping_add(value); 59 current_value = current_value.rotate_left(13); 60 current_value.wrapping_mul(PRIME_1) 61 } 62 63 // By drawing these out, we can avoid going back and forth to 64 // memory. It only really helps for large files, when we need 65 // to iterate multiple times here. 66 67 let mut v1 = self.v1; 68 let mut v2 = self.v2; 69 let mut v3 = self.v3; 70 let mut v4 = self.v4; 71 72 for [n1, n2, n3, n4] in values { 73 v1 = ingest_one_number(v1, n1.to_le()); 74 v2 = ingest_one_number(v2, n2.to_le()); 75 v3 = ingest_one_number(v3, n3.to_le()); 76 v4 = ingest_one_number(v4, n4.to_le()); 77 } 78 79 self.v1 = v1; 80 self.v2 = v2; 81 self.v3 = v3; 82 self.v4 = v4; 83 } 84 85 #[inline(always)] finish(&self) -> u3286 fn finish(&self) -> u32 { 87 // The original code pulls out local vars for v[1234] 88 // here. Performance tests did not show that to be effective 89 // here, presumably because this method is not called in a 90 // tight loop. 91 92 #[allow(unknown_lints, clippy::needless_late_init)] // keeping things parallel 93 let mut hash; 94 95 hash = self.v1.rotate_left(1); 96 hash = hash.wrapping_add(self.v2.rotate_left(7)); 97 hash = hash.wrapping_add(self.v3.rotate_left(12)); 98 hash = hash.wrapping_add(self.v4.rotate_left(18)); 99 100 hash 101 } 102 } 103 104 impl core::fmt::Debug for XxCore { fmt(&self, f: &mut core::fmt::Formatter<'_>) -> Result<(), core::fmt::Error>105 fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> Result<(), core::fmt::Error> { 106 write!( 107 f, 108 "XxCore {{ {:016x} {:016x} {:016x} {:016x} }}", 109 self.v1, self.v2, self.v3, self.v4 110 ) 111 } 112 } 113 114 #[cfg_attr(feature = "serialize", derive(Serialize, Deserialize))] 115 #[derive(Debug, Copy, Clone, Default, PartialEq)] 116 #[repr(align(4))] 117 #[cfg_attr(feature = "serialize", serde(transparent))] 118 struct AlignToU32<T>(T); 119 120 #[cfg_attr(feature = "serialize", derive(Serialize, Deserialize))] 121 #[derive(Debug, Copy, Clone, Default, PartialEq)] 122 struct Buffer { 123 #[cfg_attr(feature = "serialize", serde(rename = "buffer"))] 124 data: AlignToU32<[u8; CHUNK_SIZE]>, 125 #[cfg_attr(feature = "serialize", serde(rename = "buffer_usage"))] 126 len: usize, 127 } 128 129 impl Buffer { data(&self) -> &[u8]130 fn data(&self) -> &[u8] { 131 &self.data.0[..self.len] 132 } 133 134 /// Consumes as much of the parameter as it can, returning the unused part. consume<'a>(&mut self, data: &'a [u8]) -> &'a [u8]135 fn consume<'a>(&mut self, data: &'a [u8]) -> &'a [u8] { 136 let to_use = cmp::min(self.available(), data.len()); 137 let (data, remaining) = data.split_at(to_use); 138 self.data.0[self.len..][..to_use].copy_from_slice(data); 139 self.len += to_use; 140 remaining 141 } 142 set_data(&mut self, data: &[u8])143 fn set_data(&mut self, data: &[u8]) { 144 debug_assert!(self.is_empty()); 145 debug_assert!(data.len() < CHUNK_SIZE); 146 self.data.0[..data.len()].copy_from_slice(data); 147 self.len = data.len(); 148 } 149 available(&self) -> usize150 fn available(&self) -> usize { 151 CHUNK_SIZE - self.len 152 } 153 is_empty(&self) -> bool154 fn is_empty(&self) -> bool { 155 self.len == 0 156 } 157 is_full(&self) -> bool158 fn is_full(&self) -> bool { 159 self.len == CHUNK_SIZE 160 } 161 } 162 163 impl XxHash32 { 164 /// Constructs the hash with an initial seed with_seed(seed: u32) -> XxHash32165 pub fn with_seed(seed: u32) -> XxHash32 { 166 XxHash32 { 167 total_len: 0, 168 seed, 169 core: XxCore::with_seed(seed), 170 buffer: Buffer::default(), 171 } 172 } 173 write(&mut self, bytes: &[u8])174 pub(crate) fn write(&mut self, bytes: &[u8]) { 175 let remaining = self.maybe_consume_bytes(bytes); 176 if !remaining.is_empty() { 177 let mut remaining = UnalignedBuffer::new(remaining); 178 self.core.ingest_chunks(&mut remaining); 179 self.buffer.set_data(remaining.remaining()); 180 } 181 self.total_len += bytes.len() as u64; 182 } 183 184 // Consume bytes and try to make `self.buffer` empty. 185 // If there are not enough bytes, `self.buffer` can be non-empty, and this 186 // function returns an empty slice. maybe_consume_bytes<'a>(&mut self, data: &'a [u8]) -> &'a [u8]187 fn maybe_consume_bytes<'a>(&mut self, data: &'a [u8]) -> &'a [u8] { 188 if self.buffer.is_empty() { 189 data 190 } else { 191 let data = self.buffer.consume(data); 192 if self.buffer.is_full() { 193 let mut u32s = UnalignedBuffer::new(self.buffer.data()); 194 self.core.ingest_chunks(&mut u32s); 195 debug_assert!(u32s.remaining().is_empty()); 196 self.buffer.len = 0; 197 } 198 data 199 } 200 } 201 finish(&self) -> u32202 pub(crate) fn finish(&self) -> u32 { 203 let mut hash = if self.total_len >= CHUNK_SIZE as u64 { 204 // We have processed at least one full chunk 205 self.core.finish() 206 } else { 207 self.seed.wrapping_add(PRIME_5) 208 }; 209 210 hash = hash.wrapping_add(self.total_len as u32); 211 212 let mut buffered_u32s = UnalignedBuffer::<u32>::new(self.buffer.data()); 213 for buffered_u32 in &mut buffered_u32s { 214 let k1 = buffered_u32.to_le().wrapping_mul(PRIME_3); 215 hash = hash.wrapping_add(k1); 216 hash = hash.rotate_left(17); 217 hash = hash.wrapping_mul(PRIME_4); 218 } 219 220 let buffered_u8s = buffered_u32s.remaining(); 221 for &buffered_u8 in buffered_u8s { 222 let k1 = u32::from(buffered_u8).wrapping_mul(PRIME_5); 223 hash = hash.wrapping_add(k1); 224 hash = hash.rotate_left(11); 225 hash = hash.wrapping_mul(PRIME_1); 226 } 227 228 // The final intermixing 229 hash ^= hash >> 15; 230 hash = hash.wrapping_mul(PRIME_2); 231 hash ^= hash >> 13; 232 hash = hash.wrapping_mul(PRIME_3); 233 hash ^= hash >> 16; 234 235 hash 236 } 237 seed(&self) -> u32238 pub fn seed(&self) -> u32 { 239 self.seed 240 } 241 242 /// Get the total number of bytes hashed, truncated to 32 bits. 243 /// For the full 64-bit byte count, use `total_len_64` total_len(&self) -> u32244 pub fn total_len(&self) -> u32 { 245 self.total_len as u32 246 } 247 248 /// Get the total number of bytes hashed. total_len_64(&self) -> u64249 pub fn total_len_64(&self) -> u64 { 250 self.total_len 251 } 252 } 253 254 impl Default for XxHash32 { default() -> XxHash32255 fn default() -> XxHash32 { 256 XxHash32::with_seed(0) 257 } 258 } 259 260 impl Hasher for XxHash32 { finish(&self) -> u64261 fn finish(&self) -> u64 { 262 u64::from(XxHash32::finish(self)) 263 } 264 write(&mut self, bytes: &[u8])265 fn write(&mut self, bytes: &[u8]) { 266 XxHash32::write(self, bytes) 267 } 268 } 269 270 #[cfg(feature = "std")] 271 pub use crate::std_support::thirty_two::RandomXxHashBuilder32; 272 273 #[cfg(test)] 274 mod test { 275 use super::{RandomXxHashBuilder32, XxHash32}; 276 use std::collections::HashMap; 277 use std::hash::BuildHasherDefault; 278 use std::prelude::v1::*; 279 280 #[test] ingesting_byte_by_byte_is_equivalent_to_large_chunks()281 fn ingesting_byte_by_byte_is_equivalent_to_large_chunks() { 282 let bytes: Vec<_> = (0..32).map(|_| 0).collect(); 283 284 let mut byte_by_byte = XxHash32::with_seed(0); 285 for byte in bytes.chunks(1) { 286 byte_by_byte.write(byte); 287 } 288 289 let mut one_chunk = XxHash32::with_seed(0); 290 one_chunk.write(&bytes); 291 292 assert_eq!(byte_by_byte.core, one_chunk.core); 293 } 294 295 #[test] hash_of_nothing_matches_c_implementation()296 fn hash_of_nothing_matches_c_implementation() { 297 let mut hasher = XxHash32::with_seed(0); 298 hasher.write(&[]); 299 assert_eq!(hasher.finish(), 0x02cc_5d05); 300 } 301 302 #[test] hash_of_single_byte_matches_c_implementation()303 fn hash_of_single_byte_matches_c_implementation() { 304 let mut hasher = XxHash32::with_seed(0); 305 hasher.write(&[42]); 306 assert_eq!(hasher.finish(), 0xe0fe_705f); 307 } 308 309 #[test] hash_of_multiple_bytes_matches_c_implementation()310 fn hash_of_multiple_bytes_matches_c_implementation() { 311 let mut hasher = XxHash32::with_seed(0); 312 hasher.write(b"Hello, world!\0"); 313 assert_eq!(hasher.finish(), 0x9e5e_7e93); 314 } 315 316 #[test] hash_of_multiple_chunks_matches_c_implementation()317 fn hash_of_multiple_chunks_matches_c_implementation() { 318 let bytes: Vec<_> = (0..100).collect(); 319 let mut hasher = XxHash32::with_seed(0); 320 hasher.write(&bytes); 321 assert_eq!(hasher.finish(), 0x7f89_ba44); 322 } 323 324 #[test] hash_with_different_seed_matches_c_implementation()325 fn hash_with_different_seed_matches_c_implementation() { 326 let mut hasher = XxHash32::with_seed(0x42c9_1977); 327 hasher.write(&[]); 328 assert_eq!(hasher.finish(), 0xd6bf_8459); 329 } 330 331 #[test] hash_with_different_seed_and_multiple_chunks_matches_c_implementation()332 fn hash_with_different_seed_and_multiple_chunks_matches_c_implementation() { 333 let bytes: Vec<_> = (0..100).collect(); 334 let mut hasher = XxHash32::with_seed(0x42c9_1977); 335 hasher.write(&bytes); 336 assert_eq!(hasher.finish(), 0x6d2f_6c17); 337 } 338 339 #[test] can_be_used_in_a_hashmap_with_a_default_seed()340 fn can_be_used_in_a_hashmap_with_a_default_seed() { 341 let mut hash: HashMap<_, _, BuildHasherDefault<XxHash32>> = Default::default(); 342 hash.insert(42, "the answer"); 343 assert_eq!(hash.get(&42), Some(&"the answer")); 344 } 345 346 #[test] can_be_used_in_a_hashmap_with_a_random_seed()347 fn can_be_used_in_a_hashmap_with_a_random_seed() { 348 let mut hash: HashMap<_, _, RandomXxHashBuilder32> = Default::default(); 349 hash.insert(42, "the answer"); 350 assert_eq!(hash.get(&42), Some(&"the answer")); 351 } 352 353 #[cfg(feature = "serialize")] 354 type TestResult<T = ()> = Result<T, Box<dyn std::error::Error>>; 355 356 #[cfg(feature = "serialize")] 357 #[test] test_serialization_cycle() -> TestResult358 fn test_serialization_cycle() -> TestResult { 359 let mut hasher = XxHash32::with_seed(0); 360 hasher.write(b"Hello, world!\0"); 361 hasher.finish(); 362 363 let serialized = serde_json::to_string(&hasher)?; 364 let unserialized: XxHash32 = serde_json::from_str(&serialized)?; 365 assert_eq!(hasher, unserialized); 366 Ok(()) 367 } 368 369 #[cfg(feature = "serialize")] 370 #[test] test_serialization_stability() -> TestResult371 fn test_serialization_stability() -> TestResult { 372 let mut hasher = XxHash32::with_seed(0); 373 hasher.write(b"Hello, world!\0"); 374 hasher.finish(); 375 376 let serialized = r#"{ 377 "total_len": 14, 378 "seed": 0, 379 "core": { 380 "v1": 606290984, 381 "v2": 2246822519, 382 "v3": 0, 383 "v4": 1640531535 384 }, 385 "buffer": [ 386 72, 101, 108, 108, 111, 44, 32, 119, 387 111, 114, 108, 100, 33, 0, 0, 0 388 ], 389 "buffer_usage": 14 390 }"#; 391 392 let unserialized: XxHash32 = serde_json::from_str(serialized).unwrap(); 393 assert_eq!(hasher, unserialized); 394 Ok(()) 395 } 396 397 // This test validates wraparound/truncation behavior for very large inputs 398 // of a 32-bit hash, but runs very slowly in the normal "cargo test" 399 // build config since it hashes 4.3GB of data. It runs reasonably quick 400 // under "cargo test --release". 401 /* 402 #[test] 403 fn len_overflow_32bit() { 404 // Hash 4.3 billion (4_300_000_000) bytes, which overflows a u32. 405 let bytes200: Vec<u8> = (0..200).collect(); 406 let mut hasher = XxHash32::with_seed(0); 407 for _ in 0..(4_300_000_000u64 / 200u64) { 408 hasher.write(&bytes200); 409 } 410 assert_eq!(hasher.total_len_64(), 0x0000_0001_004c_cb00); 411 assert_eq!(hasher.total_len(), 0x004c_cb00); 412 // retult is tested against the C implementation 413 assert_eq!(hasher.finish(), 0x1522_4ca7); 414 } 415 */ 416 } 417