use crate::UnalignedBuffer;
use core::{cmp, hash::Hasher};

#[cfg(feature = "serialize")]
use serde::{Deserialize, Serialize};

const CHUNK_SIZE: usize = 16;
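
// The five 32-bit prime constants from the reference xxHash32 implementation.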
pub const PRIME_1: u32 = 2_654_435_761;
pub const PRIME_2: u32 = 2_246_822_519;
pub const PRIME_3: u32 = 3_266_489_917;
pub const PRIME_4: u32 = 668_265_263;
pub const PRIME_5: u32 = 374_761_393;

#[cfg_attr(feature = "serialize", derive(Deserialize, Serialize))]
#[derive(Copy, Clone, PartialEq)]
struct XxCore {
    v1: u32,
    v2: u32,
    v3: u32,
    v4: u32,
}

/// Calculates the 32-bit hash. Care should be taken when using this
/// hash.
///
/// Although this struct implements `Hasher`, it only calculates a
/// 32-bit number, leaving the upper bits as 0. This means it is
/// unlikely to be correct to use this in places like a `HashMap`.
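///
/// # Example
///
/// A minimal sketch of streaming bytes through the `Hasher` trait. The
/// expected value is taken from the tests below; the crate path
/// `twox_hash` is assumed.
///
/// ```rust
/// use core::hash::Hasher;
/// use twox_hash::XxHash32;
///
/// let mut hasher = XxHash32::with_seed(0);
/// hasher.write(b"Hello, world!\0");
/// assert_eq!(hasher.finish(), 0x9e5e_7e93);
/// ```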
#[cfg_attr(feature = "serialize", derive(Deserialize, Serialize))]
#[derive(Debug, Copy, Clone, PartialEq)]
pub struct XxHash32 {
    total_len: u64,
    seed: u32,
    core: XxCore,
    #[cfg_attr(feature = "serialize", serde(flatten))]
    buffer: Buffer,
}

impl XxCore {
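    // The four lane accumulators are seeded as in the xxHash32 specification:
    // v1 = seed + PRIME_1 + PRIME_2, v2 = seed + PRIME_2, v3 = seed,
    // v4 = seed - PRIME_1 (all wrapping).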
    fn with_seed(seed: u32) -> XxCore {
        XxCore {
            v1: seed.wrapping_add(PRIME_1).wrapping_add(PRIME_2),
            v2: seed.wrapping_add(PRIME_2),
            v3: seed,
            v4: seed.wrapping_sub(PRIME_1),
        }
    }

    #[inline(always)]
    fn ingest_chunks<I>(&mut self, values: I)
    where
        I: IntoIterator<Item = [u32; 4]>,
    {
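        // One round of the xxHash32 lane mix:
        // acc = rotate_left(acc + value * PRIME_2, 13) * PRIME_1 (wrapping).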
        #[inline(always)]
        fn ingest_one_number(mut current_value: u32, mut value: u32) -> u32 {
            value = value.wrapping_mul(PRIME_2);
            current_value = current_value.wrapping_add(value);
            current_value = current_value.rotate_left(13);
            current_value.wrapping_mul(PRIME_1)
        }

        // By pulling these out into local variables, we can avoid going
        // back and forth to memory. It only really helps for large inputs,
        // when we iterate here many times.

        let mut v1 = self.v1;
        let mut v2 = self.v2;
        let mut v3 = self.v3;
        let mut v4 = self.v4;

        for [n1, n2, n3, n4] in values {
            v1 = ingest_one_number(v1, n1.to_le());
            v2 = ingest_one_number(v2, n2.to_le());
            v3 = ingest_one_number(v3, n3.to_le());
            v4 = ingest_one_number(v4, n4.to_le());
        }

        self.v1 = v1;
        self.v2 = v2;
        self.v3 = v3;
        self.v4 = v4;
    }

    #[inline(always)]
    fn finish(&self) -> u32 {
        // The original code pulls out local vars for v[1234] here.
        // Performance tests did not show that to be effective,
        // presumably because this method is not called in a tight loop.

        #[allow(unknown_lints, clippy::needless_late_init)] // keeping things parallel
        let mut hash;
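
        // Converge the four lane accumulators by rotating each lane a
        // different amount and summing them (wrapping).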
        hash = self.v1.rotate_left(1);
        hash = hash.wrapping_add(self.v2.rotate_left(7));
        hash = hash.wrapping_add(self.v3.rotate_left(12));
        hash = hash.wrapping_add(self.v4.rotate_left(18));

        hash
    }
}

impl core::fmt::Debug for XxCore {
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> Result<(), core::fmt::Error> {
        write!(
            f,
            "XxCore {{ {:016x} {:016x} {:016x} {:016x} }}",
            self.v1, self.v2, self.v3, self.v4
        )
    }
}

#[cfg_attr(feature = "serialize", derive(Serialize, Deserialize))]
#[derive(Debug, Copy, Clone, Default, PartialEq)]
#[repr(align(4))]
#[cfg_attr(feature = "serialize", serde(transparent))]
struct AlignToU32<T>(T);
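
/// Staging area for input bytes that do not yet fill a whole 16-byte chunk;
/// `write` drains it as soon as it becomes full.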
#[cfg_attr(feature = "serialize", derive(Serialize, Deserialize))]
#[derive(Debug, Copy, Clone, Default, PartialEq)]
struct Buffer {
    #[cfg_attr(feature = "serialize", serde(rename = "buffer"))]
    data: AlignToU32<[u8; CHUNK_SIZE]>,
    #[cfg_attr(feature = "serialize", serde(rename = "buffer_usage"))]
    len: usize,
}

impl Buffer {
    fn data(&self) -> &[u8] {
        &self.data.0[..self.len]
    }

    /// Consumes as much of the parameter as it can, returning the unused part.
    fn consume<'a>(&mut self, data: &'a [u8]) -> &'a [u8] {
        let to_use = cmp::min(self.available(), data.len());
        let (data, remaining) = data.split_at(to_use);
        self.data.0[self.len..][..to_use].copy_from_slice(data);
        self.len += to_use;
        remaining
    }

    fn set_data(&mut self, data: &[u8]) {
        debug_assert!(self.is_empty());
        debug_assert!(data.len() < CHUNK_SIZE);
        self.data.0[..data.len()].copy_from_slice(data);
        self.len = data.len();
    }

    fn available(&self) -> usize {
        CHUNK_SIZE - self.len
    }

    fn is_empty(&self) -> bool {
        self.len == 0
    }

    fn is_full(&self) -> bool {
        self.len == CHUNK_SIZE
    }
}

impl XxHash32 {
    /// Constructs the hash with an initial seed
    pub fn with_seed(seed: u32) -> XxHash32 {
        XxHash32 {
            total_len: 0,
            seed,
            core: XxCore::with_seed(seed),
            buffer: Buffer::default(),
        }
    }
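
    // Streaming entry point: top up any partially-filled buffer first, then
    // ingest as many whole 16-byte chunks as possible, and stash the short
    // tail back into the buffer for the next call.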
    pub(crate) fn write(&mut self, bytes: &[u8]) {
        let remaining = self.maybe_consume_bytes(bytes);
        if !remaining.is_empty() {
            let mut remaining = UnalignedBuffer::new(remaining);
            self.core.ingest_chunks(&mut remaining);
            self.buffer.set_data(remaining.remaining());
        }
        self.total_len += bytes.len() as u64;
    }

    // Consume bytes from `data` and try to leave `self.buffer` empty.
    // If there are not enough bytes to fill the buffer, it stays partially
    // full and this function returns an empty slice.
    fn maybe_consume_bytes<'a>(&mut self, data: &'a [u8]) -> &'a [u8] {
        if self.buffer.is_empty() {
            data
        } else {
            let data = self.buffer.consume(data);
            if self.buffer.is_full() {
                let mut u32s = UnalignedBuffer::new(self.buffer.data());
                self.core.ingest_chunks(&mut u32s);
                debug_assert!(u32s.remaining().is_empty());
                self.buffer.len = 0;
            }
            data
        }
    }

    pub(crate) fn finish(&self) -> u32 {
        let mut hash = if self.total_len >= CHUNK_SIZE as u64 {
            // We have processed at least one full chunk
            self.core.finish()
        } else {
            self.seed.wrapping_add(PRIME_5)
        };

        hash = hash.wrapping_add(self.total_len as u32);
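
        // Fold in whatever is left in the buffer: whole little-endian u32
        // words first, then any remaining single bytes.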
        let mut buffered_u32s = UnalignedBuffer::<u32>::new(self.buffer.data());
        for buffered_u32 in &mut buffered_u32s {
            let k1 = buffered_u32.to_le().wrapping_mul(PRIME_3);
            hash = hash.wrapping_add(k1);
            hash = hash.rotate_left(17);
            hash = hash.wrapping_mul(PRIME_4);
        }

        let buffered_u8s = buffered_u32s.remaining();
        for &buffered_u8 in buffered_u8s {
            let k1 = u32::from(buffered_u8).wrapping_mul(PRIME_5);
            hash = hash.wrapping_add(k1);
            hash = hash.rotate_left(11);
            hash = hash.wrapping_mul(PRIME_1);
        }

        // The final intermixing (avalanche) step
        hash ^= hash >> 15;
        hash = hash.wrapping_mul(PRIME_2);
        hash ^= hash >> 13;
        hash = hash.wrapping_mul(PRIME_3);
        hash ^= hash >> 16;

        hash
    }

    pub fn seed(&self) -> u32 {
        self.seed
    }

    /// Get the total number of bytes hashed, truncated to 32 bits.
    /// For the full 64-bit byte count, use `total_len_64`.
    pub fn total_len(&self) -> u32 {
        self.total_len as u32
    }

    /// Get the total number of bytes hashed.
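    ///
    /// # Example
    ///
    /// A small sketch (crate path `twox_hash` assumed):
    ///
    /// ```rust
    /// use core::hash::Hasher;
    /// use twox_hash::XxHash32;
    ///
    /// let mut hasher = XxHash32::with_seed(0);
    /// hasher.write(&[0u8; 100]);
    /// assert_eq!(hasher.total_len_64(), 100);
    /// assert_eq!(hasher.total_len(), 100);
    /// ```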
    pub fn total_len_64(&self) -> u64 {
        self.total_len
    }
}

impl Default for XxHash32 {
    fn default() -> XxHash32 {
        XxHash32::with_seed(0)
    }
}

impl Hasher for XxHash32 {
    fn finish(&self) -> u64 {
        u64::from(XxHash32::finish(self))
    }

    fn write(&mut self, bytes: &[u8]) {
        XxHash32::write(self, bytes)
    }
}

#[cfg(feature = "std")]
pub use crate::std_support::thirty_two::RandomXxHashBuilder32;

#[cfg(test)]
mod test {
    use super::{RandomXxHashBuilder32, XxHash32};
    use std::collections::HashMap;
    use std::hash::BuildHasherDefault;
    use std::prelude::v1::*;

    #[test]
    fn ingesting_byte_by_byte_is_equivalent_to_large_chunks() {
        let bytes: Vec<_> = (0..32).map(|_| 0).collect();

        let mut byte_by_byte = XxHash32::with_seed(0);
        for byte in bytes.chunks(1) {
            byte_by_byte.write(byte);
        }

        let mut one_chunk = XxHash32::with_seed(0);
        one_chunk.write(&bytes);

        assert_eq!(byte_by_byte.core, one_chunk.core);
    }

    #[test]
    fn hash_of_nothing_matches_c_implementation() {
        let mut hasher = XxHash32::with_seed(0);
        hasher.write(&[]);
        assert_eq!(hasher.finish(), 0x02cc_5d05);
    }

    #[test]
    fn hash_of_single_byte_matches_c_implementation() {
        let mut hasher = XxHash32::with_seed(0);
        hasher.write(&[42]);
        assert_eq!(hasher.finish(), 0xe0fe_705f);
    }

    #[test]
    fn hash_of_multiple_bytes_matches_c_implementation() {
        let mut hasher = XxHash32::with_seed(0);
        hasher.write(b"Hello, world!\0");
        assert_eq!(hasher.finish(), 0x9e5e_7e93);
    }

    #[test]
    fn hash_of_multiple_chunks_matches_c_implementation() {
        let bytes: Vec<_> = (0..100).collect();
        let mut hasher = XxHash32::with_seed(0);
        hasher.write(&bytes);
        assert_eq!(hasher.finish(), 0x7f89_ba44);
    }

    #[test]
    fn hash_with_different_seed_matches_c_implementation() {
        let mut hasher = XxHash32::with_seed(0x42c9_1977);
        hasher.write(&[]);
        assert_eq!(hasher.finish(), 0xd6bf_8459);
    }

    #[test]
    fn hash_with_different_seed_and_multiple_chunks_matches_c_implementation() {
        let bytes: Vec<_> = (0..100).collect();
        let mut hasher = XxHash32::with_seed(0x42c9_1977);
        hasher.write(&bytes);
        assert_eq!(hasher.finish(), 0x6d2f_6c17);
    }

    #[test]
    fn can_be_used_in_a_hashmap_with_a_default_seed() {
        let mut hash: HashMap<_, _, BuildHasherDefault<XxHash32>> = Default::default();
        hash.insert(42, "the answer");
        assert_eq!(hash.get(&42), Some(&"the answer"));
    }

    #[test]
    fn can_be_used_in_a_hashmap_with_a_random_seed() {
        let mut hash: HashMap<_, _, RandomXxHashBuilder32> = Default::default();
        hash.insert(42, "the answer");
        assert_eq!(hash.get(&42), Some(&"the answer"));
    }

    #[cfg(feature = "serialize")]
    type TestResult<T = ()> = Result<T, Box<dyn std::error::Error>>;

    #[cfg(feature = "serialize")]
    #[test]
    fn test_serialization_cycle() -> TestResult {
        let mut hasher = XxHash32::with_seed(0);
        hasher.write(b"Hello, world!\0");
        hasher.finish();

        let serialized = serde_json::to_string(&hasher)?;
        let unserialized: XxHash32 = serde_json::from_str(&serialized)?;
        assert_eq!(hasher, unserialized);
        Ok(())
    }

    #[cfg(feature = "serialize")]
    #[test]
    fn test_serialization_stability() -> TestResult {
        let mut hasher = XxHash32::with_seed(0);
        hasher.write(b"Hello, world!\0");
        hasher.finish();

        let serialized = r#"{
            "total_len": 14,
            "seed": 0,
            "core": {
              "v1": 606290984,
              "v2": 2246822519,
              "v3": 0,
              "v4": 1640531535
            },
            "buffer": [
              72,  101, 108, 108, 111, 44, 32, 119,
              111, 114, 108, 100, 33,  0,  0,  0
            ],
            "buffer_usage": 14
        }"#;

        let unserialized: XxHash32 = serde_json::from_str(serialized).unwrap();
        assert_eq!(hasher, unserialized);
        Ok(())
    }

    // This test validates wraparound/truncation behavior for very large inputs
    // to a 32-bit hash, but it runs very slowly in the normal "cargo test"
    // build config since it hashes 4.3 GB of data. It runs reasonably quickly
    // under "cargo test --release".
    /*
    #[test]
    fn len_overflow_32bit() {
        // Hash 4.3 billion (4_300_000_000) bytes, which overflows a u32.
        let bytes200: Vec<u8> = (0..200).collect();
        let mut hasher = XxHash32::with_seed(0);
        for _ in 0..(4_300_000_000u64 / 200u64) {
            hasher.write(&bytes200);
        }
        assert_eq!(hasher.total_len_64(), 0x0000_0001_004c_cb00);
        assert_eq!(hasher.total_len(), 0x004c_cb00);
        // result is tested against the C implementation
        assert_eq!(hasher.finish(), 0x1522_4ca7);
    }
    */
}