// Copyright 2022 The Abseil Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Hardware-accelerated CRC32 computation on Intel and ARM architectures.

#include <cstddef>
#include <cstdint>
#include <memory>
#include <vector>

#include "absl/base/attributes.h"
#include "absl/base/config.h"
#include "absl/base/dynamic_annotations.h"
#include "absl/base/internal/endian.h"
#include "absl/base/internal/prefetch.h"
#include "absl/crc/internal/cpu_detect.h"
#include "absl/crc/internal/crc.h"
#include "absl/crc/internal/crc32_x86_arm_combined_simd.h"
#include "absl/crc/internal/crc_internal.h"
#include "absl/memory/memory.h"
#include "absl/numeric/bits.h"

#if defined(ABSL_CRC_INTERNAL_HAVE_ARM_SIMD) || \
    defined(ABSL_CRC_INTERNAL_HAVE_X86_SIMD)
#define ABSL_INTERNAL_CAN_USE_SIMD_CRC32C
#endif

namespace absl {
ABSL_NAMESPACE_BEGIN
namespace crc_internal {

#if defined(ABSL_INTERNAL_CAN_USE_SIMD_CRC32C)

// Implementation details not exported outside of file
namespace {

// Some machines have CRC acceleration hardware.
// We can do a faster version of Extend() on such machines.
class CRC32AcceleratedX86ARMCombined : public CRC32 {
 public:
  CRC32AcceleratedX86ARMCombined() {}
  ~CRC32AcceleratedX86ARMCombined() override {}
  void ExtendByZeroes(uint32_t* crc, size_t length) const override;
  uint32_t ComputeZeroConstant(size_t length) const;

 private:
  CRC32AcceleratedX86ARMCombined(const CRC32AcceleratedX86ARMCombined&) =
      delete;
  CRC32AcceleratedX86ARMCombined& operator=(
      const CRC32AcceleratedX86ARMCombined&) = delete;
};

// Constants for switching between algorithms.
// Chosen by comparing speed at different powers of 2.
constexpr size_t kSmallCutoff = 256;
constexpr size_t kMediumCutoff = 2048;

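// The ABSL_INTERNAL_STEPn macros below consume n bytes from the current
// pointer and fold them into the running crc with the corresponding hardware
// CRC32 instruction; they are #undef'd at the end of Extend().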
#define ABSL_INTERNAL_STEP1(crc)                      \
  do {                                                \
    crc = CRC32_u8(static_cast<uint32_t>(crc), *p++); \
  } while (0)
#define ABSL_INTERNAL_STEP2(crc)                                               \
  do {                                                                         \
    crc =                                                                      \
        CRC32_u16(static_cast<uint32_t>(crc), absl::little_endian::Load16(p)); \
    p += 2;                                                                    \
  } while (0)
#define ABSL_INTERNAL_STEP4(crc)                                               \
  do {                                                                         \
    crc =                                                                      \
        CRC32_u32(static_cast<uint32_t>(crc), absl::little_endian::Load32(p)); \
    p += 4;                                                                    \
  } while (0)
#define ABSL_INTERNAL_STEP8(crc, data)                  \
  do {                                                  \
    crc = CRC32_u64(static_cast<uint32_t>(crc),         \
                    absl::little_endian::Load64(data)); \
    data += 8;                                          \
  } while (0)
#define ABSL_INTERNAL_STEP8BY2(crc0, crc1, p0, p1) \
  do {                                             \
    ABSL_INTERNAL_STEP8(crc0, p0);                 \
    ABSL_INTERNAL_STEP8(crc1, p1);                 \
  } while (0)
#define ABSL_INTERNAL_STEP8BY3(crc0, crc1, crc2, p0, p1, p2) \
  do {                                                       \
    ABSL_INTERNAL_STEP8(crc0, p0);                           \
    ABSL_INTERNAL_STEP8(crc1, p1);                           \
    ABSL_INTERNAL_STEP8(crc2, p2);                           \
  } while (0)

namespace {

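// multiply() returns the product of two crc32c residues in the CRC domain:
// a carry-less (GF(2)) multiplication of `a` and `b` via PMUL, followed by a
// reduction of the 64-bit product modulo the crc32c polynomial, with the
// CRC32 instruction reducing the low half. It is used to apply the
// "extend by N zeroes" constants and to combine independent crc streams.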
uint32_t multiply(uint32_t a, uint32_t b) {
  V128 shifts = V128_From2x64(0, 1);
  V128 power = V128_From2x64(0, a);
  V128 crc = V128_From2x64(0, b);
  V128 res = V128_PMulLow(power, crc);

  // Combine crc values
  res = V128_ShiftLeft64(res, shifts);
  return static_cast<uint32_t>(V128_Extract32<1>(res)) ^
         CRC32_u32(0, static_cast<uint32_t>(V128_Low64(res)));
}

// Powers of crc32c polynomial, for faster ExtendByZeroes.
// Verified against folly:
// folly/hash/detail/Crc32CombineDetail.cpp
constexpr uint32_t kCRC32CPowers[] = {
    0x82f63b78, 0x6ea2d55c, 0x18b8ea18, 0x510ac59a, 0xb82be955, 0xb8fdb1e7,
    0x88e56f72, 0x74c360a4, 0xe4172b16, 0x0d65762a, 0x35d73a62, 0x28461564,
    0xbf455269, 0xe2ea32dc, 0xfe7740e6, 0xf946610b, 0x3c204f8f, 0x538586e3,
    0x59726915, 0x734d5309, 0xbc1ac763, 0x7d0722cc, 0xd289cabe, 0xe94ca9bc,
    0x05b74f3f, 0xa51e1f42, 0x40000000, 0x20000000, 0x08000000, 0x00800000,
    0x00008000, 0x82f63b78, 0x6ea2d55c, 0x18b8ea18, 0x510ac59a, 0xb82be955,
    0xb8fdb1e7, 0x88e56f72, 0x74c360a4, 0xe4172b16, 0x0d65762a, 0x35d73a62,
    0x28461564, 0xbf455269, 0xe2ea32dc, 0xfe7740e6, 0xf946610b, 0x3c204f8f,
    0x538586e3, 0x59726915, 0x734d5309, 0xbc1ac763, 0x7d0722cc, 0xd289cabe,
    0xe94ca9bc, 0x05b74f3f, 0xa51e1f42, 0x40000000, 0x20000000, 0x08000000,
    0x00800000, 0x00008000,
};

}  // namespace

// Compute a magic constant, so that multiplying by it is the same as
// extending crc by length zeros.
uint32_t CRC32AcceleratedX86ARMCombined::ComputeZeroConstant(
    size_t length) const {
  // Lowest 2 bits are handled separately in ExtendByZeroes
  length >>= 2;

  int index = absl::countr_zero(length);
  uint32_t prev = kCRC32CPowers[index];
  length &= length - 1;

  while (length) {
    // For each bit of length, extend by 2**n zeros.
    index = absl::countr_zero(length);
    prev = multiply(prev, kCRC32CPowers[index]);
    length &= length - 1;
  }
  return prev;
}

void CRC32AcceleratedX86ARMCombined::ExtendByZeroes(uint32_t* crc,
                                                    size_t length) const {
  uint32_t val = *crc;
  // Don't bother with multiplication for small length.
  switch (length & 3) {
    case 0:
      break;
    case 1:
      val = CRC32_u8(val, 0);
      break;
    case 2:
      val = CRC32_u16(val, 0);
      break;
    case 3:
      val = CRC32_u8(val, 0);
      val = CRC32_u16(val, 0);
      break;
  }
  if (length > 3) {
    val = multiply(val, ComputeZeroConstant(length));
  }
  *crc = val;
}

// Taken from Intel paper "Fast CRC Computation for iSCSI Polynomial Using CRC32
// Instruction"
// https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/crc-iscsi-polynomial-crc32-instruction-paper.pdf
// We only need every 4th value, because we unroll the loop by 4.
constexpr uint64_t kClmulConstants[] = {
    0x09e4addf8, 0x0ba4fc28e, 0x00d3b6092, 0x09e4addf8, 0x0ab7aff2a,
    0x102f9b8a2, 0x0b9e02b86, 0x00d3b6092, 0x1bf2e8b8a, 0x18266e456,
    0x0d270f1a2, 0x0ab7aff2a, 0x11eef4f8e, 0x083348832, 0x0dd7e3b0c,
    0x0b9e02b86, 0x0271d9844, 0x1b331e26a, 0x06b749fb2, 0x1bf2e8b8a,
    0x0e6fc4e6a, 0x0ce7f39f4, 0x0d7a4825c, 0x0d270f1a2, 0x026f6a60a,
    0x12ed0daac, 0x068bce87a, 0x11eef4f8e, 0x1329d9f7e, 0x0b3e32c28,
    0x0170076fa, 0x0dd7e3b0c, 0x1fae1cc66, 0x010746f3c, 0x086d8e4d2,
    0x0271d9844, 0x0b3af077a, 0x093a5f730, 0x1d88abd4a, 0x06b749fb2,
    0x0c9c8b782, 0x0cec3662e, 0x1ddffc5d4, 0x0e6fc4e6a, 0x168763fa6,
    0x0b0cd4768, 0x19b1afbc4, 0x0d7a4825c, 0x123888b7a, 0x00167d312,
    0x133d7a042, 0x026f6a60a, 0x000bcf5f6, 0x19d34af3a, 0x1af900c24,
    0x068bce87a, 0x06d390dec, 0x16cba8aca, 0x1f16a3418, 0x1329d9f7e,
    0x19fb2a8b0, 0x02178513a, 0x1a0f717c4, 0x0170076fa,
};

enum class CutoffStrategy {
  // Use 3 CRC streams to fold into 1.
  Fold3,
  // Unroll CRC instructions for 64 bytes.
  Unroll64CRC,
};
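// The strategy is chosen per CPU model in
// TryNewCRC32AcceleratedX86ARMCombined() below: Fold3 on the x86
// configurations, Unroll64CRC on the ARM ones.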

// Base class for CRC32AcceleratedX86ARMCombinedMultipleStreams containing the
// methods and data that don't need the template arguments.
class CRC32AcceleratedX86ARMCombinedMultipleStreamsBase
    : public CRC32AcceleratedX86ARMCombined {
 protected:
  // Update partialCRC with the crc of a 64-byte block. Calling
  // FinalizePclmulStream would produce a single crc checksum, but it is
  // expensive. PCLMULQDQ has a high latency, so we run 4 128-bit partial
  // checksums that can be reduced to a single value by FinalizePclmulStream
  // later. Computing crc for arbitrary polynomials with PCLMULQDQ is
  // described in the Intel paper "Fast CRC Computation for Generic
  // Polynomials Using PCLMULQDQ Instruction"
  // https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
  // We are applying it to the CRC32C polynomial.
  ABSL_ATTRIBUTE_ALWAYS_INLINE void Process64BytesPclmul(
      const uint8_t* p, V128* partialCRC) const {
    V128 loopMultiplicands = V128_Load(reinterpret_cast<const V128*>(k1k2));

    V128 partialCRC1 = partialCRC[0];
    V128 partialCRC2 = partialCRC[1];
    V128 partialCRC3 = partialCRC[2];
    V128 partialCRC4 = partialCRC[3];

    V128 tmp1 = V128_PMulHi(partialCRC1, loopMultiplicands);
    V128 tmp2 = V128_PMulHi(partialCRC2, loopMultiplicands);
    V128 tmp3 = V128_PMulHi(partialCRC3, loopMultiplicands);
    V128 tmp4 = V128_PMulHi(partialCRC4, loopMultiplicands);
    V128 data1 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 0));
    V128 data2 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 1));
    V128 data3 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 2));
    V128 data4 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 3));
    partialCRC1 = V128_PMulLow(partialCRC1, loopMultiplicands);
    partialCRC2 = V128_PMulLow(partialCRC2, loopMultiplicands);
    partialCRC3 = V128_PMulLow(partialCRC3, loopMultiplicands);
    partialCRC4 = V128_PMulLow(partialCRC4, loopMultiplicands);
    partialCRC1 = V128_Xor(tmp1, partialCRC1);
    partialCRC2 = V128_Xor(tmp2, partialCRC2);
    partialCRC3 = V128_Xor(tmp3, partialCRC3);
    partialCRC4 = V128_Xor(tmp4, partialCRC4);
    partialCRC1 = V128_Xor(partialCRC1, data1);
    partialCRC2 = V128_Xor(partialCRC2, data2);
    partialCRC3 = V128_Xor(partialCRC3, data3);
    partialCRC4 = V128_Xor(partialCRC4, data4);
    partialCRC[0] = partialCRC1;
    partialCRC[1] = partialCRC2;
    partialCRC[2] = partialCRC3;
    partialCRC[3] = partialCRC4;
  }

  // Reduce partialCRC produced by Process64BytesPclmul into a single value
  // that represents the crc checksum of all the processed bytes.
  ABSL_ATTRIBUTE_ALWAYS_INLINE uint64_t
  FinalizePclmulStream(V128* partialCRC) const {
    V128 partialCRC1 = partialCRC[0];
    V128 partialCRC2 = partialCRC[1];
    V128 partialCRC3 = partialCRC[2];
    V128 partialCRC4 = partialCRC[3];

    // Combine 4 vectors of partial crc into a single vector.
    V128 reductionMultiplicands =
        V128_Load(reinterpret_cast<const V128*>(k5k6));

    V128 low = V128_PMulLow(reductionMultiplicands, partialCRC1);
    V128 high = V128_PMulHi(reductionMultiplicands, partialCRC1);

    partialCRC1 = V128_Xor(low, high);
    partialCRC1 = V128_Xor(partialCRC1, partialCRC2);

    low = V128_PMulLow(reductionMultiplicands, partialCRC3);
    high = V128_PMulHi(reductionMultiplicands, partialCRC3);

    partialCRC3 = V128_Xor(low, high);
    partialCRC3 = V128_Xor(partialCRC3, partialCRC4);

    reductionMultiplicands = V128_Load(reinterpret_cast<const V128*>(k3k4));

    low = V128_PMulLow(reductionMultiplicands, partialCRC1);
    high = V128_PMulHi(reductionMultiplicands, partialCRC1);
    V128 fullCRC = V128_Xor(low, high);
    fullCRC = V128_Xor(fullCRC, partialCRC3);

    // Reduce fullCRC into scalar value.
    reductionMultiplicands = V128_Load(reinterpret_cast<const V128*>(k5k6));

    V128 mask = V128_Load(reinterpret_cast<const V128*>(kMask));

    V128 tmp = V128_PMul01(reductionMultiplicands, fullCRC);
    fullCRC = V128_ShiftRight<8>(fullCRC);
    fullCRC = V128_Xor(fullCRC, tmp);

    reductionMultiplicands = V128_Load(reinterpret_cast<const V128*>(k7k0));

    tmp = V128_ShiftRight<4>(fullCRC);
    fullCRC = V128_And(fullCRC, mask);
    fullCRC = V128_PMulLow(reductionMultiplicands, fullCRC);
    fullCRC = V128_Xor(tmp, fullCRC);

    reductionMultiplicands = V128_Load(reinterpret_cast<const V128*>(kPoly));

    tmp = V128_And(fullCRC, mask);
    tmp = V128_PMul01(reductionMultiplicands, tmp);
    tmp = V128_And(tmp, mask);
    tmp = V128_PMulLow(reductionMultiplicands, tmp);

    fullCRC = V128_Xor(tmp, fullCRC);

    return static_cast<uint64_t>(V128_Extract32<1>(fullCRC));
  }

  // Update crc with 64 bytes of data from p.
  ABSL_ATTRIBUTE_ALWAYS_INLINE uint64_t Process64BytesCRC(const uint8_t* p,
                                                          uint64_t crc) const {
    for (int i = 0; i < 8; i++) {
      crc =
          CRC32_u64(static_cast<uint32_t>(crc), absl::little_endian::Load64(p));
      p += 8;
    }
    return crc;
  }

  // Generated by crc32c_x86_test --crc32c_generate_constants=true
  // and verified against constants in linux kernel for S390:
  // https://github.com/torvalds/linux/blob/master/arch/s390/crypto/crc32le-vx.S
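  // k1k2 folds each 128-bit accumulator across a 64-byte stride in
  // Process64BytesPclmul; k3k4, k5k6, k7k0 and kPoly are used by
  // FinalizePclmulStream to reduce the four accumulators to a single
  // 32-bit crc.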
  alignas(16) static constexpr uint64_t k1k2[2] = {0x0740eef02, 0x09e4addf8};
  alignas(16) static constexpr uint64_t k3k4[2] = {0x1384aa63a, 0x0ba4fc28e};
  alignas(16) static constexpr uint64_t k5k6[2] = {0x0f20c0dfe, 0x14cd00bd6};
  alignas(16) static constexpr uint64_t k7k0[2] = {0x0dd45aab8, 0x000000000};
  alignas(16) static constexpr uint64_t kPoly[2] = {0x105ec76f0, 0x0dea713f1};
  alignas(16) static constexpr uint32_t kMask[4] = {~0u, 0u, ~0u, 0u};

  // Medium runs of bytes are broken into groups of kGroupsSmall blocks of the
  // same size. Each group is CRCed in parallel, then combined at the end of
  // the block.
  static constexpr size_t kGroupsSmall = 3;
  // For large runs we use up to kMaxStreams blocks computed with the CRC
  // instruction, and up to kMaxStreams blocks computed with PCLMULQDQ, which
  // are combined in the end.
  static constexpr size_t kMaxStreams = 3;
};

#ifdef ABSL_INTERNAL_NEED_REDUNDANT_CONSTEXPR_DECL
alignas(16) constexpr uint64_t
    CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::k1k2[2];
alignas(16) constexpr uint64_t
    CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::k3k4[2];
alignas(16) constexpr uint64_t
    CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::k5k6[2];
alignas(16) constexpr uint64_t
    CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::k7k0[2];
alignas(16) constexpr uint64_t
    CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::kPoly[2];
alignas(16) constexpr uint32_t
    CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::kMask[4];
constexpr size_t
    CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::kGroupsSmall;
constexpr size_t CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::kMaxStreams;
#endif  // ABSL_INTERNAL_NEED_REDUNDANT_CONSTEXPR_DECL

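// num_crc_streams and num_pclmul_streams select how many independent 64-byte
// streams the large-input path processes with the CRC32 instruction and with
// PCLMULQDQ respectively; strategy picks the algorithm for medium-sized
// inputs.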
template <size_t num_crc_streams, size_t num_pclmul_streams,
          CutoffStrategy strategy>
class CRC32AcceleratedX86ARMCombinedMultipleStreams
    : public CRC32AcceleratedX86ARMCombinedMultipleStreamsBase {
  ABSL_ATTRIBUTE_HOT
  void Extend(uint32_t* crc, const void* bytes, size_t length) const override {
    static_assert(num_crc_streams >= 1 && num_crc_streams <= kMaxStreams,
                  "Invalid number of crc streams");
    static_assert(num_pclmul_streams >= 0 && num_pclmul_streams <= kMaxStreams,
                  "Invalid number of pclmul streams");
    const uint8_t* p = static_cast<const uint8_t*>(bytes);
    const uint8_t* e = p + length;
    uint32_t l = *crc;
    uint64_t l64;
    // We have dedicated instructions for 1-, 2-, 4- and 8-byte operands.
    if (length & 8) {
      ABSL_INTERNAL_STEP8(l, p);
      length &= ~size_t{8};
    }
    if (length & 4) {
      ABSL_INTERNAL_STEP4(l);
      length &= ~size_t{4};
    }
    if (length & 2) {
      ABSL_INTERNAL_STEP2(l);
      length &= ~size_t{2};
    }
    if (length & 1) {
      ABSL_INTERNAL_STEP1(l);
      length &= ~size_t{1};
    }
    if (length == 0) {
      *crc = l;
      return;
    }
    // length is now a multiple of 16.

    // For small blocks just run a simple loop, because the cost of combining
    // multiple streams is significant.
    if (strategy != CutoffStrategy::Unroll64CRC) {
      if (length < kSmallCutoff) {
        while (length >= 16) {
          ABSL_INTERNAL_STEP8(l, p);
          ABSL_INTERNAL_STEP8(l, p);
          length -= 16;
        }
        *crc = l;
        return;
      }
    }

    // For medium blocks we run 3 crc streams and combine them as described in
    // the Intel paper above. Running a 4th stream doesn't help, because the
    // crc instruction has a latency of 3 and a throughput of 1.
    if (length < kMediumCutoff) {
      l64 = l;
      if (strategy == CutoffStrategy::Fold3) {
        uint64_t l641 = 0;
        uint64_t l642 = 0;
        const size_t blockSize = 32;
        size_t bs = static_cast<size_t>(e - p) / kGroupsSmall / blockSize;
        const uint8_t* p1 = p + bs * blockSize;
        const uint8_t* p2 = p1 + bs * blockSize;

        for (size_t i = 0; i + 1 < bs; ++i) {
          ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2);
          ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2);
          ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2);
          ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2);
          base_internal::PrefetchT0(
              reinterpret_cast<const char*>(p + kPrefetchHorizonMedium));
          base_internal::PrefetchT0(
              reinterpret_cast<const char*>(p1 + kPrefetchHorizonMedium));
          base_internal::PrefetchT0(
              reinterpret_cast<const char*>(p2 + kPrefetchHorizonMedium));
        }
        // Don't run crc on last 8 bytes.
        ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2);
        ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2);
        ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2);
        ABSL_INTERNAL_STEP8BY2(l64, l641, p, p1);

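        // Combine the three partial crcs: l64 and l641 are carry-less
        // multiplied by per-block-count constants from kClmulConstants, XORed
        // together, folded into the final 8 bytes of the p2 stream, and
        // reduced with a single CRC32 instruction seeded with l642.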
        V128 magic = *(reinterpret_cast<const V128*>(kClmulConstants) + bs - 1);

        V128 tmp = V128_From2x64(0, l64);

        V128 res1 = V128_PMulLow(tmp, magic);

        tmp = V128_From2x64(0, l641);

        V128 res2 = V128_PMul10(tmp, magic);
        V128 x = V128_Xor(res1, res2);
        l64 = static_cast<uint64_t>(V128_Low64(x)) ^
              absl::little_endian::Load64(p2);
        l64 = CRC32_u64(static_cast<uint32_t>(l642), l64);

        p = p2 + 8;
      } else if (strategy == CutoffStrategy::Unroll64CRC) {
        while ((e - p) >= 64) {
          l64 = Process64BytesCRC(p, l64);
          p += 64;
        }
      }
    } else {
      // There is a lot of data; we can ignore the combine costs and run all
      // requested streams (num_crc_streams + num_pclmul_streams) using
      // prefetch. CRC and PCLMULQDQ use different CPU execution units, so on
      // some CPUs it makes sense to execute both of them for different
      // streams.

      // Point x at the first 8-byte aligned byte in the string.
      const uint8_t* x = RoundUp<8>(p);
      // Process bytes until p is 8-byte aligned, if that isn't past the end.
      while (p != x) {
        ABSL_INTERNAL_STEP1(l);
      }

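      // Split the remaining data into (num_crc_streams + num_pclmul_streams)
      // contiguous chunks of bs * 64 bytes each: the crc streams come first,
      // then the pclmul streams. Whatever is left past the last chunk is
      // handled by the tail loops at the end of Extend().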
      size_t bs = static_cast<size_t>(e - p) /
                  (num_crc_streams + num_pclmul_streams) / 64;
      const uint8_t* crc_streams[kMaxStreams];
      const uint8_t* pclmul_streams[kMaxStreams];
      // We are guaranteed to have at least one crc stream.
      crc_streams[0] = p;
      for (size_t i = 1; i < num_crc_streams; i++) {
        crc_streams[i] = crc_streams[i - 1] + bs * 64;
      }
      pclmul_streams[0] = crc_streams[num_crc_streams - 1] + bs * 64;
      for (size_t i = 1; i < num_pclmul_streams; i++) {
        pclmul_streams[i] = pclmul_streams[i - 1] + bs * 64;
      }

      // Per stream crc sums.
      uint64_t l64_crc[kMaxStreams] = {l};
      uint64_t l64_pclmul[kMaxStreams] = {0};

      // Peel the first iteration, because the PCLMULQDQ streams need setup.
      for (size_t i = 0; i < num_crc_streams; i++) {
        l64_crc[i] = Process64BytesCRC(crc_streams[i], l64_crc[i]);
        crc_streams[i] += 16 * 4;
      }

      V128 partialCRC[kMaxStreams][4];
      for (size_t i = 0; i < num_pclmul_streams; i++) {
        partialCRC[i][0] = V128_LoadU(
            reinterpret_cast<const V128*>(pclmul_streams[i] + 16 * 0));
        partialCRC[i][1] = V128_LoadU(
            reinterpret_cast<const V128*>(pclmul_streams[i] + 16 * 1));
        partialCRC[i][2] = V128_LoadU(
            reinterpret_cast<const V128*>(pclmul_streams[i] + 16 * 2));
        partialCRC[i][3] = V128_LoadU(
            reinterpret_cast<const V128*>(pclmul_streams[i] + 16 * 3));
        pclmul_streams[i] += 16 * 4;
      }

      for (size_t i = 1; i < bs; i++) {
        // Prefetch data for the next iterations.
        for (size_t j = 0; j < num_crc_streams; j++) {
          base_internal::PrefetchT0(
              reinterpret_cast<const char*>(crc_streams[j] + kPrefetchHorizon));
        }
        for (size_t j = 0; j < num_pclmul_streams; j++) {
          base_internal::PrefetchT0(reinterpret_cast<const char*>(
              pclmul_streams[j] + kPrefetchHorizon));
        }

        // We process each stream in 64-byte blocks. This can be written as
        // for (int i = 0; i < num_pclmul_streams; i++) {
        //   Process64BytesPclmul(pclmul_streams[i], partialCRC[i]);
        //   pclmul_streams[i] += 16 * 4;
        // }
        // for (int i = 0; i < num_crc_streams; i++) {
        //   l64_crc[i] = Process64BytesCRC(crc_streams[i], l64_crc[i]);
        //   crc_streams[i] += 16 * 4;
        // }
        // But unrolling and interleaving PCLMULQDQ and CRC blocks manually
        // gives a ~2% performance boost.
        l64_crc[0] = Process64BytesCRC(crc_streams[0], l64_crc[0]);
        crc_streams[0] += 16 * 4;
        if (num_pclmul_streams > 0) {
          Process64BytesPclmul(pclmul_streams[0], partialCRC[0]);
          pclmul_streams[0] += 16 * 4;
        }
        if (num_crc_streams > 1) {
          l64_crc[1] = Process64BytesCRC(crc_streams[1], l64_crc[1]);
          crc_streams[1] += 16 * 4;
        }
        if (num_pclmul_streams > 1) {
          Process64BytesPclmul(pclmul_streams[1], partialCRC[1]);
          pclmul_streams[1] += 16 * 4;
        }
        if (num_crc_streams > 2) {
          l64_crc[2] = Process64BytesCRC(crc_streams[2], l64_crc[2]);
          crc_streams[2] += 16 * 4;
        }
        if (num_pclmul_streams > 2) {
          Process64BytesPclmul(pclmul_streams[2], partialCRC[2]);
          pclmul_streams[2] += 16 * 4;
        }
      }

      // PCLMULQDQ-based streams require a special final step;
      // CRC-based ones don't.
      for (size_t i = 0; i < num_pclmul_streams; i++) {
        l64_pclmul[i] = FinalizePclmulStream(partialCRC[i]);
      }

      // Combine all streams into a single result.
      uint32_t magic = ComputeZeroConstant(bs * 64);
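      // Each stream covered bs * 64 bytes, so multiplying a running crc by
      // `magic` is equivalent to extending it by one stream's worth of
      // zeroes; the per-stream crcs are then folded in with XOR, in stream
      // order.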
      l64 = l64_crc[0];
      for (size_t i = 1; i < num_crc_streams; i++) {
        l64 = multiply(static_cast<uint32_t>(l64), magic);
        l64 ^= l64_crc[i];
      }
      for (size_t i = 0; i < num_pclmul_streams; i++) {
        l64 = multiply(static_cast<uint32_t>(l64), magic);
        l64 ^= l64_pclmul[i];
      }

      // Update p.
      if (num_pclmul_streams > 0) {
        p = pclmul_streams[num_pclmul_streams - 1];
      } else {
        p = crc_streams[num_crc_streams - 1];
      }
    }
    l = static_cast<uint32_t>(l64);

    while ((e - p) >= 16) {
      ABSL_INTERNAL_STEP8(l, p);
      ABSL_INTERNAL_STEP8(l, p);
    }
    // Process the last few bytes.
    while (p != e) {
      ABSL_INTERNAL_STEP1(l);
    }

#undef ABSL_INTERNAL_STEP8BY3
#undef ABSL_INTERNAL_STEP8BY2
#undef ABSL_INTERNAL_STEP8
#undef ABSL_INTERNAL_STEP4
#undef ABSL_INTERNAL_STEP2
#undef ABSL_INTERNAL_STEP1

    *crc = l;
  }
};

}  // namespace

// Intel processors with SSE4.2 have an instruction for one particular
// 32-bit CRC polynomial:  crc32c
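// The template arguments below are <num_crc_streams, num_pclmul_streams,
// CutoffStrategy>, selected per CPU model.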
CRCImpl* TryNewCRC32AcceleratedX86ARMCombined() {
  CpuType type = GetCpuType();
  switch (type) {
    case CpuType::kIntelHaswell:
    case CpuType::kAmdRome:
    case CpuType::kAmdNaples:
    case CpuType::kAmdMilan:
      return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
          3, 1, CutoffStrategy::Fold3>();
    // PCLMULQDQ is fast, use combined PCLMULQDQ + CRC implementation.
    case CpuType::kIntelCascadelakeXeon:
    case CpuType::kIntelSkylakeXeon:
    case CpuType::kIntelBroadwell:
    case CpuType::kIntelSkylake:
      return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
          3, 2, CutoffStrategy::Fold3>();
    // PCLMULQDQ is slow, don't use it.
    case CpuType::kIntelIvybridge:
    case CpuType::kIntelSandybridge:
    case CpuType::kIntelWestmere:
      return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
          3, 0, CutoffStrategy::Fold3>();
    case CpuType::kArmNeoverseN1:
      return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
          1, 1, CutoffStrategy::Unroll64CRC>();
#if defined(__aarch64__)
    default:
      // Not all ARM processors support the needed instructions, so check here
      // before trying to use an accelerated implementation.
      if (SupportsArmCRC32PMULL()) {
        return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
            1, 1, CutoffStrategy::Unroll64CRC>();
      } else {
        return nullptr;
      }
#else
    default:
      // Something else, play it safe and assume slow PCLMULQDQ.
      return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
          3, 0, CutoffStrategy::Fold3>();
#endif
  }
}
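// Instantiates every supported <num_crc_streams, num_pclmul_streams,
// CutoffStrategy> combination, so that callers (e.g. benchmarks) can exercise
// each configuration.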
std::vector<std::unique_ptr<CRCImpl>> NewCRC32AcceleratedX86ARMCombinedAll() {
  auto ret = std::vector<std::unique_ptr<CRCImpl>>();
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    1, 0, CutoffStrategy::Fold3>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    1, 1, CutoffStrategy::Fold3>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    1, 2, CutoffStrategy::Fold3>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    1, 3, CutoffStrategy::Fold3>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    2, 0, CutoffStrategy::Fold3>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    2, 1, CutoffStrategy::Fold3>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    2, 2, CutoffStrategy::Fold3>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    2, 3, CutoffStrategy::Fold3>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    3, 0, CutoffStrategy::Fold3>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    3, 1, CutoffStrategy::Fold3>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    3, 2, CutoffStrategy::Fold3>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    3, 3, CutoffStrategy::Fold3>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    1, 0, CutoffStrategy::Unroll64CRC>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    1, 1, CutoffStrategy::Unroll64CRC>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    1, 2, CutoffStrategy::Unroll64CRC>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    1, 3, CutoffStrategy::Unroll64CRC>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    2, 0, CutoffStrategy::Unroll64CRC>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    2, 1, CutoffStrategy::Unroll64CRC>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    2, 2, CutoffStrategy::Unroll64CRC>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    2, 3, CutoffStrategy::Unroll64CRC>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    3, 0, CutoffStrategy::Unroll64CRC>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    3, 1, CutoffStrategy::Unroll64CRC>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    3, 2, CutoffStrategy::Unroll64CRC>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    3, 3, CutoffStrategy::Unroll64CRC>>());

  return ret;
}

#else  // !ABSL_INTERNAL_CAN_USE_SIMD_CRC32C

std::vector<std::unique_ptr<CRCImpl>> NewCRC32AcceleratedX86ARMCombinedAll() {
  return std::vector<std::unique_ptr<CRCImpl>>();
}

// No hardware acceleration is available.
CRCImpl* TryNewCRC32AcceleratedX86ARMCombined() { return nullptr; }

#endif

}  // namespace crc_internal
ABSL_NAMESPACE_END
}  // namespace absl
