// Copyright 2022 The Abseil Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Hardware-accelerated CRC32 computation on Intel and ARM architectures.

#include <cstddef>
#include <cstdint>

#include "absl/base/attributes.h"
#include "absl/base/config.h"
#include "absl/base/dynamic_annotations.h"
#include "absl/base/internal/endian.h"
#include "absl/base/internal/prefetch.h"
#include "absl/crc/internal/cpu_detect.h"
#include "absl/crc/internal/crc.h"
#include "absl/crc/internal/crc32_x86_arm_combined_simd.h"
#include "absl/crc/internal/crc_internal.h"
#include "absl/memory/memory.h"
#include "absl/numeric/bits.h"

#if defined(ABSL_CRC_INTERNAL_HAVE_ARM_SIMD) || \
    defined(ABSL_CRC_INTERNAL_HAVE_X86_SIMD)
#define ABSL_INTERNAL_CAN_USE_SIMD_CRC32C
#endif

namespace absl {
ABSL_NAMESPACE_BEGIN
namespace crc_internal {

#if defined(ABSL_INTERNAL_CAN_USE_SIMD_CRC32C)

// Implementation details not exported outside of this file.
namespace {

// Some machines have CRC acceleration hardware.
// We can do a faster version of Extend() on such machines.
class CRC32AcceleratedX86ARMCombined : public CRC32 {
 public:
  CRC32AcceleratedX86ARMCombined() {}
  ~CRC32AcceleratedX86ARMCombined() override {}
  void ExtendByZeroes(uint32_t* crc, size_t length) const override;
  uint32_t ComputeZeroConstant(size_t length) const;

 private:
  CRC32AcceleratedX86ARMCombined(const CRC32AcceleratedX86ARMCombined&) =
      delete;
  CRC32AcceleratedX86ARMCombined& operator=(
      const CRC32AcceleratedX86ARMCombined&) = delete;
};

// Constants for switching between algorithms.
// Chosen by comparing speed at different powers of 2.
constexpr size_t kSmallCutoff = 256;
constexpr size_t kMediumCutoff = 2048;

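// CRC32 update steps over 1, 2, 4, and 8 bytes of input. The 8-byte variant
// takes an explicit pointer argument so that several independent streams can
// be advanced in an interleaved fashion.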
#define ABSL_INTERNAL_STEP1(crc)                      \
  do {                                                \
    crc = CRC32_u8(static_cast<uint32_t>(crc), *p++); \
  } while (0)
#define ABSL_INTERNAL_STEP2(crc)                                               \
  do {                                                                         \
    crc =                                                                      \
        CRC32_u16(static_cast<uint32_t>(crc), absl::little_endian::Load16(p)); \
    p += 2;                                                                    \
  } while (0)
#define ABSL_INTERNAL_STEP4(crc)                                               \
  do {                                                                         \
    crc =                                                                      \
        CRC32_u32(static_cast<uint32_t>(crc), absl::little_endian::Load32(p)); \
    p += 4;                                                                    \
  } while (0)
#define ABSL_INTERNAL_STEP8(crc, data)                  \
  do {                                                  \
    crc = CRC32_u64(static_cast<uint32_t>(crc),         \
                    absl::little_endian::Load64(data)); \
    data += 8;                                          \
  } while (0)
#define ABSL_INTERNAL_STEP8BY2(crc0, crc1, p0, p1) \
  do {                                             \
    ABSL_INTERNAL_STEP8(crc0, p0);                 \
    ABSL_INTERNAL_STEP8(crc1, p1);                 \
  } while (0)
#define ABSL_INTERNAL_STEP8BY3(crc0, crc1, crc2, p0, p1, p2) \
  do {                                                       \
    ABSL_INTERNAL_STEP8(crc0, p0);                           \
    ABSL_INTERNAL_STEP8(crc1, p1);                           \
    ABSL_INTERNAL_STEP8(crc2, p2);                           \
  } while (0)

namespace {

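// Multiplies two CRC32C residues modulo the CRC32C polynomial. The 64-bit
// carry-less product is shifted by one bit (to account for the bit-reflected
// representation) and reduced back to 32 bits by CRCing its low half and
// XORing in its high half. Used to combine independently computed CRC states.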
uint32_t multiply(uint32_t a, uint32_t b) {
  V128 shifts = V128_From2x64(0, 1);
  V128 power = V128_From2x64(0, a);
  V128 crc = V128_From2x64(0, b);
  V128 res = V128_PMulLow(power, crc);

  // Combine crc values.
  res = V128_ShiftLeft64(res, shifts);
  return static_cast<uint32_t>(V128_Extract32<1>(res)) ^
         CRC32_u32(0, static_cast<uint32_t>(V128_Low64(res)));
}

// Powers of crc32c polynomial, for faster ExtendByZeroes.
// Verified against folly:
// folly/hash/detail/Crc32CombineDetail.cpp
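// Entry i is the constant that extends a CRC state by 2**i 4-byte blocks of
// zeros; the table repeats with period 31 so that every index produced by
// ComputeZeroConstant below stays in range.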
constexpr uint32_t kCRC32CPowers[] = {
    0x82f63b78, 0x6ea2d55c, 0x18b8ea18, 0x510ac59a, 0xb82be955, 0xb8fdb1e7,
    0x88e56f72, 0x74c360a4, 0xe4172b16, 0x0d65762a, 0x35d73a62, 0x28461564,
    0xbf455269, 0xe2ea32dc, 0xfe7740e6, 0xf946610b, 0x3c204f8f, 0x538586e3,
    0x59726915, 0x734d5309, 0xbc1ac763, 0x7d0722cc, 0xd289cabe, 0xe94ca9bc,
    0x05b74f3f, 0xa51e1f42, 0x40000000, 0x20000000, 0x08000000, 0x00800000,
    0x00008000, 0x82f63b78, 0x6ea2d55c, 0x18b8ea18, 0x510ac59a, 0xb82be955,
    0xb8fdb1e7, 0x88e56f72, 0x74c360a4, 0xe4172b16, 0x0d65762a, 0x35d73a62,
    0x28461564, 0xbf455269, 0xe2ea32dc, 0xfe7740e6, 0xf946610b, 0x3c204f8f,
    0x538586e3, 0x59726915, 0x734d5309, 0xbc1ac763, 0x7d0722cc, 0xd289cabe,
    0xe94ca9bc, 0x05b74f3f, 0xa51e1f42, 0x40000000, 0x20000000, 0x08000000,
    0x00800000, 0x00008000,
};

}  // namespace

// Compute a magic constant, so that multiplying by it is the same as
// extending crc by length zeros.
uint32_t CRC32AcceleratedX86ARMCombined::ComputeZeroConstant(
    size_t length) const {
  // Lowest 2 bits are handled separately in ExtendByZeroes
  length >>= 2;

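  // Extending by a sum of zero-lengths is the same as extending by each in
  // turn, so fold in one precomputed power per set bit of length.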
  int index = absl::countr_zero(length);
  uint32_t prev = kCRC32CPowers[index];
  length &= length - 1;

  while (length) {
    // For each bit of length, extend by 2**n zeros.
    index = absl::countr_zero(length);
    prev = multiply(prev, kCRC32CPowers[index]);
    length &= length - 1;
  }
  return prev;
}

void CRC32AcceleratedX86ARMCombined::ExtendByZeroes(uint32_t* crc,
                                                    size_t length) const {
  uint32_t val = *crc;
  // Don't bother with multiplication for small length.
  switch (length & 3) {
    case 0:
      break;
    case 1:
      val = CRC32_u8(val, 0);
      break;
    case 2:
      val = CRC32_u16(val, 0);
      break;
    case 3:
      val = CRC32_u8(val, 0);
      val = CRC32_u16(val, 0);
      break;
  }
  if (length > 3) {
    val = multiply(val, ComputeZeroConstant(length));
  }
  *crc = val;
}

// Taken from the Intel paper "Fast CRC Computation for iSCSI Polynomial Using
// CRC32 Instruction"
// https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/crc-iscsi-polynomial-crc32-instruction-paper.pdf
// We only need every 4th value, because we unroll the loop by 4.
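// Each 128-bit entry holds the pair of multiplicands used in the Fold3 path of
// Extend() below to shift the first two of the three parallel CRC streams past
// their blocks before all three streams are combined.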
constexpr uint64_t kClmulConstants[] = {
    0x09e4addf8, 0x0ba4fc28e, 0x00d3b6092, 0x09e4addf8, 0x0ab7aff2a,
    0x102f9b8a2, 0x0b9e02b86, 0x00d3b6092, 0x1bf2e8b8a, 0x18266e456,
    0x0d270f1a2, 0x0ab7aff2a, 0x11eef4f8e, 0x083348832, 0x0dd7e3b0c,
    0x0b9e02b86, 0x0271d9844, 0x1b331e26a, 0x06b749fb2, 0x1bf2e8b8a,
    0x0e6fc4e6a, 0x0ce7f39f4, 0x0d7a4825c, 0x0d270f1a2, 0x026f6a60a,
    0x12ed0daac, 0x068bce87a, 0x11eef4f8e, 0x1329d9f7e, 0x0b3e32c28,
    0x0170076fa, 0x0dd7e3b0c, 0x1fae1cc66, 0x010746f3c, 0x086d8e4d2,
    0x0271d9844, 0x0b3af077a, 0x093a5f730, 0x1d88abd4a, 0x06b749fb2,
    0x0c9c8b782, 0x0cec3662e, 0x1ddffc5d4, 0x0e6fc4e6a, 0x168763fa6,
    0x0b0cd4768, 0x19b1afbc4, 0x0d7a4825c, 0x123888b7a, 0x00167d312,
    0x133d7a042, 0x026f6a60a, 0x000bcf5f6, 0x19d34af3a, 0x1af900c24,
    0x068bce87a, 0x06d390dec, 0x16cba8aca, 0x1f16a3418, 0x1329d9f7e,
    0x19fb2a8b0, 0x02178513a, 0x1a0f717c4, 0x0170076fa,
};

enum class CutoffStrategy {
  // Use 3 CRC streams to fold into 1.
  Fold3,
  // Unroll CRC instructions for 64 bytes.
  Unroll64CRC,
};

// Base class for CRC32AcceleratedX86ARMCombinedMultipleStreams containing the
// methods and data that don't need the template arguments.
class CRC32AcceleratedX86ARMCombinedMultipleStreamsBase
    : public CRC32AcceleratedX86ARMCombined {
 protected:
  // Update partialCRC with the crc of a 64-byte block. Calling
  // FinalizePclmulStream would produce a single crc checksum, but it is
  // expensive. PCLMULQDQ has a high latency, so we run 4 128-bit partial
  // checksums that can be reduced to a single value by FinalizePclmulStream
  // later. Computing crc for arbitrary polynomials with PCLMULQDQ is described
  // in the Intel paper "Fast CRC Computation for Generic Polynomials Using
  // PCLMULQDQ Instruction"
  // https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
  // We are applying it to the CRC32C polynomial.
  ABSL_ATTRIBUTE_ALWAYS_INLINE void Process64BytesPclmul(
      const uint8_t* p, V128* partialCRC) const {
    V128 loopMultiplicands = V128_Load(reinterpret_cast<const V128*>(k1k2));

    V128 partialCRC1 = partialCRC[0];
    V128 partialCRC2 = partialCRC[1];
    V128 partialCRC3 = partialCRC[2];
    V128 partialCRC4 = partialCRC[3];

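    // Fold each accumulator forward by 64 bytes: carry-less multiply its high
    // and low halves by the k1k2 constants, XOR the two products together, and
    // then XOR in the next 16 bytes of input.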
    V128 tmp1 = V128_PMulHi(partialCRC1, loopMultiplicands);
    V128 tmp2 = V128_PMulHi(partialCRC2, loopMultiplicands);
    V128 tmp3 = V128_PMulHi(partialCRC3, loopMultiplicands);
    V128 tmp4 = V128_PMulHi(partialCRC4, loopMultiplicands);
    V128 data1 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 0));
    V128 data2 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 1));
    V128 data3 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 2));
    V128 data4 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 3));
    partialCRC1 = V128_PMulLow(partialCRC1, loopMultiplicands);
    partialCRC2 = V128_PMulLow(partialCRC2, loopMultiplicands);
    partialCRC3 = V128_PMulLow(partialCRC3, loopMultiplicands);
    partialCRC4 = V128_PMulLow(partialCRC4, loopMultiplicands);
    partialCRC1 = V128_Xor(tmp1, partialCRC1);
    partialCRC2 = V128_Xor(tmp2, partialCRC2);
    partialCRC3 = V128_Xor(tmp3, partialCRC3);
    partialCRC4 = V128_Xor(tmp4, partialCRC4);
    partialCRC1 = V128_Xor(partialCRC1, data1);
    partialCRC2 = V128_Xor(partialCRC2, data2);
    partialCRC3 = V128_Xor(partialCRC3, data3);
    partialCRC4 = V128_Xor(partialCRC4, data4);
    partialCRC[0] = partialCRC1;
    partialCRC[1] = partialCRC2;
    partialCRC[2] = partialCRC3;
    partialCRC[3] = partialCRC4;
  }

  // Reduce the partialCRC produced by Process64BytesPclmul into a single value
  // that represents the crc checksum of all the processed bytes.
  ABSL_ATTRIBUTE_ALWAYS_INLINE uint64_t
  FinalizePclmulStream(V128* partialCRC) const {
    V128 partialCRC1 = partialCRC[0];
    V128 partialCRC2 = partialCRC[1];
    V128 partialCRC3 = partialCRC[2];
    V128 partialCRC4 = partialCRC[3];

    // Combine 4 vectors of partial crc into a single vector.
    V128 reductionMultiplicands =
        V128_Load(reinterpret_cast<const V128*>(k5k6));

    V128 low = V128_PMulLow(reductionMultiplicands, partialCRC1);
    V128 high = V128_PMulHi(reductionMultiplicands, partialCRC1);

    partialCRC1 = V128_Xor(low, high);
    partialCRC1 = V128_Xor(partialCRC1, partialCRC2);

    low = V128_PMulLow(reductionMultiplicands, partialCRC3);
    high = V128_PMulHi(reductionMultiplicands, partialCRC3);

    partialCRC3 = V128_Xor(low, high);
    partialCRC3 = V128_Xor(partialCRC3, partialCRC4);

    reductionMultiplicands = V128_Load(reinterpret_cast<const V128*>(k3k4));

    low = V128_PMulLow(reductionMultiplicands, partialCRC1);
    high = V128_PMulHi(reductionMultiplicands, partialCRC1);
    V128 fullCRC = V128_Xor(low, high);
    fullCRC = V128_Xor(fullCRC, partialCRC3);

    // Reduce fullCRC into scalar value.
    reductionMultiplicands = V128_Load(reinterpret_cast<const V128*>(k5k6));

    V128 mask = V128_Load(reinterpret_cast<const V128*>(kMask));

    V128 tmp = V128_PMul01(reductionMultiplicands, fullCRC);
    fullCRC = V128_ShiftRight<8>(fullCRC);
    fullCRC = V128_Xor(fullCRC, tmp);

    reductionMultiplicands = V128_Load(reinterpret_cast<const V128*>(k7k0));

    tmp = V128_ShiftRight<4>(fullCRC);
    fullCRC = V128_And(fullCRC, mask);
    fullCRC = V128_PMulLow(reductionMultiplicands, fullCRC);
    fullCRC = V128_Xor(tmp, fullCRC);

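    // Final Barrett-style reduction (per the Intel PCLMULQDQ paper referenced
    // above): kPoly holds the precomputed quotient and the CRC32C polynomial
    // used to reduce the remaining bits down to the final 32-bit value.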
    reductionMultiplicands = V128_Load(reinterpret_cast<const V128*>(kPoly));

    tmp = V128_And(fullCRC, mask);
    tmp = V128_PMul01(reductionMultiplicands, tmp);
    tmp = V128_And(tmp, mask);
    tmp = V128_PMulLow(reductionMultiplicands, tmp);

    fullCRC = V128_Xor(tmp, fullCRC);

    return static_cast<uint64_t>(V128_Extract32<1>(fullCRC));
  }

  // Update crc with 64 bytes of data from p.
  ABSL_ATTRIBUTE_ALWAYS_INLINE uint64_t Process64BytesCRC(const uint8_t* p,
                                                          uint64_t crc) const {
    for (int i = 0; i < 8; i++) {
      crc =
          CRC32_u64(static_cast<uint32_t>(crc), absl::little_endian::Load64(p));
      p += 8;
    }
    return crc;
  }

  // Generated by crc32c_x86_test --crc32c_generate_constants=true
  // and verified against constants in linux kernel for S390:
  // https://github.com/torvalds/linux/blob/master/arch/s390/crypto/crc32le-vx.S
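  // Folding and reduction constants for the PCLMULQDQ path: k1k2 folds each
  // 128-bit accumulator forward by 64 bytes, k3k4 and k5k6 combine the four
  // accumulators, and k7k0, kPoly and kMask drive the final reduction to a
  // 32-bit value in FinalizePclmulStream.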
  alignas(16) static constexpr uint64_t k1k2[2] = {0x0740eef02, 0x09e4addf8};
  alignas(16) static constexpr uint64_t k3k4[2] = {0x1384aa63a, 0x0ba4fc28e};
  alignas(16) static constexpr uint64_t k5k6[2] = {0x0f20c0dfe, 0x14cd00bd6};
  alignas(16) static constexpr uint64_t k7k0[2] = {0x0dd45aab8, 0x000000000};
  alignas(16) static constexpr uint64_t kPoly[2] = {0x105ec76f0, 0x0dea713f1};
  alignas(16) static constexpr uint32_t kMask[4] = {~0u, 0u, ~0u, 0u};

  // Medium runs of bytes are broken into groups of kGroupsSmall blocks of the
  // same size. Each group is CRCed in parallel, then combined at the end of
  // the block.
  static constexpr size_t kGroupsSmall = 3;
  // For large runs we use up to kMaxStreams blocks computed with the CRC
  // instruction, and up to kMaxStreams blocks computed with PCLMULQDQ, which
  // are combined in the end.
  static constexpr size_t kMaxStreams = 3;
};

#ifdef ABSL_INTERNAL_NEED_REDUNDANT_CONSTEXPR_DECL
alignas(16) constexpr uint64_t
    CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::k1k2[2];
alignas(16) constexpr uint64_t
    CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::k3k4[2];
alignas(16) constexpr uint64_t
    CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::k5k6[2];
alignas(16) constexpr uint64_t
    CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::k7k0[2];
alignas(16) constexpr uint64_t
    CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::kPoly[2];
alignas(16) constexpr uint32_t
    CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::kMask[4];
constexpr size_t
    CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::kGroupsSmall;
constexpr size_t CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::kMaxStreams;
#endif  // ABSL_INTERNAL_NEED_REDUNDANT_CONSTEXPR_DECL

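// Accelerated Extend() implementation, parameterized by the number of parallel
// CRC-instruction streams, the number of PCLMULQDQ streams, and the strategy
// used for medium-sized inputs.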
template <size_t num_crc_streams, size_t num_pclmul_streams,
          CutoffStrategy strategy>
class CRC32AcceleratedX86ARMCombinedMultipleStreams
    : public CRC32AcceleratedX86ARMCombinedMultipleStreamsBase {
  ABSL_ATTRIBUTE_HOT
  void Extend(uint32_t* crc, const void* bytes, size_t length) const override {
    static_assert(num_crc_streams >= 1 && num_crc_streams <= kMaxStreams,
                  "Invalid number of crc streams");
    static_assert(num_pclmul_streams >= 0 && num_pclmul_streams <= kMaxStreams,
                  "Invalid number of pclmul streams");
    const uint8_t* p = static_cast<const uint8_t*>(bytes);
    const uint8_t* e = p + length;
    uint32_t l = *crc;
    uint64_t l64;

    // We have dedicated instructions for 1, 2, 4 and 8 bytes.
    if (length & 8) {
      ABSL_INTERNAL_STEP8(l, p);
      length &= ~size_t{8};
    }
    if (length & 4) {
      ABSL_INTERNAL_STEP4(l);
      length &= ~size_t{4};
    }
    if (length & 2) {
      ABSL_INTERNAL_STEP2(l);
      length &= ~size_t{2};
    }
    if (length & 1) {
      ABSL_INTERNAL_STEP1(l);
      length &= ~size_t{1};
    }
    if (length == 0) {
      *crc = l;
      return;
    }
    // length is now a multiple of 16.

    // For small blocks just run a simple loop, because the cost of combining
    // multiple streams is significant.
    if (strategy != CutoffStrategy::Unroll64CRC) {
      if (length < kSmallCutoff) {
        while (length >= 16) {
          ABSL_INTERNAL_STEP8(l, p);
          ABSL_INTERNAL_STEP8(l, p);
          length -= 16;
        }
        *crc = l;
        return;
      }
    }

    // For medium blocks we run 3 crc streams and combine them as described in
    // the Intel paper above. Running a 4th stream doesn't help, because the
    // crc instruction has a latency of 3 and a throughput of 1.
    if (length < kMediumCutoff) {
      l64 = l;
      if (strategy == CutoffStrategy::Fold3) {
        uint64_t l641 = 0;
        uint64_t l642 = 0;
        const size_t blockSize = 32;
        size_t bs = static_cast<size_t>(e - p) / kGroupsSmall / blockSize;
        const uint8_t* p1 = p + bs * blockSize;
        const uint8_t* p2 = p1 + bs * blockSize;

        for (size_t i = 0; i + 1 < bs; ++i) {
          ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2);
          ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2);
          ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2);
          ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2);
          base_internal::PrefetchT0(
              reinterpret_cast<const char*>(p + kPrefetchHorizonMedium));
          base_internal::PrefetchT0(
              reinterpret_cast<const char*>(p1 + kPrefetchHorizonMedium));
          base_internal::PrefetchT0(
              reinterpret_cast<const char*>(p2 + kPrefetchHorizonMedium));
        }
        // Don't run crc on the last 8 bytes.
        ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2);
        ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2);
        ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2);
        ABSL_INTERNAL_STEP8BY2(l64, l641, p, p1);

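        // Combine the three streams: shift streams 0 and 1 forward past the
        // bytes covered by the later streams (using the precomputed
        // multiplicands for this block count), XOR them together, then fold in
        // the last 8 bytes of stream 2 with a CRC32 instruction.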
        V128 magic = *(reinterpret_cast<const V128*>(kClmulConstants) + bs - 1);

        V128 tmp = V128_From2x64(0, l64);

        V128 res1 = V128_PMulLow(tmp, magic);

        tmp = V128_From2x64(0, l641);

        V128 res2 = V128_PMul10(tmp, magic);
        V128 x = V128_Xor(res1, res2);
        l64 = static_cast<uint64_t>(V128_Low64(x)) ^
              absl::little_endian::Load64(p2);
        l64 = CRC32_u64(static_cast<uint32_t>(l642), l64);

        p = p2 + 8;
      } else if (strategy == CutoffStrategy::Unroll64CRC) {
        while ((e - p) >= 64) {
          l64 = Process64BytesCRC(p, l64);
          p += 64;
        }
      }
    } else {
      // There is a lot of data, we can ignore combine costs and run all
      // requested streams (num_crc_streams + num_pclmul_streams),
      // using prefetch. CRC and PCLMULQDQ use different cpu execution units,
      // so on some cpus it makes sense to execute both of them for different
      // streams.

      // Point x at the first 8-byte aligned byte in the string.
      const uint8_t* x = RoundUp<8>(p);
      // Process bytes until p is 8-byte aligned, if that isn't past the end.
      while (p != x) {
        ABSL_INTERNAL_STEP1(l);
      }

      size_t bs = static_cast<size_t>(e - p) /
                  (num_crc_streams + num_pclmul_streams) / 64;
      const uint8_t* crc_streams[kMaxStreams];
      const uint8_t* pclmul_streams[kMaxStreams];
      // We are guaranteed to have at least one crc stream.
      crc_streams[0] = p;
      for (size_t i = 1; i < num_crc_streams; i++) {
        crc_streams[i] = crc_streams[i - 1] + bs * 64;
      }
      pclmul_streams[0] = crc_streams[num_crc_streams - 1] + bs * 64;
      for (size_t i = 1; i < num_pclmul_streams; i++) {
        pclmul_streams[i] = pclmul_streams[i - 1] + bs * 64;
      }

      // Per-stream crc sums.
      uint64_t l64_crc[kMaxStreams] = {l};
      uint64_t l64_pclmul[kMaxStreams] = {0};

      // Peel the first iteration, because the PCLMULQDQ streams need setup.
      for (size_t i = 0; i < num_crc_streams; i++) {
        l64_crc[i] = Process64BytesCRC(crc_streams[i], l64_crc[i]);
        crc_streams[i] += 16 * 4;
      }

      V128 partialCRC[kMaxStreams][4];
      for (size_t i = 0; i < num_pclmul_streams; i++) {
        partialCRC[i][0] = V128_LoadU(
            reinterpret_cast<const V128*>(pclmul_streams[i] + 16 * 0));
        partialCRC[i][1] = V128_LoadU(
            reinterpret_cast<const V128*>(pclmul_streams[i] + 16 * 1));
        partialCRC[i][2] = V128_LoadU(
            reinterpret_cast<const V128*>(pclmul_streams[i] + 16 * 2));
        partialCRC[i][3] = V128_LoadU(
            reinterpret_cast<const V128*>(pclmul_streams[i] + 16 * 3));
        pclmul_streams[i] += 16 * 4;
      }

      for (size_t i = 1; i < bs; i++) {
        // Prefetch data for the next iterations.
        for (size_t j = 0; j < num_crc_streams; j++) {
          base_internal::PrefetchT0(
              reinterpret_cast<const char*>(crc_streams[j] + kPrefetchHorizon));
        }
        for (size_t j = 0; j < num_pclmul_streams; j++) {
          base_internal::PrefetchT0(reinterpret_cast<const char*>(
              pclmul_streams[j] + kPrefetchHorizon));
        }

        // We process each stream in 64-byte blocks. This can be written as
        //   for (int i = 0; i < num_pclmul_streams; i++) {
        //     Process64BytesPclmul(pclmul_streams[i], partialCRC[i]);
        //     pclmul_streams[i] += 16 * 4;
        //   }
        //   for (int i = 0; i < num_crc_streams; i++) {
        //     l64_crc[i] = Process64BytesCRC(crc_streams[i], l64_crc[i]);
        //     crc_streams[i] += 16 * 4;
        //   }
        // But unrolling and interleaving the PCLMULQDQ and CRC blocks manually
        // gives a ~2% performance boost.
        l64_crc[0] = Process64BytesCRC(crc_streams[0], l64_crc[0]);
        crc_streams[0] += 16 * 4;
        if (num_pclmul_streams > 0) {
          Process64BytesPclmul(pclmul_streams[0], partialCRC[0]);
          pclmul_streams[0] += 16 * 4;
        }
        if (num_crc_streams > 1) {
          l64_crc[1] = Process64BytesCRC(crc_streams[1], l64_crc[1]);
          crc_streams[1] += 16 * 4;
        }
        if (num_pclmul_streams > 1) {
          Process64BytesPclmul(pclmul_streams[1], partialCRC[1]);
          pclmul_streams[1] += 16 * 4;
        }
        if (num_crc_streams > 2) {
          l64_crc[2] = Process64BytesCRC(crc_streams[2], l64_crc[2]);
          crc_streams[2] += 16 * 4;
        }
        if (num_pclmul_streams > 2) {
          Process64BytesPclmul(pclmul_streams[2], partialCRC[2]);
          pclmul_streams[2] += 16 * 4;
        }
      }

      // PCLMULQDQ-based streams require a special final step;
      // CRC-based ones don't.
      for (size_t i = 0; i < num_pclmul_streams; i++) {
        l64_pclmul[i] = FinalizePclmulStream(partialCRC[i]);
      }

      // Combine all streams into a single result.
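      // Each stream covers bs * 64 bytes, so multiplying a running value by
      // this constant is equivalent to appending that many zero bytes before
      // the next stream's checksum is XORed in.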
      uint32_t magic = ComputeZeroConstant(bs * 64);
      l64 = l64_crc[0];
      for (size_t i = 1; i < num_crc_streams; i++) {
        l64 = multiply(static_cast<uint32_t>(l64), magic);
        l64 ^= l64_crc[i];
      }
      for (size_t i = 0; i < num_pclmul_streams; i++) {
        l64 = multiply(static_cast<uint32_t>(l64), magic);
        l64 ^= l64_pclmul[i];
      }

      // Update p.
      if (num_pclmul_streams > 0) {
        p = pclmul_streams[num_pclmul_streams - 1];
      } else {
        p = crc_streams[num_crc_streams - 1];
      }
    }
    l = static_cast<uint32_t>(l64);

    while ((e - p) >= 16) {
      ABSL_INTERNAL_STEP8(l, p);
      ABSL_INTERNAL_STEP8(l, p);
    }
    // Process the last few bytes.
    while (p != e) {
      ABSL_INTERNAL_STEP1(l);
    }

#undef ABSL_INTERNAL_STEP8BY3
#undef ABSL_INTERNAL_STEP8BY2
#undef ABSL_INTERNAL_STEP8
#undef ABSL_INTERNAL_STEP4
#undef ABSL_INTERNAL_STEP2
#undef ABSL_INTERNAL_STEP1

    *crc = l;
  }
};

}  // namespace

// Intel processors with SSE4.2 have an instruction for one particular
// 32-bit CRC polynomial: crc32c.
CRCImpl* TryNewCRC32AcceleratedX86ARMCombined() {
  CpuType type = GetCpuType();
  switch (type) {
    case CpuType::kIntelHaswell:
    case CpuType::kAmdRome:
    case CpuType::kAmdNaples:
    case CpuType::kAmdMilan:
      return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
          3, 1, CutoffStrategy::Fold3>();
    // PCLMULQDQ is fast, use combined PCLMULQDQ + CRC implementation.
    case CpuType::kIntelCascadelakeXeon:
    case CpuType::kIntelSkylakeXeon:
    case CpuType::kIntelBroadwell:
    case CpuType::kIntelSkylake:
      return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
          3, 2, CutoffStrategy::Fold3>();
    // PCLMULQDQ is slow, don't use it.
    case CpuType::kIntelIvybridge:
    case CpuType::kIntelSandybridge:
    case CpuType::kIntelWestmere:
      return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
          3, 0, CutoffStrategy::Fold3>();
    case CpuType::kArmNeoverseN1:
      return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
          1, 1, CutoffStrategy::Unroll64CRC>();
#if defined(__aarch64__)
    default:
      // Not all ARM processors support the needed instructions, so check here
      // before trying to use an accelerated implementation.
      if (SupportsArmCRC32PMULL()) {
        return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
            1, 1, CutoffStrategy::Unroll64CRC>();
      } else {
        return nullptr;
      }
#else
    default:
      // Something else, play it safe and assume slow PCLMULQDQ.
      return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
          3, 0, CutoffStrategy::Fold3>();
#endif
  }
}

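// Returns one instance of every stream-count / strategy combination
// (1-3 CRC streams, 0-3 PCLMULQDQ streams, both cutoff strategies).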
std::vector<std::unique_ptr<CRCImpl>> NewCRC32AcceleratedX86ARMCombinedAll() {
  auto ret = std::vector<std::unique_ptr<CRCImpl>>();
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
      1, 0, CutoffStrategy::Fold3>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
      1, 1, CutoffStrategy::Fold3>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
      1, 2, CutoffStrategy::Fold3>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
      1, 3, CutoffStrategy::Fold3>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
      2, 0, CutoffStrategy::Fold3>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
      2, 1, CutoffStrategy::Fold3>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
      2, 2, CutoffStrategy::Fold3>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
      2, 3, CutoffStrategy::Fold3>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
      3, 0, CutoffStrategy::Fold3>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
      3, 1, CutoffStrategy::Fold3>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
      3, 2, CutoffStrategy::Fold3>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
      3, 3, CutoffStrategy::Fold3>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
      1, 0, CutoffStrategy::Unroll64CRC>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
      1, 1, CutoffStrategy::Unroll64CRC>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
      1, 2, CutoffStrategy::Unroll64CRC>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
      1, 3, CutoffStrategy::Unroll64CRC>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
      2, 0, CutoffStrategy::Unroll64CRC>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
      2, 1, CutoffStrategy::Unroll64CRC>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
      2, 2, CutoffStrategy::Unroll64CRC>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
      2, 3, CutoffStrategy::Unroll64CRC>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
      3, 0, CutoffStrategy::Unroll64CRC>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
      3, 1, CutoffStrategy::Unroll64CRC>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
      3, 2, CutoffStrategy::Unroll64CRC>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
      3, 3, CutoffStrategy::Unroll64CRC>>());

  return ret;
}

#else  // !ABSL_INTERNAL_CAN_USE_SIMD_CRC32C

std::vector<std::unique_ptr<CRCImpl>> NewCRC32AcceleratedX86ARMCombinedAll() {
  return std::vector<std::unique_ptr<CRCImpl>>();
}

// no hardware acceleration available
CRCImpl* TryNewCRC32AcceleratedX86ARMCombined() { return nullptr; }

#endif

}  // namespace crc_internal
ABSL_NAMESPACE_END
}  // namespace absl