xref: /aosp_15_r20/external/webrtc/third_party/crc32c/src/src/crc32c_arm64.cc (revision d9f758449e529ab9291ac668be2861e7a55c2422)
1 // Copyright 2017 The CRC32C Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. See the AUTHORS file for names of contributors.
4 
5 #include "./crc32c_arm64.h"
6 
7 // In a separate source file to allow this accelerated CRC32C function to be
8 // compiled with the appropriate compiler flags to enable ARM NEON CRC32C
9 // instructions.
10 
11 // This implementation is based on https://github.com/google/leveldb/pull/490.
12 
#include <cstddef>
#include <cstdint>
#include <cstring>
15 
16 #include "./crc32c_internal.h"
17 #include "crc32c/crc32c_config.h"
18 
19 #if HAVE_ARM64_CRC32C
20 
21 #include <arm_acle.h>
22 #include <arm_neon.h>
23 
// Bytes consumed by one iteration of the wide loop in ExtendArm64(): four
// segments of SEGMENTBYTES each (1024 bytes) plus the single 64-bit word that
// is folded in while the partial CRCs are merged.
#define KBYTES 1032
// Size in bytes of each of the four segments that are checksummed side by side.
#define SEGMENTBYTES 256

// Reads a 64-bit word in native byte order from |src|, which may be unaligned.
// Copying through memcpy avoids dereferencing a byte pointer as uint64_t, which
// is undefined behaviour when the address is misaligned; optimizing compilers
// are expected to turn the copy into a single load.
static inline uint64_t Crc32cArm64Load64(const void *src) {
  uint64_t word;
  memcpy(&word, src, sizeof(word));
  return word;
}

// Yields 64-bit word number IND of segment number SEG, where segment 0 starts
// at P and consecutive segments are SEGMENTBYTES apart.
#define CRC32C_SEGMENT_WORD(P, SEG, IND)                        \
  Crc32cArm64Load64((const uint8_t *)(P) + SEGMENTBYTES * (SEG) + \
                    sizeof(uint64_t) * (IND))

// Feeds word IND of each of the four segments into that segment's running CRC
// (32 bytes in total). Expects uint32_t variables crc0..crc3 to be in scope at
// the point of use; P is not modified.
#define CRC32C32BYTES(P, IND)                                   \
  do {                                                          \
    crc1 = __crc32cd(crc1, CRC32C_SEGMENT_WORD((P), 1, (IND))); \
    crc2 = __crc32cd(crc2, CRC32C_SEGMENT_WORD((P), 2, (IND))); \
    crc3 = __crc32cd(crc3, CRC32C_SEGMENT_WORD((P), 3, (IND))); \
    crc0 = __crc32cd(crc0, CRC32C_SEGMENT_WORD((P), 0, (IND))); \
  } while (0)

// Feeds words IND*8 .. IND*8+7 of each of the four segments into the running
// CRCs (8 words * 8 bytes * 4 segments = 256 bytes in total).
#define CRC32C256BYTES(P, IND)         \
  do {                                 \
    CRC32C32BYTES((P), (IND) * 8 + 0); \
    CRC32C32BYTES((P), (IND) * 8 + 1); \
    CRC32C32BYTES((P), (IND) * 8 + 2); \
    CRC32C32BYTES((P), (IND) * 8 + 3); \
    CRC32C32BYTES((P), (IND) * 8 + 4); \
    CRC32C32BYTES((P), (IND) * 8 + 5); \
    CRC32C32BYTES((P), (IND) * 8 + 6); \
    CRC32C32BYTES((P), (IND) * 8 + 7); \
  } while (0)

// Feeds all 32 words of each of the four segments into crc0..crc3 (1024 bytes
// in total) and then advances P by 4 * SEGMENTBYTES. P is evaluated several
// times and assigned to, so it must be a side-effect-free lvalue; it must be a
// byte pointer for the advance to match the number of bytes consumed.
#define CRC32C1024BYTES(P)   \
  do {                       \
    CRC32C256BYTES((P), 0);  \
    CRC32C256BYTES((P), 1);  \
    CRC32C256BYTES((P), 2);  \
    CRC32C256BYTES((P), 3);  \
    (P) += 4 * SEGMENTBYTES; \
  } while (0)
62 
63 namespace crc32c {
64 
ExtendArm64(uint32_t crc,const uint8_t * data,size_t size)65 uint32_t ExtendArm64(uint32_t crc, const uint8_t *data, size_t size) {
66   int64_t length = size;
67   uint32_t crc0, crc1, crc2, crc3;
68   uint64_t t0, t1, t2;
69 
70   // k0=CRC(x^(3*SEGMENTBYTES*8)), k1=CRC(x^(2*SEGMENTBYTES*8)),
71   // k2=CRC(x^(SEGMENTBYTES*8))
72   const poly64_t k0 = 0x8d96551c, k1 = 0xbd6f81f8, k2 = 0xdcb17aa4;
73 
74   crc = crc ^ kCRC32Xor;
75 
76   while (length >= KBYTES) {
77     crc0 = crc;
78     crc1 = 0;
79     crc2 = 0;
80     crc3 = 0;
81 
82     // Process 1024 bytes in parallel.
83     CRC32C1024BYTES(data);
84 
85     // Merge the 4 partial CRC32C values.
86     t2 = (uint64_t)vmull_p64(crc2, k2);
87     t1 = (uint64_t)vmull_p64(crc1, k1);
88     t0 = (uint64_t)vmull_p64(crc0, k0);
89     crc = __crc32cd(crc3, *(uint64_t *)data);
90     data += sizeof(uint64_t);
91     crc ^= __crc32cd(0, t2);
92     crc ^= __crc32cd(0, t1);
93     crc ^= __crc32cd(0, t0);
94 
95     length -= KBYTES;
96   }
97 
98   while (length >= 8) {
99     crc = __crc32cd(crc, *(uint64_t *)data);
100     data += 8;
101     length -= 8;
102   }
103 
104   if (length & 4) {
105     crc = __crc32cw(crc, *(uint32_t *)data);
106     data += 4;
107   }
108 
109   if (length & 2) {
110     crc = __crc32ch(crc, *(uint16_t *)data);
111     data += 2;
112   }
113 
114   if (length & 1) {
115     crc = __crc32cb(crc, *data);
116   }
117 
118   return crc ^ kCRC32Xor;
119 }
120 
121 }  // namespace crc32c
122 
123 #endif  // HAVE_ARM64_CRC32C
124