// Copyright 2022 The Abseil Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef ABSL_CRC_INTERNAL_CRC32_X86_ARM_COMBINED_SIMD_H_
#define ABSL_CRC_INTERNAL_CRC32_X86_ARM_COMBINED_SIMD_H_

#include <cstdint>

#include "absl/base/config.h"

// -------------------------------------------------------------------------
// Many x86 and ARM machines have CRC acceleration hardware.
// We can do a faster version of Extend() on such machines.
// We define a translation layer for both x86 and ARM for ease of use and to
// capture most of the performance gains.

// This implementation requires 64-bit CRC instructions (part of SSE 4.2) and
// PCLMULQDQ instructions. 32-bit builds with SSE 4.2 do exist, so the
// __x86_64__ condition is necessary.
#if defined(__x86_64__) && defined(__SSE4_2__) && defined(__PCLMUL__)

#include <x86intrin.h>
#define ABSL_CRC_INTERNAL_HAVE_X86_SIMD

#elif defined(_MSC_VER) && defined(__AVX__)

// MSVC AVX (/arch:AVX) implies SSE 4.2 and PCLMULQDQ.
#include <intrin.h>
#define ABSL_CRC_INTERNAL_HAVE_X86_SIMD

#elif defined(__aarch64__) && defined(__LITTLE_ENDIAN__) && \
    defined(__ARM_FEATURE_CRC32) && defined(ABSL_INTERNAL_HAVE_ARM_NEON) && \
    defined(__ARM_FEATURE_CRYPTO)

#include <arm_acle.h>
#include <arm_neon.h>
#define ABSL_CRC_INTERNAL_HAVE_ARM_SIMD

#endif

namespace absl {
ABSL_NAMESPACE_BEGIN
namespace crc_internal {

#if defined(ABSL_CRC_INTERNAL_HAVE_ARM_SIMD) || \
    defined(ABSL_CRC_INTERNAL_HAVE_X86_SIMD)

#if defined(ABSL_CRC_INTERNAL_HAVE_ARM_SIMD)
using V128 = uint64x2_t;
#else
using V128 = __m128i;
#endif
// Starting with the initial value in |crc|, accumulates a CRC32 value (using
// the CRC32C polynomial) for unsigned integers of different sizes.
uint32_t CRC32_u8(uint32_t crc, uint8_t v);

uint32_t CRC32_u16(uint32_t crc, uint16_t v);

uint32_t CRC32_u32(uint32_t crc, uint32_t v);

uint32_t CRC32_u64(uint32_t crc, uint64_t v);
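
// Illustrative sketch (not part of this header's API): the scalar accessors
// above can be chained to checksum a buffer, consuming 8 bytes at a time and
// finishing the tail byte by byte. |data|, |length|, and kInitialCrc are
// hypothetical, and memcpy would come from <cstring>.
//
//   uint32_t crc = kInitialCrc;
//   while (length >= 8) {
//     uint64_t word;
//     memcpy(&word, data, sizeof(word));
//     crc = CRC32_u64(crc, word);
//     data += 8;
//     length -= 8;
//   }
//   while (length > 0) {
//     crc = CRC32_u8(crc, *data++);
//     --length;
//   }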

// Loads 128 bits of integer data. |src| must be 16-byte aligned.
V128 V128_Load(const V128* src);

// Loads 128 bits of integer data. |src| does not need to be aligned.
V128 V128_LoadU(const V128* src);

// Polynomially multiplies the high 64 bits of |l| and |r|.
V128 V128_PMulHi(const V128 l, const V128 r);

// Polynomially multiplies the low 64 bits of |l| and |r|.
V128 V128_PMulLow(const V128 l, const V128 r);

// Polynomially multiplies the low 64 bits of |r| and high 64 bits of |l|.
V128 V128_PMul01(const V128 l, const V128 r);

// Polynomially multiplies the low 64 bits of |l| and high 64 bits of |r|.
V128 V128_PMul10(const V128 l, const V128 r);
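
// For example (illustrative): carry-less multiplication treats each 64-bit
// half as a polynomial over GF(2), so multiplying 0b10 by 0b11 yields 0b110.
// With a = V128_From2x64(2, 0) and b = V128_From2x64(3, 0),
// V128_Low64(V128_PMulHi(a, b)) == 6.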

// Produces the bitwise XOR of |l| and |r|.
V128 V128_Xor(const V128 l, const V128 r);

// Produces the bitwise AND of |l| and |r|.
V128 V128_And(const V128 l, const V128 r);

// Combines two 64-bit integers into one 128-bit vector. Note the reversed
// order of the arguments:
// dst[63:0] := |r|
// dst[127:64] := |l|
V128 V128_From2x64(const uint64_t l, const uint64_t r);
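// For example (illustrative): V128_Low64(V128_From2x64(1, 2)) == 2, and
// V128_Extract32<3>(V128_From2x64(1, 2)) == 0 (the top 32 bits of |l|).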

// Shifts |l| right by |imm| bytes while shifting in zeros.
template <int imm>
V128 V128_ShiftRight(const V128 l);

// Extracts a 32-bit integer from |l|, selected with |imm|.
template <int imm>
int V128_Extract32(const V128 l);

// Extracts the low 64 bits from |l|.
int64_t V128_Low64(const V128 l);

// Left-shifts packed 64-bit integers in |l| by |r|.
V128 V128_ShiftLeft64(const V128 l, const V128 r);
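
// Illustrative sketch (not Abseil's actual Extend(), and the folding constants
// are assumed): a typical carry-less-multiply CRC implementation uses the
// primitives above to "fold" a 128-bit accumulator |x| over the next 16-byte
// block with a precomputed constant pair k = V128_From2x64(k_hi, k_lo):
//
//   V128 folded = V128_Xor(V128_PMulLow(x, k), V128_PMulHi(x, k));
//   x = V128_Xor(folded, V128_LoadU(next_block));
//
// V128_PMulLow combines the low half of |x| with k_lo, V128_PMulHi combines
// the high half with k_hi, and XORing in the next block advances the CRC by
// 16 bytes.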

#endif

#if defined(ABSL_CRC_INTERNAL_HAVE_X86_SIMD)

inline uint32_t CRC32_u8(uint32_t crc, uint8_t v) {
  return _mm_crc32_u8(crc, v);
}

inline uint32_t CRC32_u16(uint32_t crc, uint16_t v) {
  return _mm_crc32_u16(crc, v);
}

inline uint32_t CRC32_u32(uint32_t crc, uint32_t v) {
  return _mm_crc32_u32(crc, v);
}

inline uint32_t CRC32_u64(uint32_t crc, uint64_t v) {
  return static_cast<uint32_t>(_mm_crc32_u64(crc, v));
}

inline V128 V128_Load(const V128* src) { return _mm_load_si128(src); }

inline V128 V128_LoadU(const V128* src) { return _mm_loadu_si128(src); }

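// In the _mm_clmulepi64_si128 calls below, bit 0 of the immediate selects the
// low (0) or high (1) 64-bit half of |l|, and bit 4 selects the half of |r|.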
inline V128 V128_PMulHi(const V128 l, const V128 r) {
  return _mm_clmulepi64_si128(l, r, 0x11);
}

inline V128 V128_PMulLow(const V128 l, const V128 r) {
  return _mm_clmulepi64_si128(l, r, 0x00);
}

inline V128 V128_PMul01(const V128 l, const V128 r) {
  return _mm_clmulepi64_si128(l, r, 0x01);
}

inline V128 V128_PMul10(const V128 l, const V128 r) {
  return _mm_clmulepi64_si128(l, r, 0x10);
}

inline V128 V128_Xor(const V128 l, const V128 r) { return _mm_xor_si128(l, r); }

inline V128 V128_And(const V128 l, const V128 r) { return _mm_and_si128(l, r); }

inline V128 V128_From2x64(const uint64_t l, const uint64_t r) {
  return _mm_set_epi64x(static_cast<int64_t>(l), static_cast<int64_t>(r));
}

template <int imm>
inline V128 V128_ShiftRight(const V128 l) {
  return _mm_srli_si128(l, imm);
}

template <int imm>
inline int V128_Extract32(const V128 l) {
  return _mm_extract_epi32(l, imm);
}

inline int64_t V128_Low64(const V128 l) { return _mm_cvtsi128_si64(l); }

inline V128 V128_ShiftLeft64(const V128 l, const V128 r) {
  return _mm_sll_epi64(l, r);
}

#elif defined(ABSL_CRC_INTERNAL_HAVE_ARM_SIMD)

inline uint32_t CRC32_u8(uint32_t crc, uint8_t v) { return __crc32cb(crc, v); }

inline uint32_t CRC32_u16(uint32_t crc, uint16_t v) {
  return __crc32ch(crc, v);
}

inline uint32_t CRC32_u32(uint32_t crc, uint32_t v) {
  return __crc32cw(crc, v);
}

inline uint32_t CRC32_u64(uint32_t crc, uint64_t v) {
  return __crc32cd(crc, v);
}

inline V128 V128_Load(const V128* src) {
  return vld1q_u64(reinterpret_cast<const uint64_t*>(src));
}

inline V128 V128_LoadU(const V128* src) {
  return vld1q_u64(reinterpret_cast<const uint64_t*>(src));
}

// Use inline assembly because clang does not generate the pmull2 instruction;
// without it, performance drops by 15-20%.
// TODO(b/193678732): Investigate why the compiler decides not to generate
// such instructions and why it becomes so much worse.
inline V128 V128_PMulHi(const V128 l, const V128 r) {
  uint64x2_t res;
  __asm__ __volatile__("pmull2 %0.1q, %1.2d, %2.2d \n\t"
                       : "=w"(res)
                       : "w"(l), "w"(r));
  return res;
}

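// vmull_p64 carry-lessly multiplies two 64-bit polynomials; the vget_low_p64 /
// vget_high_p64 calls below select which half of each 128-bit operand is used.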
inline V128 V128_PMulLow(const V128 l, const V128 r) {
  return reinterpret_cast<V128>(vmull_p64(
      reinterpret_cast<poly64_t>(vget_low_p64(vreinterpretq_p64_u64(l))),
      reinterpret_cast<poly64_t>(vget_low_p64(vreinterpretq_p64_u64(r)))));
}

inline V128 V128_PMul01(const V128 l, const V128 r) {
  return reinterpret_cast<V128>(vmull_p64(
      reinterpret_cast<poly64_t>(vget_high_p64(vreinterpretq_p64_u64(l))),
      reinterpret_cast<poly64_t>(vget_low_p64(vreinterpretq_p64_u64(r)))));
}

inline V128 V128_PMul10(const V128 l, const V128 r) {
  return reinterpret_cast<V128>(vmull_p64(
      reinterpret_cast<poly64_t>(vget_low_p64(vreinterpretq_p64_u64(l))),
      reinterpret_cast<poly64_t>(vget_high_p64(vreinterpretq_p64_u64(r)))));
}

inline V128 V128_Xor(const V128 l, const V128 r) { return veorq_u64(l, r); }

inline V128 V128_And(const V128 l, const V128 r) { return vandq_u64(l, r); }

inline V128 V128_From2x64(const uint64_t l, const uint64_t r) {
  return vcombine_u64(vcreate_u64(r), vcreate_u64(l));
}

template <int imm>
inline V128 V128_ShiftRight(const V128 l) {
  return vreinterpretq_u64_s8(
      vextq_s8(vreinterpretq_s8_u64(l), vdupq_n_s8(0), imm));
}

template <int imm>
inline int V128_Extract32(const V128 l) {
  return vgetq_lane_s32(vreinterpretq_s32_u64(l), imm);
}

inline int64_t V128_Low64(const V128 l) {
  return vgetq_lane_s64(vreinterpretq_s64_u64(l), 0);
}

inline V128 V128_ShiftLeft64(const V128 l, const V128 r) {
  return vshlq_u64(l, vreinterpretq_s64_u64(r));
}

#endif

}  // namespace crc_internal
ABSL_NAMESPACE_END
}  // namespace absl

#endif  // ABSL_CRC_INTERNAL_CRC32_X86_ARM_COMBINED_SIMD_H_