/* crc32_fold_vpclmulqdq.c -- VPCLMULQDQ-based CRC32 folding implementation.
 * Copyright Wangyang Guo ([email protected])
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#ifdef X86_VPCLMULQDQ_CRC
#include "../../zbuild.h"
#include "../../fallback_builtins.h"

#include <immintrin.h>

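/* ONCE(op) executes op only while 'first' is non-zero and then clears it;
 * XOR_INITIAL() uses it to mix the caller-supplied initial CRC into the
 * first loaded block exactly once. */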
#define ONCE(op) if (first) { \
    first = 0; \
    (op); \
}
#define XOR_INITIAL(where) ONCE(where = _mm512_xor_si512(where, zmm_initial))

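/* Fold whole 256-byte blocks of src into the four 128-bit CRC accumulators
 * while copying the same bytes to dst. Processes as many full 256-byte blocks
 * as len allows and returns the number of bytes consumed. */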
size_t fold_16_vpclmulqdq(__m128i *xmm_crc0, __m128i *xmm_crc1,
    __m128i *xmm_crc2, __m128i *xmm_crc3, uint8_t *dst, const uint8_t *src, size_t len) {
    size_t len_tmp = len;
    __m512i zmm_t0, zmm_t1, zmm_t2, zmm_t3;
    __m512i zmm_crc0, zmm_crc1, zmm_crc2, zmm_crc3;
    __m512i z0, z1, z2, z3;
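    /* Folding constants, as used below: zmm_fold4 advances each 128-bit lane
     * of the running CRC by 4 lanes (64 bytes), zmm_fold16 by 16 lanes (256 bytes). */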
    const __m512i zmm_fold4 = _mm512_set4_epi32(
        0x00000001, 0x54442bd4, 0x00000001, 0xc6e41596);
    const __m512i zmm_fold16 = _mm512_set4_epi32(
        0x00000001, 0x1542778a, 0x00000001, 0x322d1430);

    // zmm register init
    zmm_crc0 = _mm512_setzero_si512();
    zmm_t0 = _mm512_loadu_si512((__m512i *)src);
    zmm_crc1 = _mm512_loadu_si512((__m512i *)src + 1);
    zmm_crc2 = _mm512_loadu_si512((__m512i *)src + 2);
    zmm_crc3 = _mm512_loadu_si512((__m512i *)src + 3);
    /* already have intermediate CRC in xmm registers
     * fold4 with 4 xmm_crc to get zmm_crc0
     */
    zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc0, 0);
    zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc1, 1);
    zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc2, 2);
    zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc3, 3);
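    /* Each 128-bit lane is folded with two carry-less multiplies: imm 0x01 takes
     * the lane's high qword times the low qword of the constant, imm 0x10 takes
     * the low qword times the high constant; XORing both products into the newly
     * loaded data advances the CRC by 64 bytes. */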
    z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
    zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
    zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0);
    zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_t0);

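    /* copy the first 256 input bytes to dst (this variant folds and copies) */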
    _mm512_storeu_si512((__m512i *)dst, zmm_t0);
    _mm512_storeu_si512((__m512i *)dst + 1, zmm_crc1);
    _mm512_storeu_si512((__m512i *)dst + 2, zmm_crc2);
    _mm512_storeu_si512((__m512i *)dst + 3, zmm_crc3);
    len -= 256;
    src += 256;
    dst += 256;

    // fold-16 loop: process 256 bytes (16 x 128-bit lanes) per iteration
    while (len >= 256) {
        zmm_t0 = _mm512_loadu_si512((__m512i *)src);
        zmm_t1 = _mm512_loadu_si512((__m512i *)src + 1);
        zmm_t2 = _mm512_loadu_si512((__m512i *)src + 2);
        zmm_t3 = _mm512_loadu_si512((__m512i *)src + 3);

        z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold16, 0x01);
        z1 = _mm512_clmulepi64_epi128(zmm_crc1, zmm_fold16, 0x01);
        z2 = _mm512_clmulepi64_epi128(zmm_crc2, zmm_fold16, 0x01);
        z3 = _mm512_clmulepi64_epi128(zmm_crc3, zmm_fold16, 0x01);

        zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold16, 0x10);
        zmm_crc1 = _mm512_clmulepi64_epi128(zmm_crc1, zmm_fold16, 0x10);
        zmm_crc2 = _mm512_clmulepi64_epi128(zmm_crc2, zmm_fold16, 0x10);
        zmm_crc3 = _mm512_clmulepi64_epi128(zmm_crc3, zmm_fold16, 0x10);

        zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0);
        zmm_crc1 = _mm512_xor_si512(z1, zmm_crc1);
        zmm_crc2 = _mm512_xor_si512(z2, zmm_crc2);
        zmm_crc3 = _mm512_xor_si512(z3, zmm_crc3);

        zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_t0);
        zmm_crc1 = _mm512_xor_si512(zmm_crc1, zmm_t1);
        zmm_crc2 = _mm512_xor_si512(zmm_crc2, zmm_t2);
        zmm_crc3 = _mm512_xor_si512(zmm_crc3, zmm_t3);

        _mm512_storeu_si512((__m512i *)dst, zmm_t0);
        _mm512_storeu_si512((__m512i *)dst + 1, zmm_t1);
        _mm512_storeu_si512((__m512i *)dst + 2, zmm_t2);
        _mm512_storeu_si512((__m512i *)dst + 3, zmm_t3);
        len -= 256;
        src += 256;
        dst += 256;
    }
    // zmm_crc[0,1,2,3] -> zmm_crc0
    z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
    zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
    zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0);
    zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_crc1);

    z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
    zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
    zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0);
    zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_crc2);

    z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
    zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
    zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0);
    zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_crc3);

    // zmm_crc0 -> xmm_crc[0, 1, 2, 3]
    *xmm_crc0 = _mm512_extracti32x4_epi32(zmm_crc0, 0);
    *xmm_crc1 = _mm512_extracti32x4_epi32(zmm_crc0, 1);
    *xmm_crc2 = _mm512_extracti32x4_epi32(zmm_crc0, 2);
    *xmm_crc3 = _mm512_extracti32x4_epi32(zmm_crc0, 3);

    return (len_tmp - len); // number of bytes processed
}

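/* Same folding as fold_16_vpclmulqdq(), but without copying to a destination
 * ("nocp" = no copy). When 'first' is non-zero, the initial CRC in init_crc is
 * XORed into the first 64 bytes of input. Returns the number of bytes consumed. */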
size_t fold_16_vpclmulqdq_nocp(__m128i *xmm_crc0, __m128i *xmm_crc1,
    __m128i *xmm_crc2, __m128i *xmm_crc3, const uint8_t *src, size_t len,
    __m128i init_crc, int32_t first) {
    size_t len_tmp = len;
    __m512i zmm_t0, zmm_t1, zmm_t2, zmm_t3;
    __m512i zmm_crc0, zmm_crc1, zmm_crc2, zmm_crc3;
    __m512i z0, z1, z2, z3;
    __m512i zmm_initial = _mm512_zextsi128_si512(init_crc);
    const __m512i zmm_fold4 = _mm512_set4_epi32(
        0x00000001, 0x54442bd4, 0x00000001, 0xc6e41596);
    const __m512i zmm_fold16 = _mm512_set4_epi32(
        0x00000001, 0x1542778a, 0x00000001, 0x322d1430);

    // zmm register init
    zmm_crc0 = _mm512_setzero_si512();
    zmm_t0 = _mm512_loadu_si512((__m512i *)src);
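    /* fold the caller's initial CRC into the first 64 bytes (first call only) */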
    XOR_INITIAL(zmm_t0);
    zmm_crc1 = _mm512_loadu_si512((__m512i *)src + 1);
    zmm_crc2 = _mm512_loadu_si512((__m512i *)src + 2);
    zmm_crc3 = _mm512_loadu_si512((__m512i *)src + 3);

    /* already have intermediate CRC in xmm registers
     * fold4 with 4 xmm_crc to get zmm_crc0
     */
    zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc0, 0);
    zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc1, 1);
    zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc2, 2);
    zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc3, 3);
    z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
    zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
    zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0);
    zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_t0);

    len -= 256;
    src += 256;

    // fold-16 loop: process 256 bytes (16 x 128-bit lanes) per iteration
    while (len >= 256) {
        zmm_t0 = _mm512_loadu_si512((__m512i *)src);
        zmm_t1 = _mm512_loadu_si512((__m512i *)src + 1);
        zmm_t2 = _mm512_loadu_si512((__m512i *)src + 2);
        zmm_t3 = _mm512_loadu_si512((__m512i *)src + 3);

        z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold16, 0x01);
        z1 = _mm512_clmulepi64_epi128(zmm_crc1, zmm_fold16, 0x01);
        z2 = _mm512_clmulepi64_epi128(zmm_crc2, zmm_fold16, 0x01);
        z3 = _mm512_clmulepi64_epi128(zmm_crc3, zmm_fold16, 0x01);

        zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold16, 0x10);
        zmm_crc1 = _mm512_clmulepi64_epi128(zmm_crc1, zmm_fold16, 0x10);
        zmm_crc2 = _mm512_clmulepi64_epi128(zmm_crc2, zmm_fold16, 0x10);
        zmm_crc3 = _mm512_clmulepi64_epi128(zmm_crc3, zmm_fold16, 0x10);

        zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0);
        zmm_crc1 = _mm512_xor_si512(z1, zmm_crc1);
        zmm_crc2 = _mm512_xor_si512(z2, zmm_crc2);
        zmm_crc3 = _mm512_xor_si512(z3, zmm_crc3);

        zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_t0);
        zmm_crc1 = _mm512_xor_si512(zmm_crc1, zmm_t1);
        zmm_crc2 = _mm512_xor_si512(zmm_crc2, zmm_t2);
        zmm_crc3 = _mm512_xor_si512(zmm_crc3, zmm_t3);

        len -= 256;
        src += 256;
    }
    // zmm_crc[0,1,2,3] -> zmm_crc0
    z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
    zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
    zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0);
    zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_crc1);

    z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
    zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
    zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0);
    zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_crc2);

    z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
    zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
    zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0);
    zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_crc3);

    // zmm_crc0 -> xmm_crc[0, 1, 2, 3]
    *xmm_crc0 = _mm512_extracti32x4_epi32(zmm_crc0, 0);
    *xmm_crc1 = _mm512_extracti32x4_epi32(zmm_crc0, 1);
    *xmm_crc2 = _mm512_extracti32x4_epi32(zmm_crc0, 2);
    *xmm_crc3 = _mm512_extracti32x4_epi32(zmm_crc0, 3);

    return (len_tmp - len); // number of bytes processed
}
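
/*
 * Usage sketch (illustrative only; the caller-side names below are hypothetical,
 * the real callers are expected to be the 128-bit PCLMULQDQ folding routines):
 *
 *     __m128i crc0, crc1, crc2, crc3;   // running 128-bit partial CRCs
 *     int32_t first = 1;                // initial CRC not yet folded in
 *     ...
 *     if (len >= 256) {
 *         size_t n = fold_16_vpclmulqdq_nocp(&crc0, &crc1, &crc2, &crc3,
 *                                            src, len, xmm_initial, first);
 *         first = 0;
 *         src += n;
 *         len -= n;
 *     }
 *     // the remaining < 256 bytes fall through to the 128-bit fold path
 */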
#endif