/* crc32_fold_vpclmulqdq.c -- VPCLMULQDQ-based CRC32 folding implementation.
 * Copyright Wangyang Guo ([email protected])
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#ifdef X86_VPCLMULQDQ_CRC
#include "../../zbuild.h"
#include "../../fallback_builtins.h"

#include <immintrin.h>

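/* ONCE(op) executes op only on the first invocation within a call: it tests
 * the local flag `first`, clears it, then runs op.  XOR_INITIAL uses it to
 * fold the caller-supplied initial CRC (zmm_initial) into the first 64-byte
 * load exactly once. */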
#define ONCE(op) if (first) { \
    first = 0; \
    (op); \
}
#define XOR_INITIAL(where) ONCE(where = _mm512_xor_si512(where, zmm_initial))

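/* Fold len bytes from src into the running CRC carried in the four xmm
 * accumulators while copying the data to dst.  Data is consumed in 256-byte
 * blocks (the caller is expected to pass len >= 256); the accumulators are
 * updated in place and the number of bytes actually processed is returned. */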
size_t fold_16_vpclmulqdq(__m128i *xmm_crc0, __m128i *xmm_crc1,
    __m128i *xmm_crc2, __m128i *xmm_crc3, uint8_t *dst, const uint8_t *src, size_t len) {
    size_t len_tmp = len;
    __m512i zmm_t0, zmm_t1, zmm_t2, zmm_t3;
    __m512i zmm_crc0, zmm_crc1, zmm_crc2, zmm_crc3;
    __m512i z0, z1, z2, z3;
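    /* CRC folding constants: zmm_fold4 advances the CRC by 4 x 128 bits
     * (one 64-byte block), zmm_fold16 by 16 x 128 bits (one 256-byte block),
     * matching the distances folded below. */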
    const __m512i zmm_fold4 = _mm512_set4_epi32(
        0x00000001, 0x54442bd4, 0x00000001, 0xc6e41596);
    const __m512i zmm_fold16 = _mm512_set4_epi32(
        0x00000001, 0x1542778a, 0x00000001, 0x322d1430);

    // zmm register init
    zmm_crc0 = _mm512_setzero_si512();
    zmm_t0 = _mm512_loadu_si512((__m512i *)src);
    zmm_crc1 = _mm512_loadu_si512((__m512i *)src + 1);
    zmm_crc2 = _mm512_loadu_si512((__m512i *)src + 2);
    zmm_crc3 = _mm512_loadu_si512((__m512i *)src + 3);

    /* We already have an intermediate CRC in the four xmm registers;
     * gather them into zmm_crc0 and fold it by 4 x 128 bits into the
     * first block of data. */
    zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc0, 0);
    zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc1, 1);
    zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc2, 2);
    zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc3, 3);
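    /* One fold step per 128-bit lane: imm 0x01 multiplies the high 64 bits of
     * the CRC lane by the low 64 bits of the fold constant, imm 0x10 the low
     * 64 bits by the high constant; XORing the two products and the incoming
     * data completes the fold. */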
    z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
    zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
    zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0);
    zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_t0);

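    /* The first 256 input bytes are copied to dst unchanged; zmm_crc1..3
     * still hold the raw loads at this point and double as CRC accumulators. */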
    _mm512_storeu_si512((__m512i *)dst, zmm_t0);
    _mm512_storeu_si512((__m512i *)dst + 1, zmm_crc1);
    _mm512_storeu_si512((__m512i *)dst + 2, zmm_crc2);
    _mm512_storeu_si512((__m512i *)dst + 3, zmm_crc3);
    len -= 256;
    src += 256;
    dst += 256;

    // fold-16 loop: each iteration folds the four accumulators forward by
    // 256 bytes while copying the input block to dst
    while (len >= 256) {
        zmm_t0 = _mm512_loadu_si512((__m512i *)src);
        zmm_t1 = _mm512_loadu_si512((__m512i *)src + 1);
        zmm_t2 = _mm512_loadu_si512((__m512i *)src + 2);
        zmm_t3 = _mm512_loadu_si512((__m512i *)src + 3);

        z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold16, 0x01);
        z1 = _mm512_clmulepi64_epi128(zmm_crc1, zmm_fold16, 0x01);
        z2 = _mm512_clmulepi64_epi128(zmm_crc2, zmm_fold16, 0x01);
        z3 = _mm512_clmulepi64_epi128(zmm_crc3, zmm_fold16, 0x01);

        zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold16, 0x10);
        zmm_crc1 = _mm512_clmulepi64_epi128(zmm_crc1, zmm_fold16, 0x10);
        zmm_crc2 = _mm512_clmulepi64_epi128(zmm_crc2, zmm_fold16, 0x10);
        zmm_crc3 = _mm512_clmulepi64_epi128(zmm_crc3, zmm_fold16, 0x10);

        zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0);
        zmm_crc1 = _mm512_xor_si512(z1, zmm_crc1);
        zmm_crc2 = _mm512_xor_si512(z2, zmm_crc2);
        zmm_crc3 = _mm512_xor_si512(z3, zmm_crc3);

        zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_t0);
        zmm_crc1 = _mm512_xor_si512(zmm_crc1, zmm_t1);
        zmm_crc2 = _mm512_xor_si512(zmm_crc2, zmm_t2);
        zmm_crc3 = _mm512_xor_si512(zmm_crc3, zmm_t3);

        _mm512_storeu_si512((__m512i *)dst, zmm_t0);
        _mm512_storeu_si512((__m512i *)dst + 1, zmm_t1);
        _mm512_storeu_si512((__m512i *)dst + 2, zmm_t2);
        _mm512_storeu_si512((__m512i *)dst + 3, zmm_t3);
        len -= 256;
        src += 256;
        dst += 256;
    }
    // reduce zmm_crc[0,1,2,3] into zmm_crc0 with three 4 x 128-bit folds
    z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
    zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
    zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0);
    zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_crc1);

    z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
    zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
    zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0);
    zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_crc2);

    z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
    zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
    zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0);
    zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_crc3);

    // scatter zmm_crc0 back into xmm_crc[0, 1, 2, 3]
    *xmm_crc0 = _mm512_extracti32x4_epi32(zmm_crc0, 0);
    *xmm_crc1 = _mm512_extracti32x4_epi32(zmm_crc0, 1);
    *xmm_crc2 = _mm512_extracti32x4_epi32(zmm_crc0, 2);
    *xmm_crc3 = _mm512_extracti32x4_epi32(zmm_crc0, 3);

    return (len_tmp - len);  // number of bytes processed
}

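/* Same folding as fold_16_vpclmulqdq, but without copying the input to a
 * destination buffer.  When `first` is nonzero, init_crc is XORed into the
 * first 64 bytes of input before folding (see XOR_INITIAL above). */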
size_t fold_16_vpclmulqdq_nocp(__m128i *xmm_crc0, __m128i *xmm_crc1,
    __m128i *xmm_crc2, __m128i *xmm_crc3, const uint8_t *src, size_t len,
    __m128i init_crc, int32_t first) {
    size_t len_tmp = len;
    __m512i zmm_t0, zmm_t1, zmm_t2, zmm_t3;
    __m512i zmm_crc0, zmm_crc1, zmm_crc2, zmm_crc3;
    __m512i z0, z1, z2, z3;
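    /* initial CRC, zero-extended into the low 128-bit lane of a 512-bit register */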
    __m512i zmm_initial = _mm512_zextsi128_si512(init_crc);
    const __m512i zmm_fold4 = _mm512_set4_epi32(
        0x00000001, 0x54442bd4, 0x00000001, 0xc6e41596);
    const __m512i zmm_fold16 = _mm512_set4_epi32(
        0x00000001, 0x1542778a, 0x00000001, 0x322d1430);

    // zmm register init
    zmm_crc0 = _mm512_setzero_si512();
    zmm_t0 = _mm512_loadu_si512((__m512i *)src);
    XOR_INITIAL(zmm_t0);
    zmm_crc1 = _mm512_loadu_si512((__m512i *)src + 1);
    zmm_crc2 = _mm512_loadu_si512((__m512i *)src + 2);
    zmm_crc3 = _mm512_loadu_si512((__m512i *)src + 3);

    /* We already have an intermediate CRC in the four xmm registers;
     * gather them into zmm_crc0 and fold it by 4 x 128 bits into the
     * first block of data. */
    zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc0, 0);
    zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc1, 1);
    zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc2, 2);
    zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc3, 3);
    z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
    zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
    zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0);
    zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_t0);

    len -= 256;
    src += 256;

    // fold-16 loop: each iteration folds the four accumulators forward by
    // 256 bytes of input
    while (len >= 256) {
        zmm_t0 = _mm512_loadu_si512((__m512i *)src);
        zmm_t1 = _mm512_loadu_si512((__m512i *)src + 1);
        zmm_t2 = _mm512_loadu_si512((__m512i *)src + 2);
        zmm_t3 = _mm512_loadu_si512((__m512i *)src + 3);

        z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold16, 0x01);
        z1 = _mm512_clmulepi64_epi128(zmm_crc1, zmm_fold16, 0x01);
        z2 = _mm512_clmulepi64_epi128(zmm_crc2, zmm_fold16, 0x01);
        z3 = _mm512_clmulepi64_epi128(zmm_crc3, zmm_fold16, 0x01);

        zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold16, 0x10);
        zmm_crc1 = _mm512_clmulepi64_epi128(zmm_crc1, zmm_fold16, 0x10);
        zmm_crc2 = _mm512_clmulepi64_epi128(zmm_crc2, zmm_fold16, 0x10);
        zmm_crc3 = _mm512_clmulepi64_epi128(zmm_crc3, zmm_fold16, 0x10);

        zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0);
        zmm_crc1 = _mm512_xor_si512(z1, zmm_crc1);
        zmm_crc2 = _mm512_xor_si512(z2, zmm_crc2);
        zmm_crc3 = _mm512_xor_si512(z3, zmm_crc3);

        zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_t0);
        zmm_crc1 = _mm512_xor_si512(zmm_crc1, zmm_t1);
        zmm_crc2 = _mm512_xor_si512(zmm_crc2, zmm_t2);
        zmm_crc3 = _mm512_xor_si512(zmm_crc3, zmm_t3);

        len -= 256;
        src += 256;
    }
    // reduce zmm_crc[0,1,2,3] into zmm_crc0 with three 4 x 128-bit folds
    z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
    zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
    zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0);
    zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_crc1);

    z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
    zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
    zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0);
    zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_crc2);

    z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
    zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
    zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0);
    zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_crc3);

    // scatter zmm_crc0 back into xmm_crc[0, 1, 2, 3]
    *xmm_crc0 = _mm512_extracti32x4_epi32(zmm_crc0, 0);
    *xmm_crc1 = _mm512_extracti32x4_epi32(zmm_crc0, 1);
    *xmm_crc2 = _mm512_extracti32x4_epi32(zmm_crc0, 2);
    *xmm_crc3 = _mm512_extracti32x4_epi32(zmm_crc0, 3);

    return (len_tmp - len);  // number of bytes processed
}
#endif