/* adler32_avx512_vnni.c -- compute the Adler-32 checksum of a data stream
 * Based on Brian Bockelman's AVX2 version
 * Copyright (C) 1995-2011 Mark Adler
 * Authors:
 *   Adam Stylinski <[email protected]>
 *   Brian Bockelman <[email protected]>
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#ifdef X86_AVX512VNNI_ADLER32

#include "../../zbuild.h"
#include "../../adler32_p.h"
#include "../../cpu_features.h"
#include "../../fallback_builtins.h"
#include <immintrin.h>
#include "../../adler32_fold.h"
#include "adler32_avx512_p.h"
#include "adler32_avx2_p.h"

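/*
 * Note on the approach used below: the Adler-32 state is kept as its two
 * 16-bit halves, adler0 (the running byte sum) and adler1 (sum2, the running
 * sum of adler0 values), both reduced mod BASE. Per 64-byte block, the byte
 * sums for adler0 are accumulated with VPSADBW against a zero vector, the
 * position-weighted part of sum2 comes from the VNNI dot product VPDPBUSD
 * with the descending weights in dot2v, and the "64 * previous adler0" part
 * of sum2 is tracked in vs3 and folded in after the inner loop.
 */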
Z_INTERNAL uint32_t adler32_avx512_vnni(uint32_t adler, const uint8_t *src, size_t len) {
    if (src == NULL) return 1L;
    if (len == 0) return adler;

    uint32_t adler0, adler1;
    adler1 = (adler >> 16) & 0xffff;
    adler0 = adler & 0xffff;

rem_peel:
    if (len < 32)
#if defined(X86_SSSE3_ADLER32)
        return adler32_ssse3(adler, src, len);
#else
        return adler32_len_16(adler0, src, len, adler1);
#endif

    if (len < 64)
#ifdef X86_AVX2_ADLER32
        return adler32_avx2(adler, src, len);
#elif defined(X86_SSSE3_ADLER32)
        return adler32_ssse3(adler, src, len);
#else
        return adler32_len_16(adler0, src, len, adler1);
#endif

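    /* Descending weights 64..1: per 64-byte block, VPDPBUSD with these weights
     * yields sum((64 - i) * src[i]) for i = 0..63, the position-weighted part of sum2. */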
    const __m512i dot2v = _mm512_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
                                          20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37,
                                          38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55,
                                          56, 57, 58, 59, 60, 61, 62, 63, 64);

    const __m512i zero = _mm512_setzero_si512();
    __m512i vs1, vs2;

    while (len >= 64) {
        vs1 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler0));
        vs2 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler1));
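        /* NMAX is the largest number of bytes that can be summed before the
         * 32-bit accumulators risk overflowing and must be reduced mod BASE;
         * round the chunk down to a whole number of 64-byte blocks. */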
        size_t k = MIN(len, NMAX);
        k -= k % 64;
        len -= k;
        __m512i vs1_0 = vs1;
        __m512i vs3 = _mm512_setzero_si512();
        /* We might get a tad bit more ILP here if we sum to a second register in the loop */
        __m512i vs2_1 = _mm512_setzero_si512();
        __m512i vbuf0, vbuf1;

        /* Remainder peeling: if k is an odd number of 64-byte blocks, process one
         * block here so the main loop below can stay unrolled by 2 */
        if (k % 128) {
            vbuf1 = _mm512_loadu_si512((__m512i*)src);

            src += 64;
            k -= 64;

            __m512i vs1_sad = _mm512_sad_epu8(vbuf1, zero);
            vs1 = _mm512_add_epi32(vs1, vs1_sad);
            vs3 = _mm512_add_epi32(vs3, vs1_0);
            vs2 = _mm512_dpbusd_epi32(vs2, vbuf1, dot2v);
            vs1_0 = vs1;
        }

        /* Manually unrolled this loop by 2 for a decent amount of ILP */
        while (k >= 128) {
            /*
               vs1 = adler + sum(c[i])
               vs2 = sum2 + 64 vs1 + sum( (64-i+1) c[i] )
            */
            vbuf0 = _mm512_loadu_si512((__m512i*)src);
            vbuf1 = _mm512_loadu_si512((__m512i*)(src + 64));
            src += 128;
            k -= 128;

            __m512i vs1_sad = _mm512_sad_epu8(vbuf0, zero);
            vs1 = _mm512_add_epi32(vs1, vs1_sad);
            vs3 = _mm512_add_epi32(vs3, vs1_0);
            /* multiply-add, resulting in 16 ints. Fuse with sum stage from prior versions, as we now have the dp
             * instructions to eliminate them */
            vs2 = _mm512_dpbusd_epi32(vs2, vbuf0, dot2v);

            vs3 = _mm512_add_epi32(vs3, vs1);
            vs1_sad = _mm512_sad_epu8(vbuf1, zero);
            vs1 = _mm512_add_epi32(vs1, vs1_sad);
            vs2_1 = _mm512_dpbusd_epi32(vs2_1, vbuf1, dot2v);
            vs1_0 = vs1;
        }

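        /* vs3 accumulated the vs1 value seen before each 64-byte block; each of
         * those contributes 64 * vs1 to sum2, hence the shift left by 6. */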
        vs3 = _mm512_slli_epi32(vs3, 6);
        vs2 = _mm512_add_epi32(vs2, vs3);
        vs2 = _mm512_add_epi32(vs2, vs2_1);

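        /* After the SAD accumulation only the even dword lanes of vs1 can be
         * non-zero, so partial_hsum() only needs to sum those; vs2 needs a
         * full horizontal add. */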
        adler0 = partial_hsum(vs1) % BASE;
        adler1 = _mm512_reduce_add_epu32(vs2) % BASE;
    }

    adler = adler0 | (adler1 << 16);

    /* Process tail (len < 64). */
    if (len) {
        goto rem_peel;
    }

    return adler;
}

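/*
 * Same scheme as above, fused with a copy of the input from src to dst. This
 * variant works on 32-byte blocks using the 256-bit (AVX512-VL) forms of the
 * same instructions.
 */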
Z_INTERNAL uint32_t adler32_fold_copy_avx512_vnni(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
    if (src == NULL) return 1L;
    if (len == 0) return adler;

    uint32_t adler0, adler1;
    adler1 = (adler >> 16) & 0xffff;
    adler0 = adler & 0xffff;

rem_peel_copy:
    if (len < 32) {
        /* This handles the remaining copy; just call the normal adler checksum after this */
        __mmask32 storemask = (0xFFFFFFFFUL >> (32 - len));
        __m256i copy_vec = _mm256_maskz_loadu_epi8(storemask, src);
        _mm256_mask_storeu_epi8(dst, storemask, copy_vec);

#if defined(X86_SSSE3_ADLER32)
        return adler32_ssse3(adler, src, len);
#else
        return adler32_len_16(adler0, src, len, adler1);
#endif
    }

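    /* Descending weights 32..1: per 32-byte block, VPDPBUSD with these weights
     * yields sum((32 - i) * src[i]) for i = 0..31, the position-weighted part of sum2. */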
    const __m256i dot2v = _mm256_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
                                          20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32);

    const __m256i zero = _mm256_setzero_si256();
    __m256i vs1, vs2;

    while (len >= 32) {
        vs1 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler0));
        vs2 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler1));
        size_t k = MIN(len, NMAX);
        k -= k % 32;
        len -= k;
        __m256i vs1_0 = vs1;
        __m256i vs3 = _mm256_setzero_si256();
        /* We might get a tad bit more ILP here if we sum to a second register in the loop */
        __m256i vs2_1 = _mm256_setzero_si256();
        __m256i vbuf0, vbuf1;

        /* Remainder peeling: if k is an odd number of 32-byte blocks, process one
         * block here so the main loop below can stay unrolled by 2 */
        if (k % 64) {
            vbuf1 = _mm256_loadu_si256((__m256i*)src);
            _mm256_storeu_si256((__m256i*)dst, vbuf1);
            dst += 32;

            src += 32;
            k -= 32;

            __m256i vs1_sad = _mm256_sad_epu8(vbuf1, zero);
            vs1 = _mm256_add_epi32(vs1, vs1_sad);
            vs3 = _mm256_add_epi32(vs3, vs1_0);
            vs2 = _mm256_dpbusd_epi32(vs2, vbuf1, dot2v);
            vs1_0 = vs1;
        }

        /* Manually unrolled this loop by 2 for a decent amount of ILP */
        while (k >= 64) {
            /*
               vs1 = adler + sum(c[i])
               vs2 = sum2 + 32 vs1 + sum( (32-i+1) c[i] )
            */
            vbuf0 = _mm256_loadu_si256((__m256i*)src);
            vbuf1 = _mm256_loadu_si256((__m256i*)(src + 32));
            _mm256_storeu_si256((__m256i*)dst, vbuf0);
            _mm256_storeu_si256((__m256i*)(dst + 32), vbuf1);
            dst += 64;
            src += 64;
            k -= 64;

            __m256i vs1_sad = _mm256_sad_epu8(vbuf0, zero);
            vs1 = _mm256_add_epi32(vs1, vs1_sad);
            vs3 = _mm256_add_epi32(vs3, vs1_0);
            /* multiply-add, resulting in 8 ints. Fuse with sum stage from prior versions, as we now have the dp
             * instructions to eliminate them */
            vs2 = _mm256_dpbusd_epi32(vs2, vbuf0, dot2v);

            vs3 = _mm256_add_epi32(vs3, vs1);
            vs1_sad = _mm256_sad_epu8(vbuf1, zero);
            vs1 = _mm256_add_epi32(vs1, vs1_sad);
            vs2_1 = _mm256_dpbusd_epi32(vs2_1, vbuf1, dot2v);
            vs1_0 = vs1;
        }

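        /* Each 32-byte block contributes 32 * (the vs1 value before that block)
         * to sum2; vs3 accumulated those values, so scale by 32 (shift left by 5). */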
        vs3 = _mm256_slli_epi32(vs3, 5);
        vs2 = _mm256_add_epi32(vs2, vs3);
        vs2 = _mm256_add_epi32(vs2, vs2_1);

        adler0 = partial_hsum256(vs1) % BASE;
        adler1 = hsum256(vs2) % BASE;
    }

    adler = adler0 | (adler1 << 16);

    /* Process tail (len < 32). */
    if (len) {
        goto rem_peel_copy;
    }

    return adler;
}

#endif