xref: /aosp_15_r20/external/zlib/patches/0011-avx512.patch (revision 86ee64e75fa5f8bce2c8c356138035642429cd05)
1*86ee64e7SAndroid Build Coastguard WorkerFrom 87fc8e3e38323cfdabf8da3927488e3e57073b02 Mon Sep 17 00:00:00 2001
2*86ee64e7SAndroid Build Coastguard WorkerFrom: Jia Liu <[email protected]>
3*86ee64e7SAndroid Build Coastguard WorkerDate: Thu, 30 Mar 2023 11:13:16 +0800
4*86ee64e7SAndroid Build Coastguard WorkerSubject: [PATCH] Enabled AVX512 for CRC32
5*86ee64e7SAndroid Build Coastguard Worker
6*86ee64e7SAndroid Build Coastguard WorkerEnabled AVX512 for CRC32 that provide best of known performance
7*86ee64e7SAndroid Build Coastguard Workerbeyond current SSE SIMD optimization. It enables multiple folding
8*86ee64e7SAndroid Build Coastguard Workeroperations and AVX512 new instructions, providing ~3.5X CRC32
9*86ee64e7SAndroid Build Coastguard Workerperformance and ~3.7% gain on Zlib_bench gzip performance.
10*86ee64e7SAndroid Build Coastguard Worker---
11*86ee64e7SAndroid Build Coastguard Worker CMakeLists.txt |   8 +-
12*86ee64e7SAndroid Build Coastguard Worker cpu_features.c |   9 +++
13*86ee64e7SAndroid Build Coastguard Worker cpu_features.h |   1 +
14*86ee64e7SAndroid Build Coastguard Worker crc32.c        |  14 +++-
15*86ee64e7SAndroid Build Coastguard Worker crc32_simd.c   | 198 ++++++++++++++++++++++++++++++++++++++++++++++++-
16*86ee64e7SAndroid Build Coastguard Worker crc32_simd.h   |   6 ++
17*86ee64e7SAndroid Build Coastguard Worker 6 files changed, 230 insertions(+), 6 deletions(-)
18*86ee64e7SAndroid Build Coastguard Worker
19*86ee64e7SAndroid Build Coastguard Workerdiff --git a/CMakeLists.txt b/CMakeLists.txt
20*86ee64e7SAndroid Build Coastguard Workerindex f06e193..d45b902 100644
21*86ee64e7SAndroid Build Coastguard Worker--- a/CMakeLists.txt
22*86ee64e7SAndroid Build Coastguard Worker+++ b/CMakeLists.txt
23*86ee64e7SAndroid Build Coastguard Worker@@ -22,6 +22,7 @@ check_include_file(stdint.h    HAVE_STDINT_H)
24*86ee64e7SAndroid Build Coastguard Worker check_include_file(stddef.h    HAVE_STDDEF_H)
25*86ee64e7SAndroid Build Coastguard Worker
26*86ee64e7SAndroid Build Coastguard Worker option(ENABLE_SIMD_OPTIMIZATIONS "Enable all SIMD optimizations" OFF)
27*86ee64e7SAndroid Build Coastguard Worker+option(ENABLE_SIMD_AVX512 "Enable SIMD AXV512 optimizations" OFF)
28*86ee64e7SAndroid Build Coastguard Worker
29*86ee64e7SAndroid Build Coastguard Worker # TODO(cavalcantii): add support for other OSes (e.g. Android, fuchsia, osx)
30*86ee64e7SAndroid Build Coastguard Worker # and architectures (e.g. Arm).
31*86ee64e7SAndroid Build Coastguard Worker@@ -30,8 +31,13 @@ if (ENABLE_SIMD_OPTIMIZATIONS)
32*86ee64e7SAndroid Build Coastguard Worker    add_definitions(-DADLER32_SIMD_SSSE3)
33*86ee64e7SAndroid Build Coastguard Worker    add_definitions(-DINFLATE_CHUNK_READ_64LE)
34*86ee64e7SAndroid Build Coastguard Worker    add_definitions(-DCRC32_SIMD_SSE42_PCLMUL)
35*86ee64e7SAndroid Build Coastguard Worker+   if (ENABLE_SIMD_AVX512)
36*86ee64e7SAndroid Build Coastguard Worker+    add_definitions(-DCRC32_SIMD_AVX512_PCLMUL)
37*86ee64e7SAndroid Build Coastguard Worker+    add_compile_options(-mvpclmulqdq -msse2 -mavx512f -mpclmul)
38*86ee64e7SAndroid Build Coastguard Worker+   else()
39*86ee64e7SAndroid Build Coastguard Worker+    add_compile_options(-msse4.2 -mpclmul)
40*86ee64e7SAndroid Build Coastguard Worker+   endif()
41*86ee64e7SAndroid Build Coastguard Worker    add_definitions(-DDEFLATE_SLIDE_HASH_SSE2)
42*86ee64e7SAndroid Build Coastguard Worker-   add_compile_options(-msse4.2 -mpclmul)
43*86ee64e7SAndroid Build Coastguard Worker    # Required by CPU features detection code.
44*86ee64e7SAndroid Build Coastguard Worker    add_definitions(-DX86_NOT_WINDOWS)
45*86ee64e7SAndroid Build Coastguard Worker    # Apparently some environments (e.g. CentOS) require to explicitly link
46*86ee64e7SAndroid Build Coastguard Workerdiff --git a/cpu_features.c b/cpu_features.c
47*86ee64e7SAndroid Build Coastguard Workerindex 877d5f2..ac6ee88 100644
48*86ee64e7SAndroid Build Coastguard Worker--- a/cpu_features.c
49*86ee64e7SAndroid Build Coastguard Worker+++ b/cpu_features.c
50*86ee64e7SAndroid Build Coastguard Worker@@ -31,6 +31,7 @@ int ZLIB_INTERNAL arm_cpu_enable_pmull = 0;
51*86ee64e7SAndroid Build Coastguard Worker int ZLIB_INTERNAL x86_cpu_enable_sse2 = 0;
52*86ee64e7SAndroid Build Coastguard Worker int ZLIB_INTERNAL x86_cpu_enable_ssse3 = 0;
53*86ee64e7SAndroid Build Coastguard Worker int ZLIB_INTERNAL x86_cpu_enable_simd = 0;
54*86ee64e7SAndroid Build Coastguard Worker+int ZLIB_INTERNAL x86_cpu_enable_avx512 = 0;
55*86ee64e7SAndroid Build Coastguard Worker
56*86ee64e7SAndroid Build Coastguard Worker #ifndef CPU_NO_SIMD
57*86ee64e7SAndroid Build Coastguard Worker
58*86ee64e7SAndroid Build Coastguard Worker@@ -138,6 +139,10 @@ static void _cpu_check_features(void)
59*86ee64e7SAndroid Build Coastguard Worker /* On x86 we simply use a instruction to check the CPU features.
60*86ee64e7SAndroid Build Coastguard Worker  * (i.e. CPUID).
61*86ee64e7SAndroid Build Coastguard Worker  */
62*86ee64e7SAndroid Build Coastguard Worker+#ifdef CRC32_SIMD_AVX512_PCLMUL
63*86ee64e7SAndroid Build Coastguard Worker+#include <immintrin.h>
64*86ee64e7SAndroid Build Coastguard Worker+#include <xsaveintrin.h>
65*86ee64e7SAndroid Build Coastguard Worker+#endif
66*86ee64e7SAndroid Build Coastguard Worker static void _cpu_check_features(void)
67*86ee64e7SAndroid Build Coastguard Worker {
68*86ee64e7SAndroid Build Coastguard Worker     int x86_cpu_has_sse2;
69*86ee64e7SAndroid Build Coastguard Worker@@ -164,6 +169,10 @@ static void _cpu_check_features(void)
70*86ee64e7SAndroid Build Coastguard Worker     x86_cpu_enable_simd = x86_cpu_has_sse2 &&
71*86ee64e7SAndroid Build Coastguard Worker                           x86_cpu_has_sse42 &&
72*86ee64e7SAndroid Build Coastguard Worker                           x86_cpu_has_pclmulqdq;
73*86ee64e7SAndroid Build Coastguard Worker+
74*86ee64e7SAndroid Build Coastguard Worker+#ifdef CRC32_SIMD_AVX512_PCLMUL
75*86ee64e7SAndroid Build Coastguard Worker+    x86_cpu_enable_avx512 = _xgetbv(0) & 0x00000040;
76*86ee64e7SAndroid Build Coastguard Worker+#endif
77*86ee64e7SAndroid Build Coastguard Worker }
78*86ee64e7SAndroid Build Coastguard Worker #endif
79*86ee64e7SAndroid Build Coastguard Worker #endif
80*86ee64e7SAndroid Build Coastguard Workerdiff --git a/cpu_features.h b/cpu_features.h
81*86ee64e7SAndroid Build Coastguard Workerindex 279246c..aed3e83 100644
82*86ee64e7SAndroid Build Coastguard Worker--- a/cpu_features.h
83*86ee64e7SAndroid Build Coastguard Worker+++ b/cpu_features.h
84*86ee64e7SAndroid Build Coastguard Worker@@ -14,5 +14,6 @@ extern int arm_cpu_enable_pmull;
85*86ee64e7SAndroid Build Coastguard Worker extern int x86_cpu_enable_sse2;
86*86ee64e7SAndroid Build Coastguard Worker extern int x86_cpu_enable_ssse3;
87*86ee64e7SAndroid Build Coastguard Worker extern int x86_cpu_enable_simd;
88*86ee64e7SAndroid Build Coastguard Worker+extern int x86_cpu_enable_avx512;
89*86ee64e7SAndroid Build Coastguard Worker
90*86ee64e7SAndroid Build Coastguard Worker void cpu_check_features(void);
91*86ee64e7SAndroid Build Coastguard Workerdiff --git a/crc32.c b/crc32.c
92*86ee64e7SAndroid Build Coastguard Workerindex 4486098..acb6972 100644
93*86ee64e7SAndroid Build Coastguard Worker--- a/crc32.c
94*86ee64e7SAndroid Build Coastguard Worker+++ b/crc32.c
95*86ee64e7SAndroid Build Coastguard Worker@@ -773,7 +773,19 @@ unsigned long ZEXPORT crc32_z(crc, buf, len)
96*86ee64e7SAndroid Build Coastguard Worker     }
97*86ee64e7SAndroid Build Coastguard Worker
98*86ee64e7SAndroid Build Coastguard Worker #endif
99*86ee64e7SAndroid Build Coastguard Worker-#if defined(CRC32_SIMD_SSE42_PCLMUL)
100*86ee64e7SAndroid Build Coastguard Worker+#if defined(CRC32_SIMD_AVX512_PCLMUL)
101*86ee64e7SAndroid Build Coastguard Worker+    if (x86_cpu_enable_avx512 && len >= Z_CRC32_AVX512_MINIMUM_LENGTH) {
102*86ee64e7SAndroid Build Coastguard Worker+        /* crc32 64-byte chunks */
103*86ee64e7SAndroid Build Coastguard Worker+        z_size_t chunk_size = len & ~Z_CRC32_AVX512_CHUNKSIZE_MASK;
104*86ee64e7SAndroid Build Coastguard Worker+        crc = ~crc32_avx512_simd_(buf, chunk_size, ~(uint32_t)crc);
105*86ee64e7SAndroid Build Coastguard Worker+        /* check remaining data */
106*86ee64e7SAndroid Build Coastguard Worker+        len -= chunk_size;
107*86ee64e7SAndroid Build Coastguard Worker+        if (!len)
108*86ee64e7SAndroid Build Coastguard Worker+            return crc;
109*86ee64e7SAndroid Build Coastguard Worker+        /* Fall into the default crc32 for the remaining data. */
110*86ee64e7SAndroid Build Coastguard Worker+        buf += chunk_size;
111*86ee64e7SAndroid Build Coastguard Worker+    }
112*86ee64e7SAndroid Build Coastguard Worker+#elif defined(CRC32_SIMD_SSE42_PCLMUL)
113*86ee64e7SAndroid Build Coastguard Worker     if (x86_cpu_enable_simd && len >= Z_CRC32_SSE42_MINIMUM_LENGTH) {
114*86ee64e7SAndroid Build Coastguard Worker         /* crc32 16-byte chunks */
115*86ee64e7SAndroid Build Coastguard Worker         z_size_t chunk_size = len & ~Z_CRC32_SSE42_CHUNKSIZE_MASK;
116*86ee64e7SAndroid Build Coastguard Workerdiff --git a/crc32_simd.c b/crc32_simd.c
117*86ee64e7SAndroid Build Coastguard Workerindex d80beba..7428270 100644
118*86ee64e7SAndroid Build Coastguard Worker--- a/crc32_simd.c
119*86ee64e7SAndroid Build Coastguard Worker+++ b/crc32_simd.c
120*86ee64e7SAndroid Build Coastguard Worker@@ -6,17 +6,207 @@
121*86ee64e7SAndroid Build Coastguard Worker  */
122*86ee64e7SAndroid Build Coastguard Worker
123*86ee64e7SAndroid Build Coastguard Worker #include "crc32_simd.h"
124*86ee64e7SAndroid Build Coastguard Worker-
125*86ee64e7SAndroid Build Coastguard Worker-#if defined(CRC32_SIMD_SSE42_PCLMUL)
126*86ee64e7SAndroid Build Coastguard Worker+#if defined(CRC32_SIMD_AVX512_PCLMUL)
127*86ee64e7SAndroid Build Coastguard Worker
128*86ee64e7SAndroid Build Coastguard Worker /*
129*86ee64e7SAndroid Build Coastguard Worker- * crc32_sse42_simd_(): compute the crc32 of the buffer, where the buffer
130*86ee64e7SAndroid Build Coastguard Worker- * length must be at least 64, and a multiple of 16. Based on:
131*86ee64e7SAndroid Build Coastguard Worker+ * crc32_avx512_simd_(): compute the crc32 of the buffer, where the buffer
132*86ee64e7SAndroid Build Coastguard Worker+ * length must be at least 256, and a multiple of 64. Based on:
133*86ee64e7SAndroid Build Coastguard Worker  *
134*86ee64e7SAndroid Build Coastguard Worker  * "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
135*86ee64e7SAndroid Build Coastguard Worker  *  V. Gopal, E. Ozturk, et al., 2009, http://intel.ly/2ySEwL0
136*86ee64e7SAndroid Build Coastguard Worker  */
137*86ee64e7SAndroid Build Coastguard Worker
138*86ee64e7SAndroid Build Coastguard Worker+#include <emmintrin.h>
139*86ee64e7SAndroid Build Coastguard Worker+#include <smmintrin.h>
140*86ee64e7SAndroid Build Coastguard Worker+#include <wmmintrin.h>
141*86ee64e7SAndroid Build Coastguard Worker+#include <immintrin.h>
142*86ee64e7SAndroid Build Coastguard Worker+
143*86ee64e7SAndroid Build Coastguard Worker+uint32_t ZLIB_INTERNAL crc32_avx512_simd_(  /* AVX512+PCLMUL */
144*86ee64e7SAndroid Build Coastguard Worker+    const unsigned char *buf,
145*86ee64e7SAndroid Build Coastguard Worker+    z_size_t len,
146*86ee64e7SAndroid Build Coastguard Worker+    uint32_t crc)
147*86ee64e7SAndroid Build Coastguard Worker+{
148*86ee64e7SAndroid Build Coastguard Worker+    /*
149*86ee64e7SAndroid Build Coastguard Worker+     * Definitions of the bit-reflected domain constants k1,k2,k3,k4
150*86ee64e7SAndroid Build Coastguard Worker+     * are similar to those given at the end of the paper, and remaining
151*86ee64e7SAndroid Build Coastguard Worker+     * constants and CRC32+Barrett polynomials remain unchanged.
152*86ee64e7SAndroid Build Coastguard Worker+     *
153*86ee64e7SAndroid Build Coastguard Worker+     * Replace the index of x from 128 to 512. As follows:
154*86ee64e7SAndroid Build Coastguard Worker+     * k1 = ( x ^ ( 512 * 4 + 32 ) mod P(x) << 32 )' << 1 = 0x011542778a
155*86ee64e7SAndroid Build Coastguard Worker+     * k2 = ( x ^ ( 512 * 4 - 32 ) mod P(x) << 32 )' << 1 = 0x01322d1430
156*86ee64e7SAndroid Build Coastguard Worker+     * k3 = ( x ^ ( 512 + 32 ) mod P(x) << 32 )' << 1 = 0x0154442bd4
157*86ee64e7SAndroid Build Coastguard Worker+     * k4 = ( x ^ ( 512 - 32 ) mod P(x) << 32 )' << 1 = 0x01c6e41596
158*86ee64e7SAndroid Build Coastguard Worker+     */
159*86ee64e7SAndroid Build Coastguard Worker+    static const uint64_t zalign(64) k1k2[] = { 0x011542778a, 0x01322d1430,
160*86ee64e7SAndroid Build Coastguard Worker+                                                0x011542778a, 0x01322d1430,
161*86ee64e7SAndroid Build Coastguard Worker+                                                0x011542778a, 0x01322d1430,
162*86ee64e7SAndroid Build Coastguard Worker+                                                0x011542778a, 0x01322d1430 };
163*86ee64e7SAndroid Build Coastguard Worker+    static const uint64_t zalign(64) k3k4[] = { 0x0154442bd4, 0x01c6e41596,
164*86ee64e7SAndroid Build Coastguard Worker+                                                0x0154442bd4, 0x01c6e41596,
165*86ee64e7SAndroid Build Coastguard Worker+                                                0x0154442bd4, 0x01c6e41596,
166*86ee64e7SAndroid Build Coastguard Worker+                                                0x0154442bd4, 0x01c6e41596 };
167*86ee64e7SAndroid Build Coastguard Worker+    static const uint64_t zalign(16) k5k6[] = { 0x01751997d0, 0x00ccaa009e };
168*86ee64e7SAndroid Build Coastguard Worker+    static const uint64_t zalign(16) k7k8[] = { 0x0163cd6124, 0x0000000000 };
169*86ee64e7SAndroid Build Coastguard Worker+    static const uint64_t zalign(16) poly[] = { 0x01db710641, 0x01f7011641 };
170*86ee64e7SAndroid Build Coastguard Worker+    __m512i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8;
171*86ee64e7SAndroid Build Coastguard Worker+    __m128i a0, a1, a2, a3;
172*86ee64e7SAndroid Build Coastguard Worker+
173*86ee64e7SAndroid Build Coastguard Worker+    /*
174*86ee64e7SAndroid Build Coastguard Worker+     * There's at least one block of 256.
175*86ee64e7SAndroid Build Coastguard Worker+     */
176*86ee64e7SAndroid Build Coastguard Worker+    x1 = _mm512_loadu_si512((__m512i *)(buf + 0x00));
177*86ee64e7SAndroid Build Coastguard Worker+    x2 = _mm512_loadu_si512((__m512i *)(buf + 0x40));
178*86ee64e7SAndroid Build Coastguard Worker+    x3 = _mm512_loadu_si512((__m512i *)(buf + 0x80));
179*86ee64e7SAndroid Build Coastguard Worker+    x4 = _mm512_loadu_si512((__m512i *)(buf + 0xC0));
180*86ee64e7SAndroid Build Coastguard Worker+
181*86ee64e7SAndroid Build Coastguard Worker+    x1 = _mm512_xor_si512(x1, _mm512_castsi128_si512(_mm_cvtsi32_si128(crc)));
182*86ee64e7SAndroid Build Coastguard Worker+
183*86ee64e7SAndroid Build Coastguard Worker+    x0 = _mm512_load_si512((__m512i *)k1k2);
184*86ee64e7SAndroid Build Coastguard Worker+
185*86ee64e7SAndroid Build Coastguard Worker+    buf += 256;
186*86ee64e7SAndroid Build Coastguard Worker+    len -= 256;
187*86ee64e7SAndroid Build Coastguard Worker+
188*86ee64e7SAndroid Build Coastguard Worker+    /*
189*86ee64e7SAndroid Build Coastguard Worker+     * Parallel fold blocks of 256, if any.
190*86ee64e7SAndroid Build Coastguard Worker+     */
191*86ee64e7SAndroid Build Coastguard Worker+    while (len >= 256)
192*86ee64e7SAndroid Build Coastguard Worker+    {
193*86ee64e7SAndroid Build Coastguard Worker+        x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
194*86ee64e7SAndroid Build Coastguard Worker+        x6 = _mm512_clmulepi64_epi128(x2, x0, 0x00);
195*86ee64e7SAndroid Build Coastguard Worker+        x7 = _mm512_clmulepi64_epi128(x3, x0, 0x00);
196*86ee64e7SAndroid Build Coastguard Worker+        x8 = _mm512_clmulepi64_epi128(x4, x0, 0x00);
197*86ee64e7SAndroid Build Coastguard Worker+
198*86ee64e7SAndroid Build Coastguard Worker+
199*86ee64e7SAndroid Build Coastguard Worker+        x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
200*86ee64e7SAndroid Build Coastguard Worker+        x2 = _mm512_clmulepi64_epi128(x2, x0, 0x11);
201*86ee64e7SAndroid Build Coastguard Worker+        x3 = _mm512_clmulepi64_epi128(x3, x0, 0x11);
202*86ee64e7SAndroid Build Coastguard Worker+        x4 = _mm512_clmulepi64_epi128(x4, x0, 0x11);
203*86ee64e7SAndroid Build Coastguard Worker+
204*86ee64e7SAndroid Build Coastguard Worker+        y5 = _mm512_loadu_si512((__m512i *)(buf + 0x00));
205*86ee64e7SAndroid Build Coastguard Worker+        y6 = _mm512_loadu_si512((__m512i *)(buf + 0x40));
206*86ee64e7SAndroid Build Coastguard Worker+        y7 = _mm512_loadu_si512((__m512i *)(buf + 0x80));
207*86ee64e7SAndroid Build Coastguard Worker+        y8 = _mm512_loadu_si512((__m512i *)(buf + 0xC0));
208*86ee64e7SAndroid Build Coastguard Worker+
209*86ee64e7SAndroid Build Coastguard Worker+        x1 = _mm512_xor_si512(x1, x5);
210*86ee64e7SAndroid Build Coastguard Worker+        x2 = _mm512_xor_si512(x2, x6);
211*86ee64e7SAndroid Build Coastguard Worker+        x3 = _mm512_xor_si512(x3, x7);
212*86ee64e7SAndroid Build Coastguard Worker+        x4 = _mm512_xor_si512(x4, x8);
213*86ee64e7SAndroid Build Coastguard Worker+
214*86ee64e7SAndroid Build Coastguard Worker+        x1 = _mm512_xor_si512(x1, y5);
215*86ee64e7SAndroid Build Coastguard Worker+        x2 = _mm512_xor_si512(x2, y6);
216*86ee64e7SAndroid Build Coastguard Worker+        x3 = _mm512_xor_si512(x3, y7);
217*86ee64e7SAndroid Build Coastguard Worker+        x4 = _mm512_xor_si512(x4, y8);
218*86ee64e7SAndroid Build Coastguard Worker+
219*86ee64e7SAndroid Build Coastguard Worker+        buf += 256;
220*86ee64e7SAndroid Build Coastguard Worker+        len -= 256;
221*86ee64e7SAndroid Build Coastguard Worker+    }
222*86ee64e7SAndroid Build Coastguard Worker+
223*86ee64e7SAndroid Build Coastguard Worker+    /*
224*86ee64e7SAndroid Build Coastguard Worker+     * Fold into 512-bits.
225*86ee64e7SAndroid Build Coastguard Worker+     */
226*86ee64e7SAndroid Build Coastguard Worker+    x0 = _mm512_load_si512((__m512i *)k3k4);
227*86ee64e7SAndroid Build Coastguard Worker+
228*86ee64e7SAndroid Build Coastguard Worker+    x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
229*86ee64e7SAndroid Build Coastguard Worker+    x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
230*86ee64e7SAndroid Build Coastguard Worker+    x1 = _mm512_xor_si512(x1, x2);
231*86ee64e7SAndroid Build Coastguard Worker+    x1 = _mm512_xor_si512(x1, x5);
232*86ee64e7SAndroid Build Coastguard Worker+
233*86ee64e7SAndroid Build Coastguard Worker+    x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
234*86ee64e7SAndroid Build Coastguard Worker+    x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
235*86ee64e7SAndroid Build Coastguard Worker+    x1 = _mm512_xor_si512(x1, x3);
236*86ee64e7SAndroid Build Coastguard Worker+    x1 = _mm512_xor_si512(x1, x5);
237*86ee64e7SAndroid Build Coastguard Worker+
238*86ee64e7SAndroid Build Coastguard Worker+    x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
239*86ee64e7SAndroid Build Coastguard Worker+    x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
240*86ee64e7SAndroid Build Coastguard Worker+    x1 = _mm512_xor_si512(x1, x4);
241*86ee64e7SAndroid Build Coastguard Worker+    x1 = _mm512_xor_si512(x1, x5);
242*86ee64e7SAndroid Build Coastguard Worker+
243*86ee64e7SAndroid Build Coastguard Worker+    /*
244*86ee64e7SAndroid Build Coastguard Worker+     * Single fold blocks of 64, if any.
245*86ee64e7SAndroid Build Coastguard Worker+     */
246*86ee64e7SAndroid Build Coastguard Worker+    while (len >= 64)
247*86ee64e7SAndroid Build Coastguard Worker+    {
248*86ee64e7SAndroid Build Coastguard Worker+        x2 = _mm512_loadu_si512((__m512i *)buf);
249*86ee64e7SAndroid Build Coastguard Worker+
250*86ee64e7SAndroid Build Coastguard Worker+        x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
251*86ee64e7SAndroid Build Coastguard Worker+        x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
252*86ee64e7SAndroid Build Coastguard Worker+        x1 = _mm512_xor_si512(x1, x2);
253*86ee64e7SAndroid Build Coastguard Worker+        x1 = _mm512_xor_si512(x1, x5);
254*86ee64e7SAndroid Build Coastguard Worker+
255*86ee64e7SAndroid Build Coastguard Worker+        buf += 64;
256*86ee64e7SAndroid Build Coastguard Worker+        len -= 64;
257*86ee64e7SAndroid Build Coastguard Worker+    }
258*86ee64e7SAndroid Build Coastguard Worker+
259*86ee64e7SAndroid Build Coastguard Worker+    /*
260*86ee64e7SAndroid Build Coastguard Worker+     * Fold 512-bits to 384-bits.
261*86ee64e7SAndroid Build Coastguard Worker+     */
262*86ee64e7SAndroid Build Coastguard Worker+    a0 = _mm_load_si128((__m128i *)k5k6);
263*86ee64e7SAndroid Build Coastguard Worker+
264*86ee64e7SAndroid Build Coastguard Worker+    a1 = _mm512_extracti32x4_epi32(x1, 0);
265*86ee64e7SAndroid Build Coastguard Worker+    a2 = _mm512_extracti32x4_epi32(x1, 1);
266*86ee64e7SAndroid Build Coastguard Worker+
267*86ee64e7SAndroid Build Coastguard Worker+    a3 = _mm_clmulepi64_si128(a1, a0, 0x00);
268*86ee64e7SAndroid Build Coastguard Worker+    a1 = _mm_clmulepi64_si128(a1, a0, 0x11);
269*86ee64e7SAndroid Build Coastguard Worker+
270*86ee64e7SAndroid Build Coastguard Worker+    a1 = _mm_xor_si128(a1, a3);
271*86ee64e7SAndroid Build Coastguard Worker+    a1 = _mm_xor_si128(a1, a2);
272*86ee64e7SAndroid Build Coastguard Worker+
273*86ee64e7SAndroid Build Coastguard Worker+    /*
274*86ee64e7SAndroid Build Coastguard Worker+     * Fold 384-bits to 256-bits.
275*86ee64e7SAndroid Build Coastguard Worker+     */
276*86ee64e7SAndroid Build Coastguard Worker+    a2 = _mm512_extracti32x4_epi32(x1, 2);
277*86ee64e7SAndroid Build Coastguard Worker+    a3 = _mm_clmulepi64_si128(a1, a0, 0x00);
278*86ee64e7SAndroid Build Coastguard Worker+    a1 = _mm_clmulepi64_si128(a1, a0, 0x11);
279*86ee64e7SAndroid Build Coastguard Worker+    a1 = _mm_xor_si128(a1, a3);
280*86ee64e7SAndroid Build Coastguard Worker+    a1 = _mm_xor_si128(a1, a2);
281*86ee64e7SAndroid Build Coastguard Worker+
282*86ee64e7SAndroid Build Coastguard Worker+    /*
283*86ee64e7SAndroid Build Coastguard Worker+     * Fold 256-bits to 128-bits.
284*86ee64e7SAndroid Build Coastguard Worker+     */
285*86ee64e7SAndroid Build Coastguard Worker+    a2 = _mm512_extracti32x4_epi32(x1, 3);
286*86ee64e7SAndroid Build Coastguard Worker+    a3 = _mm_clmulepi64_si128(a1, a0, 0x00);
287*86ee64e7SAndroid Build Coastguard Worker+    a1 = _mm_clmulepi64_si128(a1, a0, 0x11);
288*86ee64e7SAndroid Build Coastguard Worker+    a1 = _mm_xor_si128(a1, a3);
289*86ee64e7SAndroid Build Coastguard Worker+    a1 = _mm_xor_si128(a1, a2);
290*86ee64e7SAndroid Build Coastguard Worker+
291*86ee64e7SAndroid Build Coastguard Worker+    /*
292*86ee64e7SAndroid Build Coastguard Worker+     * Fold 128-bits to 64-bits.
293*86ee64e7SAndroid Build Coastguard Worker+     */
294*86ee64e7SAndroid Build Coastguard Worker+    a2 = _mm_clmulepi64_si128(a1, a0, 0x10);
295*86ee64e7SAndroid Build Coastguard Worker+    a3 = _mm_setr_epi32(~0, 0, ~0, 0);
296*86ee64e7SAndroid Build Coastguard Worker+    a1 = _mm_srli_si128(a1, 8);
297*86ee64e7SAndroid Build Coastguard Worker+    a1 = _mm_xor_si128(a1, a2);
298*86ee64e7SAndroid Build Coastguard Worker+
299*86ee64e7SAndroid Build Coastguard Worker+    a0 = _mm_loadl_epi64((__m128i*)k7k8);
300*86ee64e7SAndroid Build Coastguard Worker+    a2 = _mm_srli_si128(a1, 4);
301*86ee64e7SAndroid Build Coastguard Worker+    a1 = _mm_and_si128(a1, a3);
302*86ee64e7SAndroid Build Coastguard Worker+    a1 = _mm_clmulepi64_si128(a1, a0, 0x00);
303*86ee64e7SAndroid Build Coastguard Worker+    a1 = _mm_xor_si128(a1, a2);
304*86ee64e7SAndroid Build Coastguard Worker+
305*86ee64e7SAndroid Build Coastguard Worker+    /*
306*86ee64e7SAndroid Build Coastguard Worker+     * Barret reduce to 32-bits.
307*86ee64e7SAndroid Build Coastguard Worker+     */
308*86ee64e7SAndroid Build Coastguard Worker+    a0 = _mm_load_si128((__m128i*)poly);
309*86ee64e7SAndroid Build Coastguard Worker+
310*86ee64e7SAndroid Build Coastguard Worker+    a2 = _mm_and_si128(a1, a3);
311*86ee64e7SAndroid Build Coastguard Worker+    a2 = _mm_clmulepi64_si128(a2, a0, 0x10);
312*86ee64e7SAndroid Build Coastguard Worker+    a2 = _mm_and_si128(a2, a3);
313*86ee64e7SAndroid Build Coastguard Worker+    a2 = _mm_clmulepi64_si128(a2, a0, 0x00);
314*86ee64e7SAndroid Build Coastguard Worker+    a1 = _mm_xor_si128(a1, a2);
315*86ee64e7SAndroid Build Coastguard Worker+
316*86ee64e7SAndroid Build Coastguard Worker+    /*
317*86ee64e7SAndroid Build Coastguard Worker+     * Return the crc32.
318*86ee64e7SAndroid Build Coastguard Worker+     */
319*86ee64e7SAndroid Build Coastguard Worker+    return _mm_extract_epi32(a1, 1);
320*86ee64e7SAndroid Build Coastguard Worker+}
321*86ee64e7SAndroid Build Coastguard Worker+
322*86ee64e7SAndroid Build Coastguard Worker+#elif defined(CRC32_SIMD_SSE42_PCLMUL)
323*86ee64e7SAndroid Build Coastguard Worker+
324*86ee64e7SAndroid Build Coastguard Worker+/*
325*86ee64e7SAndroid Build Coastguard Worker+ * crc32_sse42_simd_(): compute the crc32 of the buffer, where the buffer
326*86ee64e7SAndroid Build Coastguard Worker+ * length must be at least 64, and a multiple of 16.
327*86ee64e7SAndroid Build Coastguard Worker+ */
328*86ee64e7SAndroid Build Coastguard Worker+
329*86ee64e7SAndroid Build Coastguard Worker #include <emmintrin.h>
330*86ee64e7SAndroid Build Coastguard Worker #include <smmintrin.h>
331*86ee64e7SAndroid Build Coastguard Worker #include <wmmintrin.h>
332*86ee64e7SAndroid Build Coastguard Workerdiff --git a/crc32_simd.h b/crc32_simd.h
333*86ee64e7SAndroid Build Coastguard Workerindex c0346dc..8462464 100644
334*86ee64e7SAndroid Build Coastguard Worker--- a/crc32_simd.h
335*86ee64e7SAndroid Build Coastguard Worker+++ b/crc32_simd.h
336*86ee64e7SAndroid Build Coastguard Worker@@ -19,12 +19,18 @@ uint32_t ZLIB_INTERNAL crc32_sse42_simd_(const unsigned char* buf,
337*86ee64e7SAndroid Build Coastguard Worker                                          z_size_t len,
338*86ee64e7SAndroid Build Coastguard Worker                                          uint32_t crc);
339*86ee64e7SAndroid Build Coastguard Worker
340*86ee64e7SAndroid Build Coastguard Worker+uint32_t ZLIB_INTERNAL crc32_avx512_simd_(const unsigned char* buf,
341*86ee64e7SAndroid Build Coastguard Worker+                                          z_size_t len,
342*86ee64e7SAndroid Build Coastguard Worker+                                          uint32_t crc);
343*86ee64e7SAndroid Build Coastguard Worker+
344*86ee64e7SAndroid Build Coastguard Worker /*
345*86ee64e7SAndroid Build Coastguard Worker  * crc32_sse42_simd_ buffer size constraints: see the use in zlib/crc32.c
346*86ee64e7SAndroid Build Coastguard Worker  * for computing the crc32 of an arbitrary length buffer.
347*86ee64e7SAndroid Build Coastguard Worker  */
348*86ee64e7SAndroid Build Coastguard Worker #define Z_CRC32_SSE42_MINIMUM_LENGTH 64
349*86ee64e7SAndroid Build Coastguard Worker #define Z_CRC32_SSE42_CHUNKSIZE_MASK 15
350*86ee64e7SAndroid Build Coastguard Worker+#define Z_CRC32_AVX512_MINIMUM_LENGTH 256
351*86ee64e7SAndroid Build Coastguard Worker+#define Z_CRC32_AVX512_CHUNKSIZE_MASK 63
352*86ee64e7SAndroid Build Coastguard Worker
353*86ee64e7SAndroid Build Coastguard Worker /*
354*86ee64e7SAndroid Build Coastguard Worker  * CRC32 checksums using ARMv8-a crypto instructions.
355*86ee64e7SAndroid Build Coastguard Worker--
356*86ee64e7SAndroid Build Coastguard Worker2.34.1
357*86ee64e7SAndroid Build Coastguard Worker
358