1*86ee64e7SAndroid Build Coastguard WorkerFrom 87fc8e3e38323cfdabf8da3927488e3e57073b02 Mon Sep 17 00:00:00 2001 2*86ee64e7SAndroid Build Coastguard WorkerFrom: Jia Liu <[email protected]> 3*86ee64e7SAndroid Build Coastguard WorkerDate: Thu, 30 Mar 2023 11:13:16 +0800 4*86ee64e7SAndroid Build Coastguard WorkerSubject: [PATCH] Enabled AVX512 for CRC32 5*86ee64e7SAndroid Build Coastguard Worker 6*86ee64e7SAndroid Build Coastguard WorkerEnabled AVX512 for CRC32 that provide best of known performance 7*86ee64e7SAndroid Build Coastguard Workerbeyond current SSE SIMD optimization. It enables multiple folding 8*86ee64e7SAndroid Build Coastguard Workeroperations and AVX512 new instructions, providing ~3.5X CRC32 9*86ee64e7SAndroid Build Coastguard Workerperformance and ~3.7% gain on Zlib_bench gzip performance. 10*86ee64e7SAndroid Build Coastguard Worker--- 11*86ee64e7SAndroid Build Coastguard Worker CMakeLists.txt | 8 +- 12*86ee64e7SAndroid Build Coastguard Worker cpu_features.c | 9 +++ 13*86ee64e7SAndroid Build Coastguard Worker cpu_features.h | 1 + 14*86ee64e7SAndroid Build Coastguard Worker crc32.c | 14 +++- 15*86ee64e7SAndroid Build Coastguard Worker crc32_simd.c | 198 ++++++++++++++++++++++++++++++++++++++++++++++++- 16*86ee64e7SAndroid Build Coastguard Worker crc32_simd.h | 6 ++ 17*86ee64e7SAndroid Build Coastguard Worker 6 files changed, 230 insertions(+), 6 deletions(-) 18*86ee64e7SAndroid Build Coastguard Worker 19*86ee64e7SAndroid Build Coastguard Workerdiff --git a/CMakeLists.txt b/CMakeLists.txt 20*86ee64e7SAndroid Build Coastguard Workerindex f06e193..d45b902 100644 21*86ee64e7SAndroid Build Coastguard Worker--- a/CMakeLists.txt 22*86ee64e7SAndroid Build Coastguard Worker+++ b/CMakeLists.txt 23*86ee64e7SAndroid Build Coastguard Worker@@ -22,6 +22,7 @@ check_include_file(stdint.h HAVE_STDINT_H) 24*86ee64e7SAndroid Build Coastguard Worker check_include_file(stddef.h HAVE_STDDEF_H) 25*86ee64e7SAndroid Build Coastguard Worker 26*86ee64e7SAndroid Build Coastguard Worker option(ENABLE_SIMD_OPTIMIZATIONS "Enable all SIMD optimizations" OFF) 27*86ee64e7SAndroid Build Coastguard Worker+option(ENABLE_SIMD_AVX512 "Enable SIMD AXV512 optimizations" OFF) 28*86ee64e7SAndroid Build Coastguard Worker 29*86ee64e7SAndroid Build Coastguard Worker # TODO(cavalcantii): add support for other OSes (e.g. Android, fuchsia, osx) 30*86ee64e7SAndroid Build Coastguard Worker # and architectures (e.g. Arm). 31*86ee64e7SAndroid Build Coastguard Worker@@ -30,8 +31,13 @@ if (ENABLE_SIMD_OPTIMIZATIONS) 32*86ee64e7SAndroid Build Coastguard Worker add_definitions(-DADLER32_SIMD_SSSE3) 33*86ee64e7SAndroid Build Coastguard Worker add_definitions(-DINFLATE_CHUNK_READ_64LE) 34*86ee64e7SAndroid Build Coastguard Worker add_definitions(-DCRC32_SIMD_SSE42_PCLMUL) 35*86ee64e7SAndroid Build Coastguard Worker+ if (ENABLE_SIMD_AVX512) 36*86ee64e7SAndroid Build Coastguard Worker+ add_definitions(-DCRC32_SIMD_AVX512_PCLMUL) 37*86ee64e7SAndroid Build Coastguard Worker+ add_compile_options(-mvpclmulqdq -msse2 -mavx512f -mpclmul) 38*86ee64e7SAndroid Build Coastguard Worker+ else() 39*86ee64e7SAndroid Build Coastguard Worker+ add_compile_options(-msse4.2 -mpclmul) 40*86ee64e7SAndroid Build Coastguard Worker+ endif() 41*86ee64e7SAndroid Build Coastguard Worker add_definitions(-DDEFLATE_SLIDE_HASH_SSE2) 42*86ee64e7SAndroid Build Coastguard Worker- add_compile_options(-msse4.2 -mpclmul) 43*86ee64e7SAndroid Build Coastguard Worker # Required by CPU features detection code. 44*86ee64e7SAndroid Build Coastguard Worker add_definitions(-DX86_NOT_WINDOWS) 45*86ee64e7SAndroid Build Coastguard Worker # Apparently some environments (e.g. CentOS) require to explicitly link 46*86ee64e7SAndroid Build Coastguard Workerdiff --git a/cpu_features.c b/cpu_features.c 47*86ee64e7SAndroid Build Coastguard Workerindex 877d5f2..ac6ee88 100644 48*86ee64e7SAndroid Build Coastguard Worker--- a/cpu_features.c 49*86ee64e7SAndroid Build Coastguard Worker+++ b/cpu_features.c 50*86ee64e7SAndroid Build Coastguard Worker@@ -31,6 +31,7 @@ int ZLIB_INTERNAL arm_cpu_enable_pmull = 0; 51*86ee64e7SAndroid Build Coastguard Worker int ZLIB_INTERNAL x86_cpu_enable_sse2 = 0; 52*86ee64e7SAndroid Build Coastguard Worker int ZLIB_INTERNAL x86_cpu_enable_ssse3 = 0; 53*86ee64e7SAndroid Build Coastguard Worker int ZLIB_INTERNAL x86_cpu_enable_simd = 0; 54*86ee64e7SAndroid Build Coastguard Worker+int ZLIB_INTERNAL x86_cpu_enable_avx512 = 0; 55*86ee64e7SAndroid Build Coastguard Worker 56*86ee64e7SAndroid Build Coastguard Worker #ifndef CPU_NO_SIMD 57*86ee64e7SAndroid Build Coastguard Worker 58*86ee64e7SAndroid Build Coastguard Worker@@ -138,6 +139,10 @@ static void _cpu_check_features(void) 59*86ee64e7SAndroid Build Coastguard Worker /* On x86 we simply use a instruction to check the CPU features. 60*86ee64e7SAndroid Build Coastguard Worker * (i.e. CPUID). 61*86ee64e7SAndroid Build Coastguard Worker */ 62*86ee64e7SAndroid Build Coastguard Worker+#ifdef CRC32_SIMD_AVX512_PCLMUL 63*86ee64e7SAndroid Build Coastguard Worker+#include <immintrin.h> 64*86ee64e7SAndroid Build Coastguard Worker+#include <xsaveintrin.h> 65*86ee64e7SAndroid Build Coastguard Worker+#endif 66*86ee64e7SAndroid Build Coastguard Worker static void _cpu_check_features(void) 67*86ee64e7SAndroid Build Coastguard Worker { 68*86ee64e7SAndroid Build Coastguard Worker int x86_cpu_has_sse2; 69*86ee64e7SAndroid Build Coastguard Worker@@ -164,6 +169,10 @@ static void _cpu_check_features(void) 70*86ee64e7SAndroid Build Coastguard Worker x86_cpu_enable_simd = x86_cpu_has_sse2 && 71*86ee64e7SAndroid Build Coastguard Worker x86_cpu_has_sse42 && 72*86ee64e7SAndroid Build Coastguard Worker x86_cpu_has_pclmulqdq; 73*86ee64e7SAndroid Build Coastguard Worker+ 74*86ee64e7SAndroid Build Coastguard Worker+#ifdef CRC32_SIMD_AVX512_PCLMUL 75*86ee64e7SAndroid Build Coastguard Worker+ x86_cpu_enable_avx512 = _xgetbv(0) & 0x00000040; 76*86ee64e7SAndroid Build Coastguard Worker+#endif 77*86ee64e7SAndroid Build Coastguard Worker } 78*86ee64e7SAndroid Build Coastguard Worker #endif 79*86ee64e7SAndroid Build Coastguard Worker #endif 80*86ee64e7SAndroid Build Coastguard Workerdiff --git a/cpu_features.h b/cpu_features.h 81*86ee64e7SAndroid Build Coastguard Workerindex 279246c..aed3e83 100644 82*86ee64e7SAndroid Build Coastguard Worker--- a/cpu_features.h 83*86ee64e7SAndroid Build Coastguard Worker+++ b/cpu_features.h 84*86ee64e7SAndroid Build Coastguard Worker@@ -14,5 +14,6 @@ extern int arm_cpu_enable_pmull; 85*86ee64e7SAndroid Build Coastguard Worker extern int x86_cpu_enable_sse2; 86*86ee64e7SAndroid Build Coastguard Worker extern int x86_cpu_enable_ssse3; 87*86ee64e7SAndroid Build Coastguard Worker extern int x86_cpu_enable_simd; 88*86ee64e7SAndroid Build Coastguard Worker+extern int x86_cpu_enable_avx512; 89*86ee64e7SAndroid Build Coastguard Worker 90*86ee64e7SAndroid Build Coastguard Worker void cpu_check_features(void); 91*86ee64e7SAndroid Build Coastguard Workerdiff --git a/crc32.c b/crc32.c 92*86ee64e7SAndroid Build Coastguard Workerindex 4486098..acb6972 100644 93*86ee64e7SAndroid Build Coastguard Worker--- a/crc32.c 94*86ee64e7SAndroid Build Coastguard Worker+++ b/crc32.c 95*86ee64e7SAndroid Build Coastguard Worker@@ -773,7 +773,19 @@ unsigned long ZEXPORT crc32_z(crc, buf, len) 96*86ee64e7SAndroid Build Coastguard Worker } 97*86ee64e7SAndroid Build Coastguard Worker 98*86ee64e7SAndroid Build Coastguard Worker #endif 99*86ee64e7SAndroid Build Coastguard Worker-#if defined(CRC32_SIMD_SSE42_PCLMUL) 100*86ee64e7SAndroid Build Coastguard Worker+#if defined(CRC32_SIMD_AVX512_PCLMUL) 101*86ee64e7SAndroid Build Coastguard Worker+ if (x86_cpu_enable_avx512 && len >= Z_CRC32_AVX512_MINIMUM_LENGTH) { 102*86ee64e7SAndroid Build Coastguard Worker+ /* crc32 64-byte chunks */ 103*86ee64e7SAndroid Build Coastguard Worker+ z_size_t chunk_size = len & ~Z_CRC32_AVX512_CHUNKSIZE_MASK; 104*86ee64e7SAndroid Build Coastguard Worker+ crc = ~crc32_avx512_simd_(buf, chunk_size, ~(uint32_t)crc); 105*86ee64e7SAndroid Build Coastguard Worker+ /* check remaining data */ 106*86ee64e7SAndroid Build Coastguard Worker+ len -= chunk_size; 107*86ee64e7SAndroid Build Coastguard Worker+ if (!len) 108*86ee64e7SAndroid Build Coastguard Worker+ return crc; 109*86ee64e7SAndroid Build Coastguard Worker+ /* Fall into the default crc32 for the remaining data. */ 110*86ee64e7SAndroid Build Coastguard Worker+ buf += chunk_size; 111*86ee64e7SAndroid Build Coastguard Worker+ } 112*86ee64e7SAndroid Build Coastguard Worker+#elif defined(CRC32_SIMD_SSE42_PCLMUL) 113*86ee64e7SAndroid Build Coastguard Worker if (x86_cpu_enable_simd && len >= Z_CRC32_SSE42_MINIMUM_LENGTH) { 114*86ee64e7SAndroid Build Coastguard Worker /* crc32 16-byte chunks */ 115*86ee64e7SAndroid Build Coastguard Worker z_size_t chunk_size = len & ~Z_CRC32_SSE42_CHUNKSIZE_MASK; 116*86ee64e7SAndroid Build Coastguard Workerdiff --git a/crc32_simd.c b/crc32_simd.c 117*86ee64e7SAndroid Build Coastguard Workerindex d80beba..7428270 100644 118*86ee64e7SAndroid Build Coastguard Worker--- a/crc32_simd.c 119*86ee64e7SAndroid Build Coastguard Worker+++ b/crc32_simd.c 120*86ee64e7SAndroid Build Coastguard Worker@@ -6,17 +6,207 @@ 121*86ee64e7SAndroid Build Coastguard Worker */ 122*86ee64e7SAndroid Build Coastguard Worker 123*86ee64e7SAndroid Build Coastguard Worker #include "crc32_simd.h" 124*86ee64e7SAndroid Build Coastguard Worker- 125*86ee64e7SAndroid Build Coastguard Worker-#if defined(CRC32_SIMD_SSE42_PCLMUL) 126*86ee64e7SAndroid Build Coastguard Worker+#if defined(CRC32_SIMD_AVX512_PCLMUL) 127*86ee64e7SAndroid Build Coastguard Worker 128*86ee64e7SAndroid Build Coastguard Worker /* 129*86ee64e7SAndroid Build Coastguard Worker- * crc32_sse42_simd_(): compute the crc32 of the buffer, where the buffer 130*86ee64e7SAndroid Build Coastguard Worker- * length must be at least 64, and a multiple of 16. Based on: 131*86ee64e7SAndroid Build Coastguard Worker+ * crc32_avx512_simd_(): compute the crc32 of the buffer, where the buffer 132*86ee64e7SAndroid Build Coastguard Worker+ * length must be at least 256, and a multiple of 64. Based on: 133*86ee64e7SAndroid Build Coastguard Worker * 134*86ee64e7SAndroid Build Coastguard Worker * "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction" 135*86ee64e7SAndroid Build Coastguard Worker * V. Gopal, E. Ozturk, et al., 2009, http://intel.ly/2ySEwL0 136*86ee64e7SAndroid Build Coastguard Worker */ 137*86ee64e7SAndroid Build Coastguard Worker 138*86ee64e7SAndroid Build Coastguard Worker+#include <emmintrin.h> 139*86ee64e7SAndroid Build Coastguard Worker+#include <smmintrin.h> 140*86ee64e7SAndroid Build Coastguard Worker+#include <wmmintrin.h> 141*86ee64e7SAndroid Build Coastguard Worker+#include <immintrin.h> 142*86ee64e7SAndroid Build Coastguard Worker+ 143*86ee64e7SAndroid Build Coastguard Worker+uint32_t ZLIB_INTERNAL crc32_avx512_simd_( /* AVX512+PCLMUL */ 144*86ee64e7SAndroid Build Coastguard Worker+ const unsigned char *buf, 145*86ee64e7SAndroid Build Coastguard Worker+ z_size_t len, 146*86ee64e7SAndroid Build Coastguard Worker+ uint32_t crc) 147*86ee64e7SAndroid Build Coastguard Worker+{ 148*86ee64e7SAndroid Build Coastguard Worker+ /* 149*86ee64e7SAndroid Build Coastguard Worker+ * Definitions of the bit-reflected domain constants k1,k2,k3,k4 150*86ee64e7SAndroid Build Coastguard Worker+ * are similar to those given at the end of the paper, and remaining 151*86ee64e7SAndroid Build Coastguard Worker+ * constants and CRC32+Barrett polynomials remain unchanged. 152*86ee64e7SAndroid Build Coastguard Worker+ * 153*86ee64e7SAndroid Build Coastguard Worker+ * Replace the index of x from 128 to 512. As follows: 154*86ee64e7SAndroid Build Coastguard Worker+ * k1 = ( x ^ ( 512 * 4 + 32 ) mod P(x) << 32 )' << 1 = 0x011542778a 155*86ee64e7SAndroid Build Coastguard Worker+ * k2 = ( x ^ ( 512 * 4 - 32 ) mod P(x) << 32 )' << 1 = 0x01322d1430 156*86ee64e7SAndroid Build Coastguard Worker+ * k3 = ( x ^ ( 512 + 32 ) mod P(x) << 32 )' << 1 = 0x0154442bd4 157*86ee64e7SAndroid Build Coastguard Worker+ * k4 = ( x ^ ( 512 - 32 ) mod P(x) << 32 )' << 1 = 0x01c6e41596 158*86ee64e7SAndroid Build Coastguard Worker+ */ 159*86ee64e7SAndroid Build Coastguard Worker+ static const uint64_t zalign(64) k1k2[] = { 0x011542778a, 0x01322d1430, 160*86ee64e7SAndroid Build Coastguard Worker+ 0x011542778a, 0x01322d1430, 161*86ee64e7SAndroid Build Coastguard Worker+ 0x011542778a, 0x01322d1430, 162*86ee64e7SAndroid Build Coastguard Worker+ 0x011542778a, 0x01322d1430 }; 163*86ee64e7SAndroid Build Coastguard Worker+ static const uint64_t zalign(64) k3k4[] = { 0x0154442bd4, 0x01c6e41596, 164*86ee64e7SAndroid Build Coastguard Worker+ 0x0154442bd4, 0x01c6e41596, 165*86ee64e7SAndroid Build Coastguard Worker+ 0x0154442bd4, 0x01c6e41596, 166*86ee64e7SAndroid Build Coastguard Worker+ 0x0154442bd4, 0x01c6e41596 }; 167*86ee64e7SAndroid Build Coastguard Worker+ static const uint64_t zalign(16) k5k6[] = { 0x01751997d0, 0x00ccaa009e }; 168*86ee64e7SAndroid Build Coastguard Worker+ static const uint64_t zalign(16) k7k8[] = { 0x0163cd6124, 0x0000000000 }; 169*86ee64e7SAndroid Build Coastguard Worker+ static const uint64_t zalign(16) poly[] = { 0x01db710641, 0x01f7011641 }; 170*86ee64e7SAndroid Build Coastguard Worker+ __m512i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8; 171*86ee64e7SAndroid Build Coastguard Worker+ __m128i a0, a1, a2, a3; 172*86ee64e7SAndroid Build Coastguard Worker+ 173*86ee64e7SAndroid Build Coastguard Worker+ /* 174*86ee64e7SAndroid Build Coastguard Worker+ * There's at least one block of 256. 175*86ee64e7SAndroid Build Coastguard Worker+ */ 176*86ee64e7SAndroid Build Coastguard Worker+ x1 = _mm512_loadu_si512((__m512i *)(buf + 0x00)); 177*86ee64e7SAndroid Build Coastguard Worker+ x2 = _mm512_loadu_si512((__m512i *)(buf + 0x40)); 178*86ee64e7SAndroid Build Coastguard Worker+ x3 = _mm512_loadu_si512((__m512i *)(buf + 0x80)); 179*86ee64e7SAndroid Build Coastguard Worker+ x4 = _mm512_loadu_si512((__m512i *)(buf + 0xC0)); 180*86ee64e7SAndroid Build Coastguard Worker+ 181*86ee64e7SAndroid Build Coastguard Worker+ x1 = _mm512_xor_si512(x1, _mm512_castsi128_si512(_mm_cvtsi32_si128(crc))); 182*86ee64e7SAndroid Build Coastguard Worker+ 183*86ee64e7SAndroid Build Coastguard Worker+ x0 = _mm512_load_si512((__m512i *)k1k2); 184*86ee64e7SAndroid Build Coastguard Worker+ 185*86ee64e7SAndroid Build Coastguard Worker+ buf += 256; 186*86ee64e7SAndroid Build Coastguard Worker+ len -= 256; 187*86ee64e7SAndroid Build Coastguard Worker+ 188*86ee64e7SAndroid Build Coastguard Worker+ /* 189*86ee64e7SAndroid Build Coastguard Worker+ * Parallel fold blocks of 256, if any. 190*86ee64e7SAndroid Build Coastguard Worker+ */ 191*86ee64e7SAndroid Build Coastguard Worker+ while (len >= 256) 192*86ee64e7SAndroid Build Coastguard Worker+ { 193*86ee64e7SAndroid Build Coastguard Worker+ x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00); 194*86ee64e7SAndroid Build Coastguard Worker+ x6 = _mm512_clmulepi64_epi128(x2, x0, 0x00); 195*86ee64e7SAndroid Build Coastguard Worker+ x7 = _mm512_clmulepi64_epi128(x3, x0, 0x00); 196*86ee64e7SAndroid Build Coastguard Worker+ x8 = _mm512_clmulepi64_epi128(x4, x0, 0x00); 197*86ee64e7SAndroid Build Coastguard Worker+ 198*86ee64e7SAndroid Build Coastguard Worker+ 199*86ee64e7SAndroid Build Coastguard Worker+ x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11); 200*86ee64e7SAndroid Build Coastguard Worker+ x2 = _mm512_clmulepi64_epi128(x2, x0, 0x11); 201*86ee64e7SAndroid Build Coastguard Worker+ x3 = _mm512_clmulepi64_epi128(x3, x0, 0x11); 202*86ee64e7SAndroid Build Coastguard Worker+ x4 = _mm512_clmulepi64_epi128(x4, x0, 0x11); 203*86ee64e7SAndroid Build Coastguard Worker+ 204*86ee64e7SAndroid Build Coastguard Worker+ y5 = _mm512_loadu_si512((__m512i *)(buf + 0x00)); 205*86ee64e7SAndroid Build Coastguard Worker+ y6 = _mm512_loadu_si512((__m512i *)(buf + 0x40)); 206*86ee64e7SAndroid Build Coastguard Worker+ y7 = _mm512_loadu_si512((__m512i *)(buf + 0x80)); 207*86ee64e7SAndroid Build Coastguard Worker+ y8 = _mm512_loadu_si512((__m512i *)(buf + 0xC0)); 208*86ee64e7SAndroid Build Coastguard Worker+ 209*86ee64e7SAndroid Build Coastguard Worker+ x1 = _mm512_xor_si512(x1, x5); 210*86ee64e7SAndroid Build Coastguard Worker+ x2 = _mm512_xor_si512(x2, x6); 211*86ee64e7SAndroid Build Coastguard Worker+ x3 = _mm512_xor_si512(x3, x7); 212*86ee64e7SAndroid Build Coastguard Worker+ x4 = _mm512_xor_si512(x4, x8); 213*86ee64e7SAndroid Build Coastguard Worker+ 214*86ee64e7SAndroid Build Coastguard Worker+ x1 = _mm512_xor_si512(x1, y5); 215*86ee64e7SAndroid Build Coastguard Worker+ x2 = _mm512_xor_si512(x2, y6); 216*86ee64e7SAndroid Build Coastguard Worker+ x3 = _mm512_xor_si512(x3, y7); 217*86ee64e7SAndroid Build Coastguard Worker+ x4 = _mm512_xor_si512(x4, y8); 218*86ee64e7SAndroid Build Coastguard Worker+ 219*86ee64e7SAndroid Build Coastguard Worker+ buf += 256; 220*86ee64e7SAndroid Build Coastguard Worker+ len -= 256; 221*86ee64e7SAndroid Build Coastguard Worker+ } 222*86ee64e7SAndroid Build Coastguard Worker+ 223*86ee64e7SAndroid Build Coastguard Worker+ /* 224*86ee64e7SAndroid Build Coastguard Worker+ * Fold into 512-bits. 225*86ee64e7SAndroid Build Coastguard Worker+ */ 226*86ee64e7SAndroid Build Coastguard Worker+ x0 = _mm512_load_si512((__m512i *)k3k4); 227*86ee64e7SAndroid Build Coastguard Worker+ 228*86ee64e7SAndroid Build Coastguard Worker+ x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00); 229*86ee64e7SAndroid Build Coastguard Worker+ x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11); 230*86ee64e7SAndroid Build Coastguard Worker+ x1 = _mm512_xor_si512(x1, x2); 231*86ee64e7SAndroid Build Coastguard Worker+ x1 = _mm512_xor_si512(x1, x5); 232*86ee64e7SAndroid Build Coastguard Worker+ 233*86ee64e7SAndroid Build Coastguard Worker+ x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00); 234*86ee64e7SAndroid Build Coastguard Worker+ x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11); 235*86ee64e7SAndroid Build Coastguard Worker+ x1 = _mm512_xor_si512(x1, x3); 236*86ee64e7SAndroid Build Coastguard Worker+ x1 = _mm512_xor_si512(x1, x5); 237*86ee64e7SAndroid Build Coastguard Worker+ 238*86ee64e7SAndroid Build Coastguard Worker+ x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00); 239*86ee64e7SAndroid Build Coastguard Worker+ x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11); 240*86ee64e7SAndroid Build Coastguard Worker+ x1 = _mm512_xor_si512(x1, x4); 241*86ee64e7SAndroid Build Coastguard Worker+ x1 = _mm512_xor_si512(x1, x5); 242*86ee64e7SAndroid Build Coastguard Worker+ 243*86ee64e7SAndroid Build Coastguard Worker+ /* 244*86ee64e7SAndroid Build Coastguard Worker+ * Single fold blocks of 64, if any. 245*86ee64e7SAndroid Build Coastguard Worker+ */ 246*86ee64e7SAndroid Build Coastguard Worker+ while (len >= 64) 247*86ee64e7SAndroid Build Coastguard Worker+ { 248*86ee64e7SAndroid Build Coastguard Worker+ x2 = _mm512_loadu_si512((__m512i *)buf); 249*86ee64e7SAndroid Build Coastguard Worker+ 250*86ee64e7SAndroid Build Coastguard Worker+ x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00); 251*86ee64e7SAndroid Build Coastguard Worker+ x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11); 252*86ee64e7SAndroid Build Coastguard Worker+ x1 = _mm512_xor_si512(x1, x2); 253*86ee64e7SAndroid Build Coastguard Worker+ x1 = _mm512_xor_si512(x1, x5); 254*86ee64e7SAndroid Build Coastguard Worker+ 255*86ee64e7SAndroid Build Coastguard Worker+ buf += 64; 256*86ee64e7SAndroid Build Coastguard Worker+ len -= 64; 257*86ee64e7SAndroid Build Coastguard Worker+ } 258*86ee64e7SAndroid Build Coastguard Worker+ 259*86ee64e7SAndroid Build Coastguard Worker+ /* 260*86ee64e7SAndroid Build Coastguard Worker+ * Fold 512-bits to 384-bits. 261*86ee64e7SAndroid Build Coastguard Worker+ */ 262*86ee64e7SAndroid Build Coastguard Worker+ a0 = _mm_load_si128((__m128i *)k5k6); 263*86ee64e7SAndroid Build Coastguard Worker+ 264*86ee64e7SAndroid Build Coastguard Worker+ a1 = _mm512_extracti32x4_epi32(x1, 0); 265*86ee64e7SAndroid Build Coastguard Worker+ a2 = _mm512_extracti32x4_epi32(x1, 1); 266*86ee64e7SAndroid Build Coastguard Worker+ 267*86ee64e7SAndroid Build Coastguard Worker+ a3 = _mm_clmulepi64_si128(a1, a0, 0x00); 268*86ee64e7SAndroid Build Coastguard Worker+ a1 = _mm_clmulepi64_si128(a1, a0, 0x11); 269*86ee64e7SAndroid Build Coastguard Worker+ 270*86ee64e7SAndroid Build Coastguard Worker+ a1 = _mm_xor_si128(a1, a3); 271*86ee64e7SAndroid Build Coastguard Worker+ a1 = _mm_xor_si128(a1, a2); 272*86ee64e7SAndroid Build Coastguard Worker+ 273*86ee64e7SAndroid Build Coastguard Worker+ /* 274*86ee64e7SAndroid Build Coastguard Worker+ * Fold 384-bits to 256-bits. 275*86ee64e7SAndroid Build Coastguard Worker+ */ 276*86ee64e7SAndroid Build Coastguard Worker+ a2 = _mm512_extracti32x4_epi32(x1, 2); 277*86ee64e7SAndroid Build Coastguard Worker+ a3 = _mm_clmulepi64_si128(a1, a0, 0x00); 278*86ee64e7SAndroid Build Coastguard Worker+ a1 = _mm_clmulepi64_si128(a1, a0, 0x11); 279*86ee64e7SAndroid Build Coastguard Worker+ a1 = _mm_xor_si128(a1, a3); 280*86ee64e7SAndroid Build Coastguard Worker+ a1 = _mm_xor_si128(a1, a2); 281*86ee64e7SAndroid Build Coastguard Worker+ 282*86ee64e7SAndroid Build Coastguard Worker+ /* 283*86ee64e7SAndroid Build Coastguard Worker+ * Fold 256-bits to 128-bits. 284*86ee64e7SAndroid Build Coastguard Worker+ */ 285*86ee64e7SAndroid Build Coastguard Worker+ a2 = _mm512_extracti32x4_epi32(x1, 3); 286*86ee64e7SAndroid Build Coastguard Worker+ a3 = _mm_clmulepi64_si128(a1, a0, 0x00); 287*86ee64e7SAndroid Build Coastguard Worker+ a1 = _mm_clmulepi64_si128(a1, a0, 0x11); 288*86ee64e7SAndroid Build Coastguard Worker+ a1 = _mm_xor_si128(a1, a3); 289*86ee64e7SAndroid Build Coastguard Worker+ a1 = _mm_xor_si128(a1, a2); 290*86ee64e7SAndroid Build Coastguard Worker+ 291*86ee64e7SAndroid Build Coastguard Worker+ /* 292*86ee64e7SAndroid Build Coastguard Worker+ * Fold 128-bits to 64-bits. 293*86ee64e7SAndroid Build Coastguard Worker+ */ 294*86ee64e7SAndroid Build Coastguard Worker+ a2 = _mm_clmulepi64_si128(a1, a0, 0x10); 295*86ee64e7SAndroid Build Coastguard Worker+ a3 = _mm_setr_epi32(~0, 0, ~0, 0); 296*86ee64e7SAndroid Build Coastguard Worker+ a1 = _mm_srli_si128(a1, 8); 297*86ee64e7SAndroid Build Coastguard Worker+ a1 = _mm_xor_si128(a1, a2); 298*86ee64e7SAndroid Build Coastguard Worker+ 299*86ee64e7SAndroid Build Coastguard Worker+ a0 = _mm_loadl_epi64((__m128i*)k7k8); 300*86ee64e7SAndroid Build Coastguard Worker+ a2 = _mm_srli_si128(a1, 4); 301*86ee64e7SAndroid Build Coastguard Worker+ a1 = _mm_and_si128(a1, a3); 302*86ee64e7SAndroid Build Coastguard Worker+ a1 = _mm_clmulepi64_si128(a1, a0, 0x00); 303*86ee64e7SAndroid Build Coastguard Worker+ a1 = _mm_xor_si128(a1, a2); 304*86ee64e7SAndroid Build Coastguard Worker+ 305*86ee64e7SAndroid Build Coastguard Worker+ /* 306*86ee64e7SAndroid Build Coastguard Worker+ * Barret reduce to 32-bits. 307*86ee64e7SAndroid Build Coastguard Worker+ */ 308*86ee64e7SAndroid Build Coastguard Worker+ a0 = _mm_load_si128((__m128i*)poly); 309*86ee64e7SAndroid Build Coastguard Worker+ 310*86ee64e7SAndroid Build Coastguard Worker+ a2 = _mm_and_si128(a1, a3); 311*86ee64e7SAndroid Build Coastguard Worker+ a2 = _mm_clmulepi64_si128(a2, a0, 0x10); 312*86ee64e7SAndroid Build Coastguard Worker+ a2 = _mm_and_si128(a2, a3); 313*86ee64e7SAndroid Build Coastguard Worker+ a2 = _mm_clmulepi64_si128(a2, a0, 0x00); 314*86ee64e7SAndroid Build Coastguard Worker+ a1 = _mm_xor_si128(a1, a2); 315*86ee64e7SAndroid Build Coastguard Worker+ 316*86ee64e7SAndroid Build Coastguard Worker+ /* 317*86ee64e7SAndroid Build Coastguard Worker+ * Return the crc32. 318*86ee64e7SAndroid Build Coastguard Worker+ */ 319*86ee64e7SAndroid Build Coastguard Worker+ return _mm_extract_epi32(a1, 1); 320*86ee64e7SAndroid Build Coastguard Worker+} 321*86ee64e7SAndroid Build Coastguard Worker+ 322*86ee64e7SAndroid Build Coastguard Worker+#elif defined(CRC32_SIMD_SSE42_PCLMUL) 323*86ee64e7SAndroid Build Coastguard Worker+ 324*86ee64e7SAndroid Build Coastguard Worker+/* 325*86ee64e7SAndroid Build Coastguard Worker+ * crc32_sse42_simd_(): compute the crc32 of the buffer, where the buffer 326*86ee64e7SAndroid Build Coastguard Worker+ * length must be at least 64, and a multiple of 16. 327*86ee64e7SAndroid Build Coastguard Worker+ */ 328*86ee64e7SAndroid Build Coastguard Worker+ 329*86ee64e7SAndroid Build Coastguard Worker #include <emmintrin.h> 330*86ee64e7SAndroid Build Coastguard Worker #include <smmintrin.h> 331*86ee64e7SAndroid Build Coastguard Worker #include <wmmintrin.h> 332*86ee64e7SAndroid Build Coastguard Workerdiff --git a/crc32_simd.h b/crc32_simd.h 333*86ee64e7SAndroid Build Coastguard Workerindex c0346dc..8462464 100644 334*86ee64e7SAndroid Build Coastguard Worker--- a/crc32_simd.h 335*86ee64e7SAndroid Build Coastguard Worker+++ b/crc32_simd.h 336*86ee64e7SAndroid Build Coastguard Worker@@ -19,12 +19,18 @@ uint32_t ZLIB_INTERNAL crc32_sse42_simd_(const unsigned char* buf, 337*86ee64e7SAndroid Build Coastguard Worker z_size_t len, 338*86ee64e7SAndroid Build Coastguard Worker uint32_t crc); 339*86ee64e7SAndroid Build Coastguard Worker 340*86ee64e7SAndroid Build Coastguard Worker+uint32_t ZLIB_INTERNAL crc32_avx512_simd_(const unsigned char* buf, 341*86ee64e7SAndroid Build Coastguard Worker+ z_size_t len, 342*86ee64e7SAndroid Build Coastguard Worker+ uint32_t crc); 343*86ee64e7SAndroid Build Coastguard Worker+ 344*86ee64e7SAndroid Build Coastguard Worker /* 345*86ee64e7SAndroid Build Coastguard Worker * crc32_sse42_simd_ buffer size constraints: see the use in zlib/crc32.c 346*86ee64e7SAndroid Build Coastguard Worker * for computing the crc32 of an arbitrary length buffer. 347*86ee64e7SAndroid Build Coastguard Worker */ 348*86ee64e7SAndroid Build Coastguard Worker #define Z_CRC32_SSE42_MINIMUM_LENGTH 64 349*86ee64e7SAndroid Build Coastguard Worker #define Z_CRC32_SSE42_CHUNKSIZE_MASK 15 350*86ee64e7SAndroid Build Coastguard Worker+#define Z_CRC32_AVX512_MINIMUM_LENGTH 256 351*86ee64e7SAndroid Build Coastguard Worker+#define Z_CRC32_AVX512_CHUNKSIZE_MASK 63 352*86ee64e7SAndroid Build Coastguard Worker 353*86ee64e7SAndroid Build Coastguard Worker /* 354*86ee64e7SAndroid Build Coastguard Worker * CRC32 checksums using ARMv8-a crypto instructions. 355*86ee64e7SAndroid Build Coastguard Worker-- 356*86ee64e7SAndroid Build Coastguard Worker2.34.1 357*86ee64e7SAndroid Build Coastguard Worker 358