/* functable.c -- Choose relevant optimized functions at runtime
 * Copyright (C) 2017 Hans Kristian Rosbach
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#include "zbuild.h"
#include "zendian.h"
#include "crc32_braid_p.h"
#include "deflate.h"
#include "deflate_p.h"
#include "functable.h"
#include "cpu_features.h"

Z_INTERNAL Z_TLS struct functable_s functable;

/* Stub functions: on first use, each stub selects the best implementation for
 * the detected CPU features, stores it in the functable, then forwards the
 * call. Later calls through the functable dispatch directly. */
Z_INTERNAL uint32_t update_hash_stub(deflate_state *const s, uint32_t h, uint32_t val) {
    // Initialize default
    functable.update_hash = &update_hash_c;
    cpu_check_features();

#ifdef X86_SSE42_CRC_HASH
    if (x86_cpu_has_sse42)
        functable.update_hash = &update_hash_sse4;
#elif defined(ARM_ACLE_CRC_HASH)
    if (arm_cpu_has_crc32)
        functable.update_hash = &update_hash_acle;
#endif

    return functable.update_hash(s, h, val);
}

Z_INTERNAL void insert_string_stub(deflate_state *const s, uint32_t str, uint32_t count) {
    // Initialize default
    functable.insert_string = &insert_string_c;
    cpu_check_features();

#ifdef X86_SSE42_CRC_HASH
    if (x86_cpu_has_sse42)
        functable.insert_string = &insert_string_sse4;
#elif defined(ARM_ACLE_CRC_HASH)
    if (arm_cpu_has_crc32)
        functable.insert_string = &insert_string_acle;
#endif

    functable.insert_string(s, str, count);
}

Z_INTERNAL Pos quick_insert_string_stub(deflate_state *const s, const uint32_t str) {
    // Initialize default
    functable.quick_insert_string = &quick_insert_string_c;
    cpu_check_features();  /* ensure feature flags are initialized even if this stub runs first */

#ifdef X86_SSE42_CRC_HASH
    if (x86_cpu_has_sse42)
        functable.quick_insert_string = &quick_insert_string_sse4;
#elif defined(ARM_ACLE_CRC_HASH)
    if (arm_cpu_has_crc32)
        functable.quick_insert_string = &quick_insert_string_acle;
#endif

    return functable.quick_insert_string(s, str);
}

Z_INTERNAL void slide_hash_stub(deflate_state *s) {
    // Initialize default
    functable.slide_hash = &slide_hash_c;
    cpu_check_features();

#ifdef X86_SSE2
# if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
    if (x86_cpu_has_sse2)
# endif
        functable.slide_hash = &slide_hash_sse2;
#elif defined(ARM_NEON_SLIDEHASH)
# ifndef ARM_NOCHECK_NEON
    if (arm_cpu_has_neon)
# endif
        functable.slide_hash = &slide_hash_neon;
#endif
#ifdef X86_AVX2
    if (x86_cpu_has_avx2)
        functable.slide_hash = &slide_hash_avx2;
#endif
#ifdef PPC_VMX_SLIDEHASH
    if (power_cpu_has_altivec)
        functable.slide_hash = &slide_hash_vmx;
#endif
#ifdef POWER8_VSX_SLIDEHASH
    if (power_cpu_has_arch_2_07)
        functable.slide_hash = &slide_hash_power8;
#endif

    functable.slide_hash(s);
}

Z_INTERNAL uint32_t longest_match_stub(deflate_state *const s, Pos cur_match) {
    // Initialize default
#ifdef UNALIGNED_OK
# if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL)
    functable.longest_match = &longest_match_unaligned_64;
# elif defined(HAVE_BUILTIN_CTZ)
    functable.longest_match = &longest_match_unaligned_32;
# else
    functable.longest_match = &longest_match_unaligned_16;
# endif
#else
    functable.longest_match = &longest_match_c;
#endif

    cpu_check_features();  /* ensure feature flags are initialized even if this stub runs first */

#if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)
    if (x86_cpu_has_sse2)
        functable.longest_match = &longest_match_sse2;
#endif
#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
    if (x86_cpu_has_avx2)
        functable.longest_match = &longest_match_avx2;
#endif
#if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL)
    if (arm_cpu_has_neon)
        functable.longest_match = &longest_match_neon;
#endif
#ifdef POWER9
    if (power_cpu_has_arch_3_00)
        functable.longest_match = &longest_match_power9;
#endif

    return functable.longest_match(s, cur_match);
}

Z_INTERNAL uint32_t longest_match_slow_stub(deflate_state *const s, Pos cur_match) {
    // Initialize default
#ifdef UNALIGNED_OK
# if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL)
    functable.longest_match_slow = &longest_match_slow_unaligned_64;
# elif defined(HAVE_BUILTIN_CTZ)
    functable.longest_match_slow = &longest_match_slow_unaligned_32;
# else
    functable.longest_match_slow = &longest_match_slow_unaligned_16;
# endif
#else
    functable.longest_match_slow = &longest_match_slow_c;
#endif

    cpu_check_features();  /* ensure feature flags are initialized even if this stub runs first */

#if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)
    if (x86_cpu_has_sse2)
        functable.longest_match_slow = &longest_match_slow_sse2;
#endif
#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
    if (x86_cpu_has_avx2)
        functable.longest_match_slow = &longest_match_slow_avx2;
#endif
#if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL)
    if (arm_cpu_has_neon)
        functable.longest_match_slow = &longest_match_slow_neon;
#endif
#ifdef POWER9
    if (power_cpu_has_arch_3_00)
        functable.longest_match_slow = &longest_match_slow_power9;
#endif

    return functable.longest_match_slow(s, cur_match);
}

Z_INTERNAL uint32_t adler32_stub(uint32_t adler, const unsigned char *buf, size_t len) {
    // Initialize default
    functable.adler32 = &adler32_c;
    cpu_check_features();

#ifdef ARM_NEON_ADLER32
# ifndef ARM_NOCHECK_NEON
    if (arm_cpu_has_neon)
# endif
        functable.adler32 = &adler32_neon;
#endif
#ifdef X86_SSSE3_ADLER32
    if (x86_cpu_has_ssse3)
        functable.adler32 = &adler32_ssse3;
#endif
#ifdef X86_AVX2_ADLER32
    if (x86_cpu_has_avx2)
        functable.adler32 = &adler32_avx2;
#endif
#ifdef X86_AVX512_ADLER32
    if (x86_cpu_has_avx512)
        functable.adler32 = &adler32_avx512;
#endif
#ifdef X86_AVX512VNNI_ADLER32
    if (x86_cpu_has_avx512vnni)
        functable.adler32 = &adler32_avx512_vnni;
#endif
#ifdef PPC_VMX_ADLER32
    if (power_cpu_has_altivec)
        functable.adler32 = &adler32_vmx;
#endif
#ifdef POWER8_VSX_ADLER32
    if (power_cpu_has_arch_2_07)
        functable.adler32 = &adler32_power8;
#endif

    return functable.adler32(adler, buf, len);
}

Z_INTERNAL uint32_t adler32_fold_copy_stub(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
    // Initialize default
    functable.adler32_fold_copy = &adler32_fold_copy_c;
    cpu_check_features();  /* ensure feature flags are initialized even if this stub runs first */

#ifdef X86_SSE42_ADLER32
    if (x86_cpu_has_sse42)
        functable.adler32_fold_copy = &adler32_fold_copy_sse42;
#endif
#ifdef X86_AVX2_ADLER32
    if (x86_cpu_has_avx2)
        functable.adler32_fold_copy = &adler32_fold_copy_avx2;
#endif
#ifdef X86_AVX512_ADLER32
    if (x86_cpu_has_avx512)
        functable.adler32_fold_copy = &adler32_fold_copy_avx512;
#endif
#ifdef X86_AVX512VNNI_ADLER32
    if (x86_cpu_has_avx512vnni)
        functable.adler32_fold_copy = &adler32_fold_copy_avx512_vnni;
#endif

    return functable.adler32_fold_copy(adler, dst, src, len);
}

Z_INTERNAL uint32_t crc32_fold_reset_stub(crc32_fold *crc) {
    // Initialize default
    functable.crc32_fold_reset = &crc32_fold_reset_c;
    cpu_check_features();

#ifdef X86_PCLMULQDQ_CRC
    if (x86_cpu_has_pclmulqdq)
        functable.crc32_fold_reset = &crc32_fold_reset_pclmulqdq;
#endif

    return functable.crc32_fold_reset(crc);
}

Z_INTERNAL void crc32_fold_copy_stub(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len) {
    // Initialize default
    functable.crc32_fold_copy = &crc32_fold_copy_c;
    cpu_check_features();

#ifdef X86_PCLMULQDQ_CRC
    if (x86_cpu_has_pclmulqdq)
        functable.crc32_fold_copy = &crc32_fold_copy_pclmulqdq;
#endif

    functable.crc32_fold_copy(crc, dst, src, len);
}

Z_INTERNAL void crc32_fold_stub(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc) {
    // Initialize default
    functable.crc32_fold = &crc32_fold_c;
    cpu_check_features();

#ifdef X86_PCLMULQDQ_CRC
    if (x86_cpu_has_pclmulqdq)
        functable.crc32_fold = &crc32_fold_pclmulqdq;
#endif

    functable.crc32_fold(crc, src, len, init_crc);
}

Z_INTERNAL uint32_t crc32_fold_final_stub(crc32_fold *crc) {
    // Initialize default
    functable.crc32_fold_final = &crc32_fold_final_c;
    cpu_check_features();
#ifdef X86_PCLMULQDQ_CRC
    if (x86_cpu_has_pclmulqdq)
        functable.crc32_fold_final = &crc32_fold_final_pclmulqdq;
#endif

    return functable.crc32_fold_final(crc);
}

Z_INTERNAL uint32_t chunksize_stub(void) {
    // Initialize default
    functable.chunksize = &chunksize_c;
    cpu_check_features();

#ifdef X86_SSE2_CHUNKSET
# if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
    if (x86_cpu_has_sse2)
# endif
        functable.chunksize = &chunksize_sse2;
#endif
#ifdef X86_AVX_CHUNKSET
    if (x86_cpu_has_avx2)
        functable.chunksize = &chunksize_avx;
#endif
#ifdef ARM_NEON_CHUNKSET
    if (arm_cpu_has_neon)
        functable.chunksize = &chunksize_neon;
#endif
#ifdef POWER8_VSX_CHUNKSET
    if (power_cpu_has_arch_2_07)
        functable.chunksize = &chunksize_power8;
#endif

    return functable.chunksize();
}

Z_INTERNAL uint8_t* chunkcopy_stub(uint8_t *out, uint8_t const *from, unsigned len) {
    // Initialize default
    functable.chunkcopy = &chunkcopy_c;
    cpu_check_features();  /* ensure feature flags are initialized even if this stub runs first */

#ifdef X86_SSE2_CHUNKSET
# if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
    if (x86_cpu_has_sse2)
# endif
        functable.chunkcopy = &chunkcopy_sse2;
#endif
#ifdef X86_AVX_CHUNKSET
    if (x86_cpu_has_avx2)
        functable.chunkcopy = &chunkcopy_avx;
#endif
#ifdef ARM_NEON_CHUNKSET
    if (arm_cpu_has_neon)
        functable.chunkcopy = &chunkcopy_neon;
#endif
#ifdef POWER8_VSX_CHUNKSET
    if (power_cpu_has_arch_2_07)
        functable.chunkcopy = &chunkcopy_power8;
#endif

    return functable.chunkcopy(out, from, len);
}

Z_INTERNAL uint8_t* chunkunroll_stub(uint8_t *out, unsigned *dist, unsigned *len) {
    // Initialize default
    functable.chunkunroll = &chunkunroll_c;
    cpu_check_features();

#ifdef X86_SSE2_CHUNKSET
# if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
    if (x86_cpu_has_sse2)
# endif
        functable.chunkunroll = &chunkunroll_sse2;
#endif
#ifdef X86_AVX_CHUNKSET
    if (x86_cpu_has_avx2)
        functable.chunkunroll = &chunkunroll_avx;
#endif
#ifdef ARM_NEON_CHUNKSET
    if (arm_cpu_has_neon)
        functable.chunkunroll = &chunkunroll_neon;
#endif
#ifdef POWER8_VSX_CHUNKSET
    if (power_cpu_has_arch_2_07)
        functable.chunkunroll = &chunkunroll_power8;
#endif

    return functable.chunkunroll(out, dist, len);
}

Z_INTERNAL uint8_t* chunkmemset_stub(uint8_t *out, unsigned dist, unsigned len) {
    // Initialize default
    functable.chunkmemset = &chunkmemset_c;
    cpu_check_features();

#ifdef X86_SSE2_CHUNKSET
# if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
    if (x86_cpu_has_sse2)
# endif
        functable.chunkmemset = &chunkmemset_sse2;
#endif
#if defined(X86_SSE41) && defined(X86_SSE2)
    if (x86_cpu_has_sse41)
        functable.chunkmemset = &chunkmemset_sse41;
#endif
#ifdef X86_AVX_CHUNKSET
    if (x86_cpu_has_avx2)
        functable.chunkmemset = &chunkmemset_avx;
#endif
#ifdef ARM_NEON_CHUNKSET
    if (arm_cpu_has_neon)
        functable.chunkmemset = &chunkmemset_neon;
#endif
#ifdef POWER8_VSX_CHUNKSET
    if (power_cpu_has_arch_2_07)
        functable.chunkmemset = &chunkmemset_power8;
#endif

    return functable.chunkmemset(out, dist, len);
}

Z_INTERNAL uint8_t* chunkmemset_safe_stub(uint8_t *out, unsigned dist, unsigned len, unsigned left) {
    // Initialize default
    functable.chunkmemset_safe = &chunkmemset_safe_c;
    cpu_check_features();

#ifdef X86_SSE2_CHUNKSET
# if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
    if (x86_cpu_has_sse2)
# endif
        functable.chunkmemset_safe = &chunkmemset_safe_sse2;
#endif
#if defined(X86_SSE41) && defined(X86_SSE2)
    if (x86_cpu_has_sse41)
        functable.chunkmemset_safe = &chunkmemset_safe_sse41;
#endif
#ifdef X86_AVX_CHUNKSET
    if (x86_cpu_has_avx2)
        functable.chunkmemset_safe = &chunkmemset_safe_avx;
#endif
#ifdef ARM_NEON_CHUNKSET
    if (arm_cpu_has_neon)
        functable.chunkmemset_safe = &chunkmemset_safe_neon;
#endif
#ifdef POWER8_VSX_CHUNKSET
    if (power_cpu_has_arch_2_07)
        functable.chunkmemset_safe = &chunkmemset_safe_power8;
#endif

    return functable.chunkmemset_safe(out, dist, len, left);
}

Z_INTERNAL uint32_t crc32_stub(uint32_t crc, const unsigned char *buf, uint64_t len) {
    Assert(sizeof(uint64_t) >= sizeof(size_t), "crc32_z takes size_t but internally we have a uint64_t len");
    // Initialize default
    functable.crc32 = &crc32_braid;
    cpu_check_features();

#ifdef ARM_ACLE_CRC_HASH
    if (arm_cpu_has_crc32)
        functable.crc32 = &crc32_acle;
#elif defined(POWER8_VSX_CRC32)
    if (power_cpu_has_arch_2_07)
        functable.crc32 = &crc32_power8;
#elif defined(S390_CRC32_VX)
    if (PREFIX(s390_cpu_has_vx))
        functable.crc32 = &PREFIX(s390_crc32_vx);
#elif defined(X86_PCLMULQDQ_CRC)
    if (x86_cpu_has_pclmulqdq)
        functable.crc32 = &crc32_pclmulqdq;
#endif

    return functable.crc32(crc, buf, len);
}

Z_INTERNAL uint32_t compare256_stub(const uint8_t *src0, const uint8_t *src1) {
    // Initialize default
#ifdef UNALIGNED_OK
# if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL)
    functable.compare256 = &compare256_unaligned_64;
# elif defined(HAVE_BUILTIN_CTZ)
    functable.compare256 = &compare256_unaligned_32;
# else
    functable.compare256 = &compare256_unaligned_16;
# endif
#else
    functable.compare256 = &compare256_c;
#endif

    cpu_check_features();  /* ensure feature flags are initialized even if this stub runs first */

#if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)
    if (x86_cpu_has_sse2)
        functable.compare256 = &compare256_sse2;
#endif
#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
    if (x86_cpu_has_avx2)
        functable.compare256 = &compare256_avx2;
#endif
#ifdef POWER9
    if (power_cpu_has_arch_3_00)
        functable.compare256 = &compare256_power9;
#endif

    return functable.compare256(src0, src1);
}

/* functable init */
Z_INTERNAL Z_TLS struct functable_s functable = {
    adler32_stub,
    adler32_fold_copy_stub,
    crc32_stub,
    crc32_fold_reset_stub,
    crc32_fold_copy_stub,
    crc32_fold_stub,
    crc32_fold_final_stub,
    compare256_stub,
    chunksize_stub,
    chunkcopy_stub,
    chunkunroll_stub,
    chunkmemset_stub,
    chunkmemset_safe_stub,
    insert_string_stub,
    longest_match_stub,
    longest_match_slow_stub,
    quick_insert_string_stub,
    slide_hash_stub,
    update_hash_stub
};
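
/*
 * Dispatch sketch (explanatory note; illustrates intended use, the call shown
 * is an assumption about a typical internal caller rather than a quote from
 * one): internal code invokes entries through the table, e.g.
 *
 *     uint32_t crc = functable.crc32(0, buf, len);
 *
 * The first such call lands in crc32_stub above, which detects CPU features,
 * rewrites functable.crc32 to point at the best available implementation, and
 * forwards the call. Every later call through functable.crc32 dispatches
 * directly, so feature detection and selection cost is paid only once per
 * entry (per thread, when Z_TLS makes the table thread-local).
 */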