1 #ifndef FALLBACK_BUILTINS_H
2 #define FALLBACK_BUILTINS_H
3 
4 #if defined(_MSC_VER) && !defined(__clang__)
5 #if defined(_M_IX86) || defined(_M_AMD64) || defined(_M_IA64) ||  defined(_M_ARM) || defined(_M_ARM64)
6 
7 #include <intrin.h>
8 #ifdef X86_FEATURES
9 #  include "arch/x86/x86_features.h"
10 #endif
11 
12 /* This is not a general purpose replacement for __builtin_ctz. The function expects that value is != 0
13  * Because of that assumption trailing_zero is not initialized and the return value of _BitScanForward is not checked
14  */
__builtin_ctz(uint32_t value)15 static __forceinline unsigned long __builtin_ctz(uint32_t value) {
16 #ifdef X86_FEATURES
17 #  ifndef X86_NOCHECK_TZCNT
18     if (x86_cpu_has_tzcnt)
19 #  endif
20         return _tzcnt_u32(value);
21 #endif
22     unsigned long trailing_zero;
23     _BitScanForward(&trailing_zero, value);
24     return trailing_zero;
25 }
26 #define HAVE_BUILTIN_CTZ
27 
28 #ifdef _M_AMD64
29 /* This is not a general purpose replacement for __builtin_ctzll. The function expects that value is != 0
30  * Because of that assumption trailing_zero is not initialized and the return value of _BitScanForward64 is not checked
31  */
__builtin_ctzll(uint64_t value)32 static __forceinline unsigned long long __builtin_ctzll(uint64_t value) {
33 #ifdef X86_FEATURES
34 #  ifndef X86_NOCHECK_TZCNT
35     if (x86_cpu_has_tzcnt)
36 #  endif
37         return _tzcnt_u64(value);
38 #endif
39     unsigned long trailing_zero;
40     _BitScanForward64(&trailing_zero, value);
41     return trailing_zero;
42 }
43 #define HAVE_BUILTIN_CTZLL
44 #endif // Microsoft AMD64
45 
46 #endif // Microsoft AMD64/IA64/x86/ARM/ARM64 test
47 #endif // _MSC_VER & !clang
48 
/* GCC did not support the _mm256_zextsi128_si256/_mm512_zextsi128_si512
 * intrinsics until version 10. Similarly, AppleClang did not support them in
 * Xcode 9.2 but did in 9.3, hence the fallback definitions below.
 */
52 #ifdef __AVX2__
53 #include <immintrin.h>
54 
55 #if (!defined(__clang__) && defined(__GNUC__) && __GNUC__ < 10) \
56     || (defined(__apple_build_version__) && __apple_build_version__ < 9020039)
/* Zero-extend a 128-bit vector into the low half of a 256-bit vector.
 * _mm256_castsi128_si256 alone leaves the upper 128 bits undefined; the inline
 * asm forces a VEX-encoded vmovdqa, which zeroes the upper bits of the
 * destination register, so the subsequent cast is safe. The asm is volatile so
 * the compiler cannot elide the move and reintroduce the undefined upper half.
 */
static inline __m256i _mm256_zextsi128_si256(__m128i a) {
    __m128i r;
    __asm__ volatile ("vmovdqa %1,%0" : "=x" (r) : "x" (a));
    return _mm256_castsi128_si256(r);
}
62 
63 #ifdef __AVX512F__
/* Zero-extend a 128-bit vector into the low 128 bits of a 512-bit vector.
 * Same technique as the 256-bit variant: the VEX-encoded vmovdqa emitted by
 * the volatile asm zeroes the upper bits of the destination register, making
 * the otherwise "upper bits undefined" cast well-defined.
 */
static inline __m512i _mm512_zextsi128_si512(__m128i a) {
    __m128i r;
    __asm__ volatile ("vmovdqa %1,%0" : "=x" (r) : "x" (a));
    return _mm512_castsi128_si512(r);
}
69 #endif // __AVX512F__
70 #endif // gcc/AppleClang version test
71 
72 #endif // __AVX2__
73 
#if defined(ARM_NEON_ADLER32) && !defined(__aarch64__)
/* Compatibility shims for the _high family of intrinsics, which exist only on
 * AArch64: emulate them on 32-bit ARM by extracting the high 64-bit half of
 * each 128-bit operand and calling the plain widening variants.
 * NOTE: as macros, arguments may be evaluated more than once — pass only
 * side-effect-free expressions. */
#define vmull_high_u8(a, b) vmull_u8(vget_high_u8(a), vget_high_u8(b))
#define vmlal_high_u8(a, b, c) vmlal_u8(a, vget_high_u8(b), vget_high_u8(c))
#define vmlal_high_u16(a, b, c) vmlal_u16(a, vget_high_u16(b), vget_high_u16(c))
#define vaddw_high_u8(a, b) vaddw_u8(a, vget_high_u8(b))
#endif
81 
82 #ifdef ARM_NEON_SLIDEHASH
83 
/* Quad-register saturating subtract: out.val[i] = saturating(a.val[i] - b)
 * for each of the four uint16x8 lanes of a uint16x8x4_t.
 * NOTE: a macro, so 'a' and 'b' are evaluated four times each — pass only
 * side-effect-free expressions. */
#define vqsubq_u16_x4_x1(out, a, b) do { \
    out.val[0] = vqsubq_u16(a.val[0], b); \
    out.val[1] = vqsubq_u16(a.val[1], b); \
    out.val[2] = vqsubq_u16(a.val[2], b); \
    out.val[3] = vqsubq_u16(a.val[3], b); \
} while (0)
90 
91 /* Have to check for hard float ABI on GCC/clang, but not
92  * on MSVC (we don't compile for the soft float ABI on windows)
93  */
94 #if !defined(ARM_NEON_HASLD4) && (defined(__ARM_FP) || defined(_MSC_VER))
95 
96 #ifdef _M_ARM64
97 #  include <arm64_neon.h>
98 #else
99 #  include <arm_neon.h>
100 #endif
101 
vld1q_u16_x4(uint16_t * a)102 static inline uint16x8x4_t vld1q_u16_x4(uint16_t *a) {
103     uint16x8x4_t ret = (uint16x8x4_t) {{
104                           vld1q_u16(a),
105                           vld1q_u16(a+8),
106                           vld1q_u16(a+16),
107                           vld1q_u16(a+24)}};
108     return ret;
109 }
110 
vld1q_u8_x4(uint8_t * a)111 static inline uint8x16x4_t vld1q_u8_x4(uint8_t *a) {
112     uint8x16x4_t ret = (uint8x16x4_t) {{
113                           vld1q_u8(a),
114                           vld1q_u8(a+16),
115                           vld1q_u8(a+32),
116                           vld1q_u8(a+48)}};
117     return ret;
118 }
119 
/* Fallback for toolchains lacking the vst1q_u16_x4 intrinsic: store four
 * consecutive q registers (32 uint16_t values) to p. */
static inline void vst1q_u16_x4(uint16_t *p, uint16x8x4_t a) {
    for (int lane = 0; lane < 4; lane++) {
        vst1q_u16(p + lane * 8, a.val[lane]);
    }
}
126 #endif // HASLD4 check and hard float
127 #endif // ARM_NEON_SLIDEHASH
128 
129 #endif // include guard FALLBACK_BUILTINS_H
130