xref: /aosp_15_r20/external/XNNPACK/src/xnnpack/intrinsics-polyfill.h (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1*4bdc9457SAndroid Build Coastguard Worker // Copyright 2019 Google LLC
2*4bdc9457SAndroid Build Coastguard Worker //
3*4bdc9457SAndroid Build Coastguard Worker // This source code is licensed under the BSD-style license found in the
4*4bdc9457SAndroid Build Coastguard Worker // LICENSE file in the root directory of this source tree.
5*4bdc9457SAndroid Build Coastguard Worker 
6*4bdc9457SAndroid Build Coastguard Worker #pragma once
7*4bdc9457SAndroid Build Coastguard Worker 
8*4bdc9457SAndroid Build Coastguard Worker #include <xnnpack/common.h>
9*4bdc9457SAndroid Build Coastguard Worker #include <xnnpack/unaligned.h>
10*4bdc9457SAndroid Build Coastguard Worker 
11*4bdc9457SAndroid Build Coastguard Worker 
12*4bdc9457SAndroid Build Coastguard Worker #if defined(__SSE2__)
13*4bdc9457SAndroid Build Coastguard Worker #include <emmintrin.h>
14*4bdc9457SAndroid Build Coastguard Worker 
15*4bdc9457SAndroid Build Coastguard Worker // GCC pre-11, Clang pre-8, Android NDK Clang pre-8.0.7, Apple Clang pre-11, and ICC pre-16
16*4bdc9457SAndroid Build Coastguard Worker #if (defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER) && __GNUC__ < 11) || \
17*4bdc9457SAndroid Build Coastguard Worker     (defined(__clang__) && !defined(__apple_build_version__) && (__clang_major__ < 8)) || \
18*4bdc9457SAndroid Build Coastguard Worker     (defined(__clang__) && defined(__ANDROID__) && (__clang_major__ == 8) && (__clang_minor__ == 0) && (__clang_patchlevel__ < 7)) || \
19*4bdc9457SAndroid Build Coastguard Worker     (defined(__clang__) && defined(__apple_build_version__) && (__apple_build_version__ < 11000000)) || \
20*4bdc9457SAndroid Build Coastguard Worker     (defined(__INTEL_COMPILER) && (__INTEL_COMPILER < 1600))
21*4bdc9457SAndroid Build Coastguard Worker 
22*4bdc9457SAndroid Build Coastguard Worker static XNN_INTRINSIC
_mm_storeu_si32(void * address,__m128i v)23*4bdc9457SAndroid Build Coastguard Worker void _mm_storeu_si32(void* address, __m128i v) {
24*4bdc9457SAndroid Build Coastguard Worker   unaligned_store_u32(address, (uint32_t) _mm_cvtsi128_si32(v));
25*4bdc9457SAndroid Build Coastguard Worker }
26*4bdc9457SAndroid Build Coastguard Worker 
27*4bdc9457SAndroid Build Coastguard Worker static XNN_INTRINSIC
_mm_storeu_si16(void * address,__m128i v)28*4bdc9457SAndroid Build Coastguard Worker void _mm_storeu_si16(void* address, __m128i v) {
29*4bdc9457SAndroid Build Coastguard Worker   unaligned_store_u16(address, (uint16_t) _mm_extract_epi16(v, 0));
30*4bdc9457SAndroid Build Coastguard Worker }
31*4bdc9457SAndroid Build Coastguard Worker #endif  // GCC pre-11, Clang pre-8, Android NDK Clang pre-8.0.7, Apple Clang pre-11, and ICC pre-16
32*4bdc9457SAndroid Build Coastguard Worker #endif  // SSE2
33*4bdc9457SAndroid Build Coastguard Worker 
34*4bdc9457SAndroid Build Coastguard Worker #ifdef __AVX512F__
35*4bdc9457SAndroid Build Coastguard Worker #include <immintrin.h>
36*4bdc9457SAndroid Build Coastguard Worker 
37*4bdc9457SAndroid Build Coastguard Worker // GCC pre-7, Clang pre-8, Android NDK Clang pre-8.0.7, Apple Clang pre-11, ICC pre-18, and MSVC pre-2019
38*4bdc9457SAndroid Build Coastguard Worker #if (defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER) && (__GNUC__ < 7)) || \
39*4bdc9457SAndroid Build Coastguard Worker     (defined(__clang__) && !defined(__apple_build_version__) && (__clang_major__ < 8)) || \
40*4bdc9457SAndroid Build Coastguard Worker     (defined(__clang__) && defined(__ANDROID__) && (__clang_major__ == 8) && (__clang_minor__ == 0) && (__clang_patchlevel__ < 7)) || \
41*4bdc9457SAndroid Build Coastguard Worker     (defined(__clang__) && defined(__apple_build_version__) && (__apple_build_version__ < 11000000)) || \
42*4bdc9457SAndroid Build Coastguard Worker     (defined(__INTEL_COMPILER) && (__INTEL_COMPILER < 1800)) || \
43*4bdc9457SAndroid Build Coastguard Worker     (defined(_MSC_VER) && !defined(__clang__) && !defined(__GNUC__) && (_MSC_VER <= 1916))
44*4bdc9457SAndroid Build Coastguard Worker 
45*4bdc9457SAndroid Build Coastguard Worker static XNN_INTRINSIC
_cvtu32_mask16(unsigned int mask)46*4bdc9457SAndroid Build Coastguard Worker __mmask16 _cvtu32_mask16(unsigned int mask) {
47*4bdc9457SAndroid Build Coastguard Worker   return (__mmask16) mask;
48*4bdc9457SAndroid Build Coastguard Worker }
49*4bdc9457SAndroid Build Coastguard Worker 
50*4bdc9457SAndroid Build Coastguard Worker static XNN_INTRINSIC
_cvtu64_mask64(unsigned long long mask)51*4bdc9457SAndroid Build Coastguard Worker __mmask64 _cvtu64_mask64(unsigned long long mask) {
52*4bdc9457SAndroid Build Coastguard Worker   return (__mmask64) mask;
53*4bdc9457SAndroid Build Coastguard Worker }
54*4bdc9457SAndroid Build Coastguard Worker 
55*4bdc9457SAndroid Build Coastguard Worker static XNN_INTRINSIC
_kshiftli_mask64(__mmask64 a,unsigned int count)56*4bdc9457SAndroid Build Coastguard Worker __mmask64 _kshiftli_mask64(__mmask64 a, unsigned int count) {
57*4bdc9457SAndroid Build Coastguard Worker   return (__mmask64) ((unsigned long long) a << count);
58*4bdc9457SAndroid Build Coastguard Worker }
59*4bdc9457SAndroid Build Coastguard Worker 
60*4bdc9457SAndroid Build Coastguard Worker static XNN_INTRINSIC
_kshiftri_mask64(__mmask64 a,unsigned int count)61*4bdc9457SAndroid Build Coastguard Worker __mmask64 _kshiftri_mask64(__mmask64 a, unsigned int count) {
62*4bdc9457SAndroid Build Coastguard Worker   return (__mmask64) ((unsigned long long) a >> count);
63*4bdc9457SAndroid Build Coastguard Worker }
64*4bdc9457SAndroid Build Coastguard Worker 
#endif  // GCC pre-7, Clang pre-8, Android NDK Clang pre-8.0.7, Apple Clang pre-11, ICC pre-18, and MSVC pre-2019
66*4bdc9457SAndroid Build Coastguard Worker 
67*4bdc9457SAndroid Build Coastguard Worker // GCC pre-7, Clang pre-4, and ICC pre-18
68*4bdc9457SAndroid Build Coastguard Worker #if (defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER) && (__GNUC__ < 7)) || \
69*4bdc9457SAndroid Build Coastguard Worker     (defined(__clang__) && (__clang_major__ < 4)) || \
70*4bdc9457SAndroid Build Coastguard Worker     (defined(__INTEL_COMPILER) && (__INTEL_COMPILER < 1800))
71*4bdc9457SAndroid Build Coastguard Worker 
72*4bdc9457SAndroid Build Coastguard Worker static XNN_INTRINSIC
_mm512_reduce_add_ps(__m512 v)73*4bdc9457SAndroid Build Coastguard Worker float _mm512_reduce_add_ps(__m512 v) {
74*4bdc9457SAndroid Build Coastguard Worker #if __AVX512DQ__
75*4bdc9457SAndroid Build Coastguard Worker   const __m256 sum2 = _mm256_add_ps(_mm512_castps512_ps256(v), _mm512_extractf32x8_ps(v, 1));
76*4bdc9457SAndroid Build Coastguard Worker #else
77*4bdc9457SAndroid Build Coastguard Worker   const __m256 sum2 = _mm256_add_ps(_mm512_castps512_ps256(v), _mm256_castpd_ps(_mm512_extractf64x4_pd(_mm512_castps_pd(v), 1)));
78*4bdc9457SAndroid Build Coastguard Worker #endif
79*4bdc9457SAndroid Build Coastguard Worker   const __m128 sum4 = _mm_add_ps(_mm256_castps256_ps128(sum2), _mm256_extractf128_ps(sum2, 1));
80*4bdc9457SAndroid Build Coastguard Worker   const __m128 sum8 = _mm_add_ps(sum4, _mm_movehl_ps(sum4, sum4));
81*4bdc9457SAndroid Build Coastguard Worker   const __m128 sum16 = _mm_add_ss(sum8, _mm_movehdup_ps(sum8));
82*4bdc9457SAndroid Build Coastguard Worker   return _mm_cvtss_f32(sum16);
83*4bdc9457SAndroid Build Coastguard Worker }
84*4bdc9457SAndroid Build Coastguard Worker 
85*4bdc9457SAndroid Build Coastguard Worker static XNN_INTRINSIC
_mm512_reduce_max_ps(__m512 v)86*4bdc9457SAndroid Build Coastguard Worker float _mm512_reduce_max_ps(__m512 v) {
87*4bdc9457SAndroid Build Coastguard Worker #if __AVX512DQ__
88*4bdc9457SAndroid Build Coastguard Worker   const __m256 sum2 = _mm256_max_ps(_mm512_castps512_ps256(v), _mm512_extractf32x8_ps(v, 1));
89*4bdc9457SAndroid Build Coastguard Worker #else
90*4bdc9457SAndroid Build Coastguard Worker   const __m256 sum2 = _mm256_max_ps(_mm512_castps512_ps256(v), _mm256_castpd_ps(_mm512_extractf64x4_pd(_mm512_castps_pd(v), 1)));
91*4bdc9457SAndroid Build Coastguard Worker #endif
92*4bdc9457SAndroid Build Coastguard Worker   const __m128 sum4 = _mm_max_ps(_mm256_castps256_ps128(sum2), _mm256_extractf128_ps(sum2, 1));
93*4bdc9457SAndroid Build Coastguard Worker   const __m128 sum8 = _mm_max_ps(sum4, _mm_movehl_ps(sum4, sum4));
94*4bdc9457SAndroid Build Coastguard Worker   const __m128 sum16 = _mm_max_ss(sum8, _mm_movehdup_ps(sum8));
95*4bdc9457SAndroid Build Coastguard Worker   return _mm_cvtss_f32(sum16);
96*4bdc9457SAndroid Build Coastguard Worker }
97*4bdc9457SAndroid Build Coastguard Worker 
98*4bdc9457SAndroid Build Coastguard Worker #endif  // GCC pre-7, Clang pre-4, and ICC pre-18
99*4bdc9457SAndroid Build Coastguard Worker 
100*4bdc9457SAndroid Build Coastguard Worker #if defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER) && (__GNUC__ < 9)
101*4bdc9457SAndroid Build Coastguard Worker static XNN_INTRINSIC
_mm512_set_epi8(char e63,char e62,char e61,char e60,char e59,char e58,char e57,char e56,char e55,char e54,char e53,char e52,char e51,char e50,char e49,char e48,char e47,char e46,char e45,char e44,char e43,char e42,char e41,char e40,char e39,char e38,char e37,char e36,char e35,char e34,char e33,char e32,char e31,char e30,char e29,char e28,char e27,char e26,char e25,char e24,char e23,char e22,char e21,char e20,char e19,char e18,char e17,char e16,char e15,char e14,char e13,char e12,char e11,char e10,char e09,char e08,char e07,char e06,char e05,char e04,char e03,char e02,char e01,char e00)102*4bdc9457SAndroid Build Coastguard Worker __m512i _mm512_set_epi8(
103*4bdc9457SAndroid Build Coastguard Worker   char e63, char e62, char e61, char e60,
104*4bdc9457SAndroid Build Coastguard Worker   char e59, char e58, char e57, char e56,
105*4bdc9457SAndroid Build Coastguard Worker   char e55, char e54, char e53, char e52,
106*4bdc9457SAndroid Build Coastguard Worker   char e51, char e50, char e49, char e48,
107*4bdc9457SAndroid Build Coastguard Worker   char e47, char e46, char e45, char e44,
108*4bdc9457SAndroid Build Coastguard Worker   char e43, char e42, char e41, char e40,
109*4bdc9457SAndroid Build Coastguard Worker   char e39, char e38, char e37, char e36,
110*4bdc9457SAndroid Build Coastguard Worker   char e35, char e34, char e33, char e32,
111*4bdc9457SAndroid Build Coastguard Worker   char e31, char e30, char e29, char e28,
112*4bdc9457SAndroid Build Coastguard Worker   char e27, char e26, char e25, char e24,
113*4bdc9457SAndroid Build Coastguard Worker   char e23, char e22, char e21, char e20,
114*4bdc9457SAndroid Build Coastguard Worker   char e19, char e18, char e17, char e16,
115*4bdc9457SAndroid Build Coastguard Worker   char e15, char e14, char e13, char e12,
116*4bdc9457SAndroid Build Coastguard Worker   char e11, char e10, char e09, char e08,
117*4bdc9457SAndroid Build Coastguard Worker   char e07, char e06, char e05, char e04,
118*4bdc9457SAndroid Build Coastguard Worker   char e03, char e02, char e01, char e00)
119*4bdc9457SAndroid Build Coastguard Worker {
120*4bdc9457SAndroid Build Coastguard Worker   return (__m512i) (__v64qi) {
121*4bdc9457SAndroid Build Coastguard Worker     e00, e01, e02, e03, e04, e05, e06, e07,
122*4bdc9457SAndroid Build Coastguard Worker     e08, e09, e10, e11, e12, e13, e14, e15,
123*4bdc9457SAndroid Build Coastguard Worker     e16, e17, e18, e19, e20, e21, e22, e23,
124*4bdc9457SAndroid Build Coastguard Worker     e24, e25, e26, e27, e28, e29, e30, e31,
125*4bdc9457SAndroid Build Coastguard Worker     e32, e33, e34, e35, e36, e37, e38, e39,
126*4bdc9457SAndroid Build Coastguard Worker     e40, e41, e42, e43, e44, e45, e46, e47,
127*4bdc9457SAndroid Build Coastguard Worker     e48, e49, e50, e51, e52, e53, e54, e55,
128*4bdc9457SAndroid Build Coastguard Worker     e56, e57, e58, e59, e60, e61, e62, e63
129*4bdc9457SAndroid Build Coastguard Worker   };
130*4bdc9457SAndroid Build Coastguard Worker }
131*4bdc9457SAndroid Build Coastguard Worker #endif  // GCC pre-9
132*4bdc9457SAndroid Build Coastguard Worker 
133*4bdc9457SAndroid Build Coastguard Worker #endif  // __AVX512F__
134*4bdc9457SAndroid Build Coastguard Worker 
135*4bdc9457SAndroid Build Coastguard Worker #if XNN_ARCH_ARM
136*4bdc9457SAndroid Build Coastguard Worker 
137*4bdc9457SAndroid Build Coastguard Worker // AArch32 GCC 10+ implements arm_acle.h header, but lacks __ror intrinsic
138*4bdc9457SAndroid Build Coastguard Worker #if defined(__GNUC__) && !defined(__clang__)
__ror(uint32_t x,uint32_t y)139*4bdc9457SAndroid Build Coastguard Worker static XNN_INTRINSIC uint32_t __ror(uint32_t x, uint32_t y) {
140*4bdc9457SAndroid Build Coastguard Worker    return (x >> y) | (x << (32 - y));
141*4bdc9457SAndroid Build Coastguard Worker }
142*4bdc9457SAndroid Build Coastguard Worker #endif  // AArch32 GCC
143*4bdc9457SAndroid Build Coastguard Worker 
144*4bdc9457SAndroid Build Coastguard Worker #endif  // ARM
145*4bdc9457SAndroid Build Coastguard Worker 
146*4bdc9457SAndroid Build Coastguard Worker #if XNN_ARCH_ARM && (defined(__ARM_NEON) || defined(__ARM_NEON__))
147*4bdc9457SAndroid Build Coastguard Worker #include <arm_neon.h>
148*4bdc9457SAndroid Build Coastguard Worker 
149*4bdc9457SAndroid Build Coastguard Worker // AArch32 GCC targeting ARMv8 NEON, see
150*4bdc9457SAndroid Build Coastguard Worker // - https://gcc.gnu.org/bugzilla/show_bug.cgi?id=71233
151*4bdc9457SAndroid Build Coastguard Worker // - https://gcc.gnu.org/bugzilla/show_bug.cgi?id=95399
152*4bdc9457SAndroid Build Coastguard Worker #if defined(__GNUC__) && !defined(__clang__) && (__ARM_ARCH >= 8)
153*4bdc9457SAndroid Build Coastguard Worker static XNN_INTRINSIC
vcvtnq_s32_f32(float32x4_t v)154*4bdc9457SAndroid Build Coastguard Worker int32x4_t vcvtnq_s32_f32(float32x4_t v) {
155*4bdc9457SAndroid Build Coastguard Worker   return vcvtq_s32_f32(vrndnq_f32(v));
156*4bdc9457SAndroid Build Coastguard Worker }
157*4bdc9457SAndroid Build Coastguard Worker #endif  // AArch32 GCC targeting ARMv8 NEON
158*4bdc9457SAndroid Build Coastguard Worker 
159*4bdc9457SAndroid Build Coastguard Worker #endif  // ARM NEON
160*4bdc9457SAndroid Build Coastguard Worker 
161*4bdc9457SAndroid Build Coastguard Worker #if XNN_ARCH_ARM64
162*4bdc9457SAndroid Build Coastguard Worker #include <arm_neon.h>
163*4bdc9457SAndroid Build Coastguard Worker 
164*4bdc9457SAndroid Build Coastguard Worker // AArch64 GCC pre-8, 8.1-8.4, 9.1-9.3
165*4bdc9457SAndroid Build Coastguard Worker #if defined(__GNUC__) && !defined(__clang__) && \
166*4bdc9457SAndroid Build Coastguard Worker   (__GNUC__ < 8 || __GNUC__ == 8 && __GNUC_MINOR__ < 5 || __GNUC__ == 9 && __GNUC_MINOR__ < 4)
167*4bdc9457SAndroid Build Coastguard Worker static XNN_INTRINSIC
vld1q_u8_x4(const uint8_t * address)168*4bdc9457SAndroid Build Coastguard Worker uint8x16x4_t vld1q_u8_x4(const uint8_t* address) {
169*4bdc9457SAndroid Build Coastguard Worker   uint8x16x4_t result;
170*4bdc9457SAndroid Build Coastguard Worker   result.val[0] = vld1q_u8(address);
171*4bdc9457SAndroid Build Coastguard Worker   result.val[1] = vld1q_u8(address + 16);
172*4bdc9457SAndroid Build Coastguard Worker   result.val[2] = vld1q_u8(address + 32);
173*4bdc9457SAndroid Build Coastguard Worker   result.val[3] = vld1q_u8(address + 48);
174*4bdc9457SAndroid Build Coastguard Worker   return result;
175*4bdc9457SAndroid Build Coastguard Worker }
176*4bdc9457SAndroid Build Coastguard Worker #endif  // AArch64 GCC pre-8, 8.1-8.4, 9.1-9.3
177*4bdc9457SAndroid Build Coastguard Worker 
178*4bdc9457SAndroid Build Coastguard Worker #endif  // ARM64 NEON
179