xref: /aosp_15_r20/external/pffft/sse2neon.h (revision 3f1979aa0d7ad34fcf3763de7b7b8f8cd67e5bdd)
1*3f1979aaSAndroid Build Coastguard Worker #ifndef SSE2NEON_H
2*3f1979aaSAndroid Build Coastguard Worker #define SSE2NEON_H
3*3f1979aaSAndroid Build Coastguard Worker 
// This header file provides a simple API translation layer
// from SSE intrinsics to their corresponding Arm/Aarch64 NEON versions
6*3f1979aaSAndroid Build Coastguard Worker //
7*3f1979aaSAndroid Build Coastguard Worker // This header file does not yet translate all of the SSE intrinsics.
8*3f1979aaSAndroid Build Coastguard Worker //
9*3f1979aaSAndroid Build Coastguard Worker // Contributors to this work are:
10*3f1979aaSAndroid Build Coastguard Worker //   John W. Ratcliff <[email protected]>
11*3f1979aaSAndroid Build Coastguard Worker //   Brandon Rowlett <[email protected]>
12*3f1979aaSAndroid Build Coastguard Worker //   Ken Fast <[email protected]>
13*3f1979aaSAndroid Build Coastguard Worker //   Eric van Beurden <[email protected]>
14*3f1979aaSAndroid Build Coastguard Worker //   Alexander Potylitsin <[email protected]>
15*3f1979aaSAndroid Build Coastguard Worker //   Hasindu Gamaarachchi <[email protected]>
16*3f1979aaSAndroid Build Coastguard Worker //   Jim Huang <[email protected]>
17*3f1979aaSAndroid Build Coastguard Worker //   Mark Cheng <[email protected]>
18*3f1979aaSAndroid Build Coastguard Worker //   Malcolm James MacLeod <[email protected]>
19*3f1979aaSAndroid Build Coastguard Worker //   Devin Hussey (easyaspi314) <[email protected]>
20*3f1979aaSAndroid Build Coastguard Worker //   Sebastian Pop <[email protected]>
21*3f1979aaSAndroid Build Coastguard Worker //   Developer Ecosystem Engineering <[email protected]>
22*3f1979aaSAndroid Build Coastguard Worker //   Danila Kutenin <[email protected]>
23*3f1979aaSAndroid Build Coastguard Worker //   François Turban (JishinMaster) <[email protected]>
24*3f1979aaSAndroid Build Coastguard Worker //   Pei-Hsuan Hung <[email protected]>
25*3f1979aaSAndroid Build Coastguard Worker //   Yang-Hao Yuan <[email protected]>
26*3f1979aaSAndroid Build Coastguard Worker 
27*3f1979aaSAndroid Build Coastguard Worker /*
28*3f1979aaSAndroid Build Coastguard Worker  * sse2neon is freely redistributable under the MIT License.
29*3f1979aaSAndroid Build Coastguard Worker  *
30*3f1979aaSAndroid Build Coastguard Worker  * Permission is hereby granted, free of charge, to any person obtaining a copy
31*3f1979aaSAndroid Build Coastguard Worker  * of this software and associated documentation files (the "Software"), to deal
32*3f1979aaSAndroid Build Coastguard Worker  * in the Software without restriction, including without limitation the rights
33*3f1979aaSAndroid Build Coastguard Worker  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
34*3f1979aaSAndroid Build Coastguard Worker  * copies of the Software, and to permit persons to whom the Software is
35*3f1979aaSAndroid Build Coastguard Worker  * furnished to do so, subject to the following conditions:
36*3f1979aaSAndroid Build Coastguard Worker  *
37*3f1979aaSAndroid Build Coastguard Worker  * The above copyright notice and this permission notice shall be included in
38*3f1979aaSAndroid Build Coastguard Worker  * all copies or substantial portions of the Software.
39*3f1979aaSAndroid Build Coastguard Worker  *
40*3f1979aaSAndroid Build Coastguard Worker  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
41*3f1979aaSAndroid Build Coastguard Worker  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
42*3f1979aaSAndroid Build Coastguard Worker  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
43*3f1979aaSAndroid Build Coastguard Worker  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
44*3f1979aaSAndroid Build Coastguard Worker  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
45*3f1979aaSAndroid Build Coastguard Worker  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
46*3f1979aaSAndroid Build Coastguard Worker  * SOFTWARE.
47*3f1979aaSAndroid Build Coastguard Worker  */
48*3f1979aaSAndroid Build Coastguard Worker 
49*3f1979aaSAndroid Build Coastguard Worker /* Tunable configurations */
50*3f1979aaSAndroid Build Coastguard Worker 
/* Enable precise implementation of _mm_min_ps and _mm_max_ps.
 * This slows the computation down a bit, but gives results consistent with
 * x86 SSE2 (e.g. it would solve a hole or NaN pixel in a rendering result).
 *
 * The default is 0 (disabled). Define SSE2NEON_PRECISE_MINMAX before this
 * point to override it; the #ifndef keeps an earlier definition intact.
 */
#ifndef SSE2NEON_PRECISE_MINMAX
#define SSE2NEON_PRECISE_MINMAX (0)
#endif
58*3f1979aaSAndroid Build Coastguard Worker 
/* Helper macros used throughout this header:
 *   FORCE_INLINE    - storage/inline specifiers for the intrinsic wrappers
 *   ALIGN_STRUCT(x) - request x-byte alignment for a struct/union
 *
 * On GCC-compatible compilers any definitions that already exist are saved
 * with push_macro before the names are defined here.
 * NOTE(review): a matching pop_macro is presumably issued later in the file;
 * it is not visible from this section.
 *
 * Any other compiler is rejected by the #error below, so the definitions
 * that follow it only document the intended fallback.
 */
#if defined(__GNUC__) || defined(__clang__)
#pragma push_macro("FORCE_INLINE")
#pragma push_macro("ALIGN_STRUCT")
#define FORCE_INLINE static inline __attribute__((always_inline))
#define ALIGN_STRUCT(x) __attribute__((aligned(x)))
#else
#error "Macro name collisions may happen with unsupported compiler."
#ifdef FORCE_INLINE
#undef FORCE_INLINE
#endif
#define FORCE_INLINE static inline
#ifndef ALIGN_STRUCT
#define ALIGN_STRUCT(x) __declspec(align(x))
#endif
#endif
74*3f1979aaSAndroid Build Coastguard Worker 
75*3f1979aaSAndroid Build Coastguard Worker #include <stdint.h>
76*3f1979aaSAndroid Build Coastguard Worker #include <stdlib.h>
77*3f1979aaSAndroid Build Coastguard Worker 
78*3f1979aaSAndroid Build Coastguard Worker /* Architecture-specific build options */
79*3f1979aaSAndroid Build Coastguard Worker /* FIXME: #pragma GCC push_options is only available on GCC */
80*3f1979aaSAndroid Build Coastguard Worker #if defined(__GNUC__)
81*3f1979aaSAndroid Build Coastguard Worker #if defined(__arm__) && __ARM_ARCH == 7
82*3f1979aaSAndroid Build Coastguard Worker /* According to ARM C Language Extensions Architecture specification,
83*3f1979aaSAndroid Build Coastguard Worker  * __ARM_NEON is defined to a value indicating the Advanced SIMD (NEON)
84*3f1979aaSAndroid Build Coastguard Worker  * architecture supported.
85*3f1979aaSAndroid Build Coastguard Worker  */
86*3f1979aaSAndroid Build Coastguard Worker #if !defined(__ARM_NEON) || !defined(__ARM_NEON__)
87*3f1979aaSAndroid Build Coastguard Worker #error "You must enable NEON instructions (e.g. -mfpu=neon) to use SSE2NEON."
88*3f1979aaSAndroid Build Coastguard Worker #endif
89*3f1979aaSAndroid Build Coastguard Worker #pragma GCC push_options
90*3f1979aaSAndroid Build Coastguard Worker #pragma GCC target("fpu=neon")
91*3f1979aaSAndroid Build Coastguard Worker #elif defined(__aarch64__)
92*3f1979aaSAndroid Build Coastguard Worker #pragma GCC push_options
93*3f1979aaSAndroid Build Coastguard Worker #pragma GCC target("+simd")
94*3f1979aaSAndroid Build Coastguard Worker #else
95*3f1979aaSAndroid Build Coastguard Worker #error "Unsupported target. Must be either ARMv7-A+NEON or ARMv8-A."
96*3f1979aaSAndroid Build Coastguard Worker #endif
97*3f1979aaSAndroid Build Coastguard Worker #endif
98*3f1979aaSAndroid Build Coastguard Worker 
99*3f1979aaSAndroid Build Coastguard Worker #include <arm_neon.h>
100*3f1979aaSAndroid Build Coastguard Worker 
/* Rounding functions require either Aarch64 instructions or libm fallback */
102*3f1979aaSAndroid Build Coastguard Worker #if !defined(__aarch64__)
103*3f1979aaSAndroid Build Coastguard Worker #include <math.h>
104*3f1979aaSAndroid Build Coastguard Worker #endif
105*3f1979aaSAndroid Build Coastguard Worker 
/* "__has_builtin" can be used to query support for built-in functions
 * provided by gcc/clang and other compilers that support it.
 *
 * When the compiler does not provide it, a replacement is defined here:
 *   - __GNUC__ <= 9: __has_builtin(x) pastes to the token HAS<x>. Only the
 *     two popcount built-ins listed below are mapped to 1; any other name
 *     pastes to an undefined identifier, which evaluates to 0 inside #if.
 *   - otherwise: every query answers 0.
 */
#ifndef __has_builtin /* GCC prior to 10 or non-clang compilers */
/* Compatibility with gcc <= 9 */
#if __GNUC__ <= 9
#define __has_builtin(x) HAS##x
#define HAS__builtin_popcount 1
#define HAS__builtin_popcountll 1
#else
#define __has_builtin(x) 0
#endif
#endif
119*3f1979aaSAndroid Build Coastguard Worker 
/**
 * MACRO for shuffle parameter for _mm_shuffle_ps().
 * Each argument is a digit [0123] selecting a source element.
 * Argument fp3 selects the fp from argument "b" of _mm_shuffle_ps that will
 * be placed in fp3 of the result; fp2 does the same for fp2 of the result.
 * Argument fp1 selects the fp from argument "a" of _mm_shuffle_ps that will
 * be placed in fp1 of the result; fp0 does the same for fp0 of the result.
 *
 * The four selectors are packed two bits each into one value:
 * bits 7:6 = fp3, bits 5:4 = fp2, bits 3:2 = fp1, bits 1:0 = fp0.
 */
#define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
    (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
130*3f1979aaSAndroid Build Coastguard Worker 
/* Rounding mode macros.
 * The low two bits (0x00-0x03) name an explicit rounding direction;
 * _MM_FROUND_CUR_DIRECTION and _MM_FROUND_NO_EXC are separate flag bits.
 */
#define _MM_FROUND_TO_NEAREST_INT 0x00
#define _MM_FROUND_TO_NEG_INF 0x01
#define _MM_FROUND_TO_POS_INF 0x02
#define _MM_FROUND_TO_ZERO 0x03
#define _MM_FROUND_CUR_DIRECTION 0x04
#define _MM_FROUND_NO_EXC 0x08
138*3f1979aaSAndroid Build Coastguard Worker 
/* Indicate an immediate constant argument in a given range.
 * The bounds (a, b) are documentation only: the macro expands to plain
 * `const` and performs no range check.
 */
#define __constrange(a, b) const
141*3f1979aaSAndroid Build Coastguard Worker 
142*3f1979aaSAndroid Build Coastguard Worker /* A few intrinsics accept traditional data types like ints or floats, but
143*3f1979aaSAndroid Build Coastguard Worker  * most operate on data types that are specific to SSE.
144*3f1979aaSAndroid Build Coastguard Worker  * If a vector type ends in d, it contains doubles, and if it does not have
145*3f1979aaSAndroid Build Coastguard Worker  * a suffix, it contains floats. An integer vector type can contain any type
146*3f1979aaSAndroid Build Coastguard Worker  * of integer, from chars to shorts to unsigned long longs.
147*3f1979aaSAndroid Build Coastguard Worker  */
148*3f1979aaSAndroid Build Coastguard Worker typedef int64x1_t __m64;
149*3f1979aaSAndroid Build Coastguard Worker typedef float32x4_t __m128; /* 128-bit vector containing 4 floats */
150*3f1979aaSAndroid Build Coastguard Worker // On ARM 32-bit architecture, the float64x2_t is not supported.
151*3f1979aaSAndroid Build Coastguard Worker // The data type __m128d should be represented in a different way for related
152*3f1979aaSAndroid Build Coastguard Worker // intrinsic conversion.
153*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
154*3f1979aaSAndroid Build Coastguard Worker typedef float64x2_t __m128d; /* 128-bit vector containing 2 doubles */
155*3f1979aaSAndroid Build Coastguard Worker #else
156*3f1979aaSAndroid Build Coastguard Worker typedef float32x4_t __m128d;
157*3f1979aaSAndroid Build Coastguard Worker #endif
158*3f1979aaSAndroid Build Coastguard Worker typedef int64x2_t __m128i; /* 128-bit vector containing integers */
159*3f1979aaSAndroid Build Coastguard Worker 
/* type-safe casting between types */

/* Casts to and from __m128, which is stored as float32x4_t.
 * vreinterpretq_m128_<t>(x) converts a NEON vector of element type <t> to
 * __m128; vreinterpretq_<t>_m128(x) converts back. The f32 variants are the
 * identity because __m128 already is float32x4_t.
 */

#define vreinterpretq_m128_f16(x) vreinterpretq_f32_f16(x)
#define vreinterpretq_m128_f32(x) (x)
#define vreinterpretq_m128_f64(x) vreinterpretq_f32_f64(x)

#define vreinterpretq_m128_u8(x) vreinterpretq_f32_u8(x)
#define vreinterpretq_m128_u16(x) vreinterpretq_f32_u16(x)
#define vreinterpretq_m128_u32(x) vreinterpretq_f32_u32(x)
#define vreinterpretq_m128_u64(x) vreinterpretq_f32_u64(x)

#define vreinterpretq_m128_s8(x) vreinterpretq_f32_s8(x)
#define vreinterpretq_m128_s16(x) vreinterpretq_f32_s16(x)
#define vreinterpretq_m128_s32(x) vreinterpretq_f32_s32(x)
#define vreinterpretq_m128_s64(x) vreinterpretq_f32_s64(x)

#define vreinterpretq_f16_m128(x) vreinterpretq_f16_f32(x)
#define vreinterpretq_f32_m128(x) (x)
#define vreinterpretq_f64_m128(x) vreinterpretq_f64_f32(x)

#define vreinterpretq_u8_m128(x) vreinterpretq_u8_f32(x)
#define vreinterpretq_u16_m128(x) vreinterpretq_u16_f32(x)
#define vreinterpretq_u32_m128(x) vreinterpretq_u32_f32(x)
#define vreinterpretq_u64_m128(x) vreinterpretq_u64_f32(x)

#define vreinterpretq_s8_m128(x) vreinterpretq_s8_f32(x)
#define vreinterpretq_s16_m128(x) vreinterpretq_s16_f32(x)
#define vreinterpretq_s32_m128(x) vreinterpretq_s32_f32(x)
#define vreinterpretq_s64_m128(x) vreinterpretq_s64_f32(x)
189*3f1979aaSAndroid Build Coastguard Worker 
/* Casts to and from __m128i, which is stored as int64x2_t.
 * The s64 variants are the identity because __m128i already is int64x2_t.
 */
#define vreinterpretq_m128i_s8(x) vreinterpretq_s64_s8(x)
#define vreinterpretq_m128i_s16(x) vreinterpretq_s64_s16(x)
#define vreinterpretq_m128i_s32(x) vreinterpretq_s64_s32(x)
#define vreinterpretq_m128i_s64(x) (x)

#define vreinterpretq_m128i_u8(x) vreinterpretq_s64_u8(x)
#define vreinterpretq_m128i_u16(x) vreinterpretq_s64_u16(x)
#define vreinterpretq_m128i_u32(x) vreinterpretq_s64_u32(x)
#define vreinterpretq_m128i_u64(x) vreinterpretq_s64_u64(x)

#define vreinterpretq_s8_m128i(x) vreinterpretq_s8_s64(x)
#define vreinterpretq_s16_m128i(x) vreinterpretq_s16_s64(x)
#define vreinterpretq_s32_m128i(x) vreinterpretq_s32_s64(x)
#define vreinterpretq_s64_m128i(x) (x)

#define vreinterpretq_u8_m128i(x) vreinterpretq_u8_s64(x)
#define vreinterpretq_u16_m128i(x) vreinterpretq_u16_s64(x)
#define vreinterpretq_u32_m128i(x) vreinterpretq_u32_s64(x)
#define vreinterpretq_u64_m128i(x) vreinterpretq_u64_s64(x)
209*3f1979aaSAndroid Build Coastguard Worker 
/* Casts to and from __m64, which is stored as int64x1_t.
 * The s64 variants are the identity because __m64 already is int64x1_t.
 */
#define vreinterpret_m64_s8(x) vreinterpret_s64_s8(x)
#define vreinterpret_m64_s16(x) vreinterpret_s64_s16(x)
#define vreinterpret_m64_s32(x) vreinterpret_s64_s32(x)
#define vreinterpret_m64_s64(x) (x)

#define vreinterpret_m64_u8(x) vreinterpret_s64_u8(x)
#define vreinterpret_m64_u16(x) vreinterpret_s64_u16(x)
#define vreinterpret_m64_u32(x) vreinterpret_s64_u32(x)
#define vreinterpret_m64_u64(x) vreinterpret_s64_u64(x)

#define vreinterpret_m64_f16(x) vreinterpret_s64_f16(x)
#define vreinterpret_m64_f32(x) vreinterpret_s64_f32(x)
#define vreinterpret_m64_f64(x) vreinterpret_s64_f64(x)

#define vreinterpret_u8_m64(x) vreinterpret_u8_s64(x)
#define vreinterpret_u16_m64(x) vreinterpret_u16_s64(x)
#define vreinterpret_u32_m64(x) vreinterpret_u32_s64(x)
#define vreinterpret_u64_m64(x) vreinterpret_u64_s64(x)

#define vreinterpret_s8_m64(x) vreinterpret_s8_s64(x)
#define vreinterpret_s16_m64(x) vreinterpret_s16_s64(x)
#define vreinterpret_s32_m64(x) vreinterpret_s32_s64(x)
#define vreinterpret_s64_m64(x) (x)

#define vreinterpret_f32_m64(x) vreinterpret_f32_s64(x)
235*3f1979aaSAndroid Build Coastguard Worker 
/* Casts to and from __m128d.
 * On AArch64 __m128d is float64x2_t, so the f64 casts are the identity.
 * Elsewhere __m128d is stored as float32x4_t, so the f32 casts are the
 * identity and the integer casts go through the f32 reinterpret intrinsics.
 */
#if defined(__aarch64__)
#define vreinterpretq_m128d_s32(x) vreinterpretq_f64_s32(x)
#define vreinterpretq_m128d_s64(x) vreinterpretq_f64_s64(x)

#define vreinterpretq_m128d_f64(x) (x)

#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f64(x)

#define vreinterpretq_f64_m128d(x) (x)
#else
#define vreinterpretq_m128d_s32(x) vreinterpretq_f32_s32(x)
#define vreinterpretq_m128d_s64(x) vreinterpretq_f32_s64(x)

#define vreinterpretq_m128d_f32(x) (x)

#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f32(x)

#define vreinterpretq_f32_m128d(x) (x)
#endif
255*3f1979aaSAndroid Build Coastguard Worker 
// A union called 'SIMDVec' is defined in this header file. It can be used by
// applications which attempt to access the contents of an __m128 struct
// directly.  It is important to note that accessing the __m128 struct directly
// is considered bad coding practice by Microsoft: @see:
// https://msdn.microsoft.com/en-us/library/ayeb3ayc.aspx
//
// However, some legacy source code may try to access the contents of an __m128
// struct directly so the developer can use the SIMDVec as an alias for it.  Any
// casting must be done manually by the developer, as you cannot cast or
// otherwise alias the base NEON data type for intrinsic operations.
//
// The union is intended to allow direct access to an __m128 variable using the
// names that the MSVC compiler provides.  It should really only be used when
// trying to access the members of the vector as integer values.  GCC/clang
// allow native access to the float members through a simple array access
// operator (in C since 4.6, in C++ since 4.8).
//
// Ideally direct accesses to SIMD vectors should not be used since it can cause
// a performance hit.  If it really is needed however, the original __m128
// variable can be aliased with a pointer to this union and used to access
// individual components.  The use of this union should be hidden behind a macro
// that is used throughout the codebase to access the members instead of always
// declaring this type of variable.
//
// Every member views the same 16 bytes; the union is 16-byte aligned.
typedef union ALIGN_STRUCT(16) SIMDVec {
    float m128_f32[4];     // as floats - DON'T USE. Added for convenience.
    int8_t m128_i8[16];    // as signed 8-bit integers.
    int16_t m128_i16[8];   // as signed 16-bit integers.
    int32_t m128_i32[4];   // as signed 32-bit integers.
    int64_t m128_i64[2];   // as signed 64-bit integers.
    uint8_t m128_u8[16];   // as unsigned 8-bit integers.
    uint16_t m128_u16[8];  // as unsigned 16-bit integers.
    uint32_t m128_u32[4];  // as unsigned 32-bit integers.
    uint64_t m128_u64[2];  // as unsigned 64-bit integers.
} SIMDVec;
290*3f1979aaSAndroid Build Coastguard Worker 
// Casting using SIMDVec: access lane n of x by viewing x's storage through a
// SIMDVec pointer. x must be an lvalue (its address is taken); the result is
// itself an lvalue, so the macros can be used for both reads and writes.
// Both arguments are parenthesized so that any argument expression groups as
// intended.
#define vreinterpretq_nth_u64_m128i(x, n) (((SIMDVec *) &(x))->m128_u64[(n)])
#define vreinterpretq_nth_u32_m128i(x, n) (((SIMDVec *) &(x))->m128_u32[(n)])
#define vreinterpretq_nth_u8_m128i(x, n) (((SIMDVec *) &(x))->m128_u8[(n)])
295*3f1979aaSAndroid Build Coastguard Worker 
/* Backwards compatibility for compilers with lack of specific type support */

// Older gcc does not define vld1q_u8_x4, so an equivalent is supplied here
// for GCC (not clang) up to and including major version 9.
#if defined(__GNUC__) && !defined(__clang__)
#if __GNUC__ <= 9
/* Load 64 consecutive bytes starting at p into four 16-byte vectors.
 * ret.val[i] holds bytes [16*i, 16*i + 15]; p must point to at least 64
 * readable bytes.
 */
FORCE_INLINE uint8x16x4_t vld1q_u8_x4(const uint8_t *p)
{
    uint8x16x4_t ret;
    ret.val[0] = vld1q_u8(p + 0);
    ret.val[1] = vld1q_u8(p + 16);
    ret.val[2] = vld1q_u8(p + 32);
    ret.val[3] = vld1q_u8(p + 48);
    return ret;
}
#endif
#endif
312*3f1979aaSAndroid Build Coastguard Worker 
313*3f1979aaSAndroid Build Coastguard Worker /* Function Naming Conventions
314*3f1979aaSAndroid Build Coastguard Worker  * The naming convention of SSE intrinsics is straightforward. A generic SSE
315*3f1979aaSAndroid Build Coastguard Worker  * intrinsic function is given as follows:
316*3f1979aaSAndroid Build Coastguard Worker  *   _mm_<name>_<data_type>
317*3f1979aaSAndroid Build Coastguard Worker  *
318*3f1979aaSAndroid Build Coastguard Worker  * The parts of this format are given as follows:
319*3f1979aaSAndroid Build Coastguard Worker  * 1. <name> describes the operation performed by the intrinsic
320*3f1979aaSAndroid Build Coastguard Worker  * 2. <data_type> identifies the data type of the function's primary arguments
321*3f1979aaSAndroid Build Coastguard Worker  *
322*3f1979aaSAndroid Build Coastguard Worker  * This last part, <data_type>, is a little complicated. It identifies the
323*3f1979aaSAndroid Build Coastguard Worker  * content of the input values, and can be set to any of the following values:
324*3f1979aaSAndroid Build Coastguard Worker  * + ps - vectors contain floats (ps stands for packed single-precision)
 * + pd - vectors contain doubles (pd stands for packed double-precision)
326*3f1979aaSAndroid Build Coastguard Worker  * + epi8/epi16/epi32/epi64 - vectors contain 8-bit/16-bit/32-bit/64-bit
327*3f1979aaSAndroid Build Coastguard Worker  *                            signed integers
328*3f1979aaSAndroid Build Coastguard Worker  * + epu8/epu16/epu32/epu64 - vectors contain 8-bit/16-bit/32-bit/64-bit
329*3f1979aaSAndroid Build Coastguard Worker  *                            unsigned integers
330*3f1979aaSAndroid Build Coastguard Worker  * + si128 - unspecified 128-bit vector or 256-bit vector
331*3f1979aaSAndroid Build Coastguard Worker  * + m128/m128i/m128d - identifies input vector types when they are different
332*3f1979aaSAndroid Build Coastguard Worker  *                      than the type of the returned vector
333*3f1979aaSAndroid Build Coastguard Worker  *
334*3f1979aaSAndroid Build Coastguard Worker  * For example, _mm_setzero_ps. The _mm implies that the function returns
335*3f1979aaSAndroid Build Coastguard Worker  * a 128-bit vector. The _ps at the end implies that the argument vectors
336*3f1979aaSAndroid Build Coastguard Worker  * contain floats.
337*3f1979aaSAndroid Build Coastguard Worker  *
338*3f1979aaSAndroid Build Coastguard Worker  * A complete example: Byte Shuffle - pshufb (_mm_shuffle_epi8)
339*3f1979aaSAndroid Build Coastguard Worker  *   // Set packed 16-bit integers. 128 bits, 8 short, per 16 bits
340*3f1979aaSAndroid Build Coastguard Worker  *   __m128i v_in = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
341*3f1979aaSAndroid Build Coastguard Worker  *   // Set packed 8-bit integers
342*3f1979aaSAndroid Build Coastguard Worker  *   // 128 bits, 16 chars, per 8 bits
343*3f1979aaSAndroid Build Coastguard Worker  *   __m128i v_perm = _mm_setr_epi8(1, 0,  2,  3, 8, 9, 10, 11,
344*3f1979aaSAndroid Build Coastguard Worker  *                                  4, 5, 12, 13, 6, 7, 14, 15);
345*3f1979aaSAndroid Build Coastguard Worker  *   // Shuffle packed 8-bit integers
346*3f1979aaSAndroid Build Coastguard Worker  *   __m128i v_out = _mm_shuffle_epi8(v_in, v_perm); // pshufb
347*3f1979aaSAndroid Build Coastguard Worker  *
348*3f1979aaSAndroid Build Coastguard Worker  * Data (Number, Binary, Byte Index):
349*3f1979aaSAndroid Build Coastguard Worker     +------+------+-------------+------+------+-------------+
350*3f1979aaSAndroid Build Coastguard Worker     |      1      |      2      |      3      |      4      | Number
351*3f1979aaSAndroid Build Coastguard Worker     +------+------+------+------+------+------+------+------+
352*3f1979aaSAndroid Build Coastguard Worker     | 0000 | 0001 | 0000 | 0010 | 0000 | 0011 | 0000 | 0100 | Binary
353*3f1979aaSAndroid Build Coastguard Worker     +------+------+------+------+------+------+------+------+
354*3f1979aaSAndroid Build Coastguard Worker     |    0 |    1 |    2 |    3 |    4 |    5 |    6 |    7 | Index
355*3f1979aaSAndroid Build Coastguard Worker     +------+------+------+------+------+------+------+------+
356*3f1979aaSAndroid Build Coastguard Worker 
357*3f1979aaSAndroid Build Coastguard Worker     +------+------+------+------+------+------+------+------+
358*3f1979aaSAndroid Build Coastguard Worker     |      5      |      6      |      7      |      8      | Number
359*3f1979aaSAndroid Build Coastguard Worker     +------+------+------+------+------+------+------+------+
360*3f1979aaSAndroid Build Coastguard Worker     | 0000 | 0101 | 0000 | 0110 | 0000 | 0111 | 0000 | 1000 | Binary
361*3f1979aaSAndroid Build Coastguard Worker     +------+------+------+------+------+------+------+------+
362*3f1979aaSAndroid Build Coastguard Worker     |    8 |    9 |   10 |   11 |   12 |   13 |   14 |   15 | Index
363*3f1979aaSAndroid Build Coastguard Worker     +------+------+------+------+------+------+------+------+
364*3f1979aaSAndroid Build Coastguard Worker  * Index (Byte Index):
365*3f1979aaSAndroid Build Coastguard Worker     +------+------+------+------+------+------+------+------+
366*3f1979aaSAndroid Build Coastguard Worker     |    1 |    0 |    2 |    3 |    8 |    9 |   10 |   11 |
367*3f1979aaSAndroid Build Coastguard Worker     +------+------+------+------+------+------+------+------+
368*3f1979aaSAndroid Build Coastguard Worker 
369*3f1979aaSAndroid Build Coastguard Worker     +------+------+------+------+------+------+------+------+
370*3f1979aaSAndroid Build Coastguard Worker     |    4 |    5 |   12 |   13 |    6 |    7 |   14 |   15 |
371*3f1979aaSAndroid Build Coastguard Worker     +------+------+------+------+------+------+------+------+
372*3f1979aaSAndroid Build Coastguard Worker  * Result:
373*3f1979aaSAndroid Build Coastguard Worker     +------+------+------+------+------+------+------+------+
374*3f1979aaSAndroid Build Coastguard Worker     |    1 |    0 |    2 |    3 |    8 |    9 |   10 |   11 | Index
375*3f1979aaSAndroid Build Coastguard Worker     +------+------+------+------+------+------+------+------+
376*3f1979aaSAndroid Build Coastguard Worker     | 0001 | 0000 | 0000 | 0010 | 0000 | 0101 | 0000 | 0110 | Binary
377*3f1979aaSAndroid Build Coastguard Worker     +------+------+------+------+------+------+------+------+
378*3f1979aaSAndroid Build Coastguard Worker     |     256     |      2      |      5      |      6      | Number
379*3f1979aaSAndroid Build Coastguard Worker     +------+------+------+------+------+------+------+------+
380*3f1979aaSAndroid Build Coastguard Worker 
381*3f1979aaSAndroid Build Coastguard Worker     +------+------+------+------+------+------+------+------+
382*3f1979aaSAndroid Build Coastguard Worker     |    4 |    5 |   12 |   13 |    6 |    7 |   14 |   15 | Index
383*3f1979aaSAndroid Build Coastguard Worker     +------+------+------+------+------+------+------+------+
384*3f1979aaSAndroid Build Coastguard Worker     | 0000 | 0011 | 0000 | 0111 | 0000 | 0100 | 0000 | 1000 | Binary
385*3f1979aaSAndroid Build Coastguard Worker     +------+------+------+------+------+------+------+------+
386*3f1979aaSAndroid Build Coastguard Worker     |      3      |      7      |      4      |      8      | Number
387*3f1979aaSAndroid Build Coastguard Worker     +------+------+------+------+------+------+------+------+
388*3f1979aaSAndroid Build Coastguard Worker  */
389*3f1979aaSAndroid Build Coastguard Worker 
390*3f1979aaSAndroid Build Coastguard Worker /* Set/get methods */
391*3f1979aaSAndroid Build Coastguard Worker 
/* Locality hints that callers may pass as the second argument of
 * _mm_prefetch. */
enum _mm_hint {
    /* load data to L1 and L2 cache, mark it as NTA */
    _MM_HINT_NTA = 0,
    /* load data to L1 and L2 cache */
    _MM_HINT_T0 = 1,
    /* load data to L2 cache only */
    _MM_HINT_T1 = 2,
    /* load data to L2 cache only, mark it as NTA */
    _MM_HINT_T2 = 3,
    /* exclusive version of _MM_HINT_NTA */
    _MM_HINT_ENTA = 4,
    /* exclusive version of _MM_HINT_T0 */
    _MM_HINT_ET0 = 5,
    /* exclusive version of _MM_HINT_T1 */
    _MM_HINT_ET1 = 6,
    /* exclusive version of _MM_HINT_T2 */
    _MM_HINT_ET2 = 7,
};
403*3f1979aaSAndroid Build Coastguard Worker 
404*3f1979aaSAndroid Build Coastguard Worker // Loads one cache line of data from address p to a location closer to the
405*3f1979aaSAndroid Build Coastguard Worker // processor. https://msdn.microsoft.com/en-us/library/84szxsww(v=vs.100).aspx
_mm_prefetch(const void * p,int i)406*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE void _mm_prefetch(const void *p, int i)
407*3f1979aaSAndroid Build Coastguard Worker {
408*3f1979aaSAndroid Build Coastguard Worker     (void) i;
409*3f1979aaSAndroid Build Coastguard Worker     __builtin_prefetch(p);
410*3f1979aaSAndroid Build Coastguard Worker }
411*3f1979aaSAndroid Build Coastguard Worker 
412*3f1979aaSAndroid Build Coastguard Worker // Copy the lower single-precision (32-bit) floating-point element of a to dst.
413*3f1979aaSAndroid Build Coastguard Worker //
414*3f1979aaSAndroid Build Coastguard Worker //   dst[31:0] := a[31:0]
415*3f1979aaSAndroid Build Coastguard Worker //
416*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_f32
_mm_cvtss_f32(__m128 a)417*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE float _mm_cvtss_f32(__m128 a)
418*3f1979aaSAndroid Build Coastguard Worker {
419*3f1979aaSAndroid Build Coastguard Worker     return vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
420*3f1979aaSAndroid Build Coastguard Worker }
421*3f1979aaSAndroid Build Coastguard Worker 
422*3f1979aaSAndroid Build Coastguard Worker // Sets the 128-bit value to zero
423*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/vstudio/ys7dw0kh(v=vs.100).aspx
_mm_setzero_si128(void)424*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_setzero_si128(void)
425*3f1979aaSAndroid Build Coastguard Worker {
426*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_s32(vdupq_n_s32(0));
427*3f1979aaSAndroid Build Coastguard Worker }
428*3f1979aaSAndroid Build Coastguard Worker 
429*3f1979aaSAndroid Build Coastguard Worker // Clears the four single-precision, floating-point values.
430*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/vstudio/tk1t2tbz(v=vs.100).aspx
_mm_setzero_ps(void)431*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_setzero_ps(void)
432*3f1979aaSAndroid Build Coastguard Worker {
433*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128_f32(vdupq_n_f32(0));
434*3f1979aaSAndroid Build Coastguard Worker }
435*3f1979aaSAndroid Build Coastguard Worker 
436*3f1979aaSAndroid Build Coastguard Worker // Sets the four single-precision, floating-point values to w.
437*3f1979aaSAndroid Build Coastguard Worker //
438*3f1979aaSAndroid Build Coastguard Worker //   r0 := r1 := r2 := r3 := w
439*3f1979aaSAndroid Build Coastguard Worker //
440*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx
_mm_set1_ps(float _w)441*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_set1_ps(float _w)
442*3f1979aaSAndroid Build Coastguard Worker {
443*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128_f32(vdupq_n_f32(_w));
444*3f1979aaSAndroid Build Coastguard Worker }
445*3f1979aaSAndroid Build Coastguard Worker 
446*3f1979aaSAndroid Build Coastguard Worker // Sets the four single-precision, floating-point values to w.
447*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx
_mm_set_ps1(float _w)448*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_set_ps1(float _w)
449*3f1979aaSAndroid Build Coastguard Worker {
450*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128_f32(vdupq_n_f32(_w));
451*3f1979aaSAndroid Build Coastguard Worker }
452*3f1979aaSAndroid Build Coastguard Worker 
453*3f1979aaSAndroid Build Coastguard Worker // Sets the four single-precision, floating-point values to the four inputs.
454*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/vstudio/afh0zf75(v=vs.100).aspx
_mm_set_ps(float w,float z,float y,float x)455*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x)
456*3f1979aaSAndroid Build Coastguard Worker {
457*3f1979aaSAndroid Build Coastguard Worker     float ALIGN_STRUCT(16) data[4] = {x, y, z, w};
458*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128_f32(vld1q_f32(data));
459*3f1979aaSAndroid Build Coastguard Worker }
460*3f1979aaSAndroid Build Coastguard Worker 
461*3f1979aaSAndroid Build Coastguard Worker // Copy single-precision (32-bit) floating-point element a to the lower element
462*3f1979aaSAndroid Build Coastguard Worker // of dst, and zero the upper 3 elements.
463*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_ss
_mm_set_ss(float a)464*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_set_ss(float a)
465*3f1979aaSAndroid Build Coastguard Worker {
466*3f1979aaSAndroid Build Coastguard Worker     float ALIGN_STRUCT(16) data[4] = {a, 0, 0, 0};
467*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128_f32(vld1q_f32(data));
468*3f1979aaSAndroid Build Coastguard Worker }
469*3f1979aaSAndroid Build Coastguard Worker 
470*3f1979aaSAndroid Build Coastguard Worker // Sets the four single-precision, floating-point values to the four inputs in
471*3f1979aaSAndroid Build Coastguard Worker // reverse order.
472*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/vstudio/d2172ct3(v=vs.100).aspx
_mm_setr_ps(float w,float z,float y,float x)473*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_setr_ps(float w, float z, float y, float x)
474*3f1979aaSAndroid Build Coastguard Worker {
475*3f1979aaSAndroid Build Coastguard Worker     float ALIGN_STRUCT(16) data[4] = {w, z, y, x};
476*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128_f32(vld1q_f32(data));
477*3f1979aaSAndroid Build Coastguard Worker }
478*3f1979aaSAndroid Build Coastguard Worker 
479*3f1979aaSAndroid Build Coastguard Worker // Sets the 8 signed 16-bit integer values in reverse order.
480*3f1979aaSAndroid Build Coastguard Worker //
481*3f1979aaSAndroid Build Coastguard Worker // Return Value
482*3f1979aaSAndroid Build Coastguard Worker //   r0 := w0
483*3f1979aaSAndroid Build Coastguard Worker //   r1 := w1
484*3f1979aaSAndroid Build Coastguard Worker //   ...
485*3f1979aaSAndroid Build Coastguard Worker //   r7 := w7
_mm_setr_epi16(short w0,short w1,short w2,short w3,short w4,short w5,short w6,short w7)486*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_setr_epi16(short w0,
487*3f1979aaSAndroid Build Coastguard Worker                                     short w1,
488*3f1979aaSAndroid Build Coastguard Worker                                     short w2,
489*3f1979aaSAndroid Build Coastguard Worker                                     short w3,
490*3f1979aaSAndroid Build Coastguard Worker                                     short w4,
491*3f1979aaSAndroid Build Coastguard Worker                                     short w5,
492*3f1979aaSAndroid Build Coastguard Worker                                     short w6,
493*3f1979aaSAndroid Build Coastguard Worker                                     short w7)
494*3f1979aaSAndroid Build Coastguard Worker {
495*3f1979aaSAndroid Build Coastguard Worker     int16_t ALIGN_STRUCT(16) data[8] = {w0, w1, w2, w3, w4, w5, w6, w7};
496*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_s16(vld1q_s16((int16_t *) data));
497*3f1979aaSAndroid Build Coastguard Worker }
498*3f1979aaSAndroid Build Coastguard Worker 
499*3f1979aaSAndroid Build Coastguard Worker // Sets the 4 signed 32-bit integer values in reverse order
500*3f1979aaSAndroid Build Coastguard Worker // https://technet.microsoft.com/en-us/library/security/27yb3ee5(v=vs.90).aspx
_mm_setr_epi32(int i3,int i2,int i1,int i0)501*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_setr_epi32(int i3, int i2, int i1, int i0)
502*3f1979aaSAndroid Build Coastguard Worker {
503*3f1979aaSAndroid Build Coastguard Worker     int32_t ALIGN_STRUCT(16) data[4] = {i3, i2, i1, i0};
504*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_s32(vld1q_s32(data));
505*3f1979aaSAndroid Build Coastguard Worker }
506*3f1979aaSAndroid Build Coastguard Worker 
507*3f1979aaSAndroid Build Coastguard Worker // Set packed 64-bit integers in dst with the supplied values in reverse order.
508*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_epi64
_mm_setr_epi64(__m64 e1,__m64 e0)509*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_setr_epi64(__m64 e1, __m64 e0)
510*3f1979aaSAndroid Build Coastguard Worker {
511*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_s64(vcombine_s64(e1, e0));
512*3f1979aaSAndroid Build Coastguard Worker }
513*3f1979aaSAndroid Build Coastguard Worker 
514*3f1979aaSAndroid Build Coastguard Worker // Sets the 16 signed 8-bit integer values to b.
515*3f1979aaSAndroid Build Coastguard Worker //
516*3f1979aaSAndroid Build Coastguard Worker //   r0 := b
517*3f1979aaSAndroid Build Coastguard Worker //   r1 := b
518*3f1979aaSAndroid Build Coastguard Worker //   ...
519*3f1979aaSAndroid Build Coastguard Worker //   r15 := b
520*3f1979aaSAndroid Build Coastguard Worker //
521*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/6e14xhyf(v=vs.100).aspx
_mm_set1_epi8(signed char w)522*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_set1_epi8(signed char w)
523*3f1979aaSAndroid Build Coastguard Worker {
524*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_s8(vdupq_n_s8(w));
525*3f1979aaSAndroid Build Coastguard Worker }
526*3f1979aaSAndroid Build Coastguard Worker 
527*3f1979aaSAndroid Build Coastguard Worker // Sets the 8 signed 16-bit integer values to w.
528*3f1979aaSAndroid Build Coastguard Worker //
529*3f1979aaSAndroid Build Coastguard Worker //   r0 := w
530*3f1979aaSAndroid Build Coastguard Worker //   r1 := w
531*3f1979aaSAndroid Build Coastguard Worker //   ...
532*3f1979aaSAndroid Build Coastguard Worker //   r7 := w
533*3f1979aaSAndroid Build Coastguard Worker //
534*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/k0ya3x0e(v=vs.90).aspx
_mm_set1_epi16(short w)535*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_set1_epi16(short w)
536*3f1979aaSAndroid Build Coastguard Worker {
537*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_s16(vdupq_n_s16(w));
538*3f1979aaSAndroid Build Coastguard Worker }
539*3f1979aaSAndroid Build Coastguard Worker 
540*3f1979aaSAndroid Build Coastguard Worker // Sets the 16 signed 8-bit integer values.
541*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/x0cx8zd3(v=vs.90).aspx
_mm_set_epi8(signed char b15,signed char b14,signed char b13,signed char b12,signed char b11,signed char b10,signed char b9,signed char b8,signed char b7,signed char b6,signed char b5,signed char b4,signed char b3,signed char b2,signed char b1,signed char b0)542*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_set_epi8(signed char b15,
543*3f1979aaSAndroid Build Coastguard Worker                                   signed char b14,
544*3f1979aaSAndroid Build Coastguard Worker                                   signed char b13,
545*3f1979aaSAndroid Build Coastguard Worker                                   signed char b12,
546*3f1979aaSAndroid Build Coastguard Worker                                   signed char b11,
547*3f1979aaSAndroid Build Coastguard Worker                                   signed char b10,
548*3f1979aaSAndroid Build Coastguard Worker                                   signed char b9,
549*3f1979aaSAndroid Build Coastguard Worker                                   signed char b8,
550*3f1979aaSAndroid Build Coastguard Worker                                   signed char b7,
551*3f1979aaSAndroid Build Coastguard Worker                                   signed char b6,
552*3f1979aaSAndroid Build Coastguard Worker                                   signed char b5,
553*3f1979aaSAndroid Build Coastguard Worker                                   signed char b4,
554*3f1979aaSAndroid Build Coastguard Worker                                   signed char b3,
555*3f1979aaSAndroid Build Coastguard Worker                                   signed char b2,
556*3f1979aaSAndroid Build Coastguard Worker                                   signed char b1,
557*3f1979aaSAndroid Build Coastguard Worker                                   signed char b0)
558*3f1979aaSAndroid Build Coastguard Worker {
559*3f1979aaSAndroid Build Coastguard Worker     int8_t ALIGN_STRUCT(16)
560*3f1979aaSAndroid Build Coastguard Worker         data[16] = {(int8_t) b0,  (int8_t) b1,  (int8_t) b2,  (int8_t) b3,
561*3f1979aaSAndroid Build Coastguard Worker                     (int8_t) b4,  (int8_t) b5,  (int8_t) b6,  (int8_t) b7,
562*3f1979aaSAndroid Build Coastguard Worker                     (int8_t) b8,  (int8_t) b9,  (int8_t) b10, (int8_t) b11,
563*3f1979aaSAndroid Build Coastguard Worker                     (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
564*3f1979aaSAndroid Build Coastguard Worker     return (__m128i) vld1q_s8(data);
565*3f1979aaSAndroid Build Coastguard Worker }
566*3f1979aaSAndroid Build Coastguard Worker 
567*3f1979aaSAndroid Build Coastguard Worker // Sets the 8 signed 16-bit integer values.
568*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-au/library/3e0fek84(v=vs.90).aspx
_mm_set_epi16(short i7,short i6,short i5,short i4,short i3,short i2,short i1,short i0)569*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_set_epi16(short i7,
570*3f1979aaSAndroid Build Coastguard Worker                                    short i6,
571*3f1979aaSAndroid Build Coastguard Worker                                    short i5,
572*3f1979aaSAndroid Build Coastguard Worker                                    short i4,
573*3f1979aaSAndroid Build Coastguard Worker                                    short i3,
574*3f1979aaSAndroid Build Coastguard Worker                                    short i2,
575*3f1979aaSAndroid Build Coastguard Worker                                    short i1,
576*3f1979aaSAndroid Build Coastguard Worker                                    short i0)
577*3f1979aaSAndroid Build Coastguard Worker {
578*3f1979aaSAndroid Build Coastguard Worker     int16_t ALIGN_STRUCT(16) data[8] = {i0, i1, i2, i3, i4, i5, i6, i7};
579*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_s16(vld1q_s16(data));
580*3f1979aaSAndroid Build Coastguard Worker }
581*3f1979aaSAndroid Build Coastguard Worker 
582*3f1979aaSAndroid Build Coastguard Worker // Sets the 16 signed 8-bit integer values in reverse order.
583*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/2khb9c7k(v=vs.90).aspx
_mm_setr_epi8(signed char b0,signed char b1,signed char b2,signed char b3,signed char b4,signed char b5,signed char b6,signed char b7,signed char b8,signed char b9,signed char b10,signed char b11,signed char b12,signed char b13,signed char b14,signed char b15)584*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_setr_epi8(signed char b0,
585*3f1979aaSAndroid Build Coastguard Worker                                    signed char b1,
586*3f1979aaSAndroid Build Coastguard Worker                                    signed char b2,
587*3f1979aaSAndroid Build Coastguard Worker                                    signed char b3,
588*3f1979aaSAndroid Build Coastguard Worker                                    signed char b4,
589*3f1979aaSAndroid Build Coastguard Worker                                    signed char b5,
590*3f1979aaSAndroid Build Coastguard Worker                                    signed char b6,
591*3f1979aaSAndroid Build Coastguard Worker                                    signed char b7,
592*3f1979aaSAndroid Build Coastguard Worker                                    signed char b8,
593*3f1979aaSAndroid Build Coastguard Worker                                    signed char b9,
594*3f1979aaSAndroid Build Coastguard Worker                                    signed char b10,
595*3f1979aaSAndroid Build Coastguard Worker                                    signed char b11,
596*3f1979aaSAndroid Build Coastguard Worker                                    signed char b12,
597*3f1979aaSAndroid Build Coastguard Worker                                    signed char b13,
598*3f1979aaSAndroid Build Coastguard Worker                                    signed char b14,
599*3f1979aaSAndroid Build Coastguard Worker                                    signed char b15)
600*3f1979aaSAndroid Build Coastguard Worker {
601*3f1979aaSAndroid Build Coastguard Worker     int8_t ALIGN_STRUCT(16)
602*3f1979aaSAndroid Build Coastguard Worker         data[16] = {(int8_t) b0,  (int8_t) b1,  (int8_t) b2,  (int8_t) b3,
603*3f1979aaSAndroid Build Coastguard Worker                     (int8_t) b4,  (int8_t) b5,  (int8_t) b6,  (int8_t) b7,
604*3f1979aaSAndroid Build Coastguard Worker                     (int8_t) b8,  (int8_t) b9,  (int8_t) b10, (int8_t) b11,
605*3f1979aaSAndroid Build Coastguard Worker                     (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
606*3f1979aaSAndroid Build Coastguard Worker     return (__m128i) vld1q_s8(data);
607*3f1979aaSAndroid Build Coastguard Worker }
608*3f1979aaSAndroid Build Coastguard Worker 
609*3f1979aaSAndroid Build Coastguard Worker // Sets the 4 signed 32-bit integer values to i.
610*3f1979aaSAndroid Build Coastguard Worker //
611*3f1979aaSAndroid Build Coastguard Worker //   r0 := i
612*3f1979aaSAndroid Build Coastguard Worker //   r1 := i
613*3f1979aaSAndroid Build Coastguard Worker //   r2 := i
614*3f1979aaSAndroid Build Coastguard Worker //   r3 := I
615*3f1979aaSAndroid Build Coastguard Worker //
616*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/vstudio/h4xscxat(v=vs.100).aspx
_mm_set1_epi32(int _i)617*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_set1_epi32(int _i)
618*3f1979aaSAndroid Build Coastguard Worker {
619*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_s32(vdupq_n_s32(_i));
620*3f1979aaSAndroid Build Coastguard Worker }
621*3f1979aaSAndroid Build Coastguard Worker 
622*3f1979aaSAndroid Build Coastguard Worker // Sets the 2 signed 64-bit integer values to i.
623*3f1979aaSAndroid Build Coastguard Worker // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/whtfzhzk(v=vs.100)
_mm_set1_epi64(__m64 _i)624*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_set1_epi64(__m64 _i)
625*3f1979aaSAndroid Build Coastguard Worker {
626*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_s64(vdupq_n_s64((int64_t) _i));
627*3f1979aaSAndroid Build Coastguard Worker }
628*3f1979aaSAndroid Build Coastguard Worker 
629*3f1979aaSAndroid Build Coastguard Worker // Sets the 2 signed 64-bit integer values to i.
630*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_epi64x
_mm_set1_epi64x(int64_t _i)631*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_set1_epi64x(int64_t _i)
632*3f1979aaSAndroid Build Coastguard Worker {
633*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_s64(vdupq_n_s64(_i));
634*3f1979aaSAndroid Build Coastguard Worker }
635*3f1979aaSAndroid Build Coastguard Worker 
636*3f1979aaSAndroid Build Coastguard Worker // Sets the 4 signed 32-bit integer values.
637*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/vstudio/019beekt(v=vs.100).aspx
_mm_set_epi32(int i3,int i2,int i1,int i0)638*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_set_epi32(int i3, int i2, int i1, int i0)
639*3f1979aaSAndroid Build Coastguard Worker {
640*3f1979aaSAndroid Build Coastguard Worker     int32_t ALIGN_STRUCT(16) data[4] = {i0, i1, i2, i3};
641*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_s32(vld1q_s32(data));
642*3f1979aaSAndroid Build Coastguard Worker }
643*3f1979aaSAndroid Build Coastguard Worker 
644*3f1979aaSAndroid Build Coastguard Worker // Returns the __m128i structure with its two 64-bit integer values
645*3f1979aaSAndroid Build Coastguard Worker // initialized to the values of the two 64-bit integers passed in.
646*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/dk2sdw0h(v=vs.120).aspx
_mm_set_epi64x(int64_t i1,int64_t i2)647*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_set_epi64x(int64_t i1, int64_t i2)
648*3f1979aaSAndroid Build Coastguard Worker {
649*3f1979aaSAndroid Build Coastguard Worker     int64_t ALIGN_STRUCT(16) data[2] = {i2, i1};
650*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_s64(vld1q_s64(data));
651*3f1979aaSAndroid Build Coastguard Worker }
652*3f1979aaSAndroid Build Coastguard Worker 
653*3f1979aaSAndroid Build Coastguard Worker // Returns the __m128i structure with its two 64-bit integer values
654*3f1979aaSAndroid Build Coastguard Worker // initialized to the values of the two 64-bit integers passed in.
655*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/dk2sdw0h(v=vs.120).aspx
_mm_set_epi64(__m64 i1,__m64 i2)656*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_set_epi64(__m64 i1, __m64 i2)
657*3f1979aaSAndroid Build Coastguard Worker {
658*3f1979aaSAndroid Build Coastguard Worker     return _mm_set_epi64x((int64_t) i1, (int64_t) i2);
659*3f1979aaSAndroid Build Coastguard Worker }
660*3f1979aaSAndroid Build Coastguard Worker 
661*3f1979aaSAndroid Build Coastguard Worker // Set packed double-precision (64-bit) floating-point elements in dst with the
662*3f1979aaSAndroid Build Coastguard Worker // supplied values.
663*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_pd
_mm_set_pd(double e1,double e0)664*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128d _mm_set_pd(double e1, double e0)
665*3f1979aaSAndroid Build Coastguard Worker {
666*3f1979aaSAndroid Build Coastguard Worker     double ALIGN_STRUCT(16) data[2] = {e0, e1};
667*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
668*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128d_f64(vld1q_f64((float64_t *) data));
669*3f1979aaSAndroid Build Coastguard Worker #else
670*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) data));
671*3f1979aaSAndroid Build Coastguard Worker #endif
672*3f1979aaSAndroid Build Coastguard Worker }
673*3f1979aaSAndroid Build Coastguard Worker 
674*3f1979aaSAndroid Build Coastguard Worker // Stores four single-precision, floating-point values.
675*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/vstudio/s3h4ay6y(v=vs.100).aspx
_mm_store_ps(float * p,__m128 a)676*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
677*3f1979aaSAndroid Build Coastguard Worker {
678*3f1979aaSAndroid Build Coastguard Worker     vst1q_f32(p, vreinterpretq_f32_m128(a));
679*3f1979aaSAndroid Build Coastguard Worker }
680*3f1979aaSAndroid Build Coastguard Worker 
681*3f1979aaSAndroid Build Coastguard Worker // Stores four single-precision, floating-point values.
682*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/44e30x22(v=vs.100).aspx
_mm_storeu_ps(float * p,__m128 a)683*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
684*3f1979aaSAndroid Build Coastguard Worker {
685*3f1979aaSAndroid Build Coastguard Worker     vst1q_f32(p, vreinterpretq_f32_m128(a));
686*3f1979aaSAndroid Build Coastguard Worker }
687*3f1979aaSAndroid Build Coastguard Worker 
688*3f1979aaSAndroid Build Coastguard Worker // Stores four 32-bit integer values as (as a __m128i value) at the address p.
689*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/vstudio/edk11s13(v=vs.100).aspx
_mm_store_si128(__m128i * p,__m128i a)690*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
691*3f1979aaSAndroid Build Coastguard Worker {
692*3f1979aaSAndroid Build Coastguard Worker     vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
693*3f1979aaSAndroid Build Coastguard Worker }
694*3f1979aaSAndroid Build Coastguard Worker 
695*3f1979aaSAndroid Build Coastguard Worker // Stores four 32-bit integer values as (as a __m128i value) at the address p.
696*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/vstudio/edk11s13(v=vs.100).aspx
_mm_storeu_si128(__m128i * p,__m128i a)697*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a)
698*3f1979aaSAndroid Build Coastguard Worker {
699*3f1979aaSAndroid Build Coastguard Worker     vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
700*3f1979aaSAndroid Build Coastguard Worker }
701*3f1979aaSAndroid Build Coastguard Worker 
702*3f1979aaSAndroid Build Coastguard Worker // Stores the lower single - precision, floating - point value.
703*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/tzz10fbx(v=vs.100).aspx
_mm_store_ss(float * p,__m128 a)704*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE void _mm_store_ss(float *p, __m128 a)
705*3f1979aaSAndroid Build Coastguard Worker {
706*3f1979aaSAndroid Build Coastguard Worker     vst1q_lane_f32(p, vreinterpretq_f32_m128(a), 0);
707*3f1979aaSAndroid Build Coastguard Worker }
708*3f1979aaSAndroid Build Coastguard Worker 
709*3f1979aaSAndroid Build Coastguard Worker // Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
710*3f1979aaSAndroid Build Coastguard Worker // elements) from a into memory. mem_addr must be aligned on a 16-byte boundary
711*3f1979aaSAndroid Build Coastguard Worker // or a general-protection exception may be generated.
712*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_pd
_mm_store_pd(double * mem_addr,__m128d a)713*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE void _mm_store_pd(double *mem_addr, __m128d a)
714*3f1979aaSAndroid Build Coastguard Worker {
715*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
716*3f1979aaSAndroid Build Coastguard Worker     vst1q_f64((float64_t *) mem_addr, vreinterpretq_f64_m128d(a));
717*3f1979aaSAndroid Build Coastguard Worker #else
718*3f1979aaSAndroid Build Coastguard Worker     vst1q_f32((float32_t *) mem_addr, vreinterpretq_f32_m128d(a));
719*3f1979aaSAndroid Build Coastguard Worker #endif
720*3f1979aaSAndroid Build Coastguard Worker }
721*3f1979aaSAndroid Build Coastguard Worker 
722*3f1979aaSAndroid Build Coastguard Worker // Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
723*3f1979aaSAndroid Build Coastguard Worker // elements) from a into memory. mem_addr does not need to be aligned on any
724*3f1979aaSAndroid Build Coastguard Worker // particular boundary.
725*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_pd
_mm_storeu_pd(double * mem_addr,__m128d a)726*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE void _mm_storeu_pd(double *mem_addr, __m128d a)
727*3f1979aaSAndroid Build Coastguard Worker {
728*3f1979aaSAndroid Build Coastguard Worker     _mm_store_pd(mem_addr, a);
729*3f1979aaSAndroid Build Coastguard Worker }
730*3f1979aaSAndroid Build Coastguard Worker 
731*3f1979aaSAndroid Build Coastguard Worker // Reads the lower 64 bits of b and stores them into the lower 64 bits of a.
732*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/hhwf428f%28v=vs.90%29.aspx
_mm_storel_epi64(__m128i * a,__m128i b)733*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE void _mm_storel_epi64(__m128i *a, __m128i b)
734*3f1979aaSAndroid Build Coastguard Worker {
735*3f1979aaSAndroid Build Coastguard Worker     uint64x1_t hi = vget_high_u64(vreinterpretq_u64_m128i(*a));
736*3f1979aaSAndroid Build Coastguard Worker     uint64x1_t lo = vget_low_u64(vreinterpretq_u64_m128i(b));
737*3f1979aaSAndroid Build Coastguard Worker     *a = vreinterpretq_m128i_u64(vcombine_u64(lo, hi));
738*3f1979aaSAndroid Build Coastguard Worker }
739*3f1979aaSAndroid Build Coastguard Worker 
740*3f1979aaSAndroid Build Coastguard Worker // Stores the lower two single-precision floating point values of a to the
741*3f1979aaSAndroid Build Coastguard Worker // address p.
742*3f1979aaSAndroid Build Coastguard Worker //
743*3f1979aaSAndroid Build Coastguard Worker //   *p0 := a0
744*3f1979aaSAndroid Build Coastguard Worker //   *p1 := a1
745*3f1979aaSAndroid Build Coastguard Worker //
746*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/h54t98ks(v=vs.90).aspx
_mm_storel_pi(__m64 * p,__m128 a)747*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE void _mm_storel_pi(__m64 *p, __m128 a)
748*3f1979aaSAndroid Build Coastguard Worker {
749*3f1979aaSAndroid Build Coastguard Worker     *p = vreinterpret_m64_f32(vget_low_f32(a));
750*3f1979aaSAndroid Build Coastguard Worker }
751*3f1979aaSAndroid Build Coastguard Worker 
752*3f1979aaSAndroid Build Coastguard Worker // Stores the upper two single-precision, floating-point values of a to the
753*3f1979aaSAndroid Build Coastguard Worker // address p.
754*3f1979aaSAndroid Build Coastguard Worker //
755*3f1979aaSAndroid Build Coastguard Worker //   *p0 := a2
756*3f1979aaSAndroid Build Coastguard Worker //   *p1 := a3
757*3f1979aaSAndroid Build Coastguard Worker //
758*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/a7525fs8(v%3dvs.90).aspx
_mm_storeh_pi(__m64 * p,__m128 a)759*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE void _mm_storeh_pi(__m64 *p, __m128 a)
760*3f1979aaSAndroid Build Coastguard Worker {
761*3f1979aaSAndroid Build Coastguard Worker     *p = vreinterpret_m64_f32(vget_high_f32(a));
762*3f1979aaSAndroid Build Coastguard Worker }
763*3f1979aaSAndroid Build Coastguard Worker 
764*3f1979aaSAndroid Build Coastguard Worker // Loads a single single-precision, floating-point value, copying it into all
765*3f1979aaSAndroid Build Coastguard Worker // four words
766*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/vstudio/5cdkf716(v=vs.100).aspx
_mm_load1_ps(const float * p)767*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_load1_ps(const float *p)
768*3f1979aaSAndroid Build Coastguard Worker {
769*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128_f32(vld1q_dup_f32(p));
770*3f1979aaSAndroid Build Coastguard Worker }
771*3f1979aaSAndroid Build Coastguard Worker 
// Alias of _mm_load1_ps: load one single-precision (32-bit) floating-point
// element from memory and broadcast it to every element of dst.
//
//   dst[31:0] := MEM[mem_addr+31:mem_addr]
//   dst[63:32] := MEM[mem_addr+31:mem_addr]
//   dst[95:64] := MEM[mem_addr+31:mem_addr]
//   dst[127:96] := MEM[mem_addr+31:mem_addr]
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_ps1
#define _mm_load_ps1 _mm_load1_ps
782*3f1979aaSAndroid Build Coastguard Worker 
783*3f1979aaSAndroid Build Coastguard Worker // Sets the lower two single-precision, floating-point values with 64
784*3f1979aaSAndroid Build Coastguard Worker // bits of data loaded from the address p; the upper two values are passed
785*3f1979aaSAndroid Build Coastguard Worker // through from a.
786*3f1979aaSAndroid Build Coastguard Worker //
787*3f1979aaSAndroid Build Coastguard Worker // Return Value
788*3f1979aaSAndroid Build Coastguard Worker //   r0 := *p0
789*3f1979aaSAndroid Build Coastguard Worker //   r1 := *p1
790*3f1979aaSAndroid Build Coastguard Worker //   r2 := a2
791*3f1979aaSAndroid Build Coastguard Worker //   r3 := a3
792*3f1979aaSAndroid Build Coastguard Worker //
793*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/s57cyak2(v=vs.100).aspx
_mm_loadl_pi(__m128 a,__m64 const * p)794*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_loadl_pi(__m128 a, __m64 const *p)
795*3f1979aaSAndroid Build Coastguard Worker {
796*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128_f32(
797*3f1979aaSAndroid Build Coastguard Worker         vcombine_f32(vld1_f32((const float32_t *) p), vget_high_f32(a)));
798*3f1979aaSAndroid Build Coastguard Worker }
799*3f1979aaSAndroid Build Coastguard Worker 
800*3f1979aaSAndroid Build Coastguard Worker // Load 4 single-precision (32-bit) floating-point elements from memory into dst
801*3f1979aaSAndroid Build Coastguard Worker // in reverse order. mem_addr must be aligned on a 16-byte boundary or a
802*3f1979aaSAndroid Build Coastguard Worker // general-protection exception may be generated.
803*3f1979aaSAndroid Build Coastguard Worker //
804*3f1979aaSAndroid Build Coastguard Worker //   dst[31:0] := MEM[mem_addr+127:mem_addr+96]
805*3f1979aaSAndroid Build Coastguard Worker //   dst[63:32] := MEM[mem_addr+95:mem_addr+64]
806*3f1979aaSAndroid Build Coastguard Worker //   dst[95:64] := MEM[mem_addr+63:mem_addr+32]
807*3f1979aaSAndroid Build Coastguard Worker //   dst[127:96] := MEM[mem_addr+31:mem_addr]
808*3f1979aaSAndroid Build Coastguard Worker //
809*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_ps
_mm_loadr_ps(const float * p)810*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_loadr_ps(const float *p)
811*3f1979aaSAndroid Build Coastguard Worker {
812*3f1979aaSAndroid Build Coastguard Worker     float32x4_t v = vrev64q_f32(vld1q_f32(p));
813*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128_f32(vextq_f32(v, v, 2));
814*3f1979aaSAndroid Build Coastguard Worker }
815*3f1979aaSAndroid Build Coastguard Worker 
816*3f1979aaSAndroid Build Coastguard Worker // Sets the upper two single-precision, floating-point values with 64
817*3f1979aaSAndroid Build Coastguard Worker // bits of data loaded from the address p; the lower two values are passed
818*3f1979aaSAndroid Build Coastguard Worker // through from a.
819*3f1979aaSAndroid Build Coastguard Worker //
820*3f1979aaSAndroid Build Coastguard Worker //   r0 := a0
821*3f1979aaSAndroid Build Coastguard Worker //   r1 := a1
822*3f1979aaSAndroid Build Coastguard Worker //   r2 := *p0
823*3f1979aaSAndroid Build Coastguard Worker //   r3 := *p1
824*3f1979aaSAndroid Build Coastguard Worker //
825*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/w92wta0x(v%3dvs.100).aspx
_mm_loadh_pi(__m128 a,__m64 const * p)826*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_loadh_pi(__m128 a, __m64 const *p)
827*3f1979aaSAndroid Build Coastguard Worker {
828*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128_f32(
829*3f1979aaSAndroid Build Coastguard Worker         vcombine_f32(vget_low_f32(a), vld1_f32((const float32_t *) p)));
830*3f1979aaSAndroid Build Coastguard Worker }
831*3f1979aaSAndroid Build Coastguard Worker 
832*3f1979aaSAndroid Build Coastguard Worker // Loads four single-precision, floating-point values.
833*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/vstudio/zzd50xxt(v=vs.100).aspx
_mm_load_ps(const float * p)834*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_load_ps(const float *p)
835*3f1979aaSAndroid Build Coastguard Worker {
836*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128_f32(vld1q_f32(p));
837*3f1979aaSAndroid Build Coastguard Worker }
838*3f1979aaSAndroid Build Coastguard Worker 
839*3f1979aaSAndroid Build Coastguard Worker // Loads four single-precision, floating-point values.
840*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/x1b16s7z%28v=vs.90%29.aspx
_mm_loadu_ps(const float * p)841*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
842*3f1979aaSAndroid Build Coastguard Worker {
843*3f1979aaSAndroid Build Coastguard Worker     // for neon, alignment doesn't matter, so _mm_load_ps and _mm_loadu_ps are
844*3f1979aaSAndroid Build Coastguard Worker     // equivalent for neon
845*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128_f32(vld1q_f32(p));
846*3f1979aaSAndroid Build Coastguard Worker }
847*3f1979aaSAndroid Build Coastguard Worker 
848*3f1979aaSAndroid Build Coastguard Worker // Load unaligned 16-bit integer from memory into the first element of dst.
849*3f1979aaSAndroid Build Coastguard Worker //
850*3f1979aaSAndroid Build Coastguard Worker //   dst[15:0] := MEM[mem_addr+15:mem_addr]
851*3f1979aaSAndroid Build Coastguard Worker //   dst[MAX:16] := 0
852*3f1979aaSAndroid Build Coastguard Worker //
853*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si16
_mm_loadu_si16(const void * p)854*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_loadu_si16(const void *p)
855*3f1979aaSAndroid Build Coastguard Worker {
856*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_s16(
857*3f1979aaSAndroid Build Coastguard Worker         vsetq_lane_s16(*(const int16_t *) p, vdupq_n_s16(0), 0));
858*3f1979aaSAndroid Build Coastguard Worker }
859*3f1979aaSAndroid Build Coastguard Worker 
860*3f1979aaSAndroid Build Coastguard Worker // Load unaligned 64-bit integer from memory into the first element of dst.
861*3f1979aaSAndroid Build Coastguard Worker //
862*3f1979aaSAndroid Build Coastguard Worker //   dst[63:0] := MEM[mem_addr+63:mem_addr]
863*3f1979aaSAndroid Build Coastguard Worker //   dst[MAX:64] := 0
864*3f1979aaSAndroid Build Coastguard Worker //
865*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si64
_mm_loadu_si64(const void * p)866*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_loadu_si64(const void *p)
867*3f1979aaSAndroid Build Coastguard Worker {
868*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_s64(
869*3f1979aaSAndroid Build Coastguard Worker         vcombine_s64(vld1_s64((const int64_t *) p), vdup_n_s64(0)));
870*3f1979aaSAndroid Build Coastguard Worker }
871*3f1979aaSAndroid Build Coastguard Worker 
872*3f1979aaSAndroid Build Coastguard Worker // Load a double-precision (64-bit) floating-point element from memory into the
873*3f1979aaSAndroid Build Coastguard Worker // lower of dst, and zero the upper element. mem_addr does not need to be
874*3f1979aaSAndroid Build Coastguard Worker // aligned on any particular boundary.
875*3f1979aaSAndroid Build Coastguard Worker //
876*3f1979aaSAndroid Build Coastguard Worker //   dst[63:0] := MEM[mem_addr+63:mem_addr]
877*3f1979aaSAndroid Build Coastguard Worker //   dst[127:64] := 0
878*3f1979aaSAndroid Build Coastguard Worker //
879*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_sd
_mm_load_sd(const double * p)880*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128d _mm_load_sd(const double *p)
881*3f1979aaSAndroid Build Coastguard Worker {
882*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
883*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128d_f64(vsetq_lane_f64(*p, vdupq_n_f64(0), 0));
884*3f1979aaSAndroid Build Coastguard Worker #else
885*3f1979aaSAndroid Build Coastguard Worker     const float *fp = (const float *) p;
886*3f1979aaSAndroid Build Coastguard Worker     float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], 0, 0};
887*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128d_f32(vld1q_f32(data));
888*3f1979aaSAndroid Build Coastguard Worker #endif
889*3f1979aaSAndroid Build Coastguard Worker }
890*3f1979aaSAndroid Build Coastguard Worker 
891*3f1979aaSAndroid Build Coastguard Worker // Loads two double-precision from 16-byte aligned memory, floating-point
892*3f1979aaSAndroid Build Coastguard Worker // values.
893*3f1979aaSAndroid Build Coastguard Worker //
894*3f1979aaSAndroid Build Coastguard Worker //   dst[127:0] := MEM[mem_addr+127:mem_addr]
895*3f1979aaSAndroid Build Coastguard Worker //
896*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd
_mm_load_pd(const double * p)897*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128d _mm_load_pd(const double *p)
898*3f1979aaSAndroid Build Coastguard Worker {
899*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
900*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128d_f64(vld1q_f64(p));
901*3f1979aaSAndroid Build Coastguard Worker #else
902*3f1979aaSAndroid Build Coastguard Worker     const float *fp = (const float *) p;
903*3f1979aaSAndroid Build Coastguard Worker     float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], fp[2], fp[3]};
904*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128d_f32(vld1q_f32(data));
905*3f1979aaSAndroid Build Coastguard Worker #endif
906*3f1979aaSAndroid Build Coastguard Worker }
907*3f1979aaSAndroid Build Coastguard Worker 
908*3f1979aaSAndroid Build Coastguard Worker // Loads two double-precision from unaligned memory, floating-point values.
909*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_pd
_mm_loadu_pd(const double * p)910*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128d _mm_loadu_pd(const double *p)
911*3f1979aaSAndroid Build Coastguard Worker {
912*3f1979aaSAndroid Build Coastguard Worker     return _mm_load_pd(p);
913*3f1979aaSAndroid Build Coastguard Worker }
914*3f1979aaSAndroid Build Coastguard Worker 
915*3f1979aaSAndroid Build Coastguard Worker // Loads an single - precision, floating - point value into the low word and
916*3f1979aaSAndroid Build Coastguard Worker // clears the upper three words.
917*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/548bb9h4%28v=vs.90%29.aspx
_mm_load_ss(const float * p)918*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_load_ss(const float *p)
919*3f1979aaSAndroid Build Coastguard Worker {
920*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128_f32(vsetq_lane_f32(*p, vdupq_n_f32(0), 0));
921*3f1979aaSAndroid Build Coastguard Worker }
922*3f1979aaSAndroid Build Coastguard Worker 
_mm_loadl_epi64(__m128i const * p)923*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *p)
924*3f1979aaSAndroid Build Coastguard Worker {
925*3f1979aaSAndroid Build Coastguard Worker     /* Load the lower 64 bits of the value pointed to by p into the
926*3f1979aaSAndroid Build Coastguard Worker      * lower 64 bits of the result, zeroing the upper 64 bits of the result.
927*3f1979aaSAndroid Build Coastguard Worker      */
928*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_s32(
929*3f1979aaSAndroid Build Coastguard Worker         vcombine_s32(vld1_s32((int32_t const *) p), vcreate_s32(0)));
930*3f1979aaSAndroid Build Coastguard Worker }
931*3f1979aaSAndroid Build Coastguard Worker 
932*3f1979aaSAndroid Build Coastguard Worker // Load a double-precision (64-bit) floating-point element from memory into the
933*3f1979aaSAndroid Build Coastguard Worker // lower element of dst, and copy the upper element from a to dst. mem_addr does
934*3f1979aaSAndroid Build Coastguard Worker // not need to be aligned on any particular boundary.
935*3f1979aaSAndroid Build Coastguard Worker //
936*3f1979aaSAndroid Build Coastguard Worker //   dst[63:0] := MEM[mem_addr+63:mem_addr]
937*3f1979aaSAndroid Build Coastguard Worker //   dst[127:64] := a[127:64]
938*3f1979aaSAndroid Build Coastguard Worker //
939*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadl_pd
_mm_loadl_pd(__m128d a,const double * p)940*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128d _mm_loadl_pd(__m128d a, const double *p)
941*3f1979aaSAndroid Build Coastguard Worker {
942*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
943*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128d_f64(
944*3f1979aaSAndroid Build Coastguard Worker         vcombine_f64(vld1_f64(p), vget_high_f64(vreinterpretq_f64_m128d(a))));
945*3f1979aaSAndroid Build Coastguard Worker #else
946*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128d_f32(
947*3f1979aaSAndroid Build Coastguard Worker         vcombine_f32(vld1_f32((const float *) p),
948*3f1979aaSAndroid Build Coastguard Worker                      vget_high_f32(vreinterpretq_f32_m128d(a))));
949*3f1979aaSAndroid Build Coastguard Worker #endif
950*3f1979aaSAndroid Build Coastguard Worker }
951*3f1979aaSAndroid Build Coastguard Worker 
952*3f1979aaSAndroid Build Coastguard Worker // Load 2 double-precision (64-bit) floating-point elements from memory into dst
953*3f1979aaSAndroid Build Coastguard Worker // in reverse order. mem_addr must be aligned on a 16-byte boundary or a
954*3f1979aaSAndroid Build Coastguard Worker // general-protection exception may be generated.
955*3f1979aaSAndroid Build Coastguard Worker //
956*3f1979aaSAndroid Build Coastguard Worker //   dst[63:0] := MEM[mem_addr+127:mem_addr+64]
957*3f1979aaSAndroid Build Coastguard Worker //   dst[127:64] := MEM[mem_addr+63:mem_addr]
958*3f1979aaSAndroid Build Coastguard Worker //
959*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_pd
_mm_loadr_pd(const double * p)960*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128d _mm_loadr_pd(const double *p)
961*3f1979aaSAndroid Build Coastguard Worker {
962*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
963*3f1979aaSAndroid Build Coastguard Worker     float64x2_t v = vld1q_f64(p);
964*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128d_f64(vextq_f64(v, v, 1));
965*3f1979aaSAndroid Build Coastguard Worker #else
966*3f1979aaSAndroid Build Coastguard Worker     int64x2_t v = vld1q_s64((const int64_t *) p);
967*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128d_s64(vextq_s64(v, v, 1));
968*3f1979aaSAndroid Build Coastguard Worker #endif
969*3f1979aaSAndroid Build Coastguard Worker }
970*3f1979aaSAndroid Build Coastguard Worker 
971*3f1979aaSAndroid Build Coastguard Worker // Sets the low word to the single-precision, floating-point value of b
972*3f1979aaSAndroid Build Coastguard Worker // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/35hdzazd(v=vs.100)
_mm_move_ss(__m128 a,__m128 b)973*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_move_ss(__m128 a, __m128 b)
974*3f1979aaSAndroid Build Coastguard Worker {
975*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128_f32(
976*3f1979aaSAndroid Build Coastguard Worker         vsetq_lane_f32(vgetq_lane_f32(vreinterpretq_f32_m128(b), 0),
977*3f1979aaSAndroid Build Coastguard Worker                        vreinterpretq_f32_m128(a), 0));
978*3f1979aaSAndroid Build Coastguard Worker }
979*3f1979aaSAndroid Build Coastguard Worker 
980*3f1979aaSAndroid Build Coastguard Worker // Copy the lower 64-bit integer in a to the lower element of dst, and zero the
981*3f1979aaSAndroid Build Coastguard Worker // upper element.
982*3f1979aaSAndroid Build Coastguard Worker //
983*3f1979aaSAndroid Build Coastguard Worker //   dst[63:0] := a[63:0]
984*3f1979aaSAndroid Build Coastguard Worker //   dst[127:64] := 0
985*3f1979aaSAndroid Build Coastguard Worker //
986*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_epi64
_mm_move_epi64(__m128i a)987*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_move_epi64(__m128i a)
988*3f1979aaSAndroid Build Coastguard Worker {
989*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_s64(
990*3f1979aaSAndroid Build Coastguard Worker         vsetq_lane_s64(0, vreinterpretq_s64_m128i(a), 1));
991*3f1979aaSAndroid Build Coastguard Worker }
992*3f1979aaSAndroid Build Coastguard Worker 
993*3f1979aaSAndroid Build Coastguard Worker // Return vector of type __m128 with undefined elements.
994*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_undefined_ps
_mm_undefined_ps(void)995*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_undefined_ps(void)
996*3f1979aaSAndroid Build Coastguard Worker {
997*3f1979aaSAndroid Build Coastguard Worker     __m128 a;
998*3f1979aaSAndroid Build Coastguard Worker     return a;
999*3f1979aaSAndroid Build Coastguard Worker }
1000*3f1979aaSAndroid Build Coastguard Worker 
/* Logic/Binary operations */
1002*3f1979aaSAndroid Build Coastguard Worker 
1003*3f1979aaSAndroid Build Coastguard Worker // Computes the bitwise AND-NOT of the four single-precision, floating-point
1004*3f1979aaSAndroid Build Coastguard Worker // values of a and b.
1005*3f1979aaSAndroid Build Coastguard Worker //
1006*3f1979aaSAndroid Build Coastguard Worker //   r0 := ~a0 & b0
1007*3f1979aaSAndroid Build Coastguard Worker //   r1 := ~a1 & b1
1008*3f1979aaSAndroid Build Coastguard Worker //   r2 := ~a2 & b2
1009*3f1979aaSAndroid Build Coastguard Worker //   r3 := ~a3 & b3
1010*3f1979aaSAndroid Build Coastguard Worker //
1011*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/vstudio/68h7wd02(v=vs.100).aspx
_mm_andnot_ps(__m128 a,__m128 b)1012*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b)
1013*3f1979aaSAndroid Build Coastguard Worker {
1014*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128_s32(
1015*3f1979aaSAndroid Build Coastguard Worker         vbicq_s32(vreinterpretq_s32_m128(b),
1016*3f1979aaSAndroid Build Coastguard Worker                   vreinterpretq_s32_m128(a)));  // *NOTE* argument swap
1017*3f1979aaSAndroid Build Coastguard Worker }
1018*3f1979aaSAndroid Build Coastguard Worker 
1019*3f1979aaSAndroid Build Coastguard Worker // Compute the bitwise NOT of packed double-precision (64-bit) floating-point
1020*3f1979aaSAndroid Build Coastguard Worker // elements in a and then AND with b, and store the results in dst.
1021*3f1979aaSAndroid Build Coastguard Worker //
1022*3f1979aaSAndroid Build Coastguard Worker //   FOR j := 0 to 1
1023*3f1979aaSAndroid Build Coastguard Worker // 	     i := j*64
1024*3f1979aaSAndroid Build Coastguard Worker // 	     dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i])
1025*3f1979aaSAndroid Build Coastguard Worker //   ENDFOR
1026*3f1979aaSAndroid Build Coastguard Worker //
1027*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_andnot_pd
_mm_andnot_pd(__m128d a,__m128d b)1028*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128d _mm_andnot_pd(__m128d a, __m128d b)
1029*3f1979aaSAndroid Build Coastguard Worker {
1030*3f1979aaSAndroid Build Coastguard Worker     // *NOTE* argument swap
1031*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128d_s64(
1032*3f1979aaSAndroid Build Coastguard Worker         vbicq_s64(vreinterpretq_s64_m128d(b), vreinterpretq_s64_m128d(a)));
1033*3f1979aaSAndroid Build Coastguard Worker }
1034*3f1979aaSAndroid Build Coastguard Worker 
1035*3f1979aaSAndroid Build Coastguard Worker // Computes the bitwise AND of the 128-bit value in b and the bitwise NOT of the
1036*3f1979aaSAndroid Build Coastguard Worker // 128-bit value in a.
1037*3f1979aaSAndroid Build Coastguard Worker //
1038*3f1979aaSAndroid Build Coastguard Worker //   r := (~a) & b
1039*3f1979aaSAndroid Build Coastguard Worker //
1040*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/vstudio/1beaceh8(v=vs.100).aspx
_mm_andnot_si128(__m128i a,__m128i b)1041*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_andnot_si128(__m128i a, __m128i b)
1042*3f1979aaSAndroid Build Coastguard Worker {
1043*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_s32(
1044*3f1979aaSAndroid Build Coastguard Worker         vbicq_s32(vreinterpretq_s32_m128i(b),
1045*3f1979aaSAndroid Build Coastguard Worker                   vreinterpretq_s32_m128i(a)));  // *NOTE* argument swap
1046*3f1979aaSAndroid Build Coastguard Worker }
1047*3f1979aaSAndroid Build Coastguard Worker 
1048*3f1979aaSAndroid Build Coastguard Worker // Computes the bitwise AND of the 128-bit value in a and the 128-bit value in
1049*3f1979aaSAndroid Build Coastguard Worker // b.
1050*3f1979aaSAndroid Build Coastguard Worker //
1051*3f1979aaSAndroid Build Coastguard Worker //   r := a & b
1052*3f1979aaSAndroid Build Coastguard Worker //
1053*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/vstudio/6d1txsa8(v=vs.100).aspx
_mm_and_si128(__m128i a,__m128i b)1054*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_and_si128(__m128i a, __m128i b)
1055*3f1979aaSAndroid Build Coastguard Worker {
1056*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_s32(
1057*3f1979aaSAndroid Build Coastguard Worker         vandq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
1058*3f1979aaSAndroid Build Coastguard Worker }
1059*3f1979aaSAndroid Build Coastguard Worker 
1060*3f1979aaSAndroid Build Coastguard Worker // Computes the bitwise AND of the four single-precision, floating-point values
1061*3f1979aaSAndroid Build Coastguard Worker // of a and b.
1062*3f1979aaSAndroid Build Coastguard Worker //
1063*3f1979aaSAndroid Build Coastguard Worker //   r0 := a0 & b0
1064*3f1979aaSAndroid Build Coastguard Worker //   r1 := a1 & b1
1065*3f1979aaSAndroid Build Coastguard Worker //   r2 := a2 & b2
1066*3f1979aaSAndroid Build Coastguard Worker //   r3 := a3 & b3
1067*3f1979aaSAndroid Build Coastguard Worker //
1068*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/vstudio/73ck1xc5(v=vs.100).aspx
_mm_and_ps(__m128 a,__m128 b)1069*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b)
1070*3f1979aaSAndroid Build Coastguard Worker {
1071*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128_s32(
1072*3f1979aaSAndroid Build Coastguard Worker         vandq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
1073*3f1979aaSAndroid Build Coastguard Worker }
1074*3f1979aaSAndroid Build Coastguard Worker 
1075*3f1979aaSAndroid Build Coastguard Worker // Compute the bitwise AND of packed double-precision (64-bit) floating-point
1076*3f1979aaSAndroid Build Coastguard Worker // elements in a and b, and store the results in dst.
1077*3f1979aaSAndroid Build Coastguard Worker //
1078*3f1979aaSAndroid Build Coastguard Worker //   FOR j := 0 to 1
1079*3f1979aaSAndroid Build Coastguard Worker //     i := j*64
1080*3f1979aaSAndroid Build Coastguard Worker //     dst[i+63:i] := a[i+63:i] AND b[i+63:i]
1081*3f1979aaSAndroid Build Coastguard Worker //   ENDFOR
1082*3f1979aaSAndroid Build Coastguard Worker //
1083*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_and_pd
_mm_and_pd(__m128d a,__m128d b)1084*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128d _mm_and_pd(__m128d a, __m128d b)
1085*3f1979aaSAndroid Build Coastguard Worker {
1086*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128d_s64(
1087*3f1979aaSAndroid Build Coastguard Worker         vandq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
1088*3f1979aaSAndroid Build Coastguard Worker }
1089*3f1979aaSAndroid Build Coastguard Worker 
1090*3f1979aaSAndroid Build Coastguard Worker // Computes the bitwise OR of the four single-precision, floating-point values
1091*3f1979aaSAndroid Build Coastguard Worker // of a and b.
1092*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/vstudio/7ctdsyy0(v=vs.100).aspx
_mm_or_ps(__m128 a,__m128 b)1093*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b)
1094*3f1979aaSAndroid Build Coastguard Worker {
1095*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128_s32(
1096*3f1979aaSAndroid Build Coastguard Worker         vorrq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
1097*3f1979aaSAndroid Build Coastguard Worker }
1098*3f1979aaSAndroid Build Coastguard Worker 
1099*3f1979aaSAndroid Build Coastguard Worker // Computes bitwise EXOR (exclusive-or) of the four single-precision,
1100*3f1979aaSAndroid Build Coastguard Worker // floating-point values of a and b.
1101*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/ss6k3wk8(v=vs.100).aspx
_mm_xor_ps(__m128 a,__m128 b)1102*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_xor_ps(__m128 a, __m128 b)
1103*3f1979aaSAndroid Build Coastguard Worker {
1104*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128_s32(
1105*3f1979aaSAndroid Build Coastguard Worker         veorq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
1106*3f1979aaSAndroid Build Coastguard Worker }
1107*3f1979aaSAndroid Build Coastguard Worker 
1108*3f1979aaSAndroid Build Coastguard Worker // Compute the bitwise XOR of packed double-precision (64-bit) floating-point
1109*3f1979aaSAndroid Build Coastguard Worker // elements in a and b, and store the results in dst.
1110*3f1979aaSAndroid Build Coastguard Worker //
1111*3f1979aaSAndroid Build Coastguard Worker //   FOR j := 0 to 1
1112*3f1979aaSAndroid Build Coastguard Worker //      i := j*64
1113*3f1979aaSAndroid Build Coastguard Worker //      dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
1114*3f1979aaSAndroid Build Coastguard Worker //   ENDFOR
1115*3f1979aaSAndroid Build Coastguard Worker //
1116*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_xor_pd
_mm_xor_pd(__m128d a,__m128d b)1117*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128d _mm_xor_pd(__m128d a, __m128d b)
1118*3f1979aaSAndroid Build Coastguard Worker {
1119*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128d_s64(
1120*3f1979aaSAndroid Build Coastguard Worker         veorq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
1121*3f1979aaSAndroid Build Coastguard Worker }
1122*3f1979aaSAndroid Build Coastguard Worker 
1123*3f1979aaSAndroid Build Coastguard Worker // Computes the bitwise OR of the 128-bit value in a and the 128-bit value in b.
1124*3f1979aaSAndroid Build Coastguard Worker //
1125*3f1979aaSAndroid Build Coastguard Worker //   r := a | b
1126*3f1979aaSAndroid Build Coastguard Worker //
1127*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/vstudio/ew8ty0db(v=vs.100).aspx
_mm_or_si128(__m128i a,__m128i b)1128*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_or_si128(__m128i a, __m128i b)
1129*3f1979aaSAndroid Build Coastguard Worker {
1130*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_s32(
1131*3f1979aaSAndroid Build Coastguard Worker         vorrq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
1132*3f1979aaSAndroid Build Coastguard Worker }
1133*3f1979aaSAndroid Build Coastguard Worker 
1134*3f1979aaSAndroid Build Coastguard Worker // Computes the bitwise XOR of the 128-bit value in a and the 128-bit value in
1135*3f1979aaSAndroid Build Coastguard Worker // b.  https://msdn.microsoft.com/en-us/library/fzt08www(v=vs.100).aspx
_mm_xor_si128(__m128i a,__m128i b)1136*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b)
1137*3f1979aaSAndroid Build Coastguard Worker {
1138*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_s32(
1139*3f1979aaSAndroid Build Coastguard Worker         veorq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
1140*3f1979aaSAndroid Build Coastguard Worker }
1141*3f1979aaSAndroid Build Coastguard Worker 
1142*3f1979aaSAndroid Build Coastguard Worker // Duplicate odd-indexed single-precision (32-bit) floating-point elements
1143*3f1979aaSAndroid Build Coastguard Worker // from a, and store the results in dst.
1144*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movehdup_ps
_mm_movehdup_ps(__m128 a)1145*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_movehdup_ps(__m128 a)
1146*3f1979aaSAndroid Build Coastguard Worker {
1147*3f1979aaSAndroid Build Coastguard Worker #if __has_builtin(__builtin_shufflevector)
1148*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128_f32(__builtin_shufflevector(
1149*3f1979aaSAndroid Build Coastguard Worker         vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 1, 1, 3, 3));
1150*3f1979aaSAndroid Build Coastguard Worker #else
1151*3f1979aaSAndroid Build Coastguard Worker     float32_t a1 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 1);
1152*3f1979aaSAndroid Build Coastguard Worker     float32_t a3 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 3);
1153*3f1979aaSAndroid Build Coastguard Worker     float ALIGN_STRUCT(16) data[4] = {a1, a1, a3, a3};
1154*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128_f32(vld1q_f32(data));
1155*3f1979aaSAndroid Build Coastguard Worker #endif
1156*3f1979aaSAndroid Build Coastguard Worker }
1157*3f1979aaSAndroid Build Coastguard Worker 
1158*3f1979aaSAndroid Build Coastguard Worker // Duplicate even-indexed single-precision (32-bit) floating-point elements
1159*3f1979aaSAndroid Build Coastguard Worker // from a, and store the results in dst.
1160*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_moveldup_ps
_mm_moveldup_ps(__m128 a)1161*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_moveldup_ps(__m128 a)
1162*3f1979aaSAndroid Build Coastguard Worker {
1163*3f1979aaSAndroid Build Coastguard Worker #if __has_builtin(__builtin_shufflevector)
1164*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128_f32(__builtin_shufflevector(
1165*3f1979aaSAndroid Build Coastguard Worker         vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 0, 0, 2, 2));
1166*3f1979aaSAndroid Build Coastguard Worker #else
1167*3f1979aaSAndroid Build Coastguard Worker     float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
1168*3f1979aaSAndroid Build Coastguard Worker     float32_t a2 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 2);
1169*3f1979aaSAndroid Build Coastguard Worker     float ALIGN_STRUCT(16) data[4] = {a0, a0, a2, a2};
1170*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128_f32(vld1q_f32(data));
1171*3f1979aaSAndroid Build Coastguard Worker #endif
1172*3f1979aaSAndroid Build Coastguard Worker }
1173*3f1979aaSAndroid Build Coastguard Worker 
1174*3f1979aaSAndroid Build Coastguard Worker // Moves the upper two values of B into the lower two values of A.
1175*3f1979aaSAndroid Build Coastguard Worker //
1176*3f1979aaSAndroid Build Coastguard Worker //   r3 := a3
1177*3f1979aaSAndroid Build Coastguard Worker //   r2 := a2
1178*3f1979aaSAndroid Build Coastguard Worker //   r1 := b3
1179*3f1979aaSAndroid Build Coastguard Worker //   r0 := b2
_mm_movehl_ps(__m128 __A,__m128 __B)1180*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_movehl_ps(__m128 __A, __m128 __B)
1181*3f1979aaSAndroid Build Coastguard Worker {
1182*3f1979aaSAndroid Build Coastguard Worker     float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(__A));
1183*3f1979aaSAndroid Build Coastguard Worker     float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(__B));
1184*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128_f32(vcombine_f32(b32, a32));
1185*3f1979aaSAndroid Build Coastguard Worker }
1186*3f1979aaSAndroid Build Coastguard Worker 
1187*3f1979aaSAndroid Build Coastguard Worker // Moves the lower two values of B into the upper two values of A.
1188*3f1979aaSAndroid Build Coastguard Worker //
1189*3f1979aaSAndroid Build Coastguard Worker //   r3 := b1
1190*3f1979aaSAndroid Build Coastguard Worker //   r2 := b0
1191*3f1979aaSAndroid Build Coastguard Worker //   r1 := a1
1192*3f1979aaSAndroid Build Coastguard Worker //   r0 := a0
_mm_movelh_ps(__m128 __A,__m128 __B)1193*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_movelh_ps(__m128 __A, __m128 __B)
1194*3f1979aaSAndroid Build Coastguard Worker {
1195*3f1979aaSAndroid Build Coastguard Worker     float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(__A));
1196*3f1979aaSAndroid Build Coastguard Worker     float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(__B));
1197*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
1198*3f1979aaSAndroid Build Coastguard Worker }
1199*3f1979aaSAndroid Build Coastguard Worker 
1200*3f1979aaSAndroid Build Coastguard Worker // Compute the absolute value of packed signed 32-bit integers in a, and store
1201*3f1979aaSAndroid Build Coastguard Worker // the unsigned results in dst.
1202*3f1979aaSAndroid Build Coastguard Worker //
1203*3f1979aaSAndroid Build Coastguard Worker //   FOR j := 0 to 3
1204*3f1979aaSAndroid Build Coastguard Worker //     i := j*32
1205*3f1979aaSAndroid Build Coastguard Worker //     dst[i+31:i] := ABS(a[i+31:i])
1206*3f1979aaSAndroid Build Coastguard Worker //   ENDFOR
1207*3f1979aaSAndroid Build Coastguard Worker //
1208*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi32
_mm_abs_epi32(__m128i a)1209*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_abs_epi32(__m128i a)
1210*3f1979aaSAndroid Build Coastguard Worker {
1211*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_s32(vabsq_s32(vreinterpretq_s32_m128i(a)));
1212*3f1979aaSAndroid Build Coastguard Worker }
1213*3f1979aaSAndroid Build Coastguard Worker 
1214*3f1979aaSAndroid Build Coastguard Worker // Compute the absolute value of packed signed 16-bit integers in a, and store
1215*3f1979aaSAndroid Build Coastguard Worker // the unsigned results in dst.
1216*3f1979aaSAndroid Build Coastguard Worker //
1217*3f1979aaSAndroid Build Coastguard Worker //   FOR j := 0 to 7
1218*3f1979aaSAndroid Build Coastguard Worker //     i := j*16
1219*3f1979aaSAndroid Build Coastguard Worker //     dst[i+15:i] := ABS(a[i+15:i])
1220*3f1979aaSAndroid Build Coastguard Worker //   ENDFOR
1221*3f1979aaSAndroid Build Coastguard Worker //
1222*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi16
_mm_abs_epi16(__m128i a)1223*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_abs_epi16(__m128i a)
1224*3f1979aaSAndroid Build Coastguard Worker {
1225*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_s16(vabsq_s16(vreinterpretq_s16_m128i(a)));
1226*3f1979aaSAndroid Build Coastguard Worker }
1227*3f1979aaSAndroid Build Coastguard Worker 
1228*3f1979aaSAndroid Build Coastguard Worker // Compute the absolute value of packed signed 8-bit integers in a, and store
1229*3f1979aaSAndroid Build Coastguard Worker // the unsigned results in dst.
1230*3f1979aaSAndroid Build Coastguard Worker //
1231*3f1979aaSAndroid Build Coastguard Worker //   FOR j := 0 to 15
1232*3f1979aaSAndroid Build Coastguard Worker //     i := j*8
1233*3f1979aaSAndroid Build Coastguard Worker //     dst[i+7:i] := ABS(a[i+7:i])
1234*3f1979aaSAndroid Build Coastguard Worker //   ENDFOR
1235*3f1979aaSAndroid Build Coastguard Worker //
1236*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi8
_mm_abs_epi8(__m128i a)1237*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_abs_epi8(__m128i a)
1238*3f1979aaSAndroid Build Coastguard Worker {
1239*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_s8(vabsq_s8(vreinterpretq_s8_m128i(a)));
1240*3f1979aaSAndroid Build Coastguard Worker }
1241*3f1979aaSAndroid Build Coastguard Worker 
1242*3f1979aaSAndroid Build Coastguard Worker // Compute the absolute value of packed signed 32-bit integers in a, and store
1243*3f1979aaSAndroid Build Coastguard Worker // the unsigned results in dst.
1244*3f1979aaSAndroid Build Coastguard Worker //
1245*3f1979aaSAndroid Build Coastguard Worker //   FOR j := 0 to 1
1246*3f1979aaSAndroid Build Coastguard Worker //     i := j*32
1247*3f1979aaSAndroid Build Coastguard Worker //     dst[i+31:i] := ABS(a[i+31:i])
1248*3f1979aaSAndroid Build Coastguard Worker //   ENDFOR
1249*3f1979aaSAndroid Build Coastguard Worker //
1250*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi32
_mm_abs_pi32(__m64 a)1251*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m64 _mm_abs_pi32(__m64 a)
1252*3f1979aaSAndroid Build Coastguard Worker {
1253*3f1979aaSAndroid Build Coastguard Worker     return vreinterpret_m64_s32(vabs_s32(vreinterpret_s32_m64(a)));
1254*3f1979aaSAndroid Build Coastguard Worker }
1255*3f1979aaSAndroid Build Coastguard Worker 
1256*3f1979aaSAndroid Build Coastguard Worker // Compute the absolute value of packed signed 16-bit integers in a, and store
1257*3f1979aaSAndroid Build Coastguard Worker // the unsigned results in dst.
1258*3f1979aaSAndroid Build Coastguard Worker //
1259*3f1979aaSAndroid Build Coastguard Worker //   FOR j := 0 to 3
1260*3f1979aaSAndroid Build Coastguard Worker //     i := j*16
1261*3f1979aaSAndroid Build Coastguard Worker //     dst[i+15:i] := ABS(a[i+15:i])
1262*3f1979aaSAndroid Build Coastguard Worker //   ENDFOR
1263*3f1979aaSAndroid Build Coastguard Worker //
1264*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi16
_mm_abs_pi16(__m64 a)1265*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m64 _mm_abs_pi16(__m64 a)
1266*3f1979aaSAndroid Build Coastguard Worker {
1267*3f1979aaSAndroid Build Coastguard Worker     return vreinterpret_m64_s16(vabs_s16(vreinterpret_s16_m64(a)));
1268*3f1979aaSAndroid Build Coastguard Worker }
1269*3f1979aaSAndroid Build Coastguard Worker 
1270*3f1979aaSAndroid Build Coastguard Worker // Compute the absolute value of packed signed 8-bit integers in a, and store
1271*3f1979aaSAndroid Build Coastguard Worker // the unsigned results in dst.
1272*3f1979aaSAndroid Build Coastguard Worker //
1273*3f1979aaSAndroid Build Coastguard Worker //   FOR j := 0 to 7
1274*3f1979aaSAndroid Build Coastguard Worker //     i := j*8
1275*3f1979aaSAndroid Build Coastguard Worker //     dst[i+7:i] := ABS(a[i+7:i])
1276*3f1979aaSAndroid Build Coastguard Worker //   ENDFOR
1277*3f1979aaSAndroid Build Coastguard Worker //
1278*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi8
_mm_abs_pi8(__m64 a)1279*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m64 _mm_abs_pi8(__m64 a)
1280*3f1979aaSAndroid Build Coastguard Worker {
1281*3f1979aaSAndroid Build Coastguard Worker     return vreinterpret_m64_s8(vabs_s8(vreinterpret_s8_m64(a)));
1282*3f1979aaSAndroid Build Coastguard Worker }
1283*3f1979aaSAndroid Build Coastguard Worker 
1284*3f1979aaSAndroid Build Coastguard Worker // Takes the upper 64 bits of a and places it in the low end of the result
1285*3f1979aaSAndroid Build Coastguard Worker // Takes the lower 64 bits of b and places it into the high end of the result.
_mm_shuffle_ps_1032(__m128 a,__m128 b)1286*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_shuffle_ps_1032(__m128 a, __m128 b)
1287*3f1979aaSAndroid Build Coastguard Worker {
1288*3f1979aaSAndroid Build Coastguard Worker     float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
1289*3f1979aaSAndroid Build Coastguard Worker     float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
1290*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128_f32(vcombine_f32(a32, b10));
1291*3f1979aaSAndroid Build Coastguard Worker }
1292*3f1979aaSAndroid Build Coastguard Worker 
1293*3f1979aaSAndroid Build Coastguard Worker // takes the lower two 32-bit values from a and swaps them and places in high
1294*3f1979aaSAndroid Build Coastguard Worker // end of result takes the higher two 32 bit values from b and swaps them and
1295*3f1979aaSAndroid Build Coastguard Worker // places in low end of result.
_mm_shuffle_ps_2301(__m128 a,__m128 b)1296*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_shuffle_ps_2301(__m128 a, __m128 b)
1297*3f1979aaSAndroid Build Coastguard Worker {
1298*3f1979aaSAndroid Build Coastguard Worker     float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
1299*3f1979aaSAndroid Build Coastguard Worker     float32x2_t b23 = vrev64_f32(vget_high_f32(vreinterpretq_f32_m128(b)));
1300*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128_f32(vcombine_f32(a01, b23));
1301*3f1979aaSAndroid Build Coastguard Worker }
1302*3f1979aaSAndroid Build Coastguard Worker 
_mm_shuffle_ps_0321(__m128 a,__m128 b)1303*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_shuffle_ps_0321(__m128 a, __m128 b)
1304*3f1979aaSAndroid Build Coastguard Worker {
1305*3f1979aaSAndroid Build Coastguard Worker     float32x2_t a21 = vget_high_f32(
1306*3f1979aaSAndroid Build Coastguard Worker         vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
1307*3f1979aaSAndroid Build Coastguard Worker     float32x2_t b03 = vget_low_f32(
1308*3f1979aaSAndroid Build Coastguard Worker         vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
1309*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128_f32(vcombine_f32(a21, b03));
1310*3f1979aaSAndroid Build Coastguard Worker }
1311*3f1979aaSAndroid Build Coastguard Worker 
_mm_shuffle_ps_2103(__m128 a,__m128 b)1312*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_shuffle_ps_2103(__m128 a, __m128 b)
1313*3f1979aaSAndroid Build Coastguard Worker {
1314*3f1979aaSAndroid Build Coastguard Worker     float32x2_t a03 = vget_low_f32(
1315*3f1979aaSAndroid Build Coastguard Worker         vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
1316*3f1979aaSAndroid Build Coastguard Worker     float32x2_t b21 = vget_high_f32(
1317*3f1979aaSAndroid Build Coastguard Worker         vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
1318*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128_f32(vcombine_f32(a03, b21));
1319*3f1979aaSAndroid Build Coastguard Worker }
1320*3f1979aaSAndroid Build Coastguard Worker 
_mm_shuffle_ps_1010(__m128 a,__m128 b)1321*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_shuffle_ps_1010(__m128 a, __m128 b)
1322*3f1979aaSAndroid Build Coastguard Worker {
1323*3f1979aaSAndroid Build Coastguard Worker     float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
1324*3f1979aaSAndroid Build Coastguard Worker     float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
1325*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
1326*3f1979aaSAndroid Build Coastguard Worker }
1327*3f1979aaSAndroid Build Coastguard Worker 
_mm_shuffle_ps_1001(__m128 a,__m128 b)1328*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_shuffle_ps_1001(__m128 a, __m128 b)
1329*3f1979aaSAndroid Build Coastguard Worker {
1330*3f1979aaSAndroid Build Coastguard Worker     float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
1331*3f1979aaSAndroid Build Coastguard Worker     float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
1332*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128_f32(vcombine_f32(a01, b10));
1333*3f1979aaSAndroid Build Coastguard Worker }
1334*3f1979aaSAndroid Build Coastguard Worker 
_mm_shuffle_ps_0101(__m128 a,__m128 b)1335*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_shuffle_ps_0101(__m128 a, __m128 b)
1336*3f1979aaSAndroid Build Coastguard Worker {
1337*3f1979aaSAndroid Build Coastguard Worker     float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
1338*3f1979aaSAndroid Build Coastguard Worker     float32x2_t b01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(b)));
1339*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128_f32(vcombine_f32(a01, b01));
1340*3f1979aaSAndroid Build Coastguard Worker }
1341*3f1979aaSAndroid Build Coastguard Worker 
1342*3f1979aaSAndroid Build Coastguard Worker // keeps the low 64 bits of b in the low and puts the high 64 bits of a in the
1343*3f1979aaSAndroid Build Coastguard Worker // high
_mm_shuffle_ps_3210(__m128 a,__m128 b)1344*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_shuffle_ps_3210(__m128 a, __m128 b)
1345*3f1979aaSAndroid Build Coastguard Worker {
1346*3f1979aaSAndroid Build Coastguard Worker     float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
1347*3f1979aaSAndroid Build Coastguard Worker     float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
1348*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128_f32(vcombine_f32(a10, b32));
1349*3f1979aaSAndroid Build Coastguard Worker }
1350*3f1979aaSAndroid Build Coastguard Worker 
_mm_shuffle_ps_0011(__m128 a,__m128 b)1351*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_shuffle_ps_0011(__m128 a, __m128 b)
1352*3f1979aaSAndroid Build Coastguard Worker {
1353*3f1979aaSAndroid Build Coastguard Worker     float32x2_t a11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 1);
1354*3f1979aaSAndroid Build Coastguard Worker     float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
1355*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128_f32(vcombine_f32(a11, b00));
1356*3f1979aaSAndroid Build Coastguard Worker }
1357*3f1979aaSAndroid Build Coastguard Worker 
_mm_shuffle_ps_0022(__m128 a,__m128 b)1358*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_shuffle_ps_0022(__m128 a, __m128 b)
1359*3f1979aaSAndroid Build Coastguard Worker {
1360*3f1979aaSAndroid Build Coastguard Worker     float32x2_t a22 =
1361*3f1979aaSAndroid Build Coastguard Worker         vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
1362*3f1979aaSAndroid Build Coastguard Worker     float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
1363*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128_f32(vcombine_f32(a22, b00));
1364*3f1979aaSAndroid Build Coastguard Worker }
1365*3f1979aaSAndroid Build Coastguard Worker 
_mm_shuffle_ps_2200(__m128 a,__m128 b)1366*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_shuffle_ps_2200(__m128 a, __m128 b)
1367*3f1979aaSAndroid Build Coastguard Worker {
1368*3f1979aaSAndroid Build Coastguard Worker     float32x2_t a00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 0);
1369*3f1979aaSAndroid Build Coastguard Worker     float32x2_t b22 =
1370*3f1979aaSAndroid Build Coastguard Worker         vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(b)), 0);
1371*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128_f32(vcombine_f32(a00, b22));
1372*3f1979aaSAndroid Build Coastguard Worker }
1373*3f1979aaSAndroid Build Coastguard Worker 
_mm_shuffle_ps_3202(__m128 a,__m128 b)1374*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_shuffle_ps_3202(__m128 a, __m128 b)
1375*3f1979aaSAndroid Build Coastguard Worker {
1376*3f1979aaSAndroid Build Coastguard Worker     float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
1377*3f1979aaSAndroid Build Coastguard Worker     float32x2_t a22 =
1378*3f1979aaSAndroid Build Coastguard Worker         vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
1379*3f1979aaSAndroid Build Coastguard Worker     float32x2_t a02 = vset_lane_f32(a0, a22, 1); /* TODO: use vzip ?*/
1380*3f1979aaSAndroid Build Coastguard Worker     float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
1381*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128_f32(vcombine_f32(a02, b32));
1382*3f1979aaSAndroid Build Coastguard Worker }
1383*3f1979aaSAndroid Build Coastguard Worker 
_mm_shuffle_ps_1133(__m128 a,__m128 b)1384*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_shuffle_ps_1133(__m128 a, __m128 b)
1385*3f1979aaSAndroid Build Coastguard Worker {
1386*3f1979aaSAndroid Build Coastguard Worker     float32x2_t a33 =
1387*3f1979aaSAndroid Build Coastguard Worker         vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 1);
1388*3f1979aaSAndroid Build Coastguard Worker     float32x2_t b11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 1);
1389*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128_f32(vcombine_f32(a33, b11));
1390*3f1979aaSAndroid Build Coastguard Worker }
1391*3f1979aaSAndroid Build Coastguard Worker 
_mm_shuffle_ps_2010(__m128 a,__m128 b)1392*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_shuffle_ps_2010(__m128 a, __m128 b)
1393*3f1979aaSAndroid Build Coastguard Worker {
1394*3f1979aaSAndroid Build Coastguard Worker     float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
1395*3f1979aaSAndroid Build Coastguard Worker     float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2);
1396*3f1979aaSAndroid Build Coastguard Worker     float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
1397*3f1979aaSAndroid Build Coastguard Worker     float32x2_t b20 = vset_lane_f32(b2, b00, 1);
1398*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128_f32(vcombine_f32(a10, b20));
1399*3f1979aaSAndroid Build Coastguard Worker }
1400*3f1979aaSAndroid Build Coastguard Worker 
_mm_shuffle_ps_2001(__m128 a,__m128 b)1401*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_shuffle_ps_2001(__m128 a, __m128 b)
1402*3f1979aaSAndroid Build Coastguard Worker {
1403*3f1979aaSAndroid Build Coastguard Worker     float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
1404*3f1979aaSAndroid Build Coastguard Worker     float32_t b2 = vgetq_lane_f32(b, 2);
1405*3f1979aaSAndroid Build Coastguard Worker     float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
1406*3f1979aaSAndroid Build Coastguard Worker     float32x2_t b20 = vset_lane_f32(b2, b00, 1);
1407*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128_f32(vcombine_f32(a01, b20));
1408*3f1979aaSAndroid Build Coastguard Worker }
1409*3f1979aaSAndroid Build Coastguard Worker 
_mm_shuffle_ps_2032(__m128 a,__m128 b)1410*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_shuffle_ps_2032(__m128 a, __m128 b)
1411*3f1979aaSAndroid Build Coastguard Worker {
1412*3f1979aaSAndroid Build Coastguard Worker     float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
1413*3f1979aaSAndroid Build Coastguard Worker     float32_t b2 = vgetq_lane_f32(b, 2);
1414*3f1979aaSAndroid Build Coastguard Worker     float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
1415*3f1979aaSAndroid Build Coastguard Worker     float32x2_t b20 = vset_lane_f32(b2, b00, 1);
1416*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128_f32(vcombine_f32(a32, b20));
1417*3f1979aaSAndroid Build Coastguard Worker }
1418*3f1979aaSAndroid Build Coastguard Worker 
// NEON does not support a general purpose permute intrinsic, so the result is
// assembled one lane at a time.
// Selects four specific single-precision, floating-point values from a and b,
// based on the mask imm.
//
// C equivalent:
//   __m128 _mm_shuffle_ps_default(__m128 a, __m128 b,
//                                 __constrange(0, 255) int imm) {
//       __m128 ret;
//       ret[0] = a[imm        & 0x3];   ret[1] = a[(imm >> 2) & 0x3];
//       ret[2] = b[(imm >> 4) & 0x03];  ret[3] = b[(imm >> 6) & 0x03];
//       return ret;
//   }
//
// imm is used as a NEON lane index, so it must be a compile-time constant.
// a and b are each evaluated exactly once. The temporaries carry a _sse2neon_
// prefix so that they cannot capture a caller variable passed as an argument
// (for example a local named "ret").
//
// https://msdn.microsoft.com/en-us/library/vstudio/5f0858x0(v=vs.100).aspx
#define _mm_shuffle_ps_default(a, b, imm)                                   \
    __extension__({                                                         \
        float32x4_t _sse2neon_a = vreinterpretq_f32_m128(a);                \
        float32x4_t _sse2neon_b = vreinterpretq_f32_m128(b);                \
        float32x4_t _sse2neon_ret =                                         \
            vmovq_n_f32(vgetq_lane_f32(_sse2neon_a, (imm) & (0x3)));        \
        _sse2neon_ret = vsetq_lane_f32(                                     \
            vgetq_lane_f32(_sse2neon_a, ((imm) >> 2) & 0x3), _sse2neon_ret, \
            1);                                                             \
        _sse2neon_ret = vsetq_lane_f32(                                     \
            vgetq_lane_f32(_sse2neon_b, ((imm) >> 4) & 0x3), _sse2neon_ret, \
            2);                                                             \
        _sse2neon_ret = vsetq_lane_f32(                                     \
            vgetq_lane_f32(_sse2neon_b, ((imm) >> 6) & 0x3), _sse2neon_ret, \
            3);                                                             \
        vreinterpretq_m128_f32(_sse2neon_ret);                              \
    })
1449*3f1979aaSAndroid Build Coastguard Worker 
1450*3f1979aaSAndroid Build Coastguard Worker // FORCE_INLINE __m128 _mm_shuffle_ps(__m128 a, __m128 b, __constrange(0,255)
1451*3f1979aaSAndroid Build Coastguard Worker // int imm)
1452*3f1979aaSAndroid Build Coastguard Worker #if __has_builtin(__builtin_shufflevector)
1453*3f1979aaSAndroid Build Coastguard Worker #define _mm_shuffle_ps(a, b, imm)                                \
1454*3f1979aaSAndroid Build Coastguard Worker     __extension__({                                              \
1455*3f1979aaSAndroid Build Coastguard Worker         float32x4_t _input1 = vreinterpretq_f32_m128(a);         \
1456*3f1979aaSAndroid Build Coastguard Worker         float32x4_t _input2 = vreinterpretq_f32_m128(b);         \
1457*3f1979aaSAndroid Build Coastguard Worker         float32x4_t _shuf = __builtin_shufflevector(             \
1458*3f1979aaSAndroid Build Coastguard Worker             _input1, _input2, (imm) & (0x3), ((imm) >> 2) & 0x3, \
1459*3f1979aaSAndroid Build Coastguard Worker             (((imm) >> 4) & 0x3) + 4, (((imm) >> 6) & 0x3) + 4); \
1460*3f1979aaSAndroid Build Coastguard Worker         vreinterpretq_m128_f32(_shuf);                           \
1461*3f1979aaSAndroid Build Coastguard Worker     })
1462*3f1979aaSAndroid Build Coastguard Worker #else  // generic
1463*3f1979aaSAndroid Build Coastguard Worker #define _mm_shuffle_ps(a, b, imm)                          \
1464*3f1979aaSAndroid Build Coastguard Worker     __extension__({                                        \
1465*3f1979aaSAndroid Build Coastguard Worker         __m128 ret;                                        \
1466*3f1979aaSAndroid Build Coastguard Worker         switch (imm) {                                     \
1467*3f1979aaSAndroid Build Coastguard Worker         case _MM_SHUFFLE(1, 0, 3, 2):                      \
1468*3f1979aaSAndroid Build Coastguard Worker             ret = _mm_shuffle_ps_1032((a), (b));           \
1469*3f1979aaSAndroid Build Coastguard Worker             break;                                         \
1470*3f1979aaSAndroid Build Coastguard Worker         case _MM_SHUFFLE(2, 3, 0, 1):                      \
1471*3f1979aaSAndroid Build Coastguard Worker             ret = _mm_shuffle_ps_2301((a), (b));           \
1472*3f1979aaSAndroid Build Coastguard Worker             break;                                         \
1473*3f1979aaSAndroid Build Coastguard Worker         case _MM_SHUFFLE(0, 3, 2, 1):                      \
1474*3f1979aaSAndroid Build Coastguard Worker             ret = _mm_shuffle_ps_0321((a), (b));           \
1475*3f1979aaSAndroid Build Coastguard Worker             break;                                         \
1476*3f1979aaSAndroid Build Coastguard Worker         case _MM_SHUFFLE(2, 1, 0, 3):                      \
1477*3f1979aaSAndroid Build Coastguard Worker             ret = _mm_shuffle_ps_2103((a), (b));           \
1478*3f1979aaSAndroid Build Coastguard Worker             break;                                         \
1479*3f1979aaSAndroid Build Coastguard Worker         case _MM_SHUFFLE(1, 0, 1, 0):                      \
1480*3f1979aaSAndroid Build Coastguard Worker             ret = _mm_movelh_ps((a), (b));                 \
1481*3f1979aaSAndroid Build Coastguard Worker             break;                                         \
1482*3f1979aaSAndroid Build Coastguard Worker         case _MM_SHUFFLE(1, 0, 0, 1):                      \
1483*3f1979aaSAndroid Build Coastguard Worker             ret = _mm_shuffle_ps_1001((a), (b));           \
1484*3f1979aaSAndroid Build Coastguard Worker             break;                                         \
1485*3f1979aaSAndroid Build Coastguard Worker         case _MM_SHUFFLE(0, 1, 0, 1):                      \
1486*3f1979aaSAndroid Build Coastguard Worker             ret = _mm_shuffle_ps_0101((a), (b));           \
1487*3f1979aaSAndroid Build Coastguard Worker             break;                                         \
1488*3f1979aaSAndroid Build Coastguard Worker         case _MM_SHUFFLE(3, 2, 1, 0):                      \
1489*3f1979aaSAndroid Build Coastguard Worker             ret = _mm_shuffle_ps_3210((a), (b));           \
1490*3f1979aaSAndroid Build Coastguard Worker             break;                                         \
1491*3f1979aaSAndroid Build Coastguard Worker         case _MM_SHUFFLE(0, 0, 1, 1):                      \
1492*3f1979aaSAndroid Build Coastguard Worker             ret = _mm_shuffle_ps_0011((a), (b));           \
1493*3f1979aaSAndroid Build Coastguard Worker             break;                                         \
1494*3f1979aaSAndroid Build Coastguard Worker         case _MM_SHUFFLE(0, 0, 2, 2):                      \
1495*3f1979aaSAndroid Build Coastguard Worker             ret = _mm_shuffle_ps_0022((a), (b));           \
1496*3f1979aaSAndroid Build Coastguard Worker             break;                                         \
1497*3f1979aaSAndroid Build Coastguard Worker         case _MM_SHUFFLE(2, 2, 0, 0):                      \
1498*3f1979aaSAndroid Build Coastguard Worker             ret = _mm_shuffle_ps_2200((a), (b));           \
1499*3f1979aaSAndroid Build Coastguard Worker             break;                                         \
1500*3f1979aaSAndroid Build Coastguard Worker         case _MM_SHUFFLE(3, 2, 0, 2):                      \
1501*3f1979aaSAndroid Build Coastguard Worker             ret = _mm_shuffle_ps_3202((a), (b));           \
1502*3f1979aaSAndroid Build Coastguard Worker             break;                                         \
1503*3f1979aaSAndroid Build Coastguard Worker         case _MM_SHUFFLE(3, 2, 3, 2):                      \
1504*3f1979aaSAndroid Build Coastguard Worker             ret = _mm_movehl_ps((b), (a));                 \
1505*3f1979aaSAndroid Build Coastguard Worker             break;                                         \
1506*3f1979aaSAndroid Build Coastguard Worker         case _MM_SHUFFLE(1, 1, 3, 3):                      \
1507*3f1979aaSAndroid Build Coastguard Worker             ret = _mm_shuffle_ps_1133((a), (b));           \
1508*3f1979aaSAndroid Build Coastguard Worker             break;                                         \
1509*3f1979aaSAndroid Build Coastguard Worker         case _MM_SHUFFLE(2, 0, 1, 0):                      \
1510*3f1979aaSAndroid Build Coastguard Worker             ret = _mm_shuffle_ps_2010((a), (b));           \
1511*3f1979aaSAndroid Build Coastguard Worker             break;                                         \
1512*3f1979aaSAndroid Build Coastguard Worker         case _MM_SHUFFLE(2, 0, 0, 1):                      \
1513*3f1979aaSAndroid Build Coastguard Worker             ret = _mm_shuffle_ps_2001((a), (b));           \
1514*3f1979aaSAndroid Build Coastguard Worker             break;                                         \
1515*3f1979aaSAndroid Build Coastguard Worker         case _MM_SHUFFLE(2, 0, 3, 2):                      \
1516*3f1979aaSAndroid Build Coastguard Worker             ret = _mm_shuffle_ps_2032((a), (b));           \
1517*3f1979aaSAndroid Build Coastguard Worker             break;                                         \
1518*3f1979aaSAndroid Build Coastguard Worker         default:                                           \
1519*3f1979aaSAndroid Build Coastguard Worker             ret = _mm_shuffle_ps_default((a), (b), (imm)); \
1520*3f1979aaSAndroid Build Coastguard Worker             break;                                         \
1521*3f1979aaSAndroid Build Coastguard Worker         }                                                  \
1522*3f1979aaSAndroid Build Coastguard Worker         ret;                                               \
1523*3f1979aaSAndroid Build Coastguard Worker     })
1524*3f1979aaSAndroid Build Coastguard Worker #endif
1525*3f1979aaSAndroid Build Coastguard Worker 
// Swaps the two 64-bit halves of a: the upper 64 bits of a end up in the low
// half of the result and the lower 64 bits of a in the high half.
// Used by the generic _mm_shuffle_epi32 for _MM_SHUFFLE(1, 0, 3, 2).
FORCE_INLINE __m128i _mm_shuffle_epi_1032(__m128i a)
{
    const int32x4_t lanes = vreinterpretq_s32_m128i(a);
    return vreinterpretq_m128i_s32(
        vcombine_s32(vget_high_s32(lanes), vget_low_s32(lanes)));
}
1534*3f1979aaSAndroid Build Coastguard Worker 
// Swaps the two 32-bit values inside each 64-bit half of a: the low half of
// the result is the reversed low half of a, and the high half of the result
// is the reversed high half of a.
// Used by the generic _mm_shuffle_epi32 for _MM_SHUFFLE(2, 3, 0, 1).
FORCE_INLINE __m128i _mm_shuffle_epi_2301(__m128i a)
{
    const int32x4_t lanes = vreinterpretq_s32_m128i(a);
    const int32x2_t low_swapped = vrev64_s32(vget_low_s32(lanes));
    const int32x2_t high_swapped = vrev64_s32(vget_high_s32(lanes));
    return vreinterpretq_m128i_s32(vcombine_s32(low_swapped, high_swapped));
}
1544*3f1979aaSAndroid Build Coastguard Worker 
// Rotates the least significant 32 bits into the most significant 32 bits and
// shifts the remaining three 32-bit values down by one position.
// Used by the generic _mm_shuffle_epi32 for _MM_SHUFFLE(0, 3, 2, 1).
FORCE_INLINE __m128i _mm_shuffle_epi_0321(__m128i a)
{
    const int32x4_t lanes = vreinterpretq_s32_m128i(a);
    const int32x4_t rotated = vextq_s32(lanes, lanes, 1);
    return vreinterpretq_m128i_s32(rotated);
}
1552*3f1979aaSAndroid Build Coastguard Worker 
// Rotates the most significant 32 bits into the least significant 32 bits and
// shifts the remaining three 32-bit values up by one position.
// Used by the generic _mm_shuffle_epi32 for _MM_SHUFFLE(2, 1, 0, 3).
FORCE_INLINE __m128i _mm_shuffle_epi_2103(__m128i a)
{
    const int32x4_t lanes = vreinterpretq_s32_m128i(a);
    const int32x4_t rotated = vextq_s32(lanes, lanes, 3);
    return vreinterpretq_m128i_s32(rotated);
}
1560*3f1979aaSAndroid Build Coastguard Worker 
// Duplicates the lower 64 bits of a into both the lower and the upper 64 bits
// of the result.
// Used by the generic _mm_shuffle_epi32 for _MM_SHUFFLE(1, 0, 1, 0).
FORCE_INLINE __m128i _mm_shuffle_epi_1010(__m128i a)
{
    const int32x4_t lanes = vreinterpretq_s32_m128i(a);
    const int32x2_t low_half = vget_low_s32(lanes);
    const int32x4_t doubled = vcombine_s32(low_half, low_half);
    return vreinterpretq_m128i_s32(doubled);
}
1568*3f1979aaSAndroid Build Coastguard Worker 
// Builds the result from the lower 64 bits of a only:
//   - lower 64 bits of the result: elements 0 and 1 of a, swapped;
//   - upper 64 bits of the result: elements 0 and 1 of a, in original order.
// Used by the generic _mm_shuffle_epi32 for _MM_SHUFFLE(1, 0, 0, 1).
FORCE_INLINE __m128i _mm_shuffle_epi_1001(__m128i a)
{
    const int32x2_t low_half = vget_low_s32(vreinterpretq_s32_m128i(a));
    const int32x2_t low_half_swapped = vrev64_s32(low_half);
    return vreinterpretq_m128i_s32(vcombine_s32(low_half_swapped, low_half));
}
1577*3f1979aaSAndroid Build Coastguard Worker 
// Takes the lower 64 bits of a, swaps elements 0 and 1, and stores that
// swapped pair in both the lower and the upper 64 bits of the result.
// Used by the generic _mm_shuffle_epi32 for _MM_SHUFFLE(0, 1, 0, 1).
FORCE_INLINE __m128i _mm_shuffle_epi_0101(__m128i a)
{
    const int32x4_t lanes = vreinterpretq_s32_m128i(a);
    const int32x2_t low_half_swapped = vrev64_s32(vget_low_s32(lanes));
    const int32x4_t doubled = vcombine_s32(low_half_swapped, low_half_swapped);
    return vreinterpretq_m128i_s32(doubled);
}
1586*3f1979aaSAndroid Build Coastguard Worker 
// Fills the lower 64 bits of the result with two copies of element 1 of a and
// the upper 64 bits with two copies of element 2 of a.
// Used by the generic _mm_shuffle_epi32 for _MM_SHUFFLE(2, 2, 1, 1).
FORCE_INLINE __m128i _mm_shuffle_epi_2211(__m128i a)
{
    const int32x4_t lanes = vreinterpretq_s32_m128i(a);
    const int32x2_t elem1_pair = vdup_lane_s32(vget_low_s32(lanes), 1);
    const int32x2_t elem2_pair = vdup_lane_s32(vget_high_s32(lanes), 0);
    return vreinterpretq_m128i_s32(vcombine_s32(elem1_pair, elem2_pair));
}
1593*3f1979aaSAndroid Build Coastguard Worker 
// Fills the lower 64 bits of the result with two copies of element 2 of a and
// the upper 64 bits with elements 0 and 1 of a, swapped.
// Used by the generic _mm_shuffle_epi32 for _MM_SHUFFLE(0, 1, 2, 2).
FORCE_INLINE __m128i _mm_shuffle_epi_0122(__m128i a)
{
    const int32x4_t lanes = vreinterpretq_s32_m128i(a);
    const int32x2_t elem2_pair = vdup_lane_s32(vget_high_s32(lanes), 0);
    const int32x2_t low_half_swapped = vrev64_s32(vget_low_s32(lanes));
    return vreinterpretq_m128i_s32(vcombine_s32(elem2_pair, low_half_swapped));
}
1600*3f1979aaSAndroid Build Coastguard Worker 
// Copies the upper 64 bits of a (elements 2 and 3) into the lower 64 bits of
// the result and fills the upper 64 bits with two copies of element 3.
// Used by the generic _mm_shuffle_epi32 for _MM_SHUFFLE(3, 3, 3, 2).
FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a)
{
    const int32x2_t high_half = vget_high_s32(vreinterpretq_s32_m128i(a));
    const int32x2_t elem3_pair = vdup_lane_s32(high_half, 1);
    return vreinterpretq_m128i_s32(vcombine_s32(high_half, elem3_pair));
}
1607*3f1979aaSAndroid Build Coastguard Worker 
// Shuffle packed 8-bit integers in a according to shuffle control mask in the
// corresponding 8-bit element of b, and store the results in dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_epi8
//
// a is used as a 16-byte lookup table and b supplies one index per output
// byte. Three implementations are selected at compile time: a single 16-byte
// table lookup on AArch64, inline assembly on 32-bit GNU-compatible compilers,
// and a pair of 8-byte vtbl2 lookups everywhere else.
FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b)
{
    int8x16_t tbl = vreinterpretq_s8_m128i(a);   // input a
    uint8x16_t idx = vreinterpretq_u8_m128i(b);  // input b
    // Keep only the low four index bits and bit 7 of every control byte; the
    // remaining bits are meaningless for the shuffle and must not reach the
    // table lookup.
    uint8x16_t idx_masked = vandq_u8(idx, vdupq_n_u8(0x8F));
#if defined(__aarch64__)
    return vreinterpretq_m128i_s8(vqtbl1q_s8(tbl, idx_masked));
#elif defined(__GNUC__)
    int8x16_t ret;
    // %e and %f represent the even and odd D registers
    // respectively.
    __asm__ __volatile__(
        "vtbl.8  %e[ret], {%e[tbl], %f[tbl]}, %e[idx]\n"
        "vtbl.8  %f[ret], {%e[tbl], %f[tbl]}, %f[idx]\n"
        : [ret] "=&w"(ret)
        : [tbl] "w"(tbl), [idx] "w"(idx_masked));
    return vreinterpretq_m128i_s8(ret);
#else
    // Intrinsics-only fallback: look up each 8-byte half of the index vector
    // in the 16-byte table. vtbl2_s8 takes its indices as int8x8_t, so the
    // unsigned masked indices are reinterpreted (bit pattern unchanged)
    // instead of relying on implicit vector conversions.
    int8x8x2_t a_split = {vget_low_s8(tbl), vget_high_s8(tbl)};
    int8x8_t idx_lo = vreinterpret_s8_u8(vget_low_u8(idx_masked));
    int8x8_t idx_hi = vreinterpret_s8_u8(vget_high_u8(idx_masked));
    return vreinterpretq_m128i_s8(
        vcombine_s8(vtbl2_s8(a_split, idx_lo), vtbl2_s8(a_split, idx_hi)));
#endif
}
1637*3f1979aaSAndroid Build Coastguard Worker 
// Generic 32-bit element shuffle: each 2-bit field of imm selects the source
// element for one element of the result.
//
// C equivalent:
//   __m128i _mm_shuffle_epi32_default(__m128i a,
//                                     __constrange(0, 255) int imm) {
//       __m128i ret;
//       ret[0] = a[imm        & 0x3];   ret[1] = a[(imm >> 2) & 0x3];
//       ret[2] = a[(imm >> 4) & 0x03];  ret[3] = a[(imm >> 6) & 0x03];
//       return ret;
//   }
//
// The argument a is evaluated exactly once. The temporaries use a
// macro-specific prefix so that an argument expression referring to a caller
// variable (for example one named `ret`) is not captured by a local declared
// here. imm is substituted textually into every lane index and is expected to
// be a compile-time constant.
#define _mm_shuffle_epi32_default(a, imm)                                 \
    __extension__({                                                       \
        const int32x4_t _dflt32_src = vreinterpretq_s32_m128i(a);         \
        int32x4_t _dflt32_dst =                                           \
            vmovq_n_s32(vgetq_lane_s32(_dflt32_src, (imm) & (0x3)));      \
        _dflt32_dst = vsetq_lane_s32(                                     \
            vgetq_lane_s32(_dflt32_src, ((imm) >> 2) & 0x3), _dflt32_dst, \
            1);                                                           \
        _dflt32_dst = vsetq_lane_s32(                                     \
            vgetq_lane_s32(_dflt32_src, ((imm) >> 4) & 0x3), _dflt32_dst, \
            2);                                                           \
        _dflt32_dst = vsetq_lane_s32(                                     \
            vgetq_lane_s32(_dflt32_src, ((imm) >> 6) & 0x3), _dflt32_dst, \
            3);                                                           \
        vreinterpretq_m128i_s32(_dflt32_dst);                             \
    })
1662*3f1979aaSAndroid Build Coastguard Worker 
// Copies the 32-bit element of a selected by imm into every 32-bit element of
// the result.
// FORCE_INLINE __m128i _mm_shuffle_epi32_splat(__m128i a, __constrange(0,255)
// int imm)
#if defined(__aarch64__)
// AArch64: duplicate the lane straight out of the 128-bit vector.
#define _mm_shuffle_epi32_splat(a, imm) \
    (vreinterpretq_m128i_s32(           \
        vdupq_laneq_s32(vreinterpretq_s32_m128i(a), (imm))))
#else
// Otherwise: extract the lane as a scalar, then duplicate it.
#define _mm_shuffle_epi32_splat(a, imm) \
    (vreinterpretq_m128i_s32(           \
        vdupq_n_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm)))))
#endif
1678*3f1979aaSAndroid Build Coastguard Worker 
// Shuffles the 4 signed or unsigned 32-bit integers in a as specified by imm.
// https://msdn.microsoft.com/en-us/library/56f67xbk%28v=vs.90%29.aspx
// FORCE_INLINE __m128i _mm_shuffle_epi32(__m128i a,
//                                        __constrange(0,255) int imm)
//
// When __builtin_shufflevector is available the shuffle is expressed directly
// with it. Otherwise imm is dispatched to a dedicated helper for the patterns
// listed below, falling back to _mm_shuffle_epi32_default for anything else.
//
// In both variants the argument a is copied into a prefixed local before any
// other local is declared. This evaluates a exactly once, and it keeps an
// argument expression that refers to a caller variable (for example one named
// `ret`) from being captured by a temporary of the macro.
#if __has_builtin(__builtin_shufflevector)
#define _mm_shuffle_epi32(a, imm)                                    \
    __extension__({                                                  \
        const int32x4_t _shuf32_in = vreinterpretq_s32_m128i(a);     \
        const int32x4_t _shuf32_out = __builtin_shufflevector(       \
            _shuf32_in, _shuf32_in, (imm) & (0x3),                   \
            ((imm) >> 2) & 0x3, ((imm) >> 4) & 0x3,                  \
            ((imm) >> 6) & 0x3);                                     \
        vreinterpretq_m128i_s32(_shuf32_out);                        \
    })
#else  // generic
#define _mm_shuffle_epi32(a, imm)                                     \
    __extension__({                                                   \
        const __m128i _shuf32_in = (a);                               \
        __m128i _shuf32_out;                                          \
        switch (imm) {                                                \
        case _MM_SHUFFLE(1, 0, 3, 2):                                 \
            _shuf32_out = _mm_shuffle_epi_1032(_shuf32_in);           \
            break;                                                    \
        case _MM_SHUFFLE(2, 3, 0, 1):                                 \
            _shuf32_out = _mm_shuffle_epi_2301(_shuf32_in);           \
            break;                                                    \
        case _MM_SHUFFLE(0, 3, 2, 1):                                 \
            _shuf32_out = _mm_shuffle_epi_0321(_shuf32_in);           \
            break;                                                    \
        case _MM_SHUFFLE(2, 1, 0, 3):                                 \
            _shuf32_out = _mm_shuffle_epi_2103(_shuf32_in);           \
            break;                                                    \
        case _MM_SHUFFLE(1, 0, 1, 0):                                 \
            _shuf32_out = _mm_shuffle_epi_1010(_shuf32_in);           \
            break;                                                    \
        case _MM_SHUFFLE(1, 0, 0, 1):                                 \
            _shuf32_out = _mm_shuffle_epi_1001(_shuf32_in);           \
            break;                                                    \
        case _MM_SHUFFLE(0, 1, 0, 1):                                 \
            _shuf32_out = _mm_shuffle_epi_0101(_shuf32_in);           \
            break;                                                    \
        case _MM_SHUFFLE(2, 2, 1, 1):                                 \
            _shuf32_out = _mm_shuffle_epi_2211(_shuf32_in);           \
            break;                                                    \
        case _MM_SHUFFLE(0, 1, 2, 2):                                 \
            _shuf32_out = _mm_shuffle_epi_0122(_shuf32_in);           \
            break;                                                    \
        case _MM_SHUFFLE(3, 3, 3, 2):                                 \
            _shuf32_out = _mm_shuffle_epi_3332(_shuf32_in);           \
            break;                                                    \
        case _MM_SHUFFLE(0, 0, 0, 0):                                 \
            _shuf32_out = _mm_shuffle_epi32_splat(_shuf32_in, 0);     \
            break;                                                    \
        case _MM_SHUFFLE(1, 1, 1, 1):                                 \
            _shuf32_out = _mm_shuffle_epi32_splat(_shuf32_in, 1);     \
            break;                                                    \
        case _MM_SHUFFLE(2, 2, 2, 2):                                 \
            _shuf32_out = _mm_shuffle_epi32_splat(_shuf32_in, 2);     \
            break;                                                    \
        case _MM_SHUFFLE(3, 3, 3, 3):                                 \
            _shuf32_out = _mm_shuffle_epi32_splat(_shuf32_in, 3);     \
            break;                                                    \
        default:                                                      \
            _shuf32_out =                                             \
                _mm_shuffle_epi32_default(_shuf32_in, (imm));         \
            break;                                                    \
        }                                                             \
        _shuf32_out;                                                  \
    })
#endif
1746*3f1979aaSAndroid Build Coastguard Worker 
// Shuffles the lower 4 signed or unsigned 16-bit integers in a as specified
// by imm.
// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/y41dkk37(v=vs.100)
// FORCE_INLINE __m128i _mm_shufflelo_epi16_function(__m128i a,
//                                                   __constrange(0,255) int
//                                                   imm)
//
// Lanes 0..3 of the result are each taken from the lane of the lower half of
// a chosen by the matching 2-bit field of imm; lanes 4..7 are copied from a
// unchanged. The temporaries use a macro-specific prefix so that an argument
// expression referring to a caller variable (for example one named `ret`) is
// not captured by a local declared here.
#define _mm_shufflelo_epi16_function(a, imm)                              \
    __extension__({                                                       \
        int16x8_t _lo16_dst = vreinterpretq_s16_m128i(a);                 \
        const int16x4_t _lo16_src = vget_low_s16(_lo16_dst);              \
        _lo16_dst = vsetq_lane_s16(                                       \
            vget_lane_s16(_lo16_src, (imm) & (0x3)), _lo16_dst, 0);       \
        _lo16_dst = vsetq_lane_s16(                                       \
            vget_lane_s16(_lo16_src, ((imm) >> 2) & 0x3), _lo16_dst, 1);  \
        _lo16_dst = vsetq_lane_s16(                                       \
            vget_lane_s16(_lo16_src, ((imm) >> 4) & 0x3), _lo16_dst, 2);  \
        _lo16_dst = vsetq_lane_s16(                                       \
            vget_lane_s16(_lo16_src, ((imm) >> 6) & 0x3), _lo16_dst, 3);  \
        vreinterpretq_m128i_s16(_lo16_dst);                               \
    })
1766*3f1979aaSAndroid Build Coastguard Worker 
// Public entry point for the low-half 16-bit shuffle.
// FORCE_INLINE __m128i _mm_shufflelo_epi16(__m128i a,
//                                          __constrange(0,255) int imm)
//
// With __builtin_shufflevector the first four result lanes are selected by
// the four 2-bit fields of imm and lanes 4..7 pass through; without it the
// work is delegated to _mm_shufflelo_epi16_function.
#if __has_builtin(__builtin_shufflevector)
#define _mm_shufflelo_epi16(a, imm)                                \
    __extension__({                                                \
        const int16x8_t _lo16_in = vreinterpretq_s16_m128i(a);     \
        const int16x8_t _lo16_out = __builtin_shufflevector(       \
            _lo16_in, _lo16_in,                                    \
            ((imm) & (0x3)), (((imm) >> 2) & 0x3),                 \
            (((imm) >> 4) & 0x3), (((imm) >> 6) & 0x3),            \
            4, 5, 6, 7);                                           \
        vreinterpretq_m128i_s16(_lo16_out);                        \
    })
#else  // generic
#define _mm_shufflelo_epi16(a, imm) _mm_shufflelo_epi16_function((a), (imm))
#endif
1781*3f1979aaSAndroid Build Coastguard Worker 
// Shuffles the upper 4 signed or unsigned 16-bit integers in a as specified
// by imm.
// https://msdn.microsoft.com/en-us/library/13ywktbs(v=vs.100).aspx
// FORCE_INLINE __m128i _mm_shufflehi_epi16_function(__m128i a,
//                                                   __constrange(0,255) int
//                                                   imm)
//
// Lanes 4..7 of the result are each taken from the lane of the upper half of
// a chosen by the matching 2-bit field of imm; lanes 0..3 are copied from a
// unchanged. The temporaries use a macro-specific prefix so that an argument
// expression referring to a caller variable (for example one named `ret`) is
// not captured by a local declared here.
#define _mm_shufflehi_epi16_function(a, imm)                              \
    __extension__({                                                       \
        int16x8_t _hi16_dst = vreinterpretq_s16_m128i(a);                 \
        const int16x4_t _hi16_src = vget_high_s16(_hi16_dst);             \
        _hi16_dst = vsetq_lane_s16(                                       \
            vget_lane_s16(_hi16_src, (imm) & (0x3)), _hi16_dst, 4);       \
        _hi16_dst = vsetq_lane_s16(                                       \
            vget_lane_s16(_hi16_src, ((imm) >> 2) & 0x3), _hi16_dst, 5);  \
        _hi16_dst = vsetq_lane_s16(                                       \
            vget_lane_s16(_hi16_src, ((imm) >> 4) & 0x3), _hi16_dst, 6);  \
        _hi16_dst = vsetq_lane_s16(                                       \
            vget_lane_s16(_hi16_src, ((imm) >> 6) & 0x3), _hi16_dst, 7);  \
        vreinterpretq_m128i_s16(_hi16_dst);                               \
    })
1801*3f1979aaSAndroid Build Coastguard Worker 
// Public entry point for the high-half 16-bit shuffle.
// FORCE_INLINE __m128i _mm_shufflehi_epi16(__m128i a,
//                                          __constrange(0,255) int imm)
//
// With __builtin_shufflevector lanes 0..3 pass through and each of lanes 4..7
// is selected from the upper half by one 2-bit field of imm (offset by 4);
// without it the work is delegated to _mm_shufflehi_epi16_function.
#if __has_builtin(__builtin_shufflevector)
#define _mm_shufflehi_epi16(a, imm)                                \
    __extension__({                                                \
        const int16x8_t _hi16_in = vreinterpretq_s16_m128i(a);     \
        const int16x8_t _hi16_out = __builtin_shufflevector(       \
            _hi16_in, _hi16_in,                                    \
            0, 1, 2, 3,                                            \
            ((imm) & (0x3)) + 4, (((imm) >> 2) & 0x3) + 4,         \
            (((imm) >> 4) & 0x3) + 4, (((imm) >> 6) & 0x3) + 4);   \
        vreinterpretq_m128i_s16(_hi16_out);                        \
    })
#else  // generic
#define _mm_shufflehi_epi16(a, imm) _mm_shufflehi_epi16_function((a), (imm))
#endif
1817*3f1979aaSAndroid Build Coastguard Worker 
// Blend packed 16-bit integers from a and b using control mask imm8, and store
// the results in dst. Bit j of imm selects the source of 16-bit lane j: a set
// bit takes the lane from b, a clear bit takes it from a.
//
//   FOR j := 0 to 7
//       i := j*16
//       IF imm8[j]
//           dst[i+15:i] := b[i+15:i]
//       ELSE
//           dst[i+15:i] := a[i+15:i]
//       FI
//   ENDFOR
// FORCE_INLINE __m128i _mm_blend_epi16(__m128i a, __m128i b,
//                                      __constrange(0,255) int imm)
#define _mm_blend_epi16(a, b, imm)                                          \
    __extension__({                                                         \
        /* One all-ones / all-zeros selector per 16-bit lane. */            \
        const uint16_t _lane_sel[8] = {((imm) & 0x01) ? 0xFFFF : 0x0000,    \
                                       ((imm) & 0x02) ? 0xFFFF : 0x0000,    \
                                       ((imm) & 0x04) ? 0xFFFF : 0x0000,    \
                                       ((imm) & 0x08) ? 0xFFFF : 0x0000,    \
                                       ((imm) & 0x10) ? 0xFFFF : 0x0000,    \
                                       ((imm) & 0x20) ? 0xFFFF : 0x0000,    \
                                       ((imm) & 0x40) ? 0xFFFF : 0x0000,    \
                                       ((imm) & 0x80) ? 0xFFFF : 0x0000};   \
        uint16x8_t _sel_vec = vld1q_u16(_lane_sel);                         \
        uint16x8_t _from_a = vreinterpretq_u16_m128i(a);                    \
        uint16x8_t _from_b = vreinterpretq_u16_m128i(b);                    \
        /* vbslq picks bits of _from_b where the selector is set. */        \
        vreinterpretq_m128i_u16(vbslq_u16(_sel_vec, _from_b, _from_a));     \
    })
1846*3f1979aaSAndroid Build Coastguard Worker 
1847*3f1979aaSAndroid Build Coastguard Worker // Blend packed 8-bit integers from a and b using mask, and store the results in
1848*3f1979aaSAndroid Build Coastguard Worker // dst.
1849*3f1979aaSAndroid Build Coastguard Worker //
1850*3f1979aaSAndroid Build Coastguard Worker //   FOR j := 0 to 15
1851*3f1979aaSAndroid Build Coastguard Worker //       i := j*8
1852*3f1979aaSAndroid Build Coastguard Worker //       IF mask[i+7]
1853*3f1979aaSAndroid Build Coastguard Worker //           dst[i+7:i] := b[i+7:i]
1854*3f1979aaSAndroid Build Coastguard Worker //       ELSE
1855*3f1979aaSAndroid Build Coastguard Worker //           dst[i+7:i] := a[i+7:i]
1856*3f1979aaSAndroid Build Coastguard Worker //       FI
1857*3f1979aaSAndroid Build Coastguard Worker //   ENDFOR
_mm_blendv_epi8(__m128i _a,__m128i _b,__m128i _mask)1858*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_blendv_epi8(__m128i _a, __m128i _b, __m128i _mask)
1859*3f1979aaSAndroid Build Coastguard Worker {
1860*3f1979aaSAndroid Build Coastguard Worker     // Use a signed shift right to create a mask with the sign bit
1861*3f1979aaSAndroid Build Coastguard Worker     uint8x16_t mask =
1862*3f1979aaSAndroid Build Coastguard Worker         vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_m128i(_mask), 7));
1863*3f1979aaSAndroid Build Coastguard Worker     uint8x16_t a = vreinterpretq_u8_m128i(_a);
1864*3f1979aaSAndroid Build Coastguard Worker     uint8x16_t b = vreinterpretq_u8_m128i(_b);
1865*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_u8(vbslq_u8(mask, b, a));
1866*3f1979aaSAndroid Build Coastguard Worker }
1867*3f1979aaSAndroid Build Coastguard Worker 
1868*3f1979aaSAndroid Build Coastguard Worker /* Shifts */
1869*3f1979aaSAndroid Build Coastguard Worker 
1870*3f1979aaSAndroid Build Coastguard Worker 
1871*3f1979aaSAndroid Build Coastguard Worker // Shift packed 16-bit integers in a right by imm while shifting in sign
1872*3f1979aaSAndroid Build Coastguard Worker // bits, and store the results in dst.
1873*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srai_epi16
_mm_srai_epi16(__m128i a,int imm)1874*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int imm)
1875*3f1979aaSAndroid Build Coastguard Worker {
1876*3f1979aaSAndroid Build Coastguard Worker     const int count = (imm & ~15) ? 15 : imm;
1877*3f1979aaSAndroid Build Coastguard Worker     return (__m128i) vshlq_s16((int16x8_t) a, vdupq_n_s16(-count));
1878*3f1979aaSAndroid Build Coastguard Worker }
1879*3f1979aaSAndroid Build Coastguard Worker 
// Shifts the 8 signed or unsigned 16-bit integers in a left by imm bits while
// shifting in zeros.
//
//   r0 := a0 << imm
//   r1 := a1 << imm
//   ...
//   r7 := a7 << imm
//
// imm <= 0 returns a unchanged; imm > 15 returns all zeros.
//
// The shift uses vshlq_s16 with a broadcast count rather than vshlq_n_s16:
// the _n form requires an immediate in [0, 15], so an out-of-range constant
// imm could be rejected at compile time even though that branch is never
// taken. The temporary has a library-specific name so that an argument
// expression that happens to be called `ret` is not captured by it.
//
// https://msdn.microsoft.com/en-us/library/es73bcsy(v=vs.90).aspx
#define _mm_slli_epi16(a, imm)                                            \
    __extension__({                                                       \
        __m128i _sse2neon_ret;                                            \
        if ((imm) <= 0) {                                                 \
            _sse2neon_ret = (a);                                          \
        } else if ((imm) > 15) {                                          \
            _sse2neon_ret = _mm_setzero_si128();                          \
        } else {                                                          \
            _sse2neon_ret = vreinterpretq_m128i_s16(vshlq_s16(            \
                vreinterpretq_s16_m128i(a), vdupq_n_s16((imm))));         \
        }                                                                 \
        _sse2neon_ret;                                                    \
    })
1902*3f1979aaSAndroid Build Coastguard Worker 
1903*3f1979aaSAndroid Build Coastguard Worker // Shifts the 4 signed or unsigned 32-bit integers in a left by count bits while
1904*3f1979aaSAndroid Build Coastguard Worker // shifting in zeros. :
1905*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/z2k3bbtb%28v=vs.90%29.aspx
1906*3f1979aaSAndroid Build Coastguard Worker // FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, __constrange(0,255) int imm)
_mm_slli_epi32(__m128i a,int imm)1907*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, int imm)
1908*3f1979aaSAndroid Build Coastguard Worker {
1909*3f1979aaSAndroid Build Coastguard Worker     if (imm <= 0) /* TODO: add constant range macro: [0, 255] */
1910*3f1979aaSAndroid Build Coastguard Worker         return a;
1911*3f1979aaSAndroid Build Coastguard Worker     if (imm > 31) /* TODO: add unlikely macro */
1912*3f1979aaSAndroid Build Coastguard Worker         return _mm_setzero_si128();
1913*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_s32(
1914*3f1979aaSAndroid Build Coastguard Worker         vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(imm)));
1915*3f1979aaSAndroid Build Coastguard Worker }
1916*3f1979aaSAndroid Build Coastguard Worker 
1917*3f1979aaSAndroid Build Coastguard Worker // Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and
1918*3f1979aaSAndroid Build Coastguard Worker // store the results in dst.
_mm_slli_epi64(__m128i a,int imm)1919*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_slli_epi64(__m128i a, int imm)
1920*3f1979aaSAndroid Build Coastguard Worker {
1921*3f1979aaSAndroid Build Coastguard Worker     if (imm <= 0) /* TODO: add constant range macro: [0, 255] */
1922*3f1979aaSAndroid Build Coastguard Worker         return a;
1923*3f1979aaSAndroid Build Coastguard Worker     if (imm > 63) /* TODO: add unlikely macro */
1924*3f1979aaSAndroid Build Coastguard Worker         return _mm_setzero_si128();
1925*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_s64(
1926*3f1979aaSAndroid Build Coastguard Worker         vshlq_s64(vreinterpretq_s64_m128i(a), vdupq_n_s64(imm)));
1927*3f1979aaSAndroid Build Coastguard Worker }
1928*3f1979aaSAndroid Build Coastguard Worker 
// Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and
// store the results in dst.
//
//   FOR j := 0 to 7
//     i := j*16
//     IF imm8[7:0] > 15
//       dst[i+15:i] := 0
//     ELSE
//       dst[i+15:i] := ZeroExtend16(a[i+15:i] >> imm8[7:0])
//     FI
//   ENDFOR
//
// The right shift is expressed as an unsigned vshlq with a negated count.
// The negation is written -(imm) so that a compound argument such as `1 + 2`
// is negated as a whole. The temporary has a library-specific name so that an
// argument expression called `ret` is not captured by it.
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi16
#define _mm_srli_epi16(a, imm)                                            \
    __extension__({                                                       \
        __m128i _sse2neon_ret;                                            \
        if ((imm) == 0) {                                                 \
            _sse2neon_ret = (a);                                          \
        } else if (0 < (imm) && (imm) < 16) {                             \
            _sse2neon_ret = vreinterpretq_m128i_u16(vshlq_u16(            \
                vreinterpretq_u16_m128i(a), vdupq_n_s16(-(imm))));        \
        } else {                                                          \
            _sse2neon_ret = _mm_setzero_si128();                          \
        }                                                                 \
        _sse2neon_ret;                                                    \
    })
1955*3f1979aaSAndroid Build Coastguard Worker 
// Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and
// store the results in dst.
//
//   FOR j := 0 to 3
//     i := j*32
//     IF imm8[7:0] > 31
//       dst[i+31:i] := 0
//     ELSE
//       dst[i+31:i] := ZeroExtend32(a[i+31:i] >> imm8[7:0])
//     FI
//   ENDFOR
//
// The right shift is expressed as an unsigned vshlq with a negated count.
// The negation is written -(imm) so that a compound argument such as `1 + 2`
// is negated as a whole. The temporary has a library-specific name so that an
// argument expression called `ret` is not captured by it.
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi32
// FORCE_INLINE __m128i _mm_srli_epi32(__m128i a, __constrange(0,255) int imm)
#define _mm_srli_epi32(a, imm)                                            \
    __extension__({                                                       \
        __m128i _sse2neon_ret;                                            \
        if ((imm) == 0) {                                                 \
            _sse2neon_ret = (a);                                          \
        } else if (0 < (imm) && (imm) < 32) {                             \
            _sse2neon_ret = vreinterpretq_m128i_u32(vshlq_u32(            \
                vreinterpretq_u32_m128i(a), vdupq_n_s32(-(imm))));        \
        } else {                                                          \
            _sse2neon_ret = _mm_setzero_si128();                          \
        }                                                                 \
        _sse2neon_ret;                                                    \
    })
1983*3f1979aaSAndroid Build Coastguard Worker 
// Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and
// store the results in dst.
//
//   FOR j := 0 to 1
//     i := j*64
//     IF imm8[7:0] > 63
//       dst[i+63:i] := 0
//     ELSE
//       dst[i+63:i] := ZeroExtend64(a[i+63:i] >> imm8[7:0])
//     FI
//   ENDFOR
//
// The right shift is expressed as an unsigned vshlq with a negated count.
// The negation is written -(imm) so that a compound argument such as `1 + 2`
// is negated as a whole. The temporary has a library-specific name so that an
// argument expression called `ret` is not captured by it.
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi64
#define _mm_srli_epi64(a, imm)                                            \
    __extension__({                                                       \
        __m128i _sse2neon_ret;                                            \
        if ((imm) == 0) {                                                 \
            _sse2neon_ret = (a);                                          \
        } else if (0 < (imm) && (imm) < 64) {                             \
            _sse2neon_ret = vreinterpretq_m128i_u64(vshlq_u64(            \
                vreinterpretq_u64_m128i(a), vdupq_n_s64(-(imm))));        \
        } else {                                                          \
            _sse2neon_ret = _mm_setzero_si128();                          \
        }                                                                 \
        _sse2neon_ret;                                                    \
    })
2010*3f1979aaSAndroid Build Coastguard Worker 
// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits,
// and store the results in dst.
//
//   FOR j := 0 to 3
//     i := j*32
//     IF imm8[7:0] > 31
//       dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0)
//     ELSE
//       dst[i+31:i] := SignExtend32(a[i+31:i] >> imm8[7:0])
//     FI
//   ENDFOR
//
// In-range counts use a signed vshlq with a negated count. Every other
// non-zero imm falls through to a fixed shift by 31, which fills each lane
// with its sign bit. The negation is written -(imm) so that a compound
// argument such as `1 + 2` is negated as a whole. The temporary has a
// library-specific name so that an argument expression called `ret` is not
// captured by it.
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srai_epi32
// FORCE_INLINE __m128i _mm_srai_epi32(__m128i a, __constrange(0,255) int imm)
#define _mm_srai_epi32(a, imm)                                            \
    __extension__({                                                       \
        __m128i _sse2neon_ret;                                            \
        if ((imm) == 0) {                                                 \
            _sse2neon_ret = (a);                                          \
        } else if (0 < (imm) && (imm) < 32) {                             \
            _sse2neon_ret = vreinterpretq_m128i_s32(vshlq_s32(            \
                vreinterpretq_s32_m128i(a), vdupq_n_s32(-(imm))));        \
        } else {                                                          \
            _sse2neon_ret = vreinterpretq_m128i_s32(                      \
                vshrq_n_s32(vreinterpretq_s32_m128i(a), 31));             \
        }                                                                 \
        _sse2neon_ret;                                                    \
    })
2039*3f1979aaSAndroid Build Coastguard Worker 
// Shifts the 128-bit value in a right by imm bytes while shifting in
// zeros. imm must be an immediate.
//
//   r := srl(a, imm*8)
//
// imm <= 0 returns a unchanged; imm > 15 returns all zeros.
//
// vextq_s8 requires its lane index to be a constant in [0, 15]. All three
// branches are compiled for every expansion, so the index is clamped to 0
// when imm is out of range; otherwise a constant imm above 15 could be
// rejected at compile time even though that branch is never taken. The
// temporary has a library-specific name so that an argument expression called
// `ret` is not captured by it.
//
// https://msdn.microsoft.com/en-us/library/305w28yz(v=vs.100).aspx
// FORCE_INLINE __m128i _mm_srli_si128(__m128i a, __constrange(0,255) int imm)
#define _mm_srli_si128(a, imm)                                            \
    __extension__({                                                       \
        __m128i _sse2neon_ret;                                            \
        if ((imm) <= 0) {                                                 \
            _sse2neon_ret = (a);                                          \
        } else if ((imm) > 15) {                                          \
            _sse2neon_ret = _mm_setzero_si128();                          \
        } else {                                                          \
            _sse2neon_ret = vreinterpretq_m128i_s8(vextq_s8(              \
                vreinterpretq_s8_m128i(a), vdupq_n_s8(0),                 \
                (((imm) <= 0 || (imm) > 15) ? 0 : (imm))));               \
        }                                                                 \
        _sse2neon_ret;                                                    \
    })
2060*3f1979aaSAndroid Build Coastguard Worker 
// Shifts the 128-bit value in a left by imm bytes while shifting in zeros. imm
// must be an immediate.
//
//   r := a << (imm * 8)
//
// imm <= 0 returns a unchanged; imm > 15 returns all zeros.
//
// vextq_s8 requires its lane index to be a constant in [0, 15]. All three
// branches are compiled for every expansion, and 16 - imm is 16 for imm == 0
// (and negative for imm > 16), so the index is clamped to 0 when imm is out
// of range; otherwise such a constant imm could be rejected at compile time
// even though that branch is never taken. The temporary has a
// library-specific name so that an argument expression called `ret` is not
// captured by it.
//
// https://msdn.microsoft.com/en-us/library/34d3k2kt(v=vs.100).aspx
// FORCE_INLINE __m128i _mm_slli_si128(__m128i a, __constrange(0,255) int imm)
#define _mm_slli_si128(a, imm)                                            \
    __extension__({                                                       \
        __m128i _sse2neon_ret;                                            \
        if ((imm) <= 0) {                                                 \
            _sse2neon_ret = (a);                                          \
        } else if ((imm) > 15) {                                          \
            _sse2neon_ret = _mm_setzero_si128();                          \
        } else {                                                          \
            _sse2neon_ret = vreinterpretq_m128i_s8(vextq_s8(              \
                vdupq_n_s8(0), vreinterpretq_s8_m128i(a),                 \
                (((imm) <= 0 || (imm) > 15) ? 0 : (16 - (imm)))));        \
        }                                                                 \
        _sse2neon_ret;                                                    \
    })
2081*3f1979aaSAndroid Build Coastguard Worker 
2082*3f1979aaSAndroid Build Coastguard Worker // Shifts the 8 signed or unsigned 16-bit integers in a left by count bits while
2083*3f1979aaSAndroid Build Coastguard Worker // shifting in zeros.
2084*3f1979aaSAndroid Build Coastguard Worker //
2085*3f1979aaSAndroid Build Coastguard Worker //   r0 := a0 << count
2086*3f1979aaSAndroid Build Coastguard Worker //   r1 := a1 << count
2087*3f1979aaSAndroid Build Coastguard Worker //   ...
2088*3f1979aaSAndroid Build Coastguard Worker //   r7 := a7 << count
2089*3f1979aaSAndroid Build Coastguard Worker //
2090*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/c79w388h(v%3dvs.90).aspx
_mm_sll_epi16(__m128i a,__m128i count)2091*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_sll_epi16(__m128i a, __m128i count)
2092*3f1979aaSAndroid Build Coastguard Worker {
2093*3f1979aaSAndroid Build Coastguard Worker     uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
2094*3f1979aaSAndroid Build Coastguard Worker     if (c > 15)
2095*3f1979aaSAndroid Build Coastguard Worker         return _mm_setzero_si128();
2096*3f1979aaSAndroid Build Coastguard Worker 
2097*3f1979aaSAndroid Build Coastguard Worker     int16x8_t vc = vdupq_n_s16((int16_t) c);
2098*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_s16(vshlq_s16(vreinterpretq_s16_m128i(a), vc));
2099*3f1979aaSAndroid Build Coastguard Worker }
2100*3f1979aaSAndroid Build Coastguard Worker 
2101*3f1979aaSAndroid Build Coastguard Worker // Shifts the 4 signed or unsigned 32-bit integers in a left by count bits while
2102*3f1979aaSAndroid Build Coastguard Worker // shifting in zeros.
2103*3f1979aaSAndroid Build Coastguard Worker //
2104*3f1979aaSAndroid Build Coastguard Worker // r0 := a0 << count
2105*3f1979aaSAndroid Build Coastguard Worker // r1 := a1 << count
2106*3f1979aaSAndroid Build Coastguard Worker // r2 := a2 << count
2107*3f1979aaSAndroid Build Coastguard Worker // r3 := a3 << count
2108*3f1979aaSAndroid Build Coastguard Worker //
2109*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/6fe5a6s9(v%3dvs.90).aspx
_mm_sll_epi32(__m128i a,__m128i count)2110*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_sll_epi32(__m128i a, __m128i count)
2111*3f1979aaSAndroid Build Coastguard Worker {
2112*3f1979aaSAndroid Build Coastguard Worker     uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
2113*3f1979aaSAndroid Build Coastguard Worker     if (c > 31)
2114*3f1979aaSAndroid Build Coastguard Worker         return _mm_setzero_si128();
2115*3f1979aaSAndroid Build Coastguard Worker 
2116*3f1979aaSAndroid Build Coastguard Worker     int32x4_t vc = vdupq_n_s32((int32_t) c);
2117*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_s32(vshlq_s32(vreinterpretq_s32_m128i(a), vc));
2118*3f1979aaSAndroid Build Coastguard Worker }
2119*3f1979aaSAndroid Build Coastguard Worker 
2120*3f1979aaSAndroid Build Coastguard Worker // Shifts the 2 signed or unsigned 64-bit integers in a left by count bits while
2121*3f1979aaSAndroid Build Coastguard Worker // shifting in zeros.
2122*3f1979aaSAndroid Build Coastguard Worker //
2123*3f1979aaSAndroid Build Coastguard Worker // r0 := a0 << count
2124*3f1979aaSAndroid Build Coastguard Worker // r1 := a1 << count
2125*3f1979aaSAndroid Build Coastguard Worker //
2126*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/6ta9dffd(v%3dvs.90).aspx
_mm_sll_epi64(__m128i a,__m128i count)2127*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_sll_epi64(__m128i a, __m128i count)
2128*3f1979aaSAndroid Build Coastguard Worker {
2129*3f1979aaSAndroid Build Coastguard Worker     uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
2130*3f1979aaSAndroid Build Coastguard Worker     if (c > 63)
2131*3f1979aaSAndroid Build Coastguard Worker         return _mm_setzero_si128();
2132*3f1979aaSAndroid Build Coastguard Worker 
2133*3f1979aaSAndroid Build Coastguard Worker     int64x2_t vc = vdupq_n_s64((int64_t) c);
2134*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_s64(vshlq_s64(vreinterpretq_s64_m128i(a), vc));
2135*3f1979aaSAndroid Build Coastguard Worker }
2136*3f1979aaSAndroid Build Coastguard Worker 
2137*3f1979aaSAndroid Build Coastguard Worker // Shifts the 8 signed or unsigned 16-bit integers in a right by count bits
2138*3f1979aaSAndroid Build Coastguard Worker // while shifting in zeros.
2139*3f1979aaSAndroid Build Coastguard Worker //
2140*3f1979aaSAndroid Build Coastguard Worker // r0 := srl(a0, count)
2141*3f1979aaSAndroid Build Coastguard Worker // r1 := srl(a1, count)
2142*3f1979aaSAndroid Build Coastguard Worker // ...
2143*3f1979aaSAndroid Build Coastguard Worker // r7 := srl(a7, count)
2144*3f1979aaSAndroid Build Coastguard Worker //
2145*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/wd5ax830(v%3dvs.90).aspx
_mm_srl_epi16(__m128i a,__m128i count)2146*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_srl_epi16(__m128i a, __m128i count)
2147*3f1979aaSAndroid Build Coastguard Worker {
2148*3f1979aaSAndroid Build Coastguard Worker     uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
2149*3f1979aaSAndroid Build Coastguard Worker     if (c > 15)
2150*3f1979aaSAndroid Build Coastguard Worker         return _mm_setzero_si128();
2151*3f1979aaSAndroid Build Coastguard Worker 
2152*3f1979aaSAndroid Build Coastguard Worker     int16x8_t vc = vdupq_n_s16(-(int16_t) c);
2153*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_u16(vshlq_u16(vreinterpretq_u16_m128i(a), vc));
2154*3f1979aaSAndroid Build Coastguard Worker }
2155*3f1979aaSAndroid Build Coastguard Worker 
2156*3f1979aaSAndroid Build Coastguard Worker // Shifts the 4 signed or unsigned 32-bit integers in a right by count bits
2157*3f1979aaSAndroid Build Coastguard Worker // while shifting in zeros.
2158*3f1979aaSAndroid Build Coastguard Worker //
2159*3f1979aaSAndroid Build Coastguard Worker // r0 := srl(a0, count)
2160*3f1979aaSAndroid Build Coastguard Worker // r1 := srl(a1, count)
2161*3f1979aaSAndroid Build Coastguard Worker // r2 := srl(a2, count)
2162*3f1979aaSAndroid Build Coastguard Worker // r3 := srl(a3, count)
2163*3f1979aaSAndroid Build Coastguard Worker //
2164*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/a9cbttf4(v%3dvs.90).aspx
_mm_srl_epi32(__m128i a,__m128i count)2165*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_srl_epi32(__m128i a, __m128i count)
2166*3f1979aaSAndroid Build Coastguard Worker {
2167*3f1979aaSAndroid Build Coastguard Worker     uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
2168*3f1979aaSAndroid Build Coastguard Worker     if (c > 31)
2169*3f1979aaSAndroid Build Coastguard Worker         return _mm_setzero_si128();
2170*3f1979aaSAndroid Build Coastguard Worker 
2171*3f1979aaSAndroid Build Coastguard Worker     int32x4_t vc = vdupq_n_s32(-(int32_t) c);
2172*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_u32(vshlq_u32(vreinterpretq_u32_m128i(a), vc));
2173*3f1979aaSAndroid Build Coastguard Worker }
2174*3f1979aaSAndroid Build Coastguard Worker 
2175*3f1979aaSAndroid Build Coastguard Worker // Shifts the 2 signed or unsigned 64-bit integers in a right by count bits
2176*3f1979aaSAndroid Build Coastguard Worker // while shifting in zeros.
2177*3f1979aaSAndroid Build Coastguard Worker //
2178*3f1979aaSAndroid Build Coastguard Worker // r0 := srl(a0, count)
2179*3f1979aaSAndroid Build Coastguard Worker // r1 := srl(a1, count)
2180*3f1979aaSAndroid Build Coastguard Worker //
2181*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/yf6cf9k8(v%3dvs.90).aspx
_mm_srl_epi64(__m128i a,__m128i count)2182*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count)
2183*3f1979aaSAndroid Build Coastguard Worker {
2184*3f1979aaSAndroid Build Coastguard Worker     uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
2185*3f1979aaSAndroid Build Coastguard Worker     if (c > 63)
2186*3f1979aaSAndroid Build Coastguard Worker         return _mm_setzero_si128();
2187*3f1979aaSAndroid Build Coastguard Worker 
2188*3f1979aaSAndroid Build Coastguard Worker     int64x2_t vc = vdupq_n_s64(-(int64_t) c);
2189*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_u64(vshlq_u64(vreinterpretq_u64_m128i(a), vc));
2190*3f1979aaSAndroid Build Coastguard Worker }
2191*3f1979aaSAndroid Build Coastguard Worker 
2192*3f1979aaSAndroid Build Coastguard Worker // NEON does not provide a version of this function.
2193*3f1979aaSAndroid Build Coastguard Worker // Creates a 16-bit mask from the most significant bits of the 16 signed or
2194*3f1979aaSAndroid Build Coastguard Worker // unsigned 8-bit integers in a and zero extends the upper bits.
2195*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/vstudio/s090c8fk(v=vs.100).aspx
_mm_movemask_epi8(__m128i a)2196*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE int _mm_movemask_epi8(__m128i a)
2197*3f1979aaSAndroid Build Coastguard Worker {
2198*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
2199*3f1979aaSAndroid Build Coastguard Worker     uint8x16_t input = vreinterpretq_u8_m128i(a);
2200*3f1979aaSAndroid Build Coastguard Worker     const int8_t ALIGN_STRUCT(16)
2201*3f1979aaSAndroid Build Coastguard Worker         xr[16] = {-7, -6, -5, -4, -3, -2, -1, 0, -7, -6, -5, -4, -3, -2, -1, 0};
2202*3f1979aaSAndroid Build Coastguard Worker     const uint8x16_t mask_and = vdupq_n_u8(0x80);
2203*3f1979aaSAndroid Build Coastguard Worker     const int8x16_t mask_shift = vld1q_s8(xr);
2204*3f1979aaSAndroid Build Coastguard Worker     const uint8x16_t mask_result =
2205*3f1979aaSAndroid Build Coastguard Worker         vshlq_u8(vandq_u8(input, mask_and), mask_shift);
2206*3f1979aaSAndroid Build Coastguard Worker     uint8x8_t lo = vget_low_u8(mask_result);
2207*3f1979aaSAndroid Build Coastguard Worker     uint8x8_t hi = vget_high_u8(mask_result);
2208*3f1979aaSAndroid Build Coastguard Worker 
2209*3f1979aaSAndroid Build Coastguard Worker     return vaddv_u8(lo) + (vaddv_u8(hi) << 8);
2210*3f1979aaSAndroid Build Coastguard Worker #else
2211*3f1979aaSAndroid Build Coastguard Worker     // Use increasingly wide shifts+adds to collect the sign bits
2212*3f1979aaSAndroid Build Coastguard Worker     // together.
2213*3f1979aaSAndroid Build Coastguard Worker     // Since the widening shifts would be rather confusing to follow in little
2214*3f1979aaSAndroid Build Coastguard Worker     // endian, everything will be illustrated in big endian order instead. This
2215*3f1979aaSAndroid Build Coastguard Worker     // has a different result - the bits would actually be reversed on a big
2216*3f1979aaSAndroid Build Coastguard Worker     // endian machine.
2217*3f1979aaSAndroid Build Coastguard Worker 
2218*3f1979aaSAndroid Build Coastguard Worker     // Starting input (only half the elements are shown):
2219*3f1979aaSAndroid Build Coastguard Worker     // 89 ff 1d c0 00 10 99 33
2220*3f1979aaSAndroid Build Coastguard Worker     uint8x16_t input = vreinterpretq_u8_m128i(a);
2221*3f1979aaSAndroid Build Coastguard Worker 
2222*3f1979aaSAndroid Build Coastguard Worker     // Shift out everything but the sign bits with an unsigned shift right.
2223*3f1979aaSAndroid Build Coastguard Worker     //
2224*3f1979aaSAndroid Build Coastguard Worker     // Bytes of the vector::
2225*3f1979aaSAndroid Build Coastguard Worker     // 89 ff 1d c0 00 10 99 33
2226*3f1979aaSAndroid Build Coastguard Worker     // \  \  \  \  \  \  \  \    high_bits = (uint16x4_t)(input >> 7)
2227*3f1979aaSAndroid Build Coastguard Worker     //  |  |  |  |  |  |  |  |
2228*3f1979aaSAndroid Build Coastguard Worker     // 01 01 00 01 00 00 01 00
2229*3f1979aaSAndroid Build Coastguard Worker     //
2230*3f1979aaSAndroid Build Coastguard Worker     // Bits of first important lane(s):
2231*3f1979aaSAndroid Build Coastguard Worker     // 10001001 (89)
2232*3f1979aaSAndroid Build Coastguard Worker     // \______
2233*3f1979aaSAndroid Build Coastguard Worker     //        |
2234*3f1979aaSAndroid Build Coastguard Worker     // 00000001 (01)
2235*3f1979aaSAndroid Build Coastguard Worker     uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7));
2236*3f1979aaSAndroid Build Coastguard Worker 
2237*3f1979aaSAndroid Build Coastguard Worker     // Merge the even lanes together with a 16-bit unsigned shift right + add.
2238*3f1979aaSAndroid Build Coastguard Worker     // 'xx' represents garbage data which will be ignored in the final result.
2239*3f1979aaSAndroid Build Coastguard Worker     // In the important bytes, the add functions like a binary OR.
2240*3f1979aaSAndroid Build Coastguard Worker     //
2241*3f1979aaSAndroid Build Coastguard Worker     // 01 01 00 01 00 00 01 00
2242*3f1979aaSAndroid Build Coastguard Worker     //  \_ |  \_ |  \_ |  \_ |   paired16 = (uint32x4_t)(input + (input >> 7))
2243*3f1979aaSAndroid Build Coastguard Worker     //    \|    \|    \|    \|
2244*3f1979aaSAndroid Build Coastguard Worker     // xx 03 xx 01 xx 00 xx 02
2245*3f1979aaSAndroid Build Coastguard Worker     //
2246*3f1979aaSAndroid Build Coastguard Worker     // 00000001 00000001 (01 01)
2247*3f1979aaSAndroid Build Coastguard Worker     //        \_______ |
2248*3f1979aaSAndroid Build Coastguard Worker     //                \|
2249*3f1979aaSAndroid Build Coastguard Worker     // xxxxxxxx xxxxxx11 (xx 03)
2250*3f1979aaSAndroid Build Coastguard Worker     uint32x4_t paired16 =
2251*3f1979aaSAndroid Build Coastguard Worker         vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7));
2252*3f1979aaSAndroid Build Coastguard Worker 
2253*3f1979aaSAndroid Build Coastguard Worker     // Repeat with a wider 32-bit shift + add.
2254*3f1979aaSAndroid Build Coastguard Worker     // xx 03 xx 01 xx 00 xx 02
2255*3f1979aaSAndroid Build Coastguard Worker     //     \____ |     \____ |  paired32 = (uint64x1_t)(paired16 + (paired16 >>
2256*3f1979aaSAndroid Build Coastguard Worker     //     14))
2257*3f1979aaSAndroid Build Coastguard Worker     //          \|          \|
2258*3f1979aaSAndroid Build Coastguard Worker     // xx xx xx 0d xx xx xx 02
2259*3f1979aaSAndroid Build Coastguard Worker     //
2260*3f1979aaSAndroid Build Coastguard Worker     // 00000011 00000001 (03 01)
2261*3f1979aaSAndroid Build Coastguard Worker     //        \\_____ ||
2262*3f1979aaSAndroid Build Coastguard Worker     //         '----.\||
2263*3f1979aaSAndroid Build Coastguard Worker     // xxxxxxxx xxxx1101 (xx 0d)
2264*3f1979aaSAndroid Build Coastguard Worker     uint64x2_t paired32 =
2265*3f1979aaSAndroid Build Coastguard Worker         vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));
2266*3f1979aaSAndroid Build Coastguard Worker 
2267*3f1979aaSAndroid Build Coastguard Worker     // Last, an even wider 64-bit shift + add to get our result in the low 8 bit
2268*3f1979aaSAndroid Build Coastguard Worker     // lanes. xx xx xx 0d xx xx xx 02
2269*3f1979aaSAndroid Build Coastguard Worker     //            \_________ |   paired64 = (uint8x8_t)(paired32 + (paired32 >>
2270*3f1979aaSAndroid Build Coastguard Worker     //            28))
2271*3f1979aaSAndroid Build Coastguard Worker     //                      \|
2272*3f1979aaSAndroid Build Coastguard Worker     // xx xx xx xx xx xx xx d2
2273*3f1979aaSAndroid Build Coastguard Worker     //
2274*3f1979aaSAndroid Build Coastguard Worker     // 00001101 00000010 (0d 02)
2275*3f1979aaSAndroid Build Coastguard Worker     //     \   \___ |  |
2276*3f1979aaSAndroid Build Coastguard Worker     //      '---.  \|  |
2277*3f1979aaSAndroid Build Coastguard Worker     // xxxxxxxx 11010010 (xx d2)
2278*3f1979aaSAndroid Build Coastguard Worker     uint8x16_t paired64 =
2279*3f1979aaSAndroid Build Coastguard Worker         vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));
2280*3f1979aaSAndroid Build Coastguard Worker 
2281*3f1979aaSAndroid Build Coastguard Worker     // Extract the low 8 bits from each 64-bit lane with 2 8-bit extracts.
2282*3f1979aaSAndroid Build Coastguard Worker     // xx xx xx xx xx xx xx d2
2283*3f1979aaSAndroid Build Coastguard Worker     //                      ||  return paired64[0]
2284*3f1979aaSAndroid Build Coastguard Worker     //                      d2
2285*3f1979aaSAndroid Build Coastguard Worker     // Note: Little endian would return the correct value 4b (01001011) instead.
2286*3f1979aaSAndroid Build Coastguard Worker     return vgetq_lane_u8(paired64, 0) | ((int) vgetq_lane_u8(paired64, 8) << 8);
2287*3f1979aaSAndroid Build Coastguard Worker #endif
2288*3f1979aaSAndroid Build Coastguard Worker }
2289*3f1979aaSAndroid Build Coastguard Worker 
2290*3f1979aaSAndroid Build Coastguard Worker // Copy the lower 64-bit integer in a to dst.
2291*3f1979aaSAndroid Build Coastguard Worker //
2292*3f1979aaSAndroid Build Coastguard Worker //   dst[63:0] := a[63:0]
2293*3f1979aaSAndroid Build Coastguard Worker //
2294*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movepi64_pi64
_mm_movepi64_pi64(__m128i a)2295*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m64 _mm_movepi64_pi64(__m128i a)
2296*3f1979aaSAndroid Build Coastguard Worker {
2297*3f1979aaSAndroid Build Coastguard Worker     return vreinterpret_m64_s64(vget_low_s64(vreinterpretq_s64_m128i(a)));
2298*3f1979aaSAndroid Build Coastguard Worker }
2299*3f1979aaSAndroid Build Coastguard Worker 
2300*3f1979aaSAndroid Build Coastguard Worker // Copy the 64-bit integer a to the lower element of dst, and zero the upper
2301*3f1979aaSAndroid Build Coastguard Worker // element.
2302*3f1979aaSAndroid Build Coastguard Worker //
2303*3f1979aaSAndroid Build Coastguard Worker //   dst[63:0] := a[63:0]
2304*3f1979aaSAndroid Build Coastguard Worker //   dst[127:64] := 0
2305*3f1979aaSAndroid Build Coastguard Worker //
2306*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movpi64_epi64
_mm_movpi64_epi64(__m64 a)2307*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_movpi64_epi64(__m64 a)
2308*3f1979aaSAndroid Build Coastguard Worker {
2309*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_s64(
2310*3f1979aaSAndroid Build Coastguard Worker         vcombine_s64(vreinterpret_s64_m64(a), vdup_n_s64(0)));
2311*3f1979aaSAndroid Build Coastguard Worker }
2312*3f1979aaSAndroid Build Coastguard Worker 
2313*3f1979aaSAndroid Build Coastguard Worker // NEON does not provide this method
2314*3f1979aaSAndroid Build Coastguard Worker // Creates a 4-bit mask from the most significant bits of the four
2315*3f1979aaSAndroid Build Coastguard Worker // single-precision, floating-point values.
2316*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/vstudio/4490ys29(v=vs.100).aspx
_mm_movemask_ps(__m128 a)2317*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE int _mm_movemask_ps(__m128 a)
2318*3f1979aaSAndroid Build Coastguard Worker {
2319*3f1979aaSAndroid Build Coastguard Worker     uint32x4_t input = vreinterpretq_u32_m128(a);
2320*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
2321*3f1979aaSAndroid Build Coastguard Worker     static const int32x4_t shift = {0, 1, 2, 3};
2322*3f1979aaSAndroid Build Coastguard Worker     uint32x4_t tmp = vshrq_n_u32(input, 31);
2323*3f1979aaSAndroid Build Coastguard Worker     return vaddvq_u32(vshlq_u32(tmp, shift));
2324*3f1979aaSAndroid Build Coastguard Worker #else
2325*3f1979aaSAndroid Build Coastguard Worker     // Uses the exact same method as _mm_movemask_epi8, see that for details.
2326*3f1979aaSAndroid Build Coastguard Worker     // Shift out everything but the sign bits with a 32-bit unsigned shift
2327*3f1979aaSAndroid Build Coastguard Worker     // right.
2328*3f1979aaSAndroid Build Coastguard Worker     uint64x2_t high_bits = vreinterpretq_u64_u32(vshrq_n_u32(input, 31));
2329*3f1979aaSAndroid Build Coastguard Worker     // Merge the two pairs together with a 64-bit unsigned shift right + add.
2330*3f1979aaSAndroid Build Coastguard Worker     uint8x16_t paired =
2331*3f1979aaSAndroid Build Coastguard Worker         vreinterpretq_u8_u64(vsraq_n_u64(high_bits, high_bits, 31));
2332*3f1979aaSAndroid Build Coastguard Worker     // Extract the result.
2333*3f1979aaSAndroid Build Coastguard Worker     return vgetq_lane_u8(paired, 0) | (vgetq_lane_u8(paired, 8) << 2);
2334*3f1979aaSAndroid Build Coastguard Worker #endif
2335*3f1979aaSAndroid Build Coastguard Worker }
2336*3f1979aaSAndroid Build Coastguard Worker 
2337*3f1979aaSAndroid Build Coastguard Worker // Compute the bitwise NOT of a and then AND with a 128-bit vector containing
2338*3f1979aaSAndroid Build Coastguard Worker // all 1's, and return 1 if the result is zero, otherwise return 0.
2339*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_ones
_mm_test_all_ones(__m128i a)2340*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE int _mm_test_all_ones(__m128i a)
2341*3f1979aaSAndroid Build Coastguard Worker {
2342*3f1979aaSAndroid Build Coastguard Worker     return (uint64_t)(vgetq_lane_s64(a, 0) & vgetq_lane_s64(a, 1)) ==
2343*3f1979aaSAndroid Build Coastguard Worker            ~(uint64_t) 0;
2344*3f1979aaSAndroid Build Coastguard Worker }
2345*3f1979aaSAndroid Build Coastguard Worker 
2346*3f1979aaSAndroid Build Coastguard Worker // Compute the bitwise AND of 128 bits (representing integer data) in a and
2347*3f1979aaSAndroid Build Coastguard Worker // mask, and return 1 if the result is zero, otherwise return 0.
2348*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_zeros
_mm_test_all_zeros(__m128i a,__m128i mask)2349*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE int _mm_test_all_zeros(__m128i a, __m128i mask)
2350*3f1979aaSAndroid Build Coastguard Worker {
2351*3f1979aaSAndroid Build Coastguard Worker     int64x2_t a_and_mask =
2352*3f1979aaSAndroid Build Coastguard Worker         vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(mask));
2353*3f1979aaSAndroid Build Coastguard Worker     return (vgetq_lane_s64(a_and_mask, 0) | vgetq_lane_s64(a_and_mask, 1)) ? 0
2354*3f1979aaSAndroid Build Coastguard Worker                                                                            : 1;
2355*3f1979aaSAndroid Build Coastguard Worker }
2356*3f1979aaSAndroid Build Coastguard Worker 
2357*3f1979aaSAndroid Build Coastguard Worker /* Math operations */
2358*3f1979aaSAndroid Build Coastguard Worker 
2359*3f1979aaSAndroid Build Coastguard Worker // Subtracts the four single-precision, floating-point values of a and b.
2360*3f1979aaSAndroid Build Coastguard Worker //
2361*3f1979aaSAndroid Build Coastguard Worker //   r0 := a0 - b0
2362*3f1979aaSAndroid Build Coastguard Worker //   r1 := a1 - b1
2363*3f1979aaSAndroid Build Coastguard Worker //   r2 := a2 - b2
2364*3f1979aaSAndroid Build Coastguard Worker //   r3 := a3 - b3
2365*3f1979aaSAndroid Build Coastguard Worker //
2366*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/vstudio/1zad2k61(v=vs.100).aspx
_mm_sub_ps(__m128 a,__m128 b)2367*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b)
2368*3f1979aaSAndroid Build Coastguard Worker {
2369*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128_f32(
2370*3f1979aaSAndroid Build Coastguard Worker         vsubq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
2371*3f1979aaSAndroid Build Coastguard Worker }
2372*3f1979aaSAndroid Build Coastguard Worker 
2373*3f1979aaSAndroid Build Coastguard Worker // Subtract the lower single-precision (32-bit) floating-point element in b from
2374*3f1979aaSAndroid Build Coastguard Worker // the lower single-precision (32-bit) floating-point element in a, store the
2375*3f1979aaSAndroid Build Coastguard Worker // result in the lower element of dst, and copy the upper 3 packed elements from
2376*3f1979aaSAndroid Build Coastguard Worker // a to the upper elements of dst.
2377*3f1979aaSAndroid Build Coastguard Worker //
2378*3f1979aaSAndroid Build Coastguard Worker //   dst[31:0] := a[31:0] - b[31:0]
2379*3f1979aaSAndroid Build Coastguard Worker //   dst[127:32] := a[127:32]
2380*3f1979aaSAndroid Build Coastguard Worker //
2381*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_ss
_mm_sub_ss(__m128 a,__m128 b)2382*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b)
2383*3f1979aaSAndroid Build Coastguard Worker {
2384*3f1979aaSAndroid Build Coastguard Worker     return _mm_move_ss(a, _mm_sub_ps(a, b));
2385*3f1979aaSAndroid Build Coastguard Worker }
2386*3f1979aaSAndroid Build Coastguard Worker 
2387*3f1979aaSAndroid Build Coastguard Worker // Subtract 2 packed 64-bit integers in b from 2 packed 64-bit integers in a,
2388*3f1979aaSAndroid Build Coastguard Worker // and store the results in dst.
2389*3f1979aaSAndroid Build Coastguard Worker //    r0 := a0 - b0
2390*3f1979aaSAndroid Build Coastguard Worker //    r1 := a1 - b1
_mm_sub_epi64(__m128i a,__m128i b)2391*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_sub_epi64(__m128i a, __m128i b)
2392*3f1979aaSAndroid Build Coastguard Worker {
2393*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_s64(
2394*3f1979aaSAndroid Build Coastguard Worker         vsubq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
2395*3f1979aaSAndroid Build Coastguard Worker }
2396*3f1979aaSAndroid Build Coastguard Worker 
2397*3f1979aaSAndroid Build Coastguard Worker // Subtracts the 4 signed or unsigned 32-bit integers of b from the 4 signed or
2398*3f1979aaSAndroid Build Coastguard Worker // unsigned 32-bit integers of a.
2399*3f1979aaSAndroid Build Coastguard Worker //
2400*3f1979aaSAndroid Build Coastguard Worker //   r0 := a0 - b0
2401*3f1979aaSAndroid Build Coastguard Worker //   r1 := a1 - b1
2402*3f1979aaSAndroid Build Coastguard Worker //   r2 := a2 - b2
2403*3f1979aaSAndroid Build Coastguard Worker //   r3 := a3 - b3
2404*3f1979aaSAndroid Build Coastguard Worker //
2405*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/vstudio/fhh866h0(v=vs.100).aspx
_mm_sub_epi32(__m128i a,__m128i b)2406*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_sub_epi32(__m128i a, __m128i b)
2407*3f1979aaSAndroid Build Coastguard Worker {
2408*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_s32(
2409*3f1979aaSAndroid Build Coastguard Worker         vsubq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
2410*3f1979aaSAndroid Build Coastguard Worker }
2411*3f1979aaSAndroid Build Coastguard Worker 
_mm_sub_epi16(__m128i a,__m128i b)2412*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_sub_epi16(__m128i a, __m128i b)
2413*3f1979aaSAndroid Build Coastguard Worker {
2414*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_s16(
2415*3f1979aaSAndroid Build Coastguard Worker         vsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
2416*3f1979aaSAndroid Build Coastguard Worker }
2417*3f1979aaSAndroid Build Coastguard Worker 
_mm_sub_epi8(__m128i a,__m128i b)2418*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_sub_epi8(__m128i a, __m128i b)
2419*3f1979aaSAndroid Build Coastguard Worker {
2420*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_s8(
2421*3f1979aaSAndroid Build Coastguard Worker         vsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
2422*3f1979aaSAndroid Build Coastguard Worker }
2423*3f1979aaSAndroid Build Coastguard Worker 
2424*3f1979aaSAndroid Build Coastguard Worker // Subtract 64-bit integer b from 64-bit integer a, and store the result in dst.
2425*3f1979aaSAndroid Build Coastguard Worker //
2426*3f1979aaSAndroid Build Coastguard Worker //   dst[63:0] := a[63:0] - b[63:0]
2427*3f1979aaSAndroid Build Coastguard Worker //
2428*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_si64
_mm_sub_si64(__m64 a,__m64 b)2429*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m64 _mm_sub_si64(__m64 a, __m64 b)
2430*3f1979aaSAndroid Build Coastguard Worker {
2431*3f1979aaSAndroid Build Coastguard Worker     return vreinterpret_m64_s64(
2432*3f1979aaSAndroid Build Coastguard Worker         vsub_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b)));
2433*3f1979aaSAndroid Build Coastguard Worker }
2434*3f1979aaSAndroid Build Coastguard Worker 
2435*3f1979aaSAndroid Build Coastguard Worker // Subtracts the 8 unsigned 16-bit integers of bfrom the 8 unsigned 16-bit
2436*3f1979aaSAndroid Build Coastguard Worker // integers of a and saturates..
2437*3f1979aaSAndroid Build Coastguard Worker // https://technet.microsoft.com/en-us/subscriptions/index/f44y0s19(v=vs.90).aspx
_mm_subs_epu16(__m128i a,__m128i b)2438*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_subs_epu16(__m128i a, __m128i b)
2439*3f1979aaSAndroid Build Coastguard Worker {
2440*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_u16(
2441*3f1979aaSAndroid Build Coastguard Worker         vqsubq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
2442*3f1979aaSAndroid Build Coastguard Worker }
2443*3f1979aaSAndroid Build Coastguard Worker 
2444*3f1979aaSAndroid Build Coastguard Worker // Subtracts the 16 unsigned 8-bit integers of b from the 16 unsigned 8-bit
2445*3f1979aaSAndroid Build Coastguard Worker // integers of a and saturates.
2446*3f1979aaSAndroid Build Coastguard Worker //
2447*3f1979aaSAndroid Build Coastguard Worker //   r0 := UnsignedSaturate(a0 - b0)
2448*3f1979aaSAndroid Build Coastguard Worker //   r1 := UnsignedSaturate(a1 - b1)
2449*3f1979aaSAndroid Build Coastguard Worker //   ...
2450*3f1979aaSAndroid Build Coastguard Worker //   r15 := UnsignedSaturate(a15 - b15)
2451*3f1979aaSAndroid Build Coastguard Worker //
2452*3f1979aaSAndroid Build Coastguard Worker // https://technet.microsoft.com/en-us/subscriptions/yadkxc18(v=vs.90)
_mm_subs_epu8(__m128i a,__m128i b)2453*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_subs_epu8(__m128i a, __m128i b)
2454*3f1979aaSAndroid Build Coastguard Worker {
2455*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_u8(
2456*3f1979aaSAndroid Build Coastguard Worker         vqsubq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
2457*3f1979aaSAndroid Build Coastguard Worker }
2458*3f1979aaSAndroid Build Coastguard Worker 
2459*3f1979aaSAndroid Build Coastguard Worker // Subtracts the 16 signed 8-bit integers of b from the 16 signed 8-bit integers
2460*3f1979aaSAndroid Build Coastguard Worker // of a and saturates.
2461*3f1979aaSAndroid Build Coastguard Worker //
2462*3f1979aaSAndroid Build Coastguard Worker //   r0 := SignedSaturate(a0 - b0)
2463*3f1979aaSAndroid Build Coastguard Worker //   r1 := SignedSaturate(a1 - b1)
2464*3f1979aaSAndroid Build Coastguard Worker //   ...
2465*3f1979aaSAndroid Build Coastguard Worker //   r15 := SignedSaturate(a15 - b15)
2466*3f1979aaSAndroid Build Coastguard Worker //
2467*3f1979aaSAndroid Build Coastguard Worker // https://technet.microsoft.com/en-us/subscriptions/by7kzks1(v=vs.90)
_mm_subs_epi8(__m128i a,__m128i b)2468*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_subs_epi8(__m128i a, __m128i b)
2469*3f1979aaSAndroid Build Coastguard Worker {
2470*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_s8(
2471*3f1979aaSAndroid Build Coastguard Worker         vqsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
2472*3f1979aaSAndroid Build Coastguard Worker }
2473*3f1979aaSAndroid Build Coastguard Worker 
2474*3f1979aaSAndroid Build Coastguard Worker // Subtracts the 8 signed 16-bit integers of b from the 8 signed 16-bit integers
2475*3f1979aaSAndroid Build Coastguard Worker // of a and saturates.
2476*3f1979aaSAndroid Build Coastguard Worker //
2477*3f1979aaSAndroid Build Coastguard Worker //   r0 := SignedSaturate(a0 - b0)
2478*3f1979aaSAndroid Build Coastguard Worker //   r1 := SignedSaturate(a1 - b1)
2479*3f1979aaSAndroid Build Coastguard Worker //   ...
2480*3f1979aaSAndroid Build Coastguard Worker //   r7 := SignedSaturate(a7 - b7)
2481*3f1979aaSAndroid Build Coastguard Worker //
2482*3f1979aaSAndroid Build Coastguard Worker // https://technet.microsoft.com/en-us/subscriptions/3247z5b8(v=vs.90)
_mm_subs_epi16(__m128i a,__m128i b)2483*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_subs_epi16(__m128i a, __m128i b)
2484*3f1979aaSAndroid Build Coastguard Worker {
2485*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_s16(
2486*3f1979aaSAndroid Build Coastguard Worker         vqsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
2487*3f1979aaSAndroid Build Coastguard Worker }
2488*3f1979aaSAndroid Build Coastguard Worker 
_mm_adds_epu16(__m128i a,__m128i b)2489*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_adds_epu16(__m128i a, __m128i b)
2490*3f1979aaSAndroid Build Coastguard Worker {
2491*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_u16(
2492*3f1979aaSAndroid Build Coastguard Worker         vqaddq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
2493*3f1979aaSAndroid Build Coastguard Worker }
2494*3f1979aaSAndroid Build Coastguard Worker 
2495*3f1979aaSAndroid Build Coastguard Worker // Negate packed 8-bit integers in a when the corresponding signed
2496*3f1979aaSAndroid Build Coastguard Worker // 8-bit integer in b is negative, and store the results in dst.
2497*3f1979aaSAndroid Build Coastguard Worker // Element in dst are zeroed out when the corresponding element
2498*3f1979aaSAndroid Build Coastguard Worker // in b is zero.
2499*3f1979aaSAndroid Build Coastguard Worker //
2500*3f1979aaSAndroid Build Coastguard Worker //   for i in 0..15
2501*3f1979aaSAndroid Build Coastguard Worker //     if b[i] < 0
2502*3f1979aaSAndroid Build Coastguard Worker //       r[i] := -a[i]
2503*3f1979aaSAndroid Build Coastguard Worker //     else if b[i] == 0
2504*3f1979aaSAndroid Build Coastguard Worker //       r[i] := 0
2505*3f1979aaSAndroid Build Coastguard Worker //     else
2506*3f1979aaSAndroid Build Coastguard Worker //       r[i] := a[i]
2507*3f1979aaSAndroid Build Coastguard Worker //     fi
2508*3f1979aaSAndroid Build Coastguard Worker //   done
_mm_sign_epi8(__m128i _a,__m128i _b)2509*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_sign_epi8(__m128i _a, __m128i _b)
2510*3f1979aaSAndroid Build Coastguard Worker {
2511*3f1979aaSAndroid Build Coastguard Worker     int8x16_t a = vreinterpretq_s8_m128i(_a);
2512*3f1979aaSAndroid Build Coastguard Worker     int8x16_t b = vreinterpretq_s8_m128i(_b);
2513*3f1979aaSAndroid Build Coastguard Worker 
2514*3f1979aaSAndroid Build Coastguard Worker     // signed shift right: faster than vclt
2515*3f1979aaSAndroid Build Coastguard Worker     // (b < 0) ? 0xFF : 0
2516*3f1979aaSAndroid Build Coastguard Worker     uint8x16_t ltMask = vreinterpretq_u8_s8(vshrq_n_s8(b, 7));
2517*3f1979aaSAndroid Build Coastguard Worker 
2518*3f1979aaSAndroid Build Coastguard Worker     // (b == 0) ? 0xFF : 0
2519*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
2520*3f1979aaSAndroid Build Coastguard Worker     int8x16_t zeroMask = vreinterpretq_s8_u8(vceqzq_s8(b));
2521*3f1979aaSAndroid Build Coastguard Worker #else
2522*3f1979aaSAndroid Build Coastguard Worker     int8x16_t zeroMask = vreinterpretq_s8_u8(vceqq_s8(b, vdupq_n_s8(0)));
2523*3f1979aaSAndroid Build Coastguard Worker #endif
2524*3f1979aaSAndroid Build Coastguard Worker 
2525*3f1979aaSAndroid Build Coastguard Worker     // bitwise select either a or nagative 'a' (vnegq_s8(a) return nagative 'a')
2526*3f1979aaSAndroid Build Coastguard Worker     // based on ltMask
2527*3f1979aaSAndroid Build Coastguard Worker     int8x16_t masked = vbslq_s8(ltMask, vnegq_s8(a), a);
2528*3f1979aaSAndroid Build Coastguard Worker     // res = masked & (~zeroMask)
2529*3f1979aaSAndroid Build Coastguard Worker     int8x16_t res = vbicq_s8(masked, zeroMask);
2530*3f1979aaSAndroid Build Coastguard Worker 
2531*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_s8(res);
2532*3f1979aaSAndroid Build Coastguard Worker }
2533*3f1979aaSAndroid Build Coastguard Worker 
2534*3f1979aaSAndroid Build Coastguard Worker // Negate packed 16-bit integers in a when the corresponding signed
2535*3f1979aaSAndroid Build Coastguard Worker // 16-bit integer in b is negative, and store the results in dst.
2536*3f1979aaSAndroid Build Coastguard Worker // Element in dst are zeroed out when the corresponding element
2537*3f1979aaSAndroid Build Coastguard Worker // in b is zero.
2538*3f1979aaSAndroid Build Coastguard Worker //
2539*3f1979aaSAndroid Build Coastguard Worker //   for i in 0..7
2540*3f1979aaSAndroid Build Coastguard Worker //     if b[i] < 0
2541*3f1979aaSAndroid Build Coastguard Worker //       r[i] := -a[i]
2542*3f1979aaSAndroid Build Coastguard Worker //     else if b[i] == 0
2543*3f1979aaSAndroid Build Coastguard Worker //       r[i] := 0
2544*3f1979aaSAndroid Build Coastguard Worker //     else
2545*3f1979aaSAndroid Build Coastguard Worker //       r[i] := a[i]
2546*3f1979aaSAndroid Build Coastguard Worker //     fi
2547*3f1979aaSAndroid Build Coastguard Worker //   done
_mm_sign_epi16(__m128i _a,__m128i _b)2548*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_sign_epi16(__m128i _a, __m128i _b)
2549*3f1979aaSAndroid Build Coastguard Worker {
2550*3f1979aaSAndroid Build Coastguard Worker     int16x8_t a = vreinterpretq_s16_m128i(_a);
2551*3f1979aaSAndroid Build Coastguard Worker     int16x8_t b = vreinterpretq_s16_m128i(_b);
2552*3f1979aaSAndroid Build Coastguard Worker 
2553*3f1979aaSAndroid Build Coastguard Worker     // signed shift right: faster than vclt
2554*3f1979aaSAndroid Build Coastguard Worker     // (b < 0) ? 0xFFFF : 0
2555*3f1979aaSAndroid Build Coastguard Worker     uint16x8_t ltMask = vreinterpretq_u16_s16(vshrq_n_s16(b, 15));
2556*3f1979aaSAndroid Build Coastguard Worker     // (b == 0) ? 0xFFFF : 0
2557*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
2558*3f1979aaSAndroid Build Coastguard Worker     int16x8_t zeroMask = vreinterpretq_s16_u16(vceqzq_s16(b));
2559*3f1979aaSAndroid Build Coastguard Worker #else
2560*3f1979aaSAndroid Build Coastguard Worker     int16x8_t zeroMask = vreinterpretq_s16_u16(vceqq_s16(b, vdupq_n_s16(0)));
2561*3f1979aaSAndroid Build Coastguard Worker #endif
2562*3f1979aaSAndroid Build Coastguard Worker 
2563*3f1979aaSAndroid Build Coastguard Worker     // bitwise select either a or negative 'a' (vnegq_s16(a) equals to negative
2564*3f1979aaSAndroid Build Coastguard Worker     // 'a') based on ltMask
2565*3f1979aaSAndroid Build Coastguard Worker     int16x8_t masked = vbslq_s16(ltMask, vnegq_s16(a), a);
2566*3f1979aaSAndroid Build Coastguard Worker     // res = masked & (~zeroMask)
2567*3f1979aaSAndroid Build Coastguard Worker     int16x8_t res = vbicq_s16(masked, zeroMask);
2568*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_s16(res);
2569*3f1979aaSAndroid Build Coastguard Worker }
2570*3f1979aaSAndroid Build Coastguard Worker 
// Negate packed 32-bit integers in a when the corresponding signed
// 32-bit integer in b is negative, and store the results in dst.
// Elements in dst are zeroed out when the corresponding element
// in b is zero.
2575*3f1979aaSAndroid Build Coastguard Worker //
2576*3f1979aaSAndroid Build Coastguard Worker //   for i in 0..3
2577*3f1979aaSAndroid Build Coastguard Worker //     if b[i] < 0
2578*3f1979aaSAndroid Build Coastguard Worker //       r[i] := -a[i]
2579*3f1979aaSAndroid Build Coastguard Worker //     else if b[i] == 0
2580*3f1979aaSAndroid Build Coastguard Worker //       r[i] := 0
2581*3f1979aaSAndroid Build Coastguard Worker //     else
2582*3f1979aaSAndroid Build Coastguard Worker //       r[i] := a[i]
2583*3f1979aaSAndroid Build Coastguard Worker //     fi
2584*3f1979aaSAndroid Build Coastguard Worker //   done
_mm_sign_epi32(__m128i _a,__m128i _b)2585*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_sign_epi32(__m128i _a, __m128i _b)
2586*3f1979aaSAndroid Build Coastguard Worker {
2587*3f1979aaSAndroid Build Coastguard Worker     int32x4_t a = vreinterpretq_s32_m128i(_a);
2588*3f1979aaSAndroid Build Coastguard Worker     int32x4_t b = vreinterpretq_s32_m128i(_b);
2589*3f1979aaSAndroid Build Coastguard Worker 
2590*3f1979aaSAndroid Build Coastguard Worker     // signed shift right: faster than vclt
2591*3f1979aaSAndroid Build Coastguard Worker     // (b < 0) ? 0xFFFFFFFF : 0
2592*3f1979aaSAndroid Build Coastguard Worker     uint32x4_t ltMask = vreinterpretq_u32_s32(vshrq_n_s32(b, 31));
2593*3f1979aaSAndroid Build Coastguard Worker 
2594*3f1979aaSAndroid Build Coastguard Worker     // (b == 0) ? 0xFFFFFFFF : 0
2595*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
2596*3f1979aaSAndroid Build Coastguard Worker     int32x4_t zeroMask = vreinterpretq_s32_u32(vceqzq_s32(b));
2597*3f1979aaSAndroid Build Coastguard Worker #else
2598*3f1979aaSAndroid Build Coastguard Worker     int32x4_t zeroMask = vreinterpretq_s32_u32(vceqq_s32(b, vdupq_n_s32(0)));
2599*3f1979aaSAndroid Build Coastguard Worker #endif
2600*3f1979aaSAndroid Build Coastguard Worker 
2601*3f1979aaSAndroid Build Coastguard Worker     // bitwise select either a or negative 'a' (vnegq_s32(a) equals to negative
2602*3f1979aaSAndroid Build Coastguard Worker     // 'a') based on ltMask
2603*3f1979aaSAndroid Build Coastguard Worker     int32x4_t masked = vbslq_s32(ltMask, vnegq_s32(a), a);
2604*3f1979aaSAndroid Build Coastguard Worker     // res = masked & (~zeroMask)
2605*3f1979aaSAndroid Build Coastguard Worker     int32x4_t res = vbicq_s32(masked, zeroMask);
2606*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_s32(res);
2607*3f1979aaSAndroid Build Coastguard Worker }
2608*3f1979aaSAndroid Build Coastguard Worker 
// Negate packed 16-bit integers in a when the corresponding signed 16-bit
// integer in b is negative, and store the results in dst. Elements in dst are
// zeroed out when the corresponding element in b is zero.
2612*3f1979aaSAndroid Build Coastguard Worker //
2613*3f1979aaSAndroid Build Coastguard Worker //   FOR j := 0 to 3
2614*3f1979aaSAndroid Build Coastguard Worker //      i := j*16
2615*3f1979aaSAndroid Build Coastguard Worker //      IF b[i+15:i] < 0
2616*3f1979aaSAndroid Build Coastguard Worker //        dst[i+15:i] := -(a[i+15:i])
2617*3f1979aaSAndroid Build Coastguard Worker //      ELSE IF b[i+15:i] == 0
2618*3f1979aaSAndroid Build Coastguard Worker //        dst[i+15:i] := 0
2619*3f1979aaSAndroid Build Coastguard Worker //      ELSE
2620*3f1979aaSAndroid Build Coastguard Worker //        dst[i+15:i] := a[i+15:i]
2621*3f1979aaSAndroid Build Coastguard Worker //      FI
2622*3f1979aaSAndroid Build Coastguard Worker //   ENDFOR
2623*3f1979aaSAndroid Build Coastguard Worker //
2624*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi16
_mm_sign_pi16(__m64 _a,__m64 _b)2625*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m64 _mm_sign_pi16(__m64 _a, __m64 _b)
2626*3f1979aaSAndroid Build Coastguard Worker {
2627*3f1979aaSAndroid Build Coastguard Worker     int16x4_t a = vreinterpret_s16_m64(_a);
2628*3f1979aaSAndroid Build Coastguard Worker     int16x4_t b = vreinterpret_s16_m64(_b);
2629*3f1979aaSAndroid Build Coastguard Worker 
2630*3f1979aaSAndroid Build Coastguard Worker     // signed shift right: faster than vclt
2631*3f1979aaSAndroid Build Coastguard Worker     // (b < 0) ? 0xFFFF : 0
2632*3f1979aaSAndroid Build Coastguard Worker     uint16x4_t ltMask = vreinterpret_u16_s16(vshr_n_s16(b, 15));
2633*3f1979aaSAndroid Build Coastguard Worker 
2634*3f1979aaSAndroid Build Coastguard Worker     // (b == 0) ? 0xFFFF : 0
2635*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
2636*3f1979aaSAndroid Build Coastguard Worker     int16x4_t zeroMask = vreinterpret_s16_u16(vceqz_s16(b));
2637*3f1979aaSAndroid Build Coastguard Worker #else
2638*3f1979aaSAndroid Build Coastguard Worker     int16x4_t zeroMask = vreinterpret_s16_u16(vceq_s16(b, vdup_n_s16(0)));
2639*3f1979aaSAndroid Build Coastguard Worker #endif
2640*3f1979aaSAndroid Build Coastguard Worker 
2641*3f1979aaSAndroid Build Coastguard Worker     // bitwise select either a or nagative 'a' (vneg_s16(a) return nagative 'a')
2642*3f1979aaSAndroid Build Coastguard Worker     // based on ltMask
2643*3f1979aaSAndroid Build Coastguard Worker     int16x4_t masked = vbsl_s16(ltMask, vneg_s16(a), a);
2644*3f1979aaSAndroid Build Coastguard Worker     // res = masked & (~zeroMask)
2645*3f1979aaSAndroid Build Coastguard Worker     int16x4_t res = vbic_s16(masked, zeroMask);
2646*3f1979aaSAndroid Build Coastguard Worker 
2647*3f1979aaSAndroid Build Coastguard Worker     return vreinterpret_m64_s16(res);
2648*3f1979aaSAndroid Build Coastguard Worker }
2649*3f1979aaSAndroid Build Coastguard Worker 
// Negate packed 32-bit integers in a when the corresponding signed 32-bit
// integer in b is negative, and store the results in dst. Elements in dst are
// zeroed out when the corresponding element in b is zero.
2653*3f1979aaSAndroid Build Coastguard Worker //
2654*3f1979aaSAndroid Build Coastguard Worker //   FOR j := 0 to 1
2655*3f1979aaSAndroid Build Coastguard Worker //      i := j*32
2656*3f1979aaSAndroid Build Coastguard Worker //      IF b[i+31:i] < 0
2657*3f1979aaSAndroid Build Coastguard Worker //        dst[i+31:i] := -(a[i+31:i])
2658*3f1979aaSAndroid Build Coastguard Worker //      ELSE IF b[i+31:i] == 0
2659*3f1979aaSAndroid Build Coastguard Worker //        dst[i+31:i] := 0
2660*3f1979aaSAndroid Build Coastguard Worker //      ELSE
2661*3f1979aaSAndroid Build Coastguard Worker //        dst[i+31:i] := a[i+31:i]
2662*3f1979aaSAndroid Build Coastguard Worker //      FI
2663*3f1979aaSAndroid Build Coastguard Worker //   ENDFOR
2664*3f1979aaSAndroid Build Coastguard Worker //
2665*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi32
_mm_sign_pi32(__m64 _a,__m64 _b)2666*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m64 _mm_sign_pi32(__m64 _a, __m64 _b)
2667*3f1979aaSAndroid Build Coastguard Worker {
2668*3f1979aaSAndroid Build Coastguard Worker     int32x2_t a = vreinterpret_s32_m64(_a);
2669*3f1979aaSAndroid Build Coastguard Worker     int32x2_t b = vreinterpret_s32_m64(_b);
2670*3f1979aaSAndroid Build Coastguard Worker 
2671*3f1979aaSAndroid Build Coastguard Worker     // signed shift right: faster than vclt
2672*3f1979aaSAndroid Build Coastguard Worker     // (b < 0) ? 0xFFFFFFFF : 0
2673*3f1979aaSAndroid Build Coastguard Worker     uint32x2_t ltMask = vreinterpret_u32_s32(vshr_n_s32(b, 31));
2674*3f1979aaSAndroid Build Coastguard Worker 
2675*3f1979aaSAndroid Build Coastguard Worker     // (b == 0) ? 0xFFFFFFFF : 0
2676*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
2677*3f1979aaSAndroid Build Coastguard Worker     int32x2_t zeroMask = vreinterpret_s32_u32(vceqz_s32(b));
2678*3f1979aaSAndroid Build Coastguard Worker #else
2679*3f1979aaSAndroid Build Coastguard Worker     int32x2_t zeroMask = vreinterpret_s32_u32(vceq_s32(b, vdup_n_s32(0)));
2680*3f1979aaSAndroid Build Coastguard Worker #endif
2681*3f1979aaSAndroid Build Coastguard Worker 
2682*3f1979aaSAndroid Build Coastguard Worker     // bitwise select either a or nagative 'a' (vneg_s32(a) return nagative 'a')
2683*3f1979aaSAndroid Build Coastguard Worker     // based on ltMask
2684*3f1979aaSAndroid Build Coastguard Worker     int32x2_t masked = vbsl_s32(ltMask, vneg_s32(a), a);
2685*3f1979aaSAndroid Build Coastguard Worker     // res = masked & (~zeroMask)
2686*3f1979aaSAndroid Build Coastguard Worker     int32x2_t res = vbic_s32(masked, zeroMask);
2687*3f1979aaSAndroid Build Coastguard Worker 
2688*3f1979aaSAndroid Build Coastguard Worker     return vreinterpret_m64_s32(res);
2689*3f1979aaSAndroid Build Coastguard Worker }
2690*3f1979aaSAndroid Build Coastguard Worker 
// Negate packed 8-bit integers in a when the corresponding signed 8-bit integer
// in b is negative, and store the results in dst. Elements in dst are zeroed
// out when the corresponding element in b is zero.
2694*3f1979aaSAndroid Build Coastguard Worker //
2695*3f1979aaSAndroid Build Coastguard Worker //   FOR j := 0 to 7
2696*3f1979aaSAndroid Build Coastguard Worker //      i := j*8
2697*3f1979aaSAndroid Build Coastguard Worker //      IF b[i+7:i] < 0
2698*3f1979aaSAndroid Build Coastguard Worker //        dst[i+7:i] := -(a[i+7:i])
2699*3f1979aaSAndroid Build Coastguard Worker //      ELSE IF b[i+7:i] == 0
2700*3f1979aaSAndroid Build Coastguard Worker //        dst[i+7:i] := 0
2701*3f1979aaSAndroid Build Coastguard Worker //      ELSE
2702*3f1979aaSAndroid Build Coastguard Worker //        dst[i+7:i] := a[i+7:i]
2703*3f1979aaSAndroid Build Coastguard Worker //      FI
2704*3f1979aaSAndroid Build Coastguard Worker //   ENDFOR
2705*3f1979aaSAndroid Build Coastguard Worker //
2706*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi8
_mm_sign_pi8(__m64 _a,__m64 _b)2707*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b)
2708*3f1979aaSAndroid Build Coastguard Worker {
2709*3f1979aaSAndroid Build Coastguard Worker     int8x8_t a = vreinterpret_s8_m64(_a);
2710*3f1979aaSAndroid Build Coastguard Worker     int8x8_t b = vreinterpret_s8_m64(_b);
2711*3f1979aaSAndroid Build Coastguard Worker 
2712*3f1979aaSAndroid Build Coastguard Worker     // signed shift right: faster than vclt
2713*3f1979aaSAndroid Build Coastguard Worker     // (b < 0) ? 0xFF : 0
2714*3f1979aaSAndroid Build Coastguard Worker     uint8x8_t ltMask = vreinterpret_u8_s8(vshr_n_s8(b, 7));
2715*3f1979aaSAndroid Build Coastguard Worker 
2716*3f1979aaSAndroid Build Coastguard Worker     // (b == 0) ? 0xFF : 0
2717*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
2718*3f1979aaSAndroid Build Coastguard Worker     int8x8_t zeroMask = vreinterpret_s8_u8(vceqz_s8(b));
2719*3f1979aaSAndroid Build Coastguard Worker #else
2720*3f1979aaSAndroid Build Coastguard Worker     int8x8_t zeroMask = vreinterpret_s8_u8(vceq_s8(b, vdup_n_s8(0)));
2721*3f1979aaSAndroid Build Coastguard Worker #endif
2722*3f1979aaSAndroid Build Coastguard Worker 
2723*3f1979aaSAndroid Build Coastguard Worker     // bitwise select either a or nagative 'a' (vneg_s8(a) return nagative 'a')
2724*3f1979aaSAndroid Build Coastguard Worker     // based on ltMask
2725*3f1979aaSAndroid Build Coastguard Worker     int8x8_t masked = vbsl_s8(ltMask, vneg_s8(a), a);
2726*3f1979aaSAndroid Build Coastguard Worker     // res = masked & (~zeroMask)
2727*3f1979aaSAndroid Build Coastguard Worker     int8x8_t res = vbic_s8(masked, zeroMask);
2728*3f1979aaSAndroid Build Coastguard Worker 
2729*3f1979aaSAndroid Build Coastguard Worker     return vreinterpret_m64_s8(res);
2730*3f1979aaSAndroid Build Coastguard Worker }
2731*3f1979aaSAndroid Build Coastguard Worker 
2732*3f1979aaSAndroid Build Coastguard Worker // Average packed unsigned 16-bit integers in a and b, and store the results in
2733*3f1979aaSAndroid Build Coastguard Worker // dst.
2734*3f1979aaSAndroid Build Coastguard Worker //
2735*3f1979aaSAndroid Build Coastguard Worker //   FOR j := 0 to 3
2736*3f1979aaSAndroid Build Coastguard Worker //     i := j*16
2737*3f1979aaSAndroid Build Coastguard Worker //     dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1
2738*3f1979aaSAndroid Build Coastguard Worker //   ENDFOR
2739*3f1979aaSAndroid Build Coastguard Worker //
2740*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_pu16
_mm_avg_pu16(__m64 a,__m64 b)2741*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m64 _mm_avg_pu16(__m64 a, __m64 b)
2742*3f1979aaSAndroid Build Coastguard Worker {
2743*3f1979aaSAndroid Build Coastguard Worker     return vreinterpret_m64_u16(
2744*3f1979aaSAndroid Build Coastguard Worker         vrhadd_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)));
2745*3f1979aaSAndroid Build Coastguard Worker }
2746*3f1979aaSAndroid Build Coastguard Worker 
2747*3f1979aaSAndroid Build Coastguard Worker // Average packed unsigned 8-bit integers in a and b, and store the results in
2748*3f1979aaSAndroid Build Coastguard Worker // dst.
2749*3f1979aaSAndroid Build Coastguard Worker //
2750*3f1979aaSAndroid Build Coastguard Worker //   FOR j := 0 to 7
2751*3f1979aaSAndroid Build Coastguard Worker //     i := j*8
2752*3f1979aaSAndroid Build Coastguard Worker //     dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1
2753*3f1979aaSAndroid Build Coastguard Worker //   ENDFOR
2754*3f1979aaSAndroid Build Coastguard Worker //
2755*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_pu8
_mm_avg_pu8(__m64 a,__m64 b)2756*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m64 _mm_avg_pu8(__m64 a, __m64 b)
2757*3f1979aaSAndroid Build Coastguard Worker {
2758*3f1979aaSAndroid Build Coastguard Worker     return vreinterpret_m64_u8(
2759*3f1979aaSAndroid Build Coastguard Worker         vrhadd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
2760*3f1979aaSAndroid Build Coastguard Worker }
2761*3f1979aaSAndroid Build Coastguard Worker 
2762*3f1979aaSAndroid Build Coastguard Worker // Average packed unsigned 8-bit integers in a and b, and store the results in
2763*3f1979aaSAndroid Build Coastguard Worker // dst.
2764*3f1979aaSAndroid Build Coastguard Worker //
2765*3f1979aaSAndroid Build Coastguard Worker //   FOR j := 0 to 7
2766*3f1979aaSAndroid Build Coastguard Worker //     i := j*8
2767*3f1979aaSAndroid Build Coastguard Worker //     dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1
2768*3f1979aaSAndroid Build Coastguard Worker //   ENDFOR
2769*3f1979aaSAndroid Build Coastguard Worker //
2770*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pavgb
// Alias: forwards both operands unchanged to _mm_avg_pu8.
#define _m_pavgb(a, b) _mm_avg_pu8((a), (b))
2772*3f1979aaSAndroid Build Coastguard Worker 
2773*3f1979aaSAndroid Build Coastguard Worker // Average packed unsigned 16-bit integers in a and b, and store the results in
2774*3f1979aaSAndroid Build Coastguard Worker // dst.
2775*3f1979aaSAndroid Build Coastguard Worker //
2776*3f1979aaSAndroid Build Coastguard Worker //   FOR j := 0 to 3
2777*3f1979aaSAndroid Build Coastguard Worker //     i := j*16
2778*3f1979aaSAndroid Build Coastguard Worker //     dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1
2779*3f1979aaSAndroid Build Coastguard Worker //   ENDFOR
2780*3f1979aaSAndroid Build Coastguard Worker //
2781*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pavgw
// Alias: forwards both operands unchanged to _mm_avg_pu16.
#define _m_pavgw(a, b) _mm_avg_pu16((a), (b))
2783*3f1979aaSAndroid Build Coastguard Worker 
// Computes the rounded average of the 16 unsigned 8-bit integers in a and the
// 16 unsigned 8-bit integers in b.
//
//   r0 := (a0 + b0 + 1) >> 1
//   r1 := (a1 + b1 + 1) >> 1
//   ...
//   r15 := (a15 + b15 + 1) >> 1
//
// https://msdn.microsoft.com/en-us/library/vstudio/8zwh554a(v%3dvs.90).aspx
_mm_avg_epu8(__m128i a,__m128i b)2793*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_avg_epu8(__m128i a, __m128i b)
2794*3f1979aaSAndroid Build Coastguard Worker {
2795*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_u8(
2796*3f1979aaSAndroid Build Coastguard Worker         vrhaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
2797*3f1979aaSAndroid Build Coastguard Worker }
2798*3f1979aaSAndroid Build Coastguard Worker 
2799*3f1979aaSAndroid Build Coastguard Worker // Computes the average of the 8 unsigned 16-bit integers in a and the 8
2800*3f1979aaSAndroid Build Coastguard Worker // unsigned 16-bit integers in b and rounds.
2801*3f1979aaSAndroid Build Coastguard Worker //
2802*3f1979aaSAndroid Build Coastguard Worker //   r0 := (a0 + b0) / 2
2803*3f1979aaSAndroid Build Coastguard Worker //   r1 := (a1 + b1) / 2
2804*3f1979aaSAndroid Build Coastguard Worker //   ...
2805*3f1979aaSAndroid Build Coastguard Worker //   r7 := (a7 + b7) / 2
2806*3f1979aaSAndroid Build Coastguard Worker //
2807*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/vstudio/y13ca3c8(v=vs.90).aspx
_mm_avg_epu16(__m128i a,__m128i b)2808*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_avg_epu16(__m128i a, __m128i b)
2809*3f1979aaSAndroid Build Coastguard Worker {
2810*3f1979aaSAndroid Build Coastguard Worker     return (__m128i) vrhaddq_u16(vreinterpretq_u16_m128i(a),
2811*3f1979aaSAndroid Build Coastguard Worker                                  vreinterpretq_u16_m128i(b));
2812*3f1979aaSAndroid Build Coastguard Worker }
2813*3f1979aaSAndroid Build Coastguard Worker 
2814*3f1979aaSAndroid Build Coastguard Worker // Adds the four single-precision, floating-point values of a and b.
2815*3f1979aaSAndroid Build Coastguard Worker //
2816*3f1979aaSAndroid Build Coastguard Worker //   r0 := a0 + b0
2817*3f1979aaSAndroid Build Coastguard Worker //   r1 := a1 + b1
2818*3f1979aaSAndroid Build Coastguard Worker //   r2 := a2 + b2
2819*3f1979aaSAndroid Build Coastguard Worker //   r3 := a3 + b3
2820*3f1979aaSAndroid Build Coastguard Worker //
2821*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/vstudio/c9848chc(v=vs.100).aspx
_mm_add_ps(__m128 a,__m128 b)2822*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
2823*3f1979aaSAndroid Build Coastguard Worker {
2824*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128_f32(
2825*3f1979aaSAndroid Build Coastguard Worker         vaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
2826*3f1979aaSAndroid Build Coastguard Worker }
2827*3f1979aaSAndroid Build Coastguard Worker 
2828*3f1979aaSAndroid Build Coastguard Worker // Add packed double-precision (64-bit) floating-point elements in a and b, and
2829*3f1979aaSAndroid Build Coastguard Worker // store the results in dst.
2830*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_pd
_mm_add_pd(__m128d a,__m128d b)2831*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b)
2832*3f1979aaSAndroid Build Coastguard Worker {
2833*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
2834*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128d_f64(
2835*3f1979aaSAndroid Build Coastguard Worker         vaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
2836*3f1979aaSAndroid Build Coastguard Worker #else
2837*3f1979aaSAndroid Build Coastguard Worker     double *da = (double *) &a;
2838*3f1979aaSAndroid Build Coastguard Worker     double *db = (double *) &b;
2839*3f1979aaSAndroid Build Coastguard Worker     double c[2];
2840*3f1979aaSAndroid Build Coastguard Worker     c[0] = da[0] + db[0];
2841*3f1979aaSAndroid Build Coastguard Worker     c[1] = da[1] + db[1];
2842*3f1979aaSAndroid Build Coastguard Worker     return vld1q_f32((float32_t *) c);
2843*3f1979aaSAndroid Build Coastguard Worker #endif
2844*3f1979aaSAndroid Build Coastguard Worker }
2845*3f1979aaSAndroid Build Coastguard Worker 
2846*3f1979aaSAndroid Build Coastguard Worker // Add 64-bit integers a and b, and store the result in dst.
2847*3f1979aaSAndroid Build Coastguard Worker //
2848*3f1979aaSAndroid Build Coastguard Worker //   dst[63:0] := a[63:0] + b[63:0]
2849*3f1979aaSAndroid Build Coastguard Worker //
2850*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_si64
_mm_add_si64(__m64 a,__m64 b)2851*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m64 _mm_add_si64(__m64 a, __m64 b)
2852*3f1979aaSAndroid Build Coastguard Worker {
2853*3f1979aaSAndroid Build Coastguard Worker     return vreinterpret_m64_s64(
2854*3f1979aaSAndroid Build Coastguard Worker         vadd_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b)));
2855*3f1979aaSAndroid Build Coastguard Worker }
2856*3f1979aaSAndroid Build Coastguard Worker 
2857*3f1979aaSAndroid Build Coastguard Worker // adds the scalar single-precision floating point values of a and b.
2858*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/be94x2y6(v=vs.100).aspx
_mm_add_ss(__m128 a,__m128 b)2859*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_add_ss(__m128 a, __m128 b)
2860*3f1979aaSAndroid Build Coastguard Worker {
2861*3f1979aaSAndroid Build Coastguard Worker     float32_t b0 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
2862*3f1979aaSAndroid Build Coastguard Worker     float32x4_t value = vsetq_lane_f32(b0, vdupq_n_f32(0), 0);
2863*3f1979aaSAndroid Build Coastguard Worker     // the upper values in the result must be the remnants of <a>.
2864*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128_f32(vaddq_f32(a, value));
2865*3f1979aaSAndroid Build Coastguard Worker }
2866*3f1979aaSAndroid Build Coastguard Worker 
// Adds the 2 signed or unsigned 64-bit integers in a to the 2 signed or
// unsigned 64-bit integers in b.
// https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx
_mm_add_epi64(__m128i a,__m128i b)2870*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_add_epi64(__m128i a, __m128i b)
2871*3f1979aaSAndroid Build Coastguard Worker {
2872*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_s64(
2873*3f1979aaSAndroid Build Coastguard Worker         vaddq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
2874*3f1979aaSAndroid Build Coastguard Worker }
2875*3f1979aaSAndroid Build Coastguard Worker 
2876*3f1979aaSAndroid Build Coastguard Worker // Adds the 4 signed or unsigned 32-bit integers in a to the 4 signed or
2877*3f1979aaSAndroid Build Coastguard Worker // unsigned 32-bit integers in b.
2878*3f1979aaSAndroid Build Coastguard Worker //
2879*3f1979aaSAndroid Build Coastguard Worker //   r0 := a0 + b0
2880*3f1979aaSAndroid Build Coastguard Worker //   r1 := a1 + b1
2881*3f1979aaSAndroid Build Coastguard Worker //   r2 := a2 + b2
2882*3f1979aaSAndroid Build Coastguard Worker //   r3 := a3 + b3
2883*3f1979aaSAndroid Build Coastguard Worker //
2884*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx
_mm_add_epi32(__m128i a,__m128i b)2885*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b)
2886*3f1979aaSAndroid Build Coastguard Worker {
2887*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_s32(
2888*3f1979aaSAndroid Build Coastguard Worker         vaddq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
2889*3f1979aaSAndroid Build Coastguard Worker }
2890*3f1979aaSAndroid Build Coastguard Worker 
2891*3f1979aaSAndroid Build Coastguard Worker // Adds the 8 signed or unsigned 16-bit integers in a to the 8 signed or
2892*3f1979aaSAndroid Build Coastguard Worker // unsigned 16-bit integers in b.
2893*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/fceha5k4(v=vs.100).aspx
_mm_add_epi16(__m128i a,__m128i b)2894*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_add_epi16(__m128i a, __m128i b)
2895*3f1979aaSAndroid Build Coastguard Worker {
2896*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_s16(
2897*3f1979aaSAndroid Build Coastguard Worker         vaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
2898*3f1979aaSAndroid Build Coastguard Worker }
2899*3f1979aaSAndroid Build Coastguard Worker 
2900*3f1979aaSAndroid Build Coastguard Worker // Adds the 16 signed or unsigned 8-bit integers in a to the 16 signed or
2901*3f1979aaSAndroid Build Coastguard Worker // unsigned 8-bit integers in b.
2902*3f1979aaSAndroid Build Coastguard Worker // https://technet.microsoft.com/en-us/subscriptions/yc7tcyzs(v=vs.90)
_mm_add_epi8(__m128i a,__m128i b)2903*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_add_epi8(__m128i a, __m128i b)
2904*3f1979aaSAndroid Build Coastguard Worker {
2905*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_s8(
2906*3f1979aaSAndroid Build Coastguard Worker         vaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
2907*3f1979aaSAndroid Build Coastguard Worker }
2908*3f1979aaSAndroid Build Coastguard Worker 
2909*3f1979aaSAndroid Build Coastguard Worker // Adds the 8 signed 16-bit integers in a to the 8 signed 16-bit integers in b
2910*3f1979aaSAndroid Build Coastguard Worker // and saturates.
2911*3f1979aaSAndroid Build Coastguard Worker //
2912*3f1979aaSAndroid Build Coastguard Worker //   r0 := SignedSaturate(a0 + b0)
2913*3f1979aaSAndroid Build Coastguard Worker //   r1 := SignedSaturate(a1 + b1)
2914*3f1979aaSAndroid Build Coastguard Worker //   ...
2915*3f1979aaSAndroid Build Coastguard Worker //   r7 := SignedSaturate(a7 + b7)
2916*3f1979aaSAndroid Build Coastguard Worker //
2917*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/1a306ef8(v=vs.100).aspx
_mm_adds_epi16(__m128i a,__m128i b)2918*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_adds_epi16(__m128i a, __m128i b)
2919*3f1979aaSAndroid Build Coastguard Worker {
2920*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_s16(
2921*3f1979aaSAndroid Build Coastguard Worker         vqaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
2922*3f1979aaSAndroid Build Coastguard Worker }
2923*3f1979aaSAndroid Build Coastguard Worker 
2924*3f1979aaSAndroid Build Coastguard Worker // Add packed signed 8-bit integers in a and b using saturation, and store the
2925*3f1979aaSAndroid Build Coastguard Worker // results in dst.
2926*3f1979aaSAndroid Build Coastguard Worker //
2927*3f1979aaSAndroid Build Coastguard Worker //   FOR j := 0 to 15
2928*3f1979aaSAndroid Build Coastguard Worker //     i := j*8
2929*3f1979aaSAndroid Build Coastguard Worker //     dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] )
2930*3f1979aaSAndroid Build Coastguard Worker //   ENDFOR
2931*3f1979aaSAndroid Build Coastguard Worker //
2932*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_adds_epi8
_mm_adds_epi8(__m128i a,__m128i b)2933*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_adds_epi8(__m128i a, __m128i b)
2934*3f1979aaSAndroid Build Coastguard Worker {
2935*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_s8(
2936*3f1979aaSAndroid Build Coastguard Worker         vqaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
2937*3f1979aaSAndroid Build Coastguard Worker }
2938*3f1979aaSAndroid Build Coastguard Worker 
// Adds the 16 unsigned 8-bit integers in a to the 16 unsigned 8-bit integers in
// b and saturates.
// https://msdn.microsoft.com/en-us/library/9hahyddy(v=vs.100).aspx
_mm_adds_epu8(__m128i a,__m128i b)2942*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_adds_epu8(__m128i a, __m128i b)
2943*3f1979aaSAndroid Build Coastguard Worker {
2944*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_u8(
2945*3f1979aaSAndroid Build Coastguard Worker         vqaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
2946*3f1979aaSAndroid Build Coastguard Worker }
2947*3f1979aaSAndroid Build Coastguard Worker 
2948*3f1979aaSAndroid Build Coastguard Worker // Multiplies the 8 signed or unsigned 16-bit integers from a by the 8 signed or
2949*3f1979aaSAndroid Build Coastguard Worker // unsigned 16-bit integers from b.
2950*3f1979aaSAndroid Build Coastguard Worker //
2951*3f1979aaSAndroid Build Coastguard Worker //   r0 := (a0 * b0)[15:0]
2952*3f1979aaSAndroid Build Coastguard Worker //   r1 := (a1 * b1)[15:0]
2953*3f1979aaSAndroid Build Coastguard Worker //   ...
2954*3f1979aaSAndroid Build Coastguard Worker //   r7 := (a7 * b7)[15:0]
2955*3f1979aaSAndroid Build Coastguard Worker //
2956*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/vstudio/9ks1472s(v=vs.100).aspx
_mm_mullo_epi16(__m128i a,__m128i b)2957*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b)
2958*3f1979aaSAndroid Build Coastguard Worker {
2959*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_s16(
2960*3f1979aaSAndroid Build Coastguard Worker         vmulq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
2961*3f1979aaSAndroid Build Coastguard Worker }
2962*3f1979aaSAndroid Build Coastguard Worker 
2963*3f1979aaSAndroid Build Coastguard Worker // Multiplies the 4 signed or unsigned 32-bit integers from a by the 4 signed or
2964*3f1979aaSAndroid Build Coastguard Worker // unsigned 32-bit integers from b.
2965*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/vstudio/bb531409(v=vs.100).aspx
_mm_mullo_epi32(__m128i a,__m128i b)2966*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_mullo_epi32(__m128i a, __m128i b)
2967*3f1979aaSAndroid Build Coastguard Worker {
2968*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_s32(
2969*3f1979aaSAndroid Build Coastguard Worker         vmulq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
2970*3f1979aaSAndroid Build Coastguard Worker }
2971*3f1979aaSAndroid Build Coastguard Worker 
2972*3f1979aaSAndroid Build Coastguard Worker // Multiply the packed unsigned 16-bit integers in a and b, producing
2973*3f1979aaSAndroid Build Coastguard Worker // intermediate 32-bit integers, and store the high 16 bits of the intermediate
2974*3f1979aaSAndroid Build Coastguard Worker // integers in dst.
2975*3f1979aaSAndroid Build Coastguard Worker //
2976*3f1979aaSAndroid Build Coastguard Worker //   FOR j := 0 to 3
2977*3f1979aaSAndroid Build Coastguard Worker //      i := j*16
2978*3f1979aaSAndroid Build Coastguard Worker //      tmp[31:0] := a[i+15:i] * b[i+15:i]
2979*3f1979aaSAndroid Build Coastguard Worker //      dst[i+15:i] := tmp[31:16]
2980*3f1979aaSAndroid Build Coastguard Worker //   ENDFOR
2981*3f1979aaSAndroid Build Coastguard Worker //
2982*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmulhuw
// Alias: forwards both arguments unchanged to _mm_mulhi_pu16, which is not
// visible in this section (presumably defined elsewhere in this header).
2983*3f1979aaSAndroid Build Coastguard Worker #define _m_pmulhuw(a, b) _mm_mulhi_pu16(a, b)
2984*3f1979aaSAndroid Build Coastguard Worker 
2985*3f1979aaSAndroid Build Coastguard Worker // Multiplies the four single-precision, floating-point values of a and b.
2986*3f1979aaSAndroid Build Coastguard Worker //
2987*3f1979aaSAndroid Build Coastguard Worker //   r0 := a0 * b0
2988*3f1979aaSAndroid Build Coastguard Worker //   r1 := a1 * b1
2989*3f1979aaSAndroid Build Coastguard Worker //   r2 := a2 * b2
2990*3f1979aaSAndroid Build Coastguard Worker //   r3 := a3 * b3
2991*3f1979aaSAndroid Build Coastguard Worker //
2992*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/vstudio/22kbk6t9(v=vs.100).aspx
_mm_mul_ps(__m128 a,__m128 b)2993*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
2994*3f1979aaSAndroid Build Coastguard Worker {
2995*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128_f32(
2996*3f1979aaSAndroid Build Coastguard Worker         vmulq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
2997*3f1979aaSAndroid Build Coastguard Worker }
2998*3f1979aaSAndroid Build Coastguard Worker 
2999*3f1979aaSAndroid Build Coastguard Worker // Multiply the lower single-precision (32-bit) floating-point element in a and
3000*3f1979aaSAndroid Build Coastguard Worker // b, store the result in the lower element of dst, and copy the upper 3 packed
3001*3f1979aaSAndroid Build Coastguard Worker // elements from a to the upper elements of dst.
3002*3f1979aaSAndroid Build Coastguard Worker //
3003*3f1979aaSAndroid Build Coastguard Worker //   dst[31:0] := a[31:0] * b[31:0]
3004*3f1979aaSAndroid Build Coastguard Worker //   dst[127:32] := a[127:32]
3005*3f1979aaSAndroid Build Coastguard Worker //
3006*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_ss
_mm_mul_ss(__m128 a,__m128 b)3007*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_mul_ss(__m128 a, __m128 b)
3008*3f1979aaSAndroid Build Coastguard Worker {
3009*3f1979aaSAndroid Build Coastguard Worker     return _mm_move_ss(a, _mm_mul_ps(a, b));
3010*3f1979aaSAndroid Build Coastguard Worker }
3011*3f1979aaSAndroid Build Coastguard Worker 
3012*3f1979aaSAndroid Build Coastguard Worker // Multiply the low unsigned 32-bit integers from each packed 64-bit element in
3013*3f1979aaSAndroid Build Coastguard Worker // a and b, and store the unsigned 64-bit results in dst.
3014*3f1979aaSAndroid Build Coastguard Worker //
3015*3f1979aaSAndroid Build Coastguard Worker //   r0 :=  (a0 & 0xFFFFFFFF) * (b0 & 0xFFFFFFFF)
3016*3f1979aaSAndroid Build Coastguard Worker //   r1 :=  (a2 & 0xFFFFFFFF) * (b2 & 0xFFFFFFFF)
_mm_mul_epu32(__m128i a,__m128i b)3017*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_mul_epu32(__m128i a, __m128i b)
3018*3f1979aaSAndroid Build Coastguard Worker {
3019*3f1979aaSAndroid Build Coastguard Worker     // vmull_u32 upcasts instead of masking, so we downcast.
3020*3f1979aaSAndroid Build Coastguard Worker     uint32x2_t a_lo = vmovn_u64(vreinterpretq_u64_m128i(a));
3021*3f1979aaSAndroid Build Coastguard Worker     uint32x2_t b_lo = vmovn_u64(vreinterpretq_u64_m128i(b));
3022*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_u64(vmull_u32(a_lo, b_lo));
3023*3f1979aaSAndroid Build Coastguard Worker }
3024*3f1979aaSAndroid Build Coastguard Worker 
3025*3f1979aaSAndroid Build Coastguard Worker // Multiply the low unsigned 32-bit integers from a and b, and store the
3026*3f1979aaSAndroid Build Coastguard Worker // unsigned 64-bit result in dst.
3027*3f1979aaSAndroid Build Coastguard Worker //
3028*3f1979aaSAndroid Build Coastguard Worker //   dst[63:0] := a[31:0] * b[31:0]
3029*3f1979aaSAndroid Build Coastguard Worker //
3030*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_su32
_mm_mul_su32(__m64 a,__m64 b)3031*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m64 _mm_mul_su32(__m64 a, __m64 b)
3032*3f1979aaSAndroid Build Coastguard Worker {
3033*3f1979aaSAndroid Build Coastguard Worker     return vreinterpret_m64_u64(vget_low_u64(
3034*3f1979aaSAndroid Build Coastguard Worker         vmull_u32(vreinterpret_u32_m64(a), vreinterpret_u32_m64(b))));
3035*3f1979aaSAndroid Build Coastguard Worker }
3036*3f1979aaSAndroid Build Coastguard Worker 
3037*3f1979aaSAndroid Build Coastguard Worker // Multiply the low signed 32-bit integers from each packed 64-bit element in
3038*3f1979aaSAndroid Build Coastguard Worker // a and b, and store the signed 64-bit results in dst.
3039*3f1979aaSAndroid Build Coastguard Worker //
3040*3f1979aaSAndroid Build Coastguard Worker //   r0 :=  (int64_t)(int32_t)a0 * (int64_t)(int32_t)b0
3041*3f1979aaSAndroid Build Coastguard Worker //   r1 :=  (int64_t)(int32_t)a2 * (int64_t)(int32_t)b2
_mm_mul_epi32(__m128i a,__m128i b)3042*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_mul_epi32(__m128i a, __m128i b)
3043*3f1979aaSAndroid Build Coastguard Worker {
3044*3f1979aaSAndroid Build Coastguard Worker     // vmull_s32 upcasts instead of masking, so we downcast.
3045*3f1979aaSAndroid Build Coastguard Worker     int32x2_t a_lo = vmovn_s64(vreinterpretq_s64_m128i(a));
3046*3f1979aaSAndroid Build Coastguard Worker     int32x2_t b_lo = vmovn_s64(vreinterpretq_s64_m128i(b));
3047*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_s64(vmull_s32(a_lo, b_lo));
3048*3f1979aaSAndroid Build Coastguard Worker }
3049*3f1979aaSAndroid Build Coastguard Worker 
3050*3f1979aaSAndroid Build Coastguard Worker // Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit
3051*3f1979aaSAndroid Build Coastguard Worker // integers from b.
3052*3f1979aaSAndroid Build Coastguard Worker //
3053*3f1979aaSAndroid Build Coastguard Worker //   r0 := (a0 * b0) + (a1 * b1)
3054*3f1979aaSAndroid Build Coastguard Worker //   r1 := (a2 * b2) + (a3 * b3)
3055*3f1979aaSAndroid Build Coastguard Worker //   r2 := (a4 * b4) + (a5 * b5)
3056*3f1979aaSAndroid Build Coastguard Worker //   r3 := (a6 * b6) + (a7 * b7)
3057*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/yht36sa6(v=vs.90).aspx
_mm_madd_epi16(__m128i a,__m128i b)3058*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b)
3059*3f1979aaSAndroid Build Coastguard Worker {
3060*3f1979aaSAndroid Build Coastguard Worker     int32x4_t low = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
3061*3f1979aaSAndroid Build Coastguard Worker                               vget_low_s16(vreinterpretq_s16_m128i(b)));
3062*3f1979aaSAndroid Build Coastguard Worker     int32x4_t high = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
3063*3f1979aaSAndroid Build Coastguard Worker                                vget_high_s16(vreinterpretq_s16_m128i(b)));
3064*3f1979aaSAndroid Build Coastguard Worker 
3065*3f1979aaSAndroid Build Coastguard Worker     int32x2_t low_sum = vpadd_s32(vget_low_s32(low), vget_high_s32(low));
3066*3f1979aaSAndroid Build Coastguard Worker     int32x2_t high_sum = vpadd_s32(vget_low_s32(high), vget_high_s32(high));
3067*3f1979aaSAndroid Build Coastguard Worker 
3068*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_s32(vcombine_s32(low_sum, high_sum));
3069*3f1979aaSAndroid Build Coastguard Worker }
3070*3f1979aaSAndroid Build Coastguard Worker 
3071*3f1979aaSAndroid Build Coastguard Worker // Multiply packed signed 16-bit integers in a and b, producing intermediate
3072*3f1979aaSAndroid Build Coastguard Worker // signed 32-bit integers. Shift right by 15 bits while rounding up, and store
3073*3f1979aaSAndroid Build Coastguard Worker // the packed 16-bit integers in dst.
3074*3f1979aaSAndroid Build Coastguard Worker //
3075*3f1979aaSAndroid Build Coastguard Worker //   r0 := Round(((int32_t)a0 * (int32_t)b0) >> 15)
3076*3f1979aaSAndroid Build Coastguard Worker //   r1 := Round(((int32_t)a1 * (int32_t)b1) >> 15)
3077*3f1979aaSAndroid Build Coastguard Worker //   r2 := Round(((int32_t)a2 * (int32_t)b2) >> 15)
3078*3f1979aaSAndroid Build Coastguard Worker //   ...
3079*3f1979aaSAndroid Build Coastguard Worker //   r7 := Round(((int32_t)a7 * (int32_t)b7) >> 15)
_mm_mulhrs_epi16(__m128i a,__m128i b)3080*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_mulhrs_epi16(__m128i a, __m128i b)
3081*3f1979aaSAndroid Build Coastguard Worker {
3082*3f1979aaSAndroid Build Coastguard Worker     // Has issues due to saturation
3083*3f1979aaSAndroid Build Coastguard Worker     // return vreinterpretq_m128i_s16(vqrdmulhq_s16(a, b));
3084*3f1979aaSAndroid Build Coastguard Worker 
3085*3f1979aaSAndroid Build Coastguard Worker     // Multiply
3086*3f1979aaSAndroid Build Coastguard Worker     int32x4_t mul_lo = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
3087*3f1979aaSAndroid Build Coastguard Worker                                  vget_low_s16(vreinterpretq_s16_m128i(b)));
3088*3f1979aaSAndroid Build Coastguard Worker     int32x4_t mul_hi = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
3089*3f1979aaSAndroid Build Coastguard Worker                                  vget_high_s16(vreinterpretq_s16_m128i(b)));
3090*3f1979aaSAndroid Build Coastguard Worker 
3091*3f1979aaSAndroid Build Coastguard Worker     // Rounding narrowing shift right
3092*3f1979aaSAndroid Build Coastguard Worker     // narrow = (int16_t)((mul + 16384) >> 15);
3093*3f1979aaSAndroid Build Coastguard Worker     int16x4_t narrow_lo = vrshrn_n_s32(mul_lo, 15);
3094*3f1979aaSAndroid Build Coastguard Worker     int16x4_t narrow_hi = vrshrn_n_s32(mul_hi, 15);
3095*3f1979aaSAndroid Build Coastguard Worker 
3096*3f1979aaSAndroid Build Coastguard Worker     // Join together
3097*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_s16(vcombine_s16(narrow_lo, narrow_hi));
3098*3f1979aaSAndroid Build Coastguard Worker }
3099*3f1979aaSAndroid Build Coastguard Worker 
3100*3f1979aaSAndroid Build Coastguard Worker // Vertically multiply each unsigned 8-bit integer from a with the corresponding
3101*3f1979aaSAndroid Build Coastguard Worker // signed 8-bit integer from b, producing intermediate signed 16-bit integers.
3102*3f1979aaSAndroid Build Coastguard Worker // Horizontally add adjacent pairs of intermediate signed 16-bit integers,
3103*3f1979aaSAndroid Build Coastguard Worker // and pack the saturated results in dst.
3104*3f1979aaSAndroid Build Coastguard Worker //
3105*3f1979aaSAndroid Build Coastguard Worker //   FOR j := 0 to 7
3106*3f1979aaSAndroid Build Coastguard Worker //      i := j*16
3107*3f1979aaSAndroid Build Coastguard Worker //      dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] +
3108*3f1979aaSAndroid Build Coastguard Worker //      a[i+7:i]*b[i+7:i] )
3109*3f1979aaSAndroid Build Coastguard Worker //   ENDFOR
_mm_maddubs_epi16(__m128i _a,__m128i _b)3110*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_maddubs_epi16(__m128i _a, __m128i _b)
3111*3f1979aaSAndroid Build Coastguard Worker {
3112*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
3113*3f1979aaSAndroid Build Coastguard Worker     uint8x16_t a = vreinterpretq_u8_m128i(_a);
3114*3f1979aaSAndroid Build Coastguard Worker     int8x16_t b = vreinterpretq_s8_m128i(_b);
3115*3f1979aaSAndroid Build Coastguard Worker     int16x8_t tl = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))),
3116*3f1979aaSAndroid Build Coastguard Worker                              vmovl_s8(vget_low_s8(b)));
3117*3f1979aaSAndroid Build Coastguard Worker     int16x8_t th = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a))),
3118*3f1979aaSAndroid Build Coastguard Worker                              vmovl_s8(vget_high_s8(b)));
3119*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_s16(
3120*3f1979aaSAndroid Build Coastguard Worker         vqaddq_s16(vuzp1q_s16(tl, th), vuzp2q_s16(tl, th)));
3121*3f1979aaSAndroid Build Coastguard Worker #else
3122*3f1979aaSAndroid Build Coastguard Worker     // This would be much simpler if x86 would choose to zero extend OR sign
3123*3f1979aaSAndroid Build Coastguard Worker     // extend, not both. This could probably be optimized better.
3124*3f1979aaSAndroid Build Coastguard Worker     uint16x8_t a = vreinterpretq_u16_m128i(_a);
3125*3f1979aaSAndroid Build Coastguard Worker     int16x8_t b = vreinterpretq_s16_m128i(_b);
3126*3f1979aaSAndroid Build Coastguard Worker 
3127*3f1979aaSAndroid Build Coastguard Worker     // Zero extend a
3128*3f1979aaSAndroid Build Coastguard Worker     int16x8_t a_odd = vreinterpretq_s16_u16(vshrq_n_u16(a, 8));
3129*3f1979aaSAndroid Build Coastguard Worker     int16x8_t a_even = vreinterpretq_s16_u16(vbicq_u16(a, vdupq_n_u16(0xff00)));
3130*3f1979aaSAndroid Build Coastguard Worker 
3131*3f1979aaSAndroid Build Coastguard Worker     // Sign extend by shifting left then shifting right.
3132*3f1979aaSAndroid Build Coastguard Worker     int16x8_t b_even = vshrq_n_s16(vshlq_n_s16(b, 8), 8);
3133*3f1979aaSAndroid Build Coastguard Worker     int16x8_t b_odd = vshrq_n_s16(b, 8);
3134*3f1979aaSAndroid Build Coastguard Worker 
3135*3f1979aaSAndroid Build Coastguard Worker     // multiply
3136*3f1979aaSAndroid Build Coastguard Worker     int16x8_t prod1 = vmulq_s16(a_even, b_even);
3137*3f1979aaSAndroid Build Coastguard Worker     int16x8_t prod2 = vmulq_s16(a_odd, b_odd);
3138*3f1979aaSAndroid Build Coastguard Worker 
3139*3f1979aaSAndroid Build Coastguard Worker     // saturated add
3140*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_s16(vqaddq_s16(prod1, prod2));
3141*3f1979aaSAndroid Build Coastguard Worker #endif
3142*3f1979aaSAndroid Build Coastguard Worker }
3143*3f1979aaSAndroid Build Coastguard Worker 
3144*3f1979aaSAndroid Build Coastguard Worker // Computes the fused multiple add product of 32-bit floating point numbers.
3145*3f1979aaSAndroid Build Coastguard Worker //
3146*3f1979aaSAndroid Build Coastguard Worker // Return Value
3147*3f1979aaSAndroid Build Coastguard Worker // Multiplies A and B, and adds C to the temporary result before returning it.
3148*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd
_mm_fmadd_ps(__m128 a,__m128 b,__m128 c)3149*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_fmadd_ps(__m128 a, __m128 b, __m128 c)
3150*3f1979aaSAndroid Build Coastguard Worker {
3151*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
3152*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128_f32(vfmaq_f32(vreinterpretq_f32_m128(c),
3153*3f1979aaSAndroid Build Coastguard Worker                                             vreinterpretq_f32_m128(b),
3154*3f1979aaSAndroid Build Coastguard Worker                                             vreinterpretq_f32_m128(a)));
3155*3f1979aaSAndroid Build Coastguard Worker #else
3156*3f1979aaSAndroid Build Coastguard Worker     return _mm_add_ps(_mm_mul_ps(a, b), c);
3157*3f1979aaSAndroid Build Coastguard Worker #endif
3158*3f1979aaSAndroid Build Coastguard Worker }
3159*3f1979aaSAndroid Build Coastguard Worker 
3160*3f1979aaSAndroid Build Coastguard Worker // Alternatively add and subtract packed single-precision (32-bit)
3161*3f1979aaSAndroid Build Coastguard Worker // floating-point elements in a to/from packed elements in b, and store the
3162*3f1979aaSAndroid Build Coastguard Worker // results in dst.
3163*3f1979aaSAndroid Build Coastguard Worker //
3164*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=addsub_ps
_mm_addsub_ps(__m128 a,__m128 b)3165*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_addsub_ps(__m128 a, __m128 b)
3166*3f1979aaSAndroid Build Coastguard Worker {
3167*3f1979aaSAndroid Build Coastguard Worker     __m128 mask = {-1.0f, 1.0f, -1.0f, 1.0f};
3168*3f1979aaSAndroid Build Coastguard Worker     return _mm_fmadd_ps(b, mask, a);
3169*3f1979aaSAndroid Build Coastguard Worker }
3170*3f1979aaSAndroid Build Coastguard Worker 
3171*3f1979aaSAndroid Build Coastguard Worker // Compute the absolute differences of packed unsigned 8-bit integers in a and
3172*3f1979aaSAndroid Build Coastguard Worker // b, then horizontally sum each consecutive 8 differences to produce two
3173*3f1979aaSAndroid Build Coastguard Worker // unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
3174*3f1979aaSAndroid Build Coastguard Worker // 16 bits of 64-bit elements in dst.
3175*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sad_epu8
_mm_sad_epu8(__m128i a,__m128i b)3176*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_sad_epu8(__m128i a, __m128i b)
3177*3f1979aaSAndroid Build Coastguard Worker {
3178*3f1979aaSAndroid Build Coastguard Worker     uint16x8_t t = vpaddlq_u8(vabdq_u8((uint8x16_t) a, (uint8x16_t) b));
3179*3f1979aaSAndroid Build Coastguard Worker     uint16_t r0 = t[0] + t[1] + t[2] + t[3];
3180*3f1979aaSAndroid Build Coastguard Worker     uint16_t r4 = t[4] + t[5] + t[6] + t[7];
3181*3f1979aaSAndroid Build Coastguard Worker     uint16x8_t r = vsetq_lane_u16(r0, vdupq_n_u16(0), 0);
3182*3f1979aaSAndroid Build Coastguard Worker     return (__m128i) vsetq_lane_u16(r4, r, 4);
3183*3f1979aaSAndroid Build Coastguard Worker }
3184*3f1979aaSAndroid Build Coastguard Worker 
3185*3f1979aaSAndroid Build Coastguard Worker // Compute the absolute differences of packed unsigned 8-bit integers in a and
3186*3f1979aaSAndroid Build Coastguard Worker // b, then horizontally sum each consecutive 8 differences to produce four
3187*3f1979aaSAndroid Build Coastguard Worker // unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
3188*3f1979aaSAndroid Build Coastguard Worker // 16 bits of dst.
3189*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sad_pu8
_mm_sad_pu8(__m64 a,__m64 b)3190*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m64 _mm_sad_pu8(__m64 a, __m64 b)
3191*3f1979aaSAndroid Build Coastguard Worker {
3192*3f1979aaSAndroid Build Coastguard Worker     uint16x4_t t =
3193*3f1979aaSAndroid Build Coastguard Worker         vpaddl_u8(vabd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
3194*3f1979aaSAndroid Build Coastguard Worker     uint16_t r0 = t[0] + t[1] + t[2] + t[3];
3195*3f1979aaSAndroid Build Coastguard Worker     return vreinterpret_m64_u16(vset_lane_u16(r0, vdup_n_u16(0), 0));
3196*3f1979aaSAndroid Build Coastguard Worker }
3197*3f1979aaSAndroid Build Coastguard Worker 
// Compute the absolute differences of packed unsigned 8-bit integers in a and
// b, then horizontally sum the 8 differences to produce a single unsigned
// 16-bit integer, and store it in the low 16 bits of dst. The remaining bits
// of dst are zeroed.
3202*3f1979aaSAndroid Build Coastguard Worker //
3203*3f1979aaSAndroid Build Coastguard Worker //   FOR j := 0 to 7
3204*3f1979aaSAndroid Build Coastguard Worker //      i := j*8
3205*3f1979aaSAndroid Build Coastguard Worker //      tmp[i+7:i] := ABS(a[i+7:i] - b[i+7:i])
3206*3f1979aaSAndroid Build Coastguard Worker //   ENDFOR
//   dst[15:0] := tmp[7:0] + tmp[15:8] + tmp[23:16] + tmp[31:24] + tmp[39:32] +
//                tmp[47:40] + tmp[55:48] + tmp[63:56]
//   dst[63:16] := 0
3209*3f1979aaSAndroid Build Coastguard Worker //
3210*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_psadbw
// Alias: forwards both arguments unchanged to _mm_sad_pu8.
3211*3f1979aaSAndroid Build Coastguard Worker #define _m_psadbw(a, b) _mm_sad_pu8(a, b)
3212*3f1979aaSAndroid Build Coastguard Worker 
3213*3f1979aaSAndroid Build Coastguard Worker // Divides the four single-precision, floating-point values of a and b.
3214*3f1979aaSAndroid Build Coastguard Worker //
3215*3f1979aaSAndroid Build Coastguard Worker //   r0 := a0 / b0
3216*3f1979aaSAndroid Build Coastguard Worker //   r1 := a1 / b1
3217*3f1979aaSAndroid Build Coastguard Worker //   r2 := a2 / b2
3218*3f1979aaSAndroid Build Coastguard Worker //   r3 := a3 / b3
3219*3f1979aaSAndroid Build Coastguard Worker //
3220*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/edaw8147(v=vs.100).aspx
_mm_div_ps(__m128 a,__m128 b)3221*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b)
3222*3f1979aaSAndroid Build Coastguard Worker {
3223*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
3224*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128_f32(
3225*3f1979aaSAndroid Build Coastguard Worker         vdivq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
3226*3f1979aaSAndroid Build Coastguard Worker #else
3227*3f1979aaSAndroid Build Coastguard Worker     float32x4_t recip0 = vrecpeq_f32(vreinterpretq_f32_m128(b));
3228*3f1979aaSAndroid Build Coastguard Worker     float32x4_t recip1 =
3229*3f1979aaSAndroid Build Coastguard Worker         vmulq_f32(recip0, vrecpsq_f32(recip0, vreinterpretq_f32_m128(b)));
3230*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(a), recip1));
3231*3f1979aaSAndroid Build Coastguard Worker #endif
3232*3f1979aaSAndroid Build Coastguard Worker }
3233*3f1979aaSAndroid Build Coastguard Worker 
3234*3f1979aaSAndroid Build Coastguard Worker // Divides the scalar single-precision floating point value of a by b.
3235*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/4y73xa49(v=vs.100).aspx
_mm_div_ss(__m128 a,__m128 b)3236*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b)
3237*3f1979aaSAndroid Build Coastguard Worker {
3238*3f1979aaSAndroid Build Coastguard Worker     float32_t value =
3239*3f1979aaSAndroid Build Coastguard Worker         vgetq_lane_f32(vreinterpretq_f32_m128(_mm_div_ps(a, b)), 0);
3240*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128_f32(
3241*3f1979aaSAndroid Build Coastguard Worker         vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
3242*3f1979aaSAndroid Build Coastguard Worker }
3243*3f1979aaSAndroid Build Coastguard Worker 
3244*3f1979aaSAndroid Build Coastguard Worker // Computes the approximations of reciprocals of the four single-precision,
3245*3f1979aaSAndroid Build Coastguard Worker // floating-point values of a.
3246*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/vstudio/796k1tty(v=vs.100).aspx
_mm_rcp_ps(__m128 in)3247*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_rcp_ps(__m128 in)
3248*3f1979aaSAndroid Build Coastguard Worker {
3249*3f1979aaSAndroid Build Coastguard Worker     float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(in));
3250*3f1979aaSAndroid Build Coastguard Worker     recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
3251*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128_f32(recip);
3252*3f1979aaSAndroid Build Coastguard Worker }
3253*3f1979aaSAndroid Build Coastguard Worker 
3254*3f1979aaSAndroid Build Coastguard Worker // Compute the approximate reciprocal of the lower single-precision (32-bit)
3255*3f1979aaSAndroid Build Coastguard Worker // floating-point element in a, store the result in the lower element of dst,
3256*3f1979aaSAndroid Build Coastguard Worker // and copy the upper 3 packed elements from a to the upper elements of dst. The
3257*3f1979aaSAndroid Build Coastguard Worker // maximum relative error for this approximation is less than 1.5*2^-12.
3258*3f1979aaSAndroid Build Coastguard Worker //
3259*3f1979aaSAndroid Build Coastguard Worker //   dst[31:0] := (1.0 / a[31:0])
3260*3f1979aaSAndroid Build Coastguard Worker //   dst[127:32] := a[127:32]
3261*3f1979aaSAndroid Build Coastguard Worker //
3262*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ss
_mm_rcp_ss(__m128 a)3263*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_rcp_ss(__m128 a)
3264*3f1979aaSAndroid Build Coastguard Worker {
3265*3f1979aaSAndroid Build Coastguard Worker     return _mm_move_ss(a, _mm_rcp_ps(a));
3266*3f1979aaSAndroid Build Coastguard Worker }
3267*3f1979aaSAndroid Build Coastguard Worker 
3268*3f1979aaSAndroid Build Coastguard Worker // Computes the approximations of square roots of the four single-precision,
3269*3f1979aaSAndroid Build Coastguard Worker // floating-point values of a. First computes reciprocal square roots and then
3270*3f1979aaSAndroid Build Coastguard Worker // reciprocals of the four values.
3271*3f1979aaSAndroid Build Coastguard Worker //
3272*3f1979aaSAndroid Build Coastguard Worker //   r0 := sqrt(a0)
3273*3f1979aaSAndroid Build Coastguard Worker //   r1 := sqrt(a1)
3274*3f1979aaSAndroid Build Coastguard Worker //   r2 := sqrt(a2)
3275*3f1979aaSAndroid Build Coastguard Worker //   r3 := sqrt(a3)
3276*3f1979aaSAndroid Build Coastguard Worker //
3277*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/vstudio/8z67bwwk(v=vs.100).aspx
_mm_sqrt_ps(__m128 in)3278*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in)
3279*3f1979aaSAndroid Build Coastguard Worker {
3280*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
3281*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128_f32(vsqrtq_f32(vreinterpretq_f32_m128(in)));
3282*3f1979aaSAndroid Build Coastguard Worker #else
3283*3f1979aaSAndroid Build Coastguard Worker     float32x4_t recipsq = vrsqrteq_f32(vreinterpretq_f32_m128(in));
3284*3f1979aaSAndroid Build Coastguard Worker     float32x4_t sq = vrecpeq_f32(recipsq);
3285*3f1979aaSAndroid Build Coastguard Worker     // ??? use step versions of both sqrt and recip for better accuracy?
3286*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128_f32(sq);
3287*3f1979aaSAndroid Build Coastguard Worker #endif
3288*3f1979aaSAndroid Build Coastguard Worker }
3289*3f1979aaSAndroid Build Coastguard Worker 
3290*3f1979aaSAndroid Build Coastguard Worker // Computes the approximation of the square root of the scalar single-precision
3291*3f1979aaSAndroid Build Coastguard Worker // floating point value of in.
3292*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/ahfsc22d(v=vs.100).aspx
_mm_sqrt_ss(__m128 in)3293*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_sqrt_ss(__m128 in)
3294*3f1979aaSAndroid Build Coastguard Worker {
3295*3f1979aaSAndroid Build Coastguard Worker     float32_t value =
3296*3f1979aaSAndroid Build Coastguard Worker         vgetq_lane_f32(vreinterpretq_f32_m128(_mm_sqrt_ps(in)), 0);
3297*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128_f32(
3298*3f1979aaSAndroid Build Coastguard Worker         vsetq_lane_f32(value, vreinterpretq_f32_m128(in), 0));
3299*3f1979aaSAndroid Build Coastguard Worker }
3300*3f1979aaSAndroid Build Coastguard Worker 
3301*3f1979aaSAndroid Build Coastguard Worker // Computes the approximations of the reciprocal square roots of the four
3302*3f1979aaSAndroid Build Coastguard Worker // single-precision floating point values of in.
3303*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/22hfsh53(v=vs.100).aspx
_mm_rsqrt_ps(__m128 in)3304*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in)
3305*3f1979aaSAndroid Build Coastguard Worker {
3306*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128_f32(vrsqrteq_f32(vreinterpretq_f32_m128(in)));
3307*3f1979aaSAndroid Build Coastguard Worker }
3308*3f1979aaSAndroid Build Coastguard Worker 
3309*3f1979aaSAndroid Build Coastguard Worker // Compute the approximate reciprocal square root of the lower single-precision
3310*3f1979aaSAndroid Build Coastguard Worker // (32-bit) floating-point element in a, store the result in the lower element
3311*3f1979aaSAndroid Build Coastguard Worker // of dst, and copy the upper 3 packed elements from a to the upper elements of
3312*3f1979aaSAndroid Build Coastguard Worker // dst.
3313*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt_ss
_mm_rsqrt_ss(__m128 in)3314*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_rsqrt_ss(__m128 in)
3315*3f1979aaSAndroid Build Coastguard Worker {
3316*3f1979aaSAndroid Build Coastguard Worker     return vsetq_lane_f32(vgetq_lane_f32(_mm_rsqrt_ps(in), 0), in, 0);
3317*3f1979aaSAndroid Build Coastguard Worker }
3318*3f1979aaSAndroid Build Coastguard Worker 
3319*3f1979aaSAndroid Build Coastguard Worker // Compare packed signed 16-bit integers in a and b, and store packed maximum
3320*3f1979aaSAndroid Build Coastguard Worker // values in dst.
3321*3f1979aaSAndroid Build Coastguard Worker //
3322*3f1979aaSAndroid Build Coastguard Worker //   FOR j := 0 to 3
3323*3f1979aaSAndroid Build Coastguard Worker //      i := j*16
3324*3f1979aaSAndroid Build Coastguard Worker //      dst[i+15:i] := MAX(a[i+15:i], b[i+15:i])
3325*3f1979aaSAndroid Build Coastguard Worker //   ENDFOR
3326*3f1979aaSAndroid Build Coastguard Worker //
3327*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pi16
_mm_max_pi16(__m64 a,__m64 b)3328*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m64 _mm_max_pi16(__m64 a, __m64 b)
3329*3f1979aaSAndroid Build Coastguard Worker {
3330*3f1979aaSAndroid Build Coastguard Worker     return vreinterpret_m64_s16(
3331*3f1979aaSAndroid Build Coastguard Worker         vmax_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
3332*3f1979aaSAndroid Build Coastguard Worker }
3333*3f1979aaSAndroid Build Coastguard Worker 
// Alias for _mm_max_pi16: lane-wise maximum of the packed signed 16-bit
// integers in a and b. Both arguments are forwarded unchanged.
//
//   FOR j := 0 to 3
//      i := j*16
//      dst[i+15:i] := MAX(a[i+15:i], b[i+15:i])
//   ENDFOR
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pi16
#define _m_pmaxsw(a, b) _mm_max_pi16(a, b)
3344*3f1979aaSAndroid Build Coastguard Worker 
3345*3f1979aaSAndroid Build Coastguard Worker // Computes the maximums of the four single-precision, floating-point values of
3346*3f1979aaSAndroid Build Coastguard Worker // a and b.
3347*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/vstudio/ff5d607a(v=vs.100).aspx
_mm_max_ps(__m128 a,__m128 b)3348*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b)
3349*3f1979aaSAndroid Build Coastguard Worker {
3350*3f1979aaSAndroid Build Coastguard Worker #if SSE2NEON_PRECISE_MINMAX
3351*3f1979aaSAndroid Build Coastguard Worker     float32x4_t _a = vreinterpretq_f32_m128(a);
3352*3f1979aaSAndroid Build Coastguard Worker     float32x4_t _b = vreinterpretq_f32_m128(b);
3353*3f1979aaSAndroid Build Coastguard Worker     return vbslq_f32(vcltq_f32(_b, _a), _a, _b);
3354*3f1979aaSAndroid Build Coastguard Worker #else
3355*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128_f32(
3356*3f1979aaSAndroid Build Coastguard Worker         vmaxq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
3357*3f1979aaSAndroid Build Coastguard Worker #endif
3358*3f1979aaSAndroid Build Coastguard Worker }
3359*3f1979aaSAndroid Build Coastguard Worker 
3360*3f1979aaSAndroid Build Coastguard Worker // Compare packed unsigned 8-bit integers in a and b, and store packed maximum
3361*3f1979aaSAndroid Build Coastguard Worker // values in dst.
3362*3f1979aaSAndroid Build Coastguard Worker //
3363*3f1979aaSAndroid Build Coastguard Worker //   FOR j := 0 to 7
3364*3f1979aaSAndroid Build Coastguard Worker //      i := j*8
3365*3f1979aaSAndroid Build Coastguard Worker //      dst[i+7:i] := MAX(a[i+7:i], b[i+7:i])
3366*3f1979aaSAndroid Build Coastguard Worker //   ENDFOR
3367*3f1979aaSAndroid Build Coastguard Worker //
3368*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pu8
_mm_max_pu8(__m64 a,__m64 b)3369*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m64 _mm_max_pu8(__m64 a, __m64 b)
3370*3f1979aaSAndroid Build Coastguard Worker {
3371*3f1979aaSAndroid Build Coastguard Worker     return vreinterpret_m64_u8(
3372*3f1979aaSAndroid Build Coastguard Worker         vmax_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
3373*3f1979aaSAndroid Build Coastguard Worker }
3374*3f1979aaSAndroid Build Coastguard Worker 
// Alias for _mm_max_pu8: lane-wise maximum of the packed unsigned 8-bit
// integers in a and b. Both arguments are forwarded unchanged.
//
//   FOR j := 0 to 7
//      i := j*8
//      dst[i+7:i] := MAX(a[i+7:i], b[i+7:i])
//   ENDFOR
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pu8
#define _m_pmaxub(a, b) _mm_max_pu8(a, b)
3385*3f1979aaSAndroid Build Coastguard Worker 
3386*3f1979aaSAndroid Build Coastguard Worker // Compare packed signed 16-bit integers in a and b, and store packed minimum
3387*3f1979aaSAndroid Build Coastguard Worker // values in dst.
3388*3f1979aaSAndroid Build Coastguard Worker //
3389*3f1979aaSAndroid Build Coastguard Worker //   FOR j := 0 to 3
3390*3f1979aaSAndroid Build Coastguard Worker //      i := j*16
3391*3f1979aaSAndroid Build Coastguard Worker //      dst[i+15:i] := MIN(a[i+15:i], b[i+15:i])
3392*3f1979aaSAndroid Build Coastguard Worker //   ENDFOR
3393*3f1979aaSAndroid Build Coastguard Worker //
3394*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pi16
_mm_min_pi16(__m64 a,__m64 b)3395*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m64 _mm_min_pi16(__m64 a, __m64 b)
3396*3f1979aaSAndroid Build Coastguard Worker {
3397*3f1979aaSAndroid Build Coastguard Worker     return vreinterpret_m64_s16(
3398*3f1979aaSAndroid Build Coastguard Worker         vmin_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
3399*3f1979aaSAndroid Build Coastguard Worker }
3400*3f1979aaSAndroid Build Coastguard Worker 
// Alias for _mm_min_pi16: lane-wise minimum of the packed signed 16-bit
// integers in a and b. Both arguments are forwarded unchanged.
//
//   FOR j := 0 to 3
//      i := j*16
//      dst[i+15:i] := MIN(a[i+15:i], b[i+15:i])
//   ENDFOR
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pi16
#define _m_pminsw(a, b) _mm_min_pi16(a, b)
3411*3f1979aaSAndroid Build Coastguard Worker 
3412*3f1979aaSAndroid Build Coastguard Worker // Computes the minima of the four single-precision, floating-point values of a
3413*3f1979aaSAndroid Build Coastguard Worker // and b.
3414*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/vstudio/wh13kadz(v=vs.100).aspx
_mm_min_ps(__m128 a,__m128 b)3415*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b)
3416*3f1979aaSAndroid Build Coastguard Worker {
3417*3f1979aaSAndroid Build Coastguard Worker #if SSE2NEON_PRECISE_MINMAX
3418*3f1979aaSAndroid Build Coastguard Worker     float32x4_t _a = vreinterpretq_f32_m128(a);
3419*3f1979aaSAndroid Build Coastguard Worker     float32x4_t _b = vreinterpretq_f32_m128(b);
3420*3f1979aaSAndroid Build Coastguard Worker     return vbslq_f32(vcltq_f32(_a, _b), _a, _b);
3421*3f1979aaSAndroid Build Coastguard Worker #else
3422*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128_f32(
3423*3f1979aaSAndroid Build Coastguard Worker         vminq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
3424*3f1979aaSAndroid Build Coastguard Worker #endif
3425*3f1979aaSAndroid Build Coastguard Worker }
3426*3f1979aaSAndroid Build Coastguard Worker 
3427*3f1979aaSAndroid Build Coastguard Worker // Compare packed unsigned 8-bit integers in a and b, and store packed minimum
3428*3f1979aaSAndroid Build Coastguard Worker // values in dst.
3429*3f1979aaSAndroid Build Coastguard Worker //
3430*3f1979aaSAndroid Build Coastguard Worker //   FOR j := 0 to 7
3431*3f1979aaSAndroid Build Coastguard Worker //      i := j*8
3432*3f1979aaSAndroid Build Coastguard Worker //      dst[i+7:i] := MIN(a[i+7:i], b[i+7:i])
3433*3f1979aaSAndroid Build Coastguard Worker //   ENDFOR
3434*3f1979aaSAndroid Build Coastguard Worker //
3435*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pu8
_mm_min_pu8(__m64 a,__m64 b)3436*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m64 _mm_min_pu8(__m64 a, __m64 b)
3437*3f1979aaSAndroid Build Coastguard Worker {
3438*3f1979aaSAndroid Build Coastguard Worker     return vreinterpret_m64_u8(
3439*3f1979aaSAndroid Build Coastguard Worker         vmin_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
3440*3f1979aaSAndroid Build Coastguard Worker }
3441*3f1979aaSAndroid Build Coastguard Worker 
// Alias for _mm_min_pu8: lane-wise minimum of the packed unsigned 8-bit
// integers in a and b. Both arguments are forwarded unchanged.
//
//   FOR j := 0 to 7
//      i := j*8
//      dst[i+7:i] := MIN(a[i+7:i], b[i+7:i])
//   ENDFOR
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pu8
#define _m_pminub(a, b) _mm_min_pu8(a, b)
3452*3f1979aaSAndroid Build Coastguard Worker 
3453*3f1979aaSAndroid Build Coastguard Worker // Computes the maximum of the two lower scalar single-precision floating point
3454*3f1979aaSAndroid Build Coastguard Worker // values of a and b.
3455*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/s6db5esz(v=vs.100).aspx
_mm_max_ss(__m128 a,__m128 b)3456*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_max_ss(__m128 a, __m128 b)
3457*3f1979aaSAndroid Build Coastguard Worker {
3458*3f1979aaSAndroid Build Coastguard Worker     float32_t value = vgetq_lane_f32(_mm_max_ps(a, b), 0);
3459*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128_f32(
3460*3f1979aaSAndroid Build Coastguard Worker         vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
3461*3f1979aaSAndroid Build Coastguard Worker }
3462*3f1979aaSAndroid Build Coastguard Worker 
3463*3f1979aaSAndroid Build Coastguard Worker // Computes the minimum of the two lower scalar single-precision floating point
3464*3f1979aaSAndroid Build Coastguard Worker // values of a and b.
3465*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/0a9y7xaa(v=vs.100).aspx
_mm_min_ss(__m128 a,__m128 b)3466*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_min_ss(__m128 a, __m128 b)
3467*3f1979aaSAndroid Build Coastguard Worker {
3468*3f1979aaSAndroid Build Coastguard Worker     float32_t value = vgetq_lane_f32(_mm_min_ps(a, b), 0);
3469*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128_f32(
3470*3f1979aaSAndroid Build Coastguard Worker         vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
3471*3f1979aaSAndroid Build Coastguard Worker }
3472*3f1979aaSAndroid Build Coastguard Worker 
3473*3f1979aaSAndroid Build Coastguard Worker // Computes the pairwise maxima of the 16 unsigned 8-bit integers from a and the
3474*3f1979aaSAndroid Build Coastguard Worker // 16 unsigned 8-bit integers from b.
3475*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/st6634za(v=vs.100).aspx
_mm_max_epu8(__m128i a,__m128i b)3476*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b)
3477*3f1979aaSAndroid Build Coastguard Worker {
3478*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_u8(
3479*3f1979aaSAndroid Build Coastguard Worker         vmaxq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
3480*3f1979aaSAndroid Build Coastguard Worker }
3481*3f1979aaSAndroid Build Coastguard Worker 
3482*3f1979aaSAndroid Build Coastguard Worker // Computes the pairwise minima of the 16 unsigned 8-bit integers from a and the
3483*3f1979aaSAndroid Build Coastguard Worker // 16 unsigned 8-bit integers from b.
3484*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/ko-kr/library/17k8cf58(v=vs.100).aspxx
_mm_min_epu8(__m128i a,__m128i b)3485*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_min_epu8(__m128i a, __m128i b)
3486*3f1979aaSAndroid Build Coastguard Worker {
3487*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_u8(
3488*3f1979aaSAndroid Build Coastguard Worker         vminq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
3489*3f1979aaSAndroid Build Coastguard Worker }
3490*3f1979aaSAndroid Build Coastguard Worker 
3491*3f1979aaSAndroid Build Coastguard Worker // Computes the pairwise minima of the 8 signed 16-bit integers from a and the 8
3492*3f1979aaSAndroid Build Coastguard Worker // signed 16-bit integers from b.
3493*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/vstudio/6te997ew(v=vs.100).aspx
_mm_min_epi16(__m128i a,__m128i b)3494*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_min_epi16(__m128i a, __m128i b)
3495*3f1979aaSAndroid Build Coastguard Worker {
3496*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_s16(
3497*3f1979aaSAndroid Build Coastguard Worker         vminq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
3498*3f1979aaSAndroid Build Coastguard Worker }
3499*3f1979aaSAndroid Build Coastguard Worker 
3500*3f1979aaSAndroid Build Coastguard Worker // Compare packed signed 8-bit integers in a and b, and store packed maximum
3501*3f1979aaSAndroid Build Coastguard Worker // values in dst.
3502*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epi8
_mm_max_epi8(__m128i a,__m128i b)3503*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_max_epi8(__m128i a, __m128i b)
3504*3f1979aaSAndroid Build Coastguard Worker {
3505*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_s8(
3506*3f1979aaSAndroid Build Coastguard Worker         vmaxq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
3507*3f1979aaSAndroid Build Coastguard Worker }
3508*3f1979aaSAndroid Build Coastguard Worker 
3509*3f1979aaSAndroid Build Coastguard Worker // Computes the pairwise maxima of the 8 signed 16-bit integers from a and the 8
3510*3f1979aaSAndroid Build Coastguard Worker // signed 16-bit integers from b.
3511*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/LIBRary/3x060h7c(v=vs.100).aspx
_mm_max_epi16(__m128i a,__m128i b)3512*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_max_epi16(__m128i a, __m128i b)
3513*3f1979aaSAndroid Build Coastguard Worker {
3514*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_s16(
3515*3f1979aaSAndroid Build Coastguard Worker         vmaxq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
3516*3f1979aaSAndroid Build Coastguard Worker }
3517*3f1979aaSAndroid Build Coastguard Worker 
3518*3f1979aaSAndroid Build Coastguard Worker // epi versions of min/max
3519*3f1979aaSAndroid Build Coastguard Worker // Computes the pariwise maximums of the four signed 32-bit integer values of a
3520*3f1979aaSAndroid Build Coastguard Worker // and b.
3521*3f1979aaSAndroid Build Coastguard Worker //
3522*3f1979aaSAndroid Build Coastguard Worker // A 128-bit parameter that can be defined with the following equations:
3523*3f1979aaSAndroid Build Coastguard Worker //   r0 := (a0 > b0) ? a0 : b0
3524*3f1979aaSAndroid Build Coastguard Worker //   r1 := (a1 > b1) ? a1 : b1
3525*3f1979aaSAndroid Build Coastguard Worker //   r2 := (a2 > b2) ? a2 : b2
3526*3f1979aaSAndroid Build Coastguard Worker //   r3 := (a3 > b3) ? a3 : b3
3527*3f1979aaSAndroid Build Coastguard Worker //
3528*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/vstudio/bb514055(v=vs.100).aspx
_mm_max_epi32(__m128i a,__m128i b)3529*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_max_epi32(__m128i a, __m128i b)
3530*3f1979aaSAndroid Build Coastguard Worker {
3531*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_s32(
3532*3f1979aaSAndroid Build Coastguard Worker         vmaxq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
3533*3f1979aaSAndroid Build Coastguard Worker }
3534*3f1979aaSAndroid Build Coastguard Worker 
3535*3f1979aaSAndroid Build Coastguard Worker // Computes the pariwise minima of the four signed 32-bit integer values of a
3536*3f1979aaSAndroid Build Coastguard Worker // and b.
3537*3f1979aaSAndroid Build Coastguard Worker //
3538*3f1979aaSAndroid Build Coastguard Worker // A 128-bit parameter that can be defined with the following equations:
3539*3f1979aaSAndroid Build Coastguard Worker //   r0 := (a0 < b0) ? a0 : b0
3540*3f1979aaSAndroid Build Coastguard Worker //   r1 := (a1 < b1) ? a1 : b1
3541*3f1979aaSAndroid Build Coastguard Worker //   r2 := (a2 < b2) ? a2 : b2
3542*3f1979aaSAndroid Build Coastguard Worker //   r3 := (a3 < b3) ? a3 : b3
3543*3f1979aaSAndroid Build Coastguard Worker //
3544*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/vstudio/bb531476(v=vs.100).aspx
_mm_min_epi32(__m128i a,__m128i b)3545*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_min_epi32(__m128i a, __m128i b)
3546*3f1979aaSAndroid Build Coastguard Worker {
3547*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_s32(
3548*3f1979aaSAndroid Build Coastguard Worker         vminq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
3549*3f1979aaSAndroid Build Coastguard Worker }
3550*3f1979aaSAndroid Build Coastguard Worker 
3551*3f1979aaSAndroid Build Coastguard Worker // Compare packed unsigned 32-bit integers in a and b, and store packed maximum
3552*3f1979aaSAndroid Build Coastguard Worker // values in dst.
3553*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu32
_mm_max_epu32(__m128i a,__m128i b)3554*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_max_epu32(__m128i a, __m128i b)
3555*3f1979aaSAndroid Build Coastguard Worker {
3556*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_u32(
3557*3f1979aaSAndroid Build Coastguard Worker         vmaxq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));
3558*3f1979aaSAndroid Build Coastguard Worker }
3559*3f1979aaSAndroid Build Coastguard Worker 
3560*3f1979aaSAndroid Build Coastguard Worker // Compare packed unsigned 32-bit integers in a and b, and store packed minimum
3561*3f1979aaSAndroid Build Coastguard Worker // values in dst.
3562*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu32
_mm_min_epu32(__m128i a,__m128i b)3563*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_min_epu32(__m128i a, __m128i b)
3564*3f1979aaSAndroid Build Coastguard Worker {
3565*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_u32(
3566*3f1979aaSAndroid Build Coastguard Worker         vminq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));
3567*3f1979aaSAndroid Build Coastguard Worker }
3568*3f1979aaSAndroid Build Coastguard Worker 
3569*3f1979aaSAndroid Build Coastguard Worker // Multiply the packed unsigned 16-bit integers in a and b, producing
3570*3f1979aaSAndroid Build Coastguard Worker // intermediate 32-bit integers, and store the high 16 bits of the intermediate
3571*3f1979aaSAndroid Build Coastguard Worker // integers in dst.
3572*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhi_pu16
_mm_mulhi_pu16(__m64 a,__m64 b)3573*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m64 _mm_mulhi_pu16(__m64 a, __m64 b)
3574*3f1979aaSAndroid Build Coastguard Worker {
3575*3f1979aaSAndroid Build Coastguard Worker     return vreinterpret_m64_u16(vshrn_n_u32(
3576*3f1979aaSAndroid Build Coastguard Worker         vmull_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)), 16));
3577*3f1979aaSAndroid Build Coastguard Worker }
3578*3f1979aaSAndroid Build Coastguard Worker 
3579*3f1979aaSAndroid Build Coastguard Worker // Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit
3580*3f1979aaSAndroid Build Coastguard Worker // integers from b.
3581*3f1979aaSAndroid Build Coastguard Worker //
3582*3f1979aaSAndroid Build Coastguard Worker //   r0 := (a0 * b0)[31:16]
3583*3f1979aaSAndroid Build Coastguard Worker //   r1 := (a1 * b1)[31:16]
3584*3f1979aaSAndroid Build Coastguard Worker //   ...
3585*3f1979aaSAndroid Build Coastguard Worker //   r7 := (a7 * b7)[31:16]
3586*3f1979aaSAndroid Build Coastguard Worker //
3587*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/vstudio/59hddw1d(v=vs.100).aspx
_mm_mulhi_epi16(__m128i a,__m128i b)3588*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_mulhi_epi16(__m128i a, __m128i b)
3589*3f1979aaSAndroid Build Coastguard Worker {
3590*3f1979aaSAndroid Build Coastguard Worker     /* FIXME: issue with large values because of result saturation */
3591*3f1979aaSAndroid Build Coastguard Worker     // int16x8_t ret = vqdmulhq_s16(vreinterpretq_s16_m128i(a),
3592*3f1979aaSAndroid Build Coastguard Worker     // vreinterpretq_s16_m128i(b)); /* =2*a*b */ return
3593*3f1979aaSAndroid Build Coastguard Worker     // vreinterpretq_m128i_s16(vshrq_n_s16(ret, 1));
3594*3f1979aaSAndroid Build Coastguard Worker     int16x4_t a3210 = vget_low_s16(vreinterpretq_s16_m128i(a));
3595*3f1979aaSAndroid Build Coastguard Worker     int16x4_t b3210 = vget_low_s16(vreinterpretq_s16_m128i(b));
3596*3f1979aaSAndroid Build Coastguard Worker     int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */
3597*3f1979aaSAndroid Build Coastguard Worker     int16x4_t a7654 = vget_high_s16(vreinterpretq_s16_m128i(a));
3598*3f1979aaSAndroid Build Coastguard Worker     int16x4_t b7654 = vget_high_s16(vreinterpretq_s16_m128i(b));
3599*3f1979aaSAndroid Build Coastguard Worker     int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */
3600*3f1979aaSAndroid Build Coastguard Worker     uint16x8x2_t r =
3601*3f1979aaSAndroid Build Coastguard Worker         vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654));
3602*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_u16(r.val[1]);
3603*3f1979aaSAndroid Build Coastguard Worker }
3604*3f1979aaSAndroid Build Coastguard Worker 
3605*3f1979aaSAndroid Build Coastguard Worker // Computes pairwise add of each argument as single-precision, floating-point
3606*3f1979aaSAndroid Build Coastguard Worker // values a and b.
3607*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/yd9wecaa.aspx
_mm_hadd_ps(__m128 a,__m128 b)3608*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b)
3609*3f1979aaSAndroid Build Coastguard Worker {
3610*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
3611*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128_f32(
3612*3f1979aaSAndroid Build Coastguard Worker         vpaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
3613*3f1979aaSAndroid Build Coastguard Worker #else
3614*3f1979aaSAndroid Build Coastguard Worker     float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
3615*3f1979aaSAndroid Build Coastguard Worker     float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
3616*3f1979aaSAndroid Build Coastguard Worker     float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
3617*3f1979aaSAndroid Build Coastguard Worker     float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
3618*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128_f32(
3619*3f1979aaSAndroid Build Coastguard Worker         vcombine_f32(vpadd_f32(a10, a32), vpadd_f32(b10, b32)));
3620*3f1979aaSAndroid Build Coastguard Worker #endif
3621*3f1979aaSAndroid Build Coastguard Worker }
3622*3f1979aaSAndroid Build Coastguard Worker 
3623*3f1979aaSAndroid Build Coastguard Worker // Computes pairwise add of each argument as a 16-bit signed or unsigned integer
3624*3f1979aaSAndroid Build Coastguard Worker // values a and b.
_mm_hadd_epi16(__m128i _a,__m128i _b)3625*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_hadd_epi16(__m128i _a, __m128i _b)
3626*3f1979aaSAndroid Build Coastguard Worker {
3627*3f1979aaSAndroid Build Coastguard Worker     int16x8_t a = vreinterpretq_s16_m128i(_a);
3628*3f1979aaSAndroid Build Coastguard Worker     int16x8_t b = vreinterpretq_s16_m128i(_b);
3629*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
3630*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_s16(vpaddq_s16(a, b));
3631*3f1979aaSAndroid Build Coastguard Worker #else
3632*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_s16(
3633*3f1979aaSAndroid Build Coastguard Worker         vcombine_s16(vpadd_s16(vget_low_s16(a), vget_high_s16(a)),
3634*3f1979aaSAndroid Build Coastguard Worker                      vpadd_s16(vget_low_s16(b), vget_high_s16(b))));
3635*3f1979aaSAndroid Build Coastguard Worker #endif
3636*3f1979aaSAndroid Build Coastguard Worker }
3637*3f1979aaSAndroid Build Coastguard Worker 
3638*3f1979aaSAndroid Build Coastguard Worker // Horizontally substract adjacent pairs of single-precision (32-bit)
3639*3f1979aaSAndroid Build Coastguard Worker // floating-point elements in a and b, and pack the results in dst.
3640*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_ps
_mm_hsub_ps(__m128 _a,__m128 _b)3641*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_hsub_ps(__m128 _a, __m128 _b)
3642*3f1979aaSAndroid Build Coastguard Worker {
3643*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
3644*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128_f32(vsubq_f32(
3645*3f1979aaSAndroid Build Coastguard Worker         vuzp1q_f32(vreinterpretq_f32_m128(_a), vreinterpretq_f32_m128(_b)),
3646*3f1979aaSAndroid Build Coastguard Worker         vuzp2q_f32(vreinterpretq_f32_m128(_a), vreinterpretq_f32_m128(_b))));
3647*3f1979aaSAndroid Build Coastguard Worker #else
3648*3f1979aaSAndroid Build Coastguard Worker     float32x4x2_t c =
3649*3f1979aaSAndroid Build Coastguard Worker         vuzpq_f32(vreinterpretq_f32_m128(_a), vreinterpretq_f32_m128(_b));
3650*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128_f32(vsubq_f32(c.val[0], c.val[1]));
3651*3f1979aaSAndroid Build Coastguard Worker #endif
3652*3f1979aaSAndroid Build Coastguard Worker }
3653*3f1979aaSAndroid Build Coastguard Worker 
3654*3f1979aaSAndroid Build Coastguard Worker // Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the
3655*3f1979aaSAndroid Build Coastguard Worker // signed 16-bit results in dst.
3656*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pi16
_mm_hadd_pi16(__m64 a,__m64 b)3657*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m64 _mm_hadd_pi16(__m64 a, __m64 b)
3658*3f1979aaSAndroid Build Coastguard Worker {
3659*3f1979aaSAndroid Build Coastguard Worker     return vreinterpret_m64_s16(
3660*3f1979aaSAndroid Build Coastguard Worker         vpadd_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
3661*3f1979aaSAndroid Build Coastguard Worker }
3662*3f1979aaSAndroid Build Coastguard Worker 
3663*3f1979aaSAndroid Build Coastguard Worker // Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the
3664*3f1979aaSAndroid Build Coastguard Worker // signed 32-bit results in dst.
3665*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pi32
_mm_hadd_pi32(__m64 a,__m64 b)3666*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m64 _mm_hadd_pi32(__m64 a, __m64 b)
3667*3f1979aaSAndroid Build Coastguard Worker {
3668*3f1979aaSAndroid Build Coastguard Worker     return vreinterpret_m64_s32(
3669*3f1979aaSAndroid Build Coastguard Worker         vpadd_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b)));
3670*3f1979aaSAndroid Build Coastguard Worker }
3671*3f1979aaSAndroid Build Coastguard Worker 
3672*3f1979aaSAndroid Build Coastguard Worker // Computes pairwise difference of each argument as a 16-bit signed or unsigned
3673*3f1979aaSAndroid Build Coastguard Worker // integer values a and b.
_mm_hsub_epi16(__m128i _a,__m128i _b)3674*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_hsub_epi16(__m128i _a, __m128i _b)
3675*3f1979aaSAndroid Build Coastguard Worker {
3676*3f1979aaSAndroid Build Coastguard Worker     int32x4_t a = vreinterpretq_s32_m128i(_a);
3677*3f1979aaSAndroid Build Coastguard Worker     int32x4_t b = vreinterpretq_s32_m128i(_b);
3678*3f1979aaSAndroid Build Coastguard Worker     // Interleave using vshrn/vmovn
3679*3f1979aaSAndroid Build Coastguard Worker     // [a0|a2|a4|a6|b0|b2|b4|b6]
3680*3f1979aaSAndroid Build Coastguard Worker     // [a1|a3|a5|a7|b1|b3|b5|b7]
3681*3f1979aaSAndroid Build Coastguard Worker     int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
3682*3f1979aaSAndroid Build Coastguard Worker     int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
3683*3f1979aaSAndroid Build Coastguard Worker     // Subtract
3684*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_s16(vsubq_s16(ab0246, ab1357));
3685*3f1979aaSAndroid Build Coastguard Worker }
3686*3f1979aaSAndroid Build Coastguard Worker 
3687*3f1979aaSAndroid Build Coastguard Worker // Computes saturated pairwise sub of each argument as a 16-bit signed
3688*3f1979aaSAndroid Build Coastguard Worker // integer values a and b.
_mm_hadds_epi16(__m128i _a,__m128i _b)3689*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_hadds_epi16(__m128i _a, __m128i _b)
3690*3f1979aaSAndroid Build Coastguard Worker {
3691*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
3692*3f1979aaSAndroid Build Coastguard Worker     int16x8_t a = vreinterpretq_s16_m128i(_a);
3693*3f1979aaSAndroid Build Coastguard Worker     int16x8_t b = vreinterpretq_s16_m128i(_b);
3694*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_s64_s16(
3695*3f1979aaSAndroid Build Coastguard Worker         vqaddq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
3696*3f1979aaSAndroid Build Coastguard Worker #else
3697*3f1979aaSAndroid Build Coastguard Worker     int32x4_t a = vreinterpretq_s32_m128i(_a);
3698*3f1979aaSAndroid Build Coastguard Worker     int32x4_t b = vreinterpretq_s32_m128i(_b);
3699*3f1979aaSAndroid Build Coastguard Worker     // Interleave using vshrn/vmovn
3700*3f1979aaSAndroid Build Coastguard Worker     // [a0|a2|a4|a6|b0|b2|b4|b6]
3701*3f1979aaSAndroid Build Coastguard Worker     // [a1|a3|a5|a7|b1|b3|b5|b7]
3702*3f1979aaSAndroid Build Coastguard Worker     int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
3703*3f1979aaSAndroid Build Coastguard Worker     int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
3704*3f1979aaSAndroid Build Coastguard Worker     // Saturated add
3705*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_s16(vqaddq_s16(ab0246, ab1357));
3706*3f1979aaSAndroid Build Coastguard Worker #endif
3707*3f1979aaSAndroid Build Coastguard Worker }
3708*3f1979aaSAndroid Build Coastguard Worker 
3709*3f1979aaSAndroid Build Coastguard Worker // Computes saturated pairwise difference of each argument as a 16-bit signed
3710*3f1979aaSAndroid Build Coastguard Worker // integer values a and b.
3711*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsubs_epi16
_mm_hsubs_epi16(__m128i _a,__m128i _b)3712*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_hsubs_epi16(__m128i _a, __m128i _b)
3713*3f1979aaSAndroid Build Coastguard Worker {
3714*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
3715*3f1979aaSAndroid Build Coastguard Worker     int16x8_t a = vreinterpretq_s16_m128i(_a);
3716*3f1979aaSAndroid Build Coastguard Worker     int16x8_t b = vreinterpretq_s16_m128i(_b);
3717*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_s64_s16(
3718*3f1979aaSAndroid Build Coastguard Worker         vqsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
3719*3f1979aaSAndroid Build Coastguard Worker #else
3720*3f1979aaSAndroid Build Coastguard Worker     int32x4_t a = vreinterpretq_s32_m128i(_a);
3721*3f1979aaSAndroid Build Coastguard Worker     int32x4_t b = vreinterpretq_s32_m128i(_b);
3722*3f1979aaSAndroid Build Coastguard Worker     // Interleave using vshrn/vmovn
3723*3f1979aaSAndroid Build Coastguard Worker     // [a0|a2|a4|a6|b0|b2|b4|b6]
3724*3f1979aaSAndroid Build Coastguard Worker     // [a1|a3|a5|a7|b1|b3|b5|b7]
3725*3f1979aaSAndroid Build Coastguard Worker     int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
3726*3f1979aaSAndroid Build Coastguard Worker     int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
3727*3f1979aaSAndroid Build Coastguard Worker     // Saturated subtract
3728*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_s16(vqsubq_s16(ab0246, ab1357));
3729*3f1979aaSAndroid Build Coastguard Worker #endif
3730*3f1979aaSAndroid Build Coastguard Worker }
3731*3f1979aaSAndroid Build Coastguard Worker 
3732*3f1979aaSAndroid Build Coastguard Worker // Computes pairwise add of each argument as a 32-bit signed or unsigned integer
3733*3f1979aaSAndroid Build Coastguard Worker // values a and b.
_mm_hadd_epi32(__m128i _a,__m128i _b)3734*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_hadd_epi32(__m128i _a, __m128i _b)
3735*3f1979aaSAndroid Build Coastguard Worker {
3736*3f1979aaSAndroid Build Coastguard Worker     int32x4_t a = vreinterpretq_s32_m128i(_a);
3737*3f1979aaSAndroid Build Coastguard Worker     int32x4_t b = vreinterpretq_s32_m128i(_b);
3738*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_s32(
3739*3f1979aaSAndroid Build Coastguard Worker         vcombine_s32(vpadd_s32(vget_low_s32(a), vget_high_s32(a)),
3740*3f1979aaSAndroid Build Coastguard Worker                      vpadd_s32(vget_low_s32(b), vget_high_s32(b))));
3741*3f1979aaSAndroid Build Coastguard Worker }
3742*3f1979aaSAndroid Build Coastguard Worker 
3743*3f1979aaSAndroid Build Coastguard Worker // Computes pairwise difference of each argument as a 32-bit signed or unsigned
3744*3f1979aaSAndroid Build Coastguard Worker // integer values a and b.
_mm_hsub_epi32(__m128i _a,__m128i _b)3745*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_hsub_epi32(__m128i _a, __m128i _b)
3746*3f1979aaSAndroid Build Coastguard Worker {
3747*3f1979aaSAndroid Build Coastguard Worker     int64x2_t a = vreinterpretq_s64_m128i(_a);
3748*3f1979aaSAndroid Build Coastguard Worker     int64x2_t b = vreinterpretq_s64_m128i(_b);
3749*3f1979aaSAndroid Build Coastguard Worker     // Interleave using vshrn/vmovn
3750*3f1979aaSAndroid Build Coastguard Worker     // [a0|a2|b0|b2]
3751*3f1979aaSAndroid Build Coastguard Worker     // [a1|a2|b1|b3]
3752*3f1979aaSAndroid Build Coastguard Worker     int32x4_t ab02 = vcombine_s32(vmovn_s64(a), vmovn_s64(b));
3753*3f1979aaSAndroid Build Coastguard Worker     int32x4_t ab13 = vcombine_s32(vshrn_n_s64(a, 32), vshrn_n_s64(b, 32));
3754*3f1979aaSAndroid Build Coastguard Worker     // Subtract
3755*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_s32(vsubq_s32(ab02, ab13));
3756*3f1979aaSAndroid Build Coastguard Worker }
3757*3f1979aaSAndroid Build Coastguard Worker 
3758*3f1979aaSAndroid Build Coastguard Worker // Kahan summation for accurate summation of floating-point numbers.
3759*3f1979aaSAndroid Build Coastguard Worker // http://blog.zachbjornson.com/2019/08/11/fast-float-summation.html
sse2neon_kadd_f32(float * sum,float * c,float y)3760*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE void sse2neon_kadd_f32(float *sum, float *c, float y)
3761*3f1979aaSAndroid Build Coastguard Worker {
3762*3f1979aaSAndroid Build Coastguard Worker     y -= *c;
3763*3f1979aaSAndroid Build Coastguard Worker     float t = *sum + y;
3764*3f1979aaSAndroid Build Coastguard Worker     *c = (t - *sum) - y;
3765*3f1979aaSAndroid Build Coastguard Worker     *sum = t;
3766*3f1979aaSAndroid Build Coastguard Worker }
3767*3f1979aaSAndroid Build Coastguard Worker 
3768*3f1979aaSAndroid Build Coastguard Worker // Conditionally multiply the packed single-precision (32-bit) floating-point
3769*3f1979aaSAndroid Build Coastguard Worker // elements in a and b using the high 4 bits in imm8, sum the four products,
3770*3f1979aaSAndroid Build Coastguard Worker // and conditionally store the sum in dst using the low 4 bits of imm.
3771*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dp_ps
_mm_dp_ps(__m128 a,__m128 b,const int imm)3772*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm)
3773*3f1979aaSAndroid Build Coastguard Worker {
3774*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
3775*3f1979aaSAndroid Build Coastguard Worker     /* shortcuts */
3776*3f1979aaSAndroid Build Coastguard Worker     if (imm == 0xFF) {
3777*3f1979aaSAndroid Build Coastguard Worker         return _mm_set1_ps(vaddvq_f32(_mm_mul_ps(a, b)));
3778*3f1979aaSAndroid Build Coastguard Worker     }
3779*3f1979aaSAndroid Build Coastguard Worker     if (imm == 0x7F) {
3780*3f1979aaSAndroid Build Coastguard Worker         float32x4_t m = _mm_mul_ps(a, b);
3781*3f1979aaSAndroid Build Coastguard Worker         m[3] = 0;
3782*3f1979aaSAndroid Build Coastguard Worker         return _mm_set1_ps(vaddvq_f32(m));
3783*3f1979aaSAndroid Build Coastguard Worker     }
3784*3f1979aaSAndroid Build Coastguard Worker #endif
3785*3f1979aaSAndroid Build Coastguard Worker 
3786*3f1979aaSAndroid Build Coastguard Worker     float s = 0, c = 0;
3787*3f1979aaSAndroid Build Coastguard Worker     float32x4_t f32a = vreinterpretq_f32_m128(a);
3788*3f1979aaSAndroid Build Coastguard Worker     float32x4_t f32b = vreinterpretq_f32_m128(b);
3789*3f1979aaSAndroid Build Coastguard Worker 
3790*3f1979aaSAndroid Build Coastguard Worker     /* To improve the accuracy of floating-point summation, Kahan algorithm
3791*3f1979aaSAndroid Build Coastguard Worker      * is used for each operation.
3792*3f1979aaSAndroid Build Coastguard Worker      */
3793*3f1979aaSAndroid Build Coastguard Worker     if (imm & (1 << 4))
3794*3f1979aaSAndroid Build Coastguard Worker         sse2neon_kadd_f32(&s, &c, f32a[0] * f32b[0]);
3795*3f1979aaSAndroid Build Coastguard Worker     if (imm & (1 << 5))
3796*3f1979aaSAndroid Build Coastguard Worker         sse2neon_kadd_f32(&s, &c, f32a[1] * f32b[1]);
3797*3f1979aaSAndroid Build Coastguard Worker     if (imm & (1 << 6))
3798*3f1979aaSAndroid Build Coastguard Worker         sse2neon_kadd_f32(&s, &c, f32a[2] * f32b[2]);
3799*3f1979aaSAndroid Build Coastguard Worker     if (imm & (1 << 7))
3800*3f1979aaSAndroid Build Coastguard Worker         sse2neon_kadd_f32(&s, &c, f32a[3] * f32b[3]);
3801*3f1979aaSAndroid Build Coastguard Worker     s += c;
3802*3f1979aaSAndroid Build Coastguard Worker 
3803*3f1979aaSAndroid Build Coastguard Worker     float32x4_t res = {
3804*3f1979aaSAndroid Build Coastguard Worker         (imm & 0x1) ? s : 0,
3805*3f1979aaSAndroid Build Coastguard Worker         (imm & 0x2) ? s : 0,
3806*3f1979aaSAndroid Build Coastguard Worker         (imm & 0x4) ? s : 0,
3807*3f1979aaSAndroid Build Coastguard Worker         (imm & 0x8) ? s : 0,
3808*3f1979aaSAndroid Build Coastguard Worker     };
3809*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128_f32(res);
3810*3f1979aaSAndroid Build Coastguard Worker }
3811*3f1979aaSAndroid Build Coastguard Worker 
3812*3f1979aaSAndroid Build Coastguard Worker /* Compare operations */
3813*3f1979aaSAndroid Build Coastguard Worker 
3814*3f1979aaSAndroid Build Coastguard Worker // Compares for less than
3815*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/vstudio/f330yhc8(v=vs.100).aspx
_mm_cmplt_ps(__m128 a,__m128 b)3816*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b)
3817*3f1979aaSAndroid Build Coastguard Worker {
3818*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128_u32(
3819*3f1979aaSAndroid Build Coastguard Worker         vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
3820*3f1979aaSAndroid Build Coastguard Worker }
3821*3f1979aaSAndroid Build Coastguard Worker 
3822*3f1979aaSAndroid Build Coastguard Worker // Compares for less than
3823*3f1979aaSAndroid Build Coastguard Worker // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/fy94wye7(v=vs.100)
_mm_cmplt_ss(__m128 a,__m128 b)3824*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_cmplt_ss(__m128 a, __m128 b)
3825*3f1979aaSAndroid Build Coastguard Worker {
3826*3f1979aaSAndroid Build Coastguard Worker     return _mm_move_ss(a, _mm_cmplt_ps(a, b));
3827*3f1979aaSAndroid Build Coastguard Worker }
3828*3f1979aaSAndroid Build Coastguard Worker 
3829*3f1979aaSAndroid Build Coastguard Worker // Compares for greater than.
3830*3f1979aaSAndroid Build Coastguard Worker //
3831*3f1979aaSAndroid Build Coastguard Worker //   r0 := (a0 > b0) ? 0xffffffff : 0x0
3832*3f1979aaSAndroid Build Coastguard Worker //   r1 := (a1 > b1) ? 0xffffffff : 0x0
3833*3f1979aaSAndroid Build Coastguard Worker //   r2 := (a2 > b2) ? 0xffffffff : 0x0
3834*3f1979aaSAndroid Build Coastguard Worker //   r3 := (a3 > b3) ? 0xffffffff : 0x0
3835*3f1979aaSAndroid Build Coastguard Worker //
3836*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/vstudio/11dy102s(v=vs.100).aspx
_mm_cmpgt_ps(__m128 a,__m128 b)3837*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b)
3838*3f1979aaSAndroid Build Coastguard Worker {
3839*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128_u32(
3840*3f1979aaSAndroid Build Coastguard Worker         vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
3841*3f1979aaSAndroid Build Coastguard Worker }
3842*3f1979aaSAndroid Build Coastguard Worker 
3843*3f1979aaSAndroid Build Coastguard Worker // Compares for greater than.
3844*3f1979aaSAndroid Build Coastguard Worker // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/1xyyyy9e(v=vs.100)
_mm_cmpgt_ss(__m128 a,__m128 b)3845*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_cmpgt_ss(__m128 a, __m128 b)
3846*3f1979aaSAndroid Build Coastguard Worker {
3847*3f1979aaSAndroid Build Coastguard Worker     return _mm_move_ss(a, _mm_cmpgt_ps(a, b));
3848*3f1979aaSAndroid Build Coastguard Worker }
3849*3f1979aaSAndroid Build Coastguard Worker 
3850*3f1979aaSAndroid Build Coastguard Worker // Compares for greater than or equal.
3851*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/vstudio/fs813y2t(v=vs.100).aspx
_mm_cmpge_ps(__m128 a,__m128 b)3852*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_cmpge_ps(__m128 a, __m128 b)
3853*3f1979aaSAndroid Build Coastguard Worker {
3854*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128_u32(
3855*3f1979aaSAndroid Build Coastguard Worker         vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
3856*3f1979aaSAndroid Build Coastguard Worker }
3857*3f1979aaSAndroid Build Coastguard Worker 
3858*3f1979aaSAndroid Build Coastguard Worker // Compares for greater than or equal.
3859*3f1979aaSAndroid Build Coastguard Worker // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/kesh3ddc(v=vs.100)
_mm_cmpge_ss(__m128 a,__m128 b)3860*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_cmpge_ss(__m128 a, __m128 b)
3861*3f1979aaSAndroid Build Coastguard Worker {
3862*3f1979aaSAndroid Build Coastguard Worker     return _mm_move_ss(a, _mm_cmpge_ps(a, b));
3863*3f1979aaSAndroid Build Coastguard Worker }
3864*3f1979aaSAndroid Build Coastguard Worker 
3865*3f1979aaSAndroid Build Coastguard Worker // Compares for less than or equal.
3866*3f1979aaSAndroid Build Coastguard Worker //
3867*3f1979aaSAndroid Build Coastguard Worker //   r0 := (a0 <= b0) ? 0xffffffff : 0x0
3868*3f1979aaSAndroid Build Coastguard Worker //   r1 := (a1 <= b1) ? 0xffffffff : 0x0
3869*3f1979aaSAndroid Build Coastguard Worker //   r2 := (a2 <= b2) ? 0xffffffff : 0x0
3870*3f1979aaSAndroid Build Coastguard Worker //   r3 := (a3 <= b3) ? 0xffffffff : 0x0
3871*3f1979aaSAndroid Build Coastguard Worker //
3872*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/vstudio/1s75w83z(v=vs.100).aspx
_mm_cmple_ps(__m128 a,__m128 b)3873*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_cmple_ps(__m128 a, __m128 b)
3874*3f1979aaSAndroid Build Coastguard Worker {
3875*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128_u32(
3876*3f1979aaSAndroid Build Coastguard Worker         vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
3877*3f1979aaSAndroid Build Coastguard Worker }
3878*3f1979aaSAndroid Build Coastguard Worker 
3879*3f1979aaSAndroid Build Coastguard Worker // Compares for less than or equal.
3880*3f1979aaSAndroid Build Coastguard Worker // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/a7x0hbhw(v=vs.100)
_mm_cmple_ss(__m128 a,__m128 b)3881*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_cmple_ss(__m128 a, __m128 b)
3882*3f1979aaSAndroid Build Coastguard Worker {
3883*3f1979aaSAndroid Build Coastguard Worker     return _mm_move_ss(a, _mm_cmple_ps(a, b));
3884*3f1979aaSAndroid Build Coastguard Worker }
3885*3f1979aaSAndroid Build Coastguard Worker 
3886*3f1979aaSAndroid Build Coastguard Worker // Compares for equality.
3887*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/vstudio/36aectz5(v=vs.100).aspx
_mm_cmpeq_ps(__m128 a,__m128 b)3888*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_cmpeq_ps(__m128 a, __m128 b)
3889*3f1979aaSAndroid Build Coastguard Worker {
3890*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128_u32(
3891*3f1979aaSAndroid Build Coastguard Worker         vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
3892*3f1979aaSAndroid Build Coastguard Worker }
3893*3f1979aaSAndroid Build Coastguard Worker 
3894*3f1979aaSAndroid Build Coastguard Worker // Compares for equality.
3895*3f1979aaSAndroid Build Coastguard Worker // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/k423z28e(v=vs.100)
_mm_cmpeq_ss(__m128 a,__m128 b)3896*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_cmpeq_ss(__m128 a, __m128 b)
3897*3f1979aaSAndroid Build Coastguard Worker {
3898*3f1979aaSAndroid Build Coastguard Worker     return _mm_move_ss(a, _mm_cmpeq_ps(a, b));
3899*3f1979aaSAndroid Build Coastguard Worker }
3900*3f1979aaSAndroid Build Coastguard Worker 
3901*3f1979aaSAndroid Build Coastguard Worker // Compares for inequality.
3902*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/sf44thbx(v=vs.100).aspx
_mm_cmpneq_ps(__m128 a,__m128 b)3903*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_cmpneq_ps(__m128 a, __m128 b)
3904*3f1979aaSAndroid Build Coastguard Worker {
3905*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128_u32(vmvnq_u32(
3906*3f1979aaSAndroid Build Coastguard Worker         vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
3907*3f1979aaSAndroid Build Coastguard Worker }
3908*3f1979aaSAndroid Build Coastguard Worker 
3909*3f1979aaSAndroid Build Coastguard Worker // Compares for inequality.
3910*3f1979aaSAndroid Build Coastguard Worker // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/ekya8fh4(v=vs.100)
_mm_cmpneq_ss(__m128 a,__m128 b)3911*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_cmpneq_ss(__m128 a, __m128 b)
3912*3f1979aaSAndroid Build Coastguard Worker {
3913*3f1979aaSAndroid Build Coastguard Worker     return _mm_move_ss(a, _mm_cmpneq_ps(a, b));
3914*3f1979aaSAndroid Build Coastguard Worker }
3915*3f1979aaSAndroid Build Coastguard Worker 
3916*3f1979aaSAndroid Build Coastguard Worker // Compares for not greater than or equal.
3917*3f1979aaSAndroid Build Coastguard Worker // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/wsexys62(v=vs.100)
_mm_cmpnge_ps(__m128 a,__m128 b)3918*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_cmpnge_ps(__m128 a, __m128 b)
3919*3f1979aaSAndroid Build Coastguard Worker {
3920*3f1979aaSAndroid Build Coastguard Worker     return _mm_cmplt_ps(a, b);
3921*3f1979aaSAndroid Build Coastguard Worker }
3922*3f1979aaSAndroid Build Coastguard Worker 
3923*3f1979aaSAndroid Build Coastguard Worker // Compares for not greater than or equal.
3924*3f1979aaSAndroid Build Coastguard Worker // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/fk2y80s8(v=vs.100)
_mm_cmpnge_ss(__m128 a,__m128 b)3925*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_cmpnge_ss(__m128 a, __m128 b)
3926*3f1979aaSAndroid Build Coastguard Worker {
3927*3f1979aaSAndroid Build Coastguard Worker     return _mm_cmplt_ss(a, b);
3928*3f1979aaSAndroid Build Coastguard Worker }
3929*3f1979aaSAndroid Build Coastguard Worker 
3930*3f1979aaSAndroid Build Coastguard Worker // Compares for not greater than.
3931*3f1979aaSAndroid Build Coastguard Worker // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/d0xh7w0s(v=vs.100)
_mm_cmpngt_ps(__m128 a,__m128 b)3932*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_cmpngt_ps(__m128 a, __m128 b)
3933*3f1979aaSAndroid Build Coastguard Worker {
3934*3f1979aaSAndroid Build Coastguard Worker     return _mm_cmple_ps(a, b);
3935*3f1979aaSAndroid Build Coastguard Worker }
3936*3f1979aaSAndroid Build Coastguard Worker 
3937*3f1979aaSAndroid Build Coastguard Worker // Compares for not greater than.
3938*3f1979aaSAndroid Build Coastguard Worker // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/z7x9ydwh(v=vs.100)
_mm_cmpngt_ss(__m128 a,__m128 b)3939*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_cmpngt_ss(__m128 a, __m128 b)
3940*3f1979aaSAndroid Build Coastguard Worker {
3941*3f1979aaSAndroid Build Coastguard Worker     return _mm_cmple_ss(a, b);
3942*3f1979aaSAndroid Build Coastguard Worker }
3943*3f1979aaSAndroid Build Coastguard Worker 
3944*3f1979aaSAndroid Build Coastguard Worker // Compares for not less than or equal.
3945*3f1979aaSAndroid Build Coastguard Worker // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/6a330kxw(v=vs.100)
_mm_cmpnle_ps(__m128 a,__m128 b)3946*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_cmpnle_ps(__m128 a, __m128 b)
3947*3f1979aaSAndroid Build Coastguard Worker {
3948*3f1979aaSAndroid Build Coastguard Worker     return _mm_cmpgt_ps(a, b);
3949*3f1979aaSAndroid Build Coastguard Worker }
3950*3f1979aaSAndroid Build Coastguard Worker 
3951*3f1979aaSAndroid Build Coastguard Worker // Compares for not less than or equal.
3952*3f1979aaSAndroid Build Coastguard Worker // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/z7x9ydwh(v=vs.100)
_mm_cmpnle_ss(__m128 a,__m128 b)3953*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_cmpnle_ss(__m128 a, __m128 b)
3954*3f1979aaSAndroid Build Coastguard Worker {
3955*3f1979aaSAndroid Build Coastguard Worker     return _mm_cmpgt_ss(a, b);
3956*3f1979aaSAndroid Build Coastguard Worker }
3957*3f1979aaSAndroid Build Coastguard Worker 
3958*3f1979aaSAndroid Build Coastguard Worker // Compares for not less than.
3959*3f1979aaSAndroid Build Coastguard Worker // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/4686bbdw(v=vs.100)
_mm_cmpnlt_ps(__m128 a,__m128 b)3960*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_cmpnlt_ps(__m128 a, __m128 b)
3961*3f1979aaSAndroid Build Coastguard Worker {
3962*3f1979aaSAndroid Build Coastguard Worker     return _mm_cmpge_ps(a, b);
3963*3f1979aaSAndroid Build Coastguard Worker }
3964*3f1979aaSAndroid Build Coastguard Worker 
3965*3f1979aaSAndroid Build Coastguard Worker // Compares for not less than.
3966*3f1979aaSAndroid Build Coastguard Worker // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/56b9z2wf(v=vs.100)
_mm_cmpnlt_ss(__m128 a,__m128 b)3967*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_cmpnlt_ss(__m128 a, __m128 b)
3968*3f1979aaSAndroid Build Coastguard Worker {
3969*3f1979aaSAndroid Build Coastguard Worker     return _mm_cmpge_ss(a, b);
3970*3f1979aaSAndroid Build Coastguard Worker }
3971*3f1979aaSAndroid Build Coastguard Worker 
3972*3f1979aaSAndroid Build Coastguard Worker // Compares the 16 signed or unsigned 8-bit integers in a and the 16 signed or
3973*3f1979aaSAndroid Build Coastguard Worker // unsigned 8-bit integers in b for equality.
3974*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/windows/desktop/bz5xk21a(v=vs.90).aspx
_mm_cmpeq_epi8(__m128i a,__m128i b)3975*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_cmpeq_epi8(__m128i a, __m128i b)
3976*3f1979aaSAndroid Build Coastguard Worker {
3977*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_u8(
3978*3f1979aaSAndroid Build Coastguard Worker         vceqq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
3979*3f1979aaSAndroid Build Coastguard Worker }
3980*3f1979aaSAndroid Build Coastguard Worker 
3981*3f1979aaSAndroid Build Coastguard Worker // Compares the 8 signed or unsigned 16-bit integers in a and the 8 signed or
3982*3f1979aaSAndroid Build Coastguard Worker // unsigned 16-bit integers in b for equality.
3983*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/2ay060te(v=vs.100).aspx
_mm_cmpeq_epi16(__m128i a,__m128i b)3984*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_cmpeq_epi16(__m128i a, __m128i b)
3985*3f1979aaSAndroid Build Coastguard Worker {
3986*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_u16(
3987*3f1979aaSAndroid Build Coastguard Worker         vceqq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
3988*3f1979aaSAndroid Build Coastguard Worker }
3989*3f1979aaSAndroid Build Coastguard Worker 
3990*3f1979aaSAndroid Build Coastguard Worker // Compare packed 32-bit integers in a and b for equality, and store the results
3991*3f1979aaSAndroid Build Coastguard Worker // in dst
_mm_cmpeq_epi32(__m128i a,__m128i b)3992*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i a, __m128i b)
3993*3f1979aaSAndroid Build Coastguard Worker {
3994*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_u32(
3995*3f1979aaSAndroid Build Coastguard Worker         vceqq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
3996*3f1979aaSAndroid Build Coastguard Worker }
3997*3f1979aaSAndroid Build Coastguard Worker 
3998*3f1979aaSAndroid Build Coastguard Worker // Compare packed 64-bit integers in a and b for equality, and store the results
3999*3f1979aaSAndroid Build Coastguard Worker // in dst
_mm_cmpeq_epi64(__m128i a,__m128i b)4000*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_cmpeq_epi64(__m128i a, __m128i b)
4001*3f1979aaSAndroid Build Coastguard Worker {
4002*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
4003*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_u64(
4004*3f1979aaSAndroid Build Coastguard Worker         vceqq_u64(vreinterpretq_u64_m128i(a), vreinterpretq_u64_m128i(b)));
4005*3f1979aaSAndroid Build Coastguard Worker #else
4006*3f1979aaSAndroid Build Coastguard Worker     // ARMv7 lacks vceqq_u64
4007*3f1979aaSAndroid Build Coastguard Worker     // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
4008*3f1979aaSAndroid Build Coastguard Worker     uint32x4_t cmp =
4009*3f1979aaSAndroid Build Coastguard Worker         vceqq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b));
4010*3f1979aaSAndroid Build Coastguard Worker     uint32x4_t swapped = vrev64q_u32(cmp);
4011*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_u32(vandq_u32(cmp, swapped));
4012*3f1979aaSAndroid Build Coastguard Worker #endif
4013*3f1979aaSAndroid Build Coastguard Worker }
4014*3f1979aaSAndroid Build Coastguard Worker 
4015*3f1979aaSAndroid Build Coastguard Worker // Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers
4016*3f1979aaSAndroid Build Coastguard Worker // in b for lesser than.
4017*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/windows/desktop/9s46csht(v=vs.90).aspx
_mm_cmplt_epi8(__m128i a,__m128i b)4018*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b)
4019*3f1979aaSAndroid Build Coastguard Worker {
4020*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_u8(
4021*3f1979aaSAndroid Build Coastguard Worker         vcltq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
4022*3f1979aaSAndroid Build Coastguard Worker }
4023*3f1979aaSAndroid Build Coastguard Worker 
4024*3f1979aaSAndroid Build Coastguard Worker // Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers
4025*3f1979aaSAndroid Build Coastguard Worker // in b for greater than.
4026*3f1979aaSAndroid Build Coastguard Worker //
4027*3f1979aaSAndroid Build Coastguard Worker //   r0 := (a0 > b0) ? 0xff : 0x0
4028*3f1979aaSAndroid Build Coastguard Worker //   r1 := (a1 > b1) ? 0xff : 0x0
4029*3f1979aaSAndroid Build Coastguard Worker //   ...
4030*3f1979aaSAndroid Build Coastguard Worker //   r15 := (a15 > b15) ? 0xff : 0x0
4031*3f1979aaSAndroid Build Coastguard Worker //
4032*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/zh-tw/library/wf45zt2b(v=vs.100).aspx
_mm_cmpgt_epi8(__m128i a,__m128i b)4033*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b)
4034*3f1979aaSAndroid Build Coastguard Worker {
4035*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_u8(
4036*3f1979aaSAndroid Build Coastguard Worker         vcgtq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
4037*3f1979aaSAndroid Build Coastguard Worker }
4038*3f1979aaSAndroid Build Coastguard Worker 
4039*3f1979aaSAndroid Build Coastguard Worker // Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers
4040*3f1979aaSAndroid Build Coastguard Worker // in b for less than.
4041*3f1979aaSAndroid Build Coastguard Worker //
4042*3f1979aaSAndroid Build Coastguard Worker //   r0 := (a0 < b0) ? 0xffff : 0x0
4043*3f1979aaSAndroid Build Coastguard Worker //   r1 := (a1 < b1) ? 0xffff : 0x0
4044*3f1979aaSAndroid Build Coastguard Worker //   ...
4045*3f1979aaSAndroid Build Coastguard Worker //   r7 := (a7 < b7) ? 0xffff : 0x0
4046*3f1979aaSAndroid Build Coastguard Worker //
4047*3f1979aaSAndroid Build Coastguard Worker // https://technet.microsoft.com/en-us/library/t863edb2(v=vs.100).aspx
_mm_cmplt_epi16(__m128i a,__m128i b)4048*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_cmplt_epi16(__m128i a, __m128i b)
4049*3f1979aaSAndroid Build Coastguard Worker {
4050*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_u16(
4051*3f1979aaSAndroid Build Coastguard Worker         vcltq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
4052*3f1979aaSAndroid Build Coastguard Worker }
4053*3f1979aaSAndroid Build Coastguard Worker 
4054*3f1979aaSAndroid Build Coastguard Worker // Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers
4055*3f1979aaSAndroid Build Coastguard Worker // in b for greater than.
4056*3f1979aaSAndroid Build Coastguard Worker //
4057*3f1979aaSAndroid Build Coastguard Worker //   r0 := (a0 > b0) ? 0xffff : 0x0
4058*3f1979aaSAndroid Build Coastguard Worker //   r1 := (a1 > b1) ? 0xffff : 0x0
4059*3f1979aaSAndroid Build Coastguard Worker //   ...
4060*3f1979aaSAndroid Build Coastguard Worker //   r7 := (a7 > b7) ? 0xffff : 0x0
4061*3f1979aaSAndroid Build Coastguard Worker //
4062*3f1979aaSAndroid Build Coastguard Worker // https://technet.microsoft.com/en-us/library/xd43yfsa(v=vs.100).aspx
_mm_cmpgt_epi16(__m128i a,__m128i b)4063*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_cmpgt_epi16(__m128i a, __m128i b)
4064*3f1979aaSAndroid Build Coastguard Worker {
4065*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_u16(
4066*3f1979aaSAndroid Build Coastguard Worker         vcgtq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
4067*3f1979aaSAndroid Build Coastguard Worker }
4068*3f1979aaSAndroid Build Coastguard Worker 
4069*3f1979aaSAndroid Build Coastguard Worker 
4070*3f1979aaSAndroid Build Coastguard Worker // Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers
4071*3f1979aaSAndroid Build Coastguard Worker // in b for less than.
4072*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/vstudio/4ak0bf5d(v=vs.100).aspx
_mm_cmplt_epi32(__m128i a,__m128i b)4073*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_cmplt_epi32(__m128i a, __m128i b)
4074*3f1979aaSAndroid Build Coastguard Worker {
4075*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_u32(
4076*3f1979aaSAndroid Build Coastguard Worker         vcltq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
4077*3f1979aaSAndroid Build Coastguard Worker }
4078*3f1979aaSAndroid Build Coastguard Worker 
4079*3f1979aaSAndroid Build Coastguard Worker // Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers
4080*3f1979aaSAndroid Build Coastguard Worker // in b for greater than.
4081*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/vstudio/1s9f2z0y(v=vs.100).aspx
_mm_cmpgt_epi32(__m128i a,__m128i b)4082*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_cmpgt_epi32(__m128i a, __m128i b)
4083*3f1979aaSAndroid Build Coastguard Worker {
4084*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_u32(
4085*3f1979aaSAndroid Build Coastguard Worker         vcgtq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
4086*3f1979aaSAndroid Build Coastguard Worker }
4087*3f1979aaSAndroid Build Coastguard Worker 
4088*3f1979aaSAndroid Build Coastguard Worker // Compares the 2 signed 64-bit integers in a and the 2 signed 64-bit integers
4089*3f1979aaSAndroid Build Coastguard Worker // in b for greater than.
_mm_cmpgt_epi64(__m128i a,__m128i b)4090*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_cmpgt_epi64(__m128i a, __m128i b)
4091*3f1979aaSAndroid Build Coastguard Worker {
4092*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
4093*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_u64(
4094*3f1979aaSAndroid Build Coastguard Worker         vcgtq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
4095*3f1979aaSAndroid Build Coastguard Worker #else
4096*3f1979aaSAndroid Build Coastguard Worker     // ARMv7 lacks vcgtq_s64.
4097*3f1979aaSAndroid Build Coastguard Worker     // This is based off of Clang's SSE2 polyfill:
4098*3f1979aaSAndroid Build Coastguard Worker     // (a > b) -> ((a_hi > b_hi) || (a_lo > b_lo && a_hi == b_hi))
4099*3f1979aaSAndroid Build Coastguard Worker 
4100*3f1979aaSAndroid Build Coastguard Worker     // Mask the sign bit out since we need a signed AND an unsigned comparison
4101*3f1979aaSAndroid Build Coastguard Worker     // and it is ugly to try and split them.
4102*3f1979aaSAndroid Build Coastguard Worker     int32x4_t mask = vreinterpretq_s32_s64(vdupq_n_s64(0x80000000ull));
4103*3f1979aaSAndroid Build Coastguard Worker     int32x4_t a_mask = veorq_s32(vreinterpretq_s32_m128i(a), mask);
4104*3f1979aaSAndroid Build Coastguard Worker     int32x4_t b_mask = veorq_s32(vreinterpretq_s32_m128i(b), mask);
4105*3f1979aaSAndroid Build Coastguard Worker     // Check if a > b
4106*3f1979aaSAndroid Build Coastguard Worker     int64x2_t greater = vreinterpretq_s64_u32(vcgtq_s32(a_mask, b_mask));
4107*3f1979aaSAndroid Build Coastguard Worker     // Copy upper mask to lower mask
4108*3f1979aaSAndroid Build Coastguard Worker     // a_hi > b_hi
4109*3f1979aaSAndroid Build Coastguard Worker     int64x2_t gt_hi = vshrq_n_s64(greater, 63);
4110*3f1979aaSAndroid Build Coastguard Worker     // Copy lower mask to upper mask
4111*3f1979aaSAndroid Build Coastguard Worker     // a_lo > b_lo
4112*3f1979aaSAndroid Build Coastguard Worker     int64x2_t gt_lo = vsliq_n_s64(greater, greater, 32);
4113*3f1979aaSAndroid Build Coastguard Worker     // Compare for equality
4114*3f1979aaSAndroid Build Coastguard Worker     int64x2_t equal = vreinterpretq_s64_u32(vceqq_s32(a_mask, b_mask));
4115*3f1979aaSAndroid Build Coastguard Worker     // Copy upper mask to lower mask
4116*3f1979aaSAndroid Build Coastguard Worker     // a_hi == b_hi
4117*3f1979aaSAndroid Build Coastguard Worker     int64x2_t eq_hi = vshrq_n_s64(equal, 63);
4118*3f1979aaSAndroid Build Coastguard Worker     // a_hi > b_hi || (a_lo > b_lo && a_hi == b_hi)
4119*3f1979aaSAndroid Build Coastguard Worker     int64x2_t ret = vorrq_s64(gt_hi, vandq_s64(gt_lo, eq_hi));
4120*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_s64(ret);
4121*3f1979aaSAndroid Build Coastguard Worker #endif
4122*3f1979aaSAndroid Build Coastguard Worker }
4123*3f1979aaSAndroid Build Coastguard Worker 
4124*3f1979aaSAndroid Build Coastguard Worker // Compares the four 32-bit floats in a and b to check if any values are NaN.
4125*3f1979aaSAndroid Build Coastguard Worker // Ordered compare between each value returns true for "orderable" and false for
4126*3f1979aaSAndroid Build Coastguard Worker // "not orderable" (NaN).
4127*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/vstudio/0h9w00fx(v=vs.100).aspx see
4128*3f1979aaSAndroid Build Coastguard Worker // also:
4129*3f1979aaSAndroid Build Coastguard Worker // http://stackoverflow.com/questions/8627331/what-does-ordered-unordered-comparison-mean
4130*3f1979aaSAndroid Build Coastguard Worker // http://stackoverflow.com/questions/29349621/neon-isnanval-intrinsics
_mm_cmpord_ps(__m128 a,__m128 b)4131*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_cmpord_ps(__m128 a, __m128 b)
4132*3f1979aaSAndroid Build Coastguard Worker {
4133*3f1979aaSAndroid Build Coastguard Worker     // Note: NEON does not have ordered compare builtin
4134*3f1979aaSAndroid Build Coastguard Worker     // Need to compare a eq a and b eq b to check for NaN
4135*3f1979aaSAndroid Build Coastguard Worker     // Do AND of results to get final
4136*3f1979aaSAndroid Build Coastguard Worker     uint32x4_t ceqaa =
4137*3f1979aaSAndroid Build Coastguard Worker         vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
4138*3f1979aaSAndroid Build Coastguard Worker     uint32x4_t ceqbb =
4139*3f1979aaSAndroid Build Coastguard Worker         vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
4140*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128_u32(vandq_u32(ceqaa, ceqbb));
4141*3f1979aaSAndroid Build Coastguard Worker }
4142*3f1979aaSAndroid Build Coastguard Worker 
4143*3f1979aaSAndroid Build Coastguard Worker // Compares for ordered.
4144*3f1979aaSAndroid Build Coastguard Worker // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/343t62da(v=vs.100)
_mm_cmpord_ss(__m128 a,__m128 b)4145*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_cmpord_ss(__m128 a, __m128 b)
4146*3f1979aaSAndroid Build Coastguard Worker {
4147*3f1979aaSAndroid Build Coastguard Worker     return _mm_move_ss(a, _mm_cmpord_ps(a, b));
4148*3f1979aaSAndroid Build Coastguard Worker }
4149*3f1979aaSAndroid Build Coastguard Worker 
4150*3f1979aaSAndroid Build Coastguard Worker // Compares for unordered.
4151*3f1979aaSAndroid Build Coastguard Worker // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/khy6fk1t(v=vs.100)
_mm_cmpunord_ps(__m128 a,__m128 b)4152*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_cmpunord_ps(__m128 a, __m128 b)
4153*3f1979aaSAndroid Build Coastguard Worker {
4154*3f1979aaSAndroid Build Coastguard Worker     uint32x4_t f32a =
4155*3f1979aaSAndroid Build Coastguard Worker         vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
4156*3f1979aaSAndroid Build Coastguard Worker     uint32x4_t f32b =
4157*3f1979aaSAndroid Build Coastguard Worker         vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
4158*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128_u32(vmvnq_u32(vandq_u32(f32a, f32b)));
4159*3f1979aaSAndroid Build Coastguard Worker }
4160*3f1979aaSAndroid Build Coastguard Worker 
4161*3f1979aaSAndroid Build Coastguard Worker // Compares for unordered.
4162*3f1979aaSAndroid Build Coastguard Worker // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/2as2387b(v=vs.100)
_mm_cmpunord_ss(__m128 a,__m128 b)4163*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_cmpunord_ss(__m128 a, __m128 b)
4164*3f1979aaSAndroid Build Coastguard Worker {
4165*3f1979aaSAndroid Build Coastguard Worker     return _mm_move_ss(a, _mm_cmpunord_ps(a, b));
4166*3f1979aaSAndroid Build Coastguard Worker }
4167*3f1979aaSAndroid Build Coastguard Worker 
4168*3f1979aaSAndroid Build Coastguard Worker // Compares the lower single-precision floating point scalar values of a and b
4169*3f1979aaSAndroid Build Coastguard Worker // using a less than operation. :
4170*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/2kwe606b(v=vs.90).aspx Important
4171*3f1979aaSAndroid Build Coastguard Worker // note!! The documentation on MSDN is incorrect!  If either of the values is a
4172*3f1979aaSAndroid Build Coastguard Worker // NAN the docs say you will get a one, but in fact, it will return a zero!!
_mm_comilt_ss(__m128 a,__m128 b)4173*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE int _mm_comilt_ss(__m128 a, __m128 b)
4174*3f1979aaSAndroid Build Coastguard Worker {
4175*3f1979aaSAndroid Build Coastguard Worker     uint32x4_t a_not_nan =
4176*3f1979aaSAndroid Build Coastguard Worker         vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
4177*3f1979aaSAndroid Build Coastguard Worker     uint32x4_t b_not_nan =
4178*3f1979aaSAndroid Build Coastguard Worker         vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
4179*3f1979aaSAndroid Build Coastguard Worker     uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
4180*3f1979aaSAndroid Build Coastguard Worker     uint32x4_t a_lt_b =
4181*3f1979aaSAndroid Build Coastguard Worker         vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
4182*3f1979aaSAndroid Build Coastguard Worker     return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_lt_b), 0) != 0) ? 1 : 0;
4183*3f1979aaSAndroid Build Coastguard Worker }
4184*3f1979aaSAndroid Build Coastguard Worker 
4185*3f1979aaSAndroid Build Coastguard Worker // Compares the lower single-precision floating point scalar values of a and b
4186*3f1979aaSAndroid Build Coastguard Worker // using a greater than operation. :
4187*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/b0738e0t(v=vs.100).aspx
_mm_comigt_ss(__m128 a,__m128 b)4188*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE int _mm_comigt_ss(__m128 a, __m128 b)
4189*3f1979aaSAndroid Build Coastguard Worker {
4190*3f1979aaSAndroid Build Coastguard Worker     // return vgetq_lane_u32(vcgtq_f32(vreinterpretq_f32_m128(a),
4191*3f1979aaSAndroid Build Coastguard Worker     // vreinterpretq_f32_m128(b)), 0);
4192*3f1979aaSAndroid Build Coastguard Worker     uint32x4_t a_not_nan =
4193*3f1979aaSAndroid Build Coastguard Worker         vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
4194*3f1979aaSAndroid Build Coastguard Worker     uint32x4_t b_not_nan =
4195*3f1979aaSAndroid Build Coastguard Worker         vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
4196*3f1979aaSAndroid Build Coastguard Worker     uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
4197*3f1979aaSAndroid Build Coastguard Worker     uint32x4_t a_gt_b =
4198*3f1979aaSAndroid Build Coastguard Worker         vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
4199*3f1979aaSAndroid Build Coastguard Worker     return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_gt_b), 0) != 0) ? 1 : 0;
4200*3f1979aaSAndroid Build Coastguard Worker }
4201*3f1979aaSAndroid Build Coastguard Worker 
4202*3f1979aaSAndroid Build Coastguard Worker // Compares the lower single-precision floating point scalar values of a and b
4203*3f1979aaSAndroid Build Coastguard Worker // using a less than or equal operation. :
4204*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/1w4t7c57(v=vs.90).aspx
_mm_comile_ss(__m128 a,__m128 b)4205*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE int _mm_comile_ss(__m128 a, __m128 b)
4206*3f1979aaSAndroid Build Coastguard Worker {
4207*3f1979aaSAndroid Build Coastguard Worker     // return vgetq_lane_u32(vcleq_f32(vreinterpretq_f32_m128(a),
4208*3f1979aaSAndroid Build Coastguard Worker     // vreinterpretq_f32_m128(b)), 0);
4209*3f1979aaSAndroid Build Coastguard Worker     uint32x4_t a_not_nan =
4210*3f1979aaSAndroid Build Coastguard Worker         vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
4211*3f1979aaSAndroid Build Coastguard Worker     uint32x4_t b_not_nan =
4212*3f1979aaSAndroid Build Coastguard Worker         vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
4213*3f1979aaSAndroid Build Coastguard Worker     uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
4214*3f1979aaSAndroid Build Coastguard Worker     uint32x4_t a_le_b =
4215*3f1979aaSAndroid Build Coastguard Worker         vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
4216*3f1979aaSAndroid Build Coastguard Worker     return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_le_b), 0) != 0) ? 1 : 0;
4217*3f1979aaSAndroid Build Coastguard Worker }
4218*3f1979aaSAndroid Build Coastguard Worker 
4219*3f1979aaSAndroid Build Coastguard Worker // Compares the lower single-precision floating point scalar values of a and b
4220*3f1979aaSAndroid Build Coastguard Worker // using a greater than or equal operation. :
4221*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/8t80des6(v=vs.100).aspx
_mm_comige_ss(__m128 a,__m128 b)4222*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE int _mm_comige_ss(__m128 a, __m128 b)
4223*3f1979aaSAndroid Build Coastguard Worker {
4224*3f1979aaSAndroid Build Coastguard Worker     // return vgetq_lane_u32(vcgeq_f32(vreinterpretq_f32_m128(a),
4225*3f1979aaSAndroid Build Coastguard Worker     // vreinterpretq_f32_m128(b)), 0);
4226*3f1979aaSAndroid Build Coastguard Worker     uint32x4_t a_not_nan =
4227*3f1979aaSAndroid Build Coastguard Worker         vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
4228*3f1979aaSAndroid Build Coastguard Worker     uint32x4_t b_not_nan =
4229*3f1979aaSAndroid Build Coastguard Worker         vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
4230*3f1979aaSAndroid Build Coastguard Worker     uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
4231*3f1979aaSAndroid Build Coastguard Worker     uint32x4_t a_ge_b =
4232*3f1979aaSAndroid Build Coastguard Worker         vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
4233*3f1979aaSAndroid Build Coastguard Worker     return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_ge_b), 0) != 0) ? 1 : 0;
4234*3f1979aaSAndroid Build Coastguard Worker }
4235*3f1979aaSAndroid Build Coastguard Worker 
4236*3f1979aaSAndroid Build Coastguard Worker // Compares the lower single-precision floating point scalar values of a and b
4237*3f1979aaSAndroid Build Coastguard Worker // using an equality operation. :
4238*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/93yx2h2b(v=vs.100).aspx
_mm_comieq_ss(__m128 a,__m128 b)4239*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE int _mm_comieq_ss(__m128 a, __m128 b)
4240*3f1979aaSAndroid Build Coastguard Worker {
4241*3f1979aaSAndroid Build Coastguard Worker     // return vgetq_lane_u32(vceqq_f32(vreinterpretq_f32_m128(a),
4242*3f1979aaSAndroid Build Coastguard Worker     // vreinterpretq_f32_m128(b)), 0);
4243*3f1979aaSAndroid Build Coastguard Worker     uint32x4_t a_not_nan =
4244*3f1979aaSAndroid Build Coastguard Worker         vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
4245*3f1979aaSAndroid Build Coastguard Worker     uint32x4_t b_not_nan =
4246*3f1979aaSAndroid Build Coastguard Worker         vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
4247*3f1979aaSAndroid Build Coastguard Worker     uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
4248*3f1979aaSAndroid Build Coastguard Worker     uint32x4_t a_eq_b =
4249*3f1979aaSAndroid Build Coastguard Worker         vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
4250*3f1979aaSAndroid Build Coastguard Worker     return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_eq_b), 0) != 0) ? 1 : 0;
4251*3f1979aaSAndroid Build Coastguard Worker }
4252*3f1979aaSAndroid Build Coastguard Worker 
4253*3f1979aaSAndroid Build Coastguard Worker // Compares the lower single-precision floating point scalar values of a and b
4254*3f1979aaSAndroid Build Coastguard Worker // using an inequality operation. :
4255*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/bafh5e0a(v=vs.90).aspx
_mm_comineq_ss(__m128 a,__m128 b)4256*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE int _mm_comineq_ss(__m128 a, __m128 b)
4257*3f1979aaSAndroid Build Coastguard Worker {
4258*3f1979aaSAndroid Build Coastguard Worker     // return !vgetq_lane_u32(vceqq_f32(vreinterpretq_f32_m128(a),
4259*3f1979aaSAndroid Build Coastguard Worker     // vreinterpretq_f32_m128(b)), 0);
4260*3f1979aaSAndroid Build Coastguard Worker     uint32x4_t a_not_nan =
4261*3f1979aaSAndroid Build Coastguard Worker         vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
4262*3f1979aaSAndroid Build Coastguard Worker     uint32x4_t b_not_nan =
4263*3f1979aaSAndroid Build Coastguard Worker         vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
4264*3f1979aaSAndroid Build Coastguard Worker     uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan));
4265*3f1979aaSAndroid Build Coastguard Worker     uint32x4_t a_neq_b = vmvnq_u32(
4266*3f1979aaSAndroid Build Coastguard Worker         vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
4267*3f1979aaSAndroid Build Coastguard Worker     return (vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_neq_b), 0) != 0) ? 1 : 0;
4268*3f1979aaSAndroid Build Coastguard Worker }
4269*3f1979aaSAndroid Build Coastguard Worker 
// The _mm_ucomi*_ss intrinsics are documented as behaving the same as their
// _mm_comi*_ss counterparts, so each one is simply aliased to the matching
// non-'u' implementation.
// NOTE(review): on x86 the two families reportedly differ in which NaN
// operands raise a floating-point exception; nothing here models that.
#define _mm_ucomieq_ss _mm_comieq_ss
#define _mm_ucomineq_ss _mm_comineq_ss
#define _mm_ucomilt_ss _mm_comilt_ss
#define _mm_ucomile_ss _mm_comile_ss
#define _mm_ucomigt_ss _mm_comigt_ss
#define _mm_ucomige_ss _mm_comige_ss
4278*3f1979aaSAndroid Build Coastguard Worker 
4279*3f1979aaSAndroid Build Coastguard Worker /* Conversions */
4280*3f1979aaSAndroid Build Coastguard Worker 
4281*3f1979aaSAndroid Build Coastguard Worker // Convert packed signed 32-bit integers in b to packed single-precision
4282*3f1979aaSAndroid Build Coastguard Worker // (32-bit) floating-point elements, store the results in the lower 2 elements
4283*3f1979aaSAndroid Build Coastguard Worker // of dst, and copy the upper 2 packed elements from a to the upper elements of
4284*3f1979aaSAndroid Build Coastguard Worker // dst.
4285*3f1979aaSAndroid Build Coastguard Worker //
4286*3f1979aaSAndroid Build Coastguard Worker //   dst[31:0] := Convert_Int32_To_FP32(b[31:0])
4287*3f1979aaSAndroid Build Coastguard Worker //   dst[63:32] := Convert_Int32_To_FP32(b[63:32])
4288*3f1979aaSAndroid Build Coastguard Worker //   dst[95:64] := a[95:64]
4289*3f1979aaSAndroid Build Coastguard Worker //   dst[127:96] := a[127:96]
4290*3f1979aaSAndroid Build Coastguard Worker //
4291*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_pi2ps
_mm_cvt_pi2ps(__m128 a,__m64 b)4292*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_cvt_pi2ps(__m128 a, __m64 b)
4293*3f1979aaSAndroid Build Coastguard Worker {
4294*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128_f32(
4295*3f1979aaSAndroid Build Coastguard Worker         vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)),
4296*3f1979aaSAndroid Build Coastguard Worker                      vget_high_f32(vreinterpretq_f32_m128(a))));
4297*3f1979aaSAndroid Build Coastguard Worker }
4298*3f1979aaSAndroid Build Coastguard Worker 
4299*3f1979aaSAndroid Build Coastguard Worker // Convert the signed 32-bit integer b to a single-precision (32-bit)
4300*3f1979aaSAndroid Build Coastguard Worker // floating-point element, store the result in the lower element of dst, and
4301*3f1979aaSAndroid Build Coastguard Worker // copy the upper 3 packed elements from a to the upper elements of dst.
4302*3f1979aaSAndroid Build Coastguard Worker //
4303*3f1979aaSAndroid Build Coastguard Worker //   dst[31:0] := Convert_Int32_To_FP32(b[31:0])
4304*3f1979aaSAndroid Build Coastguard Worker //   dst[127:32] := a[127:32]
4305*3f1979aaSAndroid Build Coastguard Worker //
4306*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_si2ss
_mm_cvt_si2ss(__m128 a,int b)4307*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_cvt_si2ss(__m128 a, int b)
4308*3f1979aaSAndroid Build Coastguard Worker {
4309*3f1979aaSAndroid Build Coastguard Worker     __m128 ret = a;
4310*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128_f32(
4311*3f1979aaSAndroid Build Coastguard Worker         vsetq_lane_f32((float) b, vreinterpretq_f32_m128(ret), 0));
4312*3f1979aaSAndroid Build Coastguard Worker }
4313*3f1979aaSAndroid Build Coastguard Worker 
4314*3f1979aaSAndroid Build Coastguard Worker // Convert the lower single-precision (32-bit) floating-point element in a to a
4315*3f1979aaSAndroid Build Coastguard Worker // 32-bit integer, and store the result in dst.
4316*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_ss2si
_mm_cvt_ss2si(__m128 a)4317*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE int _mm_cvt_ss2si(__m128 a)
4318*3f1979aaSAndroid Build Coastguard Worker {
4319*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
4320*3f1979aaSAndroid Build Coastguard Worker     return vgetq_lane_s32(vcvtnq_s32_f32(vreinterpretq_f32_m128(a)), 0);
4321*3f1979aaSAndroid Build Coastguard Worker #else
4322*3f1979aaSAndroid Build Coastguard Worker     float32_t data = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
4323*3f1979aaSAndroid Build Coastguard Worker     float32_t diff = data - floor(data);
4324*3f1979aaSAndroid Build Coastguard Worker     if (diff > 0.5)
4325*3f1979aaSAndroid Build Coastguard Worker         return (int32_t) ceil(data);
4326*3f1979aaSAndroid Build Coastguard Worker     if (diff == 0.5) {
4327*3f1979aaSAndroid Build Coastguard Worker         int32_t f = (int32_t) floor(data);
4328*3f1979aaSAndroid Build Coastguard Worker         int32_t c = (int32_t) ceil(data);
4329*3f1979aaSAndroid Build Coastguard Worker         return c & 1 ? f : c;
4330*3f1979aaSAndroid Build Coastguard Worker     }
4331*3f1979aaSAndroid Build Coastguard Worker     return (int32_t) floor(data);
4332*3f1979aaSAndroid Build Coastguard Worker #endif
4333*3f1979aaSAndroid Build Coastguard Worker }
4334*3f1979aaSAndroid Build Coastguard Worker 
4335*3f1979aaSAndroid Build Coastguard Worker // Convert packed 16-bit integers in a to packed single-precision (32-bit)
4336*3f1979aaSAndroid Build Coastguard Worker // floating-point elements, and store the results in dst.
4337*3f1979aaSAndroid Build Coastguard Worker //
4338*3f1979aaSAndroid Build Coastguard Worker //   FOR j := 0 to 3
4339*3f1979aaSAndroid Build Coastguard Worker //      i := j*16
4340*3f1979aaSAndroid Build Coastguard Worker //      m := j*32
4341*3f1979aaSAndroid Build Coastguard Worker //      dst[m+31:m] := Convert_Int16_To_FP32(a[i+15:i])
4342*3f1979aaSAndroid Build Coastguard Worker //   ENDFOR
4343*3f1979aaSAndroid Build Coastguard Worker //
4344*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi16_ps
_mm_cvtpi16_ps(__m64 a)4345*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_cvtpi16_ps(__m64 a)
4346*3f1979aaSAndroid Build Coastguard Worker {
4347*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128_f32(
4348*3f1979aaSAndroid Build Coastguard Worker         vcvtq_f32_s32(vmovl_s16(vreinterpret_s16_m64(a))));
4349*3f1979aaSAndroid Build Coastguard Worker }
4350*3f1979aaSAndroid Build Coastguard Worker 
4351*3f1979aaSAndroid Build Coastguard Worker // Convert packed 32-bit integers in b to packed single-precision (32-bit)
4352*3f1979aaSAndroid Build Coastguard Worker // floating-point elements, store the results in the lower 2 elements of dst,
4353*3f1979aaSAndroid Build Coastguard Worker // and copy the upper 2 packed elements from a to the upper elements of dst.
4354*3f1979aaSAndroid Build Coastguard Worker //
4355*3f1979aaSAndroid Build Coastguard Worker //   dst[31:0] := Convert_Int32_To_FP32(b[31:0])
4356*3f1979aaSAndroid Build Coastguard Worker //   dst[63:32] := Convert_Int32_To_FP32(b[63:32])
4357*3f1979aaSAndroid Build Coastguard Worker //   dst[95:64] := a[95:64]
4358*3f1979aaSAndroid Build Coastguard Worker //   dst[127:96] := a[127:96]
4359*3f1979aaSAndroid Build Coastguard Worker //
4360*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32_ps
_mm_cvtpi32_ps(__m128 a,__m64 b)4361*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_cvtpi32_ps(__m128 a, __m64 b)
4362*3f1979aaSAndroid Build Coastguard Worker {
4363*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128_f32(
4364*3f1979aaSAndroid Build Coastguard Worker         vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)),
4365*3f1979aaSAndroid Build Coastguard Worker                      vget_high_f32(vreinterpretq_f32_m128(a))));
4366*3f1979aaSAndroid Build Coastguard Worker }
4367*3f1979aaSAndroid Build Coastguard Worker 
4368*3f1979aaSAndroid Build Coastguard Worker // Convert packed signed 32-bit integers in a to packed single-precision
4369*3f1979aaSAndroid Build Coastguard Worker // (32-bit) floating-point elements, store the results in the lower 2 elements
4370*3f1979aaSAndroid Build Coastguard Worker // of dst, then covert the packed signed 32-bit integers in b to
4371*3f1979aaSAndroid Build Coastguard Worker // single-precision (32-bit) floating-point element, and store the results in
4372*3f1979aaSAndroid Build Coastguard Worker // the upper 2 elements of dst.
4373*3f1979aaSAndroid Build Coastguard Worker //
4374*3f1979aaSAndroid Build Coastguard Worker //   dst[31:0] := Convert_Int32_To_FP32(a[31:0])
4375*3f1979aaSAndroid Build Coastguard Worker //   dst[63:32] := Convert_Int32_To_FP32(a[63:32])
4376*3f1979aaSAndroid Build Coastguard Worker //   dst[95:64] := Convert_Int32_To_FP32(b[31:0])
4377*3f1979aaSAndroid Build Coastguard Worker //   dst[127:96] := Convert_Int32_To_FP32(b[63:32])
4378*3f1979aaSAndroid Build Coastguard Worker //
4379*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32x2_ps
_mm_cvtpi32x2_ps(__m64 a,__m64 b)4380*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_cvtpi32x2_ps(__m64 a, __m64 b)
4381*3f1979aaSAndroid Build Coastguard Worker {
4382*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128_f32(vcvtq_f32_s32(
4383*3f1979aaSAndroid Build Coastguard Worker         vcombine_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b))));
4384*3f1979aaSAndroid Build Coastguard Worker }
4385*3f1979aaSAndroid Build Coastguard Worker 
4386*3f1979aaSAndroid Build Coastguard Worker // Convert the lower packed 8-bit integers in a to packed single-precision
4387*3f1979aaSAndroid Build Coastguard Worker // (32-bit) floating-point elements, and store the results in dst.
4388*3f1979aaSAndroid Build Coastguard Worker //
4389*3f1979aaSAndroid Build Coastguard Worker //   FOR j := 0 to 3
4390*3f1979aaSAndroid Build Coastguard Worker //      i := j*8
4391*3f1979aaSAndroid Build Coastguard Worker //      m := j*32
4392*3f1979aaSAndroid Build Coastguard Worker //      dst[m+31:m] := Convert_Int8_To_FP32(a[i+7:i])
4393*3f1979aaSAndroid Build Coastguard Worker //   ENDFOR
4394*3f1979aaSAndroid Build Coastguard Worker //
4395*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi8_ps
_mm_cvtpi8_ps(__m64 a)4396*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_cvtpi8_ps(__m64 a)
4397*3f1979aaSAndroid Build Coastguard Worker {
4398*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128_f32(vcvtq_f32_s32(
4399*3f1979aaSAndroid Build Coastguard Worker         vmovl_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_m64(a))))));
4400*3f1979aaSAndroid Build Coastguard Worker }
4401*3f1979aaSAndroid Build Coastguard Worker 
4402*3f1979aaSAndroid Build Coastguard Worker // Convert packed unsigned 16-bit integers in a to packed single-precision
4403*3f1979aaSAndroid Build Coastguard Worker // (32-bit) floating-point elements, and store the results in dst.
4404*3f1979aaSAndroid Build Coastguard Worker //
4405*3f1979aaSAndroid Build Coastguard Worker //   FOR j := 0 to 3
4406*3f1979aaSAndroid Build Coastguard Worker //      i := j*16
4407*3f1979aaSAndroid Build Coastguard Worker //      m := j*32
4408*3f1979aaSAndroid Build Coastguard Worker //      dst[m+31:m] := Convert_UInt16_To_FP32(a[i+15:i])
4409*3f1979aaSAndroid Build Coastguard Worker //   ENDFOR
4410*3f1979aaSAndroid Build Coastguard Worker //
4411*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpu16_ps
_mm_cvtpu16_ps(__m64 a)4412*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_cvtpu16_ps(__m64 a)
4413*3f1979aaSAndroid Build Coastguard Worker {
4414*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128_f32(
4415*3f1979aaSAndroid Build Coastguard Worker         vcvtq_f32_u32(vmovl_u16(vreinterpret_u16_m64(a))));
4416*3f1979aaSAndroid Build Coastguard Worker }
4417*3f1979aaSAndroid Build Coastguard Worker 
4418*3f1979aaSAndroid Build Coastguard Worker // Convert the lower packed unsigned 8-bit integers in a to packed
4419*3f1979aaSAndroid Build Coastguard Worker // single-precision (32-bit) floating-point elements, and store the results in
4420*3f1979aaSAndroid Build Coastguard Worker // dst.
4421*3f1979aaSAndroid Build Coastguard Worker //
4422*3f1979aaSAndroid Build Coastguard Worker //   FOR j := 0 to 3
4423*3f1979aaSAndroid Build Coastguard Worker //      i := j*8
4424*3f1979aaSAndroid Build Coastguard Worker //      m := j*32
4425*3f1979aaSAndroid Build Coastguard Worker //      dst[m+31:m] := Convert_UInt8_To_FP32(a[i+7:i])
4426*3f1979aaSAndroid Build Coastguard Worker //   ENDFOR
4427*3f1979aaSAndroid Build Coastguard Worker //
4428*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpu8_ps
_mm_cvtpu8_ps(__m64 a)4429*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_cvtpu8_ps(__m64 a)
4430*3f1979aaSAndroid Build Coastguard Worker {
4431*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128_f32(vcvtq_f32_u32(
4432*3f1979aaSAndroid Build Coastguard Worker         vmovl_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_m64(a))))));
4433*3f1979aaSAndroid Build Coastguard Worker }
4434*3f1979aaSAndroid Build Coastguard Worker 
4435*3f1979aaSAndroid Build Coastguard Worker // Converts the four single-precision, floating-point values of a to signed
4436*3f1979aaSAndroid Build Coastguard Worker // 32-bit integer values using truncate.
4437*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/vstudio/1h005y6x(v=vs.100).aspx
_mm_cvttps_epi32(__m128 a)4438*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a)
4439*3f1979aaSAndroid Build Coastguard Worker {
4440*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)));
4441*3f1979aaSAndroid Build Coastguard Worker }
4442*3f1979aaSAndroid Build Coastguard Worker 
4443*3f1979aaSAndroid Build Coastguard Worker // Converts the four signed 32-bit integer values of a to single-precision,
4444*3f1979aaSAndroid Build Coastguard Worker // floating-point values
4445*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/vstudio/36bwxcx5(v=vs.100).aspx
_mm_cvtepi32_ps(__m128i a)4446*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a)
4447*3f1979aaSAndroid Build Coastguard Worker {
4448*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128_f32(vcvtq_f32_s32(vreinterpretq_s32_m128i(a)));
4449*3f1979aaSAndroid Build Coastguard Worker }
4450*3f1979aaSAndroid Build Coastguard Worker 
4451*3f1979aaSAndroid Build Coastguard Worker // Converts the four unsigned 8-bit integers in the lower 16 bits to four
4452*3f1979aaSAndroid Build Coastguard Worker // unsigned 32-bit integers.
_mm_cvtepu8_epi16(__m128i a)4453*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_cvtepu8_epi16(__m128i a)
4454*3f1979aaSAndroid Build Coastguard Worker {
4455*3f1979aaSAndroid Build Coastguard Worker     uint8x16_t u8x16 = vreinterpretq_u8_m128i(a);    /* xxxx xxxx xxxx DCBA */
4456*3f1979aaSAndroid Build Coastguard Worker     uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0D0C 0B0A */
4457*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_u16(u16x8);
4458*3f1979aaSAndroid Build Coastguard Worker }
4459*3f1979aaSAndroid Build Coastguard Worker 
4460*3f1979aaSAndroid Build Coastguard Worker // Converts the four unsigned 8-bit integers in the lower 32 bits to four
4461*3f1979aaSAndroid Build Coastguard Worker // unsigned 32-bit integers.
4462*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/bb531467%28v=vs.100%29.aspx
_mm_cvtepu8_epi32(__m128i a)4463*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_cvtepu8_epi32(__m128i a)
4464*3f1979aaSAndroid Build Coastguard Worker {
4465*3f1979aaSAndroid Build Coastguard Worker     uint8x16_t u8x16 = vreinterpretq_u8_m128i(a);      /* xxxx xxxx xxxx DCBA */
4466*3f1979aaSAndroid Build Coastguard Worker     uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16));   /* 0x0x 0x0x 0D0C 0B0A */
4467*3f1979aaSAndroid Build Coastguard Worker     uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000D 000C 000B 000A */
4468*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_u32(u32x4);
4469*3f1979aaSAndroid Build Coastguard Worker }
4470*3f1979aaSAndroid Build Coastguard Worker 
4471*3f1979aaSAndroid Build Coastguard Worker // Converts the two unsigned 8-bit integers in the lower 16 bits to two
4472*3f1979aaSAndroid Build Coastguard Worker // unsigned 64-bit integers.
_mm_cvtepu8_epi64(__m128i a)4473*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_cvtepu8_epi64(__m128i a)
4474*3f1979aaSAndroid Build Coastguard Worker {
4475*3f1979aaSAndroid Build Coastguard Worker     uint8x16_t u8x16 = vreinterpretq_u8_m128i(a);      /* xxxx xxxx xxxx xxBA */
4476*3f1979aaSAndroid Build Coastguard Worker     uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16));   /* 0x0x 0x0x 0x0x 0B0A */
4477*3f1979aaSAndroid Build Coastguard Worker     uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
4478*3f1979aaSAndroid Build Coastguard Worker     uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
4479*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_u64(u64x2);
4480*3f1979aaSAndroid Build Coastguard Worker }
4481*3f1979aaSAndroid Build Coastguard Worker 
4482*3f1979aaSAndroid Build Coastguard Worker // Converts the four unsigned 8-bit integers in the lower 16 bits to four
4483*3f1979aaSAndroid Build Coastguard Worker // unsigned 32-bit integers.
_mm_cvtepi8_epi16(__m128i a)4484*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_cvtepi8_epi16(__m128i a)
4485*3f1979aaSAndroid Build Coastguard Worker {
4486*3f1979aaSAndroid Build Coastguard Worker     int8x16_t s8x16 = vreinterpretq_s8_m128i(a);    /* xxxx xxxx xxxx DCBA */
4487*3f1979aaSAndroid Build Coastguard Worker     int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */
4488*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_s16(s16x8);
4489*3f1979aaSAndroid Build Coastguard Worker }
4490*3f1979aaSAndroid Build Coastguard Worker 
4491*3f1979aaSAndroid Build Coastguard Worker // Converts the four unsigned 8-bit integers in the lower 32 bits to four
4492*3f1979aaSAndroid Build Coastguard Worker // unsigned 32-bit integers.
_mm_cvtepi8_epi32(__m128i a)4493*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_cvtepi8_epi32(__m128i a)
4494*3f1979aaSAndroid Build Coastguard Worker {
4495*3f1979aaSAndroid Build Coastguard Worker     int8x16_t s8x16 = vreinterpretq_s8_m128i(a);      /* xxxx xxxx xxxx DCBA */
4496*3f1979aaSAndroid Build Coastguard Worker     int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16));   /* 0x0x 0x0x 0D0C 0B0A */
4497*3f1979aaSAndroid Build Coastguard Worker     int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000D 000C 000B 000A */
4498*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_s32(s32x4);
4499*3f1979aaSAndroid Build Coastguard Worker }
4500*3f1979aaSAndroid Build Coastguard Worker 
4501*3f1979aaSAndroid Build Coastguard Worker // Converts the two signed 8-bit integers in the lower 32 bits to four
4502*3f1979aaSAndroid Build Coastguard Worker // signed 64-bit integers.
_mm_cvtepi8_epi64(__m128i a)4503*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_cvtepi8_epi64(__m128i a)
4504*3f1979aaSAndroid Build Coastguard Worker {
4505*3f1979aaSAndroid Build Coastguard Worker     int8x16_t s8x16 = vreinterpretq_s8_m128i(a);      /* xxxx xxxx xxxx xxBA */
4506*3f1979aaSAndroid Build Coastguard Worker     int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16));   /* 0x0x 0x0x 0x0x 0B0A */
4507*3f1979aaSAndroid Build Coastguard Worker     int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
4508*3f1979aaSAndroid Build Coastguard Worker     int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
4509*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_s64(s64x2);
4510*3f1979aaSAndroid Build Coastguard Worker }
4511*3f1979aaSAndroid Build Coastguard Worker 
4512*3f1979aaSAndroid Build Coastguard Worker // Converts the four signed 16-bit integers in the lower 64 bits to four signed
4513*3f1979aaSAndroid Build Coastguard Worker // 32-bit integers.
_mm_cvtepi16_epi32(__m128i a)4514*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_cvtepi16_epi32(__m128i a)
4515*3f1979aaSAndroid Build Coastguard Worker {
4516*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_s32(
4517*3f1979aaSAndroid Build Coastguard Worker         vmovl_s16(vget_low_s16(vreinterpretq_s16_m128i(a))));
4518*3f1979aaSAndroid Build Coastguard Worker }
4519*3f1979aaSAndroid Build Coastguard Worker 
4520*3f1979aaSAndroid Build Coastguard Worker // Converts the two signed 16-bit integers in the lower 32 bits two signed
4521*3f1979aaSAndroid Build Coastguard Worker // 32-bit integers.
_mm_cvtepi16_epi64(__m128i a)4522*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_cvtepi16_epi64(__m128i a)
4523*3f1979aaSAndroid Build Coastguard Worker {
4524*3f1979aaSAndroid Build Coastguard Worker     int16x8_t s16x8 = vreinterpretq_s16_m128i(a);     /* xxxx xxxx xxxx 0B0A */
4525*3f1979aaSAndroid Build Coastguard Worker     int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
4526*3f1979aaSAndroid Build Coastguard Worker     int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
4527*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_s64(s64x2);
4528*3f1979aaSAndroid Build Coastguard Worker }
4529*3f1979aaSAndroid Build Coastguard Worker 
4530*3f1979aaSAndroid Build Coastguard Worker // Converts the four unsigned 16-bit integers in the lower 64 bits to four
4531*3f1979aaSAndroid Build Coastguard Worker // unsigned 32-bit integers.
_mm_cvtepu16_epi32(__m128i a)4532*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_cvtepu16_epi32(__m128i a)
4533*3f1979aaSAndroid Build Coastguard Worker {
4534*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_u32(
4535*3f1979aaSAndroid Build Coastguard Worker         vmovl_u16(vget_low_u16(vreinterpretq_u16_m128i(a))));
4536*3f1979aaSAndroid Build Coastguard Worker }
4537*3f1979aaSAndroid Build Coastguard Worker 
4538*3f1979aaSAndroid Build Coastguard Worker // Converts the two unsigned 16-bit integers in the lower 32 bits to two
4539*3f1979aaSAndroid Build Coastguard Worker // unsigned 64-bit integers.
_mm_cvtepu16_epi64(__m128i a)4540*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_cvtepu16_epi64(__m128i a)
4541*3f1979aaSAndroid Build Coastguard Worker {
4542*3f1979aaSAndroid Build Coastguard Worker     uint16x8_t u16x8 = vreinterpretq_u16_m128i(a);     /* xxxx xxxx xxxx 0B0A */
4543*3f1979aaSAndroid Build Coastguard Worker     uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
4544*3f1979aaSAndroid Build Coastguard Worker     uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
4545*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_u64(u64x2);
4546*3f1979aaSAndroid Build Coastguard Worker }
4547*3f1979aaSAndroid Build Coastguard Worker 
4548*3f1979aaSAndroid Build Coastguard Worker // Converts the two unsigned 32-bit integers in the lower 64 bits to two
4549*3f1979aaSAndroid Build Coastguard Worker // unsigned 64-bit integers.
_mm_cvtepu32_epi64(__m128i a)4550*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_cvtepu32_epi64(__m128i a)
4551*3f1979aaSAndroid Build Coastguard Worker {
4552*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_u64(
4553*3f1979aaSAndroid Build Coastguard Worker         vmovl_u32(vget_low_u32(vreinterpretq_u32_m128i(a))));
4554*3f1979aaSAndroid Build Coastguard Worker }
4555*3f1979aaSAndroid Build Coastguard Worker 
4556*3f1979aaSAndroid Build Coastguard Worker // Converts the two signed 32-bit integers in the lower 64 bits to two signed
4557*3f1979aaSAndroid Build Coastguard Worker // 64-bit integers.
_mm_cvtepi32_epi64(__m128i a)4558*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_cvtepi32_epi64(__m128i a)
4559*3f1979aaSAndroid Build Coastguard Worker {
4560*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_s64(
4561*3f1979aaSAndroid Build Coastguard Worker         vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a))));
4562*3f1979aaSAndroid Build Coastguard Worker }
4563*3f1979aaSAndroid Build Coastguard Worker 
4564*3f1979aaSAndroid Build Coastguard Worker // Converts the four single-precision, floating-point values of a to signed
4565*3f1979aaSAndroid Build Coastguard Worker // 32-bit integer values.
4566*3f1979aaSAndroid Build Coastguard Worker //
4567*3f1979aaSAndroid Build Coastguard Worker //   r0 := (int) a0
4568*3f1979aaSAndroid Build Coastguard Worker //   r1 := (int) a1
4569*3f1979aaSAndroid Build Coastguard Worker //   r2 := (int) a2
4570*3f1979aaSAndroid Build Coastguard Worker //   r3 := (int) a3
4571*3f1979aaSAndroid Build Coastguard Worker //
4572*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/vstudio/xdc42k5e(v=vs.100).aspx
4573*3f1979aaSAndroid Build Coastguard Worker // *NOTE*. The default rounding mode on SSE is 'round to even', which ARMv7-A
4574*3f1979aaSAndroid Build Coastguard Worker // does not support! It is supported on ARMv8-A however.
_mm_cvtps_epi32(__m128 a)4575*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a)
4576*3f1979aaSAndroid Build Coastguard Worker {
4577*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
4578*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_s32(vcvtnq_s32_f32(a));
4579*3f1979aaSAndroid Build Coastguard Worker #else
4580*3f1979aaSAndroid Build Coastguard Worker     uint32x4_t signmask = vdupq_n_u32(0x80000000);
4581*3f1979aaSAndroid Build Coastguard Worker     float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a),
4582*3f1979aaSAndroid Build Coastguard Worker                                  vdupq_n_f32(0.5f)); /* +/- 0.5 */
4583*3f1979aaSAndroid Build Coastguard Worker     int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32(
4584*3f1979aaSAndroid Build Coastguard Worker         vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/
4585*3f1979aaSAndroid Build Coastguard Worker     int32x4_t r_trunc =
4586*3f1979aaSAndroid Build Coastguard Worker         vcvtq_s32_f32(vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */
4587*3f1979aaSAndroid Build Coastguard Worker     int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32(
4588*3f1979aaSAndroid Build Coastguard Worker         vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */
4589*3f1979aaSAndroid Build Coastguard Worker     int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
4590*3f1979aaSAndroid Build Coastguard Worker                                  vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */
4591*3f1979aaSAndroid Build Coastguard Worker     float32x4_t delta = vsubq_f32(
4592*3f1979aaSAndroid Build Coastguard Worker         vreinterpretq_f32_m128(a),
4593*3f1979aaSAndroid Build Coastguard Worker         vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */
4594*3f1979aaSAndroid Build Coastguard Worker     uint32x4_t is_delta_half = vceqq_f32(delta, half); /* delta == +/- 0.5 */
4595*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_s32(vbslq_s32(is_delta_half, r_even, r_normal));
4596*3f1979aaSAndroid Build Coastguard Worker #endif
4597*3f1979aaSAndroid Build Coastguard Worker }
4598*3f1979aaSAndroid Build Coastguard Worker 
4599*3f1979aaSAndroid Build Coastguard Worker // Copy the lower 32-bit integer in a to dst.
4600*3f1979aaSAndroid Build Coastguard Worker //
4601*3f1979aaSAndroid Build Coastguard Worker //   dst[31:0] := a[31:0]
4602*3f1979aaSAndroid Build Coastguard Worker //
4603*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si32
_mm_cvtsi128_si32(__m128i a)4604*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE int _mm_cvtsi128_si32(__m128i a)
4605*3f1979aaSAndroid Build Coastguard Worker {
4606*3f1979aaSAndroid Build Coastguard Worker     return vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0);
4607*3f1979aaSAndroid Build Coastguard Worker }
4608*3f1979aaSAndroid Build Coastguard Worker 
4609*3f1979aaSAndroid Build Coastguard Worker // Copy the lower 64-bit integer in a to dst.
4610*3f1979aaSAndroid Build Coastguard Worker //
4611*3f1979aaSAndroid Build Coastguard Worker //   dst[63:0] := a[63:0]
4612*3f1979aaSAndroid Build Coastguard Worker //
4613*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64
_mm_cvtsi128_si64(__m128i a)4614*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE int64_t _mm_cvtsi128_si64(__m128i a)
4615*3f1979aaSAndroid Build Coastguard Worker {
4616*3f1979aaSAndroid Build Coastguard Worker     return vgetq_lane_s64(vreinterpretq_s64_m128i(a), 0);
4617*3f1979aaSAndroid Build Coastguard Worker }
4618*3f1979aaSAndroid Build Coastguard Worker 
// Alternate spelling of _mm_cvtsi128_si64: yields the lower 64-bit integer
// of its argument.
//
//   dst[63:0] := a[63:0]
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64x
#define _mm_cvtsi128_si64x(v) _mm_cvtsi128_si64(v)
4625*3f1979aaSAndroid Build Coastguard Worker 
4626*3f1979aaSAndroid Build Coastguard Worker // Moves 32-bit integer a to the least significant 32 bits of an __m128 object,
4627*3f1979aaSAndroid Build Coastguard Worker // zero extending the upper bits.
4628*3f1979aaSAndroid Build Coastguard Worker //
4629*3f1979aaSAndroid Build Coastguard Worker //   r0 := a
4630*3f1979aaSAndroid Build Coastguard Worker //   r1 := 0x0
4631*3f1979aaSAndroid Build Coastguard Worker //   r2 := 0x0
4632*3f1979aaSAndroid Build Coastguard Worker //   r3 := 0x0
4633*3f1979aaSAndroid Build Coastguard Worker //
4634*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/ct3539ha%28v=vs.90%29.aspx
_mm_cvtsi32_si128(int a)4635*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_cvtsi32_si128(int a)
4636*3f1979aaSAndroid Build Coastguard Worker {
4637*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_s32(vsetq_lane_s32(a, vdupq_n_s32(0), 0));
4638*3f1979aaSAndroid Build Coastguard Worker }
4639*3f1979aaSAndroid Build Coastguard Worker 
4640*3f1979aaSAndroid Build Coastguard Worker // Moves 64-bit integer a to the least significant 64 bits of an __m128 object,
4641*3f1979aaSAndroid Build Coastguard Worker // zero extending the upper bits.
4642*3f1979aaSAndroid Build Coastguard Worker //
4643*3f1979aaSAndroid Build Coastguard Worker //   r0 := a
4644*3f1979aaSAndroid Build Coastguard Worker //   r1 := 0x0
_mm_cvtsi64_si128(int64_t a)4645*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_cvtsi64_si128(int64_t a)
4646*3f1979aaSAndroid Build Coastguard Worker {
4647*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_s64(vsetq_lane_s64(a, vdupq_n_s64(0), 0));
4648*3f1979aaSAndroid Build Coastguard Worker }
4649*3f1979aaSAndroid Build Coastguard Worker 
4650*3f1979aaSAndroid Build Coastguard Worker // Cast vector of type __m128 to type __m128d. This intrinsic is only used for
4651*3f1979aaSAndroid Build Coastguard Worker // compilation and does not generate any instructions, thus it has zero latency.
4652*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castps_pd
_mm_castps_pd(__m128 a)4653*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128d _mm_castps_pd(__m128 a)
4654*3f1979aaSAndroid Build Coastguard Worker {
4655*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128d_s32(vreinterpretq_s32_m128(a));
4656*3f1979aaSAndroid Build Coastguard Worker }
4657*3f1979aaSAndroid Build Coastguard Worker 
4658*3f1979aaSAndroid Build Coastguard Worker // Applies a type cast to reinterpret four 32-bit floating point values passed
4659*3f1979aaSAndroid Build Coastguard Worker // in as a 128-bit parameter as packed 32-bit integers.
4660*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/bb514099.aspx
_mm_castps_si128(__m128 a)4661*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_castps_si128(__m128 a)
4662*3f1979aaSAndroid Build Coastguard Worker {
4663*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_s32(vreinterpretq_s32_m128(a));
4664*3f1979aaSAndroid Build Coastguard Worker }
4665*3f1979aaSAndroid Build Coastguard Worker 
4666*3f1979aaSAndroid Build Coastguard Worker // Applies a type cast to reinterpret four 32-bit integers passed in as a
4667*3f1979aaSAndroid Build Coastguard Worker // 128-bit parameter as packed 32-bit floating point values.
4668*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/bb514029.aspx
_mm_castsi128_ps(__m128i a)4669*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a)
4670*3f1979aaSAndroid Build Coastguard Worker {
4671*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128_s32(vreinterpretq_s32_m128i(a));
4672*3f1979aaSAndroid Build Coastguard Worker }
4673*3f1979aaSAndroid Build Coastguard Worker 
4674*3f1979aaSAndroid Build Coastguard Worker // Loads 128-bit value. :
4675*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/atzzad1h(v=vs.80).aspx
_mm_load_si128(const __m128i * p)4676*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
4677*3f1979aaSAndroid Build Coastguard Worker {
4678*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
4679*3f1979aaSAndroid Build Coastguard Worker }
4680*3f1979aaSAndroid Build Coastguard Worker 
4681*3f1979aaSAndroid Build Coastguard Worker // Load a double-precision (64-bit) floating-point element from memory into both
4682*3f1979aaSAndroid Build Coastguard Worker // elements of dst.
4683*3f1979aaSAndroid Build Coastguard Worker //
4684*3f1979aaSAndroid Build Coastguard Worker //   dst[63:0] := MEM[mem_addr+63:mem_addr]
4685*3f1979aaSAndroid Build Coastguard Worker //   dst[127:64] := MEM[mem_addr+63:mem_addr]
4686*3f1979aaSAndroid Build Coastguard Worker //
4687*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load1_pd
_mm_load1_pd(const double * p)4688*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128d _mm_load1_pd(const double *p)
4689*3f1979aaSAndroid Build Coastguard Worker {
4690*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
4691*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128d_f64(vld1q_dup_f64(p));
4692*3f1979aaSAndroid Build Coastguard Worker #else
4693*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128d_s64(vdupq_n_s64(*(const int64_t *) p));
4694*3f1979aaSAndroid Build Coastguard Worker #endif
4695*3f1979aaSAndroid Build Coastguard Worker }
4696*3f1979aaSAndroid Build Coastguard Worker 
4697*3f1979aaSAndroid Build Coastguard Worker // Load a double-precision (64-bit) floating-point element from memory into the
4698*3f1979aaSAndroid Build Coastguard Worker // upper element of dst, and copy the lower element from a to dst. mem_addr does
4699*3f1979aaSAndroid Build Coastguard Worker // not need to be aligned on any particular boundary.
4700*3f1979aaSAndroid Build Coastguard Worker //
4701*3f1979aaSAndroid Build Coastguard Worker //   dst[63:0] := a[63:0]
4702*3f1979aaSAndroid Build Coastguard Worker //   dst[127:64] := MEM[mem_addr+63:mem_addr]
4703*3f1979aaSAndroid Build Coastguard Worker //
4704*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadh_pd
_mm_loadh_pd(__m128d a,const double * p)4705*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128d _mm_loadh_pd(__m128d a, const double *p)
4706*3f1979aaSAndroid Build Coastguard Worker {
4707*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
4708*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128d_f64(
4709*3f1979aaSAndroid Build Coastguard Worker         vcombine_f64(vget_low_f64(vreinterpretq_f64_m128d(a)), vld1_f64(p)));
4710*3f1979aaSAndroid Build Coastguard Worker #else
4711*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128d_f32(vcombine_f32(
4712*3f1979aaSAndroid Build Coastguard Worker         vget_low_f32(vreinterpretq_f32_m128d(a)), vld1_f32((const float *) p)));
4713*3f1979aaSAndroid Build Coastguard Worker #endif
4714*3f1979aaSAndroid Build Coastguard Worker }
4715*3f1979aaSAndroid Build Coastguard Worker 
// Alternate spelling of _mm_load1_pd: loads one double-precision (64-bit)
// floating-point element from memory into both elements of dst.
//
//   dst[63:0] := MEM[mem_addr+63:mem_addr]
//   dst[127:64] := MEM[mem_addr+63:mem_addr]
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd1
#define _mm_load_pd1 _mm_load1_pd
4724*3f1979aaSAndroid Build Coastguard Worker 
// Alternate spelling of _mm_load1_pd: loads one double-precision (64-bit)
// floating-point element from memory and duplicates it into both elements
// of dst.
//
//   dst[63:0] := MEM[mem_addr+63:mem_addr]
//   dst[127:64] := MEM[mem_addr+63:mem_addr]
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loaddup_pd
#define _mm_loaddup_pd _mm_load1_pd
4733*3f1979aaSAndroid Build Coastguard Worker 
4734*3f1979aaSAndroid Build Coastguard Worker // Loads 128-bit value. :
4735*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/zh-cn/library/f4k12ae8(v=vs.90).aspx
_mm_loadu_si128(const __m128i * p)4736*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p)
4737*3f1979aaSAndroid Build Coastguard Worker {
4738*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
4739*3f1979aaSAndroid Build Coastguard Worker }
4740*3f1979aaSAndroid Build Coastguard Worker 
4741*3f1979aaSAndroid Build Coastguard Worker // Load unaligned 32-bit integer from memory into the first element of dst.
4742*3f1979aaSAndroid Build Coastguard Worker //
4743*3f1979aaSAndroid Build Coastguard Worker //   dst[31:0] := MEM[mem_addr+31:mem_addr]
4744*3f1979aaSAndroid Build Coastguard Worker //   dst[MAX:32] := 0
4745*3f1979aaSAndroid Build Coastguard Worker //
4746*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si32
_mm_loadu_si32(const void * p)4747*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_loadu_si32(const void *p)
4748*3f1979aaSAndroid Build Coastguard Worker {
4749*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_s32(
4750*3f1979aaSAndroid Build Coastguard Worker         vsetq_lane_s32(*(const int32_t *) p, vdupq_n_s32(0), 0));
4751*3f1979aaSAndroid Build Coastguard Worker }
4752*3f1979aaSAndroid Build Coastguard Worker 
4753*3f1979aaSAndroid Build Coastguard Worker // Convert packed double-precision (64-bit) floating-point elements in a to
4754*3f1979aaSAndroid Build Coastguard Worker // packed single-precision (32-bit) floating-point elements, and store the
4755*3f1979aaSAndroid Build Coastguard Worker // results in dst.
4756*3f1979aaSAndroid Build Coastguard Worker //
4757*3f1979aaSAndroid Build Coastguard Worker //   FOR j := 0 to 1
4758*3f1979aaSAndroid Build Coastguard Worker //     i := 32*j
4759*3f1979aaSAndroid Build Coastguard Worker //     k := 64*j
4760*3f1979aaSAndroid Build Coastguard Worker //     dst[i+31:i] := Convert_FP64_To_FP32(a[k+64:k])
4761*3f1979aaSAndroid Build Coastguard Worker //   ENDFOR
4762*3f1979aaSAndroid Build Coastguard Worker //   dst[127:64] := 0
4763*3f1979aaSAndroid Build Coastguard Worker //
4764*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_ps
_mm_cvtpd_ps(__m128d a)4765*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a)
4766*3f1979aaSAndroid Build Coastguard Worker {
4767*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
4768*3f1979aaSAndroid Build Coastguard Worker     float32x2_t tmp = vcvt_f32_f64(vreinterpretq_f64_m128d(a));
4769*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128_f32(vcombine_f32(tmp, vdup_n_f32(0)));
4770*3f1979aaSAndroid Build Coastguard Worker #else
4771*3f1979aaSAndroid Build Coastguard Worker     float a0 = (float) ((double *) &a)[0];
4772*3f1979aaSAndroid Build Coastguard Worker     float a1 = (float) ((double *) &a)[1];
4773*3f1979aaSAndroid Build Coastguard Worker     return _mm_set_ps(0, 0, a1, a0);
4774*3f1979aaSAndroid Build Coastguard Worker #endif
4775*3f1979aaSAndroid Build Coastguard Worker }
4776*3f1979aaSAndroid Build Coastguard Worker 
4777*3f1979aaSAndroid Build Coastguard Worker // Copy the lower double-precision (64-bit) floating-point element of a to dst.
4778*3f1979aaSAndroid Build Coastguard Worker //
4779*3f1979aaSAndroid Build Coastguard Worker //   dst[63:0] := a[63:0]
4780*3f1979aaSAndroid Build Coastguard Worker //
4781*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_f64
_mm_cvtsd_f64(__m128d a)4782*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE double _mm_cvtsd_f64(__m128d a)
4783*3f1979aaSAndroid Build Coastguard Worker {
4784*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
4785*3f1979aaSAndroid Build Coastguard Worker     return (double) vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0);
4786*3f1979aaSAndroid Build Coastguard Worker #else
4787*3f1979aaSAndroid Build Coastguard Worker     return ((double *) &a)[0];
4788*3f1979aaSAndroid Build Coastguard Worker #endif
4789*3f1979aaSAndroid Build Coastguard Worker }
4790*3f1979aaSAndroid Build Coastguard Worker 
4791*3f1979aaSAndroid Build Coastguard Worker // Convert packed single-precision (32-bit) floating-point elements in a to
4792*3f1979aaSAndroid Build Coastguard Worker // packed double-precision (64-bit) floating-point elements, and store the
4793*3f1979aaSAndroid Build Coastguard Worker // results in dst.
4794*3f1979aaSAndroid Build Coastguard Worker //
4795*3f1979aaSAndroid Build Coastguard Worker //   FOR j := 0 to 1
4796*3f1979aaSAndroid Build Coastguard Worker //     i := 64*j
4797*3f1979aaSAndroid Build Coastguard Worker //     k := 32*j
4798*3f1979aaSAndroid Build Coastguard Worker //     dst[i+63:i] := Convert_FP32_To_FP64(a[k+31:k])
4799*3f1979aaSAndroid Build Coastguard Worker //   ENDFOR
4800*3f1979aaSAndroid Build Coastguard Worker //
4801*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pd
_mm_cvtps_pd(__m128 a)4802*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128d _mm_cvtps_pd(__m128 a)
4803*3f1979aaSAndroid Build Coastguard Worker {
4804*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
4805*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128d_f64(
4806*3f1979aaSAndroid Build Coastguard Worker         vcvt_f64_f32(vget_low_f32(vreinterpretq_f32_m128(a))));
4807*3f1979aaSAndroid Build Coastguard Worker #else
4808*3f1979aaSAndroid Build Coastguard Worker     double a0 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
4809*3f1979aaSAndroid Build Coastguard Worker     double a1 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 1);
4810*3f1979aaSAndroid Build Coastguard Worker     return _mm_set_pd(a1, a0);
4811*3f1979aaSAndroid Build Coastguard Worker #endif
4812*3f1979aaSAndroid Build Coastguard Worker }
4813*3f1979aaSAndroid Build Coastguard Worker 
4814*3f1979aaSAndroid Build Coastguard Worker // Cast vector of type __m128d to type __m128i. This intrinsic is only used for
4815*3f1979aaSAndroid Build Coastguard Worker // compilation and does not generate any instructions, thus it has zero latency.
4816*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castpd_si128
_mm_castpd_si128(__m128d a)4817*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_castpd_si128(__m128d a)
4818*3f1979aaSAndroid Build Coastguard Worker {
4819*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_s64(vreinterpretq_s64_m128d(a));
4820*3f1979aaSAndroid Build Coastguard Worker }
4821*3f1979aaSAndroid Build Coastguard Worker 
4822*3f1979aaSAndroid Build Coastguard Worker // Blend packed single-precision (32-bit) floating-point elements from a and b
4823*3f1979aaSAndroid Build Coastguard Worker // using mask, and store the results in dst.
4824*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_ps
_mm_blendv_ps(__m128 a,__m128 b,__m128 mask)4825*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_blendv_ps(__m128 a, __m128 b, __m128 mask)
4826*3f1979aaSAndroid Build Coastguard Worker {
4827*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128_f32(vbslq_f32(vreinterpretq_u32_m128(mask),
4828*3f1979aaSAndroid Build Coastguard Worker                                             vreinterpretq_f32_m128(b),
4829*3f1979aaSAndroid Build Coastguard Worker                                             vreinterpretq_f32_m128(a)));
4830*3f1979aaSAndroid Build Coastguard Worker }
4831*3f1979aaSAndroid Build Coastguard Worker 
4832*3f1979aaSAndroid Build Coastguard Worker // Round the packed single-precision (32-bit) floating-point elements in a using
4833*3f1979aaSAndroid Build Coastguard Worker // the rounding parameter, and store the results as packed single-precision
4834*3f1979aaSAndroid Build Coastguard Worker // floating-point elements in dst.
4835*3f1979aaSAndroid Build Coastguard Worker // software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ps
_mm_round_ps(__m128 a,int rounding)4836*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_round_ps(__m128 a, int rounding)
4837*3f1979aaSAndroid Build Coastguard Worker {
4838*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
4839*3f1979aaSAndroid Build Coastguard Worker     switch (rounding) {
4840*3f1979aaSAndroid Build Coastguard Worker     case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
4841*3f1979aaSAndroid Build Coastguard Worker         return vreinterpretq_m128_f32(vrndnq_f32(vreinterpretq_f32_m128(a)));
4842*3f1979aaSAndroid Build Coastguard Worker     case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC):
4843*3f1979aaSAndroid Build Coastguard Worker         return vreinterpretq_m128_f32(vrndmq_f32(vreinterpretq_f32_m128(a)));
4844*3f1979aaSAndroid Build Coastguard Worker     case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC):
4845*3f1979aaSAndroid Build Coastguard Worker         return vreinterpretq_m128_f32(vrndpq_f32(vreinterpretq_f32_m128(a)));
4846*3f1979aaSAndroid Build Coastguard Worker     case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC):
4847*3f1979aaSAndroid Build Coastguard Worker         return vreinterpretq_m128_f32(vrndq_f32(vreinterpretq_f32_m128(a)));
4848*3f1979aaSAndroid Build Coastguard Worker     default:  //_MM_FROUND_CUR_DIRECTION
4849*3f1979aaSAndroid Build Coastguard Worker         return vreinterpretq_m128_f32(vrndiq_f32(vreinterpretq_f32_m128(a)));
4850*3f1979aaSAndroid Build Coastguard Worker     }
4851*3f1979aaSAndroid Build Coastguard Worker #else
4852*3f1979aaSAndroid Build Coastguard Worker     float *v_float = (float *) &a;
4853*3f1979aaSAndroid Build Coastguard Worker     __m128 zero, neg_inf, pos_inf;
4854*3f1979aaSAndroid Build Coastguard Worker 
4855*3f1979aaSAndroid Build Coastguard Worker     switch (rounding) {
4856*3f1979aaSAndroid Build Coastguard Worker     case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
4857*3f1979aaSAndroid Build Coastguard Worker         return _mm_cvtepi32_ps(_mm_cvtps_epi32(a));
4858*3f1979aaSAndroid Build Coastguard Worker     case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC):
4859*3f1979aaSAndroid Build Coastguard Worker         return (__m128){floorf(v_float[0]), floorf(v_float[1]),
4860*3f1979aaSAndroid Build Coastguard Worker                         floorf(v_float[2]), floorf(v_float[3])};
4861*3f1979aaSAndroid Build Coastguard Worker     case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC):
4862*3f1979aaSAndroid Build Coastguard Worker         return (__m128){ceilf(v_float[0]), ceilf(v_float[1]), ceilf(v_float[2]),
4863*3f1979aaSAndroid Build Coastguard Worker                         ceilf(v_float[3])};
4864*3f1979aaSAndroid Build Coastguard Worker     case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC):
4865*3f1979aaSAndroid Build Coastguard Worker         zero = _mm_set_ps(0.0f, 0.0f, 0.0f, 0.0f);
4866*3f1979aaSAndroid Build Coastguard Worker         neg_inf = _mm_set_ps(floorf(v_float[0]), floorf(v_float[1]),
4867*3f1979aaSAndroid Build Coastguard Worker                              floorf(v_float[2]), floorf(v_float[3]));
4868*3f1979aaSAndroid Build Coastguard Worker         pos_inf = _mm_set_ps(ceilf(v_float[0]), ceilf(v_float[1]),
4869*3f1979aaSAndroid Build Coastguard Worker                              ceilf(v_float[2]), ceilf(v_float[3]));
4870*3f1979aaSAndroid Build Coastguard Worker         return _mm_blendv_ps(pos_inf, neg_inf, _mm_cmple_ps(a, zero));
4871*3f1979aaSAndroid Build Coastguard Worker     default:  //_MM_FROUND_CUR_DIRECTION
4872*3f1979aaSAndroid Build Coastguard Worker         return (__m128){roundf(v_float[0]), roundf(v_float[1]),
4873*3f1979aaSAndroid Build Coastguard Worker                         roundf(v_float[2]), roundf(v_float[3])};
4874*3f1979aaSAndroid Build Coastguard Worker     }
4875*3f1979aaSAndroid Build Coastguard Worker #endif
4876*3f1979aaSAndroid Build Coastguard Worker }
4877*3f1979aaSAndroid Build Coastguard Worker 
4878*3f1979aaSAndroid Build Coastguard Worker // Round the packed single-precision (32-bit) floating-point elements in a up to
4879*3f1979aaSAndroid Build Coastguard Worker // an integer value, and store the results as packed single-precision
4880*3f1979aaSAndroid Build Coastguard Worker // floating-point elements in dst.
4881*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_ps
_mm_ceil_ps(__m128 a)4882*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_ceil_ps(__m128 a)
4883*3f1979aaSAndroid Build Coastguard Worker {
4884*3f1979aaSAndroid Build Coastguard Worker     return _mm_round_ps(a, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
4885*3f1979aaSAndroid Build Coastguard Worker }
4886*3f1979aaSAndroid Build Coastguard Worker 
4887*3f1979aaSAndroid Build Coastguard Worker // Round the packed single-precision (32-bit) floating-point elements in a down
4888*3f1979aaSAndroid Build Coastguard Worker // to an integer value, and store the results as packed single-precision
4889*3f1979aaSAndroid Build Coastguard Worker // floating-point elements in dst.
4890*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ps
_mm_floor_ps(__m128 a)4891*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_floor_ps(__m128 a)
4892*3f1979aaSAndroid Build Coastguard Worker {
4893*3f1979aaSAndroid Build Coastguard Worker     return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
4894*3f1979aaSAndroid Build Coastguard Worker }
4895*3f1979aaSAndroid Build Coastguard Worker 
4896*3f1979aaSAndroid Build Coastguard Worker 
// Load 128 bits of integer data from memory that need not be aligned.
//
//   dst[127:0] := MEM[mem_addr+127:mem_addr]
//
// The SSE3 intrinsic is documented as possibly faster than _mm_loadu_si128 when
// the data crosses a cache line boundary; in this header it is simply an alias
// for _mm_loadu_si128.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_lddqu_si128
#define _mm_lddqu_si128 _mm_loadu_si128
4905*3f1979aaSAndroid Build Coastguard Worker 
4906*3f1979aaSAndroid Build Coastguard Worker /* Miscellaneous Operations */
4907*3f1979aaSAndroid Build Coastguard Worker 
4908*3f1979aaSAndroid Build Coastguard Worker // Shifts the 8 signed 16-bit integers in a right by count bits while shifting
4909*3f1979aaSAndroid Build Coastguard Worker // in the sign bit.
4910*3f1979aaSAndroid Build Coastguard Worker //
4911*3f1979aaSAndroid Build Coastguard Worker //   r0 := a0 >> count
4912*3f1979aaSAndroid Build Coastguard Worker //   r1 := a1 >> count
4913*3f1979aaSAndroid Build Coastguard Worker //   ...
4914*3f1979aaSAndroid Build Coastguard Worker //   r7 := a7 >> count
4915*3f1979aaSAndroid Build Coastguard Worker //
4916*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/3c9997dk(v%3dvs.90).aspx
_mm_sra_epi16(__m128i a,__m128i count)4917*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_sra_epi16(__m128i a, __m128i count)
4918*3f1979aaSAndroid Build Coastguard Worker {
4919*3f1979aaSAndroid Build Coastguard Worker     int64_t c = (int64_t) vget_low_s64((int64x2_t) count);
4920*3f1979aaSAndroid Build Coastguard Worker     if (c > 15)
4921*3f1979aaSAndroid Build Coastguard Worker         return _mm_cmplt_epi16(a, _mm_setzero_si128());
4922*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_s16(vshlq_s16((int16x8_t) a, vdupq_n_s16(-c)));
4923*3f1979aaSAndroid Build Coastguard Worker }
4924*3f1979aaSAndroid Build Coastguard Worker 
4925*3f1979aaSAndroid Build Coastguard Worker // Shifts the 4 signed 32-bit integers in a right by count bits while shifting
4926*3f1979aaSAndroid Build Coastguard Worker // in the sign bit.
4927*3f1979aaSAndroid Build Coastguard Worker //
4928*3f1979aaSAndroid Build Coastguard Worker //   r0 := a0 >> count
4929*3f1979aaSAndroid Build Coastguard Worker //   r1 := a1 >> count
4930*3f1979aaSAndroid Build Coastguard Worker //   r2 := a2 >> count
4931*3f1979aaSAndroid Build Coastguard Worker //   r3 := a3 >> count
4932*3f1979aaSAndroid Build Coastguard Worker //
4933*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/ce40009e(v%3dvs.100).aspx
_mm_sra_epi32(__m128i a,__m128i count)4934*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_sra_epi32(__m128i a, __m128i count)
4935*3f1979aaSAndroid Build Coastguard Worker {
4936*3f1979aaSAndroid Build Coastguard Worker     int64_t c = (int64_t) vget_low_s64((int64x2_t) count);
4937*3f1979aaSAndroid Build Coastguard Worker     if (c > 31)
4938*3f1979aaSAndroid Build Coastguard Worker         return _mm_cmplt_epi32(a, _mm_setzero_si128());
4939*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_s32(vshlq_s32((int32x4_t) a, vdupq_n_s32(-c)));
4940*3f1979aaSAndroid Build Coastguard Worker }
4941*3f1979aaSAndroid Build Coastguard Worker 
4942*3f1979aaSAndroid Build Coastguard Worker // Packs the 16 signed 16-bit integers from a and b into 8-bit integers and
4943*3f1979aaSAndroid Build Coastguard Worker // saturates.
4944*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/k4y4f7w5%28v=vs.90%29.aspx
_mm_packs_epi16(__m128i a,__m128i b)4945*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b)
4946*3f1979aaSAndroid Build Coastguard Worker {
4947*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_s8(
4948*3f1979aaSAndroid Build Coastguard Worker         vcombine_s8(vqmovn_s16(vreinterpretq_s16_m128i(a)),
4949*3f1979aaSAndroid Build Coastguard Worker                     vqmovn_s16(vreinterpretq_s16_m128i(b))));
4950*3f1979aaSAndroid Build Coastguard Worker }
4951*3f1979aaSAndroid Build Coastguard Worker 
4952*3f1979aaSAndroid Build Coastguard Worker // Packs the 16 signed 16 - bit integers from a and b into 8 - bit unsigned
4953*3f1979aaSAndroid Build Coastguard Worker // integers and saturates.
4954*3f1979aaSAndroid Build Coastguard Worker //
4955*3f1979aaSAndroid Build Coastguard Worker //   r0 := UnsignedSaturate(a0)
4956*3f1979aaSAndroid Build Coastguard Worker //   r1 := UnsignedSaturate(a1)
4957*3f1979aaSAndroid Build Coastguard Worker //   ...
4958*3f1979aaSAndroid Build Coastguard Worker //   r7 := UnsignedSaturate(a7)
4959*3f1979aaSAndroid Build Coastguard Worker //   r8 := UnsignedSaturate(b0)
4960*3f1979aaSAndroid Build Coastguard Worker //   r9 := UnsignedSaturate(b1)
4961*3f1979aaSAndroid Build Coastguard Worker //   ...
4962*3f1979aaSAndroid Build Coastguard Worker //   r15 := UnsignedSaturate(b7)
4963*3f1979aaSAndroid Build Coastguard Worker //
4964*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/07ad1wx4(v=vs.100).aspx
_mm_packus_epi16(const __m128i a,const __m128i b)4965*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b)
4966*3f1979aaSAndroid Build Coastguard Worker {
4967*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_u8(
4968*3f1979aaSAndroid Build Coastguard Worker         vcombine_u8(vqmovun_s16(vreinterpretq_s16_m128i(a)),
4969*3f1979aaSAndroid Build Coastguard Worker                     vqmovun_s16(vreinterpretq_s16_m128i(b))));
4970*3f1979aaSAndroid Build Coastguard Worker }
4971*3f1979aaSAndroid Build Coastguard Worker 
4972*3f1979aaSAndroid Build Coastguard Worker // Packs the 8 signed 32-bit integers from a and b into signed 16-bit integers
4973*3f1979aaSAndroid Build Coastguard Worker // and saturates.
4974*3f1979aaSAndroid Build Coastguard Worker //
4975*3f1979aaSAndroid Build Coastguard Worker //   r0 := SignedSaturate(a0)
4976*3f1979aaSAndroid Build Coastguard Worker //   r1 := SignedSaturate(a1)
4977*3f1979aaSAndroid Build Coastguard Worker //   r2 := SignedSaturate(a2)
4978*3f1979aaSAndroid Build Coastguard Worker //   r3 := SignedSaturate(a3)
4979*3f1979aaSAndroid Build Coastguard Worker //   r4 := SignedSaturate(b0)
4980*3f1979aaSAndroid Build Coastguard Worker //   r5 := SignedSaturate(b1)
4981*3f1979aaSAndroid Build Coastguard Worker //   r6 := SignedSaturate(b2)
4982*3f1979aaSAndroid Build Coastguard Worker //   r7 := SignedSaturate(b3)
4983*3f1979aaSAndroid Build Coastguard Worker //
4984*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/393t56f9%28v=vs.90%29.aspx
_mm_packs_epi32(__m128i a,__m128i b)4985*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b)
4986*3f1979aaSAndroid Build Coastguard Worker {
4987*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_s16(
4988*3f1979aaSAndroid Build Coastguard Worker         vcombine_s16(vqmovn_s32(vreinterpretq_s32_m128i(a)),
4989*3f1979aaSAndroid Build Coastguard Worker                      vqmovn_s32(vreinterpretq_s32_m128i(b))));
4990*3f1979aaSAndroid Build Coastguard Worker }
4991*3f1979aaSAndroid Build Coastguard Worker 
4992*3f1979aaSAndroid Build Coastguard Worker // Packs the 8 unsigned 32-bit integers from a and b into unsigned 16-bit
4993*3f1979aaSAndroid Build Coastguard Worker // integers and saturates.
4994*3f1979aaSAndroid Build Coastguard Worker //
4995*3f1979aaSAndroid Build Coastguard Worker //   r0 := UnsignedSaturate(a0)
4996*3f1979aaSAndroid Build Coastguard Worker //   r1 := UnsignedSaturate(a1)
4997*3f1979aaSAndroid Build Coastguard Worker //   r2 := UnsignedSaturate(a2)
4998*3f1979aaSAndroid Build Coastguard Worker //   r3 := UnsignedSaturate(a3)
4999*3f1979aaSAndroid Build Coastguard Worker //   r4 := UnsignedSaturate(b0)
5000*3f1979aaSAndroid Build Coastguard Worker //   r5 := UnsignedSaturate(b1)
5001*3f1979aaSAndroid Build Coastguard Worker //   r6 := UnsignedSaturate(b2)
5002*3f1979aaSAndroid Build Coastguard Worker //   r7 := UnsignedSaturate(b3)
_mm_packus_epi32(__m128i a,__m128i b)5003*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b)
5004*3f1979aaSAndroid Build Coastguard Worker {
5005*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_u16(
5006*3f1979aaSAndroid Build Coastguard Worker         vcombine_u16(vqmovun_s32(vreinterpretq_s32_m128i(a)),
5007*3f1979aaSAndroid Build Coastguard Worker                      vqmovun_s32(vreinterpretq_s32_m128i(b))));
5008*3f1979aaSAndroid Build Coastguard Worker }
5009*3f1979aaSAndroid Build Coastguard Worker 
5010*3f1979aaSAndroid Build Coastguard Worker // Interleaves the lower 8 signed or unsigned 8-bit integers in a with the lower
5011*3f1979aaSAndroid Build Coastguard Worker // 8 signed or unsigned 8-bit integers in b.
5012*3f1979aaSAndroid Build Coastguard Worker //
5013*3f1979aaSAndroid Build Coastguard Worker //   r0 := a0
5014*3f1979aaSAndroid Build Coastguard Worker //   r1 := b0
5015*3f1979aaSAndroid Build Coastguard Worker //   r2 := a1
5016*3f1979aaSAndroid Build Coastguard Worker //   r3 := b1
5017*3f1979aaSAndroid Build Coastguard Worker //   ...
5018*3f1979aaSAndroid Build Coastguard Worker //   r14 := a7
5019*3f1979aaSAndroid Build Coastguard Worker //   r15 := b7
5020*3f1979aaSAndroid Build Coastguard Worker //
5021*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/xf7k860c%28v=vs.90%29.aspx
_mm_unpacklo_epi8(__m128i a,__m128i b)5022*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b)
5023*3f1979aaSAndroid Build Coastguard Worker {
5024*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
5025*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_s8(
5026*3f1979aaSAndroid Build Coastguard Worker         vzip1q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
5027*3f1979aaSAndroid Build Coastguard Worker #else
5028*3f1979aaSAndroid Build Coastguard Worker     int8x8_t a1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(a)));
5029*3f1979aaSAndroid Build Coastguard Worker     int8x8_t b1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(b)));
5030*3f1979aaSAndroid Build Coastguard Worker     int8x8x2_t result = vzip_s8(a1, b1);
5031*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));
5032*3f1979aaSAndroid Build Coastguard Worker #endif
5033*3f1979aaSAndroid Build Coastguard Worker }
5034*3f1979aaSAndroid Build Coastguard Worker 
5035*3f1979aaSAndroid Build Coastguard Worker // Interleaves the lower 4 signed or unsigned 16-bit integers in a with the
5036*3f1979aaSAndroid Build Coastguard Worker // lower 4 signed or unsigned 16-bit integers in b.
5037*3f1979aaSAndroid Build Coastguard Worker //
5038*3f1979aaSAndroid Build Coastguard Worker //   r0 := a0
5039*3f1979aaSAndroid Build Coastguard Worker //   r1 := b0
5040*3f1979aaSAndroid Build Coastguard Worker //   r2 := a1
5041*3f1979aaSAndroid Build Coastguard Worker //   r3 := b1
5042*3f1979aaSAndroid Build Coastguard Worker //   r4 := a2
5043*3f1979aaSAndroid Build Coastguard Worker //   r5 := b2
5044*3f1979aaSAndroid Build Coastguard Worker //   r6 := a3
5045*3f1979aaSAndroid Build Coastguard Worker //   r7 := b3
5046*3f1979aaSAndroid Build Coastguard Worker //
5047*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/btxb17bw%28v=vs.90%29.aspx
_mm_unpacklo_epi16(__m128i a,__m128i b)5048*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b)
5049*3f1979aaSAndroid Build Coastguard Worker {
5050*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
5051*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_s16(
5052*3f1979aaSAndroid Build Coastguard Worker         vzip1q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
5053*3f1979aaSAndroid Build Coastguard Worker #else
5054*3f1979aaSAndroid Build Coastguard Worker     int16x4_t a1 = vget_low_s16(vreinterpretq_s16_m128i(a));
5055*3f1979aaSAndroid Build Coastguard Worker     int16x4_t b1 = vget_low_s16(vreinterpretq_s16_m128i(b));
5056*3f1979aaSAndroid Build Coastguard Worker     int16x4x2_t result = vzip_s16(a1, b1);
5057*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
5058*3f1979aaSAndroid Build Coastguard Worker #endif
5059*3f1979aaSAndroid Build Coastguard Worker }
5060*3f1979aaSAndroid Build Coastguard Worker 
5061*3f1979aaSAndroid Build Coastguard Worker // Interleaves the lower 2 signed or unsigned 32 - bit integers in a with the
5062*3f1979aaSAndroid Build Coastguard Worker // lower 2 signed or unsigned 32 - bit integers in b.
5063*3f1979aaSAndroid Build Coastguard Worker //
5064*3f1979aaSAndroid Build Coastguard Worker //   r0 := a0
5065*3f1979aaSAndroid Build Coastguard Worker //   r1 := b0
5066*3f1979aaSAndroid Build Coastguard Worker //   r2 := a1
5067*3f1979aaSAndroid Build Coastguard Worker //   r3 := b1
5068*3f1979aaSAndroid Build Coastguard Worker //
5069*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/x8atst9d(v=vs.100).aspx
_mm_unpacklo_epi32(__m128i a,__m128i b)5070*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b)
5071*3f1979aaSAndroid Build Coastguard Worker {
5072*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
5073*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_s32(
5074*3f1979aaSAndroid Build Coastguard Worker         vzip1q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
5075*3f1979aaSAndroid Build Coastguard Worker #else
5076*3f1979aaSAndroid Build Coastguard Worker     int32x2_t a1 = vget_low_s32(vreinterpretq_s32_m128i(a));
5077*3f1979aaSAndroid Build Coastguard Worker     int32x2_t b1 = vget_low_s32(vreinterpretq_s32_m128i(b));
5078*3f1979aaSAndroid Build Coastguard Worker     int32x2x2_t result = vzip_s32(a1, b1);
5079*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));
5080*3f1979aaSAndroid Build Coastguard Worker #endif
5081*3f1979aaSAndroid Build Coastguard Worker }
5082*3f1979aaSAndroid Build Coastguard Worker 
_mm_unpacklo_epi64(__m128i a,__m128i b)5083*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_unpacklo_epi64(__m128i a, __m128i b)
5084*3f1979aaSAndroid Build Coastguard Worker {
5085*3f1979aaSAndroid Build Coastguard Worker     int64x1_t a_l = vget_low_s64(vreinterpretq_s64_m128i(a));
5086*3f1979aaSAndroid Build Coastguard Worker     int64x1_t b_l = vget_low_s64(vreinterpretq_s64_m128i(b));
5087*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_s64(vcombine_s64(a_l, b_l));
5088*3f1979aaSAndroid Build Coastguard Worker }
5089*3f1979aaSAndroid Build Coastguard Worker 
5090*3f1979aaSAndroid Build Coastguard Worker // Selects and interleaves the lower two single-precision, floating-point values
5091*3f1979aaSAndroid Build Coastguard Worker // from a and b.
5092*3f1979aaSAndroid Build Coastguard Worker //
5093*3f1979aaSAndroid Build Coastguard Worker //   r0 := a0
5094*3f1979aaSAndroid Build Coastguard Worker //   r1 := b0
5095*3f1979aaSAndroid Build Coastguard Worker //   r2 := a1
5096*3f1979aaSAndroid Build Coastguard Worker //   r3 := b1
5097*3f1979aaSAndroid Build Coastguard Worker //
5098*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/25st103b%28v=vs.90%29.aspx
_mm_unpacklo_ps(__m128 a,__m128 b)5099*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b)
5100*3f1979aaSAndroid Build Coastguard Worker {
5101*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
5102*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128_f32(
5103*3f1979aaSAndroid Build Coastguard Worker         vzip1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
5104*3f1979aaSAndroid Build Coastguard Worker #else
5105*3f1979aaSAndroid Build Coastguard Worker     float32x2_t a1 = vget_low_f32(vreinterpretq_f32_m128(a));
5106*3f1979aaSAndroid Build Coastguard Worker     float32x2_t b1 = vget_low_f32(vreinterpretq_f32_m128(b));
5107*3f1979aaSAndroid Build Coastguard Worker     float32x2x2_t result = vzip_f32(a1, b1);
5108*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));
5109*3f1979aaSAndroid Build Coastguard Worker #endif
5110*3f1979aaSAndroid Build Coastguard Worker }
5111*3f1979aaSAndroid Build Coastguard Worker 
5112*3f1979aaSAndroid Build Coastguard Worker // Selects and interleaves the upper two single-precision, floating-point values
5113*3f1979aaSAndroid Build Coastguard Worker // from a and b.
5114*3f1979aaSAndroid Build Coastguard Worker //
5115*3f1979aaSAndroid Build Coastguard Worker //   r0 := a2
5116*3f1979aaSAndroid Build Coastguard Worker //   r1 := b2
5117*3f1979aaSAndroid Build Coastguard Worker //   r2 := a3
5118*3f1979aaSAndroid Build Coastguard Worker //   r3 := b3
5119*3f1979aaSAndroid Build Coastguard Worker //
5120*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/skccxx7d%28v=vs.90%29.aspx
_mm_unpackhi_ps(__m128 a,__m128 b)5121*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b)
5122*3f1979aaSAndroid Build Coastguard Worker {
5123*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
5124*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128_f32(
5125*3f1979aaSAndroid Build Coastguard Worker         vzip2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
5126*3f1979aaSAndroid Build Coastguard Worker #else
5127*3f1979aaSAndroid Build Coastguard Worker     float32x2_t a1 = vget_high_f32(vreinterpretq_f32_m128(a));
5128*3f1979aaSAndroid Build Coastguard Worker     float32x2_t b1 = vget_high_f32(vreinterpretq_f32_m128(b));
5129*3f1979aaSAndroid Build Coastguard Worker     float32x2x2_t result = vzip_f32(a1, b1);
5130*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));
5131*3f1979aaSAndroid Build Coastguard Worker #endif
5132*3f1979aaSAndroid Build Coastguard Worker }
5133*3f1979aaSAndroid Build Coastguard Worker 
5134*3f1979aaSAndroid Build Coastguard Worker // Interleaves the upper 8 signed or unsigned 8-bit integers in a with the upper
5135*3f1979aaSAndroid Build Coastguard Worker // 8 signed or unsigned 8-bit integers in b.
5136*3f1979aaSAndroid Build Coastguard Worker //
5137*3f1979aaSAndroid Build Coastguard Worker //   r0 := a8
5138*3f1979aaSAndroid Build Coastguard Worker //   r1 := b8
5139*3f1979aaSAndroid Build Coastguard Worker //   r2 := a9
5140*3f1979aaSAndroid Build Coastguard Worker //   r3 := b9
5141*3f1979aaSAndroid Build Coastguard Worker //   ...
5142*3f1979aaSAndroid Build Coastguard Worker //   r14 := a15
5143*3f1979aaSAndroid Build Coastguard Worker //   r15 := b15
5144*3f1979aaSAndroid Build Coastguard Worker //
5145*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/t5h7783k(v=vs.100).aspx
_mm_unpackhi_epi8(__m128i a,__m128i b)5146*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b)
5147*3f1979aaSAndroid Build Coastguard Worker {
5148*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
5149*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_s8(
5150*3f1979aaSAndroid Build Coastguard Worker         vzip2q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
5151*3f1979aaSAndroid Build Coastguard Worker #else
5152*3f1979aaSAndroid Build Coastguard Worker     int8x8_t a1 =
5153*3f1979aaSAndroid Build Coastguard Worker         vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(a)));
5154*3f1979aaSAndroid Build Coastguard Worker     int8x8_t b1 =
5155*3f1979aaSAndroid Build Coastguard Worker         vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(b)));
5156*3f1979aaSAndroid Build Coastguard Worker     int8x8x2_t result = vzip_s8(a1, b1);
5157*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));
5158*3f1979aaSAndroid Build Coastguard Worker #endif
5159*3f1979aaSAndroid Build Coastguard Worker }
5160*3f1979aaSAndroid Build Coastguard Worker 
5161*3f1979aaSAndroid Build Coastguard Worker // Interleaves the upper 4 signed or unsigned 16-bit integers in a with the
5162*3f1979aaSAndroid Build Coastguard Worker // upper 4 signed or unsigned 16-bit integers in b.
5163*3f1979aaSAndroid Build Coastguard Worker //
5164*3f1979aaSAndroid Build Coastguard Worker //   r0 := a4
5165*3f1979aaSAndroid Build Coastguard Worker //   r1 := b4
5166*3f1979aaSAndroid Build Coastguard Worker //   r2 := a5
5167*3f1979aaSAndroid Build Coastguard Worker //   r3 := b5
5168*3f1979aaSAndroid Build Coastguard Worker //   r4 := a6
5169*3f1979aaSAndroid Build Coastguard Worker //   r5 := b6
5170*3f1979aaSAndroid Build Coastguard Worker //   r6 := a7
5171*3f1979aaSAndroid Build Coastguard Worker //   r7 := b7
5172*3f1979aaSAndroid Build Coastguard Worker //
5173*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/03196cz7(v=vs.100).aspx
_mm_unpackhi_epi16(__m128i a,__m128i b)5174*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b)
5175*3f1979aaSAndroid Build Coastguard Worker {
5176*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
5177*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_s16(
5178*3f1979aaSAndroid Build Coastguard Worker         vzip2q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
5179*3f1979aaSAndroid Build Coastguard Worker #else
5180*3f1979aaSAndroid Build Coastguard Worker     int16x4_t a1 = vget_high_s16(vreinterpretq_s16_m128i(a));
5181*3f1979aaSAndroid Build Coastguard Worker     int16x4_t b1 = vget_high_s16(vreinterpretq_s16_m128i(b));
5182*3f1979aaSAndroid Build Coastguard Worker     int16x4x2_t result = vzip_s16(a1, b1);
5183*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
5184*3f1979aaSAndroid Build Coastguard Worker #endif
5185*3f1979aaSAndroid Build Coastguard Worker }
5186*3f1979aaSAndroid Build Coastguard Worker 
5187*3f1979aaSAndroid Build Coastguard Worker // Interleaves the upper 2 signed or unsigned 32-bit integers in a with the
5188*3f1979aaSAndroid Build Coastguard Worker // upper 2 signed or unsigned 32-bit integers in b.
5189*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/65sa7cbs(v=vs.100).aspx
_mm_unpackhi_epi32(__m128i a,__m128i b)5190*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b)
5191*3f1979aaSAndroid Build Coastguard Worker {
5192*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
5193*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_s32(
5194*3f1979aaSAndroid Build Coastguard Worker         vzip2q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
5195*3f1979aaSAndroid Build Coastguard Worker #else
5196*3f1979aaSAndroid Build Coastguard Worker     int32x2_t a1 = vget_high_s32(vreinterpretq_s32_m128i(a));
5197*3f1979aaSAndroid Build Coastguard Worker     int32x2_t b1 = vget_high_s32(vreinterpretq_s32_m128i(b));
5198*3f1979aaSAndroid Build Coastguard Worker     int32x2x2_t result = vzip_s32(a1, b1);
5199*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));
5200*3f1979aaSAndroid Build Coastguard Worker #endif
5201*3f1979aaSAndroid Build Coastguard Worker }
5202*3f1979aaSAndroid Build Coastguard Worker 
5203*3f1979aaSAndroid Build Coastguard Worker // Interleaves the upper signed or unsigned 64-bit integer in a with the
5204*3f1979aaSAndroid Build Coastguard Worker // upper signed or unsigned 64-bit integer in b.
5205*3f1979aaSAndroid Build Coastguard Worker //
5206*3f1979aaSAndroid Build Coastguard Worker //   r0 := a1
5207*3f1979aaSAndroid Build Coastguard Worker //   r1 := b1
_mm_unpackhi_epi64(__m128i a,__m128i b)5208*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_unpackhi_epi64(__m128i a, __m128i b)
5209*3f1979aaSAndroid Build Coastguard Worker {
5210*3f1979aaSAndroid Build Coastguard Worker     int64x1_t a_h = vget_high_s64(vreinterpretq_s64_m128i(a));
5211*3f1979aaSAndroid Build Coastguard Worker     int64x1_t b_h = vget_high_s64(vreinterpretq_s64_m128i(b));
5212*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_s64(vcombine_s64(a_h, b_h));
5213*3f1979aaSAndroid Build Coastguard Worker }
5214*3f1979aaSAndroid Build Coastguard Worker 
5215*3f1979aaSAndroid Build Coastguard Worker // Horizontally compute the minimum amongst the packed unsigned 16-bit integers
5216*3f1979aaSAndroid Build Coastguard Worker // in a, store the minimum and index in dst, and zero the remaining bits in dst.
5217*3f1979aaSAndroid Build Coastguard Worker //
5218*3f1979aaSAndroid Build Coastguard Worker //   index[2:0] := 0
5219*3f1979aaSAndroid Build Coastguard Worker //   min[15:0] := a[15:0]
5220*3f1979aaSAndroid Build Coastguard Worker //   FOR j := 0 to 7
5221*3f1979aaSAndroid Build Coastguard Worker //       i := j*16
5222*3f1979aaSAndroid Build Coastguard Worker //       IF a[i+15:i] < min[15:0]
5223*3f1979aaSAndroid Build Coastguard Worker //           index[2:0] := j
5224*3f1979aaSAndroid Build Coastguard Worker //           min[15:0] := a[i+15:i]
5225*3f1979aaSAndroid Build Coastguard Worker //       FI
5226*3f1979aaSAndroid Build Coastguard Worker //   ENDFOR
5227*3f1979aaSAndroid Build Coastguard Worker //   dst[15:0] := min[15:0]
5228*3f1979aaSAndroid Build Coastguard Worker //   dst[18:16] := index[2:0]
5229*3f1979aaSAndroid Build Coastguard Worker //   dst[127:19] := 0
5230*3f1979aaSAndroid Build Coastguard Worker //
5231*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_minpos_epu16
_mm_minpos_epu16(__m128i a)5232*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_minpos_epu16(__m128i a)
5233*3f1979aaSAndroid Build Coastguard Worker {
5234*3f1979aaSAndroid Build Coastguard Worker     __m128i dst;
5235*3f1979aaSAndroid Build Coastguard Worker     uint16_t min, idx = 0;
5236*3f1979aaSAndroid Build Coastguard Worker     // Find the minimum value
5237*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
5238*3f1979aaSAndroid Build Coastguard Worker     min = vminvq_u16(vreinterpretq_u16_m128i(a));
5239*3f1979aaSAndroid Build Coastguard Worker #else
5240*3f1979aaSAndroid Build Coastguard Worker     __m64 tmp;
5241*3f1979aaSAndroid Build Coastguard Worker     tmp = vreinterpret_m64_u16(
5242*3f1979aaSAndroid Build Coastguard Worker         vmin_u16(vget_low_u16(vreinterpretq_u16_m128i(a)),
5243*3f1979aaSAndroid Build Coastguard Worker                  vget_high_u16(vreinterpretq_u16_m128i(a))));
5244*3f1979aaSAndroid Build Coastguard Worker     tmp = vreinterpret_m64_u16(
5245*3f1979aaSAndroid Build Coastguard Worker         vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp)));
5246*3f1979aaSAndroid Build Coastguard Worker     tmp = vreinterpret_m64_u16(
5247*3f1979aaSAndroid Build Coastguard Worker         vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp)));
5248*3f1979aaSAndroid Build Coastguard Worker     min = vget_lane_u16(vreinterpret_u16_m64(tmp), 0);
5249*3f1979aaSAndroid Build Coastguard Worker #endif
5250*3f1979aaSAndroid Build Coastguard Worker     // Get the index of the minimum value
5251*3f1979aaSAndroid Build Coastguard Worker     int i;
5252*3f1979aaSAndroid Build Coastguard Worker     for (i = 0; i < 8; i++) {
5253*3f1979aaSAndroid Build Coastguard Worker         if (min == vgetq_lane_u16(vreinterpretq_u16_m128i(a), 0)) {
5254*3f1979aaSAndroid Build Coastguard Worker             idx = (uint16_t) i;
5255*3f1979aaSAndroid Build Coastguard Worker             break;
5256*3f1979aaSAndroid Build Coastguard Worker         }
5257*3f1979aaSAndroid Build Coastguard Worker         a = _mm_srli_si128(a, 2);
5258*3f1979aaSAndroid Build Coastguard Worker     }
5259*3f1979aaSAndroid Build Coastguard Worker     // Generate result
5260*3f1979aaSAndroid Build Coastguard Worker     dst = _mm_setzero_si128();
5261*3f1979aaSAndroid Build Coastguard Worker     dst = vreinterpretq_m128i_u16(
5262*3f1979aaSAndroid Build Coastguard Worker         vsetq_lane_u16(min, vreinterpretq_u16_m128i(dst), 0));
5263*3f1979aaSAndroid Build Coastguard Worker     dst = vreinterpretq_m128i_u16(
5264*3f1979aaSAndroid Build Coastguard Worker         vsetq_lane_u16(idx, vreinterpretq_u16_m128i(dst), 1));
5265*3f1979aaSAndroid Build Coastguard Worker     return dst;
5266*3f1979aaSAndroid Build Coastguard Worker }
5267*3f1979aaSAndroid Build Coastguard Worker 
// Treat a as the upper and b as the lower 16 bytes of a 32-byte value, shift
// that value right by imm bytes, and return the low 16 bytes.
// https://msdn.microsoft.com/en-us/library/bb514041(v=vs.120).aspx
// http://blog.csdn.net/hemmingway/article/details/44828303
// Implemented as a macro because Clang is extremely picky about the byte
// count being a literal.
// NOTE(review): imm is forwarded unchanged to vextq_s8, so only the offsets
// that intrinsic accepts are handled here -- confirm no caller relies on
// larger counts.
#define _mm_alignr_epi8(a, b, imm) \
    ((__m128i) vextq_s8((int8x16_t)(b), (int8x16_t)(a), (imm)))
5275*3f1979aaSAndroid Build Coastguard Worker 
5276*3f1979aaSAndroid Build Coastguard Worker // Compute the bitwise AND of 128 bits (representing integer data) in a and b,
5277*3f1979aaSAndroid Build Coastguard Worker // and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
5278*3f1979aaSAndroid Build Coastguard Worker // bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
5279*3f1979aaSAndroid Build Coastguard Worker // otherwise set CF to 0. Return the CF value.
5280*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testc_si128
_mm_testc_si128(__m128i a,__m128i b)5281*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE int _mm_testc_si128(__m128i a, __m128i b)
5282*3f1979aaSAndroid Build Coastguard Worker {
5283*3f1979aaSAndroid Build Coastguard Worker     int64x2_t s64 =
5284*3f1979aaSAndroid Build Coastguard Worker         vandq_s64(vreinterpretq_s64_s32(vmvnq_s32(vreinterpretq_s32_m128i(a))),
5285*3f1979aaSAndroid Build Coastguard Worker                   vreinterpretq_s64_m128i(b));
5286*3f1979aaSAndroid Build Coastguard Worker     return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
5287*3f1979aaSAndroid Build Coastguard Worker }
5288*3f1979aaSAndroid Build Coastguard Worker 
5289*3f1979aaSAndroid Build Coastguard Worker // Compute the bitwise AND of 128 bits (representing integer data) in a and b,
5290*3f1979aaSAndroid Build Coastguard Worker // and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
5291*3f1979aaSAndroid Build Coastguard Worker // bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
5292*3f1979aaSAndroid Build Coastguard Worker // otherwise set CF to 0. Return the ZF value.
5293*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testz_si128
_mm_testz_si128(__m128i a,__m128i b)5294*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE int _mm_testz_si128(__m128i a, __m128i b)
5295*3f1979aaSAndroid Build Coastguard Worker {
5296*3f1979aaSAndroid Build Coastguard Worker     int64x2_t s64 =
5297*3f1979aaSAndroid Build Coastguard Worker         vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b));
5298*3f1979aaSAndroid Build Coastguard Worker     return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
5299*3f1979aaSAndroid Build Coastguard Worker }
5300*3f1979aaSAndroid Build Coastguard Worker 
// Read the 8-bit lane of a selected by imm. The lane is fetched as an
// unsigned byte, so widening the result to int zero-extends it.
// FORCE_INLINE int _mm_extract_epi8(__m128i a, __constrange(0,16) int imm)
#define _mm_extract_epi8(a, imm) \
    (vgetq_lane_u8(vreinterpretq_u8_m128i(a), (imm)))
5305*3f1979aaSAndroid Build Coastguard Worker 
// Return a copy of a in which the 8-bit lane selected by imm has been
// replaced with the least significant 8 bits of b; all other lanes are
// unchanged.
// FORCE_INLINE __m128i _mm_insert_epi8(__m128i a, int b,
//                                      __constrange(0,16) int imm)
#define _mm_insert_epi8(a, b, imm) \
    (vreinterpretq_m128i_s8(       \
        vsetq_lane_s8((b), vreinterpretq_s8_m128i(a), (imm))))
5315*3f1979aaSAndroid Build Coastguard Worker 
// Read the 16-bit lane of a selected by imm. The lane is fetched as an
// unsigned halfword, so widening the result to int zero-extends it.
// https://msdn.microsoft.com/en-us/library/6dceta0c(v=vs.100).aspx
// FORCE_INLINE int _mm_extract_epi16(__m128i a, __constrange(0,8) int imm)
#define _mm_extract_epi16(a, imm) \
    (vgetq_lane_u16(vreinterpretq_u16_m128i(a), (imm)))
5322*3f1979aaSAndroid Build Coastguard Worker 
// Return a copy of a in which the 16-bit lane selected by imm has been
// replaced with the least significant 16 bits of b; all other lanes are
// unchanged.
// https://msdn.microsoft.com/en-us/library/kaze8hz1%28v=vs.100%29.aspx
// FORCE_INLINE __m128i _mm_insert_epi16(__m128i a, int b,
//                                       __constrange(0,8) int imm)
#define _mm_insert_epi16(a, b, imm) \
    (vreinterpretq_m128i_s16(       \
        vsetq_lane_s16((b), vreinterpretq_s16_m128i(a), (imm))))
5333*3f1979aaSAndroid Build Coastguard Worker 
// Read the 32-bit lane of a selected by imm as a signed 32-bit integer.
// FORCE_INLINE int _mm_extract_epi32(__m128i a, __constrange(0,4) int imm)
#define _mm_extract_epi32(a, imm) \
    (vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm)))
5339*3f1979aaSAndroid Build Coastguard Worker 
// Return the raw 32-bit pattern of the single-precision lane of a selected
// by imm. The vector is reinterpreted as integers before the lane is read,
// so no float-to-int value conversion takes place.
// FORCE_INLINE int _mm_extract_ps(__m128 a, __constrange(0,4) int imm)
#define _mm_extract_ps(a, imm) \
    (vgetq_lane_s32(vreinterpretq_s32_m128(a), (imm)))
5343*3f1979aaSAndroid Build Coastguard Worker 
// Return a copy of a in which the 32-bit lane selected by imm has been
// replaced with the least significant 32 bits of b; all other lanes are
// unchanged.
// FORCE_INLINE __m128i _mm_insert_epi32(__m128i a, int b,
//                                       __constrange(0,4) int imm)
#define _mm_insert_epi32(a, b, imm) \
    (vreinterpretq_m128i_s32(       \
        vsetq_lane_s32((b), vreinterpretq_s32_m128i(a), (imm))))
5353*3f1979aaSAndroid Build Coastguard Worker 
// Read the 64-bit lane of a selected by imm as a signed 64-bit integer.
// FORCE_INLINE __int64 _mm_extract_epi64(__m128i a, __constrange(0,2) int imm)
#define _mm_extract_epi64(a, imm) \
    (vgetq_lane_s64(vreinterpretq_s64_m128i(a), (imm)))
5359*3f1979aaSAndroid Build Coastguard Worker 
// Return a copy of a in which the 64-bit lane selected by imm has been
// replaced with the least significant 64 bits of b; the other lane is
// unchanged.
// FORCE_INLINE __m128i _mm_insert_epi64(__m128i a, __int64 b,
//                                       __constrange(0,2) int imm)
#define _mm_insert_epi64(a, b, imm) \
    (vreinterpretq_m128i_s64(       \
        vsetq_lane_s64((b), vreinterpretq_s64_m128i(a), (imm))))
5369*3f1979aaSAndroid Build Coastguard Worker 
5370*3f1979aaSAndroid Build Coastguard Worker // Count the number of bits set to 1 in unsigned 32-bit integer a, and
5371*3f1979aaSAndroid Build Coastguard Worker // return that count in dst.
5372*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u32
_mm_popcnt_u32(unsigned int a)5373*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE int _mm_popcnt_u32(unsigned int a)
5374*3f1979aaSAndroid Build Coastguard Worker {
5375*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
5376*3f1979aaSAndroid Build Coastguard Worker #if __has_builtin(__builtin_popcount)
5377*3f1979aaSAndroid Build Coastguard Worker     return __builtin_popcount(a);
5378*3f1979aaSAndroid Build Coastguard Worker #else
5379*3f1979aaSAndroid Build Coastguard Worker     return (int) vaddlv_u8(vcnt_u8(vcreate_u8((uint64_t) a)));
5380*3f1979aaSAndroid Build Coastguard Worker #endif
5381*3f1979aaSAndroid Build Coastguard Worker #else
5382*3f1979aaSAndroid Build Coastguard Worker     uint32_t count = 0;
5383*3f1979aaSAndroid Build Coastguard Worker     uint8x8_t input_val, count8x8_val;
5384*3f1979aaSAndroid Build Coastguard Worker     uint16x4_t count16x4_val;
5385*3f1979aaSAndroid Build Coastguard Worker     uint32x2_t count32x2_val;
5386*3f1979aaSAndroid Build Coastguard Worker 
5387*3f1979aaSAndroid Build Coastguard Worker     input_val = vld1_u8((uint8_t *) &a);
5388*3f1979aaSAndroid Build Coastguard Worker     count8x8_val = vcnt_u8(input_val);
5389*3f1979aaSAndroid Build Coastguard Worker     count16x4_val = vpaddl_u8(count8x8_val);
5390*3f1979aaSAndroid Build Coastguard Worker     count32x2_val = vpaddl_u16(count16x4_val);
5391*3f1979aaSAndroid Build Coastguard Worker 
5392*3f1979aaSAndroid Build Coastguard Worker     vst1_u32(&count, count32x2_val);
5393*3f1979aaSAndroid Build Coastguard Worker     return count;
5394*3f1979aaSAndroid Build Coastguard Worker #endif
5395*3f1979aaSAndroid Build Coastguard Worker }
5396*3f1979aaSAndroid Build Coastguard Worker 
5397*3f1979aaSAndroid Build Coastguard Worker // Count the number of bits set to 1 in unsigned 64-bit integer a, and
5398*3f1979aaSAndroid Build Coastguard Worker // return that count in dst.
5399*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u64
_mm_popcnt_u64(uint64_t a)5400*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a)
5401*3f1979aaSAndroid Build Coastguard Worker {
5402*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
5403*3f1979aaSAndroid Build Coastguard Worker #if __has_builtin(__builtin_popcountll)
5404*3f1979aaSAndroid Build Coastguard Worker     return __builtin_popcountll(a);
5405*3f1979aaSAndroid Build Coastguard Worker #else
5406*3f1979aaSAndroid Build Coastguard Worker     return (int64_t) vaddlv_u8(vcnt_u8(vcreate_u8(a)));
5407*3f1979aaSAndroid Build Coastguard Worker #endif
5408*3f1979aaSAndroid Build Coastguard Worker #else
5409*3f1979aaSAndroid Build Coastguard Worker     uint64_t count = 0;
5410*3f1979aaSAndroid Build Coastguard Worker     uint8x8_t input_val, count8x8_val;
5411*3f1979aaSAndroid Build Coastguard Worker     uint16x4_t count16x4_val;
5412*3f1979aaSAndroid Build Coastguard Worker     uint32x2_t count32x2_val;
5413*3f1979aaSAndroid Build Coastguard Worker     uint64x1_t count64x1_val;
5414*3f1979aaSAndroid Build Coastguard Worker 
5415*3f1979aaSAndroid Build Coastguard Worker     input_val = vld1_u8((uint8_t *) &a);
5416*3f1979aaSAndroid Build Coastguard Worker     count8x8_val = vcnt_u8(input_val);
5417*3f1979aaSAndroid Build Coastguard Worker     count16x4_val = vpaddl_u8(count8x8_val);
5418*3f1979aaSAndroid Build Coastguard Worker     count32x2_val = vpaddl_u16(count16x4_val);
5419*3f1979aaSAndroid Build Coastguard Worker     count64x1_val = vpaddl_u32(count32x2_val);
5420*3f1979aaSAndroid Build Coastguard Worker     vst1_u64(&count, count64x1_val);
5421*3f1979aaSAndroid Build Coastguard Worker     return count;
5422*3f1979aaSAndroid Build Coastguard Worker #endif
5423*3f1979aaSAndroid Build Coastguard Worker }
5424*3f1979aaSAndroid Build Coastguard Worker 
// Macro: transpose, in place, the 4x4 single-precision matrix whose rows are
// row0..row3. On exit row0 holds what was column 0, row1 column 1, and so on.
// All four arguments must be assignable vectors.
//
// vtrnq_f32 transposes each 2x2 sub-block of a row pair; recombining the
// matching low/high halves of the two results yields the full columns.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=MM_TRANSPOSE4_PS
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3)                       \
    do {                                                                \
        float32x4x2_t _sse2neon_trn01 = vtrnq_f32(row0, row1);          \
        float32x4x2_t _sse2neon_trn23 = vtrnq_f32(row2, row3);          \
        row0 = vcombine_f32(vget_low_f32(_sse2neon_trn01.val[0]),       \
                            vget_low_f32(_sse2neon_trn23.val[0]));      \
        row1 = vcombine_f32(vget_low_f32(_sse2neon_trn01.val[1]),       \
                            vget_low_f32(_sse2neon_trn23.val[1]));      \
        row2 = vcombine_f32(vget_high_f32(_sse2neon_trn01.val[0]),      \
                            vget_high_f32(_sse2neon_trn23.val[0]));     \
        row3 = vcombine_f32(vget_high_f32(_sse2neon_trn01.val[1]),      \
                            vget_high_f32(_sse2neon_trn23.val[1]));     \
    } while (0)
5442*3f1979aaSAndroid Build Coastguard Worker 
5443*3f1979aaSAndroid Build Coastguard Worker /* Crypto Extensions */
5444*3f1979aaSAndroid Build Coastguard Worker 
5445*3f1979aaSAndroid Build Coastguard Worker #if defined(__ARM_FEATURE_CRYPTO)
5446*3f1979aaSAndroid Build Coastguard Worker // Wraps vmull_p64
_sse2neon_vmull_p64(uint64x1_t _a,uint64x1_t _b)5447*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
5448*3f1979aaSAndroid Build Coastguard Worker {
5449*3f1979aaSAndroid Build Coastguard Worker     poly64_t a = vget_lane_p64(vreinterpret_p64_u64(_a), 0);
5450*3f1979aaSAndroid Build Coastguard Worker     poly64_t b = vget_lane_p64(vreinterpret_p64_u64(_b), 0);
5451*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_u64_p128(vmull_p64(a, b));
5452*3f1979aaSAndroid Build Coastguard Worker }
5453*3f1979aaSAndroid Build Coastguard Worker #else  // ARMv7 polyfill
5454*3f1979aaSAndroid Build Coastguard Worker // ARMv7/some A64 lacks vmull_p64, but it has vmull_p8.
5455*3f1979aaSAndroid Build Coastguard Worker //
5456*3f1979aaSAndroid Build Coastguard Worker // vmull_p8 calculates 8 8-bit->16-bit polynomial multiplies, but we need a
5457*3f1979aaSAndroid Build Coastguard Worker // 64-bit->128-bit polynomial multiply.
5458*3f1979aaSAndroid Build Coastguard Worker //
5459*3f1979aaSAndroid Build Coastguard Worker // It needs some work and is somewhat slow, but it is still faster than all
5460*3f1979aaSAndroid Build Coastguard Worker // known scalar methods.
5461*3f1979aaSAndroid Build Coastguard Worker //
5462*3f1979aaSAndroid Build Coastguard Worker // Algorithm adapted to C from
5463*3f1979aaSAndroid Build Coastguard Worker // https://www.workofard.com/2017/07/ghash-for-low-end-cores/, which is adapted
5464*3f1979aaSAndroid Build Coastguard Worker // from "Fast Software Polynomial Multiplication on ARM Processors Using the
5465*3f1979aaSAndroid Build Coastguard Worker // NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and Ricardo Dahab
5466*3f1979aaSAndroid Build Coastguard Worker // (https://hal.inria.fr/hal-01506572)
_sse2neon_vmull_p64(uint64x1_t _a,uint64x1_t _b)5467*3f1979aaSAndroid Build Coastguard Worker static uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
5468*3f1979aaSAndroid Build Coastguard Worker {
5469*3f1979aaSAndroid Build Coastguard Worker     poly8x8_t a = vreinterpret_p8_u64(_a);
5470*3f1979aaSAndroid Build Coastguard Worker     poly8x8_t b = vreinterpret_p8_u64(_b);
5471*3f1979aaSAndroid Build Coastguard Worker 
5472*3f1979aaSAndroid Build Coastguard Worker     // Masks
5473*3f1979aaSAndroid Build Coastguard Worker     uint8x16_t k48_32 = vcombine_u8(vcreate_u8(0x0000ffffffffffff),
5474*3f1979aaSAndroid Build Coastguard Worker                                     vcreate_u8(0x00000000ffffffff));
5475*3f1979aaSAndroid Build Coastguard Worker     uint8x16_t k16_00 = vcombine_u8(vcreate_u8(0x000000000000ffff),
5476*3f1979aaSAndroid Build Coastguard Worker                                     vcreate_u8(0x0000000000000000));
5477*3f1979aaSAndroid Build Coastguard Worker 
5478*3f1979aaSAndroid Build Coastguard Worker     // Do the multiplies, rotating with vext to get all combinations
5479*3f1979aaSAndroid Build Coastguard Worker     uint8x16_t d = vreinterpretq_u8_p16(vmull_p8(a, b));  // D = A0 * B0
5480*3f1979aaSAndroid Build Coastguard Worker     uint8x16_t e =
5481*3f1979aaSAndroid Build Coastguard Worker         vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 1)));  // E = A0 * B1
5482*3f1979aaSAndroid Build Coastguard Worker     uint8x16_t f =
5483*3f1979aaSAndroid Build Coastguard Worker         vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 1), b));  // F = A1 * B0
5484*3f1979aaSAndroid Build Coastguard Worker     uint8x16_t g =
5485*3f1979aaSAndroid Build Coastguard Worker         vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 2)));  // G = A0 * B2
5486*3f1979aaSAndroid Build Coastguard Worker     uint8x16_t h =
5487*3f1979aaSAndroid Build Coastguard Worker         vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 2), b));  // H = A2 * B0
5488*3f1979aaSAndroid Build Coastguard Worker     uint8x16_t i =
5489*3f1979aaSAndroid Build Coastguard Worker         vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 3)));  // I = A0 * B3
5490*3f1979aaSAndroid Build Coastguard Worker     uint8x16_t j =
5491*3f1979aaSAndroid Build Coastguard Worker         vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 3), b));  // J = A3 * B0
5492*3f1979aaSAndroid Build Coastguard Worker     uint8x16_t k =
5493*3f1979aaSAndroid Build Coastguard Worker         vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 4)));  // L = A0 * B4
5494*3f1979aaSAndroid Build Coastguard Worker 
5495*3f1979aaSAndroid Build Coastguard Worker     // Add cross products
5496*3f1979aaSAndroid Build Coastguard Worker     uint8x16_t l = veorq_u8(e, f);  // L = E + F
5497*3f1979aaSAndroid Build Coastguard Worker     uint8x16_t m = veorq_u8(g, h);  // M = G + H
5498*3f1979aaSAndroid Build Coastguard Worker     uint8x16_t n = veorq_u8(i, j);  // N = I + J
5499*3f1979aaSAndroid Build Coastguard Worker 
5500*3f1979aaSAndroid Build Coastguard Worker     // Interleave. Using vzip1 and vzip2 prevents Clang from emitting TBL
5501*3f1979aaSAndroid Build Coastguard Worker     // instructions.
5502*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
5503*3f1979aaSAndroid Build Coastguard Worker     uint8x16_t lm_p0 = vreinterpretq_u8_u64(
5504*3f1979aaSAndroid Build Coastguard Worker         vzip1q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
5505*3f1979aaSAndroid Build Coastguard Worker     uint8x16_t lm_p1 = vreinterpretq_u8_u64(
5506*3f1979aaSAndroid Build Coastguard Worker         vzip2q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
5507*3f1979aaSAndroid Build Coastguard Worker     uint8x16_t nk_p0 = vreinterpretq_u8_u64(
5508*3f1979aaSAndroid Build Coastguard Worker         vzip1q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
5509*3f1979aaSAndroid Build Coastguard Worker     uint8x16_t nk_p1 = vreinterpretq_u8_u64(
5510*3f1979aaSAndroid Build Coastguard Worker         vzip2q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
5511*3f1979aaSAndroid Build Coastguard Worker #else
5512*3f1979aaSAndroid Build Coastguard Worker     uint8x16_t lm_p0 = vcombine_u8(vget_low_u8(l), vget_low_u8(m));
5513*3f1979aaSAndroid Build Coastguard Worker     uint8x16_t lm_p1 = vcombine_u8(vget_high_u8(l), vget_high_u8(m));
5514*3f1979aaSAndroid Build Coastguard Worker     uint8x16_t nk_p0 = vcombine_u8(vget_low_u8(n), vget_low_u8(k));
5515*3f1979aaSAndroid Build Coastguard Worker     uint8x16_t nk_p1 = vcombine_u8(vget_high_u8(n), vget_high_u8(k));
5516*3f1979aaSAndroid Build Coastguard Worker #endif
5517*3f1979aaSAndroid Build Coastguard Worker     // t0 = (L) (P0 + P1) << 8
5518*3f1979aaSAndroid Build Coastguard Worker     // t1 = (M) (P2 + P3) << 16
5519*3f1979aaSAndroid Build Coastguard Worker     uint8x16_t t0t1_tmp = veorq_u8(lm_p0, lm_p1);
5520*3f1979aaSAndroid Build Coastguard Worker     uint8x16_t t0t1_h = vandq_u8(lm_p1, k48_32);
5521*3f1979aaSAndroid Build Coastguard Worker     uint8x16_t t0t1_l = veorq_u8(t0t1_tmp, t0t1_h);
5522*3f1979aaSAndroid Build Coastguard Worker 
5523*3f1979aaSAndroid Build Coastguard Worker     // t2 = (N) (P4 + P5) << 24
5524*3f1979aaSAndroid Build Coastguard Worker     // t3 = (K) (P6 + P7) << 32
5525*3f1979aaSAndroid Build Coastguard Worker     uint8x16_t t2t3_tmp = veorq_u8(nk_p0, nk_p1);
5526*3f1979aaSAndroid Build Coastguard Worker     uint8x16_t t2t3_h = vandq_u8(nk_p1, k16_00);
5527*3f1979aaSAndroid Build Coastguard Worker     uint8x16_t t2t3_l = veorq_u8(t2t3_tmp, t2t3_h);
5528*3f1979aaSAndroid Build Coastguard Worker 
5529*3f1979aaSAndroid Build Coastguard Worker     // De-interleave
5530*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
5531*3f1979aaSAndroid Build Coastguard Worker     uint8x16_t t0 = vreinterpretq_u8_u64(
5532*3f1979aaSAndroid Build Coastguard Worker         vuzp1q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
5533*3f1979aaSAndroid Build Coastguard Worker     uint8x16_t t1 = vreinterpretq_u8_u64(
5534*3f1979aaSAndroid Build Coastguard Worker         vuzp2q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
5535*3f1979aaSAndroid Build Coastguard Worker     uint8x16_t t2 = vreinterpretq_u8_u64(
5536*3f1979aaSAndroid Build Coastguard Worker         vuzp1q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
5537*3f1979aaSAndroid Build Coastguard Worker     uint8x16_t t3 = vreinterpretq_u8_u64(
5538*3f1979aaSAndroid Build Coastguard Worker         vuzp2q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
5539*3f1979aaSAndroid Build Coastguard Worker #else
5540*3f1979aaSAndroid Build Coastguard Worker     uint8x16_t t1 = vcombine_u8(vget_high_u8(t0t1_l), vget_high_u8(t0t1_h));
5541*3f1979aaSAndroid Build Coastguard Worker     uint8x16_t t0 = vcombine_u8(vget_low_u8(t0t1_l), vget_low_u8(t0t1_h));
5542*3f1979aaSAndroid Build Coastguard Worker     uint8x16_t t3 = vcombine_u8(vget_high_u8(t2t3_l), vget_high_u8(t2t3_h));
5543*3f1979aaSAndroid Build Coastguard Worker     uint8x16_t t2 = vcombine_u8(vget_low_u8(t2t3_l), vget_low_u8(t2t3_h));
5544*3f1979aaSAndroid Build Coastguard Worker #endif
5545*3f1979aaSAndroid Build Coastguard Worker     // Shift the cross products
5546*3f1979aaSAndroid Build Coastguard Worker     uint8x16_t t0_shift = vextq_u8(t0, t0, 15);  // t0 << 8
5547*3f1979aaSAndroid Build Coastguard Worker     uint8x16_t t1_shift = vextq_u8(t1, t1, 14);  // t1 << 16
5548*3f1979aaSAndroid Build Coastguard Worker     uint8x16_t t2_shift = vextq_u8(t2, t2, 13);  // t2 << 24
5549*3f1979aaSAndroid Build Coastguard Worker     uint8x16_t t3_shift = vextq_u8(t3, t3, 12);  // t3 << 32
5550*3f1979aaSAndroid Build Coastguard Worker 
5551*3f1979aaSAndroid Build Coastguard Worker     // Accumulate the products
5552*3f1979aaSAndroid Build Coastguard Worker     uint8x16_t cross1 = veorq_u8(t0_shift, t1_shift);
5553*3f1979aaSAndroid Build Coastguard Worker     uint8x16_t cross2 = veorq_u8(t2_shift, t3_shift);
5554*3f1979aaSAndroid Build Coastguard Worker     uint8x16_t mix = veorq_u8(d, cross1);
5555*3f1979aaSAndroid Build Coastguard Worker     uint8x16_t r = veorq_u8(mix, cross2);
5556*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_u64_u8(r);
5557*3f1979aaSAndroid Build Coastguard Worker }
5558*3f1979aaSAndroid Build Coastguard Worker #endif  // ARMv7 polyfill
5559*3f1979aaSAndroid Build Coastguard Worker 
_mm_clmulepi64_si128(__m128i _a,__m128i _b,const int imm)5560*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_clmulepi64_si128(__m128i _a, __m128i _b, const int imm)
5561*3f1979aaSAndroid Build Coastguard Worker {
5562*3f1979aaSAndroid Build Coastguard Worker     uint64x2_t a = vreinterpretq_u64_m128i(_a);
5563*3f1979aaSAndroid Build Coastguard Worker     uint64x2_t b = vreinterpretq_u64_m128i(_b);
5564*3f1979aaSAndroid Build Coastguard Worker     switch (imm & 0x11) {
5565*3f1979aaSAndroid Build Coastguard Worker     case 0x00:
5566*3f1979aaSAndroid Build Coastguard Worker         return vreinterpretq_m128i_u64(
5567*3f1979aaSAndroid Build Coastguard Worker             _sse2neon_vmull_p64(vget_low_u64(a), vget_low_u64(b)));
5568*3f1979aaSAndroid Build Coastguard Worker     case 0x01:
5569*3f1979aaSAndroid Build Coastguard Worker         return vreinterpretq_m128i_u64(
5570*3f1979aaSAndroid Build Coastguard Worker             _sse2neon_vmull_p64(vget_high_u64(a), vget_low_u64(b)));
5571*3f1979aaSAndroid Build Coastguard Worker     case 0x10:
5572*3f1979aaSAndroid Build Coastguard Worker         return vreinterpretq_m128i_u64(
5573*3f1979aaSAndroid Build Coastguard Worker             _sse2neon_vmull_p64(vget_low_u64(a), vget_high_u64(b)));
5574*3f1979aaSAndroid Build Coastguard Worker     case 0x11:
5575*3f1979aaSAndroid Build Coastguard Worker         return vreinterpretq_m128i_u64(
5576*3f1979aaSAndroid Build Coastguard Worker             _sse2neon_vmull_p64(vget_high_u64(a), vget_high_u64(b)));
5577*3f1979aaSAndroid Build Coastguard Worker     default:
5578*3f1979aaSAndroid Build Coastguard Worker         abort();
5579*3f1979aaSAndroid Build Coastguard Worker     }
5580*3f1979aaSAndroid Build Coastguard Worker }
5581*3f1979aaSAndroid Build Coastguard Worker 
5582*3f1979aaSAndroid Build Coastguard Worker #if !defined(__ARM_FEATURE_CRYPTO)
/* clang-format off */
/* The 256 AES S-box bytes in index order. Each entry is wrapped in the
 * caller-supplied macro `w`, so the same list can be expanded into several
 * differently shaped tables (X Macro technique, see
 * https://en.wikipedia.org/wiki/X_Macro). */
#define SSE2NEON_AES_DATA(w)                                                  \
    {                                                                         \
        w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), w(0xc5), \
        w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), w(0xab), w(0x76), \
        w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), w(0x59), w(0x47), w(0xf0), \
        w(0xad), w(0xd4), w(0xa2), w(0xaf), w(0x9c), w(0xa4), w(0x72), w(0xc0), \
        w(0xb7), w(0xfd), w(0x93), w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc), \
        w(0x34), w(0xa5), w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15), \
        w(0x04), w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a), \
        w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), w(0x75), \
        w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), w(0x5a), w(0xa0), \
        w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), w(0xe3), w(0x2f), w(0x84), \
        w(0x53), w(0xd1), w(0x00), w(0xed), w(0x20), w(0xfc), w(0xb1), w(0x5b), \
        w(0x6a), w(0xcb), w(0xbe), w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf), \
        w(0xd0), w(0xef), w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85), \
        w(0x45), w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8), \
        w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), w(0xf5), \
        w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), w(0xf3), w(0xd2), \
        w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), w(0x97), w(0x44), w(0x17), \
        w(0xc4), w(0xa7), w(0x7e), w(0x3d), w(0x64), w(0x5d), w(0x19), w(0x73), \
        w(0x60), w(0x81), w(0x4f), w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88), \
        w(0x46), w(0xee), w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb), \
        w(0xe0), w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c), \
        w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), w(0x79), \
        w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), w(0x4e), w(0xa9), \
        w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), w(0x7a), w(0xae), w(0x08), \
        w(0xba), w(0x78), w(0x25), w(0x2e), w(0x1c), w(0xa6), w(0xb4), w(0xc6), \
        w(0xe8), w(0xdd), w(0x74), w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a), \
        w(0x70), w(0x3e), w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e), \
        w(0x61), w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e), \
        w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), w(0x94), \
        w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), w(0x28), w(0xdf), \
        w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), w(0xe6), w(0x42), w(0x68), \
        w(0x41), w(0x99), w(0x2d), w(0x0f), w(0xb0), w(0x54), w(0xbb), w(0x16)  \
    }
/* clang-format on */

/* Identity expansion of the list above: the plain byte-indexed S-box. */
#define SSE2NEON_AES_H0(x) (x)
static const uint8_t SSE2NEON_sbox[256] = SSE2NEON_AES_DATA(SSE2NEON_AES_H0);
#undef SSE2NEON_AES_H0
5630*3f1979aaSAndroid Build Coastguard Worker 
5631*3f1979aaSAndroid Build Coastguard Worker // In the absence of crypto extensions, implement aesenc using regular neon
5632*3f1979aaSAndroid Build Coastguard Worker // intrinsics instead. See:
5633*3f1979aaSAndroid Build Coastguard Worker // https://www.workofard.com/2017/01/accelerated-aes-for-the-arm64-linux-kernel/
5634*3f1979aaSAndroid Build Coastguard Worker // https://www.workofard.com/2017/07/ghash-for-low-end-cores/ and
5635*3f1979aaSAndroid Build Coastguard Worker // https://github.com/ColinIanKing/linux-next-mirror/blob/b5f466091e130caaf0735976648f72bd5e09aa84/crypto/aegis128-neon-inner.c#L52
5636*3f1979aaSAndroid Build Coastguard Worker // for more information Reproduced with permission of the author.
_mm_aesenc_si128(__m128i EncBlock,__m128i RoundKey)5637*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_aesenc_si128(__m128i EncBlock, __m128i RoundKey)
5638*3f1979aaSAndroid Build Coastguard Worker {
5639*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
5640*3f1979aaSAndroid Build Coastguard Worker     static const uint8_t shift_rows[] = {0x0, 0x5, 0xa, 0xf, 0x4, 0x9,
5641*3f1979aaSAndroid Build Coastguard Worker                                          0xe, 0x3, 0x8, 0xd, 0x2, 0x7,
5642*3f1979aaSAndroid Build Coastguard Worker                                          0xc, 0x1, 0x6, 0xb};
5643*3f1979aaSAndroid Build Coastguard Worker     static const uint8_t ror32by8[] = {0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
5644*3f1979aaSAndroid Build Coastguard Worker                                        0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc};
5645*3f1979aaSAndroid Build Coastguard Worker 
5646*3f1979aaSAndroid Build Coastguard Worker     uint8x16_t v;
5647*3f1979aaSAndroid Build Coastguard Worker     uint8x16_t w = vreinterpretq_u8_m128i(EncBlock);
5648*3f1979aaSAndroid Build Coastguard Worker 
5649*3f1979aaSAndroid Build Coastguard Worker     // shift rows
5650*3f1979aaSAndroid Build Coastguard Worker     w = vqtbl1q_u8(w, vld1q_u8(shift_rows));
5651*3f1979aaSAndroid Build Coastguard Worker 
5652*3f1979aaSAndroid Build Coastguard Worker     // sub bytes
5653*3f1979aaSAndroid Build Coastguard Worker     v = vqtbl4q_u8(vld1q_u8_x4(SSE2NEON_sbox), w);
5654*3f1979aaSAndroid Build Coastguard Worker     v = vqtbx4q_u8(v, vld1q_u8_x4(SSE2NEON_sbox + 0x40), w - 0x40);
5655*3f1979aaSAndroid Build Coastguard Worker     v = vqtbx4q_u8(v, vld1q_u8_x4(SSE2NEON_sbox + 0x80), w - 0x80);
5656*3f1979aaSAndroid Build Coastguard Worker     v = vqtbx4q_u8(v, vld1q_u8_x4(SSE2NEON_sbox + 0xc0), w - 0xc0);
5657*3f1979aaSAndroid Build Coastguard Worker 
5658*3f1979aaSAndroid Build Coastguard Worker     // mix columns
5659*3f1979aaSAndroid Build Coastguard Worker     w = (v << 1) ^ (uint8x16_t)(((int8x16_t) v >> 7) & 0x1b);
5660*3f1979aaSAndroid Build Coastguard Worker     w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
5661*3f1979aaSAndroid Build Coastguard Worker     w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
5662*3f1979aaSAndroid Build Coastguard Worker 
5663*3f1979aaSAndroid Build Coastguard Worker     //  add round key
5664*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_u8(w) ^ RoundKey;
5665*3f1979aaSAndroid Build Coastguard Worker 
5666*3f1979aaSAndroid Build Coastguard Worker #else /* ARMv7-A NEON implementation */
5667*3f1979aaSAndroid Build Coastguard Worker #define SSE2NEON_AES_B2W(b0, b1, b2, b3)                                       \
5668*3f1979aaSAndroid Build Coastguard Worker     (((uint32_t)(b3) << 24) | ((uint32_t)(b2) << 16) | ((uint32_t)(b1) << 8) | \
5669*3f1979aaSAndroid Build Coastguard Worker      (b0))
5670*3f1979aaSAndroid Build Coastguard Worker #define SSE2NEON_AES_F2(x) ((x << 1) ^ (((x >> 7) & 1) * 0x011b /* WPOLY */))
5671*3f1979aaSAndroid Build Coastguard Worker #define SSE2NEON_AES_F3(x) (SSE2NEON_AES_F2(x) ^ x)
5672*3f1979aaSAndroid Build Coastguard Worker #define SSE2NEON_AES_U0(p) \
5673*3f1979aaSAndroid Build Coastguard Worker     SSE2NEON_AES_B2W(SSE2NEON_AES_F2(p), p, p, SSE2NEON_AES_F3(p))
5674*3f1979aaSAndroid Build Coastguard Worker #define SSE2NEON_AES_U1(p) \
5675*3f1979aaSAndroid Build Coastguard Worker     SSE2NEON_AES_B2W(SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p, p)
5676*3f1979aaSAndroid Build Coastguard Worker #define SSE2NEON_AES_U2(p) \
5677*3f1979aaSAndroid Build Coastguard Worker     SSE2NEON_AES_B2W(p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p)
5678*3f1979aaSAndroid Build Coastguard Worker #define SSE2NEON_AES_U3(p) \
5679*3f1979aaSAndroid Build Coastguard Worker     SSE2NEON_AES_B2W(p, p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p))
5680*3f1979aaSAndroid Build Coastguard Worker     static const uint32_t ALIGN_STRUCT(16) aes_table[4][256] = {
5681*3f1979aaSAndroid Build Coastguard Worker         SSE2NEON_AES_DATA(SSE2NEON_AES_U0),
5682*3f1979aaSAndroid Build Coastguard Worker         SSE2NEON_AES_DATA(SSE2NEON_AES_U1),
5683*3f1979aaSAndroid Build Coastguard Worker         SSE2NEON_AES_DATA(SSE2NEON_AES_U2),
5684*3f1979aaSAndroid Build Coastguard Worker         SSE2NEON_AES_DATA(SSE2NEON_AES_U3),
5685*3f1979aaSAndroid Build Coastguard Worker     };
5686*3f1979aaSAndroid Build Coastguard Worker #undef SSE2NEON_AES_B2W
5687*3f1979aaSAndroid Build Coastguard Worker #undef SSE2NEON_AES_F2
5688*3f1979aaSAndroid Build Coastguard Worker #undef SSE2NEON_AES_F3
5689*3f1979aaSAndroid Build Coastguard Worker #undef SSE2NEON_AES_U0
5690*3f1979aaSAndroid Build Coastguard Worker #undef SSE2NEON_AES_U1
5691*3f1979aaSAndroid Build Coastguard Worker #undef SSE2NEON_AES_U2
5692*3f1979aaSAndroid Build Coastguard Worker #undef SSE2NEON_AES_U3
5693*3f1979aaSAndroid Build Coastguard Worker 
5694*3f1979aaSAndroid Build Coastguard Worker     uint32_t x0 = _mm_cvtsi128_si32(EncBlock);
5695*3f1979aaSAndroid Build Coastguard Worker     uint32_t x1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0x55));
5696*3f1979aaSAndroid Build Coastguard Worker     uint32_t x2 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0xAA));
5697*3f1979aaSAndroid Build Coastguard Worker     uint32_t x3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0xFF));
5698*3f1979aaSAndroid Build Coastguard Worker 
5699*3f1979aaSAndroid Build Coastguard Worker     __m128i out = _mm_set_epi32(
5700*3f1979aaSAndroid Build Coastguard Worker         (aes_table[0][x3 & 0xff] ^ aes_table[1][(x0 >> 8) & 0xff] ^
5701*3f1979aaSAndroid Build Coastguard Worker          aes_table[2][(x1 >> 16) & 0xff] ^ aes_table[3][x2 >> 24]),
5702*3f1979aaSAndroid Build Coastguard Worker         (aes_table[0][x2 & 0xff] ^ aes_table[1][(x3 >> 8) & 0xff] ^
5703*3f1979aaSAndroid Build Coastguard Worker          aes_table[2][(x0 >> 16) & 0xff] ^ aes_table[3][x1 >> 24]),
5704*3f1979aaSAndroid Build Coastguard Worker         (aes_table[0][x1 & 0xff] ^ aes_table[1][(x2 >> 8) & 0xff] ^
5705*3f1979aaSAndroid Build Coastguard Worker          aes_table[2][(x3 >> 16) & 0xff] ^ aes_table[3][x0 >> 24]),
5706*3f1979aaSAndroid Build Coastguard Worker         (aes_table[0][x0 & 0xff] ^ aes_table[1][(x1 >> 8) & 0xff] ^
5707*3f1979aaSAndroid Build Coastguard Worker          aes_table[2][(x2 >> 16) & 0xff] ^ aes_table[3][x3 >> 24]));
5708*3f1979aaSAndroid Build Coastguard Worker 
5709*3f1979aaSAndroid Build Coastguard Worker     return _mm_xor_si128(out, RoundKey);
5710*3f1979aaSAndroid Build Coastguard Worker #endif
5711*3f1979aaSAndroid Build Coastguard Worker }
5712*3f1979aaSAndroid Build Coastguard Worker 
_mm_aesenclast_si128(__m128i a,__m128i RoundKey)5713*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
5714*3f1979aaSAndroid Build Coastguard Worker {
5715*3f1979aaSAndroid Build Coastguard Worker     /* FIXME: optimized for NEON */
5716*3f1979aaSAndroid Build Coastguard Worker     uint8_t v[4][4] = {
5717*3f1979aaSAndroid Build Coastguard Worker         [0] = {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 0)],
5718*3f1979aaSAndroid Build Coastguard Worker                SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 5)],
5719*3f1979aaSAndroid Build Coastguard Worker                SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 10)],
5720*3f1979aaSAndroid Build Coastguard Worker                SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 15)]},
5721*3f1979aaSAndroid Build Coastguard Worker         [1] = {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 4)],
5722*3f1979aaSAndroid Build Coastguard Worker                SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 9)],
5723*3f1979aaSAndroid Build Coastguard Worker                SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 14)],
5724*3f1979aaSAndroid Build Coastguard Worker                SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 3)]},
5725*3f1979aaSAndroid Build Coastguard Worker         [2] = {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 8)],
5726*3f1979aaSAndroid Build Coastguard Worker                SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 13)],
5727*3f1979aaSAndroid Build Coastguard Worker                SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 2)],
5728*3f1979aaSAndroid Build Coastguard Worker                SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 7)]},
5729*3f1979aaSAndroid Build Coastguard Worker         [3] = {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 12)],
5730*3f1979aaSAndroid Build Coastguard Worker                SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 1)],
5731*3f1979aaSAndroid Build Coastguard Worker                SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 6)],
5732*3f1979aaSAndroid Build Coastguard Worker                SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 11)]},
5733*3f1979aaSAndroid Build Coastguard Worker     };
5734*3f1979aaSAndroid Build Coastguard Worker     for (int i = 0; i < 16; i++)
5735*3f1979aaSAndroid Build Coastguard Worker         vreinterpretq_nth_u8_m128i(a, i) =
5736*3f1979aaSAndroid Build Coastguard Worker             v[i / 4][i % 4] ^ vreinterpretq_nth_u8_m128i(RoundKey, i);
5737*3f1979aaSAndroid Build Coastguard Worker     return a;
5738*3f1979aaSAndroid Build Coastguard Worker }
5739*3f1979aaSAndroid Build Coastguard Worker 
5740*3f1979aaSAndroid Build Coastguard Worker // Emits the Advanced Encryption Standard (AES) instruction aeskeygenassist.
5741*3f1979aaSAndroid Build Coastguard Worker // This instruction generates a round key for AES encryption. See
5742*3f1979aaSAndroid Build Coastguard Worker // https://kazakov.life/2017/11/01/cryptocurrency-mining-on-ios-devices/
5743*3f1979aaSAndroid Build Coastguard Worker // for details.
5744*3f1979aaSAndroid Build Coastguard Worker //
5745*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/cc714138(v=vs.120).aspx
_mm_aeskeygenassist_si128(__m128i key,const int rcon)5746*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i key, const int rcon)
5747*3f1979aaSAndroid Build Coastguard Worker {
5748*3f1979aaSAndroid Build Coastguard Worker     uint32_t X1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0x55));
5749*3f1979aaSAndroid Build Coastguard Worker     uint32_t X3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0xFF));
5750*3f1979aaSAndroid Build Coastguard Worker     for (int i = 0; i < 4; ++i) {
5751*3f1979aaSAndroid Build Coastguard Worker         ((uint8_t *) &X1)[i] = SSE2NEON_sbox[((uint8_t *) &X1)[i]];
5752*3f1979aaSAndroid Build Coastguard Worker         ((uint8_t *) &X3)[i] = SSE2NEON_sbox[((uint8_t *) &X3)[i]];
5753*3f1979aaSAndroid Build Coastguard Worker     }
5754*3f1979aaSAndroid Build Coastguard Worker     return _mm_set_epi32(((X3 >> 8) | (X3 << 24)) ^ rcon, X3,
5755*3f1979aaSAndroid Build Coastguard Worker                          ((X1 >> 8) | (X1 << 24)) ^ rcon, X1);
5756*3f1979aaSAndroid Build Coastguard Worker }
5757*3f1979aaSAndroid Build Coastguard Worker #undef SSE2NEON_AES_DATA
5758*3f1979aaSAndroid Build Coastguard Worker 
5759*3f1979aaSAndroid Build Coastguard Worker #else /* __ARM_FEATURE_CRYPTO */
5760*3f1979aaSAndroid Build Coastguard Worker // Implements equivalent of 'aesenc' by combining AESE (with an empty key) and
5761*3f1979aaSAndroid Build Coastguard Worker // AESMC and then manually applying the real key as an xor operation. This
5762*3f1979aaSAndroid Build Coastguard Worker // unfortunately means an additional xor op; the compiler should be able to
5763*3f1979aaSAndroid Build Coastguard Worker // optimize this away for repeated calls however. See
5764*3f1979aaSAndroid Build Coastguard Worker // https://blog.michaelbrase.com/2018/05/08/emulating-x86-aes-intrinsics-on-armv8-a
5765*3f1979aaSAndroid Build Coastguard Worker // for more details.
_mm_aesenc_si128(__m128i a,__m128i b)5766*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i b)
5767*3f1979aaSAndroid Build Coastguard Worker {
5768*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_u8(
5769*3f1979aaSAndroid Build Coastguard Worker         vaesmcq_u8(vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))) ^
5770*3f1979aaSAndroid Build Coastguard Worker         vreinterpretq_u8_m128i(b));
5771*3f1979aaSAndroid Build Coastguard Worker }
5772*3f1979aaSAndroid Build Coastguard Worker 
5773*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_aesenclast_si128
_mm_aesenclast_si128(__m128i a,__m128i RoundKey)5774*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
5775*3f1979aaSAndroid Build Coastguard Worker {
5776*3f1979aaSAndroid Build Coastguard Worker     return _mm_xor_si128(vreinterpretq_m128i_u8(vaeseq_u8(
5777*3f1979aaSAndroid Build Coastguard Worker                              vreinterpretq_u8_m128i(a), vdupq_n_u8(0))),
5778*3f1979aaSAndroid Build Coastguard Worker                          RoundKey);
5779*3f1979aaSAndroid Build Coastguard Worker }
5780*3f1979aaSAndroid Build Coastguard Worker 
_mm_aeskeygenassist_si128(__m128i a,const int rcon)5781*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
5782*3f1979aaSAndroid Build Coastguard Worker {
5783*3f1979aaSAndroid Build Coastguard Worker     // AESE does ShiftRows and SubBytes on A
5784*3f1979aaSAndroid Build Coastguard Worker     uint8x16_t u8 = vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0));
5785*3f1979aaSAndroid Build Coastguard Worker 
5786*3f1979aaSAndroid Build Coastguard Worker     uint8x16_t dest = {
5787*3f1979aaSAndroid Build Coastguard Worker         // Undo ShiftRows step from AESE and extract X1 and X3
5788*3f1979aaSAndroid Build Coastguard Worker         u8[0x4], u8[0x1], u8[0xE], u8[0xB],  // SubBytes(X1)
5789*3f1979aaSAndroid Build Coastguard Worker         u8[0x1], u8[0xE], u8[0xB], u8[0x4],  // ROT(SubBytes(X1))
5790*3f1979aaSAndroid Build Coastguard Worker         u8[0xC], u8[0x9], u8[0x6], u8[0x3],  // SubBytes(X3)
5791*3f1979aaSAndroid Build Coastguard Worker         u8[0x9], u8[0x6], u8[0x3], u8[0xC],  // ROT(SubBytes(X3))
5792*3f1979aaSAndroid Build Coastguard Worker     };
5793*3f1979aaSAndroid Build Coastguard Worker     uint32x4_t r = {0, (unsigned) rcon, 0, (unsigned) rcon};
5794*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_u8(dest) ^ vreinterpretq_m128i_u32(r);
5795*3f1979aaSAndroid Build Coastguard Worker }
5796*3f1979aaSAndroid Build Coastguard Worker #endif
5797*3f1979aaSAndroid Build Coastguard Worker 
5798*3f1979aaSAndroid Build Coastguard Worker /* Streaming Extensions */
5799*3f1979aaSAndroid Build Coastguard Worker 
5800*3f1979aaSAndroid Build Coastguard Worker // Guarantees that every preceding store is globally visible before any
5801*3f1979aaSAndroid Build Coastguard Worker // subsequent store.
5802*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/5h2w73d1%28v=vs.90%29.aspx
_mm_sfence(void)5803*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE void _mm_sfence(void)
5804*3f1979aaSAndroid Build Coastguard Worker {
5805*3f1979aaSAndroid Build Coastguard Worker     __sync_synchronize();
5806*3f1979aaSAndroid Build Coastguard Worker }
5807*3f1979aaSAndroid Build Coastguard Worker 
5808*3f1979aaSAndroid Build Coastguard Worker // Store 128-bits (composed of 4 packed single-precision (32-bit) floating-
5809*3f1979aaSAndroid Build Coastguard Worker // point elements) from a into memory using a non-temporal memory hint.
5810*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_ps
_mm_stream_ps(float * p,__m128 a)5811*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE void _mm_stream_ps(float *p, __m128 a)
5812*3f1979aaSAndroid Build Coastguard Worker {
5813*3f1979aaSAndroid Build Coastguard Worker #if __has_builtin(__builtin_nontemporal_store)
5814*3f1979aaSAndroid Build Coastguard Worker     __builtin_nontemporal_store(a, (float32x4_t *) p);
5815*3f1979aaSAndroid Build Coastguard Worker #else
5816*3f1979aaSAndroid Build Coastguard Worker     vst1q_f32(p, vreinterpretq_f32_m128(a));
5817*3f1979aaSAndroid Build Coastguard Worker #endif
5818*3f1979aaSAndroid Build Coastguard Worker }
5819*3f1979aaSAndroid Build Coastguard Worker 
5820*3f1979aaSAndroid Build Coastguard Worker // Stores the data in a to the address p without polluting the caches.  If the
5821*3f1979aaSAndroid Build Coastguard Worker // cache line containing address p is already in the cache, the cache will be
5822*3f1979aaSAndroid Build Coastguard Worker // updated.
5823*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/ba08y07y%28v=vs.90%29.aspx
_mm_stream_si128(__m128i * p,__m128i a)5824*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE void _mm_stream_si128(__m128i *p, __m128i a)
5825*3f1979aaSAndroid Build Coastguard Worker {
5826*3f1979aaSAndroid Build Coastguard Worker #if __has_builtin(__builtin_nontemporal_store)
5827*3f1979aaSAndroid Build Coastguard Worker     __builtin_nontemporal_store(a, p);
5828*3f1979aaSAndroid Build Coastguard Worker #else
5829*3f1979aaSAndroid Build Coastguard Worker     vst1q_s64((int64_t *) p, vreinterpretq_s64_m128i(a));
5830*3f1979aaSAndroid Build Coastguard Worker #endif
5831*3f1979aaSAndroid Build Coastguard Worker }
5832*3f1979aaSAndroid Build Coastguard Worker 
5833*3f1979aaSAndroid Build Coastguard Worker // Load 128-bits of integer data from memory into dst using a non-temporal
5834*3f1979aaSAndroid Build Coastguard Worker // memory hint. mem_addr must be aligned on a 16-byte boundary or a
5835*3f1979aaSAndroid Build Coastguard Worker // general-protection exception may be generated.
5836*3f1979aaSAndroid Build Coastguard Worker //
5837*3f1979aaSAndroid Build Coastguard Worker //   dst[127:0] := MEM[mem_addr+127:mem_addr]
5838*3f1979aaSAndroid Build Coastguard Worker //
5839*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_load_si128
_mm_stream_load_si128(__m128i * p)5840*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_stream_load_si128(__m128i *p)
5841*3f1979aaSAndroid Build Coastguard Worker {
5842*3f1979aaSAndroid Build Coastguard Worker #if __has_builtin(__builtin_nontemporal_store)
5843*3f1979aaSAndroid Build Coastguard Worker     return __builtin_nontemporal_load(p);
5844*3f1979aaSAndroid Build Coastguard Worker #else
5845*3f1979aaSAndroid Build Coastguard Worker     return vreinterpretq_m128i_s64(vld1q_s64((int64_t *) p));
5846*3f1979aaSAndroid Build Coastguard Worker #endif
5847*3f1979aaSAndroid Build Coastguard Worker }
5848*3f1979aaSAndroid Build Coastguard Worker 
5849*3f1979aaSAndroid Build Coastguard Worker // Cache line containing p is flushed and invalidated from all caches in the
5850*3f1979aaSAndroid Build Coastguard Worker // coherency domain. :
5851*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/ba08y07y(v=vs.100).aspx
_mm_clflush(void const * p)5852*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE void _mm_clflush(void const *p)
5853*3f1979aaSAndroid Build Coastguard Worker {
5854*3f1979aaSAndroid Build Coastguard Worker     (void) p;
5855*3f1979aaSAndroid Build Coastguard Worker     // no corollary for Neon?
5856*3f1979aaSAndroid Build Coastguard Worker }
5857*3f1979aaSAndroid Build Coastguard Worker 
5858*3f1979aaSAndroid Build Coastguard Worker // Allocate aligned blocks of memory.
5859*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/en-us/
5860*3f1979aaSAndroid Build Coastguard Worker //         cpp-compiler-developer-guide-and-reference-allocating-and-freeing-aligned-memory-blocks
_mm_malloc(size_t size,size_t align)5861*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE void *_mm_malloc(size_t size, size_t align)
5862*3f1979aaSAndroid Build Coastguard Worker {
5863*3f1979aaSAndroid Build Coastguard Worker     void *ptr;
5864*3f1979aaSAndroid Build Coastguard Worker     if (align == 1)
5865*3f1979aaSAndroid Build Coastguard Worker         return malloc(size);
5866*3f1979aaSAndroid Build Coastguard Worker     if (align == 2 || (sizeof(void *) == 8 && align == 4))
5867*3f1979aaSAndroid Build Coastguard Worker         align = sizeof(void *);
5868*3f1979aaSAndroid Build Coastguard Worker     if (!posix_memalign(&ptr, align, size))
5869*3f1979aaSAndroid Build Coastguard Worker         return ptr;
5870*3f1979aaSAndroid Build Coastguard Worker     return NULL;
5871*3f1979aaSAndroid Build Coastguard Worker }
5872*3f1979aaSAndroid Build Coastguard Worker 
_mm_free(void * addr)5873*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE void _mm_free(void *addr)
5874*3f1979aaSAndroid Build Coastguard Worker {
5875*3f1979aaSAndroid Build Coastguard Worker     free(addr);
5876*3f1979aaSAndroid Build Coastguard Worker }
5877*3f1979aaSAndroid Build Coastguard Worker 
5878*3f1979aaSAndroid Build Coastguard Worker // Starting with the initial value in crc, accumulates a CRC32 value for
5879*3f1979aaSAndroid Build Coastguard Worker // unsigned 8-bit integer v.
5880*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/bb514036(v=vs.100)
_mm_crc32_u8(uint32_t crc,uint8_t v)5881*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v)
5882*3f1979aaSAndroid Build Coastguard Worker {
5883*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
5884*3f1979aaSAndroid Build Coastguard Worker     __asm__ __volatile__("crc32cb %w[c], %w[c], %w[v]\n\t"
5885*3f1979aaSAndroid Build Coastguard Worker                          : [c] "+r"(crc)
5886*3f1979aaSAndroid Build Coastguard Worker                          : [v] "r"(v));
5887*3f1979aaSAndroid Build Coastguard Worker #else
5888*3f1979aaSAndroid Build Coastguard Worker     crc ^= v;
5889*3f1979aaSAndroid Build Coastguard Worker     for (int bit = 0; bit < 8; bit++) {
5890*3f1979aaSAndroid Build Coastguard Worker         if (crc & 1)
5891*3f1979aaSAndroid Build Coastguard Worker             crc = (crc >> 1) ^ UINT32_C(0x82f63b78);
5892*3f1979aaSAndroid Build Coastguard Worker         else
5893*3f1979aaSAndroid Build Coastguard Worker             crc = (crc >> 1);
5894*3f1979aaSAndroid Build Coastguard Worker     }
5895*3f1979aaSAndroid Build Coastguard Worker #endif
5896*3f1979aaSAndroid Build Coastguard Worker     return crc;
5897*3f1979aaSAndroid Build Coastguard Worker }
5898*3f1979aaSAndroid Build Coastguard Worker 
5899*3f1979aaSAndroid Build Coastguard Worker // Starting with the initial value in crc, accumulates a CRC32 value for
5900*3f1979aaSAndroid Build Coastguard Worker // unsigned 16-bit integer v.
5901*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/bb531411(v=vs.100)
_mm_crc32_u16(uint32_t crc,uint16_t v)5902*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE uint32_t _mm_crc32_u16(uint32_t crc, uint16_t v)
5903*3f1979aaSAndroid Build Coastguard Worker {
5904*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
5905*3f1979aaSAndroid Build Coastguard Worker     __asm__ __volatile__("crc32ch %w[c], %w[c], %w[v]\n\t"
5906*3f1979aaSAndroid Build Coastguard Worker                          : [c] "+r"(crc)
5907*3f1979aaSAndroid Build Coastguard Worker                          : [v] "r"(v));
5908*3f1979aaSAndroid Build Coastguard Worker #else
5909*3f1979aaSAndroid Build Coastguard Worker     crc = _mm_crc32_u8(crc, v & 0xff);
5910*3f1979aaSAndroid Build Coastguard Worker     crc = _mm_crc32_u8(crc, (v >> 8) & 0xff);
5911*3f1979aaSAndroid Build Coastguard Worker #endif
5912*3f1979aaSAndroid Build Coastguard Worker     return crc;
5913*3f1979aaSAndroid Build Coastguard Worker }
5914*3f1979aaSAndroid Build Coastguard Worker 
5915*3f1979aaSAndroid Build Coastguard Worker // Starting with the initial value in crc, accumulates a CRC32 value for
5916*3f1979aaSAndroid Build Coastguard Worker // unsigned 32-bit integer v.
5917*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/bb531394(v=vs.100)
_mm_crc32_u32(uint32_t crc,uint32_t v)5918*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE uint32_t _mm_crc32_u32(uint32_t crc, uint32_t v)
5919*3f1979aaSAndroid Build Coastguard Worker {
5920*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
5921*3f1979aaSAndroid Build Coastguard Worker     __asm__ __volatile__("crc32cw %w[c], %w[c], %w[v]\n\t"
5922*3f1979aaSAndroid Build Coastguard Worker                          : [c] "+r"(crc)
5923*3f1979aaSAndroid Build Coastguard Worker                          : [v] "r"(v));
5924*3f1979aaSAndroid Build Coastguard Worker #else
5925*3f1979aaSAndroid Build Coastguard Worker     crc = _mm_crc32_u16(crc, v & 0xffff);
5926*3f1979aaSAndroid Build Coastguard Worker     crc = _mm_crc32_u16(crc, (v >> 16) & 0xffff);
5927*3f1979aaSAndroid Build Coastguard Worker #endif
5928*3f1979aaSAndroid Build Coastguard Worker     return crc;
5929*3f1979aaSAndroid Build Coastguard Worker }
5930*3f1979aaSAndroid Build Coastguard Worker 
5931*3f1979aaSAndroid Build Coastguard Worker // Starting with the initial value in crc, accumulates a CRC32 value for
5932*3f1979aaSAndroid Build Coastguard Worker // unsigned 64-bit integer v.
5933*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/bb514033(v=vs.100)
_mm_crc32_u64(uint64_t crc,uint64_t v)5934*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE uint64_t _mm_crc32_u64(uint64_t crc, uint64_t v)
5935*3f1979aaSAndroid Build Coastguard Worker {
5936*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
5937*3f1979aaSAndroid Build Coastguard Worker     __asm__ __volatile__("crc32cx %w[c], %w[c], %x[v]\n\t"
5938*3f1979aaSAndroid Build Coastguard Worker                          : [c] "+r"(crc)
5939*3f1979aaSAndroid Build Coastguard Worker                          : [v] "r"(v));
5940*3f1979aaSAndroid Build Coastguard Worker #else
5941*3f1979aaSAndroid Build Coastguard Worker     crc = _mm_crc32_u32((uint32_t)(crc), v & 0xffffffff);
5942*3f1979aaSAndroid Build Coastguard Worker     crc = _mm_crc32_u32((uint32_t)(crc), (v >> 32) & 0xffffffff);
5943*3f1979aaSAndroid Build Coastguard Worker #endif
5944*3f1979aaSAndroid Build Coastguard Worker     return crc;
5945*3f1979aaSAndroid Build Coastguard Worker }
5946*3f1979aaSAndroid Build Coastguard Worker 
5947*3f1979aaSAndroid Build Coastguard Worker #if defined(__GNUC__) || defined(__clang__)
5948*3f1979aaSAndroid Build Coastguard Worker #pragma pop_macro("ALIGN_STRUCT")
5949*3f1979aaSAndroid Build Coastguard Worker #pragma pop_macro("FORCE_INLINE")
5950*3f1979aaSAndroid Build Coastguard Worker #endif
5951*3f1979aaSAndroid Build Coastguard Worker 
5952*3f1979aaSAndroid Build Coastguard Worker #if defined(__GNUC__)
5953*3f1979aaSAndroid Build Coastguard Worker #pragma GCC pop_options
5954*3f1979aaSAndroid Build Coastguard Worker #endif
5955*3f1979aaSAndroid Build Coastguard Worker 
5956*3f1979aaSAndroid Build Coastguard Worker #endif
5957