1*3f1979aaSAndroid Build Coastguard Worker #ifndef SSE2NEON_H
2*3f1979aaSAndroid Build Coastguard Worker #define SSE2NEON_H
3*3f1979aaSAndroid Build Coastguard Worker
// This header file provides a simple API translation layer
// from SSE intrinsics to their corresponding Arm/Aarch64 NEON versions
6*3f1979aaSAndroid Build Coastguard Worker //
7*3f1979aaSAndroid Build Coastguard Worker // This header file does not yet translate all of the SSE intrinsics.
8*3f1979aaSAndroid Build Coastguard Worker //
9*3f1979aaSAndroid Build Coastguard Worker // Contributors to this work are:
10*3f1979aaSAndroid Build Coastguard Worker // John W. Ratcliff <[email protected]>
11*3f1979aaSAndroid Build Coastguard Worker // Brandon Rowlett <[email protected]>
12*3f1979aaSAndroid Build Coastguard Worker // Ken Fast <[email protected]>
13*3f1979aaSAndroid Build Coastguard Worker // Eric van Beurden <[email protected]>
14*3f1979aaSAndroid Build Coastguard Worker // Alexander Potylitsin <[email protected]>
15*3f1979aaSAndroid Build Coastguard Worker // Hasindu Gamaarachchi <[email protected]>
16*3f1979aaSAndroid Build Coastguard Worker // Jim Huang <[email protected]>
17*3f1979aaSAndroid Build Coastguard Worker // Mark Cheng <[email protected]>
18*3f1979aaSAndroid Build Coastguard Worker // Malcolm James MacLeod <[email protected]>
19*3f1979aaSAndroid Build Coastguard Worker // Devin Hussey (easyaspi314) <[email protected]>
20*3f1979aaSAndroid Build Coastguard Worker // Sebastian Pop <[email protected]>
21*3f1979aaSAndroid Build Coastguard Worker // Developer Ecosystem Engineering <[email protected]>
22*3f1979aaSAndroid Build Coastguard Worker // Danila Kutenin <[email protected]>
23*3f1979aaSAndroid Build Coastguard Worker // François Turban (JishinMaster) <[email protected]>
24*3f1979aaSAndroid Build Coastguard Worker // Pei-Hsuan Hung <[email protected]>
25*3f1979aaSAndroid Build Coastguard Worker // Yang-Hao Yuan <[email protected]>
26*3f1979aaSAndroid Build Coastguard Worker
27*3f1979aaSAndroid Build Coastguard Worker /*
28*3f1979aaSAndroid Build Coastguard Worker * sse2neon is freely redistributable under the MIT License.
29*3f1979aaSAndroid Build Coastguard Worker *
30*3f1979aaSAndroid Build Coastguard Worker * Permission is hereby granted, free of charge, to any person obtaining a copy
31*3f1979aaSAndroid Build Coastguard Worker * of this software and associated documentation files (the "Software"), to deal
32*3f1979aaSAndroid Build Coastguard Worker * in the Software without restriction, including without limitation the rights
33*3f1979aaSAndroid Build Coastguard Worker * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
34*3f1979aaSAndroid Build Coastguard Worker * copies of the Software, and to permit persons to whom the Software is
35*3f1979aaSAndroid Build Coastguard Worker * furnished to do so, subject to the following conditions:
36*3f1979aaSAndroid Build Coastguard Worker *
37*3f1979aaSAndroid Build Coastguard Worker * The above copyright notice and this permission notice shall be included in
38*3f1979aaSAndroid Build Coastguard Worker * all copies or substantial portions of the Software.
39*3f1979aaSAndroid Build Coastguard Worker *
40*3f1979aaSAndroid Build Coastguard Worker * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
41*3f1979aaSAndroid Build Coastguard Worker * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
42*3f1979aaSAndroid Build Coastguard Worker * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
43*3f1979aaSAndroid Build Coastguard Worker * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
44*3f1979aaSAndroid Build Coastguard Worker * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
45*3f1979aaSAndroid Build Coastguard Worker * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
46*3f1979aaSAndroid Build Coastguard Worker * SOFTWARE.
47*3f1979aaSAndroid Build Coastguard Worker */
48*3f1979aaSAndroid Build Coastguard Worker
49*3f1979aaSAndroid Build Coastguard Worker /* Tunable configurations */
50*3f1979aaSAndroid Build Coastguard Worker
/* Enable precise implementation of _mm_min_ps and _mm_max_ps.
 * This would slow down the computation a bit, but gives results consistent
 * with x86 SSE2 (e.g. would solve a hole or NaN pixel in the rendering
 * result).
 * A definition made before this header is included takes precedence;
 * otherwise the option defaults to 0.
 */
#ifndef SSE2NEON_PRECISE_MINMAX
#define SSE2NEON_PRECISE_MINMAX (0)
#endif
58*3f1979aaSAndroid Build Coastguard Worker
/* Compiler-specific helper macros.
 *   FORCE_INLINE    - specifier used on the inline functions of this header.
 *   ALIGN_STRUCT(x) - alignment attribute requesting x-byte alignment.
 * With GCC/clang, any definitions made by the includer are saved with
 * push_macro (presumably restored by a matching pop_macro later in the file;
 * not visible in this section) and then removed, so that redefining them
 * here does not raise a macro-redefinition diagnostic.
 */
#if defined(__GNUC__) || defined(__clang__)
#pragma push_macro("FORCE_INLINE")
#pragma push_macro("ALIGN_STRUCT")
#undef FORCE_INLINE
#undef ALIGN_STRUCT
#define FORCE_INLINE static inline __attribute__((always_inline))
#define ALIGN_STRUCT(x) __attribute__((aligned(x)))
#else
#error "Macro name collisions may happen with unsupported compiler."
#ifdef FORCE_INLINE
#undef FORCE_INLINE
#endif
#define FORCE_INLINE static inline
#ifndef ALIGN_STRUCT
#define ALIGN_STRUCT(x) __declspec(align(x))
#endif
#endif
74*3f1979aaSAndroid Build Coastguard Worker
75*3f1979aaSAndroid Build Coastguard Worker #include <stdint.h>
76*3f1979aaSAndroid Build Coastguard Worker #include <stdlib.h>
77*3f1979aaSAndroid Build Coastguard Worker
78*3f1979aaSAndroid Build Coastguard Worker /* Architecture-specific build options */
79*3f1979aaSAndroid Build Coastguard Worker /* FIXME: #pragma GCC push_options is only available on GCC */
80*3f1979aaSAndroid Build Coastguard Worker #if defined(__GNUC__)
81*3f1979aaSAndroid Build Coastguard Worker #if defined(__arm__) && __ARM_ARCH == 7
82*3f1979aaSAndroid Build Coastguard Worker /* According to ARM C Language Extensions Architecture specification,
83*3f1979aaSAndroid Build Coastguard Worker * __ARM_NEON is defined to a value indicating the Advanced SIMD (NEON)
84*3f1979aaSAndroid Build Coastguard Worker * architecture supported.
85*3f1979aaSAndroid Build Coastguard Worker */
86*3f1979aaSAndroid Build Coastguard Worker #if !defined(__ARM_NEON) || !defined(__ARM_NEON__)
87*3f1979aaSAndroid Build Coastguard Worker #error "You must enable NEON instructions (e.g. -mfpu=neon) to use SSE2NEON."
88*3f1979aaSAndroid Build Coastguard Worker #endif
89*3f1979aaSAndroid Build Coastguard Worker #pragma GCC push_options
90*3f1979aaSAndroid Build Coastguard Worker #pragma GCC target("fpu=neon")
91*3f1979aaSAndroid Build Coastguard Worker #elif defined(__aarch64__)
92*3f1979aaSAndroid Build Coastguard Worker #pragma GCC push_options
93*3f1979aaSAndroid Build Coastguard Worker #pragma GCC target("+simd")
94*3f1979aaSAndroid Build Coastguard Worker #else
95*3f1979aaSAndroid Build Coastguard Worker #error "Unsupported target. Must be either ARMv7-A+NEON or ARMv8-A."
96*3f1979aaSAndroid Build Coastguard Worker #endif
97*3f1979aaSAndroid Build Coastguard Worker #endif
98*3f1979aaSAndroid Build Coastguard Worker
99*3f1979aaSAndroid Build Coastguard Worker #include <arm_neon.h>
100*3f1979aaSAndroid Build Coastguard Worker
/* Rounding functions require either Aarch64 instructions or libm fallback */
102*3f1979aaSAndroid Build Coastguard Worker #if !defined(__aarch64__)
103*3f1979aaSAndroid Build Coastguard Worker #include <math.h>
104*3f1979aaSAndroid Build Coastguard Worker #endif
105*3f1979aaSAndroid Build Coastguard Worker
/* "__has_builtin" can be used to query support for built-in functions
 * provided by gcc/clang and other compilers that support it.
 * When the compiler does not provide it, a stand-in is defined below.
 */
#ifndef __has_builtin /* GCC prior to 10 or non-clang compilers */
/* Compatibility with gcc <= 9: the queried name is pasted onto "HAS", so
 * only the builtins listed below evaluate to 1. Any other name expands to an
 * undefined identifier, which #if evaluates as 0.
 * The defined(__GNUC__) test keeps compilers that do not define __GNUC__
 * (where a bare "__GNUC__ <= 9" would compare 0 <= 9) out of this branch.
 */
#if defined(__GNUC__) && (__GNUC__ <= 9)
#define __has_builtin(x) HAS##x
#define HAS__builtin_popcount 1
#define HAS__builtin_popcountll 1
#else
#define __has_builtin(x) 0
#endif
#endif
119*3f1979aaSAndroid Build Coastguard Worker
/**
 * MACRO for shuffle parameter for _mm_shuffle_ps().
 * Argument fp3 is a digit[0123] that represents the fp from argument "b"
 * of mm_shuffle_ps that will be placed in fp3 of result. fp2 is the same
 * for fp2 in result. fp1 is a digit[0123] that represents the fp from
 * argument "a" of mm_shuffle_ps that will be placed in fp1 of result.
 * fp0 is the same for fp0 of result.
 *
 * The four 2-bit selectors are packed into one value, fp3 in bits 7:6 down
 * to fp0 in bits 1:0; e.g. _MM_SHUFFLE(3, 2, 1, 0) == 0xE4.
 */
#define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
    (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
130*3f1979aaSAndroid Build Coastguard Worker
/* Rounding mode macros.
 * The four direction values occupy the low two bits; CUR_DIRECTION and
 * NO_EXC are separate flag bits.
 */
#define _MM_FROUND_TO_NEAREST_INT 0x00
#define _MM_FROUND_TO_NEG_INF 0x01
#define _MM_FROUND_TO_POS_INF 0x02
#define _MM_FROUND_TO_ZERO 0x03
#define _MM_FROUND_CUR_DIRECTION 0x04
#define _MM_FROUND_NO_EXC 0x08
138*3f1979aaSAndroid Build Coastguard Worker
/* Indicate an immediate constant argument in a given range.
 * The bounds a and b are documentation only: the macro expands to plain
 * "const" and performs no range checking.
 */
#define __constrange(a, b) const
141*3f1979aaSAndroid Build Coastguard Worker
142*3f1979aaSAndroid Build Coastguard Worker /* A few intrinsics accept traditional data types like ints or floats, but
143*3f1979aaSAndroid Build Coastguard Worker * most operate on data types that are specific to SSE.
144*3f1979aaSAndroid Build Coastguard Worker * If a vector type ends in d, it contains doubles, and if it does not have
145*3f1979aaSAndroid Build Coastguard Worker * a suffix, it contains floats. An integer vector type can contain any type
146*3f1979aaSAndroid Build Coastguard Worker * of integer, from chars to shorts to unsigned long longs.
147*3f1979aaSAndroid Build Coastguard Worker */
148*3f1979aaSAndroid Build Coastguard Worker typedef int64x1_t __m64;
149*3f1979aaSAndroid Build Coastguard Worker typedef float32x4_t __m128; /* 128-bit vector containing 4 floats */
150*3f1979aaSAndroid Build Coastguard Worker // On ARM 32-bit architecture, the float64x2_t is not supported.
151*3f1979aaSAndroid Build Coastguard Worker // The data type __m128d should be represented in a different way for related
152*3f1979aaSAndroid Build Coastguard Worker // intrinsic conversion.
153*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
154*3f1979aaSAndroid Build Coastguard Worker typedef float64x2_t __m128d; /* 128-bit vector containing 2 doubles */
155*3f1979aaSAndroid Build Coastguard Worker #else
156*3f1979aaSAndroid Build Coastguard Worker typedef float32x4_t __m128d;
157*3f1979aaSAndroid Build Coastguard Worker #endif
158*3f1979aaSAndroid Build Coastguard Worker typedef int64x2_t __m128i; /* 128-bit vector containing integers */
159*3f1979aaSAndroid Build Coastguard Worker
/* type-safe casting between types */

/* __m128 is float32x4_t, so the f32 conversions are the identity and every
 * other element type maps onto the corresponding NEON vreinterpretq_f32_*
 * or vreinterpretq_*_f32 intrinsic.
 */

/* NEON vector -> __m128 */
#define vreinterpretq_m128_f16(x) vreinterpretq_f32_f16(x)
#define vreinterpretq_m128_f32(x) (x)
#define vreinterpretq_m128_f64(x) vreinterpretq_f32_f64(x)

#define vreinterpretq_m128_u8(x) vreinterpretq_f32_u8(x)
#define vreinterpretq_m128_u16(x) vreinterpretq_f32_u16(x)
#define vreinterpretq_m128_u32(x) vreinterpretq_f32_u32(x)
#define vreinterpretq_m128_u64(x) vreinterpretq_f32_u64(x)

#define vreinterpretq_m128_s8(x) vreinterpretq_f32_s8(x)
#define vreinterpretq_m128_s16(x) vreinterpretq_f32_s16(x)
#define vreinterpretq_m128_s32(x) vreinterpretq_f32_s32(x)
#define vreinterpretq_m128_s64(x) vreinterpretq_f32_s64(x)

/* __m128 -> NEON vector */
#define vreinterpretq_f16_m128(x) vreinterpretq_f16_f32(x)
#define vreinterpretq_f32_m128(x) (x)
#define vreinterpretq_f64_m128(x) vreinterpretq_f64_f32(x)

#define vreinterpretq_u8_m128(x) vreinterpretq_u8_f32(x)
#define vreinterpretq_u16_m128(x) vreinterpretq_u16_f32(x)
#define vreinterpretq_u32_m128(x) vreinterpretq_u32_f32(x)
#define vreinterpretq_u64_m128(x) vreinterpretq_u64_f32(x)

#define vreinterpretq_s8_m128(x) vreinterpretq_s8_f32(x)
#define vreinterpretq_s16_m128(x) vreinterpretq_s16_f32(x)
#define vreinterpretq_s32_m128(x) vreinterpretq_s32_f32(x)
#define vreinterpretq_s64_m128(x) vreinterpretq_s64_f32(x)
189*3f1979aaSAndroid Build Coastguard Worker
/* __m128i is int64x2_t, so the s64 conversions are the identity and every
 * other integer element type maps onto vreinterpretq_s64_* or
 * vreinterpretq_*_s64.
 */

/* NEON vector -> __m128i */
#define vreinterpretq_m128i_s8(x) vreinterpretq_s64_s8(x)
#define vreinterpretq_m128i_s16(x) vreinterpretq_s64_s16(x)
#define vreinterpretq_m128i_s32(x) vreinterpretq_s64_s32(x)
#define vreinterpretq_m128i_s64(x) (x)

#define vreinterpretq_m128i_u8(x) vreinterpretq_s64_u8(x)
#define vreinterpretq_m128i_u16(x) vreinterpretq_s64_u16(x)
#define vreinterpretq_m128i_u32(x) vreinterpretq_s64_u32(x)
#define vreinterpretq_m128i_u64(x) vreinterpretq_s64_u64(x)

/* __m128i -> NEON vector */
#define vreinterpretq_s8_m128i(x) vreinterpretq_s8_s64(x)
#define vreinterpretq_s16_m128i(x) vreinterpretq_s16_s64(x)
#define vreinterpretq_s32_m128i(x) vreinterpretq_s32_s64(x)
#define vreinterpretq_s64_m128i(x) (x)

#define vreinterpretq_u8_m128i(x) vreinterpretq_u8_s64(x)
#define vreinterpretq_u16_m128i(x) vreinterpretq_u16_s64(x)
#define vreinterpretq_u32_m128i(x) vreinterpretq_u32_s64(x)
#define vreinterpretq_u64_m128i(x) vreinterpretq_u64_s64(x)
209*3f1979aaSAndroid Build Coastguard Worker
/* __m64 is int64x1_t, so the s64 conversions are the identity and the other
 * element types map onto the 64-bit vreinterpret_s64_* or vreinterpret_*_s64
 * intrinsics.
 */

/* NEON 64-bit vector -> __m64 */
#define vreinterpret_m64_s8(x) vreinterpret_s64_s8(x)
#define vreinterpret_m64_s16(x) vreinterpret_s64_s16(x)
#define vreinterpret_m64_s32(x) vreinterpret_s64_s32(x)
#define vreinterpret_m64_s64(x) (x)

#define vreinterpret_m64_u8(x) vreinterpret_s64_u8(x)
#define vreinterpret_m64_u16(x) vreinterpret_s64_u16(x)
#define vreinterpret_m64_u32(x) vreinterpret_s64_u32(x)
#define vreinterpret_m64_u64(x) vreinterpret_s64_u64(x)

#define vreinterpret_m64_f16(x) vreinterpret_s64_f16(x)
#define vreinterpret_m64_f32(x) vreinterpret_s64_f32(x)
#define vreinterpret_m64_f64(x) vreinterpret_s64_f64(x)

/* __m64 -> NEON 64-bit vector */
#define vreinterpret_u8_m64(x) vreinterpret_u8_s64(x)
#define vreinterpret_u16_m64(x) vreinterpret_u16_s64(x)
#define vreinterpret_u32_m64(x) vreinterpret_u32_s64(x)
#define vreinterpret_u64_m64(x) vreinterpret_u64_s64(x)

#define vreinterpret_s8_m64(x) vreinterpret_s8_s64(x)
#define vreinterpret_s16_m64(x) vreinterpret_s16_s64(x)
#define vreinterpret_s32_m64(x) vreinterpret_s32_s64(x)
#define vreinterpret_s64_m64(x) (x)

#define vreinterpret_f32_m64(x) vreinterpret_f32_s64(x)
235*3f1979aaSAndroid Build Coastguard Worker
/* __m128d conversions. The underlying type differs per architecture
 * (float64x2_t on AArch64, float32x4_t otherwise), so the identity
 * conversions are the f64 pair on AArch64 and the f32 pair elsewhere.
 */
#if defined(__aarch64__)
#define vreinterpretq_m128d_s32(x) vreinterpretq_f64_s32(x)
#define vreinterpretq_m128d_s64(x) vreinterpretq_f64_s64(x)

#define vreinterpretq_m128d_f64(x) (x)

#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f64(x)

#define vreinterpretq_f64_m128d(x) (x)
#else
#define vreinterpretq_m128d_s32(x) vreinterpretq_f32_s32(x)
#define vreinterpretq_m128d_s64(x) vreinterpretq_f32_s64(x)

#define vreinterpretq_m128d_f32(x) (x)

#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f32(x)

#define vreinterpretq_f32_m128d(x) (x)
#endif
255*3f1979aaSAndroid Build Coastguard Worker
// A union is defined in this header file called 'SIMDVec' which can be used
// by applications which attempt to access the contents of an __m128 struct
// directly. It is important to note that accessing the __m128 struct directly
// is considered bad coding practice by Microsoft: @see:
// https://msdn.microsoft.com/en-us/library/ayeb3ayc.aspx
//
// However, some legacy source code may try to access the contents of an __m128
// struct directly so the developer can use the SIMDVec as an alias for it. Any
// casting must be done manually by the developer, as you cannot cast or
// otherwise alias the base NEON data type for intrinsic operations.
//
// union intended to allow direct access to an __m128 variable using the names
// that the MSVC compiler provides. This union should really only be used when
// trying to access the members of the vector as integer values. GCC/clang
// allow native access to the float members through a simple array access
// operator (in C since 4.6, in C++ since 4.8).
//
// Ideally direct accesses to SIMD vectors should not be used since it can cause
// a performance hit. If it really is needed however, the original __m128
// variable can be aliased with a pointer to this union and used to access
// individual components. The use of this union should be hidden behind a macro
// that is used throughout the codebase to access the members instead of always
// declaring this type of variable.
typedef union ALIGN_STRUCT(16) SIMDVec {
    float m128_f32[4];     // as floats - DON'T USE. Added for convenience.
    int8_t m128_i8[16];    // as signed 8-bit integers.
    int16_t m128_i16[8];   // as signed 16-bit integers.
    int32_t m128_i32[4];   // as signed 32-bit integers.
    int64_t m128_i64[2];   // as signed 64-bit integers.
    uint8_t m128_u8[16];   // as unsigned 8-bit integers.
    uint16_t m128_u16[8];  // as unsigned 16-bit integers.
    uint32_t m128_u32[4];  // as unsigned 32-bit integers.
    uint64_t m128_u64[2];  // as unsigned 64-bit integers.
} SIMDVec;
290*3f1979aaSAndroid Build Coastguard Worker
// casting using SIMDVec
// Each macro views x through a SIMDVec pointer and yields element n of the
// chosen member array, so x must be an addressable object. Both arguments are
// parenthesized so that compound argument expressions group as intended.
#define vreinterpretq_nth_u64_m128i(x, n) (((SIMDVec *) &(x))->m128_u64[(n)])
#define vreinterpretq_nth_u32_m128i(x, n) (((SIMDVec *) &(x))->m128_u32[(n)])
#define vreinterpretq_nth_u8_m128i(x, n) (((SIMDVec *) &(x))->m128_u8[(n)])
295*3f1979aaSAndroid Build Coastguard Worker
/* Backwards compatibility for compilers with lack of specific type support */

// GCC 9 and older (clang excluded) gets this replacement for the
// vld1q_u8_x4 intrinsic, built from four consecutive 16-byte loads.
#if defined(__GNUC__) && !defined(__clang__)
#if __GNUC__ <= 9
/* Load 64 consecutive bytes starting at p into four uint8x16_t vectors:
 * val[0] holds bytes 0-15, val[1] bytes 16-31, val[2] bytes 32-47 and
 * val[3] bytes 48-63.
 */
FORCE_INLINE uint8x16x4_t vld1q_u8_x4(const uint8_t *p)
{
    uint8x16x4_t ret;
    ret.val[0] = vld1q_u8(p + 0);
    ret.val[1] = vld1q_u8(p + 16);
    ret.val[2] = vld1q_u8(p + 32);
    ret.val[3] = vld1q_u8(p + 48);
    return ret;
}
#endif
#endif
312*3f1979aaSAndroid Build Coastguard Worker
313*3f1979aaSAndroid Build Coastguard Worker /* Function Naming Conventions
314*3f1979aaSAndroid Build Coastguard Worker * The naming convention of SSE intrinsics is straightforward. A generic SSE
315*3f1979aaSAndroid Build Coastguard Worker * intrinsic function is given as follows:
316*3f1979aaSAndroid Build Coastguard Worker * _mm_<name>_<data_type>
317*3f1979aaSAndroid Build Coastguard Worker *
318*3f1979aaSAndroid Build Coastguard Worker * The parts of this format are given as follows:
319*3f1979aaSAndroid Build Coastguard Worker * 1. <name> describes the operation performed by the intrinsic
320*3f1979aaSAndroid Build Coastguard Worker * 2. <data_type> identifies the data type of the function's primary arguments
321*3f1979aaSAndroid Build Coastguard Worker *
322*3f1979aaSAndroid Build Coastguard Worker * This last part, <data_type>, is a little complicated. It identifies the
323*3f1979aaSAndroid Build Coastguard Worker * content of the input values, and can be set to any of the following values:
324*3f1979aaSAndroid Build Coastguard Worker * + ps - vectors contain floats (ps stands for packed single-precision)
 * + pd - vectors contain doubles (pd stands for packed double-precision)
326*3f1979aaSAndroid Build Coastguard Worker * + epi8/epi16/epi32/epi64 - vectors contain 8-bit/16-bit/32-bit/64-bit
327*3f1979aaSAndroid Build Coastguard Worker * signed integers
328*3f1979aaSAndroid Build Coastguard Worker * + epu8/epu16/epu32/epu64 - vectors contain 8-bit/16-bit/32-bit/64-bit
329*3f1979aaSAndroid Build Coastguard Worker * unsigned integers
330*3f1979aaSAndroid Build Coastguard Worker * + si128 - unspecified 128-bit vector or 256-bit vector
331*3f1979aaSAndroid Build Coastguard Worker * + m128/m128i/m128d - identifies input vector types when they are different
332*3f1979aaSAndroid Build Coastguard Worker * than the type of the returned vector
333*3f1979aaSAndroid Build Coastguard Worker *
334*3f1979aaSAndroid Build Coastguard Worker * For example, _mm_setzero_ps. The _mm implies that the function returns
335*3f1979aaSAndroid Build Coastguard Worker * a 128-bit vector. The _ps at the end implies that the argument vectors
336*3f1979aaSAndroid Build Coastguard Worker * contain floats.
337*3f1979aaSAndroid Build Coastguard Worker *
338*3f1979aaSAndroid Build Coastguard Worker * A complete example: Byte Shuffle - pshufb (_mm_shuffle_epi8)
339*3f1979aaSAndroid Build Coastguard Worker * // Set packed 16-bit integers. 128 bits, 8 short, per 16 bits
340*3f1979aaSAndroid Build Coastguard Worker * __m128i v_in = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
341*3f1979aaSAndroid Build Coastguard Worker * // Set packed 8-bit integers
342*3f1979aaSAndroid Build Coastguard Worker * // 128 bits, 16 chars, per 8 bits
343*3f1979aaSAndroid Build Coastguard Worker * __m128i v_perm = _mm_setr_epi8(1, 0, 2, 3, 8, 9, 10, 11,
344*3f1979aaSAndroid Build Coastguard Worker * 4, 5, 12, 13, 6, 7, 14, 15);
345*3f1979aaSAndroid Build Coastguard Worker * // Shuffle packed 8-bit integers
346*3f1979aaSAndroid Build Coastguard Worker * __m128i v_out = _mm_shuffle_epi8(v_in, v_perm); // pshufb
347*3f1979aaSAndroid Build Coastguard Worker *
348*3f1979aaSAndroid Build Coastguard Worker * Data (Number, Binary, Byte Index):
349*3f1979aaSAndroid Build Coastguard Worker +------+------+-------------+------+------+-------------+
350*3f1979aaSAndroid Build Coastguard Worker | 1 | 2 | 3 | 4 | Number
351*3f1979aaSAndroid Build Coastguard Worker +------+------+------+------+------+------+------+------+
352*3f1979aaSAndroid Build Coastguard Worker | 0000 | 0001 | 0000 | 0010 | 0000 | 0011 | 0000 | 0100 | Binary
353*3f1979aaSAndroid Build Coastguard Worker +------+------+------+------+------+------+------+------+
354*3f1979aaSAndroid Build Coastguard Worker | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | Index
355*3f1979aaSAndroid Build Coastguard Worker +------+------+------+------+------+------+------+------+
356*3f1979aaSAndroid Build Coastguard Worker
357*3f1979aaSAndroid Build Coastguard Worker +------+------+------+------+------+------+------+------+
358*3f1979aaSAndroid Build Coastguard Worker | 5 | 6 | 7 | 8 | Number
359*3f1979aaSAndroid Build Coastguard Worker +------+------+------+------+------+------+------+------+
360*3f1979aaSAndroid Build Coastguard Worker | 0000 | 0101 | 0000 | 0110 | 0000 | 0111 | 0000 | 1000 | Binary
361*3f1979aaSAndroid Build Coastguard Worker +------+------+------+------+------+------+------+------+
362*3f1979aaSAndroid Build Coastguard Worker | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | Index
363*3f1979aaSAndroid Build Coastguard Worker +------+------+------+------+------+------+------+------+
364*3f1979aaSAndroid Build Coastguard Worker * Index (Byte Index):
365*3f1979aaSAndroid Build Coastguard Worker +------+------+------+------+------+------+------+------+
366*3f1979aaSAndroid Build Coastguard Worker | 1 | 0 | 2 | 3 | 8 | 9 | 10 | 11 |
367*3f1979aaSAndroid Build Coastguard Worker +------+------+------+------+------+------+------+------+
368*3f1979aaSAndroid Build Coastguard Worker
369*3f1979aaSAndroid Build Coastguard Worker +------+------+------+------+------+------+------+------+
370*3f1979aaSAndroid Build Coastguard Worker | 4 | 5 | 12 | 13 | 6 | 7 | 14 | 15 |
371*3f1979aaSAndroid Build Coastguard Worker +------+------+------+------+------+------+------+------+
372*3f1979aaSAndroid Build Coastguard Worker * Result:
373*3f1979aaSAndroid Build Coastguard Worker +------+------+------+------+------+------+------+------+
374*3f1979aaSAndroid Build Coastguard Worker | 1 | 0 | 2 | 3 | 8 | 9 | 10 | 11 | Index
375*3f1979aaSAndroid Build Coastguard Worker +------+------+------+------+------+------+------+------+
376*3f1979aaSAndroid Build Coastguard Worker | 0001 | 0000 | 0000 | 0010 | 0000 | 0101 | 0000 | 0110 | Binary
377*3f1979aaSAndroid Build Coastguard Worker +------+------+------+------+------+------+------+------+
378*3f1979aaSAndroid Build Coastguard Worker | 256 | 2 | 5 | 6 | Number
379*3f1979aaSAndroid Build Coastguard Worker +------+------+------+------+------+------+------+------+
380*3f1979aaSAndroid Build Coastguard Worker
381*3f1979aaSAndroid Build Coastguard Worker +------+------+------+------+------+------+------+------+
382*3f1979aaSAndroid Build Coastguard Worker | 4 | 5 | 12 | 13 | 6 | 7 | 14 | 15 | Index
383*3f1979aaSAndroid Build Coastguard Worker +------+------+------+------+------+------+------+------+
384*3f1979aaSAndroid Build Coastguard Worker | 0000 | 0011 | 0000 | 0111 | 0000 | 0100 | 0000 | 1000 | Binary
385*3f1979aaSAndroid Build Coastguard Worker +------+------+------+------+------+------+------+------+
386*3f1979aaSAndroid Build Coastguard Worker | 3 | 7 | 4 | 8 | Number
387*3f1979aaSAndroid Build Coastguard Worker +------+------+------+------+------+------+-------------+
388*3f1979aaSAndroid Build Coastguard Worker */
389*3f1979aaSAndroid Build Coastguard Worker
390*3f1979aaSAndroid Build Coastguard Worker /* Set/get methods */
391*3f1979aaSAndroid Build Coastguard Worker
/* Constants for use with _mm_prefetch.
 *
 * Values 0-3 are the plain hints; values 4-7 are the "exclusive" versions of
 * the same hints (each is the corresponding plain hint plus 4).
 *
 * NOTE(review): these numeric values are the ones chosen by this header;
 * confirm that no caller passes raw integers taken from another platform's
 * headers instead of the enumerator names.
 */
enum _mm_hint {
    _MM_HINT_NTA = 0,  /* load data to L1 and L2 cache, mark it as NTA */
    _MM_HINT_T0 = 1,   /* load data to L1 and L2 cache */
    _MM_HINT_T1 = 2,   /* load data to L2 cache only */
    _MM_HINT_T2 = 3,   /* load data to L2 cache only, mark it as NTA */
    _MM_HINT_ENTA = 4, /* exclusive version of _MM_HINT_NTA */
    _MM_HINT_ET0 = 5,  /* exclusive version of _MM_HINT_T0 */
    _MM_HINT_ET1 = 6,  /* exclusive version of _MM_HINT_T1 */
    _MM_HINT_ET2 = 7   /* exclusive version of _MM_HINT_T2 */
};
403*3f1979aaSAndroid Build Coastguard Worker
404*3f1979aaSAndroid Build Coastguard Worker // Loads one cache line of data from address p to a location closer to the
405*3f1979aaSAndroid Build Coastguard Worker // processor. https://msdn.microsoft.com/en-us/library/84szxsww(v=vs.100).aspx
_mm_prefetch(const void * p,int i)406*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE void _mm_prefetch(const void *p, int i)
407*3f1979aaSAndroid Build Coastguard Worker {
408*3f1979aaSAndroid Build Coastguard Worker (void) i;
409*3f1979aaSAndroid Build Coastguard Worker __builtin_prefetch(p);
410*3f1979aaSAndroid Build Coastguard Worker }
411*3f1979aaSAndroid Build Coastguard Worker
412*3f1979aaSAndroid Build Coastguard Worker // Copy the lower single-precision (32-bit) floating-point element of a to dst.
413*3f1979aaSAndroid Build Coastguard Worker //
414*3f1979aaSAndroid Build Coastguard Worker // dst[31:0] := a[31:0]
415*3f1979aaSAndroid Build Coastguard Worker //
416*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_f32
_mm_cvtss_f32(__m128 a)417*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE float _mm_cvtss_f32(__m128 a)
418*3f1979aaSAndroid Build Coastguard Worker {
419*3f1979aaSAndroid Build Coastguard Worker return vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
420*3f1979aaSAndroid Build Coastguard Worker }
421*3f1979aaSAndroid Build Coastguard Worker
422*3f1979aaSAndroid Build Coastguard Worker // Sets the 128-bit value to zero
423*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/vstudio/ys7dw0kh(v=vs.100).aspx
_mm_setzero_si128(void)424*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_setzero_si128(void)
425*3f1979aaSAndroid Build Coastguard Worker {
426*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s32(vdupq_n_s32(0));
427*3f1979aaSAndroid Build Coastguard Worker }
428*3f1979aaSAndroid Build Coastguard Worker
429*3f1979aaSAndroid Build Coastguard Worker // Clears the four single-precision, floating-point values.
430*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/vstudio/tk1t2tbz(v=vs.100).aspx
_mm_setzero_ps(void)431*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_setzero_ps(void)
432*3f1979aaSAndroid Build Coastguard Worker {
433*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128_f32(vdupq_n_f32(0));
434*3f1979aaSAndroid Build Coastguard Worker }
435*3f1979aaSAndroid Build Coastguard Worker
436*3f1979aaSAndroid Build Coastguard Worker // Sets the four single-precision, floating-point values to w.
437*3f1979aaSAndroid Build Coastguard Worker //
438*3f1979aaSAndroid Build Coastguard Worker // r0 := r1 := r2 := r3 := w
439*3f1979aaSAndroid Build Coastguard Worker //
440*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx
_mm_set1_ps(float _w)441*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_set1_ps(float _w)
442*3f1979aaSAndroid Build Coastguard Worker {
443*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128_f32(vdupq_n_f32(_w));
444*3f1979aaSAndroid Build Coastguard Worker }
445*3f1979aaSAndroid Build Coastguard Worker
446*3f1979aaSAndroid Build Coastguard Worker // Sets the four single-precision, floating-point values to w.
447*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx
_mm_set_ps1(float _w)448*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_set_ps1(float _w)
449*3f1979aaSAndroid Build Coastguard Worker {
450*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128_f32(vdupq_n_f32(_w));
451*3f1979aaSAndroid Build Coastguard Worker }
452*3f1979aaSAndroid Build Coastguard Worker
453*3f1979aaSAndroid Build Coastguard Worker // Sets the four single-precision, floating-point values to the four inputs.
454*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/vstudio/afh0zf75(v=vs.100).aspx
_mm_set_ps(float w,float z,float y,float x)455*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x)
456*3f1979aaSAndroid Build Coastguard Worker {
457*3f1979aaSAndroid Build Coastguard Worker float ALIGN_STRUCT(16) data[4] = {x, y, z, w};
458*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128_f32(vld1q_f32(data));
459*3f1979aaSAndroid Build Coastguard Worker }
460*3f1979aaSAndroid Build Coastguard Worker
461*3f1979aaSAndroid Build Coastguard Worker // Copy single-precision (32-bit) floating-point element a to the lower element
462*3f1979aaSAndroid Build Coastguard Worker // of dst, and zero the upper 3 elements.
463*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_ss
_mm_set_ss(float a)464*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_set_ss(float a)
465*3f1979aaSAndroid Build Coastguard Worker {
466*3f1979aaSAndroid Build Coastguard Worker float ALIGN_STRUCT(16) data[4] = {a, 0, 0, 0};
467*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128_f32(vld1q_f32(data));
468*3f1979aaSAndroid Build Coastguard Worker }
469*3f1979aaSAndroid Build Coastguard Worker
470*3f1979aaSAndroid Build Coastguard Worker // Sets the four single-precision, floating-point values to the four inputs in
471*3f1979aaSAndroid Build Coastguard Worker // reverse order.
472*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/vstudio/d2172ct3(v=vs.100).aspx
_mm_setr_ps(float w,float z,float y,float x)473*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_setr_ps(float w, float z, float y, float x)
474*3f1979aaSAndroid Build Coastguard Worker {
475*3f1979aaSAndroid Build Coastguard Worker float ALIGN_STRUCT(16) data[4] = {w, z, y, x};
476*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128_f32(vld1q_f32(data));
477*3f1979aaSAndroid Build Coastguard Worker }
478*3f1979aaSAndroid Build Coastguard Worker
479*3f1979aaSAndroid Build Coastguard Worker // Sets the 8 signed 16-bit integer values in reverse order.
480*3f1979aaSAndroid Build Coastguard Worker //
481*3f1979aaSAndroid Build Coastguard Worker // Return Value
482*3f1979aaSAndroid Build Coastguard Worker // r0 := w0
483*3f1979aaSAndroid Build Coastguard Worker // r1 := w1
484*3f1979aaSAndroid Build Coastguard Worker // ...
485*3f1979aaSAndroid Build Coastguard Worker // r7 := w7
_mm_setr_epi16(short w0,short w1,short w2,short w3,short w4,short w5,short w6,short w7)486*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_setr_epi16(short w0,
487*3f1979aaSAndroid Build Coastguard Worker short w1,
488*3f1979aaSAndroid Build Coastguard Worker short w2,
489*3f1979aaSAndroid Build Coastguard Worker short w3,
490*3f1979aaSAndroid Build Coastguard Worker short w4,
491*3f1979aaSAndroid Build Coastguard Worker short w5,
492*3f1979aaSAndroid Build Coastguard Worker short w6,
493*3f1979aaSAndroid Build Coastguard Worker short w7)
494*3f1979aaSAndroid Build Coastguard Worker {
495*3f1979aaSAndroid Build Coastguard Worker int16_t ALIGN_STRUCT(16) data[8] = {w0, w1, w2, w3, w4, w5, w6, w7};
496*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s16(vld1q_s16((int16_t *) data));
497*3f1979aaSAndroid Build Coastguard Worker }
498*3f1979aaSAndroid Build Coastguard Worker
499*3f1979aaSAndroid Build Coastguard Worker // Sets the 4 signed 32-bit integer values in reverse order
500*3f1979aaSAndroid Build Coastguard Worker // https://technet.microsoft.com/en-us/library/security/27yb3ee5(v=vs.90).aspx
_mm_setr_epi32(int i3,int i2,int i1,int i0)501*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_setr_epi32(int i3, int i2, int i1, int i0)
502*3f1979aaSAndroid Build Coastguard Worker {
503*3f1979aaSAndroid Build Coastguard Worker int32_t ALIGN_STRUCT(16) data[4] = {i3, i2, i1, i0};
504*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s32(vld1q_s32(data));
505*3f1979aaSAndroid Build Coastguard Worker }
506*3f1979aaSAndroid Build Coastguard Worker
507*3f1979aaSAndroid Build Coastguard Worker // Set packed 64-bit integers in dst with the supplied values in reverse order.
508*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_epi64
_mm_setr_epi64(__m64 e1,__m64 e0)509*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_setr_epi64(__m64 e1, __m64 e0)
510*3f1979aaSAndroid Build Coastguard Worker {
511*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s64(vcombine_s64(e1, e0));
512*3f1979aaSAndroid Build Coastguard Worker }
513*3f1979aaSAndroid Build Coastguard Worker
514*3f1979aaSAndroid Build Coastguard Worker // Sets the 16 signed 8-bit integer values to b.
515*3f1979aaSAndroid Build Coastguard Worker //
516*3f1979aaSAndroid Build Coastguard Worker // r0 := b
517*3f1979aaSAndroid Build Coastguard Worker // r1 := b
518*3f1979aaSAndroid Build Coastguard Worker // ...
519*3f1979aaSAndroid Build Coastguard Worker // r15 := b
520*3f1979aaSAndroid Build Coastguard Worker //
521*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/6e14xhyf(v=vs.100).aspx
_mm_set1_epi8(signed char w)522*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_set1_epi8(signed char w)
523*3f1979aaSAndroid Build Coastguard Worker {
524*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s8(vdupq_n_s8(w));
525*3f1979aaSAndroid Build Coastguard Worker }
526*3f1979aaSAndroid Build Coastguard Worker
527*3f1979aaSAndroid Build Coastguard Worker // Sets the 8 signed 16-bit integer values to w.
528*3f1979aaSAndroid Build Coastguard Worker //
529*3f1979aaSAndroid Build Coastguard Worker // r0 := w
530*3f1979aaSAndroid Build Coastguard Worker // r1 := w
531*3f1979aaSAndroid Build Coastguard Worker // ...
532*3f1979aaSAndroid Build Coastguard Worker // r7 := w
533*3f1979aaSAndroid Build Coastguard Worker //
534*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/k0ya3x0e(v=vs.90).aspx
_mm_set1_epi16(short w)535*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_set1_epi16(short w)
536*3f1979aaSAndroid Build Coastguard Worker {
537*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s16(vdupq_n_s16(w));
538*3f1979aaSAndroid Build Coastguard Worker }
539*3f1979aaSAndroid Build Coastguard Worker
540*3f1979aaSAndroid Build Coastguard Worker // Sets the 16 signed 8-bit integer values.
541*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/x0cx8zd3(v=vs.90).aspx
_mm_set_epi8(signed char b15,signed char b14,signed char b13,signed char b12,signed char b11,signed char b10,signed char b9,signed char b8,signed char b7,signed char b6,signed char b5,signed char b4,signed char b3,signed char b2,signed char b1,signed char b0)542*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_set_epi8(signed char b15,
543*3f1979aaSAndroid Build Coastguard Worker signed char b14,
544*3f1979aaSAndroid Build Coastguard Worker signed char b13,
545*3f1979aaSAndroid Build Coastguard Worker signed char b12,
546*3f1979aaSAndroid Build Coastguard Worker signed char b11,
547*3f1979aaSAndroid Build Coastguard Worker signed char b10,
548*3f1979aaSAndroid Build Coastguard Worker signed char b9,
549*3f1979aaSAndroid Build Coastguard Worker signed char b8,
550*3f1979aaSAndroid Build Coastguard Worker signed char b7,
551*3f1979aaSAndroid Build Coastguard Worker signed char b6,
552*3f1979aaSAndroid Build Coastguard Worker signed char b5,
553*3f1979aaSAndroid Build Coastguard Worker signed char b4,
554*3f1979aaSAndroid Build Coastguard Worker signed char b3,
555*3f1979aaSAndroid Build Coastguard Worker signed char b2,
556*3f1979aaSAndroid Build Coastguard Worker signed char b1,
557*3f1979aaSAndroid Build Coastguard Worker signed char b0)
558*3f1979aaSAndroid Build Coastguard Worker {
559*3f1979aaSAndroid Build Coastguard Worker int8_t ALIGN_STRUCT(16)
560*3f1979aaSAndroid Build Coastguard Worker data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3,
561*3f1979aaSAndroid Build Coastguard Worker (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7,
562*3f1979aaSAndroid Build Coastguard Worker (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11,
563*3f1979aaSAndroid Build Coastguard Worker (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
564*3f1979aaSAndroid Build Coastguard Worker return (__m128i) vld1q_s8(data);
565*3f1979aaSAndroid Build Coastguard Worker }
566*3f1979aaSAndroid Build Coastguard Worker
567*3f1979aaSAndroid Build Coastguard Worker // Sets the 8 signed 16-bit integer values.
568*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-au/library/3e0fek84(v=vs.90).aspx
_mm_set_epi16(short i7,short i6,short i5,short i4,short i3,short i2,short i1,short i0)569*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_set_epi16(short i7,
570*3f1979aaSAndroid Build Coastguard Worker short i6,
571*3f1979aaSAndroid Build Coastguard Worker short i5,
572*3f1979aaSAndroid Build Coastguard Worker short i4,
573*3f1979aaSAndroid Build Coastguard Worker short i3,
574*3f1979aaSAndroid Build Coastguard Worker short i2,
575*3f1979aaSAndroid Build Coastguard Worker short i1,
576*3f1979aaSAndroid Build Coastguard Worker short i0)
577*3f1979aaSAndroid Build Coastguard Worker {
578*3f1979aaSAndroid Build Coastguard Worker int16_t ALIGN_STRUCT(16) data[8] = {i0, i1, i2, i3, i4, i5, i6, i7};
579*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s16(vld1q_s16(data));
580*3f1979aaSAndroid Build Coastguard Worker }
581*3f1979aaSAndroid Build Coastguard Worker
582*3f1979aaSAndroid Build Coastguard Worker // Sets the 16 signed 8-bit integer values in reverse order.
583*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/2khb9c7k(v=vs.90).aspx
_mm_setr_epi8(signed char b0,signed char b1,signed char b2,signed char b3,signed char b4,signed char b5,signed char b6,signed char b7,signed char b8,signed char b9,signed char b10,signed char b11,signed char b12,signed char b13,signed char b14,signed char b15)584*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_setr_epi8(signed char b0,
585*3f1979aaSAndroid Build Coastguard Worker signed char b1,
586*3f1979aaSAndroid Build Coastguard Worker signed char b2,
587*3f1979aaSAndroid Build Coastguard Worker signed char b3,
588*3f1979aaSAndroid Build Coastguard Worker signed char b4,
589*3f1979aaSAndroid Build Coastguard Worker signed char b5,
590*3f1979aaSAndroid Build Coastguard Worker signed char b6,
591*3f1979aaSAndroid Build Coastguard Worker signed char b7,
592*3f1979aaSAndroid Build Coastguard Worker signed char b8,
593*3f1979aaSAndroid Build Coastguard Worker signed char b9,
594*3f1979aaSAndroid Build Coastguard Worker signed char b10,
595*3f1979aaSAndroid Build Coastguard Worker signed char b11,
596*3f1979aaSAndroid Build Coastguard Worker signed char b12,
597*3f1979aaSAndroid Build Coastguard Worker signed char b13,
598*3f1979aaSAndroid Build Coastguard Worker signed char b14,
599*3f1979aaSAndroid Build Coastguard Worker signed char b15)
600*3f1979aaSAndroid Build Coastguard Worker {
601*3f1979aaSAndroid Build Coastguard Worker int8_t ALIGN_STRUCT(16)
602*3f1979aaSAndroid Build Coastguard Worker data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3,
603*3f1979aaSAndroid Build Coastguard Worker (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7,
604*3f1979aaSAndroid Build Coastguard Worker (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11,
605*3f1979aaSAndroid Build Coastguard Worker (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
606*3f1979aaSAndroid Build Coastguard Worker return (__m128i) vld1q_s8(data);
607*3f1979aaSAndroid Build Coastguard Worker }
608*3f1979aaSAndroid Build Coastguard Worker
609*3f1979aaSAndroid Build Coastguard Worker // Sets the 4 signed 32-bit integer values to i.
610*3f1979aaSAndroid Build Coastguard Worker //
611*3f1979aaSAndroid Build Coastguard Worker // r0 := i
612*3f1979aaSAndroid Build Coastguard Worker // r1 := i
613*3f1979aaSAndroid Build Coastguard Worker // r2 := i
614*3f1979aaSAndroid Build Coastguard Worker // r3 := I
615*3f1979aaSAndroid Build Coastguard Worker //
616*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/vstudio/h4xscxat(v=vs.100).aspx
_mm_set1_epi32(int _i)617*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_set1_epi32(int _i)
618*3f1979aaSAndroid Build Coastguard Worker {
619*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s32(vdupq_n_s32(_i));
620*3f1979aaSAndroid Build Coastguard Worker }
621*3f1979aaSAndroid Build Coastguard Worker
622*3f1979aaSAndroid Build Coastguard Worker // Sets the 2 signed 64-bit integer values to i.
623*3f1979aaSAndroid Build Coastguard Worker // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/whtfzhzk(v=vs.100)
_mm_set1_epi64(__m64 _i)624*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_set1_epi64(__m64 _i)
625*3f1979aaSAndroid Build Coastguard Worker {
626*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s64(vdupq_n_s64((int64_t) _i));
627*3f1979aaSAndroid Build Coastguard Worker }
628*3f1979aaSAndroid Build Coastguard Worker
629*3f1979aaSAndroid Build Coastguard Worker // Sets the 2 signed 64-bit integer values to i.
630*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_epi64x
_mm_set1_epi64x(int64_t _i)631*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_set1_epi64x(int64_t _i)
632*3f1979aaSAndroid Build Coastguard Worker {
633*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s64(vdupq_n_s64(_i));
634*3f1979aaSAndroid Build Coastguard Worker }
635*3f1979aaSAndroid Build Coastguard Worker
636*3f1979aaSAndroid Build Coastguard Worker // Sets the 4 signed 32-bit integer values.
637*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/vstudio/019beekt(v=vs.100).aspx
_mm_set_epi32(int i3,int i2,int i1,int i0)638*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_set_epi32(int i3, int i2, int i1, int i0)
639*3f1979aaSAndroid Build Coastguard Worker {
640*3f1979aaSAndroid Build Coastguard Worker int32_t ALIGN_STRUCT(16) data[4] = {i0, i1, i2, i3};
641*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s32(vld1q_s32(data));
642*3f1979aaSAndroid Build Coastguard Worker }
643*3f1979aaSAndroid Build Coastguard Worker
644*3f1979aaSAndroid Build Coastguard Worker // Returns the __m128i structure with its two 64-bit integer values
645*3f1979aaSAndroid Build Coastguard Worker // initialized to the values of the two 64-bit integers passed in.
646*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/dk2sdw0h(v=vs.120).aspx
_mm_set_epi64x(int64_t i1,int64_t i2)647*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_set_epi64x(int64_t i1, int64_t i2)
648*3f1979aaSAndroid Build Coastguard Worker {
649*3f1979aaSAndroid Build Coastguard Worker int64_t ALIGN_STRUCT(16) data[2] = {i2, i1};
650*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s64(vld1q_s64(data));
651*3f1979aaSAndroid Build Coastguard Worker }
652*3f1979aaSAndroid Build Coastguard Worker
653*3f1979aaSAndroid Build Coastguard Worker // Returns the __m128i structure with its two 64-bit integer values
654*3f1979aaSAndroid Build Coastguard Worker // initialized to the values of the two 64-bit integers passed in.
655*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/dk2sdw0h(v=vs.120).aspx
_mm_set_epi64(__m64 i1,__m64 i2)656*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_set_epi64(__m64 i1, __m64 i2)
657*3f1979aaSAndroid Build Coastguard Worker {
658*3f1979aaSAndroid Build Coastguard Worker return _mm_set_epi64x((int64_t) i1, (int64_t) i2);
659*3f1979aaSAndroid Build Coastguard Worker }
660*3f1979aaSAndroid Build Coastguard Worker
661*3f1979aaSAndroid Build Coastguard Worker // Set packed double-precision (64-bit) floating-point elements in dst with the
662*3f1979aaSAndroid Build Coastguard Worker // supplied values.
663*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_pd
_mm_set_pd(double e1,double e0)664*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128d _mm_set_pd(double e1, double e0)
665*3f1979aaSAndroid Build Coastguard Worker {
666*3f1979aaSAndroid Build Coastguard Worker double ALIGN_STRUCT(16) data[2] = {e0, e1};
667*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
668*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128d_f64(vld1q_f64((float64_t *) data));
669*3f1979aaSAndroid Build Coastguard Worker #else
670*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) data));
671*3f1979aaSAndroid Build Coastguard Worker #endif
672*3f1979aaSAndroid Build Coastguard Worker }
673*3f1979aaSAndroid Build Coastguard Worker
674*3f1979aaSAndroid Build Coastguard Worker // Stores four single-precision, floating-point values.
675*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/vstudio/s3h4ay6y(v=vs.100).aspx
_mm_store_ps(float * p,__m128 a)676*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
677*3f1979aaSAndroid Build Coastguard Worker {
678*3f1979aaSAndroid Build Coastguard Worker vst1q_f32(p, vreinterpretq_f32_m128(a));
679*3f1979aaSAndroid Build Coastguard Worker }
680*3f1979aaSAndroid Build Coastguard Worker
681*3f1979aaSAndroid Build Coastguard Worker // Stores four single-precision, floating-point values.
682*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/44e30x22(v=vs.100).aspx
_mm_storeu_ps(float * p,__m128 a)683*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
684*3f1979aaSAndroid Build Coastguard Worker {
685*3f1979aaSAndroid Build Coastguard Worker vst1q_f32(p, vreinterpretq_f32_m128(a));
686*3f1979aaSAndroid Build Coastguard Worker }
687*3f1979aaSAndroid Build Coastguard Worker
688*3f1979aaSAndroid Build Coastguard Worker // Stores four 32-bit integer values as (as a __m128i value) at the address p.
689*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/vstudio/edk11s13(v=vs.100).aspx
_mm_store_si128(__m128i * p,__m128i a)690*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
691*3f1979aaSAndroid Build Coastguard Worker {
692*3f1979aaSAndroid Build Coastguard Worker vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
693*3f1979aaSAndroid Build Coastguard Worker }
694*3f1979aaSAndroid Build Coastguard Worker
695*3f1979aaSAndroid Build Coastguard Worker // Stores four 32-bit integer values as (as a __m128i value) at the address p.
696*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/vstudio/edk11s13(v=vs.100).aspx
_mm_storeu_si128(__m128i * p,__m128i a)697*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a)
698*3f1979aaSAndroid Build Coastguard Worker {
699*3f1979aaSAndroid Build Coastguard Worker vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
700*3f1979aaSAndroid Build Coastguard Worker }
701*3f1979aaSAndroid Build Coastguard Worker
702*3f1979aaSAndroid Build Coastguard Worker // Stores the lower single - precision, floating - point value.
703*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/tzz10fbx(v=vs.100).aspx
_mm_store_ss(float * p,__m128 a)704*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE void _mm_store_ss(float *p, __m128 a)
705*3f1979aaSAndroid Build Coastguard Worker {
706*3f1979aaSAndroid Build Coastguard Worker vst1q_lane_f32(p, vreinterpretq_f32_m128(a), 0);
707*3f1979aaSAndroid Build Coastguard Worker }
708*3f1979aaSAndroid Build Coastguard Worker
709*3f1979aaSAndroid Build Coastguard Worker // Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
710*3f1979aaSAndroid Build Coastguard Worker // elements) from a into memory. mem_addr must be aligned on a 16-byte boundary
711*3f1979aaSAndroid Build Coastguard Worker // or a general-protection exception may be generated.
712*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_pd
_mm_store_pd(double * mem_addr,__m128d a)713*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE void _mm_store_pd(double *mem_addr, __m128d a)
714*3f1979aaSAndroid Build Coastguard Worker {
715*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
716*3f1979aaSAndroid Build Coastguard Worker vst1q_f64((float64_t *) mem_addr, vreinterpretq_f64_m128d(a));
717*3f1979aaSAndroid Build Coastguard Worker #else
718*3f1979aaSAndroid Build Coastguard Worker vst1q_f32((float32_t *) mem_addr, vreinterpretq_f32_m128d(a));
719*3f1979aaSAndroid Build Coastguard Worker #endif
720*3f1979aaSAndroid Build Coastguard Worker }
721*3f1979aaSAndroid Build Coastguard Worker
722*3f1979aaSAndroid Build Coastguard Worker // Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
723*3f1979aaSAndroid Build Coastguard Worker // elements) from a into memory. mem_addr does not need to be aligned on any
724*3f1979aaSAndroid Build Coastguard Worker // particular boundary.
725*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_pd
_mm_storeu_pd(double * mem_addr,__m128d a)726*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE void _mm_storeu_pd(double *mem_addr, __m128d a)
727*3f1979aaSAndroid Build Coastguard Worker {
728*3f1979aaSAndroid Build Coastguard Worker _mm_store_pd(mem_addr, a);
729*3f1979aaSAndroid Build Coastguard Worker }
730*3f1979aaSAndroid Build Coastguard Worker
731*3f1979aaSAndroid Build Coastguard Worker // Reads the lower 64 bits of b and stores them into the lower 64 bits of a.
732*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/hhwf428f%28v=vs.90%29.aspx
_mm_storel_epi64(__m128i * a,__m128i b)733*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE void _mm_storel_epi64(__m128i *a, __m128i b)
734*3f1979aaSAndroid Build Coastguard Worker {
735*3f1979aaSAndroid Build Coastguard Worker uint64x1_t hi = vget_high_u64(vreinterpretq_u64_m128i(*a));
736*3f1979aaSAndroid Build Coastguard Worker uint64x1_t lo = vget_low_u64(vreinterpretq_u64_m128i(b));
737*3f1979aaSAndroid Build Coastguard Worker *a = vreinterpretq_m128i_u64(vcombine_u64(lo, hi));
738*3f1979aaSAndroid Build Coastguard Worker }
739*3f1979aaSAndroid Build Coastguard Worker
740*3f1979aaSAndroid Build Coastguard Worker // Stores the lower two single-precision floating point values of a to the
741*3f1979aaSAndroid Build Coastguard Worker // address p.
742*3f1979aaSAndroid Build Coastguard Worker //
743*3f1979aaSAndroid Build Coastguard Worker // *p0 := a0
744*3f1979aaSAndroid Build Coastguard Worker // *p1 := a1
745*3f1979aaSAndroid Build Coastguard Worker //
746*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/h54t98ks(v=vs.90).aspx
_mm_storel_pi(__m64 * p,__m128 a)747*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE void _mm_storel_pi(__m64 *p, __m128 a)
748*3f1979aaSAndroid Build Coastguard Worker {
749*3f1979aaSAndroid Build Coastguard Worker *p = vreinterpret_m64_f32(vget_low_f32(a));
750*3f1979aaSAndroid Build Coastguard Worker }
751*3f1979aaSAndroid Build Coastguard Worker
752*3f1979aaSAndroid Build Coastguard Worker // Stores the upper two single-precision, floating-point values of a to the
753*3f1979aaSAndroid Build Coastguard Worker // address p.
754*3f1979aaSAndroid Build Coastguard Worker //
755*3f1979aaSAndroid Build Coastguard Worker // *p0 := a2
756*3f1979aaSAndroid Build Coastguard Worker // *p1 := a3
757*3f1979aaSAndroid Build Coastguard Worker //
758*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/a7525fs8(v%3dvs.90).aspx
_mm_storeh_pi(__m64 * p,__m128 a)759*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE void _mm_storeh_pi(__m64 *p, __m128 a)
760*3f1979aaSAndroid Build Coastguard Worker {
761*3f1979aaSAndroid Build Coastguard Worker *p = vreinterpret_m64_f32(vget_high_f32(a));
762*3f1979aaSAndroid Build Coastguard Worker }
763*3f1979aaSAndroid Build Coastguard Worker
764*3f1979aaSAndroid Build Coastguard Worker // Loads a single single-precision, floating-point value, copying it into all
765*3f1979aaSAndroid Build Coastguard Worker // four words
766*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/vstudio/5cdkf716(v=vs.100).aspx
_mm_load1_ps(const float * p)767*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_load1_ps(const float *p)
768*3f1979aaSAndroid Build Coastguard Worker {
769*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128_f32(vld1q_dup_f32(p));
770*3f1979aaSAndroid Build Coastguard Worker }
771*3f1979aaSAndroid Build Coastguard Worker
// Load a single-precision (32-bit) floating-point element from memory into all
// elements of dst.
//
//   dst[31:0] := MEM[mem_addr+31:mem_addr]
//   dst[63:32] := MEM[mem_addr+31:mem_addr]
//   dst[95:64] := MEM[mem_addr+31:mem_addr]
//   dst[127:96] := MEM[mem_addr+31:mem_addr]
//
// Provided as a macro alias: every use of _mm_load_ps1 expands to
// _mm_load1_ps.
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_ps1
#define _mm_load_ps1 _mm_load1_ps
782*3f1979aaSAndroid Build Coastguard Worker
783*3f1979aaSAndroid Build Coastguard Worker // Sets the lower two single-precision, floating-point values with 64
784*3f1979aaSAndroid Build Coastguard Worker // bits of data loaded from the address p; the upper two values are passed
785*3f1979aaSAndroid Build Coastguard Worker // through from a.
786*3f1979aaSAndroid Build Coastguard Worker //
787*3f1979aaSAndroid Build Coastguard Worker // Return Value
788*3f1979aaSAndroid Build Coastguard Worker // r0 := *p0
789*3f1979aaSAndroid Build Coastguard Worker // r1 := *p1
790*3f1979aaSAndroid Build Coastguard Worker // r2 := a2
791*3f1979aaSAndroid Build Coastguard Worker // r3 := a3
792*3f1979aaSAndroid Build Coastguard Worker //
793*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/s57cyak2(v=vs.100).aspx
_mm_loadl_pi(__m128 a,__m64 const * p)794*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_loadl_pi(__m128 a, __m64 const *p)
795*3f1979aaSAndroid Build Coastguard Worker {
796*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128_f32(
797*3f1979aaSAndroid Build Coastguard Worker vcombine_f32(vld1_f32((const float32_t *) p), vget_high_f32(a)));
798*3f1979aaSAndroid Build Coastguard Worker }
799*3f1979aaSAndroid Build Coastguard Worker
800*3f1979aaSAndroid Build Coastguard Worker // Load 4 single-precision (32-bit) floating-point elements from memory into dst
801*3f1979aaSAndroid Build Coastguard Worker // in reverse order. mem_addr must be aligned on a 16-byte boundary or a
802*3f1979aaSAndroid Build Coastguard Worker // general-protection exception may be generated.
803*3f1979aaSAndroid Build Coastguard Worker //
804*3f1979aaSAndroid Build Coastguard Worker // dst[31:0] := MEM[mem_addr+127:mem_addr+96]
805*3f1979aaSAndroid Build Coastguard Worker // dst[63:32] := MEM[mem_addr+95:mem_addr+64]
806*3f1979aaSAndroid Build Coastguard Worker // dst[95:64] := MEM[mem_addr+63:mem_addr+32]
807*3f1979aaSAndroid Build Coastguard Worker // dst[127:96] := MEM[mem_addr+31:mem_addr]
808*3f1979aaSAndroid Build Coastguard Worker //
809*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_ps
_mm_loadr_ps(const float * p)810*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_loadr_ps(const float *p)
811*3f1979aaSAndroid Build Coastguard Worker {
812*3f1979aaSAndroid Build Coastguard Worker float32x4_t v = vrev64q_f32(vld1q_f32(p));
813*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128_f32(vextq_f32(v, v, 2));
814*3f1979aaSAndroid Build Coastguard Worker }
815*3f1979aaSAndroid Build Coastguard Worker
816*3f1979aaSAndroid Build Coastguard Worker // Sets the upper two single-precision, floating-point values with 64
817*3f1979aaSAndroid Build Coastguard Worker // bits of data loaded from the address p; the lower two values are passed
818*3f1979aaSAndroid Build Coastguard Worker // through from a.
819*3f1979aaSAndroid Build Coastguard Worker //
820*3f1979aaSAndroid Build Coastguard Worker // r0 := a0
821*3f1979aaSAndroid Build Coastguard Worker // r1 := a1
822*3f1979aaSAndroid Build Coastguard Worker // r2 := *p0
823*3f1979aaSAndroid Build Coastguard Worker // r3 := *p1
824*3f1979aaSAndroid Build Coastguard Worker //
825*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/w92wta0x(v%3dvs.100).aspx
_mm_loadh_pi(__m128 a,__m64 const * p)826*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_loadh_pi(__m128 a, __m64 const *p)
827*3f1979aaSAndroid Build Coastguard Worker {
828*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128_f32(
829*3f1979aaSAndroid Build Coastguard Worker vcombine_f32(vget_low_f32(a), vld1_f32((const float32_t *) p)));
830*3f1979aaSAndroid Build Coastguard Worker }
831*3f1979aaSAndroid Build Coastguard Worker
832*3f1979aaSAndroid Build Coastguard Worker // Loads four single-precision, floating-point values.
833*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/vstudio/zzd50xxt(v=vs.100).aspx
_mm_load_ps(const float * p)834*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_load_ps(const float *p)
835*3f1979aaSAndroid Build Coastguard Worker {
836*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128_f32(vld1q_f32(p));
837*3f1979aaSAndroid Build Coastguard Worker }
838*3f1979aaSAndroid Build Coastguard Worker
839*3f1979aaSAndroid Build Coastguard Worker // Loads four single-precision, floating-point values.
840*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/x1b16s7z%28v=vs.90%29.aspx
_mm_loadu_ps(const float * p)841*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
842*3f1979aaSAndroid Build Coastguard Worker {
843*3f1979aaSAndroid Build Coastguard Worker // for neon, alignment doesn't matter, so _mm_load_ps and _mm_loadu_ps are
844*3f1979aaSAndroid Build Coastguard Worker // equivalent for neon
845*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128_f32(vld1q_f32(p));
846*3f1979aaSAndroid Build Coastguard Worker }
847*3f1979aaSAndroid Build Coastguard Worker
848*3f1979aaSAndroid Build Coastguard Worker // Load unaligned 16-bit integer from memory into the first element of dst.
849*3f1979aaSAndroid Build Coastguard Worker //
850*3f1979aaSAndroid Build Coastguard Worker // dst[15:0] := MEM[mem_addr+15:mem_addr]
851*3f1979aaSAndroid Build Coastguard Worker // dst[MAX:16] := 0
852*3f1979aaSAndroid Build Coastguard Worker //
853*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si16
_mm_loadu_si16(const void * p)854*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_loadu_si16(const void *p)
855*3f1979aaSAndroid Build Coastguard Worker {
856*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s16(
857*3f1979aaSAndroid Build Coastguard Worker vsetq_lane_s16(*(const int16_t *) p, vdupq_n_s16(0), 0));
858*3f1979aaSAndroid Build Coastguard Worker }
859*3f1979aaSAndroid Build Coastguard Worker
860*3f1979aaSAndroid Build Coastguard Worker // Load unaligned 64-bit integer from memory into the first element of dst.
861*3f1979aaSAndroid Build Coastguard Worker //
862*3f1979aaSAndroid Build Coastguard Worker // dst[63:0] := MEM[mem_addr+63:mem_addr]
863*3f1979aaSAndroid Build Coastguard Worker // dst[MAX:64] := 0
864*3f1979aaSAndroid Build Coastguard Worker //
865*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si64
_mm_loadu_si64(const void * p)866*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_loadu_si64(const void *p)
867*3f1979aaSAndroid Build Coastguard Worker {
868*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s64(
869*3f1979aaSAndroid Build Coastguard Worker vcombine_s64(vld1_s64((const int64_t *) p), vdup_n_s64(0)));
870*3f1979aaSAndroid Build Coastguard Worker }
871*3f1979aaSAndroid Build Coastguard Worker
872*3f1979aaSAndroid Build Coastguard Worker // Load a double-precision (64-bit) floating-point element from memory into the
873*3f1979aaSAndroid Build Coastguard Worker // lower of dst, and zero the upper element. mem_addr does not need to be
874*3f1979aaSAndroid Build Coastguard Worker // aligned on any particular boundary.
875*3f1979aaSAndroid Build Coastguard Worker //
876*3f1979aaSAndroid Build Coastguard Worker // dst[63:0] := MEM[mem_addr+63:mem_addr]
877*3f1979aaSAndroid Build Coastguard Worker // dst[127:64] := 0
878*3f1979aaSAndroid Build Coastguard Worker //
879*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_sd
_mm_load_sd(const double * p)880*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128d _mm_load_sd(const double *p)
881*3f1979aaSAndroid Build Coastguard Worker {
882*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
883*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128d_f64(vsetq_lane_f64(*p, vdupq_n_f64(0), 0));
884*3f1979aaSAndroid Build Coastguard Worker #else
885*3f1979aaSAndroid Build Coastguard Worker const float *fp = (const float *) p;
886*3f1979aaSAndroid Build Coastguard Worker float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], 0, 0};
887*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128d_f32(vld1q_f32(data));
888*3f1979aaSAndroid Build Coastguard Worker #endif
889*3f1979aaSAndroid Build Coastguard Worker }
890*3f1979aaSAndroid Build Coastguard Worker
891*3f1979aaSAndroid Build Coastguard Worker // Loads two double-precision from 16-byte aligned memory, floating-point
892*3f1979aaSAndroid Build Coastguard Worker // values.
893*3f1979aaSAndroid Build Coastguard Worker //
894*3f1979aaSAndroid Build Coastguard Worker // dst[127:0] := MEM[mem_addr+127:mem_addr]
895*3f1979aaSAndroid Build Coastguard Worker //
896*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd
_mm_load_pd(const double * p)897*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128d _mm_load_pd(const double *p)
898*3f1979aaSAndroid Build Coastguard Worker {
899*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
900*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128d_f64(vld1q_f64(p));
901*3f1979aaSAndroid Build Coastguard Worker #else
902*3f1979aaSAndroid Build Coastguard Worker const float *fp = (const float *) p;
903*3f1979aaSAndroid Build Coastguard Worker float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], fp[2], fp[3]};
904*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128d_f32(vld1q_f32(data));
905*3f1979aaSAndroid Build Coastguard Worker #endif
906*3f1979aaSAndroid Build Coastguard Worker }
907*3f1979aaSAndroid Build Coastguard Worker
908*3f1979aaSAndroid Build Coastguard Worker // Loads two double-precision from unaligned memory, floating-point values.
909*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_pd
_mm_loadu_pd(const double * p)910*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128d _mm_loadu_pd(const double *p)
911*3f1979aaSAndroid Build Coastguard Worker {
912*3f1979aaSAndroid Build Coastguard Worker return _mm_load_pd(p);
913*3f1979aaSAndroid Build Coastguard Worker }
914*3f1979aaSAndroid Build Coastguard Worker
915*3f1979aaSAndroid Build Coastguard Worker // Loads an single - precision, floating - point value into the low word and
916*3f1979aaSAndroid Build Coastguard Worker // clears the upper three words.
917*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/548bb9h4%28v=vs.90%29.aspx
_mm_load_ss(const float * p)918*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_load_ss(const float *p)
919*3f1979aaSAndroid Build Coastguard Worker {
920*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128_f32(vsetq_lane_f32(*p, vdupq_n_f32(0), 0));
921*3f1979aaSAndroid Build Coastguard Worker }
922*3f1979aaSAndroid Build Coastguard Worker
_mm_loadl_epi64(__m128i const * p)923*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *p)
924*3f1979aaSAndroid Build Coastguard Worker {
925*3f1979aaSAndroid Build Coastguard Worker /* Load the lower 64 bits of the value pointed to by p into the
926*3f1979aaSAndroid Build Coastguard Worker * lower 64 bits of the result, zeroing the upper 64 bits of the result.
927*3f1979aaSAndroid Build Coastguard Worker */
928*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s32(
929*3f1979aaSAndroid Build Coastguard Worker vcombine_s32(vld1_s32((int32_t const *) p), vcreate_s32(0)));
930*3f1979aaSAndroid Build Coastguard Worker }
931*3f1979aaSAndroid Build Coastguard Worker
932*3f1979aaSAndroid Build Coastguard Worker // Load a double-precision (64-bit) floating-point element from memory into the
933*3f1979aaSAndroid Build Coastguard Worker // lower element of dst, and copy the upper element from a to dst. mem_addr does
934*3f1979aaSAndroid Build Coastguard Worker // not need to be aligned on any particular boundary.
935*3f1979aaSAndroid Build Coastguard Worker //
936*3f1979aaSAndroid Build Coastguard Worker // dst[63:0] := MEM[mem_addr+63:mem_addr]
937*3f1979aaSAndroid Build Coastguard Worker // dst[127:64] := a[127:64]
938*3f1979aaSAndroid Build Coastguard Worker //
939*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadl_pd
_mm_loadl_pd(__m128d a,const double * p)940*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128d _mm_loadl_pd(__m128d a, const double *p)
941*3f1979aaSAndroid Build Coastguard Worker {
942*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
943*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128d_f64(
944*3f1979aaSAndroid Build Coastguard Worker vcombine_f64(vld1_f64(p), vget_high_f64(vreinterpretq_f64_m128d(a))));
945*3f1979aaSAndroid Build Coastguard Worker #else
946*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128d_f32(
947*3f1979aaSAndroid Build Coastguard Worker vcombine_f32(vld1_f32((const float *) p),
948*3f1979aaSAndroid Build Coastguard Worker vget_high_f32(vreinterpretq_f32_m128d(a))));
949*3f1979aaSAndroid Build Coastguard Worker #endif
950*3f1979aaSAndroid Build Coastguard Worker }
951*3f1979aaSAndroid Build Coastguard Worker
952*3f1979aaSAndroid Build Coastguard Worker // Load 2 double-precision (64-bit) floating-point elements from memory into dst
953*3f1979aaSAndroid Build Coastguard Worker // in reverse order. mem_addr must be aligned on a 16-byte boundary or a
954*3f1979aaSAndroid Build Coastguard Worker // general-protection exception may be generated.
955*3f1979aaSAndroid Build Coastguard Worker //
956*3f1979aaSAndroid Build Coastguard Worker // dst[63:0] := MEM[mem_addr+127:mem_addr+64]
957*3f1979aaSAndroid Build Coastguard Worker // dst[127:64] := MEM[mem_addr+63:mem_addr]
958*3f1979aaSAndroid Build Coastguard Worker //
959*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_pd
_mm_loadr_pd(const double * p)960*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128d _mm_loadr_pd(const double *p)
961*3f1979aaSAndroid Build Coastguard Worker {
962*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
963*3f1979aaSAndroid Build Coastguard Worker float64x2_t v = vld1q_f64(p);
964*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128d_f64(vextq_f64(v, v, 1));
965*3f1979aaSAndroid Build Coastguard Worker #else
966*3f1979aaSAndroid Build Coastguard Worker int64x2_t v = vld1q_s64((const int64_t *) p);
967*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128d_s64(vextq_s64(v, v, 1));
968*3f1979aaSAndroid Build Coastguard Worker #endif
969*3f1979aaSAndroid Build Coastguard Worker }
970*3f1979aaSAndroid Build Coastguard Worker
971*3f1979aaSAndroid Build Coastguard Worker // Sets the low word to the single-precision, floating-point value of b
972*3f1979aaSAndroid Build Coastguard Worker // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/35hdzazd(v=vs.100)
_mm_move_ss(__m128 a,__m128 b)973*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_move_ss(__m128 a, __m128 b)
974*3f1979aaSAndroid Build Coastguard Worker {
975*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128_f32(
976*3f1979aaSAndroid Build Coastguard Worker vsetq_lane_f32(vgetq_lane_f32(vreinterpretq_f32_m128(b), 0),
977*3f1979aaSAndroid Build Coastguard Worker vreinterpretq_f32_m128(a), 0));
978*3f1979aaSAndroid Build Coastguard Worker }
979*3f1979aaSAndroid Build Coastguard Worker
980*3f1979aaSAndroid Build Coastguard Worker // Copy the lower 64-bit integer in a to the lower element of dst, and zero the
981*3f1979aaSAndroid Build Coastguard Worker // upper element.
982*3f1979aaSAndroid Build Coastguard Worker //
983*3f1979aaSAndroid Build Coastguard Worker // dst[63:0] := a[63:0]
984*3f1979aaSAndroid Build Coastguard Worker // dst[127:64] := 0
985*3f1979aaSAndroid Build Coastguard Worker //
986*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_epi64
_mm_move_epi64(__m128i a)987*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_move_epi64(__m128i a)
988*3f1979aaSAndroid Build Coastguard Worker {
989*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s64(
990*3f1979aaSAndroid Build Coastguard Worker vsetq_lane_s64(0, vreinterpretq_s64_m128i(a), 1));
991*3f1979aaSAndroid Build Coastguard Worker }
992*3f1979aaSAndroid Build Coastguard Worker
993*3f1979aaSAndroid Build Coastguard Worker // Return vector of type __m128 with undefined elements.
994*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_undefined_ps
_mm_undefined_ps(void)995*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_undefined_ps(void)
996*3f1979aaSAndroid Build Coastguard Worker {
997*3f1979aaSAndroid Build Coastguard Worker __m128 a;
998*3f1979aaSAndroid Build Coastguard Worker return a;
999*3f1979aaSAndroid Build Coastguard Worker }
1000*3f1979aaSAndroid Build Coastguard Worker
1001*3f1979aaSAndroid Build Coastguard Worker /* Logic/Binary operations */
1002*3f1979aaSAndroid Build Coastguard Worker
1003*3f1979aaSAndroid Build Coastguard Worker // Computes the bitwise AND-NOT of the four single-precision, floating-point
1004*3f1979aaSAndroid Build Coastguard Worker // values of a and b.
1005*3f1979aaSAndroid Build Coastguard Worker //
1006*3f1979aaSAndroid Build Coastguard Worker // r0 := ~a0 & b0
1007*3f1979aaSAndroid Build Coastguard Worker // r1 := ~a1 & b1
1008*3f1979aaSAndroid Build Coastguard Worker // r2 := ~a2 & b2
1009*3f1979aaSAndroid Build Coastguard Worker // r3 := ~a3 & b3
1010*3f1979aaSAndroid Build Coastguard Worker //
1011*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/vstudio/68h7wd02(v=vs.100).aspx
_mm_andnot_ps(__m128 a,__m128 b)1012*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b)
1013*3f1979aaSAndroid Build Coastguard Worker {
1014*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128_s32(
1015*3f1979aaSAndroid Build Coastguard Worker vbicq_s32(vreinterpretq_s32_m128(b),
1016*3f1979aaSAndroid Build Coastguard Worker vreinterpretq_s32_m128(a))); // *NOTE* argument swap
1017*3f1979aaSAndroid Build Coastguard Worker }
1018*3f1979aaSAndroid Build Coastguard Worker
1019*3f1979aaSAndroid Build Coastguard Worker // Compute the bitwise NOT of packed double-precision (64-bit) floating-point
1020*3f1979aaSAndroid Build Coastguard Worker // elements in a and then AND with b, and store the results in dst.
1021*3f1979aaSAndroid Build Coastguard Worker //
1022*3f1979aaSAndroid Build Coastguard Worker // FOR j := 0 to 1
1023*3f1979aaSAndroid Build Coastguard Worker // i := j*64
1024*3f1979aaSAndroid Build Coastguard Worker // dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i])
1025*3f1979aaSAndroid Build Coastguard Worker // ENDFOR
1026*3f1979aaSAndroid Build Coastguard Worker //
1027*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_andnot_pd
_mm_andnot_pd(__m128d a,__m128d b)1028*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128d _mm_andnot_pd(__m128d a, __m128d b)
1029*3f1979aaSAndroid Build Coastguard Worker {
1030*3f1979aaSAndroid Build Coastguard Worker // *NOTE* argument swap
1031*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128d_s64(
1032*3f1979aaSAndroid Build Coastguard Worker vbicq_s64(vreinterpretq_s64_m128d(b), vreinterpretq_s64_m128d(a)));
1033*3f1979aaSAndroid Build Coastguard Worker }
1034*3f1979aaSAndroid Build Coastguard Worker
1035*3f1979aaSAndroid Build Coastguard Worker // Computes the bitwise AND of the 128-bit value in b and the bitwise NOT of the
1036*3f1979aaSAndroid Build Coastguard Worker // 128-bit value in a.
1037*3f1979aaSAndroid Build Coastguard Worker //
1038*3f1979aaSAndroid Build Coastguard Worker // r := (~a) & b
1039*3f1979aaSAndroid Build Coastguard Worker //
1040*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/vstudio/1beaceh8(v=vs.100).aspx
_mm_andnot_si128(__m128i a,__m128i b)1041*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_andnot_si128(__m128i a, __m128i b)
1042*3f1979aaSAndroid Build Coastguard Worker {
1043*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s32(
1044*3f1979aaSAndroid Build Coastguard Worker vbicq_s32(vreinterpretq_s32_m128i(b),
1045*3f1979aaSAndroid Build Coastguard Worker vreinterpretq_s32_m128i(a))); // *NOTE* argument swap
1046*3f1979aaSAndroid Build Coastguard Worker }
1047*3f1979aaSAndroid Build Coastguard Worker
1048*3f1979aaSAndroid Build Coastguard Worker // Computes the bitwise AND of the 128-bit value in a and the 128-bit value in
1049*3f1979aaSAndroid Build Coastguard Worker // b.
1050*3f1979aaSAndroid Build Coastguard Worker //
1051*3f1979aaSAndroid Build Coastguard Worker // r := a & b
1052*3f1979aaSAndroid Build Coastguard Worker //
1053*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/vstudio/6d1txsa8(v=vs.100).aspx
_mm_and_si128(__m128i a,__m128i b)1054*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_and_si128(__m128i a, __m128i b)
1055*3f1979aaSAndroid Build Coastguard Worker {
1056*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s32(
1057*3f1979aaSAndroid Build Coastguard Worker vandq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
1058*3f1979aaSAndroid Build Coastguard Worker }
1059*3f1979aaSAndroid Build Coastguard Worker
1060*3f1979aaSAndroid Build Coastguard Worker // Computes the bitwise AND of the four single-precision, floating-point values
1061*3f1979aaSAndroid Build Coastguard Worker // of a and b.
1062*3f1979aaSAndroid Build Coastguard Worker //
1063*3f1979aaSAndroid Build Coastguard Worker // r0 := a0 & b0
1064*3f1979aaSAndroid Build Coastguard Worker // r1 := a1 & b1
1065*3f1979aaSAndroid Build Coastguard Worker // r2 := a2 & b2
1066*3f1979aaSAndroid Build Coastguard Worker // r3 := a3 & b3
1067*3f1979aaSAndroid Build Coastguard Worker //
1068*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/vstudio/73ck1xc5(v=vs.100).aspx
_mm_and_ps(__m128 a,__m128 b)1069*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b)
1070*3f1979aaSAndroid Build Coastguard Worker {
1071*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128_s32(
1072*3f1979aaSAndroid Build Coastguard Worker vandq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
1073*3f1979aaSAndroid Build Coastguard Worker }
1074*3f1979aaSAndroid Build Coastguard Worker
1075*3f1979aaSAndroid Build Coastguard Worker // Compute the bitwise AND of packed double-precision (64-bit) floating-point
1076*3f1979aaSAndroid Build Coastguard Worker // elements in a and b, and store the results in dst.
1077*3f1979aaSAndroid Build Coastguard Worker //
1078*3f1979aaSAndroid Build Coastguard Worker // FOR j := 0 to 1
1079*3f1979aaSAndroid Build Coastguard Worker // i := j*64
1080*3f1979aaSAndroid Build Coastguard Worker // dst[i+63:i] := a[i+63:i] AND b[i+63:i]
1081*3f1979aaSAndroid Build Coastguard Worker // ENDFOR
1082*3f1979aaSAndroid Build Coastguard Worker //
1083*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_and_pd
_mm_and_pd(__m128d a,__m128d b)1084*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128d _mm_and_pd(__m128d a, __m128d b)
1085*3f1979aaSAndroid Build Coastguard Worker {
1086*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128d_s64(
1087*3f1979aaSAndroid Build Coastguard Worker vandq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
1088*3f1979aaSAndroid Build Coastguard Worker }
1089*3f1979aaSAndroid Build Coastguard Worker
1090*3f1979aaSAndroid Build Coastguard Worker // Computes the bitwise OR of the four single-precision, floating-point values
1091*3f1979aaSAndroid Build Coastguard Worker // of a and b.
1092*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/vstudio/7ctdsyy0(v=vs.100).aspx
_mm_or_ps(__m128 a,__m128 b)1093*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b)
1094*3f1979aaSAndroid Build Coastguard Worker {
1095*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128_s32(
1096*3f1979aaSAndroid Build Coastguard Worker vorrq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
1097*3f1979aaSAndroid Build Coastguard Worker }
1098*3f1979aaSAndroid Build Coastguard Worker
1099*3f1979aaSAndroid Build Coastguard Worker // Computes bitwise EXOR (exclusive-or) of the four single-precision,
1100*3f1979aaSAndroid Build Coastguard Worker // floating-point values of a and b.
1101*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/ss6k3wk8(v=vs.100).aspx
_mm_xor_ps(__m128 a,__m128 b)1102*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_xor_ps(__m128 a, __m128 b)
1103*3f1979aaSAndroid Build Coastguard Worker {
1104*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128_s32(
1105*3f1979aaSAndroid Build Coastguard Worker veorq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
1106*3f1979aaSAndroid Build Coastguard Worker }
1107*3f1979aaSAndroid Build Coastguard Worker
1108*3f1979aaSAndroid Build Coastguard Worker // Compute the bitwise XOR of packed double-precision (64-bit) floating-point
1109*3f1979aaSAndroid Build Coastguard Worker // elements in a and b, and store the results in dst.
1110*3f1979aaSAndroid Build Coastguard Worker //
1111*3f1979aaSAndroid Build Coastguard Worker // FOR j := 0 to 1
1112*3f1979aaSAndroid Build Coastguard Worker // i := j*64
1113*3f1979aaSAndroid Build Coastguard Worker // dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
1114*3f1979aaSAndroid Build Coastguard Worker // ENDFOR
1115*3f1979aaSAndroid Build Coastguard Worker //
1116*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_xor_pd
_mm_xor_pd(__m128d a,__m128d b)1117*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128d _mm_xor_pd(__m128d a, __m128d b)
1118*3f1979aaSAndroid Build Coastguard Worker {
1119*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128d_s64(
1120*3f1979aaSAndroid Build Coastguard Worker veorq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
1121*3f1979aaSAndroid Build Coastguard Worker }
1122*3f1979aaSAndroid Build Coastguard Worker
1123*3f1979aaSAndroid Build Coastguard Worker // Computes the bitwise OR of the 128-bit value in a and the 128-bit value in b.
1124*3f1979aaSAndroid Build Coastguard Worker //
1125*3f1979aaSAndroid Build Coastguard Worker // r := a | b
1126*3f1979aaSAndroid Build Coastguard Worker //
1127*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/vstudio/ew8ty0db(v=vs.100).aspx
_mm_or_si128(__m128i a,__m128i b)1128*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_or_si128(__m128i a, __m128i b)
1129*3f1979aaSAndroid Build Coastguard Worker {
1130*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s32(
1131*3f1979aaSAndroid Build Coastguard Worker vorrq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
1132*3f1979aaSAndroid Build Coastguard Worker }
1133*3f1979aaSAndroid Build Coastguard Worker
1134*3f1979aaSAndroid Build Coastguard Worker // Computes the bitwise XOR of the 128-bit value in a and the 128-bit value in
1135*3f1979aaSAndroid Build Coastguard Worker // b. https://msdn.microsoft.com/en-us/library/fzt08www(v=vs.100).aspx
_mm_xor_si128(__m128i a,__m128i b)1136*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b)
1137*3f1979aaSAndroid Build Coastguard Worker {
1138*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s32(
1139*3f1979aaSAndroid Build Coastguard Worker veorq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
1140*3f1979aaSAndroid Build Coastguard Worker }
1141*3f1979aaSAndroid Build Coastguard Worker
1142*3f1979aaSAndroid Build Coastguard Worker // Duplicate odd-indexed single-precision (32-bit) floating-point elements
1143*3f1979aaSAndroid Build Coastguard Worker // from a, and store the results in dst.
1144*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movehdup_ps
_mm_movehdup_ps(__m128 a)1145*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_movehdup_ps(__m128 a)
1146*3f1979aaSAndroid Build Coastguard Worker {
1147*3f1979aaSAndroid Build Coastguard Worker #if __has_builtin(__builtin_shufflevector)
1148*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128_f32(__builtin_shufflevector(
1149*3f1979aaSAndroid Build Coastguard Worker vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 1, 1, 3, 3));
1150*3f1979aaSAndroid Build Coastguard Worker #else
1151*3f1979aaSAndroid Build Coastguard Worker float32_t a1 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 1);
1152*3f1979aaSAndroid Build Coastguard Worker float32_t a3 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 3);
1153*3f1979aaSAndroid Build Coastguard Worker float ALIGN_STRUCT(16) data[4] = {a1, a1, a3, a3};
1154*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128_f32(vld1q_f32(data));
1155*3f1979aaSAndroid Build Coastguard Worker #endif
1156*3f1979aaSAndroid Build Coastguard Worker }
1157*3f1979aaSAndroid Build Coastguard Worker
1158*3f1979aaSAndroid Build Coastguard Worker // Duplicate even-indexed single-precision (32-bit) floating-point elements
1159*3f1979aaSAndroid Build Coastguard Worker // from a, and store the results in dst.
1160*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_moveldup_ps
_mm_moveldup_ps(__m128 a)1161*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_moveldup_ps(__m128 a)
1162*3f1979aaSAndroid Build Coastguard Worker {
1163*3f1979aaSAndroid Build Coastguard Worker #if __has_builtin(__builtin_shufflevector)
1164*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128_f32(__builtin_shufflevector(
1165*3f1979aaSAndroid Build Coastguard Worker vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 0, 0, 2, 2));
1166*3f1979aaSAndroid Build Coastguard Worker #else
1167*3f1979aaSAndroid Build Coastguard Worker float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
1168*3f1979aaSAndroid Build Coastguard Worker float32_t a2 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 2);
1169*3f1979aaSAndroid Build Coastguard Worker float ALIGN_STRUCT(16) data[4] = {a0, a0, a2, a2};
1170*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128_f32(vld1q_f32(data));
1171*3f1979aaSAndroid Build Coastguard Worker #endif
1172*3f1979aaSAndroid Build Coastguard Worker }
1173*3f1979aaSAndroid Build Coastguard Worker
1174*3f1979aaSAndroid Build Coastguard Worker // Moves the upper two values of B into the lower two values of A.
1175*3f1979aaSAndroid Build Coastguard Worker //
1176*3f1979aaSAndroid Build Coastguard Worker // r3 := a3
1177*3f1979aaSAndroid Build Coastguard Worker // r2 := a2
1178*3f1979aaSAndroid Build Coastguard Worker // r1 := b3
1179*3f1979aaSAndroid Build Coastguard Worker // r0 := b2
_mm_movehl_ps(__m128 __A,__m128 __B)1180*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_movehl_ps(__m128 __A, __m128 __B)
1181*3f1979aaSAndroid Build Coastguard Worker {
1182*3f1979aaSAndroid Build Coastguard Worker float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(__A));
1183*3f1979aaSAndroid Build Coastguard Worker float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(__B));
1184*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128_f32(vcombine_f32(b32, a32));
1185*3f1979aaSAndroid Build Coastguard Worker }
1186*3f1979aaSAndroid Build Coastguard Worker
1187*3f1979aaSAndroid Build Coastguard Worker // Moves the lower two values of B into the upper two values of A.
1188*3f1979aaSAndroid Build Coastguard Worker //
1189*3f1979aaSAndroid Build Coastguard Worker // r3 := b1
1190*3f1979aaSAndroid Build Coastguard Worker // r2 := b0
1191*3f1979aaSAndroid Build Coastguard Worker // r1 := a1
1192*3f1979aaSAndroid Build Coastguard Worker // r0 := a0
_mm_movelh_ps(__m128 __A,__m128 __B)1193*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_movelh_ps(__m128 __A, __m128 __B)
1194*3f1979aaSAndroid Build Coastguard Worker {
1195*3f1979aaSAndroid Build Coastguard Worker float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(__A));
1196*3f1979aaSAndroid Build Coastguard Worker float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(__B));
1197*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
1198*3f1979aaSAndroid Build Coastguard Worker }
1199*3f1979aaSAndroid Build Coastguard Worker
1200*3f1979aaSAndroid Build Coastguard Worker // Compute the absolute value of packed signed 32-bit integers in a, and store
1201*3f1979aaSAndroid Build Coastguard Worker // the unsigned results in dst.
1202*3f1979aaSAndroid Build Coastguard Worker //
1203*3f1979aaSAndroid Build Coastguard Worker // FOR j := 0 to 3
1204*3f1979aaSAndroid Build Coastguard Worker // i := j*32
1205*3f1979aaSAndroid Build Coastguard Worker // dst[i+31:i] := ABS(a[i+31:i])
1206*3f1979aaSAndroid Build Coastguard Worker // ENDFOR
1207*3f1979aaSAndroid Build Coastguard Worker //
1208*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi32
_mm_abs_epi32(__m128i a)1209*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_abs_epi32(__m128i a)
1210*3f1979aaSAndroid Build Coastguard Worker {
1211*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s32(vabsq_s32(vreinterpretq_s32_m128i(a)));
1212*3f1979aaSAndroid Build Coastguard Worker }
1213*3f1979aaSAndroid Build Coastguard Worker
1214*3f1979aaSAndroid Build Coastguard Worker // Compute the absolute value of packed signed 16-bit integers in a, and store
1215*3f1979aaSAndroid Build Coastguard Worker // the unsigned results in dst.
1216*3f1979aaSAndroid Build Coastguard Worker //
1217*3f1979aaSAndroid Build Coastguard Worker // FOR j := 0 to 7
1218*3f1979aaSAndroid Build Coastguard Worker // i := j*16
1219*3f1979aaSAndroid Build Coastguard Worker // dst[i+15:i] := ABS(a[i+15:i])
1220*3f1979aaSAndroid Build Coastguard Worker // ENDFOR
1221*3f1979aaSAndroid Build Coastguard Worker //
1222*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi16
_mm_abs_epi16(__m128i a)1223*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_abs_epi16(__m128i a)
1224*3f1979aaSAndroid Build Coastguard Worker {
1225*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s16(vabsq_s16(vreinterpretq_s16_m128i(a)));
1226*3f1979aaSAndroid Build Coastguard Worker }
1227*3f1979aaSAndroid Build Coastguard Worker
1228*3f1979aaSAndroid Build Coastguard Worker // Compute the absolute value of packed signed 8-bit integers in a, and store
1229*3f1979aaSAndroid Build Coastguard Worker // the unsigned results in dst.
1230*3f1979aaSAndroid Build Coastguard Worker //
1231*3f1979aaSAndroid Build Coastguard Worker // FOR j := 0 to 15
1232*3f1979aaSAndroid Build Coastguard Worker // i := j*8
1233*3f1979aaSAndroid Build Coastguard Worker // dst[i+7:i] := ABS(a[i+7:i])
1234*3f1979aaSAndroid Build Coastguard Worker // ENDFOR
1235*3f1979aaSAndroid Build Coastguard Worker //
1236*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi8
_mm_abs_epi8(__m128i a)1237*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_abs_epi8(__m128i a)
1238*3f1979aaSAndroid Build Coastguard Worker {
1239*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s8(vabsq_s8(vreinterpretq_s8_m128i(a)));
1240*3f1979aaSAndroid Build Coastguard Worker }
1241*3f1979aaSAndroid Build Coastguard Worker
1242*3f1979aaSAndroid Build Coastguard Worker // Compute the absolute value of packed signed 32-bit integers in a, and store
1243*3f1979aaSAndroid Build Coastguard Worker // the unsigned results in dst.
1244*3f1979aaSAndroid Build Coastguard Worker //
1245*3f1979aaSAndroid Build Coastguard Worker // FOR j := 0 to 1
1246*3f1979aaSAndroid Build Coastguard Worker // i := j*32
1247*3f1979aaSAndroid Build Coastguard Worker // dst[i+31:i] := ABS(a[i+31:i])
1248*3f1979aaSAndroid Build Coastguard Worker // ENDFOR
1249*3f1979aaSAndroid Build Coastguard Worker //
1250*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi32
_mm_abs_pi32(__m64 a)1251*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m64 _mm_abs_pi32(__m64 a)
1252*3f1979aaSAndroid Build Coastguard Worker {
1253*3f1979aaSAndroid Build Coastguard Worker return vreinterpret_m64_s32(vabs_s32(vreinterpret_s32_m64(a)));
1254*3f1979aaSAndroid Build Coastguard Worker }
1255*3f1979aaSAndroid Build Coastguard Worker
1256*3f1979aaSAndroid Build Coastguard Worker // Compute the absolute value of packed signed 16-bit integers in a, and store
1257*3f1979aaSAndroid Build Coastguard Worker // the unsigned results in dst.
1258*3f1979aaSAndroid Build Coastguard Worker //
1259*3f1979aaSAndroid Build Coastguard Worker // FOR j := 0 to 3
1260*3f1979aaSAndroid Build Coastguard Worker // i := j*16
1261*3f1979aaSAndroid Build Coastguard Worker // dst[i+15:i] := ABS(a[i+15:i])
1262*3f1979aaSAndroid Build Coastguard Worker // ENDFOR
1263*3f1979aaSAndroid Build Coastguard Worker //
1264*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi16
_mm_abs_pi16(__m64 a)1265*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m64 _mm_abs_pi16(__m64 a)
1266*3f1979aaSAndroid Build Coastguard Worker {
1267*3f1979aaSAndroid Build Coastguard Worker return vreinterpret_m64_s16(vabs_s16(vreinterpret_s16_m64(a)));
1268*3f1979aaSAndroid Build Coastguard Worker }
1269*3f1979aaSAndroid Build Coastguard Worker
1270*3f1979aaSAndroid Build Coastguard Worker // Compute the absolute value of packed signed 8-bit integers in a, and store
1271*3f1979aaSAndroid Build Coastguard Worker // the unsigned results in dst.
1272*3f1979aaSAndroid Build Coastguard Worker //
1273*3f1979aaSAndroid Build Coastguard Worker // FOR j := 0 to 7
1274*3f1979aaSAndroid Build Coastguard Worker // i := j*8
1275*3f1979aaSAndroid Build Coastguard Worker // dst[i+7:i] := ABS(a[i+7:i])
1276*3f1979aaSAndroid Build Coastguard Worker // ENDFOR
1277*3f1979aaSAndroid Build Coastguard Worker //
1278*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi8
_mm_abs_pi8(__m64 a)1279*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m64 _mm_abs_pi8(__m64 a)
1280*3f1979aaSAndroid Build Coastguard Worker {
1281*3f1979aaSAndroid Build Coastguard Worker return vreinterpret_m64_s8(vabs_s8(vreinterpret_s8_m64(a)));
1282*3f1979aaSAndroid Build Coastguard Worker }
1283*3f1979aaSAndroid Build Coastguard Worker
1284*3f1979aaSAndroid Build Coastguard Worker // Takes the upper 64 bits of a and places it in the low end of the result
1285*3f1979aaSAndroid Build Coastguard Worker // Takes the lower 64 bits of b and places it into the high end of the result.
_mm_shuffle_ps_1032(__m128 a,__m128 b)1286*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_shuffle_ps_1032(__m128 a, __m128 b)
1287*3f1979aaSAndroid Build Coastguard Worker {
1288*3f1979aaSAndroid Build Coastguard Worker float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
1289*3f1979aaSAndroid Build Coastguard Worker float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
1290*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128_f32(vcombine_f32(a32, b10));
1291*3f1979aaSAndroid Build Coastguard Worker }
1292*3f1979aaSAndroid Build Coastguard Worker
1293*3f1979aaSAndroid Build Coastguard Worker // takes the lower two 32-bit values from a and swaps them and places in high
1294*3f1979aaSAndroid Build Coastguard Worker // end of result takes the higher two 32 bit values from b and swaps them and
1295*3f1979aaSAndroid Build Coastguard Worker // places in low end of result.
_mm_shuffle_ps_2301(__m128 a,__m128 b)1296*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_shuffle_ps_2301(__m128 a, __m128 b)
1297*3f1979aaSAndroid Build Coastguard Worker {
1298*3f1979aaSAndroid Build Coastguard Worker float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
1299*3f1979aaSAndroid Build Coastguard Worker float32x2_t b23 = vrev64_f32(vget_high_f32(vreinterpretq_f32_m128(b)));
1300*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128_f32(vcombine_f32(a01, b23));
1301*3f1979aaSAndroid Build Coastguard Worker }
1302*3f1979aaSAndroid Build Coastguard Worker
_mm_shuffle_ps_0321(__m128 a,__m128 b)1303*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_shuffle_ps_0321(__m128 a, __m128 b)
1304*3f1979aaSAndroid Build Coastguard Worker {
1305*3f1979aaSAndroid Build Coastguard Worker float32x2_t a21 = vget_high_f32(
1306*3f1979aaSAndroid Build Coastguard Worker vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
1307*3f1979aaSAndroid Build Coastguard Worker float32x2_t b03 = vget_low_f32(
1308*3f1979aaSAndroid Build Coastguard Worker vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
1309*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128_f32(vcombine_f32(a21, b03));
1310*3f1979aaSAndroid Build Coastguard Worker }
1311*3f1979aaSAndroid Build Coastguard Worker
_mm_shuffle_ps_2103(__m128 a,__m128 b)1312*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_shuffle_ps_2103(__m128 a, __m128 b)
1313*3f1979aaSAndroid Build Coastguard Worker {
1314*3f1979aaSAndroid Build Coastguard Worker float32x2_t a03 = vget_low_f32(
1315*3f1979aaSAndroid Build Coastguard Worker vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
1316*3f1979aaSAndroid Build Coastguard Worker float32x2_t b21 = vget_high_f32(
1317*3f1979aaSAndroid Build Coastguard Worker vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
1318*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128_f32(vcombine_f32(a03, b21));
1319*3f1979aaSAndroid Build Coastguard Worker }
1320*3f1979aaSAndroid Build Coastguard Worker
_mm_shuffle_ps_1010(__m128 a,__m128 b)1321*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_shuffle_ps_1010(__m128 a, __m128 b)
1322*3f1979aaSAndroid Build Coastguard Worker {
1323*3f1979aaSAndroid Build Coastguard Worker float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
1324*3f1979aaSAndroid Build Coastguard Worker float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
1325*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
1326*3f1979aaSAndroid Build Coastguard Worker }
1327*3f1979aaSAndroid Build Coastguard Worker
_mm_shuffle_ps_1001(__m128 a,__m128 b)1328*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_shuffle_ps_1001(__m128 a, __m128 b)
1329*3f1979aaSAndroid Build Coastguard Worker {
1330*3f1979aaSAndroid Build Coastguard Worker float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
1331*3f1979aaSAndroid Build Coastguard Worker float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
1332*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128_f32(vcombine_f32(a01, b10));
1333*3f1979aaSAndroid Build Coastguard Worker }
1334*3f1979aaSAndroid Build Coastguard Worker
_mm_shuffle_ps_0101(__m128 a,__m128 b)1335*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_shuffle_ps_0101(__m128 a, __m128 b)
1336*3f1979aaSAndroid Build Coastguard Worker {
1337*3f1979aaSAndroid Build Coastguard Worker float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
1338*3f1979aaSAndroid Build Coastguard Worker float32x2_t b01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(b)));
1339*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128_f32(vcombine_f32(a01, b01));
1340*3f1979aaSAndroid Build Coastguard Worker }
1341*3f1979aaSAndroid Build Coastguard Worker
1342*3f1979aaSAndroid Build Coastguard Worker // keeps the low 64 bits of b in the low and puts the high 64 bits of a in the
1343*3f1979aaSAndroid Build Coastguard Worker // high
_mm_shuffle_ps_3210(__m128 a,__m128 b)1344*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_shuffle_ps_3210(__m128 a, __m128 b)
1345*3f1979aaSAndroid Build Coastguard Worker {
1346*3f1979aaSAndroid Build Coastguard Worker float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
1347*3f1979aaSAndroid Build Coastguard Worker float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
1348*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128_f32(vcombine_f32(a10, b32));
1349*3f1979aaSAndroid Build Coastguard Worker }
1350*3f1979aaSAndroid Build Coastguard Worker
_mm_shuffle_ps_0011(__m128 a,__m128 b)1351*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_shuffle_ps_0011(__m128 a, __m128 b)
1352*3f1979aaSAndroid Build Coastguard Worker {
1353*3f1979aaSAndroid Build Coastguard Worker float32x2_t a11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 1);
1354*3f1979aaSAndroid Build Coastguard Worker float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
1355*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128_f32(vcombine_f32(a11, b00));
1356*3f1979aaSAndroid Build Coastguard Worker }
1357*3f1979aaSAndroid Build Coastguard Worker
_mm_shuffle_ps_0022(__m128 a,__m128 b)1358*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_shuffle_ps_0022(__m128 a, __m128 b)
1359*3f1979aaSAndroid Build Coastguard Worker {
1360*3f1979aaSAndroid Build Coastguard Worker float32x2_t a22 =
1361*3f1979aaSAndroid Build Coastguard Worker vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
1362*3f1979aaSAndroid Build Coastguard Worker float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
1363*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128_f32(vcombine_f32(a22, b00));
1364*3f1979aaSAndroid Build Coastguard Worker }
1365*3f1979aaSAndroid Build Coastguard Worker
_mm_shuffle_ps_2200(__m128 a,__m128 b)1366*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_shuffle_ps_2200(__m128 a, __m128 b)
1367*3f1979aaSAndroid Build Coastguard Worker {
1368*3f1979aaSAndroid Build Coastguard Worker float32x2_t a00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 0);
1369*3f1979aaSAndroid Build Coastguard Worker float32x2_t b22 =
1370*3f1979aaSAndroid Build Coastguard Worker vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(b)), 0);
1371*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128_f32(vcombine_f32(a00, b22));
1372*3f1979aaSAndroid Build Coastguard Worker }
1373*3f1979aaSAndroid Build Coastguard Worker
_mm_shuffle_ps_3202(__m128 a,__m128 b)1374*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_shuffle_ps_3202(__m128 a, __m128 b)
1375*3f1979aaSAndroid Build Coastguard Worker {
1376*3f1979aaSAndroid Build Coastguard Worker float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
1377*3f1979aaSAndroid Build Coastguard Worker float32x2_t a22 =
1378*3f1979aaSAndroid Build Coastguard Worker vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
1379*3f1979aaSAndroid Build Coastguard Worker float32x2_t a02 = vset_lane_f32(a0, a22, 1); /* TODO: use vzip ?*/
1380*3f1979aaSAndroid Build Coastguard Worker float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
1381*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128_f32(vcombine_f32(a02, b32));
1382*3f1979aaSAndroid Build Coastguard Worker }
1383*3f1979aaSAndroid Build Coastguard Worker
_mm_shuffle_ps_1133(__m128 a,__m128 b)1384*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_shuffle_ps_1133(__m128 a, __m128 b)
1385*3f1979aaSAndroid Build Coastguard Worker {
1386*3f1979aaSAndroid Build Coastguard Worker float32x2_t a33 =
1387*3f1979aaSAndroid Build Coastguard Worker vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 1);
1388*3f1979aaSAndroid Build Coastguard Worker float32x2_t b11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 1);
1389*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128_f32(vcombine_f32(a33, b11));
1390*3f1979aaSAndroid Build Coastguard Worker }
1391*3f1979aaSAndroid Build Coastguard Worker
_mm_shuffle_ps_2010(__m128 a,__m128 b)1392*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_shuffle_ps_2010(__m128 a, __m128 b)
1393*3f1979aaSAndroid Build Coastguard Worker {
1394*3f1979aaSAndroid Build Coastguard Worker float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
1395*3f1979aaSAndroid Build Coastguard Worker float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2);
1396*3f1979aaSAndroid Build Coastguard Worker float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
1397*3f1979aaSAndroid Build Coastguard Worker float32x2_t b20 = vset_lane_f32(b2, b00, 1);
1398*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128_f32(vcombine_f32(a10, b20));
1399*3f1979aaSAndroid Build Coastguard Worker }
1400*3f1979aaSAndroid Build Coastguard Worker
_mm_shuffle_ps_2001(__m128 a,__m128 b)1401*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_shuffle_ps_2001(__m128 a, __m128 b)
1402*3f1979aaSAndroid Build Coastguard Worker {
1403*3f1979aaSAndroid Build Coastguard Worker float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
1404*3f1979aaSAndroid Build Coastguard Worker float32_t b2 = vgetq_lane_f32(b, 2);
1405*3f1979aaSAndroid Build Coastguard Worker float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
1406*3f1979aaSAndroid Build Coastguard Worker float32x2_t b20 = vset_lane_f32(b2, b00, 1);
1407*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128_f32(vcombine_f32(a01, b20));
1408*3f1979aaSAndroid Build Coastguard Worker }
1409*3f1979aaSAndroid Build Coastguard Worker
_mm_shuffle_ps_2032(__m128 a,__m128 b)1410*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_shuffle_ps_2032(__m128 a, __m128 b)
1411*3f1979aaSAndroid Build Coastguard Worker {
1412*3f1979aaSAndroid Build Coastguard Worker float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
1413*3f1979aaSAndroid Build Coastguard Worker float32_t b2 = vgetq_lane_f32(b, 2);
1414*3f1979aaSAndroid Build Coastguard Worker float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
1415*3f1979aaSAndroid Build Coastguard Worker float32x2_t b20 = vset_lane_f32(b2, b00, 1);
1416*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128_f32(vcombine_f32(a32, b20));
1417*3f1979aaSAndroid Build Coastguard Worker }
1418*3f1979aaSAndroid Build Coastguard Worker
// NEON does not support a general purpose permute intrinsic.
// Selects four specific single-precision, floating-point values from a and b,
// based on the mask imm. The two low result lanes come from a, the two high
// result lanes come from b.
//
// C equivalent:
//   __m128 _mm_shuffle_ps_default(__m128 a, __m128 b,
//                                 __constrange(0, 255) int imm) {
//       __m128 ret;
//       ret[0] = a[imm & 0x3];   ret[1] = a[(imm >> 2) & 0x3];
//       ret[2] = b[(imm >> 4) & 0x03];  ret[3] = b[(imm >> 6) & 0x03];
//       return ret;
//   }
//
// a and b are each evaluated exactly once, and the locals carry a long
// prefix so that a caller argument spelled `ret` (or similar) is not captured
// by the macro's own temporaries.
//
// https://msdn.microsoft.com/en-us/library/vstudio/5f0858x0(v=vs.100).aspx
#define _mm_shuffle_ps_default(a, b, imm)                                    \
    __extension__({                                                          \
        const float32x4_t _sse2neon_shufdef_a = vreinterpretq_f32_m128(a);   \
        const float32x4_t _sse2neon_shufdef_b = vreinterpretq_f32_m128(b);   \
        /* Broadcast lane 0's source, then patch lanes 1..3 one by one. */   \
        float32x4_t _sse2neon_shufdef_ret = vmovq_n_f32(                     \
            vgetq_lane_f32(_sse2neon_shufdef_a, (imm) & (0x3)));             \
        _sse2neon_shufdef_ret = vsetq_lane_f32(                              \
            vgetq_lane_f32(_sse2neon_shufdef_a, ((imm) >> 2) & 0x3),         \
            _sse2neon_shufdef_ret, 1);                                       \
        _sse2neon_shufdef_ret = vsetq_lane_f32(                              \
            vgetq_lane_f32(_sse2neon_shufdef_b, ((imm) >> 4) & 0x3),         \
            _sse2neon_shufdef_ret, 2);                                       \
        _sse2neon_shufdef_ret = vsetq_lane_f32(                              \
            vgetq_lane_f32(_sse2neon_shufdef_b, ((imm) >> 6) & 0x3),         \
            _sse2neon_shufdef_ret, 3);                                       \
        vreinterpretq_m128_f32(_sse2neon_shufdef_ret);                       \
    })
1449*3f1979aaSAndroid Build Coastguard Worker
// FORCE_INLINE __m128 _mm_shuffle_ps(__m128 a, __m128 b, __constrange(0,255)
// int imm)
//
// Selects two lanes of a (result lanes 0-1) and two lanes of b (result lanes
// 2-3) according to the 2-bit fields of imm. When the compiler offers
// __builtin_shufflevector the shuffle is expressed directly; otherwise known
// masks are dispatched to the dedicated helpers and everything else falls
// back to _mm_shuffle_ps_default.
#if __has_builtin(__builtin_shufflevector)
#define _mm_shuffle_ps(a, b, imm)                                \
    __extension__({                                              \
        float32x4_t _input1 = vreinterpretq_f32_m128(a);         \
        float32x4_t _input2 = vreinterpretq_f32_m128(b);         \
        /* Indices 4..7 address the second input vector. */      \
        float32x4_t _shuf = __builtin_shufflevector(             \
            _input1, _input2, (imm) & (0x3), ((imm) >> 2) & 0x3, \
            (((imm) >> 4) & 0x3) + 4, (((imm) >> 6) & 0x3) + 4); \
        vreinterpretq_m128_f32(_shuf);                           \
    })
#else  // generic
// a and b are evaluated once into prefixed locals so that the arguments are
// not re-expanded in every case arm and a caller variable named `ret` cannot
// be captured by the macro's result temporary.
#define _mm_shuffle_ps(a, b, imm)                                        \
    __extension__({                                                      \
        const __m128 _sse2neon_shuf_a = (a);                             \
        const __m128 _sse2neon_shuf_b = (b);                             \
        __m128 _sse2neon_shuf_ret;                                       \
        switch (imm) {                                                   \
        case _MM_SHUFFLE(1, 0, 3, 2):                                    \
            _sse2neon_shuf_ret =                                         \
                _mm_shuffle_ps_1032(_sse2neon_shuf_a, _sse2neon_shuf_b); \
            break;                                                       \
        case _MM_SHUFFLE(2, 3, 0, 1):                                    \
            _sse2neon_shuf_ret =                                         \
                _mm_shuffle_ps_2301(_sse2neon_shuf_a, _sse2neon_shuf_b); \
            break;                                                       \
        case _MM_SHUFFLE(0, 3, 2, 1):                                    \
            _sse2neon_shuf_ret =                                         \
                _mm_shuffle_ps_0321(_sse2neon_shuf_a, _sse2neon_shuf_b); \
            break;                                                       \
        case _MM_SHUFFLE(2, 1, 0, 3):                                    \
            _sse2neon_shuf_ret =                                         \
                _mm_shuffle_ps_2103(_sse2neon_shuf_a, _sse2neon_shuf_b); \
            break;                                                       \
        case _MM_SHUFFLE(1, 0, 1, 0):                                    \
            _sse2neon_shuf_ret =                                         \
                _mm_movelh_ps(_sse2neon_shuf_a, _sse2neon_shuf_b);       \
            break;                                                       \
        case _MM_SHUFFLE(1, 0, 0, 1):                                    \
            _sse2neon_shuf_ret =                                         \
                _mm_shuffle_ps_1001(_sse2neon_shuf_a, _sse2neon_shuf_b); \
            break;                                                       \
        case _MM_SHUFFLE(0, 1, 0, 1):                                    \
            _sse2neon_shuf_ret =                                         \
                _mm_shuffle_ps_0101(_sse2neon_shuf_a, _sse2neon_shuf_b); \
            break;                                                       \
        case _MM_SHUFFLE(3, 2, 1, 0):                                    \
            _sse2neon_shuf_ret =                                         \
                _mm_shuffle_ps_3210(_sse2neon_shuf_a, _sse2neon_shuf_b); \
            break;                                                       \
        case _MM_SHUFFLE(0, 0, 1, 1):                                    \
            _sse2neon_shuf_ret =                                         \
                _mm_shuffle_ps_0011(_sse2neon_shuf_a, _sse2neon_shuf_b); \
            break;                                                       \
        case _MM_SHUFFLE(0, 0, 2, 2):                                    \
            _sse2neon_shuf_ret =                                         \
                _mm_shuffle_ps_0022(_sse2neon_shuf_a, _sse2neon_shuf_b); \
            break;                                                       \
        case _MM_SHUFFLE(2, 2, 0, 0):                                    \
            _sse2neon_shuf_ret =                                         \
                _mm_shuffle_ps_2200(_sse2neon_shuf_a, _sse2neon_shuf_b); \
            break;                                                       \
        case _MM_SHUFFLE(3, 2, 0, 2):                                    \
            _sse2neon_shuf_ret =                                         \
                _mm_shuffle_ps_3202(_sse2neon_shuf_a, _sse2neon_shuf_b); \
            break;                                                       \
        case _MM_SHUFFLE(3, 2, 3, 2):                                    \
            /* movehl takes (B-source-of-high, A-source-of-low) */       \
            _sse2neon_shuf_ret =                                         \
                _mm_movehl_ps(_sse2neon_shuf_b, _sse2neon_shuf_a);       \
            break;                                                       \
        case _MM_SHUFFLE(1, 1, 3, 3):                                    \
            _sse2neon_shuf_ret =                                         \
                _mm_shuffle_ps_1133(_sse2neon_shuf_a, _sse2neon_shuf_b); \
            break;                                                       \
        case _MM_SHUFFLE(2, 0, 1, 0):                                    \
            _sse2neon_shuf_ret =                                         \
                _mm_shuffle_ps_2010(_sse2neon_shuf_a, _sse2neon_shuf_b); \
            break;                                                       \
        case _MM_SHUFFLE(2, 0, 0, 1):                                    \
            _sse2neon_shuf_ret =                                         \
                _mm_shuffle_ps_2001(_sse2neon_shuf_a, _sse2neon_shuf_b); \
            break;                                                       \
        case _MM_SHUFFLE(2, 0, 3, 2):                                    \
            _sse2neon_shuf_ret =                                         \
                _mm_shuffle_ps_2032(_sse2neon_shuf_a, _sse2neon_shuf_b); \
            break;                                                       \
        default:                                                         \
            _sse2neon_shuf_ret = _mm_shuffle_ps_default(                 \
                _sse2neon_shuf_a, _sse2neon_shuf_b, (imm));              \
            break;                                                       \
        }                                                                \
        _sse2neon_shuf_ret;                                              \
    })
#endif
1525*3f1979aaSAndroid Build Coastguard Worker
1526*3f1979aaSAndroid Build Coastguard Worker // Takes the upper 64 bits of a and places it in the low end of the result
1527*3f1979aaSAndroid Build Coastguard Worker // Takes the lower 64 bits of a and places it into the high end of the result.
_mm_shuffle_epi_1032(__m128i a)1528*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_shuffle_epi_1032(__m128i a)
1529*3f1979aaSAndroid Build Coastguard Worker {
1530*3f1979aaSAndroid Build Coastguard Worker int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
1531*3f1979aaSAndroid Build Coastguard Worker int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
1532*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s32(vcombine_s32(a32, a10));
1533*3f1979aaSAndroid Build Coastguard Worker }
1534*3f1979aaSAndroid Build Coastguard Worker
1535*3f1979aaSAndroid Build Coastguard Worker // takes the lower two 32-bit values from a and swaps them and places in low end
1536*3f1979aaSAndroid Build Coastguard Worker // of result takes the higher two 32 bit values from a and swaps them and places
1537*3f1979aaSAndroid Build Coastguard Worker // in high end of result.
_mm_shuffle_epi_2301(__m128i a)1538*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_shuffle_epi_2301(__m128i a)
1539*3f1979aaSAndroid Build Coastguard Worker {
1540*3f1979aaSAndroid Build Coastguard Worker int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
1541*3f1979aaSAndroid Build Coastguard Worker int32x2_t a23 = vrev64_s32(vget_high_s32(vreinterpretq_s32_m128i(a)));
1542*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s32(vcombine_s32(a01, a23));
1543*3f1979aaSAndroid Build Coastguard Worker }
1544*3f1979aaSAndroid Build Coastguard Worker
1545*3f1979aaSAndroid Build Coastguard Worker // rotates the least significant 32 bits into the most signficant 32 bits, and
1546*3f1979aaSAndroid Build Coastguard Worker // shifts the rest down
_mm_shuffle_epi_0321(__m128i a)1547*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_shuffle_epi_0321(__m128i a)
1548*3f1979aaSAndroid Build Coastguard Worker {
1549*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s32(
1550*3f1979aaSAndroid Build Coastguard Worker vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 1));
1551*3f1979aaSAndroid Build Coastguard Worker }
1552*3f1979aaSAndroid Build Coastguard Worker
1553*3f1979aaSAndroid Build Coastguard Worker // rotates the most significant 32 bits into the least signficant 32 bits, and
1554*3f1979aaSAndroid Build Coastguard Worker // shifts the rest up
_mm_shuffle_epi_2103(__m128i a)1555*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_shuffle_epi_2103(__m128i a)
1556*3f1979aaSAndroid Build Coastguard Worker {
1557*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s32(
1558*3f1979aaSAndroid Build Coastguard Worker vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 3));
1559*3f1979aaSAndroid Build Coastguard Worker }
1560*3f1979aaSAndroid Build Coastguard Worker
1561*3f1979aaSAndroid Build Coastguard Worker // gets the lower 64 bits of a, and places it in the upper 64 bits
1562*3f1979aaSAndroid Build Coastguard Worker // gets the lower 64 bits of a and places it in the lower 64 bits
_mm_shuffle_epi_1010(__m128i a)1563*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_shuffle_epi_1010(__m128i a)
1564*3f1979aaSAndroid Build Coastguard Worker {
1565*3f1979aaSAndroid Build Coastguard Worker int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
1566*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s32(vcombine_s32(a10, a10));
1567*3f1979aaSAndroid Build Coastguard Worker }
1568*3f1979aaSAndroid Build Coastguard Worker
1569*3f1979aaSAndroid Build Coastguard Worker // gets the lower 64 bits of a, swaps the 0 and 1 elements, and places it in the
1570*3f1979aaSAndroid Build Coastguard Worker // lower 64 bits gets the lower 64 bits of a, and places it in the upper 64 bits
_mm_shuffle_epi_1001(__m128i a)1571*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_shuffle_epi_1001(__m128i a)
1572*3f1979aaSAndroid Build Coastguard Worker {
1573*3f1979aaSAndroid Build Coastguard Worker int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
1574*3f1979aaSAndroid Build Coastguard Worker int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
1575*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s32(vcombine_s32(a01, a10));
1576*3f1979aaSAndroid Build Coastguard Worker }
1577*3f1979aaSAndroid Build Coastguard Worker
1578*3f1979aaSAndroid Build Coastguard Worker // gets the lower 64 bits of a, swaps the 0 and 1 elements and places it in the
1579*3f1979aaSAndroid Build Coastguard Worker // upper 64 bits gets the lower 64 bits of a, swaps the 0 and 1 elements, and
1580*3f1979aaSAndroid Build Coastguard Worker // places it in the lower 64 bits
_mm_shuffle_epi_0101(__m128i a)1581*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_shuffle_epi_0101(__m128i a)
1582*3f1979aaSAndroid Build Coastguard Worker {
1583*3f1979aaSAndroid Build Coastguard Worker int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
1584*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s32(vcombine_s32(a01, a01));
1585*3f1979aaSAndroid Build Coastguard Worker }
1586*3f1979aaSAndroid Build Coastguard Worker
_mm_shuffle_epi_2211(__m128i a)1587*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_shuffle_epi_2211(__m128i a)
1588*3f1979aaSAndroid Build Coastguard Worker {
1589*3f1979aaSAndroid Build Coastguard Worker int32x2_t a11 = vdup_lane_s32(vget_low_s32(vreinterpretq_s32_m128i(a)), 1);
1590*3f1979aaSAndroid Build Coastguard Worker int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
1591*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s32(vcombine_s32(a11, a22));
1592*3f1979aaSAndroid Build Coastguard Worker }
1593*3f1979aaSAndroid Build Coastguard Worker
_mm_shuffle_epi_0122(__m128i a)1594*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_shuffle_epi_0122(__m128i a)
1595*3f1979aaSAndroid Build Coastguard Worker {
1596*3f1979aaSAndroid Build Coastguard Worker int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
1597*3f1979aaSAndroid Build Coastguard Worker int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
1598*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s32(vcombine_s32(a22, a01));
1599*3f1979aaSAndroid Build Coastguard Worker }
1600*3f1979aaSAndroid Build Coastguard Worker
_mm_shuffle_epi_3332(__m128i a)1601*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a)
1602*3f1979aaSAndroid Build Coastguard Worker {
1603*3f1979aaSAndroid Build Coastguard Worker int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
1604*3f1979aaSAndroid Build Coastguard Worker int32x2_t a33 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 1);
1605*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s32(vcombine_s32(a32, a33));
1606*3f1979aaSAndroid Build Coastguard Worker }
1607*3f1979aaSAndroid Build Coastguard Worker
1608*3f1979aaSAndroid Build Coastguard Worker // Shuffle packed 8-bit integers in a according to shuffle control mask in the
1609*3f1979aaSAndroid Build Coastguard Worker // corresponding 8-bit element of b, and store the results in dst.
1610*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_epi8
_mm_shuffle_epi8(__m128i a,__m128i b)1611*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b)
1612*3f1979aaSAndroid Build Coastguard Worker {
1613*3f1979aaSAndroid Build Coastguard Worker int8x16_t tbl = vreinterpretq_s8_m128i(a); // input a
1614*3f1979aaSAndroid Build Coastguard Worker uint8x16_t idx = vreinterpretq_u8_m128i(b); // input b
1615*3f1979aaSAndroid Build Coastguard Worker uint8x16_t idx_masked =
1616*3f1979aaSAndroid Build Coastguard Worker vandq_u8(idx, vdupq_n_u8(0x8F)); // avoid using meaningless bits
1617*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
1618*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s8(vqtbl1q_s8(tbl, idx_masked));
1619*3f1979aaSAndroid Build Coastguard Worker #elif defined(__GNUC__)
1620*3f1979aaSAndroid Build Coastguard Worker int8x16_t ret;
1621*3f1979aaSAndroid Build Coastguard Worker // %e and %f represent the even and odd D registers
1622*3f1979aaSAndroid Build Coastguard Worker // respectively.
1623*3f1979aaSAndroid Build Coastguard Worker __asm__ __volatile__(
1624*3f1979aaSAndroid Build Coastguard Worker "vtbl.8 %e[ret], {%e[tbl], %f[tbl]}, %e[idx]\n"
1625*3f1979aaSAndroid Build Coastguard Worker "vtbl.8 %f[ret], {%e[tbl], %f[tbl]}, %f[idx]\n"
1626*3f1979aaSAndroid Build Coastguard Worker : [ret] "=&w"(ret)
1627*3f1979aaSAndroid Build Coastguard Worker : [tbl] "w"(tbl), [idx] "w"(idx_masked));
1628*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s8(ret);
1629*3f1979aaSAndroid Build Coastguard Worker #else
1630*3f1979aaSAndroid Build Coastguard Worker // use this line if testing on aarch64
1631*3f1979aaSAndroid Build Coastguard Worker int8x8x2_t a_split = {vget_low_s8(tbl), vget_high_s8(tbl)};
1632*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s8(
1633*3f1979aaSAndroid Build Coastguard Worker vcombine_s8(vtbl2_s8(a_split, vget_low_u8(idx_masked)),
1634*3f1979aaSAndroid Build Coastguard Worker vtbl2_s8(a_split, vget_high_u8(idx_masked))));
1635*3f1979aaSAndroid Build Coastguard Worker #endif
1636*3f1979aaSAndroid Build Coastguard Worker }
1637*3f1979aaSAndroid Build Coastguard Worker
// Generic 32-bit lane shuffle, used when no specialised pattern applies.
//
// C equivalent:
//   __m128i _mm_shuffle_epi32_default(__m128i a,
//                                     __constrange(0, 255) int imm) {
//       __m128i ret;
//       ret[0] = a[imm & 0x3];   ret[1] = a[(imm >> 2) & 0x3];
//       ret[2] = a[(imm >> 4) & 0x03];  ret[3] = a[(imm >> 6) & 0x03];
//       return ret;
//   }
//
// The argument a is evaluated exactly once, and the temporaries carry
// macro-specific names so that an argument spelled `ret` (or any other common
// identifier) is not captured by the expansion.
#define _mm_shuffle_epi32_default(a, imm) \
    __extension__({ \
        int32x4_t _shuf32_default_in = vreinterpretq_s32_m128i(a); \
        int32x4_t _shuf32_default_out = vmovq_n_s32( \
            vgetq_lane_s32(_shuf32_default_in, (imm) & (0x3))); \
        _shuf32_default_out = vsetq_lane_s32( \
            vgetq_lane_s32(_shuf32_default_in, ((imm) >> 2) & 0x3), \
            _shuf32_default_out, 1); \
        _shuf32_default_out = vsetq_lane_s32( \
            vgetq_lane_s32(_shuf32_default_in, ((imm) >> 4) & 0x3), \
            _shuf32_default_out, 2); \
        _shuf32_default_out = vsetq_lane_s32( \
            vgetq_lane_s32(_shuf32_default_in, ((imm) >> 6) & 0x3), \
            _shuf32_default_out, 3); \
        vreinterpretq_m128i_s32(_shuf32_default_out); \
    })
1662*3f1979aaSAndroid Build Coastguard Worker
// Broadcasts 32-bit lane imm of a to all four lanes of the result.
// FORCE_INLINE __m128i _mm_shuffle_epi32_splat(__m128i a, __constrange(0,255)
// int imm)
#if defined(__aarch64__)
#define _mm_shuffle_epi32_splat(a, imm) \
    __extension__({ \
        vreinterpretq_m128i_s32( \
            vdupq_laneq_s32(vreinterpretq_s32_m128i(a), (imm))); \
    })
#else
#define _mm_shuffle_epi32_splat(a, imm) \
    __extension__({ \
        vreinterpretq_m128i_s32(vdupq_n_s32( \
            vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm)))); \
    })
#endif
1678*3f1979aaSAndroid Build Coastguard Worker
// Shuffles the 4 signed or unsigned 32-bit integers in a as specified by imm.
// https://msdn.microsoft.com/en-us/library/56f67xbk%28v=vs.90%29.aspx
// FORCE_INLINE __m128i _mm_shuffle_epi32(__m128i a,
//                                        __constrange(0,255) int imm)
//
// Result lane k is a[(imm >> (2 * k)) & 0x3].
//
// When the compiler offers __builtin_shufflevector the shuffle is expressed
// directly. Otherwise imm is dispatched to a hand-written helper for the
// patterns listed below, falling back to _mm_shuffle_epi32_default.
//
// In the generic variant the argument a is copied into a macro-specific
// temporary before anything else is declared, so it is evaluated exactly once
// and a caller variable named `ret` is not shadowed by the expansion.
#if __has_builtin(__builtin_shufflevector)
#define _mm_shuffle_epi32(a, imm) \
    __extension__({ \
        int32x4_t _input = vreinterpretq_s32_m128i(a); \
        int32x4_t _shuf = __builtin_shufflevector( \
            _input, _input, (imm) & (0x3), ((imm) >> 2) & 0x3, \
            ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3); \
        vreinterpretq_m128i_s32(_shuf); \
    })
#else  // generic
#define _mm_shuffle_epi32(a, imm) \
    __extension__({ \
        __m128i _shuf32_in = (a); \
        __m128i _shuf32_out; \
        switch (imm) { \
        case _MM_SHUFFLE(1, 0, 3, 2): \
            _shuf32_out = _mm_shuffle_epi_1032(_shuf32_in); \
            break; \
        case _MM_SHUFFLE(2, 3, 0, 1): \
            _shuf32_out = _mm_shuffle_epi_2301(_shuf32_in); \
            break; \
        case _MM_SHUFFLE(0, 3, 2, 1): \
            _shuf32_out = _mm_shuffle_epi_0321(_shuf32_in); \
            break; \
        case _MM_SHUFFLE(2, 1, 0, 3): \
            _shuf32_out = _mm_shuffle_epi_2103(_shuf32_in); \
            break; \
        case _MM_SHUFFLE(1, 0, 1, 0): \
            _shuf32_out = _mm_shuffle_epi_1010(_shuf32_in); \
            break; \
        case _MM_SHUFFLE(1, 0, 0, 1): \
            _shuf32_out = _mm_shuffle_epi_1001(_shuf32_in); \
            break; \
        case _MM_SHUFFLE(0, 1, 0, 1): \
            _shuf32_out = _mm_shuffle_epi_0101(_shuf32_in); \
            break; \
        case _MM_SHUFFLE(2, 2, 1, 1): \
            _shuf32_out = _mm_shuffle_epi_2211(_shuf32_in); \
            break; \
        case _MM_SHUFFLE(0, 1, 2, 2): \
            _shuf32_out = _mm_shuffle_epi_0122(_shuf32_in); \
            break; \
        case _MM_SHUFFLE(3, 3, 3, 2): \
            _shuf32_out = _mm_shuffle_epi_3332(_shuf32_in); \
            break; \
        case _MM_SHUFFLE(0, 0, 0, 0): \
            _shuf32_out = _mm_shuffle_epi32_splat(_shuf32_in, 0); \
            break; \
        case _MM_SHUFFLE(1, 1, 1, 1): \
            _shuf32_out = _mm_shuffle_epi32_splat(_shuf32_in, 1); \
            break; \
        case _MM_SHUFFLE(2, 2, 2, 2): \
            _shuf32_out = _mm_shuffle_epi32_splat(_shuf32_in, 2); \
            break; \
        case _MM_SHUFFLE(3, 3, 3, 3): \
            _shuf32_out = _mm_shuffle_epi32_splat(_shuf32_in, 3); \
            break; \
        default: \
            _shuf32_out = _mm_shuffle_epi32_default(_shuf32_in, (imm)); \
            break; \
        } \
        _shuf32_out; \
    })
#endif
1746*3f1979aaSAndroid Build Coastguard Worker
// Shuffles the lower 4 signed or unsigned 16-bit integers in a as specified
// by imm.
// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/y41dkk37(v=vs.100)
// FORCE_INLINE __m128i _mm_shufflelo_epi16_function(__m128i a,
//                                                   __constrange(0,255) int
//                                                   imm)
//
// Result lane k (k = 0..3) is lane ((imm >> (2 * k)) & 0x3) of the lower half
// of a; lanes 4..7 are copied from a unchanged. The lower half is snapshotted
// before any lane is overwritten, so every selector reads the original data.
//
// The temporaries carry macro-specific names so that an argument spelled
// `ret` is not captured by the expansion.
#define _mm_shufflelo_epi16_function(a, imm) \
    __extension__({ \
        int16x8_t _shuflo16_out = vreinterpretq_s16_m128i(a); \
        int16x4_t _shuflo16_src = vget_low_s16(_shuflo16_out); \
        _shuflo16_out = vsetq_lane_s16( \
            vget_lane_s16(_shuflo16_src, (imm) & (0x3)), _shuflo16_out, 0); \
        _shuflo16_out = vsetq_lane_s16( \
            vget_lane_s16(_shuflo16_src, ((imm) >> 2) & 0x3), \
            _shuflo16_out, 1); \
        _shuflo16_out = vsetq_lane_s16( \
            vget_lane_s16(_shuflo16_src, ((imm) >> 4) & 0x3), \
            _shuflo16_out, 2); \
        _shuflo16_out = vsetq_lane_s16( \
            vget_lane_s16(_shuflo16_src, ((imm) >> 6) & 0x3), \
            _shuflo16_out, 3); \
        vreinterpretq_m128i_s16(_shuflo16_out); \
    })
1766*3f1979aaSAndroid Build Coastguard Worker
// Public entry point for the low-half 16-bit shuffle. Uses
// __builtin_shufflevector when the compiler provides it (lanes 4..7 pass
// through as-is) and the lane-by-lane _mm_shufflelo_epi16_function otherwise.
// FORCE_INLINE __m128i _mm_shufflelo_epi16(__m128i a,
//                                          __constrange(0,255) int imm)
#if __has_builtin(__builtin_shufflevector)
#define _mm_shufflelo_epi16(a, imm) \
    __extension__({ \
        int16x8_t _lanes = vreinterpretq_s16_m128i(a); \
        int16x8_t _picked = __builtin_shufflevector( \
            _lanes, _lanes, \
            ((imm) & (0x3)), \
            (((imm) >> 2) & 0x3), \
            (((imm) >> 4) & 0x3), \
            (((imm) >> 6) & 0x3), \
            4, 5, 6, 7); \
        vreinterpretq_m128i_s16(_picked); \
    })
#else  // generic
#define _mm_shufflelo_epi16(a, imm) _mm_shufflelo_epi16_function((a), (imm))
#endif
1781*3f1979aaSAndroid Build Coastguard Worker
// Shuffles the upper 4 signed or unsigned 16-bit integers in a as specified
// by imm.
// https://msdn.microsoft.com/en-us/library/13ywktbs(v=vs.100).aspx
// FORCE_INLINE __m128i _mm_shufflehi_epi16_function(__m128i a,
//                                                   __constrange(0,255) int
//                                                   imm)
//
// Result lane 4 + k (k = 0..3) is lane ((imm >> (2 * k)) & 0x3) of the upper
// half of a; lanes 0..3 are copied from a unchanged. The upper half is
// snapshotted before any lane is overwritten, so every selector reads the
// original data.
//
// The temporaries carry macro-specific names so that an argument spelled
// `ret` is not captured by the expansion.
#define _mm_shufflehi_epi16_function(a, imm) \
    __extension__({ \
        int16x8_t _shufhi16_out = vreinterpretq_s16_m128i(a); \
        int16x4_t _shufhi16_src = vget_high_s16(_shufhi16_out); \
        _shufhi16_out = vsetq_lane_s16( \
            vget_lane_s16(_shufhi16_src, (imm) & (0x3)), _shufhi16_out, 4); \
        _shufhi16_out = vsetq_lane_s16( \
            vget_lane_s16(_shufhi16_src, ((imm) >> 2) & 0x3), \
            _shufhi16_out, 5); \
        _shufhi16_out = vsetq_lane_s16( \
            vget_lane_s16(_shufhi16_src, ((imm) >> 4) & 0x3), \
            _shufhi16_out, 6); \
        _shufhi16_out = vsetq_lane_s16( \
            vget_lane_s16(_shufhi16_src, ((imm) >> 6) & 0x3), \
            _shufhi16_out, 7); \
        vreinterpretq_m128i_s16(_shufhi16_out); \
    })
1801*3f1979aaSAndroid Build Coastguard Worker
// Public entry point for the high-half 16-bit shuffle. Uses
// __builtin_shufflevector when the compiler provides it (lanes 0..3 pass
// through as-is, selectors are offset by 4 to address the upper half) and the
// lane-by-lane _mm_shufflehi_epi16_function otherwise.
// FORCE_INLINE __m128i _mm_shufflehi_epi16(__m128i a,
//                                          __constrange(0,255) int imm)
#if __has_builtin(__builtin_shufflevector)
#define _mm_shufflehi_epi16(a, imm) \
    __extension__({ \
        int16x8_t _lanes = vreinterpretq_s16_m128i(a); \
        int16x8_t _picked = __builtin_shufflevector( \
            _lanes, _lanes, \
            0, 1, 2, 3, \
            ((imm) & (0x3)) + 4, \
            (((imm) >> 2) & 0x3) + 4, \
            (((imm) >> 4) & 0x3) + 4, \
            (((imm) >> 6) & 0x3) + 4); \
        vreinterpretq_m128i_s16(_picked); \
    })
#else  // generic
#define _mm_shufflehi_epi16(a, imm) _mm_shufflehi_epi16_function((a), (imm))
#endif
1817*3f1979aaSAndroid Build Coastguard Worker
// Blend packed 16-bit integers from a and b using control mask imm8, and store
// the results in dst.
//
//   FOR j := 0 to 7
//       i := j*16
//       IF imm8[j]
//           dst[i+15:i] := b[i+15:i]
//       ELSE
//           dst[i+15:i] := a[i+15:i]
//       FI
//   ENDFOR
// FORCE_INLINE __m128i _mm_blend_epi16(__m128i a, __m128i b,
//                                      __constrange(0,255) int imm)
//
// Bit j of imm is expanded into an all-ones or all-zeros 16-bit lane; the
// resulting vector drives a bitwise select between b (ones) and a (zeros).
//
// The temporaries carry macro-specific names: identifiers such as `_a`, `_b`
// and `_mask` are used as ordinary parameter names elsewhere in this file, and
// passing a variable with such a name must not be captured by the expansion.
#define _mm_blend_epi16(a, b, imm) \
    __extension__({ \
        const uint16_t _blend16_lanes[8] = { \
            ((imm) & (1 << 0)) ? 0xFFFF : 0x0000, \
            ((imm) & (1 << 1)) ? 0xFFFF : 0x0000, \
            ((imm) & (1 << 2)) ? 0xFFFF : 0x0000, \
            ((imm) & (1 << 3)) ? 0xFFFF : 0x0000, \
            ((imm) & (1 << 4)) ? 0xFFFF : 0x0000, \
            ((imm) & (1 << 5)) ? 0xFFFF : 0x0000, \
            ((imm) & (1 << 6)) ? 0xFFFF : 0x0000, \
            ((imm) & (1 << 7)) ? 0xFFFF : 0x0000}; \
        uint16x8_t _blend16_sel = vld1q_u16(_blend16_lanes); \
        uint16x8_t _blend16_a = vreinterpretq_u16_m128i(a); \
        uint16x8_t _blend16_b = vreinterpretq_u16_m128i(b); \
        vreinterpretq_m128i_u16( \
            vbslq_u16(_blend16_sel, _blend16_b, _blend16_a)); \
    })
1846*3f1979aaSAndroid Build Coastguard Worker
1847*3f1979aaSAndroid Build Coastguard Worker // Blend packed 8-bit integers from a and b using mask, and store the results in
1848*3f1979aaSAndroid Build Coastguard Worker // dst.
1849*3f1979aaSAndroid Build Coastguard Worker //
1850*3f1979aaSAndroid Build Coastguard Worker // FOR j := 0 to 15
1851*3f1979aaSAndroid Build Coastguard Worker // i := j*8
1852*3f1979aaSAndroid Build Coastguard Worker // IF mask[i+7]
1853*3f1979aaSAndroid Build Coastguard Worker // dst[i+7:i] := b[i+7:i]
1854*3f1979aaSAndroid Build Coastguard Worker // ELSE
1855*3f1979aaSAndroid Build Coastguard Worker // dst[i+7:i] := a[i+7:i]
1856*3f1979aaSAndroid Build Coastguard Worker // FI
1857*3f1979aaSAndroid Build Coastguard Worker // ENDFOR
_mm_blendv_epi8(__m128i _a,__m128i _b,__m128i _mask)1858*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_blendv_epi8(__m128i _a, __m128i _b, __m128i _mask)
1859*3f1979aaSAndroid Build Coastguard Worker {
1860*3f1979aaSAndroid Build Coastguard Worker // Use a signed shift right to create a mask with the sign bit
1861*3f1979aaSAndroid Build Coastguard Worker uint8x16_t mask =
1862*3f1979aaSAndroid Build Coastguard Worker vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_m128i(_mask), 7));
1863*3f1979aaSAndroid Build Coastguard Worker uint8x16_t a = vreinterpretq_u8_m128i(_a);
1864*3f1979aaSAndroid Build Coastguard Worker uint8x16_t b = vreinterpretq_u8_m128i(_b);
1865*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_u8(vbslq_u8(mask, b, a));
1866*3f1979aaSAndroid Build Coastguard Worker }
1867*3f1979aaSAndroid Build Coastguard Worker
1868*3f1979aaSAndroid Build Coastguard Worker /* Shifts */
1869*3f1979aaSAndroid Build Coastguard Worker
1870*3f1979aaSAndroid Build Coastguard Worker
1871*3f1979aaSAndroid Build Coastguard Worker // Shift packed 16-bit integers in a right by imm while shifting in sign
1872*3f1979aaSAndroid Build Coastguard Worker // bits, and store the results in dst.
1873*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srai_epi16
_mm_srai_epi16(__m128i a,int imm)1874*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int imm)
1875*3f1979aaSAndroid Build Coastguard Worker {
1876*3f1979aaSAndroid Build Coastguard Worker const int count = (imm & ~15) ? 15 : imm;
1877*3f1979aaSAndroid Build Coastguard Worker return (__m128i) vshlq_s16((int16x8_t) a, vdupq_n_s16(-count));
1878*3f1979aaSAndroid Build Coastguard Worker }
1879*3f1979aaSAndroid Build Coastguard Worker
// Shifts the 8 signed or unsigned 16-bit integers in a left by count bits while
// shifting in zeros.
//
//   r0 := a0 << count
//   r1 := a1 << count
//   ...
//   r7 := a7 << count
//
// https://msdn.microsoft.com/en-us/library/es73bcsy(v=vs.90).aspx
//
// imm <= 0 returns a unchanged; imm > 15 returns all zeros.
//
// The shift uses the register form (vshlq_s16 with a broadcast count) rather
// than the immediate form, so the in-range branch still compiles when imm is
// a constant outside [0, 15] or is not a compile-time constant at all. The
// result temporary carries a macro-specific name so that an argument spelled
// `ret` is not captured by the expansion.
#define _mm_slli_epi16(a, imm) \
    __extension__({ \
        __m128i _slli16_out; \
        if ((imm) <= 0) { \
            _slli16_out = (a); \
        } else if ((imm) > 15) { \
            _slli16_out = _mm_setzero_si128(); \
        } else { \
            _slli16_out = vreinterpretq_m128i_s16(vshlq_s16( \
                vreinterpretq_s16_m128i(a), vdupq_n_s16((imm)))); \
        } \
        _slli16_out; \
    })
1902*3f1979aaSAndroid Build Coastguard Worker
1903*3f1979aaSAndroid Build Coastguard Worker // Shifts the 4 signed or unsigned 32-bit integers in a left by count bits while
1904*3f1979aaSAndroid Build Coastguard Worker // shifting in zeros. :
1905*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/z2k3bbtb%28v=vs.90%29.aspx
1906*3f1979aaSAndroid Build Coastguard Worker // FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, __constrange(0,255) int imm)
_mm_slli_epi32(__m128i a,int imm)1907*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, int imm)
1908*3f1979aaSAndroid Build Coastguard Worker {
1909*3f1979aaSAndroid Build Coastguard Worker if (imm <= 0) /* TODO: add constant range macro: [0, 255] */
1910*3f1979aaSAndroid Build Coastguard Worker return a;
1911*3f1979aaSAndroid Build Coastguard Worker if (imm > 31) /* TODO: add unlikely macro */
1912*3f1979aaSAndroid Build Coastguard Worker return _mm_setzero_si128();
1913*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s32(
1914*3f1979aaSAndroid Build Coastguard Worker vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(imm)));
1915*3f1979aaSAndroid Build Coastguard Worker }
1916*3f1979aaSAndroid Build Coastguard Worker
1917*3f1979aaSAndroid Build Coastguard Worker // Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and
1918*3f1979aaSAndroid Build Coastguard Worker // store the results in dst.
_mm_slli_epi64(__m128i a,int imm)1919*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_slli_epi64(__m128i a, int imm)
1920*3f1979aaSAndroid Build Coastguard Worker {
1921*3f1979aaSAndroid Build Coastguard Worker if (imm <= 0) /* TODO: add constant range macro: [0, 255] */
1922*3f1979aaSAndroid Build Coastguard Worker return a;
1923*3f1979aaSAndroid Build Coastguard Worker if (imm > 63) /* TODO: add unlikely macro */
1924*3f1979aaSAndroid Build Coastguard Worker return _mm_setzero_si128();
1925*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s64(
1926*3f1979aaSAndroid Build Coastguard Worker vshlq_s64(vreinterpretq_s64_m128i(a), vdupq_n_s64(imm)));
1927*3f1979aaSAndroid Build Coastguard Worker }
1928*3f1979aaSAndroid Build Coastguard Worker
// Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and
// store the results in dst.
//
//   FOR j := 0 to 7
//       i := j*16
//       IF imm8[7:0] > 15
//           dst[i+15:i] := 0
//       ELSE
//           dst[i+15:i] := ZeroExtend16(a[i+15:i] >> imm8[7:0])
//       FI
//   ENDFOR
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi16
//
// imm == 0 returns a unchanged; imm in [1, 15] performs the logical right
// shift (as a vector shift-left by a negative count); every other value,
// including negative ones, returns all zeros.
//
// The count is negated as -(imm) so that an expression argument such as
// `k + 1` is negated as a whole, and the result temporary carries a
// macro-specific name so that an argument spelled `ret` is not captured.
#define _mm_srli_epi16(a, imm) \
    __extension__({ \
        __m128i _srli16_out; \
        if ((imm) == 0) { \
            _srli16_out = (a); \
        } else if (0 < (imm) && (imm) < 16) { \
            _srli16_out = vreinterpretq_m128i_u16(vshlq_u16( \
                vreinterpretq_u16_m128i(a), vdupq_n_s16(-(imm)))); \
        } else { \
            _srli16_out = _mm_setzero_si128(); \
        } \
        _srli16_out; \
    })
1955*3f1979aaSAndroid Build Coastguard Worker
// Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and
// store the results in dst.
//
//   FOR j := 0 to 3
//       i := j*32
//       IF imm8[7:0] > 31
//           dst[i+31:i] := 0
//       ELSE
//           dst[i+31:i] := ZeroExtend32(a[i+31:i] >> imm8[7:0])
//       FI
//   ENDFOR
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi32
// FORCE_INLINE __m128i _mm_srli_epi32(__m128i a, __constrange(0,255) int imm)
//
// imm == 0 returns a unchanged; imm in [1, 31] performs the logical right
// shift (as a vector shift-left by a negative count); every other value,
// including negative ones, returns all zeros.
//
// The count is negated as -(imm) so that an expression argument such as
// `k + 1` is negated as a whole, and the result temporary carries a
// macro-specific name so that an argument spelled `ret` is not captured.
#define _mm_srli_epi32(a, imm) \
    __extension__({ \
        __m128i _srli32_out; \
        if ((imm) == 0) { \
            _srli32_out = (a); \
        } else if (0 < (imm) && (imm) < 32) { \
            _srli32_out = vreinterpretq_m128i_u32(vshlq_u32( \
                vreinterpretq_u32_m128i(a), vdupq_n_s32(-(imm)))); \
        } else { \
            _srli32_out = _mm_setzero_si128(); \
        } \
        _srli32_out; \
    })
1983*3f1979aaSAndroid Build Coastguard Worker
// Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and
// store the results in dst.
//
//   FOR j := 0 to 1
//     i := j*64
//     IF imm8[7:0] > 63
//       dst[i+63:i] := 0
//     ELSE
//       dst[i+63:i] := ZeroExtend64(a[i+63:i] >> imm8[7:0])
//     FI
//   ENDFOR
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi64
//
// Implementation notes:
//  - The right shift is expressed as vshlq_u64 with a negated per-lane count.
//  - The count is written -(imm) so that an expression argument (for example
//    `40 - 8`) is negated as a whole rather than only its first operand.
//  - A zero count returns a unchanged; any count outside 1..63 yields zero.
#define _mm_srli_epi64(a, imm) \
    __extension__({ \
        __m128i ret; \
        if ((imm) == 0) { \
            ret = (a); \
        } else if (0 < (imm) && (imm) < 64) { \
            ret = vreinterpretq_m128i_u64(vshlq_u64( \
                vreinterpretq_u64_m128i(a), vdupq_n_s64(-(imm)))); \
        } else { \
            ret = _mm_setzero_si128(); \
        } \
        ret; \
    })
2010*3f1979aaSAndroid Build Coastguard Worker
// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits,
// and store the results in dst.
//
//   FOR j := 0 to 3
//     i := j*32
//     IF imm8[7:0] > 31
//       dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0)
//     ELSE
//       dst[i+31:i] := SignExtend32(a[i+31:i] >> imm8[7:0])
//     FI
//   ENDFOR
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srai_epi32
// FORCE_INLINE __m128i _mm_srai_epi32(__m128i a, __constrange(0,255) int imm)
//
// Implementation notes:
//  - The arithmetic right shift is expressed as vshlq_s32 with a negated
//    per-lane count.
//  - The count is written -(imm) so that an expression argument (for example
//    `8 - 3`) is negated as a whole rather than only its first operand.
//  - A zero count returns a unchanged; any count outside 1..31 is handled by
//    shifting right by 31, which fills every lane with copies of its sign bit.
#define _mm_srai_epi32(a, imm) \
    __extension__({ \
        __m128i ret; \
        if ((imm) == 0) { \
            ret = (a); \
        } else if (0 < (imm) && (imm) < 32) { \
            ret = vreinterpretq_m128i_s32(vshlq_s32( \
                vreinterpretq_s32_m128i(a), vdupq_n_s32(-(imm)))); \
        } else { \
            ret = vreinterpretq_m128i_s32( \
                vshrq_n_s32(vreinterpretq_s32_m128i(a), 31)); \
        } \
        ret; \
    })
2039*3f1979aaSAndroid Build Coastguard Worker
// Shifts the 128-bit value in a right by imm bytes while shifting in zeros.
// imm must be an immediate.
//
//   r := srl(a, imm*8)
//
// https://msdn.microsoft.com/en-us/library/305w28yz(v=vs.100).aspx
// FORCE_INLINE _mm_srli_si128(__m128i a, __constrange(0,255) int imm)
//
// Byte counts above 15 give an all-zero result, counts of zero or less return
// a unchanged, and counts 1..15 are handled by vextq_s8 over a followed by a
// zero vector, starting at byte imm.
#define _mm_srli_si128(a, imm) \
    __extension__({ \
        __m128i ret; \
        if ((imm) > 15) { \
            ret = _mm_setzero_si128(); \
        } else if ((imm) > 0) { \
            ret = vreinterpretq_m128i_s8(vextq_s8( \
                vreinterpretq_s8_m128i(a), vdupq_n_s8(0), (imm))); \
        } else { \
            ret = a; \
        } \
        ret; \
    })
2060*3f1979aaSAndroid Build Coastguard Worker
// Shifts the 128-bit value in a left by imm bytes while shifting in zeros.
// imm must be an immediate.
//
//   r := a << (imm * 8)
//
// https://msdn.microsoft.com/en-us/library/34d3k2kt(v=vs.100).aspx
// FORCE_INLINE __m128i _mm_slli_si128(__m128i a, __constrange(0,255) int imm)
//
// Byte counts above 15 give an all-zero result, counts of zero or less return
// a unchanged, and counts 1..15 are handled by vextq_s8 over a zero vector
// followed by a, starting at byte 16 - imm.
#define _mm_slli_si128(a, imm) \
    __extension__({ \
        __m128i ret; \
        if ((imm) > 15) { \
            ret = _mm_setzero_si128(); \
        } else if ((imm) > 0) { \
            ret = vreinterpretq_m128i_s8(vextq_s8( \
                vdupq_n_s8(0), vreinterpretq_s8_m128i(a), 16 - (imm))); \
        } else { \
            ret = a; \
        } \
        ret; \
    })
2081*3f1979aaSAndroid Build Coastguard Worker
2082*3f1979aaSAndroid Build Coastguard Worker // Shifts the 8 signed or unsigned 16-bit integers in a left by count bits while
2083*3f1979aaSAndroid Build Coastguard Worker // shifting in zeros.
2084*3f1979aaSAndroid Build Coastguard Worker //
2085*3f1979aaSAndroid Build Coastguard Worker // r0 := a0 << count
2086*3f1979aaSAndroid Build Coastguard Worker // r1 := a1 << count
2087*3f1979aaSAndroid Build Coastguard Worker // ...
2088*3f1979aaSAndroid Build Coastguard Worker // r7 := a7 << count
2089*3f1979aaSAndroid Build Coastguard Worker //
2090*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/c79w388h(v%3dvs.90).aspx
_mm_sll_epi16(__m128i a,__m128i count)2091*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_sll_epi16(__m128i a, __m128i count)
2092*3f1979aaSAndroid Build Coastguard Worker {
2093*3f1979aaSAndroid Build Coastguard Worker uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
2094*3f1979aaSAndroid Build Coastguard Worker if (c > 15)
2095*3f1979aaSAndroid Build Coastguard Worker return _mm_setzero_si128();
2096*3f1979aaSAndroid Build Coastguard Worker
2097*3f1979aaSAndroid Build Coastguard Worker int16x8_t vc = vdupq_n_s16((int16_t) c);
2098*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s16(vshlq_s16(vreinterpretq_s16_m128i(a), vc));
2099*3f1979aaSAndroid Build Coastguard Worker }
2100*3f1979aaSAndroid Build Coastguard Worker
2101*3f1979aaSAndroid Build Coastguard Worker // Shifts the 4 signed or unsigned 32-bit integers in a left by count bits while
2102*3f1979aaSAndroid Build Coastguard Worker // shifting in zeros.
2103*3f1979aaSAndroid Build Coastguard Worker //
2104*3f1979aaSAndroid Build Coastguard Worker // r0 := a0 << count
2105*3f1979aaSAndroid Build Coastguard Worker // r1 := a1 << count
2106*3f1979aaSAndroid Build Coastguard Worker // r2 := a2 << count
2107*3f1979aaSAndroid Build Coastguard Worker // r3 := a3 << count
2108*3f1979aaSAndroid Build Coastguard Worker //
2109*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/6fe5a6s9(v%3dvs.90).aspx
_mm_sll_epi32(__m128i a,__m128i count)2110*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_sll_epi32(__m128i a, __m128i count)
2111*3f1979aaSAndroid Build Coastguard Worker {
2112*3f1979aaSAndroid Build Coastguard Worker uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
2113*3f1979aaSAndroid Build Coastguard Worker if (c > 31)
2114*3f1979aaSAndroid Build Coastguard Worker return _mm_setzero_si128();
2115*3f1979aaSAndroid Build Coastguard Worker
2116*3f1979aaSAndroid Build Coastguard Worker int32x4_t vc = vdupq_n_s32((int32_t) c);
2117*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s32(vshlq_s32(vreinterpretq_s32_m128i(a), vc));
2118*3f1979aaSAndroid Build Coastguard Worker }
2119*3f1979aaSAndroid Build Coastguard Worker
2120*3f1979aaSAndroid Build Coastguard Worker // Shifts the 2 signed or unsigned 64-bit integers in a left by count bits while
2121*3f1979aaSAndroid Build Coastguard Worker // shifting in zeros.
2122*3f1979aaSAndroid Build Coastguard Worker //
2123*3f1979aaSAndroid Build Coastguard Worker // r0 := a0 << count
2124*3f1979aaSAndroid Build Coastguard Worker // r1 := a1 << count
2125*3f1979aaSAndroid Build Coastguard Worker //
2126*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/6ta9dffd(v%3dvs.90).aspx
_mm_sll_epi64(__m128i a,__m128i count)2127*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_sll_epi64(__m128i a, __m128i count)
2128*3f1979aaSAndroid Build Coastguard Worker {
2129*3f1979aaSAndroid Build Coastguard Worker uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
2130*3f1979aaSAndroid Build Coastguard Worker if (c > 63)
2131*3f1979aaSAndroid Build Coastguard Worker return _mm_setzero_si128();
2132*3f1979aaSAndroid Build Coastguard Worker
2133*3f1979aaSAndroid Build Coastguard Worker int64x2_t vc = vdupq_n_s64((int64_t) c);
2134*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s64(vshlq_s64(vreinterpretq_s64_m128i(a), vc));
2135*3f1979aaSAndroid Build Coastguard Worker }
2136*3f1979aaSAndroid Build Coastguard Worker
2137*3f1979aaSAndroid Build Coastguard Worker // Shifts the 8 signed or unsigned 16-bit integers in a right by count bits
2138*3f1979aaSAndroid Build Coastguard Worker // while shifting in zeros.
2139*3f1979aaSAndroid Build Coastguard Worker //
2140*3f1979aaSAndroid Build Coastguard Worker // r0 := srl(a0, count)
2141*3f1979aaSAndroid Build Coastguard Worker // r1 := srl(a1, count)
2142*3f1979aaSAndroid Build Coastguard Worker // ...
2143*3f1979aaSAndroid Build Coastguard Worker // r7 := srl(a7, count)
2144*3f1979aaSAndroid Build Coastguard Worker //
2145*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/wd5ax830(v%3dvs.90).aspx
_mm_srl_epi16(__m128i a,__m128i count)2146*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_srl_epi16(__m128i a, __m128i count)
2147*3f1979aaSAndroid Build Coastguard Worker {
2148*3f1979aaSAndroid Build Coastguard Worker uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
2149*3f1979aaSAndroid Build Coastguard Worker if (c > 15)
2150*3f1979aaSAndroid Build Coastguard Worker return _mm_setzero_si128();
2151*3f1979aaSAndroid Build Coastguard Worker
2152*3f1979aaSAndroid Build Coastguard Worker int16x8_t vc = vdupq_n_s16(-(int16_t) c);
2153*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_u16(vshlq_u16(vreinterpretq_u16_m128i(a), vc));
2154*3f1979aaSAndroid Build Coastguard Worker }
2155*3f1979aaSAndroid Build Coastguard Worker
2156*3f1979aaSAndroid Build Coastguard Worker // Shifts the 4 signed or unsigned 32-bit integers in a right by count bits
2157*3f1979aaSAndroid Build Coastguard Worker // while shifting in zeros.
2158*3f1979aaSAndroid Build Coastguard Worker //
2159*3f1979aaSAndroid Build Coastguard Worker // r0 := srl(a0, count)
2160*3f1979aaSAndroid Build Coastguard Worker // r1 := srl(a1, count)
2161*3f1979aaSAndroid Build Coastguard Worker // r2 := srl(a2, count)
2162*3f1979aaSAndroid Build Coastguard Worker // r3 := srl(a3, count)
2163*3f1979aaSAndroid Build Coastguard Worker //
2164*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/a9cbttf4(v%3dvs.90).aspx
_mm_srl_epi32(__m128i a,__m128i count)2165*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_srl_epi32(__m128i a, __m128i count)
2166*3f1979aaSAndroid Build Coastguard Worker {
2167*3f1979aaSAndroid Build Coastguard Worker uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
2168*3f1979aaSAndroid Build Coastguard Worker if (c > 31)
2169*3f1979aaSAndroid Build Coastguard Worker return _mm_setzero_si128();
2170*3f1979aaSAndroid Build Coastguard Worker
2171*3f1979aaSAndroid Build Coastguard Worker int32x4_t vc = vdupq_n_s32(-(int32_t) c);
2172*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_u32(vshlq_u32(vreinterpretq_u32_m128i(a), vc));
2173*3f1979aaSAndroid Build Coastguard Worker }
2174*3f1979aaSAndroid Build Coastguard Worker
2175*3f1979aaSAndroid Build Coastguard Worker // Shifts the 2 signed or unsigned 64-bit integers in a right by count bits
2176*3f1979aaSAndroid Build Coastguard Worker // while shifting in zeros.
2177*3f1979aaSAndroid Build Coastguard Worker //
2178*3f1979aaSAndroid Build Coastguard Worker // r0 := srl(a0, count)
2179*3f1979aaSAndroid Build Coastguard Worker // r1 := srl(a1, count)
2180*3f1979aaSAndroid Build Coastguard Worker //
2181*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/yf6cf9k8(v%3dvs.90).aspx
_mm_srl_epi64(__m128i a,__m128i count)2182*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count)
2183*3f1979aaSAndroid Build Coastguard Worker {
2184*3f1979aaSAndroid Build Coastguard Worker uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
2185*3f1979aaSAndroid Build Coastguard Worker if (c > 63)
2186*3f1979aaSAndroid Build Coastguard Worker return _mm_setzero_si128();
2187*3f1979aaSAndroid Build Coastguard Worker
2188*3f1979aaSAndroid Build Coastguard Worker int64x2_t vc = vdupq_n_s64(-(int64_t) c);
2189*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_u64(vshlq_u64(vreinterpretq_u64_m128i(a), vc));
2190*3f1979aaSAndroid Build Coastguard Worker }
2191*3f1979aaSAndroid Build Coastguard Worker
2192*3f1979aaSAndroid Build Coastguard Worker // NEON does not provide a version of this function.
2193*3f1979aaSAndroid Build Coastguard Worker // Creates a 16-bit mask from the most significant bits of the 16 signed or
2194*3f1979aaSAndroid Build Coastguard Worker // unsigned 8-bit integers in a and zero extends the upper bits.
2195*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/vstudio/s090c8fk(v=vs.100).aspx
_mm_movemask_epi8(__m128i a)2196*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE int _mm_movemask_epi8(__m128i a)
2197*3f1979aaSAndroid Build Coastguard Worker {
2198*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
2199*3f1979aaSAndroid Build Coastguard Worker uint8x16_t input = vreinterpretq_u8_m128i(a);
2200*3f1979aaSAndroid Build Coastguard Worker const int8_t ALIGN_STRUCT(16)
2201*3f1979aaSAndroid Build Coastguard Worker xr[16] = {-7, -6, -5, -4, -3, -2, -1, 0, -7, -6, -5, -4, -3, -2, -1, 0};
2202*3f1979aaSAndroid Build Coastguard Worker const uint8x16_t mask_and = vdupq_n_u8(0x80);
2203*3f1979aaSAndroid Build Coastguard Worker const int8x16_t mask_shift = vld1q_s8(xr);
2204*3f1979aaSAndroid Build Coastguard Worker const uint8x16_t mask_result =
2205*3f1979aaSAndroid Build Coastguard Worker vshlq_u8(vandq_u8(input, mask_and), mask_shift);
2206*3f1979aaSAndroid Build Coastguard Worker uint8x8_t lo = vget_low_u8(mask_result);
2207*3f1979aaSAndroid Build Coastguard Worker uint8x8_t hi = vget_high_u8(mask_result);
2208*3f1979aaSAndroid Build Coastguard Worker
2209*3f1979aaSAndroid Build Coastguard Worker return vaddv_u8(lo) + (vaddv_u8(hi) << 8);
2210*3f1979aaSAndroid Build Coastguard Worker #else
2211*3f1979aaSAndroid Build Coastguard Worker // Use increasingly wide shifts+adds to collect the sign bits
2212*3f1979aaSAndroid Build Coastguard Worker // together.
2213*3f1979aaSAndroid Build Coastguard Worker // Since the widening shifts would be rather confusing to follow in little
2214*3f1979aaSAndroid Build Coastguard Worker // endian, everything will be illustrated in big endian order instead. This
2215*3f1979aaSAndroid Build Coastguard Worker // has a different result - the bits would actually be reversed on a big
2216*3f1979aaSAndroid Build Coastguard Worker // endian machine.
2217*3f1979aaSAndroid Build Coastguard Worker
2218*3f1979aaSAndroid Build Coastguard Worker // Starting input (only half the elements are shown):
2219*3f1979aaSAndroid Build Coastguard Worker // 89 ff 1d c0 00 10 99 33
2220*3f1979aaSAndroid Build Coastguard Worker uint8x16_t input = vreinterpretq_u8_m128i(a);
2221*3f1979aaSAndroid Build Coastguard Worker
2222*3f1979aaSAndroid Build Coastguard Worker // Shift out everything but the sign bits with an unsigned shift right.
2223*3f1979aaSAndroid Build Coastguard Worker //
2224*3f1979aaSAndroid Build Coastguard Worker // Bytes of the vector::
2225*3f1979aaSAndroid Build Coastguard Worker // 89 ff 1d c0 00 10 99 33
2226*3f1979aaSAndroid Build Coastguard Worker // \ \ \ \ \ \ \ \ high_bits = (uint16x4_t)(input >> 7)
2227*3f1979aaSAndroid Build Coastguard Worker // | | | | | | | |
2228*3f1979aaSAndroid Build Coastguard Worker // 01 01 00 01 00 00 01 00
2229*3f1979aaSAndroid Build Coastguard Worker //
2230*3f1979aaSAndroid Build Coastguard Worker // Bits of first important lane(s):
2231*3f1979aaSAndroid Build Coastguard Worker // 10001001 (89)
2232*3f1979aaSAndroid Build Coastguard Worker // \______
2233*3f1979aaSAndroid Build Coastguard Worker // |
2234*3f1979aaSAndroid Build Coastguard Worker // 00000001 (01)
2235*3f1979aaSAndroid Build Coastguard Worker uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7));
2236*3f1979aaSAndroid Build Coastguard Worker
2237*3f1979aaSAndroid Build Coastguard Worker // Merge the even lanes together with a 16-bit unsigned shift right + add.
2238*3f1979aaSAndroid Build Coastguard Worker // 'xx' represents garbage data which will be ignored in the final result.
2239*3f1979aaSAndroid Build Coastguard Worker // In the important bytes, the add functions like a binary OR.
2240*3f1979aaSAndroid Build Coastguard Worker //
2241*3f1979aaSAndroid Build Coastguard Worker // 01 01 00 01 00 00 01 00
2242*3f1979aaSAndroid Build Coastguard Worker // \_ | \_ | \_ | \_ | paired16 = (uint32x4_t)(input + (input >> 7))
2243*3f1979aaSAndroid Build Coastguard Worker // \| \| \| \|
2244*3f1979aaSAndroid Build Coastguard Worker // xx 03 xx 01 xx 00 xx 02
2245*3f1979aaSAndroid Build Coastguard Worker //
2246*3f1979aaSAndroid Build Coastguard Worker // 00000001 00000001 (01 01)
2247*3f1979aaSAndroid Build Coastguard Worker // \_______ |
2248*3f1979aaSAndroid Build Coastguard Worker // \|
2249*3f1979aaSAndroid Build Coastguard Worker // xxxxxxxx xxxxxx11 (xx 03)
2250*3f1979aaSAndroid Build Coastguard Worker uint32x4_t paired16 =
2251*3f1979aaSAndroid Build Coastguard Worker vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7));
2252*3f1979aaSAndroid Build Coastguard Worker
2253*3f1979aaSAndroid Build Coastguard Worker // Repeat with a wider 32-bit shift + add.
2254*3f1979aaSAndroid Build Coastguard Worker // xx 03 xx 01 xx 00 xx 02
2255*3f1979aaSAndroid Build Coastguard Worker // \____ | \____ | paired32 = (uint64x1_t)(paired16 + (paired16 >>
2256*3f1979aaSAndroid Build Coastguard Worker // 14))
2257*3f1979aaSAndroid Build Coastguard Worker // \| \|
2258*3f1979aaSAndroid Build Coastguard Worker // xx xx xx 0d xx xx xx 02
2259*3f1979aaSAndroid Build Coastguard Worker //
2260*3f1979aaSAndroid Build Coastguard Worker // 00000011 00000001 (03 01)
2261*3f1979aaSAndroid Build Coastguard Worker // \\_____ ||
2262*3f1979aaSAndroid Build Coastguard Worker // '----.\||
2263*3f1979aaSAndroid Build Coastguard Worker // xxxxxxxx xxxx1101 (xx 0d)
2264*3f1979aaSAndroid Build Coastguard Worker uint64x2_t paired32 =
2265*3f1979aaSAndroid Build Coastguard Worker vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));
2266*3f1979aaSAndroid Build Coastguard Worker
2267*3f1979aaSAndroid Build Coastguard Worker // Last, an even wider 64-bit shift + add to get our result in the low 8 bit
2268*3f1979aaSAndroid Build Coastguard Worker // lanes. xx xx xx 0d xx xx xx 02
2269*3f1979aaSAndroid Build Coastguard Worker // \_________ | paired64 = (uint8x8_t)(paired32 + (paired32 >>
2270*3f1979aaSAndroid Build Coastguard Worker // 28))
2271*3f1979aaSAndroid Build Coastguard Worker // \|
2272*3f1979aaSAndroid Build Coastguard Worker // xx xx xx xx xx xx xx d2
2273*3f1979aaSAndroid Build Coastguard Worker //
2274*3f1979aaSAndroid Build Coastguard Worker // 00001101 00000010 (0d 02)
2275*3f1979aaSAndroid Build Coastguard Worker // \ \___ | |
2276*3f1979aaSAndroid Build Coastguard Worker // '---. \| |
2277*3f1979aaSAndroid Build Coastguard Worker // xxxxxxxx 11010010 (xx d2)
2278*3f1979aaSAndroid Build Coastguard Worker uint8x16_t paired64 =
2279*3f1979aaSAndroid Build Coastguard Worker vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));
2280*3f1979aaSAndroid Build Coastguard Worker
2281*3f1979aaSAndroid Build Coastguard Worker // Extract the low 8 bits from each 64-bit lane with 2 8-bit extracts.
2282*3f1979aaSAndroid Build Coastguard Worker // xx xx xx xx xx xx xx d2
2283*3f1979aaSAndroid Build Coastguard Worker // || return paired64[0]
2284*3f1979aaSAndroid Build Coastguard Worker // d2
2285*3f1979aaSAndroid Build Coastguard Worker // Note: Little endian would return the correct value 4b (01001011) instead.
2286*3f1979aaSAndroid Build Coastguard Worker return vgetq_lane_u8(paired64, 0) | ((int) vgetq_lane_u8(paired64, 8) << 8);
2287*3f1979aaSAndroid Build Coastguard Worker #endif
2288*3f1979aaSAndroid Build Coastguard Worker }
2289*3f1979aaSAndroid Build Coastguard Worker
2290*3f1979aaSAndroid Build Coastguard Worker // Copy the lower 64-bit integer in a to dst.
2291*3f1979aaSAndroid Build Coastguard Worker //
2292*3f1979aaSAndroid Build Coastguard Worker // dst[63:0] := a[63:0]
2293*3f1979aaSAndroid Build Coastguard Worker //
2294*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movepi64_pi64
_mm_movepi64_pi64(__m128i a)2295*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m64 _mm_movepi64_pi64(__m128i a)
2296*3f1979aaSAndroid Build Coastguard Worker {
2297*3f1979aaSAndroid Build Coastguard Worker return vreinterpret_m64_s64(vget_low_s64(vreinterpretq_s64_m128i(a)));
2298*3f1979aaSAndroid Build Coastguard Worker }
2299*3f1979aaSAndroid Build Coastguard Worker
2300*3f1979aaSAndroid Build Coastguard Worker // Copy the 64-bit integer a to the lower element of dst, and zero the upper
2301*3f1979aaSAndroid Build Coastguard Worker // element.
2302*3f1979aaSAndroid Build Coastguard Worker //
2303*3f1979aaSAndroid Build Coastguard Worker // dst[63:0] := a[63:0]
2304*3f1979aaSAndroid Build Coastguard Worker // dst[127:64] := 0
2305*3f1979aaSAndroid Build Coastguard Worker //
2306*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movpi64_epi64
_mm_movpi64_epi64(__m64 a)2307*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_movpi64_epi64(__m64 a)
2308*3f1979aaSAndroid Build Coastguard Worker {
2309*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s64(
2310*3f1979aaSAndroid Build Coastguard Worker vcombine_s64(vreinterpret_s64_m64(a), vdup_n_s64(0)));
2311*3f1979aaSAndroid Build Coastguard Worker }
2312*3f1979aaSAndroid Build Coastguard Worker
2313*3f1979aaSAndroid Build Coastguard Worker // NEON does not provide this method
2314*3f1979aaSAndroid Build Coastguard Worker // Creates a 4-bit mask from the most significant bits of the four
2315*3f1979aaSAndroid Build Coastguard Worker // single-precision, floating-point values.
2316*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/vstudio/4490ys29(v=vs.100).aspx
_mm_movemask_ps(__m128 a)2317*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE int _mm_movemask_ps(__m128 a)
2318*3f1979aaSAndroid Build Coastguard Worker {
2319*3f1979aaSAndroid Build Coastguard Worker uint32x4_t input = vreinterpretq_u32_m128(a);
2320*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
2321*3f1979aaSAndroid Build Coastguard Worker static const int32x4_t shift = {0, 1, 2, 3};
2322*3f1979aaSAndroid Build Coastguard Worker uint32x4_t tmp = vshrq_n_u32(input, 31);
2323*3f1979aaSAndroid Build Coastguard Worker return vaddvq_u32(vshlq_u32(tmp, shift));
2324*3f1979aaSAndroid Build Coastguard Worker #else
2325*3f1979aaSAndroid Build Coastguard Worker // Uses the exact same method as _mm_movemask_epi8, see that for details.
2326*3f1979aaSAndroid Build Coastguard Worker // Shift out everything but the sign bits with a 32-bit unsigned shift
2327*3f1979aaSAndroid Build Coastguard Worker // right.
2328*3f1979aaSAndroid Build Coastguard Worker uint64x2_t high_bits = vreinterpretq_u64_u32(vshrq_n_u32(input, 31));
2329*3f1979aaSAndroid Build Coastguard Worker // Merge the two pairs together with a 64-bit unsigned shift right + add.
2330*3f1979aaSAndroid Build Coastguard Worker uint8x16_t paired =
2331*3f1979aaSAndroid Build Coastguard Worker vreinterpretq_u8_u64(vsraq_n_u64(high_bits, high_bits, 31));
2332*3f1979aaSAndroid Build Coastguard Worker // Extract the result.
2333*3f1979aaSAndroid Build Coastguard Worker return vgetq_lane_u8(paired, 0) | (vgetq_lane_u8(paired, 8) << 2);
2334*3f1979aaSAndroid Build Coastguard Worker #endif
2335*3f1979aaSAndroid Build Coastguard Worker }
2336*3f1979aaSAndroid Build Coastguard Worker
2337*3f1979aaSAndroid Build Coastguard Worker // Compute the bitwise NOT of a and then AND with a 128-bit vector containing
2338*3f1979aaSAndroid Build Coastguard Worker // all 1's, and return 1 if the result is zero, otherwise return 0.
2339*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_ones
_mm_test_all_ones(__m128i a)2340*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE int _mm_test_all_ones(__m128i a)
2341*3f1979aaSAndroid Build Coastguard Worker {
2342*3f1979aaSAndroid Build Coastguard Worker return (uint64_t)(vgetq_lane_s64(a, 0) & vgetq_lane_s64(a, 1)) ==
2343*3f1979aaSAndroid Build Coastguard Worker ~(uint64_t) 0;
2344*3f1979aaSAndroid Build Coastguard Worker }
2345*3f1979aaSAndroid Build Coastguard Worker
2346*3f1979aaSAndroid Build Coastguard Worker // Compute the bitwise AND of 128 bits (representing integer data) in a and
2347*3f1979aaSAndroid Build Coastguard Worker // mask, and return 1 if the result is zero, otherwise return 0.
2348*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_zeros
_mm_test_all_zeros(__m128i a,__m128i mask)2349*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE int _mm_test_all_zeros(__m128i a, __m128i mask)
2350*3f1979aaSAndroid Build Coastguard Worker {
2351*3f1979aaSAndroid Build Coastguard Worker int64x2_t a_and_mask =
2352*3f1979aaSAndroid Build Coastguard Worker vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(mask));
2353*3f1979aaSAndroid Build Coastguard Worker return (vgetq_lane_s64(a_and_mask, 0) | vgetq_lane_s64(a_and_mask, 1)) ? 0
2354*3f1979aaSAndroid Build Coastguard Worker : 1;
2355*3f1979aaSAndroid Build Coastguard Worker }
2356*3f1979aaSAndroid Build Coastguard Worker
2357*3f1979aaSAndroid Build Coastguard Worker /* Math operations */
2358*3f1979aaSAndroid Build Coastguard Worker
2359*3f1979aaSAndroid Build Coastguard Worker // Subtracts the four single-precision, floating-point values of a and b.
2360*3f1979aaSAndroid Build Coastguard Worker //
2361*3f1979aaSAndroid Build Coastguard Worker // r0 := a0 - b0
2362*3f1979aaSAndroid Build Coastguard Worker // r1 := a1 - b1
2363*3f1979aaSAndroid Build Coastguard Worker // r2 := a2 - b2
2364*3f1979aaSAndroid Build Coastguard Worker // r3 := a3 - b3
2365*3f1979aaSAndroid Build Coastguard Worker //
2366*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/vstudio/1zad2k61(v=vs.100).aspx
_mm_sub_ps(__m128 a,__m128 b)2367*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b)
2368*3f1979aaSAndroid Build Coastguard Worker {
2369*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128_f32(
2370*3f1979aaSAndroid Build Coastguard Worker vsubq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
2371*3f1979aaSAndroid Build Coastguard Worker }
2372*3f1979aaSAndroid Build Coastguard Worker
2373*3f1979aaSAndroid Build Coastguard Worker // Subtract the lower single-precision (32-bit) floating-point element in b from
2374*3f1979aaSAndroid Build Coastguard Worker // the lower single-precision (32-bit) floating-point element in a, store the
2375*3f1979aaSAndroid Build Coastguard Worker // result in the lower element of dst, and copy the upper 3 packed elements from
2376*3f1979aaSAndroid Build Coastguard Worker // a to the upper elements of dst.
2377*3f1979aaSAndroid Build Coastguard Worker //
2378*3f1979aaSAndroid Build Coastguard Worker // dst[31:0] := a[31:0] - b[31:0]
2379*3f1979aaSAndroid Build Coastguard Worker // dst[127:32] := a[127:32]
2380*3f1979aaSAndroid Build Coastguard Worker //
2381*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_ss
_mm_sub_ss(__m128 a,__m128 b)2382*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b)
2383*3f1979aaSAndroid Build Coastguard Worker {
2384*3f1979aaSAndroid Build Coastguard Worker return _mm_move_ss(a, _mm_sub_ps(a, b));
2385*3f1979aaSAndroid Build Coastguard Worker }
2386*3f1979aaSAndroid Build Coastguard Worker
2387*3f1979aaSAndroid Build Coastguard Worker // Subtract 2 packed 64-bit integers in b from 2 packed 64-bit integers in a,
2388*3f1979aaSAndroid Build Coastguard Worker // and store the results in dst.
2389*3f1979aaSAndroid Build Coastguard Worker // r0 := a0 - b0
2390*3f1979aaSAndroid Build Coastguard Worker // r1 := a1 - b1
_mm_sub_epi64(__m128i a,__m128i b)2391*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_sub_epi64(__m128i a, __m128i b)
2392*3f1979aaSAndroid Build Coastguard Worker {
2393*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s64(
2394*3f1979aaSAndroid Build Coastguard Worker vsubq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
2395*3f1979aaSAndroid Build Coastguard Worker }
2396*3f1979aaSAndroid Build Coastguard Worker
2397*3f1979aaSAndroid Build Coastguard Worker // Subtracts the 4 signed or unsigned 32-bit integers of b from the 4 signed or
2398*3f1979aaSAndroid Build Coastguard Worker // unsigned 32-bit integers of a.
2399*3f1979aaSAndroid Build Coastguard Worker //
2400*3f1979aaSAndroid Build Coastguard Worker // r0 := a0 - b0
2401*3f1979aaSAndroid Build Coastguard Worker // r1 := a1 - b1
2402*3f1979aaSAndroid Build Coastguard Worker // r2 := a2 - b2
2403*3f1979aaSAndroid Build Coastguard Worker // r3 := a3 - b3
2404*3f1979aaSAndroid Build Coastguard Worker //
2405*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/vstudio/fhh866h0(v=vs.100).aspx
_mm_sub_epi32(__m128i a,__m128i b)2406*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_sub_epi32(__m128i a, __m128i b)
2407*3f1979aaSAndroid Build Coastguard Worker {
2408*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s32(
2409*3f1979aaSAndroid Build Coastguard Worker vsubq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
2410*3f1979aaSAndroid Build Coastguard Worker }
2411*3f1979aaSAndroid Build Coastguard Worker
_mm_sub_epi16(__m128i a,__m128i b)2412*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_sub_epi16(__m128i a, __m128i b)
2413*3f1979aaSAndroid Build Coastguard Worker {
2414*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s16(
2415*3f1979aaSAndroid Build Coastguard Worker vsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
2416*3f1979aaSAndroid Build Coastguard Worker }
2417*3f1979aaSAndroid Build Coastguard Worker
_mm_sub_epi8(__m128i a,__m128i b)2418*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_sub_epi8(__m128i a, __m128i b)
2419*3f1979aaSAndroid Build Coastguard Worker {
2420*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s8(
2421*3f1979aaSAndroid Build Coastguard Worker vsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
2422*3f1979aaSAndroid Build Coastguard Worker }
2423*3f1979aaSAndroid Build Coastguard Worker
2424*3f1979aaSAndroid Build Coastguard Worker // Subtract 64-bit integer b from 64-bit integer a, and store the result in dst.
2425*3f1979aaSAndroid Build Coastguard Worker //
2426*3f1979aaSAndroid Build Coastguard Worker // dst[63:0] := a[63:0] - b[63:0]
2427*3f1979aaSAndroid Build Coastguard Worker //
2428*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_si64
_mm_sub_si64(__m64 a,__m64 b)2429*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m64 _mm_sub_si64(__m64 a, __m64 b)
2430*3f1979aaSAndroid Build Coastguard Worker {
2431*3f1979aaSAndroid Build Coastguard Worker return vreinterpret_m64_s64(
2432*3f1979aaSAndroid Build Coastguard Worker vsub_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b)));
2433*3f1979aaSAndroid Build Coastguard Worker }
2434*3f1979aaSAndroid Build Coastguard Worker
2435*3f1979aaSAndroid Build Coastguard Worker // Subtracts the 8 unsigned 16-bit integers of bfrom the 8 unsigned 16-bit
2436*3f1979aaSAndroid Build Coastguard Worker // integers of a and saturates..
2437*3f1979aaSAndroid Build Coastguard Worker // https://technet.microsoft.com/en-us/subscriptions/index/f44y0s19(v=vs.90).aspx
_mm_subs_epu16(__m128i a,__m128i b)2438*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_subs_epu16(__m128i a, __m128i b)
2439*3f1979aaSAndroid Build Coastguard Worker {
2440*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_u16(
2441*3f1979aaSAndroid Build Coastguard Worker vqsubq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
2442*3f1979aaSAndroid Build Coastguard Worker }
2443*3f1979aaSAndroid Build Coastguard Worker
2444*3f1979aaSAndroid Build Coastguard Worker // Subtracts the 16 unsigned 8-bit integers of b from the 16 unsigned 8-bit
2445*3f1979aaSAndroid Build Coastguard Worker // integers of a and saturates.
2446*3f1979aaSAndroid Build Coastguard Worker //
2447*3f1979aaSAndroid Build Coastguard Worker // r0 := UnsignedSaturate(a0 - b0)
2448*3f1979aaSAndroid Build Coastguard Worker // r1 := UnsignedSaturate(a1 - b1)
2449*3f1979aaSAndroid Build Coastguard Worker // ...
2450*3f1979aaSAndroid Build Coastguard Worker // r15 := UnsignedSaturate(a15 - b15)
2451*3f1979aaSAndroid Build Coastguard Worker //
2452*3f1979aaSAndroid Build Coastguard Worker // https://technet.microsoft.com/en-us/subscriptions/yadkxc18(v=vs.90)
_mm_subs_epu8(__m128i a,__m128i b)2453*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_subs_epu8(__m128i a, __m128i b)
2454*3f1979aaSAndroid Build Coastguard Worker {
2455*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_u8(
2456*3f1979aaSAndroid Build Coastguard Worker vqsubq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
2457*3f1979aaSAndroid Build Coastguard Worker }
2458*3f1979aaSAndroid Build Coastguard Worker
2459*3f1979aaSAndroid Build Coastguard Worker // Subtracts the 16 signed 8-bit integers of b from the 16 signed 8-bit integers
2460*3f1979aaSAndroid Build Coastguard Worker // of a and saturates.
2461*3f1979aaSAndroid Build Coastguard Worker //
2462*3f1979aaSAndroid Build Coastguard Worker // r0 := SignedSaturate(a0 - b0)
2463*3f1979aaSAndroid Build Coastguard Worker // r1 := SignedSaturate(a1 - b1)
2464*3f1979aaSAndroid Build Coastguard Worker // ...
2465*3f1979aaSAndroid Build Coastguard Worker // r15 := SignedSaturate(a15 - b15)
2466*3f1979aaSAndroid Build Coastguard Worker //
2467*3f1979aaSAndroid Build Coastguard Worker // https://technet.microsoft.com/en-us/subscriptions/by7kzks1(v=vs.90)
_mm_subs_epi8(__m128i a,__m128i b)2468*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_subs_epi8(__m128i a, __m128i b)
2469*3f1979aaSAndroid Build Coastguard Worker {
2470*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s8(
2471*3f1979aaSAndroid Build Coastguard Worker vqsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
2472*3f1979aaSAndroid Build Coastguard Worker }
2473*3f1979aaSAndroid Build Coastguard Worker
2474*3f1979aaSAndroid Build Coastguard Worker // Subtracts the 8 signed 16-bit integers of b from the 8 signed 16-bit integers
2475*3f1979aaSAndroid Build Coastguard Worker // of a and saturates.
2476*3f1979aaSAndroid Build Coastguard Worker //
2477*3f1979aaSAndroid Build Coastguard Worker // r0 := SignedSaturate(a0 - b0)
2478*3f1979aaSAndroid Build Coastguard Worker // r1 := SignedSaturate(a1 - b1)
2479*3f1979aaSAndroid Build Coastguard Worker // ...
2480*3f1979aaSAndroid Build Coastguard Worker // r7 := SignedSaturate(a7 - b7)
2481*3f1979aaSAndroid Build Coastguard Worker //
2482*3f1979aaSAndroid Build Coastguard Worker // https://technet.microsoft.com/en-us/subscriptions/3247z5b8(v=vs.90)
_mm_subs_epi16(__m128i a,__m128i b)2483*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_subs_epi16(__m128i a, __m128i b)
2484*3f1979aaSAndroid Build Coastguard Worker {
2485*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s16(
2486*3f1979aaSAndroid Build Coastguard Worker vqsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
2487*3f1979aaSAndroid Build Coastguard Worker }
2488*3f1979aaSAndroid Build Coastguard Worker
_mm_adds_epu16(__m128i a,__m128i b)2489*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_adds_epu16(__m128i a, __m128i b)
2490*3f1979aaSAndroid Build Coastguard Worker {
2491*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_u16(
2492*3f1979aaSAndroid Build Coastguard Worker vqaddq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
2493*3f1979aaSAndroid Build Coastguard Worker }
2494*3f1979aaSAndroid Build Coastguard Worker
2495*3f1979aaSAndroid Build Coastguard Worker // Negate packed 8-bit integers in a when the corresponding signed
2496*3f1979aaSAndroid Build Coastguard Worker // 8-bit integer in b is negative, and store the results in dst.
2497*3f1979aaSAndroid Build Coastguard Worker // Element in dst are zeroed out when the corresponding element
2498*3f1979aaSAndroid Build Coastguard Worker // in b is zero.
2499*3f1979aaSAndroid Build Coastguard Worker //
2500*3f1979aaSAndroid Build Coastguard Worker // for i in 0..15
2501*3f1979aaSAndroid Build Coastguard Worker // if b[i] < 0
2502*3f1979aaSAndroid Build Coastguard Worker // r[i] := -a[i]
2503*3f1979aaSAndroid Build Coastguard Worker // else if b[i] == 0
2504*3f1979aaSAndroid Build Coastguard Worker // r[i] := 0
2505*3f1979aaSAndroid Build Coastguard Worker // else
2506*3f1979aaSAndroid Build Coastguard Worker // r[i] := a[i]
2507*3f1979aaSAndroid Build Coastguard Worker // fi
2508*3f1979aaSAndroid Build Coastguard Worker // done
_mm_sign_epi8(__m128i _a,__m128i _b)2509*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_sign_epi8(__m128i _a, __m128i _b)
2510*3f1979aaSAndroid Build Coastguard Worker {
2511*3f1979aaSAndroid Build Coastguard Worker int8x16_t a = vreinterpretq_s8_m128i(_a);
2512*3f1979aaSAndroid Build Coastguard Worker int8x16_t b = vreinterpretq_s8_m128i(_b);
2513*3f1979aaSAndroid Build Coastguard Worker
2514*3f1979aaSAndroid Build Coastguard Worker // signed shift right: faster than vclt
2515*3f1979aaSAndroid Build Coastguard Worker // (b < 0) ? 0xFF : 0
2516*3f1979aaSAndroid Build Coastguard Worker uint8x16_t ltMask = vreinterpretq_u8_s8(vshrq_n_s8(b, 7));
2517*3f1979aaSAndroid Build Coastguard Worker
2518*3f1979aaSAndroid Build Coastguard Worker // (b == 0) ? 0xFF : 0
2519*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
2520*3f1979aaSAndroid Build Coastguard Worker int8x16_t zeroMask = vreinterpretq_s8_u8(vceqzq_s8(b));
2521*3f1979aaSAndroid Build Coastguard Worker #else
2522*3f1979aaSAndroid Build Coastguard Worker int8x16_t zeroMask = vreinterpretq_s8_u8(vceqq_s8(b, vdupq_n_s8(0)));
2523*3f1979aaSAndroid Build Coastguard Worker #endif
2524*3f1979aaSAndroid Build Coastguard Worker
2525*3f1979aaSAndroid Build Coastguard Worker // bitwise select either a or nagative 'a' (vnegq_s8(a) return nagative 'a')
2526*3f1979aaSAndroid Build Coastguard Worker // based on ltMask
2527*3f1979aaSAndroid Build Coastguard Worker int8x16_t masked = vbslq_s8(ltMask, vnegq_s8(a), a);
2528*3f1979aaSAndroid Build Coastguard Worker // res = masked & (~zeroMask)
2529*3f1979aaSAndroid Build Coastguard Worker int8x16_t res = vbicq_s8(masked, zeroMask);
2530*3f1979aaSAndroid Build Coastguard Worker
2531*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s8(res);
2532*3f1979aaSAndroid Build Coastguard Worker }
2533*3f1979aaSAndroid Build Coastguard Worker
2534*3f1979aaSAndroid Build Coastguard Worker // Negate packed 16-bit integers in a when the corresponding signed
2535*3f1979aaSAndroid Build Coastguard Worker // 16-bit integer in b is negative, and store the results in dst.
2536*3f1979aaSAndroid Build Coastguard Worker // Element in dst are zeroed out when the corresponding element
2537*3f1979aaSAndroid Build Coastguard Worker // in b is zero.
2538*3f1979aaSAndroid Build Coastguard Worker //
2539*3f1979aaSAndroid Build Coastguard Worker // for i in 0..7
2540*3f1979aaSAndroid Build Coastguard Worker // if b[i] < 0
2541*3f1979aaSAndroid Build Coastguard Worker // r[i] := -a[i]
2542*3f1979aaSAndroid Build Coastguard Worker // else if b[i] == 0
2543*3f1979aaSAndroid Build Coastguard Worker // r[i] := 0
2544*3f1979aaSAndroid Build Coastguard Worker // else
2545*3f1979aaSAndroid Build Coastguard Worker // r[i] := a[i]
2546*3f1979aaSAndroid Build Coastguard Worker // fi
2547*3f1979aaSAndroid Build Coastguard Worker // done
_mm_sign_epi16(__m128i _a,__m128i _b)2548*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_sign_epi16(__m128i _a, __m128i _b)
2549*3f1979aaSAndroid Build Coastguard Worker {
2550*3f1979aaSAndroid Build Coastguard Worker int16x8_t a = vreinterpretq_s16_m128i(_a);
2551*3f1979aaSAndroid Build Coastguard Worker int16x8_t b = vreinterpretq_s16_m128i(_b);
2552*3f1979aaSAndroid Build Coastguard Worker
2553*3f1979aaSAndroid Build Coastguard Worker // signed shift right: faster than vclt
2554*3f1979aaSAndroid Build Coastguard Worker // (b < 0) ? 0xFFFF : 0
2555*3f1979aaSAndroid Build Coastguard Worker uint16x8_t ltMask = vreinterpretq_u16_s16(vshrq_n_s16(b, 15));
2556*3f1979aaSAndroid Build Coastguard Worker // (b == 0) ? 0xFFFF : 0
2557*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
2558*3f1979aaSAndroid Build Coastguard Worker int16x8_t zeroMask = vreinterpretq_s16_u16(vceqzq_s16(b));
2559*3f1979aaSAndroid Build Coastguard Worker #else
2560*3f1979aaSAndroid Build Coastguard Worker int16x8_t zeroMask = vreinterpretq_s16_u16(vceqq_s16(b, vdupq_n_s16(0)));
2561*3f1979aaSAndroid Build Coastguard Worker #endif
2562*3f1979aaSAndroid Build Coastguard Worker
2563*3f1979aaSAndroid Build Coastguard Worker // bitwise select either a or negative 'a' (vnegq_s16(a) equals to negative
2564*3f1979aaSAndroid Build Coastguard Worker // 'a') based on ltMask
2565*3f1979aaSAndroid Build Coastguard Worker int16x8_t masked = vbslq_s16(ltMask, vnegq_s16(a), a);
2566*3f1979aaSAndroid Build Coastguard Worker // res = masked & (~zeroMask)
2567*3f1979aaSAndroid Build Coastguard Worker int16x8_t res = vbicq_s16(masked, zeroMask);
2568*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s16(res);
2569*3f1979aaSAndroid Build Coastguard Worker }
2570*3f1979aaSAndroid Build Coastguard Worker
2571*3f1979aaSAndroid Build Coastguard Worker // Negate packed 32-bit integers in a when the corresponding signed
2572*3f1979aaSAndroid Build Coastguard Worker // 32-bit integer in b is negative, and store the results in dst.
2573*3f1979aaSAndroid Build Coastguard Worker // Element in dst are zeroed out when the corresponding element
2574*3f1979aaSAndroid Build Coastguard Worker // in b is zero.
2575*3f1979aaSAndroid Build Coastguard Worker //
2576*3f1979aaSAndroid Build Coastguard Worker // for i in 0..3
2577*3f1979aaSAndroid Build Coastguard Worker // if b[i] < 0
2578*3f1979aaSAndroid Build Coastguard Worker // r[i] := -a[i]
2579*3f1979aaSAndroid Build Coastguard Worker // else if b[i] == 0
2580*3f1979aaSAndroid Build Coastguard Worker // r[i] := 0
2581*3f1979aaSAndroid Build Coastguard Worker // else
2582*3f1979aaSAndroid Build Coastguard Worker // r[i] := a[i]
2583*3f1979aaSAndroid Build Coastguard Worker // fi
2584*3f1979aaSAndroid Build Coastguard Worker // done
_mm_sign_epi32(__m128i _a,__m128i _b)2585*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_sign_epi32(__m128i _a, __m128i _b)
2586*3f1979aaSAndroid Build Coastguard Worker {
2587*3f1979aaSAndroid Build Coastguard Worker int32x4_t a = vreinterpretq_s32_m128i(_a);
2588*3f1979aaSAndroid Build Coastguard Worker int32x4_t b = vreinterpretq_s32_m128i(_b);
2589*3f1979aaSAndroid Build Coastguard Worker
2590*3f1979aaSAndroid Build Coastguard Worker // signed shift right: faster than vclt
2591*3f1979aaSAndroid Build Coastguard Worker // (b < 0) ? 0xFFFFFFFF : 0
2592*3f1979aaSAndroid Build Coastguard Worker uint32x4_t ltMask = vreinterpretq_u32_s32(vshrq_n_s32(b, 31));
2593*3f1979aaSAndroid Build Coastguard Worker
2594*3f1979aaSAndroid Build Coastguard Worker // (b == 0) ? 0xFFFFFFFF : 0
2595*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
2596*3f1979aaSAndroid Build Coastguard Worker int32x4_t zeroMask = vreinterpretq_s32_u32(vceqzq_s32(b));
2597*3f1979aaSAndroid Build Coastguard Worker #else
2598*3f1979aaSAndroid Build Coastguard Worker int32x4_t zeroMask = vreinterpretq_s32_u32(vceqq_s32(b, vdupq_n_s32(0)));
2599*3f1979aaSAndroid Build Coastguard Worker #endif
2600*3f1979aaSAndroid Build Coastguard Worker
2601*3f1979aaSAndroid Build Coastguard Worker // bitwise select either a or negative 'a' (vnegq_s32(a) equals to negative
2602*3f1979aaSAndroid Build Coastguard Worker // 'a') based on ltMask
2603*3f1979aaSAndroid Build Coastguard Worker int32x4_t masked = vbslq_s32(ltMask, vnegq_s32(a), a);
2604*3f1979aaSAndroid Build Coastguard Worker // res = masked & (~zeroMask)
2605*3f1979aaSAndroid Build Coastguard Worker int32x4_t res = vbicq_s32(masked, zeroMask);
2606*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s32(res);
2607*3f1979aaSAndroid Build Coastguard Worker }
2608*3f1979aaSAndroid Build Coastguard Worker
2609*3f1979aaSAndroid Build Coastguard Worker // Negate packed 16-bit integers in a when the corresponding signed 16-bit
2610*3f1979aaSAndroid Build Coastguard Worker // integer in b is negative, and store the results in dst. Element in dst are
2611*3f1979aaSAndroid Build Coastguard Worker // zeroed out when the corresponding element in b is zero.
2612*3f1979aaSAndroid Build Coastguard Worker //
2613*3f1979aaSAndroid Build Coastguard Worker // FOR j := 0 to 3
2614*3f1979aaSAndroid Build Coastguard Worker // i := j*16
2615*3f1979aaSAndroid Build Coastguard Worker // IF b[i+15:i] < 0
2616*3f1979aaSAndroid Build Coastguard Worker // dst[i+15:i] := -(a[i+15:i])
2617*3f1979aaSAndroid Build Coastguard Worker // ELSE IF b[i+15:i] == 0
2618*3f1979aaSAndroid Build Coastguard Worker // dst[i+15:i] := 0
2619*3f1979aaSAndroid Build Coastguard Worker // ELSE
2620*3f1979aaSAndroid Build Coastguard Worker // dst[i+15:i] := a[i+15:i]
2621*3f1979aaSAndroid Build Coastguard Worker // FI
2622*3f1979aaSAndroid Build Coastguard Worker // ENDFOR
2623*3f1979aaSAndroid Build Coastguard Worker //
2624*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi16
_mm_sign_pi16(__m64 _a,__m64 _b)2625*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m64 _mm_sign_pi16(__m64 _a, __m64 _b)
2626*3f1979aaSAndroid Build Coastguard Worker {
2627*3f1979aaSAndroid Build Coastguard Worker int16x4_t a = vreinterpret_s16_m64(_a);
2628*3f1979aaSAndroid Build Coastguard Worker int16x4_t b = vreinterpret_s16_m64(_b);
2629*3f1979aaSAndroid Build Coastguard Worker
2630*3f1979aaSAndroid Build Coastguard Worker // signed shift right: faster than vclt
2631*3f1979aaSAndroid Build Coastguard Worker // (b < 0) ? 0xFFFF : 0
2632*3f1979aaSAndroid Build Coastguard Worker uint16x4_t ltMask = vreinterpret_u16_s16(vshr_n_s16(b, 15));
2633*3f1979aaSAndroid Build Coastguard Worker
2634*3f1979aaSAndroid Build Coastguard Worker // (b == 0) ? 0xFFFF : 0
2635*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
2636*3f1979aaSAndroid Build Coastguard Worker int16x4_t zeroMask = vreinterpret_s16_u16(vceqz_s16(b));
2637*3f1979aaSAndroid Build Coastguard Worker #else
2638*3f1979aaSAndroid Build Coastguard Worker int16x4_t zeroMask = vreinterpret_s16_u16(vceq_s16(b, vdup_n_s16(0)));
2639*3f1979aaSAndroid Build Coastguard Worker #endif
2640*3f1979aaSAndroid Build Coastguard Worker
2641*3f1979aaSAndroid Build Coastguard Worker // bitwise select either a or nagative 'a' (vneg_s16(a) return nagative 'a')
2642*3f1979aaSAndroid Build Coastguard Worker // based on ltMask
2643*3f1979aaSAndroid Build Coastguard Worker int16x4_t masked = vbsl_s16(ltMask, vneg_s16(a), a);
2644*3f1979aaSAndroid Build Coastguard Worker // res = masked & (~zeroMask)
2645*3f1979aaSAndroid Build Coastguard Worker int16x4_t res = vbic_s16(masked, zeroMask);
2646*3f1979aaSAndroid Build Coastguard Worker
2647*3f1979aaSAndroid Build Coastguard Worker return vreinterpret_m64_s16(res);
2648*3f1979aaSAndroid Build Coastguard Worker }
2649*3f1979aaSAndroid Build Coastguard Worker
2650*3f1979aaSAndroid Build Coastguard Worker // Negate packed 32-bit integers in a when the corresponding signed 32-bit
2651*3f1979aaSAndroid Build Coastguard Worker // integer in b is negative, and store the results in dst. Element in dst are
2652*3f1979aaSAndroid Build Coastguard Worker // zeroed out when the corresponding element in b is zero.
2653*3f1979aaSAndroid Build Coastguard Worker //
2654*3f1979aaSAndroid Build Coastguard Worker // FOR j := 0 to 1
2655*3f1979aaSAndroid Build Coastguard Worker // i := j*32
2656*3f1979aaSAndroid Build Coastguard Worker // IF b[i+31:i] < 0
2657*3f1979aaSAndroid Build Coastguard Worker // dst[i+31:i] := -(a[i+31:i])
2658*3f1979aaSAndroid Build Coastguard Worker // ELSE IF b[i+31:i] == 0
2659*3f1979aaSAndroid Build Coastguard Worker // dst[i+31:i] := 0
2660*3f1979aaSAndroid Build Coastguard Worker // ELSE
2661*3f1979aaSAndroid Build Coastguard Worker // dst[i+31:i] := a[i+31:i]
2662*3f1979aaSAndroid Build Coastguard Worker // FI
2663*3f1979aaSAndroid Build Coastguard Worker // ENDFOR
2664*3f1979aaSAndroid Build Coastguard Worker //
2665*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi32
_mm_sign_pi32(__m64 _a,__m64 _b)2666*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m64 _mm_sign_pi32(__m64 _a, __m64 _b)
2667*3f1979aaSAndroid Build Coastguard Worker {
2668*3f1979aaSAndroid Build Coastguard Worker int32x2_t a = vreinterpret_s32_m64(_a);
2669*3f1979aaSAndroid Build Coastguard Worker int32x2_t b = vreinterpret_s32_m64(_b);
2670*3f1979aaSAndroid Build Coastguard Worker
2671*3f1979aaSAndroid Build Coastguard Worker // signed shift right: faster than vclt
2672*3f1979aaSAndroid Build Coastguard Worker // (b < 0) ? 0xFFFFFFFF : 0
2673*3f1979aaSAndroid Build Coastguard Worker uint32x2_t ltMask = vreinterpret_u32_s32(vshr_n_s32(b, 31));
2674*3f1979aaSAndroid Build Coastguard Worker
2675*3f1979aaSAndroid Build Coastguard Worker // (b == 0) ? 0xFFFFFFFF : 0
2676*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
2677*3f1979aaSAndroid Build Coastguard Worker int32x2_t zeroMask = vreinterpret_s32_u32(vceqz_s32(b));
2678*3f1979aaSAndroid Build Coastguard Worker #else
2679*3f1979aaSAndroid Build Coastguard Worker int32x2_t zeroMask = vreinterpret_s32_u32(vceq_s32(b, vdup_n_s32(0)));
2680*3f1979aaSAndroid Build Coastguard Worker #endif
2681*3f1979aaSAndroid Build Coastguard Worker
2682*3f1979aaSAndroid Build Coastguard Worker // bitwise select either a or nagative 'a' (vneg_s32(a) return nagative 'a')
2683*3f1979aaSAndroid Build Coastguard Worker // based on ltMask
2684*3f1979aaSAndroid Build Coastguard Worker int32x2_t masked = vbsl_s32(ltMask, vneg_s32(a), a);
2685*3f1979aaSAndroid Build Coastguard Worker // res = masked & (~zeroMask)
2686*3f1979aaSAndroid Build Coastguard Worker int32x2_t res = vbic_s32(masked, zeroMask);
2687*3f1979aaSAndroid Build Coastguard Worker
2688*3f1979aaSAndroid Build Coastguard Worker return vreinterpret_m64_s32(res);
2689*3f1979aaSAndroid Build Coastguard Worker }
2690*3f1979aaSAndroid Build Coastguard Worker
2691*3f1979aaSAndroid Build Coastguard Worker // Negate packed 8-bit integers in a when the corresponding signed 8-bit integer
2692*3f1979aaSAndroid Build Coastguard Worker // in b is negative, and store the results in dst. Element in dst are zeroed out
2693*3f1979aaSAndroid Build Coastguard Worker // when the corresponding element in b is zero.
2694*3f1979aaSAndroid Build Coastguard Worker //
2695*3f1979aaSAndroid Build Coastguard Worker // FOR j := 0 to 7
2696*3f1979aaSAndroid Build Coastguard Worker // i := j*8
2697*3f1979aaSAndroid Build Coastguard Worker // IF b[i+7:i] < 0
2698*3f1979aaSAndroid Build Coastguard Worker // dst[i+7:i] := -(a[i+7:i])
2699*3f1979aaSAndroid Build Coastguard Worker // ELSE IF b[i+7:i] == 0
2700*3f1979aaSAndroid Build Coastguard Worker // dst[i+7:i] := 0
2701*3f1979aaSAndroid Build Coastguard Worker // ELSE
2702*3f1979aaSAndroid Build Coastguard Worker // dst[i+7:i] := a[i+7:i]
2703*3f1979aaSAndroid Build Coastguard Worker // FI
2704*3f1979aaSAndroid Build Coastguard Worker // ENDFOR
2705*3f1979aaSAndroid Build Coastguard Worker //
2706*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi8
_mm_sign_pi8(__m64 _a,__m64 _b)2707*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b)
2708*3f1979aaSAndroid Build Coastguard Worker {
2709*3f1979aaSAndroid Build Coastguard Worker int8x8_t a = vreinterpret_s8_m64(_a);
2710*3f1979aaSAndroid Build Coastguard Worker int8x8_t b = vreinterpret_s8_m64(_b);
2711*3f1979aaSAndroid Build Coastguard Worker
2712*3f1979aaSAndroid Build Coastguard Worker // signed shift right: faster than vclt
2713*3f1979aaSAndroid Build Coastguard Worker // (b < 0) ? 0xFF : 0
2714*3f1979aaSAndroid Build Coastguard Worker uint8x8_t ltMask = vreinterpret_u8_s8(vshr_n_s8(b, 7));
2715*3f1979aaSAndroid Build Coastguard Worker
2716*3f1979aaSAndroid Build Coastguard Worker // (b == 0) ? 0xFF : 0
2717*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
2718*3f1979aaSAndroid Build Coastguard Worker int8x8_t zeroMask = vreinterpret_s8_u8(vceqz_s8(b));
2719*3f1979aaSAndroid Build Coastguard Worker #else
2720*3f1979aaSAndroid Build Coastguard Worker int8x8_t zeroMask = vreinterpret_s8_u8(vceq_s8(b, vdup_n_s8(0)));
2721*3f1979aaSAndroid Build Coastguard Worker #endif
2722*3f1979aaSAndroid Build Coastguard Worker
2723*3f1979aaSAndroid Build Coastguard Worker // bitwise select either a or nagative 'a' (vneg_s8(a) return nagative 'a')
2724*3f1979aaSAndroid Build Coastguard Worker // based on ltMask
2725*3f1979aaSAndroid Build Coastguard Worker int8x8_t masked = vbsl_s8(ltMask, vneg_s8(a), a);
2726*3f1979aaSAndroid Build Coastguard Worker // res = masked & (~zeroMask)
2727*3f1979aaSAndroid Build Coastguard Worker int8x8_t res = vbic_s8(masked, zeroMask);
2728*3f1979aaSAndroid Build Coastguard Worker
2729*3f1979aaSAndroid Build Coastguard Worker return vreinterpret_m64_s8(res);
2730*3f1979aaSAndroid Build Coastguard Worker }
2731*3f1979aaSAndroid Build Coastguard Worker
2732*3f1979aaSAndroid Build Coastguard Worker // Average packed unsigned 16-bit integers in a and b, and store the results in
2733*3f1979aaSAndroid Build Coastguard Worker // dst.
2734*3f1979aaSAndroid Build Coastguard Worker //
2735*3f1979aaSAndroid Build Coastguard Worker // FOR j := 0 to 3
2736*3f1979aaSAndroid Build Coastguard Worker // i := j*16
2737*3f1979aaSAndroid Build Coastguard Worker // dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1
2738*3f1979aaSAndroid Build Coastguard Worker // ENDFOR
2739*3f1979aaSAndroid Build Coastguard Worker //
2740*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_pu16
_mm_avg_pu16(__m64 a,__m64 b)2741*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m64 _mm_avg_pu16(__m64 a, __m64 b)
2742*3f1979aaSAndroid Build Coastguard Worker {
2743*3f1979aaSAndroid Build Coastguard Worker return vreinterpret_m64_u16(
2744*3f1979aaSAndroid Build Coastguard Worker vrhadd_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)));
2745*3f1979aaSAndroid Build Coastguard Worker }
2746*3f1979aaSAndroid Build Coastguard Worker
2747*3f1979aaSAndroid Build Coastguard Worker // Average packed unsigned 8-bit integers in a and b, and store the results in
2748*3f1979aaSAndroid Build Coastguard Worker // dst.
2749*3f1979aaSAndroid Build Coastguard Worker //
2750*3f1979aaSAndroid Build Coastguard Worker // FOR j := 0 to 7
2751*3f1979aaSAndroid Build Coastguard Worker // i := j*8
2752*3f1979aaSAndroid Build Coastguard Worker // dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1
2753*3f1979aaSAndroid Build Coastguard Worker // ENDFOR
2754*3f1979aaSAndroid Build Coastguard Worker //
2755*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_pu8
_mm_avg_pu8(__m64 a,__m64 b)2756*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m64 _mm_avg_pu8(__m64 a, __m64 b)
2757*3f1979aaSAndroid Build Coastguard Worker {
2758*3f1979aaSAndroid Build Coastguard Worker return vreinterpret_m64_u8(
2759*3f1979aaSAndroid Build Coastguard Worker vrhadd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
2760*3f1979aaSAndroid Build Coastguard Worker }
2761*3f1979aaSAndroid Build Coastguard Worker
// Alias for _mm_avg_pu8: rounded average of the packed unsigned 8-bit integers
// in a and b.
//
//   FOR j := 0 to 7
//     i := j*8
//     dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1
//   ENDFOR
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pavgb
#define _m_pavgb(a, b) _mm_avg_pu8((a), (b))
2772*3f1979aaSAndroid Build Coastguard Worker
// Alias for _mm_avg_pu16: rounded average of the packed unsigned 16-bit
// integers in a and b.
//
//   FOR j := 0 to 3
//     i := j*16
//     dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1
//   ENDFOR
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pavgw
#define _m_pavgw(a, b) _mm_avg_pu16((a), (b))
2783*3f1979aaSAndroid Build Coastguard Worker
2784*3f1979aaSAndroid Build Coastguard Worker // Computes the average of the 16 unsigned 8-bit integers in a and the 16
2785*3f1979aaSAndroid Build Coastguard Worker // unsigned 8-bit integers in b and rounds.
2786*3f1979aaSAndroid Build Coastguard Worker //
2787*3f1979aaSAndroid Build Coastguard Worker // r0 := (a0 + b0) / 2
2788*3f1979aaSAndroid Build Coastguard Worker // r1 := (a1 + b1) / 2
2789*3f1979aaSAndroid Build Coastguard Worker // ...
2790*3f1979aaSAndroid Build Coastguard Worker // r15 := (a15 + b15) / 2
2791*3f1979aaSAndroid Build Coastguard Worker //
2792*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/vstudio/8zwh554a(v%3dvs.90).aspx
_mm_avg_epu8(__m128i a,__m128i b)2793*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_avg_epu8(__m128i a, __m128i b)
2794*3f1979aaSAndroid Build Coastguard Worker {
2795*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_u8(
2796*3f1979aaSAndroid Build Coastguard Worker vrhaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
2797*3f1979aaSAndroid Build Coastguard Worker }
2798*3f1979aaSAndroid Build Coastguard Worker
2799*3f1979aaSAndroid Build Coastguard Worker // Computes the average of the 8 unsigned 16-bit integers in a and the 8
2800*3f1979aaSAndroid Build Coastguard Worker // unsigned 16-bit integers in b and rounds.
2801*3f1979aaSAndroid Build Coastguard Worker //
2802*3f1979aaSAndroid Build Coastguard Worker // r0 := (a0 + b0) / 2
2803*3f1979aaSAndroid Build Coastguard Worker // r1 := (a1 + b1) / 2
2804*3f1979aaSAndroid Build Coastguard Worker // ...
2805*3f1979aaSAndroid Build Coastguard Worker // r7 := (a7 + b7) / 2
2806*3f1979aaSAndroid Build Coastguard Worker //
2807*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/vstudio/y13ca3c8(v=vs.90).aspx
_mm_avg_epu16(__m128i a,__m128i b)2808*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_avg_epu16(__m128i a, __m128i b)
2809*3f1979aaSAndroid Build Coastguard Worker {
2810*3f1979aaSAndroid Build Coastguard Worker return (__m128i) vrhaddq_u16(vreinterpretq_u16_m128i(a),
2811*3f1979aaSAndroid Build Coastguard Worker vreinterpretq_u16_m128i(b));
2812*3f1979aaSAndroid Build Coastguard Worker }
2813*3f1979aaSAndroid Build Coastguard Worker
2814*3f1979aaSAndroid Build Coastguard Worker // Adds the four single-precision, floating-point values of a and b.
2815*3f1979aaSAndroid Build Coastguard Worker //
2816*3f1979aaSAndroid Build Coastguard Worker // r0 := a0 + b0
2817*3f1979aaSAndroid Build Coastguard Worker // r1 := a1 + b1
2818*3f1979aaSAndroid Build Coastguard Worker // r2 := a2 + b2
2819*3f1979aaSAndroid Build Coastguard Worker // r3 := a3 + b3
2820*3f1979aaSAndroid Build Coastguard Worker //
2821*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/vstudio/c9848chc(v=vs.100).aspx
_mm_add_ps(__m128 a,__m128 b)2822*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
2823*3f1979aaSAndroid Build Coastguard Worker {
2824*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128_f32(
2825*3f1979aaSAndroid Build Coastguard Worker vaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
2826*3f1979aaSAndroid Build Coastguard Worker }
2827*3f1979aaSAndroid Build Coastguard Worker
2828*3f1979aaSAndroid Build Coastguard Worker // Add packed double-precision (64-bit) floating-point elements in a and b, and
2829*3f1979aaSAndroid Build Coastguard Worker // store the results in dst.
2830*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_pd
_mm_add_pd(__m128d a,__m128d b)2831*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b)
2832*3f1979aaSAndroid Build Coastguard Worker {
2833*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
2834*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128d_f64(
2835*3f1979aaSAndroid Build Coastguard Worker vaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
2836*3f1979aaSAndroid Build Coastguard Worker #else
2837*3f1979aaSAndroid Build Coastguard Worker double *da = (double *) &a;
2838*3f1979aaSAndroid Build Coastguard Worker double *db = (double *) &b;
2839*3f1979aaSAndroid Build Coastguard Worker double c[2];
2840*3f1979aaSAndroid Build Coastguard Worker c[0] = da[0] + db[0];
2841*3f1979aaSAndroid Build Coastguard Worker c[1] = da[1] + db[1];
2842*3f1979aaSAndroid Build Coastguard Worker return vld1q_f32((float32_t *) c);
2843*3f1979aaSAndroid Build Coastguard Worker #endif
2844*3f1979aaSAndroid Build Coastguard Worker }
2845*3f1979aaSAndroid Build Coastguard Worker
2846*3f1979aaSAndroid Build Coastguard Worker // Add 64-bit integers a and b, and store the result in dst.
2847*3f1979aaSAndroid Build Coastguard Worker //
2848*3f1979aaSAndroid Build Coastguard Worker // dst[63:0] := a[63:0] + b[63:0]
2849*3f1979aaSAndroid Build Coastguard Worker //
2850*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_si64
_mm_add_si64(__m64 a,__m64 b)2851*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m64 _mm_add_si64(__m64 a, __m64 b)
2852*3f1979aaSAndroid Build Coastguard Worker {
2853*3f1979aaSAndroid Build Coastguard Worker return vreinterpret_m64_s64(
2854*3f1979aaSAndroid Build Coastguard Worker vadd_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b)));
2855*3f1979aaSAndroid Build Coastguard Worker }
2856*3f1979aaSAndroid Build Coastguard Worker
2857*3f1979aaSAndroid Build Coastguard Worker // adds the scalar single-precision floating point values of a and b.
2858*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/be94x2y6(v=vs.100).aspx
_mm_add_ss(__m128 a,__m128 b)2859*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_add_ss(__m128 a, __m128 b)
2860*3f1979aaSAndroid Build Coastguard Worker {
2861*3f1979aaSAndroid Build Coastguard Worker float32_t b0 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
2862*3f1979aaSAndroid Build Coastguard Worker float32x4_t value = vsetq_lane_f32(b0, vdupq_n_f32(0), 0);
2863*3f1979aaSAndroid Build Coastguard Worker // the upper values in the result must be the remnants of <a>.
2864*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128_f32(vaddq_f32(a, value));
2865*3f1979aaSAndroid Build Coastguard Worker }
2866*3f1979aaSAndroid Build Coastguard Worker
2867*3f1979aaSAndroid Build Coastguard Worker // Adds the 4 signed or unsigned 64-bit integers in a to the 4 signed or
2868*3f1979aaSAndroid Build Coastguard Worker // unsigned 32-bit integers in b.
2869*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx
_mm_add_epi64(__m128i a,__m128i b)2870*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_add_epi64(__m128i a, __m128i b)
2871*3f1979aaSAndroid Build Coastguard Worker {
2872*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s64(
2873*3f1979aaSAndroid Build Coastguard Worker vaddq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
2874*3f1979aaSAndroid Build Coastguard Worker }
2875*3f1979aaSAndroid Build Coastguard Worker
2876*3f1979aaSAndroid Build Coastguard Worker // Adds the 4 signed or unsigned 32-bit integers in a to the 4 signed or
2877*3f1979aaSAndroid Build Coastguard Worker // unsigned 32-bit integers in b.
2878*3f1979aaSAndroid Build Coastguard Worker //
2879*3f1979aaSAndroid Build Coastguard Worker // r0 := a0 + b0
2880*3f1979aaSAndroid Build Coastguard Worker // r1 := a1 + b1
2881*3f1979aaSAndroid Build Coastguard Worker // r2 := a2 + b2
2882*3f1979aaSAndroid Build Coastguard Worker // r3 := a3 + b3
2883*3f1979aaSAndroid Build Coastguard Worker //
2884*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx
_mm_add_epi32(__m128i a,__m128i b)2885*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b)
2886*3f1979aaSAndroid Build Coastguard Worker {
2887*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s32(
2888*3f1979aaSAndroid Build Coastguard Worker vaddq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
2889*3f1979aaSAndroid Build Coastguard Worker }
2890*3f1979aaSAndroid Build Coastguard Worker
2891*3f1979aaSAndroid Build Coastguard Worker // Adds the 8 signed or unsigned 16-bit integers in a to the 8 signed or
2892*3f1979aaSAndroid Build Coastguard Worker // unsigned 16-bit integers in b.
2893*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/fceha5k4(v=vs.100).aspx
_mm_add_epi16(__m128i a,__m128i b)2894*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_add_epi16(__m128i a, __m128i b)
2895*3f1979aaSAndroid Build Coastguard Worker {
2896*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s16(
2897*3f1979aaSAndroid Build Coastguard Worker vaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
2898*3f1979aaSAndroid Build Coastguard Worker }
2899*3f1979aaSAndroid Build Coastguard Worker
2900*3f1979aaSAndroid Build Coastguard Worker // Adds the 16 signed or unsigned 8-bit integers in a to the 16 signed or
2901*3f1979aaSAndroid Build Coastguard Worker // unsigned 8-bit integers in b.
2902*3f1979aaSAndroid Build Coastguard Worker // https://technet.microsoft.com/en-us/subscriptions/yc7tcyzs(v=vs.90)
_mm_add_epi8(__m128i a,__m128i b)2903*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_add_epi8(__m128i a, __m128i b)
2904*3f1979aaSAndroid Build Coastguard Worker {
2905*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s8(
2906*3f1979aaSAndroid Build Coastguard Worker vaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
2907*3f1979aaSAndroid Build Coastguard Worker }
2908*3f1979aaSAndroid Build Coastguard Worker
2909*3f1979aaSAndroid Build Coastguard Worker // Adds the 8 signed 16-bit integers in a to the 8 signed 16-bit integers in b
2910*3f1979aaSAndroid Build Coastguard Worker // and saturates.
2911*3f1979aaSAndroid Build Coastguard Worker //
2912*3f1979aaSAndroid Build Coastguard Worker // r0 := SignedSaturate(a0 + b0)
2913*3f1979aaSAndroid Build Coastguard Worker // r1 := SignedSaturate(a1 + b1)
2914*3f1979aaSAndroid Build Coastguard Worker // ...
2915*3f1979aaSAndroid Build Coastguard Worker // r7 := SignedSaturate(a7 + b7)
2916*3f1979aaSAndroid Build Coastguard Worker //
2917*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/1a306ef8(v=vs.100).aspx
_mm_adds_epi16(__m128i a,__m128i b)2918*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_adds_epi16(__m128i a, __m128i b)
2919*3f1979aaSAndroid Build Coastguard Worker {
2920*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s16(
2921*3f1979aaSAndroid Build Coastguard Worker vqaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
2922*3f1979aaSAndroid Build Coastguard Worker }
2923*3f1979aaSAndroid Build Coastguard Worker
2924*3f1979aaSAndroid Build Coastguard Worker // Add packed signed 8-bit integers in a and b using saturation, and store the
2925*3f1979aaSAndroid Build Coastguard Worker // results in dst.
2926*3f1979aaSAndroid Build Coastguard Worker //
2927*3f1979aaSAndroid Build Coastguard Worker // FOR j := 0 to 15
2928*3f1979aaSAndroid Build Coastguard Worker // i := j*8
2929*3f1979aaSAndroid Build Coastguard Worker // dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] )
2930*3f1979aaSAndroid Build Coastguard Worker // ENDFOR
2931*3f1979aaSAndroid Build Coastguard Worker //
2932*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_adds_epi8
_mm_adds_epi8(__m128i a,__m128i b)2933*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_adds_epi8(__m128i a, __m128i b)
2934*3f1979aaSAndroid Build Coastguard Worker {
2935*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s8(
2936*3f1979aaSAndroid Build Coastguard Worker vqaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
2937*3f1979aaSAndroid Build Coastguard Worker }
2938*3f1979aaSAndroid Build Coastguard Worker
2939*3f1979aaSAndroid Build Coastguard Worker // Adds the 16 unsigned 8-bit integers in a to the 16 unsigned 8-bit integers in
2940*3f1979aaSAndroid Build Coastguard Worker // b and saturates..
2941*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/9hahyddy(v=vs.100).aspx
_mm_adds_epu8(__m128i a,__m128i b)2942*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_adds_epu8(__m128i a, __m128i b)
2943*3f1979aaSAndroid Build Coastguard Worker {
2944*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_u8(
2945*3f1979aaSAndroid Build Coastguard Worker vqaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
2946*3f1979aaSAndroid Build Coastguard Worker }
2947*3f1979aaSAndroid Build Coastguard Worker
2948*3f1979aaSAndroid Build Coastguard Worker // Multiplies the 8 signed or unsigned 16-bit integers from a by the 8 signed or
2949*3f1979aaSAndroid Build Coastguard Worker // unsigned 16-bit integers from b.
2950*3f1979aaSAndroid Build Coastguard Worker //
2951*3f1979aaSAndroid Build Coastguard Worker // r0 := (a0 * b0)[15:0]
2952*3f1979aaSAndroid Build Coastguard Worker // r1 := (a1 * b1)[15:0]
2953*3f1979aaSAndroid Build Coastguard Worker // ...
2954*3f1979aaSAndroid Build Coastguard Worker // r7 := (a7 * b7)[15:0]
2955*3f1979aaSAndroid Build Coastguard Worker //
2956*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/vstudio/9ks1472s(v=vs.100).aspx
_mm_mullo_epi16(__m128i a,__m128i b)2957*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b)
2958*3f1979aaSAndroid Build Coastguard Worker {
2959*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s16(
2960*3f1979aaSAndroid Build Coastguard Worker vmulq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
2961*3f1979aaSAndroid Build Coastguard Worker }
2962*3f1979aaSAndroid Build Coastguard Worker
2963*3f1979aaSAndroid Build Coastguard Worker // Multiplies the 4 signed or unsigned 32-bit integers from a by the 4 signed or
2964*3f1979aaSAndroid Build Coastguard Worker // unsigned 32-bit integers from b.
2965*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/vstudio/bb531409(v=vs.100).aspx
_mm_mullo_epi32(__m128i a,__m128i b)2966*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_mullo_epi32(__m128i a, __m128i b)
2967*3f1979aaSAndroid Build Coastguard Worker {
2968*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s32(
2969*3f1979aaSAndroid Build Coastguard Worker vmulq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
2970*3f1979aaSAndroid Build Coastguard Worker }
2971*3f1979aaSAndroid Build Coastguard Worker
// Alias that forwards to _mm_mulhi_pu16: multiply the packed unsigned 16-bit
// integers in a and b into intermediate 32-bit products and keep the high 16
// bits of each product.
//
//   FOR j := 0 to 3
//     i := j*16
//     tmp[31:0] := a[i+15:i] * b[i+15:i]
//     dst[i+15:i] := tmp[31:16]
//   ENDFOR
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmulhuw
#define _m_pmulhuw(a, b) _mm_mulhi_pu16(a, b)
2984*3f1979aaSAndroid Build Coastguard Worker
2985*3f1979aaSAndroid Build Coastguard Worker // Multiplies the four single-precision, floating-point values of a and b.
2986*3f1979aaSAndroid Build Coastguard Worker //
2987*3f1979aaSAndroid Build Coastguard Worker // r0 := a0 * b0
2988*3f1979aaSAndroid Build Coastguard Worker // r1 := a1 * b1
2989*3f1979aaSAndroid Build Coastguard Worker // r2 := a2 * b2
2990*3f1979aaSAndroid Build Coastguard Worker // r3 := a3 * b3
2991*3f1979aaSAndroid Build Coastguard Worker //
2992*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/vstudio/22kbk6t9(v=vs.100).aspx
_mm_mul_ps(__m128 a,__m128 b)2993*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
2994*3f1979aaSAndroid Build Coastguard Worker {
2995*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128_f32(
2996*3f1979aaSAndroid Build Coastguard Worker vmulq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
2997*3f1979aaSAndroid Build Coastguard Worker }
2998*3f1979aaSAndroid Build Coastguard Worker
2999*3f1979aaSAndroid Build Coastguard Worker // Multiply the lower single-precision (32-bit) floating-point element in a and
3000*3f1979aaSAndroid Build Coastguard Worker // b, store the result in the lower element of dst, and copy the upper 3 packed
3001*3f1979aaSAndroid Build Coastguard Worker // elements from a to the upper elements of dst.
3002*3f1979aaSAndroid Build Coastguard Worker //
3003*3f1979aaSAndroid Build Coastguard Worker // dst[31:0] := a[31:0] * b[31:0]
3004*3f1979aaSAndroid Build Coastguard Worker // dst[127:32] := a[127:32]
3005*3f1979aaSAndroid Build Coastguard Worker //
3006*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_ss
_mm_mul_ss(__m128 a,__m128 b)3007*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_mul_ss(__m128 a, __m128 b)
3008*3f1979aaSAndroid Build Coastguard Worker {
3009*3f1979aaSAndroid Build Coastguard Worker return _mm_move_ss(a, _mm_mul_ps(a, b));
3010*3f1979aaSAndroid Build Coastguard Worker }
3011*3f1979aaSAndroid Build Coastguard Worker
3012*3f1979aaSAndroid Build Coastguard Worker // Multiply the low unsigned 32-bit integers from each packed 64-bit element in
3013*3f1979aaSAndroid Build Coastguard Worker // a and b, and store the unsigned 64-bit results in dst.
3014*3f1979aaSAndroid Build Coastguard Worker //
3015*3f1979aaSAndroid Build Coastguard Worker // r0 := (a0 & 0xFFFFFFFF) * (b0 & 0xFFFFFFFF)
3016*3f1979aaSAndroid Build Coastguard Worker // r1 := (a2 & 0xFFFFFFFF) * (b2 & 0xFFFFFFFF)
_mm_mul_epu32(__m128i a,__m128i b)3017*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_mul_epu32(__m128i a, __m128i b)
3018*3f1979aaSAndroid Build Coastguard Worker {
3019*3f1979aaSAndroid Build Coastguard Worker // vmull_u32 upcasts instead of masking, so we downcast.
3020*3f1979aaSAndroid Build Coastguard Worker uint32x2_t a_lo = vmovn_u64(vreinterpretq_u64_m128i(a));
3021*3f1979aaSAndroid Build Coastguard Worker uint32x2_t b_lo = vmovn_u64(vreinterpretq_u64_m128i(b));
3022*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_u64(vmull_u32(a_lo, b_lo));
3023*3f1979aaSAndroid Build Coastguard Worker }
3024*3f1979aaSAndroid Build Coastguard Worker
3025*3f1979aaSAndroid Build Coastguard Worker // Multiply the low unsigned 32-bit integers from a and b, and store the
3026*3f1979aaSAndroid Build Coastguard Worker // unsigned 64-bit result in dst.
3027*3f1979aaSAndroid Build Coastguard Worker //
3028*3f1979aaSAndroid Build Coastguard Worker // dst[63:0] := a[31:0] * b[31:0]
3029*3f1979aaSAndroid Build Coastguard Worker //
3030*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_su32
_mm_mul_su32(__m64 a,__m64 b)3031*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m64 _mm_mul_su32(__m64 a, __m64 b)
3032*3f1979aaSAndroid Build Coastguard Worker {
3033*3f1979aaSAndroid Build Coastguard Worker return vreinterpret_m64_u64(vget_low_u64(
3034*3f1979aaSAndroid Build Coastguard Worker vmull_u32(vreinterpret_u32_m64(a), vreinterpret_u32_m64(b))));
3035*3f1979aaSAndroid Build Coastguard Worker }
3036*3f1979aaSAndroid Build Coastguard Worker
3037*3f1979aaSAndroid Build Coastguard Worker // Multiply the low signed 32-bit integers from each packed 64-bit element in
3038*3f1979aaSAndroid Build Coastguard Worker // a and b, and store the signed 64-bit results in dst.
3039*3f1979aaSAndroid Build Coastguard Worker //
3040*3f1979aaSAndroid Build Coastguard Worker // r0 := (int64_t)(int32_t)a0 * (int64_t)(int32_t)b0
3041*3f1979aaSAndroid Build Coastguard Worker // r1 := (int64_t)(int32_t)a2 * (int64_t)(int32_t)b2
_mm_mul_epi32(__m128i a,__m128i b)3042*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_mul_epi32(__m128i a, __m128i b)
3043*3f1979aaSAndroid Build Coastguard Worker {
3044*3f1979aaSAndroid Build Coastguard Worker // vmull_s32 upcasts instead of masking, so we downcast.
3045*3f1979aaSAndroid Build Coastguard Worker int32x2_t a_lo = vmovn_s64(vreinterpretq_s64_m128i(a));
3046*3f1979aaSAndroid Build Coastguard Worker int32x2_t b_lo = vmovn_s64(vreinterpretq_s64_m128i(b));
3047*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s64(vmull_s32(a_lo, b_lo));
3048*3f1979aaSAndroid Build Coastguard Worker }
3049*3f1979aaSAndroid Build Coastguard Worker
3050*3f1979aaSAndroid Build Coastguard Worker // Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit
3051*3f1979aaSAndroid Build Coastguard Worker // integers from b.
3052*3f1979aaSAndroid Build Coastguard Worker //
3053*3f1979aaSAndroid Build Coastguard Worker // r0 := (a0 * b0) + (a1 * b1)
3054*3f1979aaSAndroid Build Coastguard Worker // r1 := (a2 * b2) + (a3 * b3)
3055*3f1979aaSAndroid Build Coastguard Worker // r2 := (a4 * b4) + (a5 * b5)
3056*3f1979aaSAndroid Build Coastguard Worker // r3 := (a6 * b6) + (a7 * b7)
3057*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/yht36sa6(v=vs.90).aspx
_mm_madd_epi16(__m128i a,__m128i b)3058*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b)
3059*3f1979aaSAndroid Build Coastguard Worker {
3060*3f1979aaSAndroid Build Coastguard Worker int32x4_t low = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
3061*3f1979aaSAndroid Build Coastguard Worker vget_low_s16(vreinterpretq_s16_m128i(b)));
3062*3f1979aaSAndroid Build Coastguard Worker int32x4_t high = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
3063*3f1979aaSAndroid Build Coastguard Worker vget_high_s16(vreinterpretq_s16_m128i(b)));
3064*3f1979aaSAndroid Build Coastguard Worker
3065*3f1979aaSAndroid Build Coastguard Worker int32x2_t low_sum = vpadd_s32(vget_low_s32(low), vget_high_s32(low));
3066*3f1979aaSAndroid Build Coastguard Worker int32x2_t high_sum = vpadd_s32(vget_low_s32(high), vget_high_s32(high));
3067*3f1979aaSAndroid Build Coastguard Worker
3068*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s32(vcombine_s32(low_sum, high_sum));
3069*3f1979aaSAndroid Build Coastguard Worker }
3070*3f1979aaSAndroid Build Coastguard Worker
3071*3f1979aaSAndroid Build Coastguard Worker // Multiply packed signed 16-bit integers in a and b, producing intermediate
3072*3f1979aaSAndroid Build Coastguard Worker // signed 32-bit integers. Shift right by 15 bits while rounding up, and store
3073*3f1979aaSAndroid Build Coastguard Worker // the packed 16-bit integers in dst.
3074*3f1979aaSAndroid Build Coastguard Worker //
3075*3f1979aaSAndroid Build Coastguard Worker // r0 := Round(((int32_t)a0 * (int32_t)b0) >> 15)
3076*3f1979aaSAndroid Build Coastguard Worker // r1 := Round(((int32_t)a1 * (int32_t)b1) >> 15)
3077*3f1979aaSAndroid Build Coastguard Worker // r2 := Round(((int32_t)a2 * (int32_t)b2) >> 15)
3078*3f1979aaSAndroid Build Coastguard Worker // ...
3079*3f1979aaSAndroid Build Coastguard Worker // r7 := Round(((int32_t)a7 * (int32_t)b7) >> 15)
_mm_mulhrs_epi16(__m128i a,__m128i b)3080*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_mulhrs_epi16(__m128i a, __m128i b)
3081*3f1979aaSAndroid Build Coastguard Worker {
3082*3f1979aaSAndroid Build Coastguard Worker // Has issues due to saturation
3083*3f1979aaSAndroid Build Coastguard Worker // return vreinterpretq_m128i_s16(vqrdmulhq_s16(a, b));
3084*3f1979aaSAndroid Build Coastguard Worker
3085*3f1979aaSAndroid Build Coastguard Worker // Multiply
3086*3f1979aaSAndroid Build Coastguard Worker int32x4_t mul_lo = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
3087*3f1979aaSAndroid Build Coastguard Worker vget_low_s16(vreinterpretq_s16_m128i(b)));
3088*3f1979aaSAndroid Build Coastguard Worker int32x4_t mul_hi = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
3089*3f1979aaSAndroid Build Coastguard Worker vget_high_s16(vreinterpretq_s16_m128i(b)));
3090*3f1979aaSAndroid Build Coastguard Worker
3091*3f1979aaSAndroid Build Coastguard Worker // Rounding narrowing shift right
3092*3f1979aaSAndroid Build Coastguard Worker // narrow = (int16_t)((mul + 16384) >> 15);
3093*3f1979aaSAndroid Build Coastguard Worker int16x4_t narrow_lo = vrshrn_n_s32(mul_lo, 15);
3094*3f1979aaSAndroid Build Coastguard Worker int16x4_t narrow_hi = vrshrn_n_s32(mul_hi, 15);
3095*3f1979aaSAndroid Build Coastguard Worker
3096*3f1979aaSAndroid Build Coastguard Worker // Join together
3097*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s16(vcombine_s16(narrow_lo, narrow_hi));
3098*3f1979aaSAndroid Build Coastguard Worker }
3099*3f1979aaSAndroid Build Coastguard Worker
3100*3f1979aaSAndroid Build Coastguard Worker // Vertically multiply each unsigned 8-bit integer from a with the corresponding
3101*3f1979aaSAndroid Build Coastguard Worker // signed 8-bit integer from b, producing intermediate signed 16-bit integers.
3102*3f1979aaSAndroid Build Coastguard Worker // Horizontally add adjacent pairs of intermediate signed 16-bit integers,
3103*3f1979aaSAndroid Build Coastguard Worker // and pack the saturated results in dst.
3104*3f1979aaSAndroid Build Coastguard Worker //
3105*3f1979aaSAndroid Build Coastguard Worker // FOR j := 0 to 7
3106*3f1979aaSAndroid Build Coastguard Worker // i := j*16
3107*3f1979aaSAndroid Build Coastguard Worker // dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] +
3108*3f1979aaSAndroid Build Coastguard Worker // a[i+7:i]*b[i+7:i] )
3109*3f1979aaSAndroid Build Coastguard Worker // ENDFOR
_mm_maddubs_epi16(__m128i _a,__m128i _b)3110*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_maddubs_epi16(__m128i _a, __m128i _b)
3111*3f1979aaSAndroid Build Coastguard Worker {
3112*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
3113*3f1979aaSAndroid Build Coastguard Worker uint8x16_t a = vreinterpretq_u8_m128i(_a);
3114*3f1979aaSAndroid Build Coastguard Worker int8x16_t b = vreinterpretq_s8_m128i(_b);
3115*3f1979aaSAndroid Build Coastguard Worker int16x8_t tl = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))),
3116*3f1979aaSAndroid Build Coastguard Worker vmovl_s8(vget_low_s8(b)));
3117*3f1979aaSAndroid Build Coastguard Worker int16x8_t th = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a))),
3118*3f1979aaSAndroid Build Coastguard Worker vmovl_s8(vget_high_s8(b)));
3119*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s16(
3120*3f1979aaSAndroid Build Coastguard Worker vqaddq_s16(vuzp1q_s16(tl, th), vuzp2q_s16(tl, th)));
3121*3f1979aaSAndroid Build Coastguard Worker #else
3122*3f1979aaSAndroid Build Coastguard Worker // This would be much simpler if x86 would choose to zero extend OR sign
3123*3f1979aaSAndroid Build Coastguard Worker // extend, not both. This could probably be optimized better.
3124*3f1979aaSAndroid Build Coastguard Worker uint16x8_t a = vreinterpretq_u16_m128i(_a);
3125*3f1979aaSAndroid Build Coastguard Worker int16x8_t b = vreinterpretq_s16_m128i(_b);
3126*3f1979aaSAndroid Build Coastguard Worker
3127*3f1979aaSAndroid Build Coastguard Worker // Zero extend a
3128*3f1979aaSAndroid Build Coastguard Worker int16x8_t a_odd = vreinterpretq_s16_u16(vshrq_n_u16(a, 8));
3129*3f1979aaSAndroid Build Coastguard Worker int16x8_t a_even = vreinterpretq_s16_u16(vbicq_u16(a, vdupq_n_u16(0xff00)));
3130*3f1979aaSAndroid Build Coastguard Worker
3131*3f1979aaSAndroid Build Coastguard Worker // Sign extend by shifting left then shifting right.
3132*3f1979aaSAndroid Build Coastguard Worker int16x8_t b_even = vshrq_n_s16(vshlq_n_s16(b, 8), 8);
3133*3f1979aaSAndroid Build Coastguard Worker int16x8_t b_odd = vshrq_n_s16(b, 8);
3134*3f1979aaSAndroid Build Coastguard Worker
3135*3f1979aaSAndroid Build Coastguard Worker // multiply
3136*3f1979aaSAndroid Build Coastguard Worker int16x8_t prod1 = vmulq_s16(a_even, b_even);
3137*3f1979aaSAndroid Build Coastguard Worker int16x8_t prod2 = vmulq_s16(a_odd, b_odd);
3138*3f1979aaSAndroid Build Coastguard Worker
3139*3f1979aaSAndroid Build Coastguard Worker // saturated add
3140*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s16(vqaddq_s16(prod1, prod2));
3141*3f1979aaSAndroid Build Coastguard Worker #endif
3142*3f1979aaSAndroid Build Coastguard Worker }
3143*3f1979aaSAndroid Build Coastguard Worker
3144*3f1979aaSAndroid Build Coastguard Worker // Computes the fused multiple add product of 32-bit floating point numbers.
3145*3f1979aaSAndroid Build Coastguard Worker //
3146*3f1979aaSAndroid Build Coastguard Worker // Return Value
3147*3f1979aaSAndroid Build Coastguard Worker // Multiplies A and B, and adds C to the temporary result before returning it.
3148*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd
_mm_fmadd_ps(__m128 a,__m128 b,__m128 c)3149*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_fmadd_ps(__m128 a, __m128 b, __m128 c)
3150*3f1979aaSAndroid Build Coastguard Worker {
3151*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
3152*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128_f32(vfmaq_f32(vreinterpretq_f32_m128(c),
3153*3f1979aaSAndroid Build Coastguard Worker vreinterpretq_f32_m128(b),
3154*3f1979aaSAndroid Build Coastguard Worker vreinterpretq_f32_m128(a)));
3155*3f1979aaSAndroid Build Coastguard Worker #else
3156*3f1979aaSAndroid Build Coastguard Worker return _mm_add_ps(_mm_mul_ps(a, b), c);
3157*3f1979aaSAndroid Build Coastguard Worker #endif
3158*3f1979aaSAndroid Build Coastguard Worker }
3159*3f1979aaSAndroid Build Coastguard Worker
3160*3f1979aaSAndroid Build Coastguard Worker // Alternatively add and subtract packed single-precision (32-bit)
3161*3f1979aaSAndroid Build Coastguard Worker // floating-point elements in a to/from packed elements in b, and store the
3162*3f1979aaSAndroid Build Coastguard Worker // results in dst.
3163*3f1979aaSAndroid Build Coastguard Worker //
3164*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=addsub_ps
_mm_addsub_ps(__m128 a,__m128 b)3165*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_addsub_ps(__m128 a, __m128 b)
3166*3f1979aaSAndroid Build Coastguard Worker {
3167*3f1979aaSAndroid Build Coastguard Worker __m128 mask = {-1.0f, 1.0f, -1.0f, 1.0f};
3168*3f1979aaSAndroid Build Coastguard Worker return _mm_fmadd_ps(b, mask, a);
3169*3f1979aaSAndroid Build Coastguard Worker }
3170*3f1979aaSAndroid Build Coastguard Worker
3171*3f1979aaSAndroid Build Coastguard Worker // Compute the absolute differences of packed unsigned 8-bit integers in a and
3172*3f1979aaSAndroid Build Coastguard Worker // b, then horizontally sum each consecutive 8 differences to produce two
3173*3f1979aaSAndroid Build Coastguard Worker // unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
3174*3f1979aaSAndroid Build Coastguard Worker // 16 bits of 64-bit elements in dst.
3175*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sad_epu8
_mm_sad_epu8(__m128i a,__m128i b)3176*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_sad_epu8(__m128i a, __m128i b)
3177*3f1979aaSAndroid Build Coastguard Worker {
3178*3f1979aaSAndroid Build Coastguard Worker uint16x8_t t = vpaddlq_u8(vabdq_u8((uint8x16_t) a, (uint8x16_t) b));
3179*3f1979aaSAndroid Build Coastguard Worker uint16_t r0 = t[0] + t[1] + t[2] + t[3];
3180*3f1979aaSAndroid Build Coastguard Worker uint16_t r4 = t[4] + t[5] + t[6] + t[7];
3181*3f1979aaSAndroid Build Coastguard Worker uint16x8_t r = vsetq_lane_u16(r0, vdupq_n_u16(0), 0);
3182*3f1979aaSAndroid Build Coastguard Worker return (__m128i) vsetq_lane_u16(r4, r, 4);
3183*3f1979aaSAndroid Build Coastguard Worker }
3184*3f1979aaSAndroid Build Coastguard Worker
3185*3f1979aaSAndroid Build Coastguard Worker // Compute the absolute differences of packed unsigned 8-bit integers in a and
3186*3f1979aaSAndroid Build Coastguard Worker // b, then horizontally sum each consecutive 8 differences to produce four
3187*3f1979aaSAndroid Build Coastguard Worker // unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
3188*3f1979aaSAndroid Build Coastguard Worker // 16 bits of dst.
3189*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sad_pu8
_mm_sad_pu8(__m64 a,__m64 b)3190*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m64 _mm_sad_pu8(__m64 a, __m64 b)
3191*3f1979aaSAndroid Build Coastguard Worker {
3192*3f1979aaSAndroid Build Coastguard Worker uint16x4_t t =
3193*3f1979aaSAndroid Build Coastguard Worker vpaddl_u8(vabd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
3194*3f1979aaSAndroid Build Coastguard Worker uint16_t r0 = t[0] + t[1] + t[2] + t[3];
3195*3f1979aaSAndroid Build Coastguard Worker return vreinterpret_m64_u16(vset_lane_u16(r0, vdup_n_u16(0), 0));
3196*3f1979aaSAndroid Build Coastguard Worker }
3197*3f1979aaSAndroid Build Coastguard Worker
// Alias that forwards to _mm_sad_pu8: compute the absolute differences of the
// packed unsigned 8-bit integers in a and b, then horizontally sum the 8
// differences into one unsigned 16-bit integer stored in the low 16 bits of
// dst.
//
//   FOR j := 0 to 7
//     i := j*8
//     tmp[i+7:i] := ABS(a[i+7:i] - b[i+7:i])
//   ENDFOR
//   dst[15:0] := tmp[7:0] + tmp[15:8] + tmp[23:16] + tmp[31:24] + tmp[39:32] +
//                tmp[47:40] + tmp[55:48] + tmp[63:56]
//   dst[63:16] := 0
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_psadbw
#define _m_psadbw(a, b) _mm_sad_pu8(a, b)
3212*3f1979aaSAndroid Build Coastguard Worker
3213*3f1979aaSAndroid Build Coastguard Worker // Divides the four single-precision, floating-point values of a and b.
3214*3f1979aaSAndroid Build Coastguard Worker //
3215*3f1979aaSAndroid Build Coastguard Worker // r0 := a0 / b0
3216*3f1979aaSAndroid Build Coastguard Worker // r1 := a1 / b1
3217*3f1979aaSAndroid Build Coastguard Worker // r2 := a2 / b2
3218*3f1979aaSAndroid Build Coastguard Worker // r3 := a3 / b3
3219*3f1979aaSAndroid Build Coastguard Worker //
3220*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/edaw8147(v=vs.100).aspx
_mm_div_ps(__m128 a,__m128 b)3221*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b)
3222*3f1979aaSAndroid Build Coastguard Worker {
3223*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
3224*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128_f32(
3225*3f1979aaSAndroid Build Coastguard Worker vdivq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
3226*3f1979aaSAndroid Build Coastguard Worker #else
3227*3f1979aaSAndroid Build Coastguard Worker float32x4_t recip0 = vrecpeq_f32(vreinterpretq_f32_m128(b));
3228*3f1979aaSAndroid Build Coastguard Worker float32x4_t recip1 =
3229*3f1979aaSAndroid Build Coastguard Worker vmulq_f32(recip0, vrecpsq_f32(recip0, vreinterpretq_f32_m128(b)));
3230*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(a), recip1));
3231*3f1979aaSAndroid Build Coastguard Worker #endif
3232*3f1979aaSAndroid Build Coastguard Worker }
3233*3f1979aaSAndroid Build Coastguard Worker
3234*3f1979aaSAndroid Build Coastguard Worker // Divides the scalar single-precision floating point value of a by b.
3235*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/4y73xa49(v=vs.100).aspx
_mm_div_ss(__m128 a,__m128 b)3236*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b)
3237*3f1979aaSAndroid Build Coastguard Worker {
3238*3f1979aaSAndroid Build Coastguard Worker float32_t value =
3239*3f1979aaSAndroid Build Coastguard Worker vgetq_lane_f32(vreinterpretq_f32_m128(_mm_div_ps(a, b)), 0);
3240*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128_f32(
3241*3f1979aaSAndroid Build Coastguard Worker vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
3242*3f1979aaSAndroid Build Coastguard Worker }
3243*3f1979aaSAndroid Build Coastguard Worker
3244*3f1979aaSAndroid Build Coastguard Worker // Computes the approximations of reciprocals of the four single-precision,
3245*3f1979aaSAndroid Build Coastguard Worker // floating-point values of a.
3246*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/vstudio/796k1tty(v=vs.100).aspx
_mm_rcp_ps(__m128 in)3247*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_rcp_ps(__m128 in)
3248*3f1979aaSAndroid Build Coastguard Worker {
3249*3f1979aaSAndroid Build Coastguard Worker float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(in));
3250*3f1979aaSAndroid Build Coastguard Worker recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
3251*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128_f32(recip);
3252*3f1979aaSAndroid Build Coastguard Worker }
3253*3f1979aaSAndroid Build Coastguard Worker
3254*3f1979aaSAndroid Build Coastguard Worker // Compute the approximate reciprocal of the lower single-precision (32-bit)
3255*3f1979aaSAndroid Build Coastguard Worker // floating-point element in a, store the result in the lower element of dst,
3256*3f1979aaSAndroid Build Coastguard Worker // and copy the upper 3 packed elements from a to the upper elements of dst. The
3257*3f1979aaSAndroid Build Coastguard Worker // maximum relative error for this approximation is less than 1.5*2^-12.
3258*3f1979aaSAndroid Build Coastguard Worker //
3259*3f1979aaSAndroid Build Coastguard Worker // dst[31:0] := (1.0 / a[31:0])
3260*3f1979aaSAndroid Build Coastguard Worker // dst[127:32] := a[127:32]
3261*3f1979aaSAndroid Build Coastguard Worker //
3262*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ss
_mm_rcp_ss(__m128 a)3263*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_rcp_ss(__m128 a)
3264*3f1979aaSAndroid Build Coastguard Worker {
3265*3f1979aaSAndroid Build Coastguard Worker return _mm_move_ss(a, _mm_rcp_ps(a));
3266*3f1979aaSAndroid Build Coastguard Worker }
3267*3f1979aaSAndroid Build Coastguard Worker
3268*3f1979aaSAndroid Build Coastguard Worker // Computes the approximations of square roots of the four single-precision,
3269*3f1979aaSAndroid Build Coastguard Worker // floating-point values of a. First computes reciprocal square roots and then
3270*3f1979aaSAndroid Build Coastguard Worker // reciprocals of the four values.
3271*3f1979aaSAndroid Build Coastguard Worker //
3272*3f1979aaSAndroid Build Coastguard Worker // r0 := sqrt(a0)
3273*3f1979aaSAndroid Build Coastguard Worker // r1 := sqrt(a1)
3274*3f1979aaSAndroid Build Coastguard Worker // r2 := sqrt(a2)
3275*3f1979aaSAndroid Build Coastguard Worker // r3 := sqrt(a3)
3276*3f1979aaSAndroid Build Coastguard Worker //
3277*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/vstudio/8z67bwwk(v=vs.100).aspx
_mm_sqrt_ps(__m128 in)3278*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in)
3279*3f1979aaSAndroid Build Coastguard Worker {
3280*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
3281*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128_f32(vsqrtq_f32(vreinterpretq_f32_m128(in)));
3282*3f1979aaSAndroid Build Coastguard Worker #else
3283*3f1979aaSAndroid Build Coastguard Worker float32x4_t recipsq = vrsqrteq_f32(vreinterpretq_f32_m128(in));
3284*3f1979aaSAndroid Build Coastguard Worker float32x4_t sq = vrecpeq_f32(recipsq);
3285*3f1979aaSAndroid Build Coastguard Worker // ??? use step versions of both sqrt and recip for better accuracy?
3286*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128_f32(sq);
3287*3f1979aaSAndroid Build Coastguard Worker #endif
3288*3f1979aaSAndroid Build Coastguard Worker }
3289*3f1979aaSAndroid Build Coastguard Worker
3290*3f1979aaSAndroid Build Coastguard Worker // Computes the approximation of the square root of the scalar single-precision
3291*3f1979aaSAndroid Build Coastguard Worker // floating point value of in.
3292*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/ahfsc22d(v=vs.100).aspx
_mm_sqrt_ss(__m128 in)3293*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_sqrt_ss(__m128 in)
3294*3f1979aaSAndroid Build Coastguard Worker {
3295*3f1979aaSAndroid Build Coastguard Worker float32_t value =
3296*3f1979aaSAndroid Build Coastguard Worker vgetq_lane_f32(vreinterpretq_f32_m128(_mm_sqrt_ps(in)), 0);
3297*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128_f32(
3298*3f1979aaSAndroid Build Coastguard Worker vsetq_lane_f32(value, vreinterpretq_f32_m128(in), 0));
3299*3f1979aaSAndroid Build Coastguard Worker }
3300*3f1979aaSAndroid Build Coastguard Worker
3301*3f1979aaSAndroid Build Coastguard Worker // Computes the approximations of the reciprocal square roots of the four
3302*3f1979aaSAndroid Build Coastguard Worker // single-precision floating point values of in.
3303*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/22hfsh53(v=vs.100).aspx
_mm_rsqrt_ps(__m128 in)3304*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in)
3305*3f1979aaSAndroid Build Coastguard Worker {
3306*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128_f32(vrsqrteq_f32(vreinterpretq_f32_m128(in)));
3307*3f1979aaSAndroid Build Coastguard Worker }
3308*3f1979aaSAndroid Build Coastguard Worker
3309*3f1979aaSAndroid Build Coastguard Worker // Compute the approximate reciprocal square root of the lower single-precision
3310*3f1979aaSAndroid Build Coastguard Worker // (32-bit) floating-point element in a, store the result in the lower element
3311*3f1979aaSAndroid Build Coastguard Worker // of dst, and copy the upper 3 packed elements from a to the upper elements of
3312*3f1979aaSAndroid Build Coastguard Worker // dst.
3313*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt_ss
_mm_rsqrt_ss(__m128 in)3314*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_rsqrt_ss(__m128 in)
3315*3f1979aaSAndroid Build Coastguard Worker {
3316*3f1979aaSAndroid Build Coastguard Worker return vsetq_lane_f32(vgetq_lane_f32(_mm_rsqrt_ps(in), 0), in, 0);
3317*3f1979aaSAndroid Build Coastguard Worker }
3318*3f1979aaSAndroid Build Coastguard Worker
3319*3f1979aaSAndroid Build Coastguard Worker // Compare packed signed 16-bit integers in a and b, and store packed maximum
3320*3f1979aaSAndroid Build Coastguard Worker // values in dst.
3321*3f1979aaSAndroid Build Coastguard Worker //
3322*3f1979aaSAndroid Build Coastguard Worker // FOR j := 0 to 3
3323*3f1979aaSAndroid Build Coastguard Worker // i := j*16
3324*3f1979aaSAndroid Build Coastguard Worker // dst[i+15:i] := MAX(a[i+15:i], b[i+15:i])
3325*3f1979aaSAndroid Build Coastguard Worker // ENDFOR
3326*3f1979aaSAndroid Build Coastguard Worker //
3327*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pi16
_mm_max_pi16(__m64 a,__m64 b)3328*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m64 _mm_max_pi16(__m64 a, __m64 b)
3329*3f1979aaSAndroid Build Coastguard Worker {
3330*3f1979aaSAndroid Build Coastguard Worker return vreinterpret_m64_s16(
3331*3f1979aaSAndroid Build Coastguard Worker vmax_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
3332*3f1979aaSAndroid Build Coastguard Worker }
3333*3f1979aaSAndroid Build Coastguard Worker
// MMX-style alias that forwards to _mm_max_pi16: lane-wise maximum of the
// four signed 16-bit integers in a and b.
//
//   FOR j := 0 to 3
//       i := j*16
//       dst[i+15:i] := MAX(a[i+15:i], b[i+15:i])
//   ENDFOR
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmaxsw
#define _m_pmaxsw(a, b) _mm_max_pi16((a), (b))
3344*3f1979aaSAndroid Build Coastguard Worker
3345*3f1979aaSAndroid Build Coastguard Worker // Computes the maximums of the four single-precision, floating-point values of
3346*3f1979aaSAndroid Build Coastguard Worker // a and b.
3347*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/vstudio/ff5d607a(v=vs.100).aspx
_mm_max_ps(__m128 a,__m128 b)3348*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b)
3349*3f1979aaSAndroid Build Coastguard Worker {
3350*3f1979aaSAndroid Build Coastguard Worker #if SSE2NEON_PRECISE_MINMAX
3351*3f1979aaSAndroid Build Coastguard Worker float32x4_t _a = vreinterpretq_f32_m128(a);
3352*3f1979aaSAndroid Build Coastguard Worker float32x4_t _b = vreinterpretq_f32_m128(b);
3353*3f1979aaSAndroid Build Coastguard Worker return vbslq_f32(vcltq_f32(_b, _a), _a, _b);
3354*3f1979aaSAndroid Build Coastguard Worker #else
3355*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128_f32(
3356*3f1979aaSAndroid Build Coastguard Worker vmaxq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
3357*3f1979aaSAndroid Build Coastguard Worker #endif
3358*3f1979aaSAndroid Build Coastguard Worker }
3359*3f1979aaSAndroid Build Coastguard Worker
3360*3f1979aaSAndroid Build Coastguard Worker // Compare packed unsigned 8-bit integers in a and b, and store packed maximum
3361*3f1979aaSAndroid Build Coastguard Worker // values in dst.
3362*3f1979aaSAndroid Build Coastguard Worker //
3363*3f1979aaSAndroid Build Coastguard Worker // FOR j := 0 to 7
3364*3f1979aaSAndroid Build Coastguard Worker // i := j*8
3365*3f1979aaSAndroid Build Coastguard Worker // dst[i+7:i] := MAX(a[i+7:i], b[i+7:i])
3366*3f1979aaSAndroid Build Coastguard Worker // ENDFOR
3367*3f1979aaSAndroid Build Coastguard Worker //
3368*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pu8
_mm_max_pu8(__m64 a,__m64 b)3369*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m64 _mm_max_pu8(__m64 a, __m64 b)
3370*3f1979aaSAndroid Build Coastguard Worker {
3371*3f1979aaSAndroid Build Coastguard Worker return vreinterpret_m64_u8(
3372*3f1979aaSAndroid Build Coastguard Worker vmax_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
3373*3f1979aaSAndroid Build Coastguard Worker }
3374*3f1979aaSAndroid Build Coastguard Worker
// MMX-style alias that forwards to _mm_max_pu8: lane-wise maximum of the
// eight unsigned 8-bit integers in a and b.
//
//   FOR j := 0 to 7
//       i := j*8
//       dst[i+7:i] := MAX(a[i+7:i], b[i+7:i])
//   ENDFOR
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmaxub
#define _m_pmaxub(a, b) _mm_max_pu8((a), (b))
3385*3f1979aaSAndroid Build Coastguard Worker
3386*3f1979aaSAndroid Build Coastguard Worker // Compare packed signed 16-bit integers in a and b, and store packed minimum
3387*3f1979aaSAndroid Build Coastguard Worker // values in dst.
3388*3f1979aaSAndroid Build Coastguard Worker //
3389*3f1979aaSAndroid Build Coastguard Worker // FOR j := 0 to 3
3390*3f1979aaSAndroid Build Coastguard Worker // i := j*16
3391*3f1979aaSAndroid Build Coastguard Worker // dst[i+15:i] := MIN(a[i+15:i], b[i+15:i])
3392*3f1979aaSAndroid Build Coastguard Worker // ENDFOR
3393*3f1979aaSAndroid Build Coastguard Worker //
3394*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pi16
_mm_min_pi16(__m64 a,__m64 b)3395*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m64 _mm_min_pi16(__m64 a, __m64 b)
3396*3f1979aaSAndroid Build Coastguard Worker {
3397*3f1979aaSAndroid Build Coastguard Worker return vreinterpret_m64_s16(
3398*3f1979aaSAndroid Build Coastguard Worker vmin_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
3399*3f1979aaSAndroid Build Coastguard Worker }
3400*3f1979aaSAndroid Build Coastguard Worker
// MMX-style alias that forwards to _mm_min_pi16: lane-wise minimum of the
// four signed 16-bit integers in a and b.
//
//   FOR j := 0 to 3
//       i := j*16
//       dst[i+15:i] := MIN(a[i+15:i], b[i+15:i])
//   ENDFOR
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pminsw
#define _m_pminsw(a, b) _mm_min_pi16((a), (b))
3411*3f1979aaSAndroid Build Coastguard Worker
3412*3f1979aaSAndroid Build Coastguard Worker // Computes the minima of the four single-precision, floating-point values of a
3413*3f1979aaSAndroid Build Coastguard Worker // and b.
3414*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/vstudio/wh13kadz(v=vs.100).aspx
_mm_min_ps(__m128 a,__m128 b)3415*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b)
3416*3f1979aaSAndroid Build Coastguard Worker {
3417*3f1979aaSAndroid Build Coastguard Worker #if SSE2NEON_PRECISE_MINMAX
3418*3f1979aaSAndroid Build Coastguard Worker float32x4_t _a = vreinterpretq_f32_m128(a);
3419*3f1979aaSAndroid Build Coastguard Worker float32x4_t _b = vreinterpretq_f32_m128(b);
3420*3f1979aaSAndroid Build Coastguard Worker return vbslq_f32(vcltq_f32(_a, _b), _a, _b);
3421*3f1979aaSAndroid Build Coastguard Worker #else
3422*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128_f32(
3423*3f1979aaSAndroid Build Coastguard Worker vminq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
3424*3f1979aaSAndroid Build Coastguard Worker #endif
3425*3f1979aaSAndroid Build Coastguard Worker }
3426*3f1979aaSAndroid Build Coastguard Worker
3427*3f1979aaSAndroid Build Coastguard Worker // Compare packed unsigned 8-bit integers in a and b, and store packed minimum
3428*3f1979aaSAndroid Build Coastguard Worker // values in dst.
3429*3f1979aaSAndroid Build Coastguard Worker //
3430*3f1979aaSAndroid Build Coastguard Worker // FOR j := 0 to 7
3431*3f1979aaSAndroid Build Coastguard Worker // i := j*8
3432*3f1979aaSAndroid Build Coastguard Worker // dst[i+7:i] := MIN(a[i+7:i], b[i+7:i])
3433*3f1979aaSAndroid Build Coastguard Worker // ENDFOR
3434*3f1979aaSAndroid Build Coastguard Worker //
3435*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pu8
_mm_min_pu8(__m64 a,__m64 b)3436*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m64 _mm_min_pu8(__m64 a, __m64 b)
3437*3f1979aaSAndroid Build Coastguard Worker {
3438*3f1979aaSAndroid Build Coastguard Worker return vreinterpret_m64_u8(
3439*3f1979aaSAndroid Build Coastguard Worker vmin_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
3440*3f1979aaSAndroid Build Coastguard Worker }
3441*3f1979aaSAndroid Build Coastguard Worker
// MMX-style alias that forwards to _mm_min_pu8: lane-wise minimum of the
// eight unsigned 8-bit integers in a and b.
//
//   FOR j := 0 to 7
//       i := j*8
//       dst[i+7:i] := MIN(a[i+7:i], b[i+7:i])
//   ENDFOR
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pminub
#define _m_pminub(a, b) _mm_min_pu8((a), (b))
3452*3f1979aaSAndroid Build Coastguard Worker
3453*3f1979aaSAndroid Build Coastguard Worker // Computes the maximum of the two lower scalar single-precision floating point
3454*3f1979aaSAndroid Build Coastguard Worker // values of a and b.
3455*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/s6db5esz(v=vs.100).aspx
_mm_max_ss(__m128 a,__m128 b)3456*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_max_ss(__m128 a, __m128 b)
3457*3f1979aaSAndroid Build Coastguard Worker {
3458*3f1979aaSAndroid Build Coastguard Worker float32_t value = vgetq_lane_f32(_mm_max_ps(a, b), 0);
3459*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128_f32(
3460*3f1979aaSAndroid Build Coastguard Worker vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
3461*3f1979aaSAndroid Build Coastguard Worker }
3462*3f1979aaSAndroid Build Coastguard Worker
3463*3f1979aaSAndroid Build Coastguard Worker // Computes the minimum of the two lower scalar single-precision floating point
3464*3f1979aaSAndroid Build Coastguard Worker // values of a and b.
3465*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/0a9y7xaa(v=vs.100).aspx
_mm_min_ss(__m128 a,__m128 b)3466*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_min_ss(__m128 a, __m128 b)
3467*3f1979aaSAndroid Build Coastguard Worker {
3468*3f1979aaSAndroid Build Coastguard Worker float32_t value = vgetq_lane_f32(_mm_min_ps(a, b), 0);
3469*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128_f32(
3470*3f1979aaSAndroid Build Coastguard Worker vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
3471*3f1979aaSAndroid Build Coastguard Worker }
3472*3f1979aaSAndroid Build Coastguard Worker
3473*3f1979aaSAndroid Build Coastguard Worker // Computes the pairwise maxima of the 16 unsigned 8-bit integers from a and the
3474*3f1979aaSAndroid Build Coastguard Worker // 16 unsigned 8-bit integers from b.
3475*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/st6634za(v=vs.100).aspx
_mm_max_epu8(__m128i a,__m128i b)3476*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b)
3477*3f1979aaSAndroid Build Coastguard Worker {
3478*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_u8(
3479*3f1979aaSAndroid Build Coastguard Worker vmaxq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
3480*3f1979aaSAndroid Build Coastguard Worker }
3481*3f1979aaSAndroid Build Coastguard Worker
3482*3f1979aaSAndroid Build Coastguard Worker // Computes the pairwise minima of the 16 unsigned 8-bit integers from a and the
3483*3f1979aaSAndroid Build Coastguard Worker // 16 unsigned 8-bit integers from b.
3484*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/ko-kr/library/17k8cf58(v=vs.100).aspxx
_mm_min_epu8(__m128i a,__m128i b)3485*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_min_epu8(__m128i a, __m128i b)
3486*3f1979aaSAndroid Build Coastguard Worker {
3487*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_u8(
3488*3f1979aaSAndroid Build Coastguard Worker vminq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
3489*3f1979aaSAndroid Build Coastguard Worker }
3490*3f1979aaSAndroid Build Coastguard Worker
3491*3f1979aaSAndroid Build Coastguard Worker // Computes the pairwise minima of the 8 signed 16-bit integers from a and the 8
3492*3f1979aaSAndroid Build Coastguard Worker // signed 16-bit integers from b.
3493*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/vstudio/6te997ew(v=vs.100).aspx
_mm_min_epi16(__m128i a,__m128i b)3494*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_min_epi16(__m128i a, __m128i b)
3495*3f1979aaSAndroid Build Coastguard Worker {
3496*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s16(
3497*3f1979aaSAndroid Build Coastguard Worker vminq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
3498*3f1979aaSAndroid Build Coastguard Worker }
3499*3f1979aaSAndroid Build Coastguard Worker
3500*3f1979aaSAndroid Build Coastguard Worker // Compare packed signed 8-bit integers in a and b, and store packed maximum
3501*3f1979aaSAndroid Build Coastguard Worker // values in dst.
3502*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epi8
_mm_max_epi8(__m128i a,__m128i b)3503*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_max_epi8(__m128i a, __m128i b)
3504*3f1979aaSAndroid Build Coastguard Worker {
3505*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s8(
3506*3f1979aaSAndroid Build Coastguard Worker vmaxq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
3507*3f1979aaSAndroid Build Coastguard Worker }
3508*3f1979aaSAndroid Build Coastguard Worker
3509*3f1979aaSAndroid Build Coastguard Worker // Computes the pairwise maxima of the 8 signed 16-bit integers from a and the 8
3510*3f1979aaSAndroid Build Coastguard Worker // signed 16-bit integers from b.
3511*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/LIBRary/3x060h7c(v=vs.100).aspx
_mm_max_epi16(__m128i a,__m128i b)3512*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_max_epi16(__m128i a, __m128i b)
3513*3f1979aaSAndroid Build Coastguard Worker {
3514*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s16(
3515*3f1979aaSAndroid Build Coastguard Worker vmaxq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
3516*3f1979aaSAndroid Build Coastguard Worker }
3517*3f1979aaSAndroid Build Coastguard Worker
3518*3f1979aaSAndroid Build Coastguard Worker // epi versions of min/max
3519*3f1979aaSAndroid Build Coastguard Worker // Computes the pariwise maximums of the four signed 32-bit integer values of a
3520*3f1979aaSAndroid Build Coastguard Worker // and b.
3521*3f1979aaSAndroid Build Coastguard Worker //
3522*3f1979aaSAndroid Build Coastguard Worker // A 128-bit parameter that can be defined with the following equations:
3523*3f1979aaSAndroid Build Coastguard Worker // r0 := (a0 > b0) ? a0 : b0
3524*3f1979aaSAndroid Build Coastguard Worker // r1 := (a1 > b1) ? a1 : b1
3525*3f1979aaSAndroid Build Coastguard Worker // r2 := (a2 > b2) ? a2 : b2
3526*3f1979aaSAndroid Build Coastguard Worker // r3 := (a3 > b3) ? a3 : b3
3527*3f1979aaSAndroid Build Coastguard Worker //
3528*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/vstudio/bb514055(v=vs.100).aspx
_mm_max_epi32(__m128i a,__m128i b)3529*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_max_epi32(__m128i a, __m128i b)
3530*3f1979aaSAndroid Build Coastguard Worker {
3531*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s32(
3532*3f1979aaSAndroid Build Coastguard Worker vmaxq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
3533*3f1979aaSAndroid Build Coastguard Worker }
3534*3f1979aaSAndroid Build Coastguard Worker
3535*3f1979aaSAndroid Build Coastguard Worker // Computes the pariwise minima of the four signed 32-bit integer values of a
3536*3f1979aaSAndroid Build Coastguard Worker // and b.
3537*3f1979aaSAndroid Build Coastguard Worker //
3538*3f1979aaSAndroid Build Coastguard Worker // A 128-bit parameter that can be defined with the following equations:
3539*3f1979aaSAndroid Build Coastguard Worker // r0 := (a0 < b0) ? a0 : b0
3540*3f1979aaSAndroid Build Coastguard Worker // r1 := (a1 < b1) ? a1 : b1
3541*3f1979aaSAndroid Build Coastguard Worker // r2 := (a2 < b2) ? a2 : b2
3542*3f1979aaSAndroid Build Coastguard Worker // r3 := (a3 < b3) ? a3 : b3
3543*3f1979aaSAndroid Build Coastguard Worker //
3544*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/vstudio/bb531476(v=vs.100).aspx
_mm_min_epi32(__m128i a,__m128i b)3545*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_min_epi32(__m128i a, __m128i b)
3546*3f1979aaSAndroid Build Coastguard Worker {
3547*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s32(
3548*3f1979aaSAndroid Build Coastguard Worker vminq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
3549*3f1979aaSAndroid Build Coastguard Worker }
3550*3f1979aaSAndroid Build Coastguard Worker
3551*3f1979aaSAndroid Build Coastguard Worker // Compare packed unsigned 32-bit integers in a and b, and store packed maximum
3552*3f1979aaSAndroid Build Coastguard Worker // values in dst.
3553*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu32
_mm_max_epu32(__m128i a,__m128i b)3554*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_max_epu32(__m128i a, __m128i b)
3555*3f1979aaSAndroid Build Coastguard Worker {
3556*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_u32(
3557*3f1979aaSAndroid Build Coastguard Worker vmaxq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));
3558*3f1979aaSAndroid Build Coastguard Worker }
3559*3f1979aaSAndroid Build Coastguard Worker
3560*3f1979aaSAndroid Build Coastguard Worker // Compare packed unsigned 32-bit integers in a and b, and store packed minimum
3561*3f1979aaSAndroid Build Coastguard Worker // values in dst.
3562*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu32
_mm_min_epu32(__m128i a,__m128i b)3563*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_min_epu32(__m128i a, __m128i b)
3564*3f1979aaSAndroid Build Coastguard Worker {
3565*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_u32(
3566*3f1979aaSAndroid Build Coastguard Worker vminq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));
3567*3f1979aaSAndroid Build Coastguard Worker }
3568*3f1979aaSAndroid Build Coastguard Worker
3569*3f1979aaSAndroid Build Coastguard Worker // Multiply the packed unsigned 16-bit integers in a and b, producing
3570*3f1979aaSAndroid Build Coastguard Worker // intermediate 32-bit integers, and store the high 16 bits of the intermediate
3571*3f1979aaSAndroid Build Coastguard Worker // integers in dst.
3572*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhi_pu16
_mm_mulhi_pu16(__m64 a,__m64 b)3573*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m64 _mm_mulhi_pu16(__m64 a, __m64 b)
3574*3f1979aaSAndroid Build Coastguard Worker {
3575*3f1979aaSAndroid Build Coastguard Worker return vreinterpret_m64_u16(vshrn_n_u32(
3576*3f1979aaSAndroid Build Coastguard Worker vmull_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)), 16));
3577*3f1979aaSAndroid Build Coastguard Worker }
3578*3f1979aaSAndroid Build Coastguard Worker
3579*3f1979aaSAndroid Build Coastguard Worker // Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit
3580*3f1979aaSAndroid Build Coastguard Worker // integers from b.
3581*3f1979aaSAndroid Build Coastguard Worker //
3582*3f1979aaSAndroid Build Coastguard Worker // r0 := (a0 * b0)[31:16]
3583*3f1979aaSAndroid Build Coastguard Worker // r1 := (a1 * b1)[31:16]
3584*3f1979aaSAndroid Build Coastguard Worker // ...
3585*3f1979aaSAndroid Build Coastguard Worker // r7 := (a7 * b7)[31:16]
3586*3f1979aaSAndroid Build Coastguard Worker //
3587*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/vstudio/59hddw1d(v=vs.100).aspx
_mm_mulhi_epi16(__m128i a,__m128i b)3588*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_mulhi_epi16(__m128i a, __m128i b)
3589*3f1979aaSAndroid Build Coastguard Worker {
3590*3f1979aaSAndroid Build Coastguard Worker /* FIXME: issue with large values because of result saturation */
3591*3f1979aaSAndroid Build Coastguard Worker // int16x8_t ret = vqdmulhq_s16(vreinterpretq_s16_m128i(a),
3592*3f1979aaSAndroid Build Coastguard Worker // vreinterpretq_s16_m128i(b)); /* =2*a*b */ return
3593*3f1979aaSAndroid Build Coastguard Worker // vreinterpretq_m128i_s16(vshrq_n_s16(ret, 1));
3594*3f1979aaSAndroid Build Coastguard Worker int16x4_t a3210 = vget_low_s16(vreinterpretq_s16_m128i(a));
3595*3f1979aaSAndroid Build Coastguard Worker int16x4_t b3210 = vget_low_s16(vreinterpretq_s16_m128i(b));
3596*3f1979aaSAndroid Build Coastguard Worker int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */
3597*3f1979aaSAndroid Build Coastguard Worker int16x4_t a7654 = vget_high_s16(vreinterpretq_s16_m128i(a));
3598*3f1979aaSAndroid Build Coastguard Worker int16x4_t b7654 = vget_high_s16(vreinterpretq_s16_m128i(b));
3599*3f1979aaSAndroid Build Coastguard Worker int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */
3600*3f1979aaSAndroid Build Coastguard Worker uint16x8x2_t r =
3601*3f1979aaSAndroid Build Coastguard Worker vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654));
3602*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_u16(r.val[1]);
3603*3f1979aaSAndroid Build Coastguard Worker }
3604*3f1979aaSAndroid Build Coastguard Worker
3605*3f1979aaSAndroid Build Coastguard Worker // Computes pairwise add of each argument as single-precision, floating-point
3606*3f1979aaSAndroid Build Coastguard Worker // values a and b.
3607*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/yd9wecaa.aspx
_mm_hadd_ps(__m128 a,__m128 b)3608*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b)
3609*3f1979aaSAndroid Build Coastguard Worker {
3610*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
3611*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128_f32(
3612*3f1979aaSAndroid Build Coastguard Worker vpaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
3613*3f1979aaSAndroid Build Coastguard Worker #else
3614*3f1979aaSAndroid Build Coastguard Worker float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
3615*3f1979aaSAndroid Build Coastguard Worker float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
3616*3f1979aaSAndroid Build Coastguard Worker float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
3617*3f1979aaSAndroid Build Coastguard Worker float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
3618*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128_f32(
3619*3f1979aaSAndroid Build Coastguard Worker vcombine_f32(vpadd_f32(a10, a32), vpadd_f32(b10, b32)));
3620*3f1979aaSAndroid Build Coastguard Worker #endif
3621*3f1979aaSAndroid Build Coastguard Worker }
3622*3f1979aaSAndroid Build Coastguard Worker
3623*3f1979aaSAndroid Build Coastguard Worker // Computes pairwise add of each argument as a 16-bit signed or unsigned integer
3624*3f1979aaSAndroid Build Coastguard Worker // values a and b.
_mm_hadd_epi16(__m128i _a,__m128i _b)3625*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_hadd_epi16(__m128i _a, __m128i _b)
3626*3f1979aaSAndroid Build Coastguard Worker {
3627*3f1979aaSAndroid Build Coastguard Worker int16x8_t a = vreinterpretq_s16_m128i(_a);
3628*3f1979aaSAndroid Build Coastguard Worker int16x8_t b = vreinterpretq_s16_m128i(_b);
3629*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
3630*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s16(vpaddq_s16(a, b));
3631*3f1979aaSAndroid Build Coastguard Worker #else
3632*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s16(
3633*3f1979aaSAndroid Build Coastguard Worker vcombine_s16(vpadd_s16(vget_low_s16(a), vget_high_s16(a)),
3634*3f1979aaSAndroid Build Coastguard Worker vpadd_s16(vget_low_s16(b), vget_high_s16(b))));
3635*3f1979aaSAndroid Build Coastguard Worker #endif
3636*3f1979aaSAndroid Build Coastguard Worker }
3637*3f1979aaSAndroid Build Coastguard Worker
3638*3f1979aaSAndroid Build Coastguard Worker // Horizontally substract adjacent pairs of single-precision (32-bit)
3639*3f1979aaSAndroid Build Coastguard Worker // floating-point elements in a and b, and pack the results in dst.
3640*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_ps
_mm_hsub_ps(__m128 _a,__m128 _b)3641*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_hsub_ps(__m128 _a, __m128 _b)
3642*3f1979aaSAndroid Build Coastguard Worker {
3643*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
3644*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128_f32(vsubq_f32(
3645*3f1979aaSAndroid Build Coastguard Worker vuzp1q_f32(vreinterpretq_f32_m128(_a), vreinterpretq_f32_m128(_b)),
3646*3f1979aaSAndroid Build Coastguard Worker vuzp2q_f32(vreinterpretq_f32_m128(_a), vreinterpretq_f32_m128(_b))));
3647*3f1979aaSAndroid Build Coastguard Worker #else
3648*3f1979aaSAndroid Build Coastguard Worker float32x4x2_t c =
3649*3f1979aaSAndroid Build Coastguard Worker vuzpq_f32(vreinterpretq_f32_m128(_a), vreinterpretq_f32_m128(_b));
3650*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128_f32(vsubq_f32(c.val[0], c.val[1]));
3651*3f1979aaSAndroid Build Coastguard Worker #endif
3652*3f1979aaSAndroid Build Coastguard Worker }
3653*3f1979aaSAndroid Build Coastguard Worker
3654*3f1979aaSAndroid Build Coastguard Worker // Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the
3655*3f1979aaSAndroid Build Coastguard Worker // signed 16-bit results in dst.
3656*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pi16
_mm_hadd_pi16(__m64 a,__m64 b)3657*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m64 _mm_hadd_pi16(__m64 a, __m64 b)
3658*3f1979aaSAndroid Build Coastguard Worker {
3659*3f1979aaSAndroid Build Coastguard Worker return vreinterpret_m64_s16(
3660*3f1979aaSAndroid Build Coastguard Worker vpadd_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
3661*3f1979aaSAndroid Build Coastguard Worker }
3662*3f1979aaSAndroid Build Coastguard Worker
3663*3f1979aaSAndroid Build Coastguard Worker // Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the
3664*3f1979aaSAndroid Build Coastguard Worker // signed 32-bit results in dst.
3665*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pi32
_mm_hadd_pi32(__m64 a,__m64 b)3666*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m64 _mm_hadd_pi32(__m64 a, __m64 b)
3667*3f1979aaSAndroid Build Coastguard Worker {
3668*3f1979aaSAndroid Build Coastguard Worker return vreinterpret_m64_s32(
3669*3f1979aaSAndroid Build Coastguard Worker vpadd_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b)));
3670*3f1979aaSAndroid Build Coastguard Worker }
3671*3f1979aaSAndroid Build Coastguard Worker
3672*3f1979aaSAndroid Build Coastguard Worker // Computes pairwise difference of each argument as a 16-bit signed or unsigned
3673*3f1979aaSAndroid Build Coastguard Worker // integer values a and b.
_mm_hsub_epi16(__m128i _a,__m128i _b)3674*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_hsub_epi16(__m128i _a, __m128i _b)
3675*3f1979aaSAndroid Build Coastguard Worker {
3676*3f1979aaSAndroid Build Coastguard Worker int32x4_t a = vreinterpretq_s32_m128i(_a);
3677*3f1979aaSAndroid Build Coastguard Worker int32x4_t b = vreinterpretq_s32_m128i(_b);
3678*3f1979aaSAndroid Build Coastguard Worker // Interleave using vshrn/vmovn
3679*3f1979aaSAndroid Build Coastguard Worker // [a0|a2|a4|a6|b0|b2|b4|b6]
3680*3f1979aaSAndroid Build Coastguard Worker // [a1|a3|a5|a7|b1|b3|b5|b7]
3681*3f1979aaSAndroid Build Coastguard Worker int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
3682*3f1979aaSAndroid Build Coastguard Worker int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
3683*3f1979aaSAndroid Build Coastguard Worker // Subtract
3684*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s16(vsubq_s16(ab0246, ab1357));
3685*3f1979aaSAndroid Build Coastguard Worker }
3686*3f1979aaSAndroid Build Coastguard Worker
3687*3f1979aaSAndroid Build Coastguard Worker // Computes saturated pairwise sub of each argument as a 16-bit signed
3688*3f1979aaSAndroid Build Coastguard Worker // integer values a and b.
_mm_hadds_epi16(__m128i _a,__m128i _b)3689*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_hadds_epi16(__m128i _a, __m128i _b)
3690*3f1979aaSAndroid Build Coastguard Worker {
3691*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
3692*3f1979aaSAndroid Build Coastguard Worker int16x8_t a = vreinterpretq_s16_m128i(_a);
3693*3f1979aaSAndroid Build Coastguard Worker int16x8_t b = vreinterpretq_s16_m128i(_b);
3694*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_s64_s16(
3695*3f1979aaSAndroid Build Coastguard Worker vqaddq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
3696*3f1979aaSAndroid Build Coastguard Worker #else
3697*3f1979aaSAndroid Build Coastguard Worker int32x4_t a = vreinterpretq_s32_m128i(_a);
3698*3f1979aaSAndroid Build Coastguard Worker int32x4_t b = vreinterpretq_s32_m128i(_b);
3699*3f1979aaSAndroid Build Coastguard Worker // Interleave using vshrn/vmovn
3700*3f1979aaSAndroid Build Coastguard Worker // [a0|a2|a4|a6|b0|b2|b4|b6]
3701*3f1979aaSAndroid Build Coastguard Worker // [a1|a3|a5|a7|b1|b3|b5|b7]
3702*3f1979aaSAndroid Build Coastguard Worker int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
3703*3f1979aaSAndroid Build Coastguard Worker int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
3704*3f1979aaSAndroid Build Coastguard Worker // Saturated add
3705*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s16(vqaddq_s16(ab0246, ab1357));
3706*3f1979aaSAndroid Build Coastguard Worker #endif
3707*3f1979aaSAndroid Build Coastguard Worker }
3708*3f1979aaSAndroid Build Coastguard Worker
3709*3f1979aaSAndroid Build Coastguard Worker // Computes saturated pairwise difference of each argument as a 16-bit signed
3710*3f1979aaSAndroid Build Coastguard Worker // integer values a and b.
3711*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsubs_epi16
_mm_hsubs_epi16(__m128i _a,__m128i _b)3712*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_hsubs_epi16(__m128i _a, __m128i _b)
3713*3f1979aaSAndroid Build Coastguard Worker {
3714*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
3715*3f1979aaSAndroid Build Coastguard Worker int16x8_t a = vreinterpretq_s16_m128i(_a);
3716*3f1979aaSAndroid Build Coastguard Worker int16x8_t b = vreinterpretq_s16_m128i(_b);
3717*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_s64_s16(
3718*3f1979aaSAndroid Build Coastguard Worker vqsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
3719*3f1979aaSAndroid Build Coastguard Worker #else
3720*3f1979aaSAndroid Build Coastguard Worker int32x4_t a = vreinterpretq_s32_m128i(_a);
3721*3f1979aaSAndroid Build Coastguard Worker int32x4_t b = vreinterpretq_s32_m128i(_b);
3722*3f1979aaSAndroid Build Coastguard Worker // Interleave using vshrn/vmovn
3723*3f1979aaSAndroid Build Coastguard Worker // [a0|a2|a4|a6|b0|b2|b4|b6]
3724*3f1979aaSAndroid Build Coastguard Worker // [a1|a3|a5|a7|b1|b3|b5|b7]
3725*3f1979aaSAndroid Build Coastguard Worker int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
3726*3f1979aaSAndroid Build Coastguard Worker int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
3727*3f1979aaSAndroid Build Coastguard Worker // Saturated subtract
3728*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s16(vqsubq_s16(ab0246, ab1357));
3729*3f1979aaSAndroid Build Coastguard Worker #endif
3730*3f1979aaSAndroid Build Coastguard Worker }
3731*3f1979aaSAndroid Build Coastguard Worker
3732*3f1979aaSAndroid Build Coastguard Worker // Computes pairwise add of each argument as a 32-bit signed or unsigned integer
3733*3f1979aaSAndroid Build Coastguard Worker // values a and b.
_mm_hadd_epi32(__m128i _a,__m128i _b)3734*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_hadd_epi32(__m128i _a, __m128i _b)
3735*3f1979aaSAndroid Build Coastguard Worker {
3736*3f1979aaSAndroid Build Coastguard Worker int32x4_t a = vreinterpretq_s32_m128i(_a);
3737*3f1979aaSAndroid Build Coastguard Worker int32x4_t b = vreinterpretq_s32_m128i(_b);
3738*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s32(
3739*3f1979aaSAndroid Build Coastguard Worker vcombine_s32(vpadd_s32(vget_low_s32(a), vget_high_s32(a)),
3740*3f1979aaSAndroid Build Coastguard Worker vpadd_s32(vget_low_s32(b), vget_high_s32(b))));
3741*3f1979aaSAndroid Build Coastguard Worker }
3742*3f1979aaSAndroid Build Coastguard Worker
3743*3f1979aaSAndroid Build Coastguard Worker // Computes pairwise difference of each argument as a 32-bit signed or unsigned
3744*3f1979aaSAndroid Build Coastguard Worker // integer values a and b.
_mm_hsub_epi32(__m128i _a,__m128i _b)3745*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_hsub_epi32(__m128i _a, __m128i _b)
3746*3f1979aaSAndroid Build Coastguard Worker {
3747*3f1979aaSAndroid Build Coastguard Worker int64x2_t a = vreinterpretq_s64_m128i(_a);
3748*3f1979aaSAndroid Build Coastguard Worker int64x2_t b = vreinterpretq_s64_m128i(_b);
3749*3f1979aaSAndroid Build Coastguard Worker // Interleave using vshrn/vmovn
3750*3f1979aaSAndroid Build Coastguard Worker // [a0|a2|b0|b2]
3751*3f1979aaSAndroid Build Coastguard Worker // [a1|a2|b1|b3]
3752*3f1979aaSAndroid Build Coastguard Worker int32x4_t ab02 = vcombine_s32(vmovn_s64(a), vmovn_s64(b));
3753*3f1979aaSAndroid Build Coastguard Worker int32x4_t ab13 = vcombine_s32(vshrn_n_s64(a, 32), vshrn_n_s64(b, 32));
3754*3f1979aaSAndroid Build Coastguard Worker // Subtract
3755*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s32(vsubq_s32(ab02, ab13));
3756*3f1979aaSAndroid Build Coastguard Worker }
3757*3f1979aaSAndroid Build Coastguard Worker
3758*3f1979aaSAndroid Build Coastguard Worker // Kahan summation for accurate summation of floating-point numbers.
3759*3f1979aaSAndroid Build Coastguard Worker // http://blog.zachbjornson.com/2019/08/11/fast-float-summation.html
sse2neon_kadd_f32(float * sum,float * c,float y)3760*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE void sse2neon_kadd_f32(float *sum, float *c, float y)
3761*3f1979aaSAndroid Build Coastguard Worker {
3762*3f1979aaSAndroid Build Coastguard Worker y -= *c;
3763*3f1979aaSAndroid Build Coastguard Worker float t = *sum + y;
3764*3f1979aaSAndroid Build Coastguard Worker *c = (t - *sum) - y;
3765*3f1979aaSAndroid Build Coastguard Worker *sum = t;
3766*3f1979aaSAndroid Build Coastguard Worker }
3767*3f1979aaSAndroid Build Coastguard Worker
3768*3f1979aaSAndroid Build Coastguard Worker // Conditionally multiply the packed single-precision (32-bit) floating-point
3769*3f1979aaSAndroid Build Coastguard Worker // elements in a and b using the high 4 bits in imm8, sum the four products,
3770*3f1979aaSAndroid Build Coastguard Worker // and conditionally store the sum in dst using the low 4 bits of imm.
3771*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dp_ps
_mm_dp_ps(__m128 a,__m128 b,const int imm)3772*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm)
3773*3f1979aaSAndroid Build Coastguard Worker {
3774*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
3775*3f1979aaSAndroid Build Coastguard Worker /* shortcuts */
3776*3f1979aaSAndroid Build Coastguard Worker if (imm == 0xFF) {
3777*3f1979aaSAndroid Build Coastguard Worker return _mm_set1_ps(vaddvq_f32(_mm_mul_ps(a, b)));
3778*3f1979aaSAndroid Build Coastguard Worker }
3779*3f1979aaSAndroid Build Coastguard Worker if (imm == 0x7F) {
3780*3f1979aaSAndroid Build Coastguard Worker float32x4_t m = _mm_mul_ps(a, b);
3781*3f1979aaSAndroid Build Coastguard Worker m[3] = 0;
3782*3f1979aaSAndroid Build Coastguard Worker return _mm_set1_ps(vaddvq_f32(m));
3783*3f1979aaSAndroid Build Coastguard Worker }
3784*3f1979aaSAndroid Build Coastguard Worker #endif
3785*3f1979aaSAndroid Build Coastguard Worker
3786*3f1979aaSAndroid Build Coastguard Worker float s = 0, c = 0;
3787*3f1979aaSAndroid Build Coastguard Worker float32x4_t f32a = vreinterpretq_f32_m128(a);
3788*3f1979aaSAndroid Build Coastguard Worker float32x4_t f32b = vreinterpretq_f32_m128(b);
3789*3f1979aaSAndroid Build Coastguard Worker
3790*3f1979aaSAndroid Build Coastguard Worker /* To improve the accuracy of floating-point summation, Kahan algorithm
3791*3f1979aaSAndroid Build Coastguard Worker * is used for each operation.
3792*3f1979aaSAndroid Build Coastguard Worker */
3793*3f1979aaSAndroid Build Coastguard Worker if (imm & (1 << 4))
3794*3f1979aaSAndroid Build Coastguard Worker sse2neon_kadd_f32(&s, &c, f32a[0] * f32b[0]);
3795*3f1979aaSAndroid Build Coastguard Worker if (imm & (1 << 5))
3796*3f1979aaSAndroid Build Coastguard Worker sse2neon_kadd_f32(&s, &c, f32a[1] * f32b[1]);
3797*3f1979aaSAndroid Build Coastguard Worker if (imm & (1 << 6))
3798*3f1979aaSAndroid Build Coastguard Worker sse2neon_kadd_f32(&s, &c, f32a[2] * f32b[2]);
3799*3f1979aaSAndroid Build Coastguard Worker if (imm & (1 << 7))
3800*3f1979aaSAndroid Build Coastguard Worker sse2neon_kadd_f32(&s, &c, f32a[3] * f32b[3]);
3801*3f1979aaSAndroid Build Coastguard Worker s += c;
3802*3f1979aaSAndroid Build Coastguard Worker
3803*3f1979aaSAndroid Build Coastguard Worker float32x4_t res = {
3804*3f1979aaSAndroid Build Coastguard Worker (imm & 0x1) ? s : 0,
3805*3f1979aaSAndroid Build Coastguard Worker (imm & 0x2) ? s : 0,
3806*3f1979aaSAndroid Build Coastguard Worker (imm & 0x4) ? s : 0,
3807*3f1979aaSAndroid Build Coastguard Worker (imm & 0x8) ? s : 0,
3808*3f1979aaSAndroid Build Coastguard Worker };
3809*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128_f32(res);
3810*3f1979aaSAndroid Build Coastguard Worker }
3811*3f1979aaSAndroid Build Coastguard Worker
3812*3f1979aaSAndroid Build Coastguard Worker /* Compare operations */
3813*3f1979aaSAndroid Build Coastguard Worker
3814*3f1979aaSAndroid Build Coastguard Worker // Compares for less than
3815*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/vstudio/f330yhc8(v=vs.100).aspx
_mm_cmplt_ps(__m128 a,__m128 b)3816*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b)
3817*3f1979aaSAndroid Build Coastguard Worker {
3818*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128_u32(
3819*3f1979aaSAndroid Build Coastguard Worker vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
3820*3f1979aaSAndroid Build Coastguard Worker }
3821*3f1979aaSAndroid Build Coastguard Worker
3822*3f1979aaSAndroid Build Coastguard Worker // Compares for less than
3823*3f1979aaSAndroid Build Coastguard Worker // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/fy94wye7(v=vs.100)
_mm_cmplt_ss(__m128 a,__m128 b)3824*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_cmplt_ss(__m128 a, __m128 b)
3825*3f1979aaSAndroid Build Coastguard Worker {
3826*3f1979aaSAndroid Build Coastguard Worker return _mm_move_ss(a, _mm_cmplt_ps(a, b));
3827*3f1979aaSAndroid Build Coastguard Worker }
3828*3f1979aaSAndroid Build Coastguard Worker
3829*3f1979aaSAndroid Build Coastguard Worker // Compares for greater than.
3830*3f1979aaSAndroid Build Coastguard Worker //
3831*3f1979aaSAndroid Build Coastguard Worker // r0 := (a0 > b0) ? 0xffffffff : 0x0
3832*3f1979aaSAndroid Build Coastguard Worker // r1 := (a1 > b1) ? 0xffffffff : 0x0
3833*3f1979aaSAndroid Build Coastguard Worker // r2 := (a2 > b2) ? 0xffffffff : 0x0
3834*3f1979aaSAndroid Build Coastguard Worker // r3 := (a3 > b3) ? 0xffffffff : 0x0
3835*3f1979aaSAndroid Build Coastguard Worker //
3836*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/vstudio/11dy102s(v=vs.100).aspx
_mm_cmpgt_ps(__m128 a,__m128 b)3837*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b)
3838*3f1979aaSAndroid Build Coastguard Worker {
3839*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128_u32(
3840*3f1979aaSAndroid Build Coastguard Worker vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
3841*3f1979aaSAndroid Build Coastguard Worker }
3842*3f1979aaSAndroid Build Coastguard Worker
3843*3f1979aaSAndroid Build Coastguard Worker // Compares for greater than.
3844*3f1979aaSAndroid Build Coastguard Worker // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/1xyyyy9e(v=vs.100)
_mm_cmpgt_ss(__m128 a,__m128 b)3845*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_cmpgt_ss(__m128 a, __m128 b)
3846*3f1979aaSAndroid Build Coastguard Worker {
3847*3f1979aaSAndroid Build Coastguard Worker return _mm_move_ss(a, _mm_cmpgt_ps(a, b));
3848*3f1979aaSAndroid Build Coastguard Worker }
3849*3f1979aaSAndroid Build Coastguard Worker
3850*3f1979aaSAndroid Build Coastguard Worker // Compares for greater than or equal.
3851*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/vstudio/fs813y2t(v=vs.100).aspx
_mm_cmpge_ps(__m128 a,__m128 b)3852*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_cmpge_ps(__m128 a, __m128 b)
3853*3f1979aaSAndroid Build Coastguard Worker {
3854*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128_u32(
3855*3f1979aaSAndroid Build Coastguard Worker vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
3856*3f1979aaSAndroid Build Coastguard Worker }
3857*3f1979aaSAndroid Build Coastguard Worker
3858*3f1979aaSAndroid Build Coastguard Worker // Compares for greater than or equal.
3859*3f1979aaSAndroid Build Coastguard Worker // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/kesh3ddc(v=vs.100)
_mm_cmpge_ss(__m128 a,__m128 b)3860*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_cmpge_ss(__m128 a, __m128 b)
3861*3f1979aaSAndroid Build Coastguard Worker {
3862*3f1979aaSAndroid Build Coastguard Worker return _mm_move_ss(a, _mm_cmpge_ps(a, b));
3863*3f1979aaSAndroid Build Coastguard Worker }
3864*3f1979aaSAndroid Build Coastguard Worker
3865*3f1979aaSAndroid Build Coastguard Worker // Compares for less than or equal.
3866*3f1979aaSAndroid Build Coastguard Worker //
3867*3f1979aaSAndroid Build Coastguard Worker // r0 := (a0 <= b0) ? 0xffffffff : 0x0
3868*3f1979aaSAndroid Build Coastguard Worker // r1 := (a1 <= b1) ? 0xffffffff : 0x0
3869*3f1979aaSAndroid Build Coastguard Worker // r2 := (a2 <= b2) ? 0xffffffff : 0x0
3870*3f1979aaSAndroid Build Coastguard Worker // r3 := (a3 <= b3) ? 0xffffffff : 0x0
3871*3f1979aaSAndroid Build Coastguard Worker //
3872*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/vstudio/1s75w83z(v=vs.100).aspx
_mm_cmple_ps(__m128 a,__m128 b)3873*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_cmple_ps(__m128 a, __m128 b)
3874*3f1979aaSAndroid Build Coastguard Worker {
3875*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128_u32(
3876*3f1979aaSAndroid Build Coastguard Worker vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
3877*3f1979aaSAndroid Build Coastguard Worker }
3878*3f1979aaSAndroid Build Coastguard Worker
3879*3f1979aaSAndroid Build Coastguard Worker // Compares for less than or equal.
3880*3f1979aaSAndroid Build Coastguard Worker // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/a7x0hbhw(v=vs.100)
_mm_cmple_ss(__m128 a,__m128 b)3881*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_cmple_ss(__m128 a, __m128 b)
3882*3f1979aaSAndroid Build Coastguard Worker {
3883*3f1979aaSAndroid Build Coastguard Worker return _mm_move_ss(a, _mm_cmple_ps(a, b));
3884*3f1979aaSAndroid Build Coastguard Worker }
3885*3f1979aaSAndroid Build Coastguard Worker
3886*3f1979aaSAndroid Build Coastguard Worker // Compares for equality.
3887*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/vstudio/36aectz5(v=vs.100).aspx
_mm_cmpeq_ps(__m128 a,__m128 b)3888*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_cmpeq_ps(__m128 a, __m128 b)
3889*3f1979aaSAndroid Build Coastguard Worker {
3890*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128_u32(
3891*3f1979aaSAndroid Build Coastguard Worker vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
3892*3f1979aaSAndroid Build Coastguard Worker }
3893*3f1979aaSAndroid Build Coastguard Worker
3894*3f1979aaSAndroid Build Coastguard Worker // Compares for equality.
3895*3f1979aaSAndroid Build Coastguard Worker // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/k423z28e(v=vs.100)
_mm_cmpeq_ss(__m128 a,__m128 b)3896*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_cmpeq_ss(__m128 a, __m128 b)
3897*3f1979aaSAndroid Build Coastguard Worker {
3898*3f1979aaSAndroid Build Coastguard Worker return _mm_move_ss(a, _mm_cmpeq_ps(a, b));
3899*3f1979aaSAndroid Build Coastguard Worker }
3900*3f1979aaSAndroid Build Coastguard Worker
3901*3f1979aaSAndroid Build Coastguard Worker // Compares for inequality.
3902*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/sf44thbx(v=vs.100).aspx
_mm_cmpneq_ps(__m128 a,__m128 b)3903*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_cmpneq_ps(__m128 a, __m128 b)
3904*3f1979aaSAndroid Build Coastguard Worker {
3905*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128_u32(vmvnq_u32(
3906*3f1979aaSAndroid Build Coastguard Worker vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
3907*3f1979aaSAndroid Build Coastguard Worker }
3908*3f1979aaSAndroid Build Coastguard Worker
3909*3f1979aaSAndroid Build Coastguard Worker // Compares for inequality.
3910*3f1979aaSAndroid Build Coastguard Worker // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/ekya8fh4(v=vs.100)
_mm_cmpneq_ss(__m128 a,__m128 b)3911*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_cmpneq_ss(__m128 a, __m128 b)
3912*3f1979aaSAndroid Build Coastguard Worker {
3913*3f1979aaSAndroid Build Coastguard Worker return _mm_move_ss(a, _mm_cmpneq_ps(a, b));
3914*3f1979aaSAndroid Build Coastguard Worker }
3915*3f1979aaSAndroid Build Coastguard Worker
3916*3f1979aaSAndroid Build Coastguard Worker // Compares for not greater than or equal.
3917*3f1979aaSAndroid Build Coastguard Worker // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/wsexys62(v=vs.100)
_mm_cmpnge_ps(__m128 a,__m128 b)3918*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_cmpnge_ps(__m128 a, __m128 b)
3919*3f1979aaSAndroid Build Coastguard Worker {
3920*3f1979aaSAndroid Build Coastguard Worker return _mm_cmplt_ps(a, b);
3921*3f1979aaSAndroid Build Coastguard Worker }
3922*3f1979aaSAndroid Build Coastguard Worker
3923*3f1979aaSAndroid Build Coastguard Worker // Compares for not greater than or equal.
3924*3f1979aaSAndroid Build Coastguard Worker // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/fk2y80s8(v=vs.100)
_mm_cmpnge_ss(__m128 a,__m128 b)3925*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_cmpnge_ss(__m128 a, __m128 b)
3926*3f1979aaSAndroid Build Coastguard Worker {
3927*3f1979aaSAndroid Build Coastguard Worker return _mm_cmplt_ss(a, b);
3928*3f1979aaSAndroid Build Coastguard Worker }
3929*3f1979aaSAndroid Build Coastguard Worker
3930*3f1979aaSAndroid Build Coastguard Worker // Compares for not greater than.
3931*3f1979aaSAndroid Build Coastguard Worker // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/d0xh7w0s(v=vs.100)
_mm_cmpngt_ps(__m128 a,__m128 b)3932*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_cmpngt_ps(__m128 a, __m128 b)
3933*3f1979aaSAndroid Build Coastguard Worker {
3934*3f1979aaSAndroid Build Coastguard Worker return _mm_cmple_ps(a, b);
3935*3f1979aaSAndroid Build Coastguard Worker }
3936*3f1979aaSAndroid Build Coastguard Worker
3937*3f1979aaSAndroid Build Coastguard Worker // Compares for not greater than.
3938*3f1979aaSAndroid Build Coastguard Worker // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/z7x9ydwh(v=vs.100)
_mm_cmpngt_ss(__m128 a,__m128 b)3939*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_cmpngt_ss(__m128 a, __m128 b)
3940*3f1979aaSAndroid Build Coastguard Worker {
3941*3f1979aaSAndroid Build Coastguard Worker return _mm_cmple_ss(a, b);
3942*3f1979aaSAndroid Build Coastguard Worker }
3943*3f1979aaSAndroid Build Coastguard Worker
3944*3f1979aaSAndroid Build Coastguard Worker // Compares for not less than or equal.
3945*3f1979aaSAndroid Build Coastguard Worker // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/6a330kxw(v=vs.100)
_mm_cmpnle_ps(__m128 a,__m128 b)3946*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_cmpnle_ps(__m128 a, __m128 b)
3947*3f1979aaSAndroid Build Coastguard Worker {
3948*3f1979aaSAndroid Build Coastguard Worker return _mm_cmpgt_ps(a, b);
3949*3f1979aaSAndroid Build Coastguard Worker }
3950*3f1979aaSAndroid Build Coastguard Worker
3951*3f1979aaSAndroid Build Coastguard Worker // Compares for not less than or equal.
3952*3f1979aaSAndroid Build Coastguard Worker // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/z7x9ydwh(v=vs.100)
_mm_cmpnle_ss(__m128 a,__m128 b)3953*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_cmpnle_ss(__m128 a, __m128 b)
3954*3f1979aaSAndroid Build Coastguard Worker {
3955*3f1979aaSAndroid Build Coastguard Worker return _mm_cmpgt_ss(a, b);
3956*3f1979aaSAndroid Build Coastguard Worker }
3957*3f1979aaSAndroid Build Coastguard Worker
3958*3f1979aaSAndroid Build Coastguard Worker // Compares for not less than.
3959*3f1979aaSAndroid Build Coastguard Worker // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/4686bbdw(v=vs.100)
_mm_cmpnlt_ps(__m128 a,__m128 b)3960*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_cmpnlt_ps(__m128 a, __m128 b)
3961*3f1979aaSAndroid Build Coastguard Worker {
3962*3f1979aaSAndroid Build Coastguard Worker return _mm_cmpge_ps(a, b);
3963*3f1979aaSAndroid Build Coastguard Worker }
3964*3f1979aaSAndroid Build Coastguard Worker
3965*3f1979aaSAndroid Build Coastguard Worker // Compares for not less than.
3966*3f1979aaSAndroid Build Coastguard Worker // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/56b9z2wf(v=vs.100)
_mm_cmpnlt_ss(__m128 a,__m128 b)3967*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_cmpnlt_ss(__m128 a, __m128 b)
3968*3f1979aaSAndroid Build Coastguard Worker {
3969*3f1979aaSAndroid Build Coastguard Worker return _mm_cmpge_ss(a, b);
3970*3f1979aaSAndroid Build Coastguard Worker }
3971*3f1979aaSAndroid Build Coastguard Worker
3972*3f1979aaSAndroid Build Coastguard Worker // Compares the 16 signed or unsigned 8-bit integers in a and the 16 signed or
3973*3f1979aaSAndroid Build Coastguard Worker // unsigned 8-bit integers in b for equality.
3974*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/windows/desktop/bz5xk21a(v=vs.90).aspx
_mm_cmpeq_epi8(__m128i a,__m128i b)3975*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_cmpeq_epi8(__m128i a, __m128i b)
3976*3f1979aaSAndroid Build Coastguard Worker {
3977*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_u8(
3978*3f1979aaSAndroid Build Coastguard Worker vceqq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
3979*3f1979aaSAndroid Build Coastguard Worker }
3980*3f1979aaSAndroid Build Coastguard Worker
3981*3f1979aaSAndroid Build Coastguard Worker // Compares the 8 signed or unsigned 16-bit integers in a and the 8 signed or
3982*3f1979aaSAndroid Build Coastguard Worker // unsigned 16-bit integers in b for equality.
3983*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/2ay060te(v=vs.100).aspx
_mm_cmpeq_epi16(__m128i a,__m128i b)3984*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_cmpeq_epi16(__m128i a, __m128i b)
3985*3f1979aaSAndroid Build Coastguard Worker {
3986*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_u16(
3987*3f1979aaSAndroid Build Coastguard Worker vceqq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
3988*3f1979aaSAndroid Build Coastguard Worker }
3989*3f1979aaSAndroid Build Coastguard Worker
3990*3f1979aaSAndroid Build Coastguard Worker // Compare packed 32-bit integers in a and b for equality, and store the results
3991*3f1979aaSAndroid Build Coastguard Worker // in dst
_mm_cmpeq_epi32(__m128i a,__m128i b)3992*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i a, __m128i b)
3993*3f1979aaSAndroid Build Coastguard Worker {
3994*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_u32(
3995*3f1979aaSAndroid Build Coastguard Worker vceqq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
3996*3f1979aaSAndroid Build Coastguard Worker }
3997*3f1979aaSAndroid Build Coastguard Worker
3998*3f1979aaSAndroid Build Coastguard Worker // Compare packed 64-bit integers in a and b for equality, and store the results
3999*3f1979aaSAndroid Build Coastguard Worker // in dst
_mm_cmpeq_epi64(__m128i a,__m128i b)4000*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_cmpeq_epi64(__m128i a, __m128i b)
4001*3f1979aaSAndroid Build Coastguard Worker {
4002*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
4003*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_u64(
4004*3f1979aaSAndroid Build Coastguard Worker vceqq_u64(vreinterpretq_u64_m128i(a), vreinterpretq_u64_m128i(b)));
4005*3f1979aaSAndroid Build Coastguard Worker #else
4006*3f1979aaSAndroid Build Coastguard Worker // ARMv7 lacks vceqq_u64
4007*3f1979aaSAndroid Build Coastguard Worker // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
4008*3f1979aaSAndroid Build Coastguard Worker uint32x4_t cmp =
4009*3f1979aaSAndroid Build Coastguard Worker vceqq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b));
4010*3f1979aaSAndroid Build Coastguard Worker uint32x4_t swapped = vrev64q_u32(cmp);
4011*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_u32(vandq_u32(cmp, swapped));
4012*3f1979aaSAndroid Build Coastguard Worker #endif
4013*3f1979aaSAndroid Build Coastguard Worker }
4014*3f1979aaSAndroid Build Coastguard Worker
4015*3f1979aaSAndroid Build Coastguard Worker // Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers
4016*3f1979aaSAndroid Build Coastguard Worker // in b for lesser than.
4017*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/windows/desktop/9s46csht(v=vs.90).aspx
_mm_cmplt_epi8(__m128i a,__m128i b)4018*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b)
4019*3f1979aaSAndroid Build Coastguard Worker {
4020*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_u8(
4021*3f1979aaSAndroid Build Coastguard Worker vcltq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
4022*3f1979aaSAndroid Build Coastguard Worker }
4023*3f1979aaSAndroid Build Coastguard Worker
4024*3f1979aaSAndroid Build Coastguard Worker // Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers
4025*3f1979aaSAndroid Build Coastguard Worker // in b for greater than.
4026*3f1979aaSAndroid Build Coastguard Worker //
4027*3f1979aaSAndroid Build Coastguard Worker // r0 := (a0 > b0) ? 0xff : 0x0
4028*3f1979aaSAndroid Build Coastguard Worker // r1 := (a1 > b1) ? 0xff : 0x0
4029*3f1979aaSAndroid Build Coastguard Worker // ...
4030*3f1979aaSAndroid Build Coastguard Worker // r15 := (a15 > b15) ? 0xff : 0x0
4031*3f1979aaSAndroid Build Coastguard Worker //
4032*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/zh-tw/library/wf45zt2b(v=vs.100).aspx
_mm_cmpgt_epi8(__m128i a,__m128i b)4033*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b)
4034*3f1979aaSAndroid Build Coastguard Worker {
4035*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_u8(
4036*3f1979aaSAndroid Build Coastguard Worker vcgtq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
4037*3f1979aaSAndroid Build Coastguard Worker }
4038*3f1979aaSAndroid Build Coastguard Worker
4039*3f1979aaSAndroid Build Coastguard Worker // Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers
4040*3f1979aaSAndroid Build Coastguard Worker // in b for less than.
4041*3f1979aaSAndroid Build Coastguard Worker //
4042*3f1979aaSAndroid Build Coastguard Worker // r0 := (a0 < b0) ? 0xffff : 0x0
4043*3f1979aaSAndroid Build Coastguard Worker // r1 := (a1 < b1) ? 0xffff : 0x0
4044*3f1979aaSAndroid Build Coastguard Worker // ...
4045*3f1979aaSAndroid Build Coastguard Worker // r7 := (a7 < b7) ? 0xffff : 0x0
4046*3f1979aaSAndroid Build Coastguard Worker //
4047*3f1979aaSAndroid Build Coastguard Worker // https://technet.microsoft.com/en-us/library/t863edb2(v=vs.100).aspx
_mm_cmplt_epi16(__m128i a,__m128i b)4048*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_cmplt_epi16(__m128i a, __m128i b)
4049*3f1979aaSAndroid Build Coastguard Worker {
4050*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_u16(
4051*3f1979aaSAndroid Build Coastguard Worker vcltq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
4052*3f1979aaSAndroid Build Coastguard Worker }
4053*3f1979aaSAndroid Build Coastguard Worker
4054*3f1979aaSAndroid Build Coastguard Worker // Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers
4055*3f1979aaSAndroid Build Coastguard Worker // in b for greater than.
4056*3f1979aaSAndroid Build Coastguard Worker //
4057*3f1979aaSAndroid Build Coastguard Worker // r0 := (a0 > b0) ? 0xffff : 0x0
4058*3f1979aaSAndroid Build Coastguard Worker // r1 := (a1 > b1) ? 0xffff : 0x0
4059*3f1979aaSAndroid Build Coastguard Worker // ...
4060*3f1979aaSAndroid Build Coastguard Worker // r7 := (a7 > b7) ? 0xffff : 0x0
4061*3f1979aaSAndroid Build Coastguard Worker //
4062*3f1979aaSAndroid Build Coastguard Worker // https://technet.microsoft.com/en-us/library/xd43yfsa(v=vs.100).aspx
_mm_cmpgt_epi16(__m128i a,__m128i b)4063*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_cmpgt_epi16(__m128i a, __m128i b)
4064*3f1979aaSAndroid Build Coastguard Worker {
4065*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_u16(
4066*3f1979aaSAndroid Build Coastguard Worker vcgtq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
4067*3f1979aaSAndroid Build Coastguard Worker }
4068*3f1979aaSAndroid Build Coastguard Worker
4069*3f1979aaSAndroid Build Coastguard Worker
4070*3f1979aaSAndroid Build Coastguard Worker // Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers
4071*3f1979aaSAndroid Build Coastguard Worker // in b for less than.
4072*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/vstudio/4ak0bf5d(v=vs.100).aspx
_mm_cmplt_epi32(__m128i a,__m128i b)4073*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_cmplt_epi32(__m128i a, __m128i b)
4074*3f1979aaSAndroid Build Coastguard Worker {
4075*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_u32(
4076*3f1979aaSAndroid Build Coastguard Worker vcltq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
4077*3f1979aaSAndroid Build Coastguard Worker }
4078*3f1979aaSAndroid Build Coastguard Worker
4079*3f1979aaSAndroid Build Coastguard Worker // Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers
4080*3f1979aaSAndroid Build Coastguard Worker // in b for greater than.
4081*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/vstudio/1s9f2z0y(v=vs.100).aspx
_mm_cmpgt_epi32(__m128i a,__m128i b)4082*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_cmpgt_epi32(__m128i a, __m128i b)
4083*3f1979aaSAndroid Build Coastguard Worker {
4084*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_u32(
4085*3f1979aaSAndroid Build Coastguard Worker vcgtq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
4086*3f1979aaSAndroid Build Coastguard Worker }
4087*3f1979aaSAndroid Build Coastguard Worker
4088*3f1979aaSAndroid Build Coastguard Worker // Compares the 2 signed 64-bit integers in a and the 2 signed 64-bit integers
4089*3f1979aaSAndroid Build Coastguard Worker // in b for greater than.
_mm_cmpgt_epi64(__m128i a,__m128i b)4090*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_cmpgt_epi64(__m128i a, __m128i b)
4091*3f1979aaSAndroid Build Coastguard Worker {
4092*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
4093*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_u64(
4094*3f1979aaSAndroid Build Coastguard Worker vcgtq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
4095*3f1979aaSAndroid Build Coastguard Worker #else
4096*3f1979aaSAndroid Build Coastguard Worker // ARMv7 lacks vcgtq_s64.
4097*3f1979aaSAndroid Build Coastguard Worker // This is based off of Clang's SSE2 polyfill:
4098*3f1979aaSAndroid Build Coastguard Worker // (a > b) -> ((a_hi > b_hi) || (a_lo > b_lo && a_hi == b_hi))
4099*3f1979aaSAndroid Build Coastguard Worker
4100*3f1979aaSAndroid Build Coastguard Worker // Mask the sign bit out since we need a signed AND an unsigned comparison
4101*3f1979aaSAndroid Build Coastguard Worker // and it is ugly to try and split them.
4102*3f1979aaSAndroid Build Coastguard Worker int32x4_t mask = vreinterpretq_s32_s64(vdupq_n_s64(0x80000000ull));
4103*3f1979aaSAndroid Build Coastguard Worker int32x4_t a_mask = veorq_s32(vreinterpretq_s32_m128i(a), mask);
4104*3f1979aaSAndroid Build Coastguard Worker int32x4_t b_mask = veorq_s32(vreinterpretq_s32_m128i(b), mask);
4105*3f1979aaSAndroid Build Coastguard Worker // Check if a > b
4106*3f1979aaSAndroid Build Coastguard Worker int64x2_t greater = vreinterpretq_s64_u32(vcgtq_s32(a_mask, b_mask));
4107*3f1979aaSAndroid Build Coastguard Worker // Copy upper mask to lower mask
4108*3f1979aaSAndroid Build Coastguard Worker // a_hi > b_hi
4109*3f1979aaSAndroid Build Coastguard Worker int64x2_t gt_hi = vshrq_n_s64(greater, 63);
4110*3f1979aaSAndroid Build Coastguard Worker // Copy lower mask to upper mask
4111*3f1979aaSAndroid Build Coastguard Worker // a_lo > b_lo
4112*3f1979aaSAndroid Build Coastguard Worker int64x2_t gt_lo = vsliq_n_s64(greater, greater, 32);
4113*3f1979aaSAndroid Build Coastguard Worker // Compare for equality
4114*3f1979aaSAndroid Build Coastguard Worker int64x2_t equal = vreinterpretq_s64_u32(vceqq_s32(a_mask, b_mask));
4115*3f1979aaSAndroid Build Coastguard Worker // Copy upper mask to lower mask
4116*3f1979aaSAndroid Build Coastguard Worker // a_hi == b_hi
4117*3f1979aaSAndroid Build Coastguard Worker int64x2_t eq_hi = vshrq_n_s64(equal, 63);
4118*3f1979aaSAndroid Build Coastguard Worker // a_hi > b_hi || (a_lo > b_lo && a_hi == b_hi)
4119*3f1979aaSAndroid Build Coastguard Worker int64x2_t ret = vorrq_s64(gt_hi, vandq_s64(gt_lo, eq_hi));
4120*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s64(ret);
4121*3f1979aaSAndroid Build Coastguard Worker #endif
4122*3f1979aaSAndroid Build Coastguard Worker }
4123*3f1979aaSAndroid Build Coastguard Worker
4124*3f1979aaSAndroid Build Coastguard Worker // Compares the four 32-bit floats in a and b to check if any values are NaN.
4125*3f1979aaSAndroid Build Coastguard Worker // Ordered compare between each value returns true for "orderable" and false for
4126*3f1979aaSAndroid Build Coastguard Worker // "not orderable" (NaN).
4127*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/vstudio/0h9w00fx(v=vs.100).aspx see
4128*3f1979aaSAndroid Build Coastguard Worker // also:
4129*3f1979aaSAndroid Build Coastguard Worker // http://stackoverflow.com/questions/8627331/what-does-ordered-unordered-comparison-mean
4130*3f1979aaSAndroid Build Coastguard Worker // http://stackoverflow.com/questions/29349621/neon-isnanval-intrinsics
_mm_cmpord_ps(__m128 a,__m128 b)4131*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_cmpord_ps(__m128 a, __m128 b)
4132*3f1979aaSAndroid Build Coastguard Worker {
4133*3f1979aaSAndroid Build Coastguard Worker // Note: NEON does not have ordered compare builtin
4134*3f1979aaSAndroid Build Coastguard Worker // Need to compare a eq a and b eq b to check for NaN
4135*3f1979aaSAndroid Build Coastguard Worker // Do AND of results to get final
4136*3f1979aaSAndroid Build Coastguard Worker uint32x4_t ceqaa =
4137*3f1979aaSAndroid Build Coastguard Worker vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
4138*3f1979aaSAndroid Build Coastguard Worker uint32x4_t ceqbb =
4139*3f1979aaSAndroid Build Coastguard Worker vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
4140*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128_u32(vandq_u32(ceqaa, ceqbb));
4141*3f1979aaSAndroid Build Coastguard Worker }
4142*3f1979aaSAndroid Build Coastguard Worker
4143*3f1979aaSAndroid Build Coastguard Worker // Compares for ordered.
4144*3f1979aaSAndroid Build Coastguard Worker // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/343t62da(v=vs.100)
_mm_cmpord_ss(__m128 a,__m128 b)4145*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_cmpord_ss(__m128 a, __m128 b)
4146*3f1979aaSAndroid Build Coastguard Worker {
4147*3f1979aaSAndroid Build Coastguard Worker return _mm_move_ss(a, _mm_cmpord_ps(a, b));
4148*3f1979aaSAndroid Build Coastguard Worker }
4149*3f1979aaSAndroid Build Coastguard Worker
4150*3f1979aaSAndroid Build Coastguard Worker // Compares for unordered.
4151*3f1979aaSAndroid Build Coastguard Worker // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/khy6fk1t(v=vs.100)
_mm_cmpunord_ps(__m128 a,__m128 b)4152*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_cmpunord_ps(__m128 a, __m128 b)
4153*3f1979aaSAndroid Build Coastguard Worker {
4154*3f1979aaSAndroid Build Coastguard Worker uint32x4_t f32a =
4155*3f1979aaSAndroid Build Coastguard Worker vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
4156*3f1979aaSAndroid Build Coastguard Worker uint32x4_t f32b =
4157*3f1979aaSAndroid Build Coastguard Worker vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
4158*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128_u32(vmvnq_u32(vandq_u32(f32a, f32b)));
4159*3f1979aaSAndroid Build Coastguard Worker }
4160*3f1979aaSAndroid Build Coastguard Worker
4161*3f1979aaSAndroid Build Coastguard Worker // Compares for unordered.
4162*3f1979aaSAndroid Build Coastguard Worker // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/2as2387b(v=vs.100)
_mm_cmpunord_ss(__m128 a,__m128 b)4163*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_cmpunord_ss(__m128 a, __m128 b)
4164*3f1979aaSAndroid Build Coastguard Worker {
4165*3f1979aaSAndroid Build Coastguard Worker return _mm_move_ss(a, _mm_cmpunord_ps(a, b));
4166*3f1979aaSAndroid Build Coastguard Worker }
4167*3f1979aaSAndroid Build Coastguard Worker
4168*3f1979aaSAndroid Build Coastguard Worker // Compares the lower single-precision floating point scalar values of a and b
4169*3f1979aaSAndroid Build Coastguard Worker // using a less than operation. :
4170*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/2kwe606b(v=vs.90).aspx Important
4171*3f1979aaSAndroid Build Coastguard Worker // note!! The documentation on MSDN is incorrect! If either of the values is a
4172*3f1979aaSAndroid Build Coastguard Worker // NAN the docs say you will get a one, but in fact, it will return a zero!!
_mm_comilt_ss(__m128 a,__m128 b)4173*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE int _mm_comilt_ss(__m128 a, __m128 b)
4174*3f1979aaSAndroid Build Coastguard Worker {
4175*3f1979aaSAndroid Build Coastguard Worker uint32x4_t a_not_nan =
4176*3f1979aaSAndroid Build Coastguard Worker vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
4177*3f1979aaSAndroid Build Coastguard Worker uint32x4_t b_not_nan =
4178*3f1979aaSAndroid Build Coastguard Worker vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
4179*3f1979aaSAndroid Build Coastguard Worker uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
4180*3f1979aaSAndroid Build Coastguard Worker uint32x4_t a_lt_b =
4181*3f1979aaSAndroid Build Coastguard Worker vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
4182*3f1979aaSAndroid Build Coastguard Worker return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_lt_b), 0) != 0) ? 1 : 0;
4183*3f1979aaSAndroid Build Coastguard Worker }
4184*3f1979aaSAndroid Build Coastguard Worker
4185*3f1979aaSAndroid Build Coastguard Worker // Compares the lower single-precision floating point scalar values of a and b
4186*3f1979aaSAndroid Build Coastguard Worker // using a greater than operation. :
4187*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/b0738e0t(v=vs.100).aspx
_mm_comigt_ss(__m128 a,__m128 b)4188*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE int _mm_comigt_ss(__m128 a, __m128 b)
4189*3f1979aaSAndroid Build Coastguard Worker {
4190*3f1979aaSAndroid Build Coastguard Worker // return vgetq_lane_u32(vcgtq_f32(vreinterpretq_f32_m128(a),
4191*3f1979aaSAndroid Build Coastguard Worker // vreinterpretq_f32_m128(b)), 0);
4192*3f1979aaSAndroid Build Coastguard Worker uint32x4_t a_not_nan =
4193*3f1979aaSAndroid Build Coastguard Worker vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
4194*3f1979aaSAndroid Build Coastguard Worker uint32x4_t b_not_nan =
4195*3f1979aaSAndroid Build Coastguard Worker vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
4196*3f1979aaSAndroid Build Coastguard Worker uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
4197*3f1979aaSAndroid Build Coastguard Worker uint32x4_t a_gt_b =
4198*3f1979aaSAndroid Build Coastguard Worker vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
4199*3f1979aaSAndroid Build Coastguard Worker return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_gt_b), 0) != 0) ? 1 : 0;
4200*3f1979aaSAndroid Build Coastguard Worker }
4201*3f1979aaSAndroid Build Coastguard Worker
4202*3f1979aaSAndroid Build Coastguard Worker // Compares the lower single-precision floating point scalar values of a and b
4203*3f1979aaSAndroid Build Coastguard Worker // using a less than or equal operation. :
4204*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/1w4t7c57(v=vs.90).aspx
_mm_comile_ss(__m128 a,__m128 b)4205*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE int _mm_comile_ss(__m128 a, __m128 b)
4206*3f1979aaSAndroid Build Coastguard Worker {
4207*3f1979aaSAndroid Build Coastguard Worker // return vgetq_lane_u32(vcleq_f32(vreinterpretq_f32_m128(a),
4208*3f1979aaSAndroid Build Coastguard Worker // vreinterpretq_f32_m128(b)), 0);
4209*3f1979aaSAndroid Build Coastguard Worker uint32x4_t a_not_nan =
4210*3f1979aaSAndroid Build Coastguard Worker vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
4211*3f1979aaSAndroid Build Coastguard Worker uint32x4_t b_not_nan =
4212*3f1979aaSAndroid Build Coastguard Worker vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
4213*3f1979aaSAndroid Build Coastguard Worker uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
4214*3f1979aaSAndroid Build Coastguard Worker uint32x4_t a_le_b =
4215*3f1979aaSAndroid Build Coastguard Worker vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
4216*3f1979aaSAndroid Build Coastguard Worker return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_le_b), 0) != 0) ? 1 : 0;
4217*3f1979aaSAndroid Build Coastguard Worker }
4218*3f1979aaSAndroid Build Coastguard Worker
4219*3f1979aaSAndroid Build Coastguard Worker // Compares the lower single-precision floating point scalar values of a and b
4220*3f1979aaSAndroid Build Coastguard Worker // using a greater than or equal operation. :
4221*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/8t80des6(v=vs.100).aspx
_mm_comige_ss(__m128 a,__m128 b)4222*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE int _mm_comige_ss(__m128 a, __m128 b)
4223*3f1979aaSAndroid Build Coastguard Worker {
4224*3f1979aaSAndroid Build Coastguard Worker // return vgetq_lane_u32(vcgeq_f32(vreinterpretq_f32_m128(a),
4225*3f1979aaSAndroid Build Coastguard Worker // vreinterpretq_f32_m128(b)), 0);
4226*3f1979aaSAndroid Build Coastguard Worker uint32x4_t a_not_nan =
4227*3f1979aaSAndroid Build Coastguard Worker vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
4228*3f1979aaSAndroid Build Coastguard Worker uint32x4_t b_not_nan =
4229*3f1979aaSAndroid Build Coastguard Worker vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
4230*3f1979aaSAndroid Build Coastguard Worker uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
4231*3f1979aaSAndroid Build Coastguard Worker uint32x4_t a_ge_b =
4232*3f1979aaSAndroid Build Coastguard Worker vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
4233*3f1979aaSAndroid Build Coastguard Worker return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_ge_b), 0) != 0) ? 1 : 0;
4234*3f1979aaSAndroid Build Coastguard Worker }
4235*3f1979aaSAndroid Build Coastguard Worker
4236*3f1979aaSAndroid Build Coastguard Worker // Compares the lower single-precision floating point scalar values of a and b
4237*3f1979aaSAndroid Build Coastguard Worker // using an equality operation. :
4238*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/93yx2h2b(v=vs.100).aspx
_mm_comieq_ss(__m128 a,__m128 b)4239*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE int _mm_comieq_ss(__m128 a, __m128 b)
4240*3f1979aaSAndroid Build Coastguard Worker {
4241*3f1979aaSAndroid Build Coastguard Worker // return vgetq_lane_u32(vceqq_f32(vreinterpretq_f32_m128(a),
4242*3f1979aaSAndroid Build Coastguard Worker // vreinterpretq_f32_m128(b)), 0);
4243*3f1979aaSAndroid Build Coastguard Worker uint32x4_t a_not_nan =
4244*3f1979aaSAndroid Build Coastguard Worker vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
4245*3f1979aaSAndroid Build Coastguard Worker uint32x4_t b_not_nan =
4246*3f1979aaSAndroid Build Coastguard Worker vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
4247*3f1979aaSAndroid Build Coastguard Worker uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
4248*3f1979aaSAndroid Build Coastguard Worker uint32x4_t a_eq_b =
4249*3f1979aaSAndroid Build Coastguard Worker vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
4250*3f1979aaSAndroid Build Coastguard Worker return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_eq_b), 0) != 0) ? 1 : 0;
4251*3f1979aaSAndroid Build Coastguard Worker }
4252*3f1979aaSAndroid Build Coastguard Worker
4253*3f1979aaSAndroid Build Coastguard Worker // Compares the lower single-precision floating point scalar values of a and b
4254*3f1979aaSAndroid Build Coastguard Worker // using an inequality operation. :
4255*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/bafh5e0a(v=vs.90).aspx
_mm_comineq_ss(__m128 a,__m128 b)4256*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE int _mm_comineq_ss(__m128 a, __m128 b)
4257*3f1979aaSAndroid Build Coastguard Worker {
4258*3f1979aaSAndroid Build Coastguard Worker // return !vgetq_lane_u32(vceqq_f32(vreinterpretq_f32_m128(a),
4259*3f1979aaSAndroid Build Coastguard Worker // vreinterpretq_f32_m128(b)), 0);
4260*3f1979aaSAndroid Build Coastguard Worker uint32x4_t a_not_nan =
4261*3f1979aaSAndroid Build Coastguard Worker vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
4262*3f1979aaSAndroid Build Coastguard Worker uint32x4_t b_not_nan =
4263*3f1979aaSAndroid Build Coastguard Worker vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
4264*3f1979aaSAndroid Build Coastguard Worker uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan));
4265*3f1979aaSAndroid Build Coastguard Worker uint32x4_t a_neq_b = vmvnq_u32(
4266*3f1979aaSAndroid Build Coastguard Worker vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
4267*3f1979aaSAndroid Build Coastguard Worker return (vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_neq_b), 0) != 0) ? 1 : 0;
4268*3f1979aaSAndroid Build Coastguard Worker }
4269*3f1979aaSAndroid Build Coastguard Worker
// According to the documentation, the "ucomi" intrinsics behave the same as
// their non-'u' "comi" counterparts, so each one is aliased directly to the
// corresponding _mm_comi*_ss function.
#define _mm_ucomieq_ss _mm_comieq_ss
#define _mm_ucomineq_ss _mm_comineq_ss
#define _mm_ucomilt_ss _mm_comilt_ss
#define _mm_ucomile_ss _mm_comile_ss
#define _mm_ucomigt_ss _mm_comigt_ss
#define _mm_ucomige_ss _mm_comige_ss
4278*3f1979aaSAndroid Build Coastguard Worker
/* Conversions */
4280*3f1979aaSAndroid Build Coastguard Worker
4281*3f1979aaSAndroid Build Coastguard Worker // Convert packed signed 32-bit integers in b to packed single-precision
4282*3f1979aaSAndroid Build Coastguard Worker // (32-bit) floating-point elements, store the results in the lower 2 elements
4283*3f1979aaSAndroid Build Coastguard Worker // of dst, and copy the upper 2 packed elements from a to the upper elements of
4284*3f1979aaSAndroid Build Coastguard Worker // dst.
4285*3f1979aaSAndroid Build Coastguard Worker //
4286*3f1979aaSAndroid Build Coastguard Worker // dst[31:0] := Convert_Int32_To_FP32(b[31:0])
4287*3f1979aaSAndroid Build Coastguard Worker // dst[63:32] := Convert_Int32_To_FP32(b[63:32])
4288*3f1979aaSAndroid Build Coastguard Worker // dst[95:64] := a[95:64]
4289*3f1979aaSAndroid Build Coastguard Worker // dst[127:96] := a[127:96]
4290*3f1979aaSAndroid Build Coastguard Worker //
4291*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_pi2ps
_mm_cvt_pi2ps(__m128 a,__m64 b)4292*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_cvt_pi2ps(__m128 a, __m64 b)
4293*3f1979aaSAndroid Build Coastguard Worker {
4294*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128_f32(
4295*3f1979aaSAndroid Build Coastguard Worker vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)),
4296*3f1979aaSAndroid Build Coastguard Worker vget_high_f32(vreinterpretq_f32_m128(a))));
4297*3f1979aaSAndroid Build Coastguard Worker }
4298*3f1979aaSAndroid Build Coastguard Worker
4299*3f1979aaSAndroid Build Coastguard Worker // Convert the signed 32-bit integer b to a single-precision (32-bit)
4300*3f1979aaSAndroid Build Coastguard Worker // floating-point element, store the result in the lower element of dst, and
4301*3f1979aaSAndroid Build Coastguard Worker // copy the upper 3 packed elements from a to the upper elements of dst.
4302*3f1979aaSAndroid Build Coastguard Worker //
4303*3f1979aaSAndroid Build Coastguard Worker // dst[31:0] := Convert_Int32_To_FP32(b[31:0])
4304*3f1979aaSAndroid Build Coastguard Worker // dst[127:32] := a[127:32]
4305*3f1979aaSAndroid Build Coastguard Worker //
4306*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_si2ss
_mm_cvt_si2ss(__m128 a,int b)4307*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_cvt_si2ss(__m128 a, int b)
4308*3f1979aaSAndroid Build Coastguard Worker {
4309*3f1979aaSAndroid Build Coastguard Worker __m128 ret = a;
4310*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128_f32(
4311*3f1979aaSAndroid Build Coastguard Worker vsetq_lane_f32((float) b, vreinterpretq_f32_m128(ret), 0));
4312*3f1979aaSAndroid Build Coastguard Worker }
4313*3f1979aaSAndroid Build Coastguard Worker
4314*3f1979aaSAndroid Build Coastguard Worker // Convert the lower single-precision (32-bit) floating-point element in a to a
4315*3f1979aaSAndroid Build Coastguard Worker // 32-bit integer, and store the result in dst.
4316*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_ss2si
_mm_cvt_ss2si(__m128 a)4317*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE int _mm_cvt_ss2si(__m128 a)
4318*3f1979aaSAndroid Build Coastguard Worker {
4319*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
4320*3f1979aaSAndroid Build Coastguard Worker return vgetq_lane_s32(vcvtnq_s32_f32(vreinterpretq_f32_m128(a)), 0);
4321*3f1979aaSAndroid Build Coastguard Worker #else
4322*3f1979aaSAndroid Build Coastguard Worker float32_t data = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
4323*3f1979aaSAndroid Build Coastguard Worker float32_t diff = data - floor(data);
4324*3f1979aaSAndroid Build Coastguard Worker if (diff > 0.5)
4325*3f1979aaSAndroid Build Coastguard Worker return (int32_t) ceil(data);
4326*3f1979aaSAndroid Build Coastguard Worker if (diff == 0.5) {
4327*3f1979aaSAndroid Build Coastguard Worker int32_t f = (int32_t) floor(data);
4328*3f1979aaSAndroid Build Coastguard Worker int32_t c = (int32_t) ceil(data);
4329*3f1979aaSAndroid Build Coastguard Worker return c & 1 ? f : c;
4330*3f1979aaSAndroid Build Coastguard Worker }
4331*3f1979aaSAndroid Build Coastguard Worker return (int32_t) floor(data);
4332*3f1979aaSAndroid Build Coastguard Worker #endif
4333*3f1979aaSAndroid Build Coastguard Worker }
4334*3f1979aaSAndroid Build Coastguard Worker
4335*3f1979aaSAndroid Build Coastguard Worker // Convert packed 16-bit integers in a to packed single-precision (32-bit)
4336*3f1979aaSAndroid Build Coastguard Worker // floating-point elements, and store the results in dst.
4337*3f1979aaSAndroid Build Coastguard Worker //
4338*3f1979aaSAndroid Build Coastguard Worker // FOR j := 0 to 3
4339*3f1979aaSAndroid Build Coastguard Worker // i := j*16
4340*3f1979aaSAndroid Build Coastguard Worker // m := j*32
4341*3f1979aaSAndroid Build Coastguard Worker // dst[m+31:m] := Convert_Int16_To_FP32(a[i+15:i])
4342*3f1979aaSAndroid Build Coastguard Worker // ENDFOR
4343*3f1979aaSAndroid Build Coastguard Worker //
4344*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi16_ps
_mm_cvtpi16_ps(__m64 a)4345*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_cvtpi16_ps(__m64 a)
4346*3f1979aaSAndroid Build Coastguard Worker {
4347*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128_f32(
4348*3f1979aaSAndroid Build Coastguard Worker vcvtq_f32_s32(vmovl_s16(vreinterpret_s16_m64(a))));
4349*3f1979aaSAndroid Build Coastguard Worker }
4350*3f1979aaSAndroid Build Coastguard Worker
4351*3f1979aaSAndroid Build Coastguard Worker // Convert packed 32-bit integers in b to packed single-precision (32-bit)
4352*3f1979aaSAndroid Build Coastguard Worker // floating-point elements, store the results in the lower 2 elements of dst,
4353*3f1979aaSAndroid Build Coastguard Worker // and copy the upper 2 packed elements from a to the upper elements of dst.
4354*3f1979aaSAndroid Build Coastguard Worker //
4355*3f1979aaSAndroid Build Coastguard Worker // dst[31:0] := Convert_Int32_To_FP32(b[31:0])
4356*3f1979aaSAndroid Build Coastguard Worker // dst[63:32] := Convert_Int32_To_FP32(b[63:32])
4357*3f1979aaSAndroid Build Coastguard Worker // dst[95:64] := a[95:64]
4358*3f1979aaSAndroid Build Coastguard Worker // dst[127:96] := a[127:96]
4359*3f1979aaSAndroid Build Coastguard Worker //
4360*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32_ps
_mm_cvtpi32_ps(__m128 a,__m64 b)4361*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_cvtpi32_ps(__m128 a, __m64 b)
4362*3f1979aaSAndroid Build Coastguard Worker {
4363*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128_f32(
4364*3f1979aaSAndroid Build Coastguard Worker vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)),
4365*3f1979aaSAndroid Build Coastguard Worker vget_high_f32(vreinterpretq_f32_m128(a))));
4366*3f1979aaSAndroid Build Coastguard Worker }
4367*3f1979aaSAndroid Build Coastguard Worker
4368*3f1979aaSAndroid Build Coastguard Worker // Convert packed signed 32-bit integers in a to packed single-precision
4369*3f1979aaSAndroid Build Coastguard Worker // (32-bit) floating-point elements, store the results in the lower 2 elements
4370*3f1979aaSAndroid Build Coastguard Worker // of dst, then covert the packed signed 32-bit integers in b to
4371*3f1979aaSAndroid Build Coastguard Worker // single-precision (32-bit) floating-point element, and store the results in
4372*3f1979aaSAndroid Build Coastguard Worker // the upper 2 elements of dst.
4373*3f1979aaSAndroid Build Coastguard Worker //
4374*3f1979aaSAndroid Build Coastguard Worker // dst[31:0] := Convert_Int32_To_FP32(a[31:0])
4375*3f1979aaSAndroid Build Coastguard Worker // dst[63:32] := Convert_Int32_To_FP32(a[63:32])
4376*3f1979aaSAndroid Build Coastguard Worker // dst[95:64] := Convert_Int32_To_FP32(b[31:0])
4377*3f1979aaSAndroid Build Coastguard Worker // dst[127:96] := Convert_Int32_To_FP32(b[63:32])
4378*3f1979aaSAndroid Build Coastguard Worker //
4379*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32x2_ps
_mm_cvtpi32x2_ps(__m64 a,__m64 b)4380*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_cvtpi32x2_ps(__m64 a, __m64 b)
4381*3f1979aaSAndroid Build Coastguard Worker {
4382*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128_f32(vcvtq_f32_s32(
4383*3f1979aaSAndroid Build Coastguard Worker vcombine_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b))));
4384*3f1979aaSAndroid Build Coastguard Worker }
4385*3f1979aaSAndroid Build Coastguard Worker
4386*3f1979aaSAndroid Build Coastguard Worker // Convert the lower packed 8-bit integers in a to packed single-precision
4387*3f1979aaSAndroid Build Coastguard Worker // (32-bit) floating-point elements, and store the results in dst.
4388*3f1979aaSAndroid Build Coastguard Worker //
4389*3f1979aaSAndroid Build Coastguard Worker // FOR j := 0 to 3
4390*3f1979aaSAndroid Build Coastguard Worker // i := j*8
4391*3f1979aaSAndroid Build Coastguard Worker // m := j*32
4392*3f1979aaSAndroid Build Coastguard Worker // dst[m+31:m] := Convert_Int8_To_FP32(a[i+7:i])
4393*3f1979aaSAndroid Build Coastguard Worker // ENDFOR
4394*3f1979aaSAndroid Build Coastguard Worker //
4395*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi8_ps
_mm_cvtpi8_ps(__m64 a)4396*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_cvtpi8_ps(__m64 a)
4397*3f1979aaSAndroid Build Coastguard Worker {
4398*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128_f32(vcvtq_f32_s32(
4399*3f1979aaSAndroid Build Coastguard Worker vmovl_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_m64(a))))));
4400*3f1979aaSAndroid Build Coastguard Worker }
4401*3f1979aaSAndroid Build Coastguard Worker
4402*3f1979aaSAndroid Build Coastguard Worker // Convert packed unsigned 16-bit integers in a to packed single-precision
4403*3f1979aaSAndroid Build Coastguard Worker // (32-bit) floating-point elements, and store the results in dst.
4404*3f1979aaSAndroid Build Coastguard Worker //
4405*3f1979aaSAndroid Build Coastguard Worker // FOR j := 0 to 3
4406*3f1979aaSAndroid Build Coastguard Worker // i := j*16
4407*3f1979aaSAndroid Build Coastguard Worker // m := j*32
4408*3f1979aaSAndroid Build Coastguard Worker // dst[m+31:m] := Convert_UInt16_To_FP32(a[i+15:i])
4409*3f1979aaSAndroid Build Coastguard Worker // ENDFOR
4410*3f1979aaSAndroid Build Coastguard Worker //
4411*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpu16_ps
_mm_cvtpu16_ps(__m64 a)4412*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_cvtpu16_ps(__m64 a)
4413*3f1979aaSAndroid Build Coastguard Worker {
4414*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128_f32(
4415*3f1979aaSAndroid Build Coastguard Worker vcvtq_f32_u32(vmovl_u16(vreinterpret_u16_m64(a))));
4416*3f1979aaSAndroid Build Coastguard Worker }
4417*3f1979aaSAndroid Build Coastguard Worker
4418*3f1979aaSAndroid Build Coastguard Worker // Convert the lower packed unsigned 8-bit integers in a to packed
4419*3f1979aaSAndroid Build Coastguard Worker // single-precision (32-bit) floating-point elements, and store the results in
4420*3f1979aaSAndroid Build Coastguard Worker // dst.
4421*3f1979aaSAndroid Build Coastguard Worker //
4422*3f1979aaSAndroid Build Coastguard Worker // FOR j := 0 to 3
4423*3f1979aaSAndroid Build Coastguard Worker // i := j*8
4424*3f1979aaSAndroid Build Coastguard Worker // m := j*32
4425*3f1979aaSAndroid Build Coastguard Worker // dst[m+31:m] := Convert_UInt8_To_FP32(a[i+7:i])
4426*3f1979aaSAndroid Build Coastguard Worker // ENDFOR
4427*3f1979aaSAndroid Build Coastguard Worker //
4428*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpu8_ps
_mm_cvtpu8_ps(__m64 a)4429*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_cvtpu8_ps(__m64 a)
4430*3f1979aaSAndroid Build Coastguard Worker {
4431*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128_f32(vcvtq_f32_u32(
4432*3f1979aaSAndroid Build Coastguard Worker vmovl_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_m64(a))))));
4433*3f1979aaSAndroid Build Coastguard Worker }
4434*3f1979aaSAndroid Build Coastguard Worker
4435*3f1979aaSAndroid Build Coastguard Worker // Converts the four single-precision, floating-point values of a to signed
4436*3f1979aaSAndroid Build Coastguard Worker // 32-bit integer values using truncate.
4437*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/vstudio/1h005y6x(v=vs.100).aspx
_mm_cvttps_epi32(__m128 a)4438*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a)
4439*3f1979aaSAndroid Build Coastguard Worker {
4440*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)));
4441*3f1979aaSAndroid Build Coastguard Worker }
4442*3f1979aaSAndroid Build Coastguard Worker
4443*3f1979aaSAndroid Build Coastguard Worker // Converts the four signed 32-bit integer values of a to single-precision,
4444*3f1979aaSAndroid Build Coastguard Worker // floating-point values
4445*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/vstudio/36bwxcx5(v=vs.100).aspx
_mm_cvtepi32_ps(__m128i a)4446*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a)
4447*3f1979aaSAndroid Build Coastguard Worker {
4448*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128_f32(vcvtq_f32_s32(vreinterpretq_s32_m128i(a)));
4449*3f1979aaSAndroid Build Coastguard Worker }
4450*3f1979aaSAndroid Build Coastguard Worker
4451*3f1979aaSAndroid Build Coastguard Worker // Converts the four unsigned 8-bit integers in the lower 16 bits to four
4452*3f1979aaSAndroid Build Coastguard Worker // unsigned 32-bit integers.
_mm_cvtepu8_epi16(__m128i a)4453*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_cvtepu8_epi16(__m128i a)
4454*3f1979aaSAndroid Build Coastguard Worker {
4455*3f1979aaSAndroid Build Coastguard Worker uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx DCBA */
4456*3f1979aaSAndroid Build Coastguard Worker uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0D0C 0B0A */
4457*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_u16(u16x8);
4458*3f1979aaSAndroid Build Coastguard Worker }
4459*3f1979aaSAndroid Build Coastguard Worker
4460*3f1979aaSAndroid Build Coastguard Worker // Converts the four unsigned 8-bit integers in the lower 32 bits to four
4461*3f1979aaSAndroid Build Coastguard Worker // unsigned 32-bit integers.
4462*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/bb531467%28v=vs.100%29.aspx
_mm_cvtepu8_epi32(__m128i a)4463*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_cvtepu8_epi32(__m128i a)
4464*3f1979aaSAndroid Build Coastguard Worker {
4465*3f1979aaSAndroid Build Coastguard Worker uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx DCBA */
4466*3f1979aaSAndroid Build Coastguard Worker uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0D0C 0B0A */
4467*3f1979aaSAndroid Build Coastguard Worker uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000D 000C 000B 000A */
4468*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_u32(u32x4);
4469*3f1979aaSAndroid Build Coastguard Worker }
4470*3f1979aaSAndroid Build Coastguard Worker
4471*3f1979aaSAndroid Build Coastguard Worker // Converts the two unsigned 8-bit integers in the lower 16 bits to two
4472*3f1979aaSAndroid Build Coastguard Worker // unsigned 64-bit integers.
_mm_cvtepu8_epi64(__m128i a)4473*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_cvtepu8_epi64(__m128i a)
4474*3f1979aaSAndroid Build Coastguard Worker {
4475*3f1979aaSAndroid Build Coastguard Worker uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx xxBA */
4476*3f1979aaSAndroid Build Coastguard Worker uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0x0x 0B0A */
4477*3f1979aaSAndroid Build Coastguard Worker uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
4478*3f1979aaSAndroid Build Coastguard Worker uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
4479*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_u64(u64x2);
4480*3f1979aaSAndroid Build Coastguard Worker }
4481*3f1979aaSAndroid Build Coastguard Worker
4482*3f1979aaSAndroid Build Coastguard Worker // Converts the four unsigned 8-bit integers in the lower 16 bits to four
4483*3f1979aaSAndroid Build Coastguard Worker // unsigned 32-bit integers.
_mm_cvtepi8_epi16(__m128i a)4484*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_cvtepi8_epi16(__m128i a)
4485*3f1979aaSAndroid Build Coastguard Worker {
4486*3f1979aaSAndroid Build Coastguard Worker int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx DCBA */
4487*3f1979aaSAndroid Build Coastguard Worker int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */
4488*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s16(s16x8);
4489*3f1979aaSAndroid Build Coastguard Worker }
4490*3f1979aaSAndroid Build Coastguard Worker
4491*3f1979aaSAndroid Build Coastguard Worker // Converts the four unsigned 8-bit integers in the lower 32 bits to four
4492*3f1979aaSAndroid Build Coastguard Worker // unsigned 32-bit integers.
_mm_cvtepi8_epi32(__m128i a)4493*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_cvtepi8_epi32(__m128i a)
4494*3f1979aaSAndroid Build Coastguard Worker {
4495*3f1979aaSAndroid Build Coastguard Worker int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx DCBA */
4496*3f1979aaSAndroid Build Coastguard Worker int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */
4497*3f1979aaSAndroid Build Coastguard Worker int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000D 000C 000B 000A */
4498*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s32(s32x4);
4499*3f1979aaSAndroid Build Coastguard Worker }
4500*3f1979aaSAndroid Build Coastguard Worker
4501*3f1979aaSAndroid Build Coastguard Worker // Converts the two signed 8-bit integers in the lower 32 bits to four
4502*3f1979aaSAndroid Build Coastguard Worker // signed 64-bit integers.
_mm_cvtepi8_epi64(__m128i a)4503*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_cvtepi8_epi64(__m128i a)
4504*3f1979aaSAndroid Build Coastguard Worker {
4505*3f1979aaSAndroid Build Coastguard Worker int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx xxBA */
4506*3f1979aaSAndroid Build Coastguard Worker int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0x0x 0B0A */
4507*3f1979aaSAndroid Build Coastguard Worker int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
4508*3f1979aaSAndroid Build Coastguard Worker int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
4509*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s64(s64x2);
4510*3f1979aaSAndroid Build Coastguard Worker }
4511*3f1979aaSAndroid Build Coastguard Worker
4512*3f1979aaSAndroid Build Coastguard Worker // Converts the four signed 16-bit integers in the lower 64 bits to four signed
4513*3f1979aaSAndroid Build Coastguard Worker // 32-bit integers.
_mm_cvtepi16_epi32(__m128i a)4514*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_cvtepi16_epi32(__m128i a)
4515*3f1979aaSAndroid Build Coastguard Worker {
4516*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s32(
4517*3f1979aaSAndroid Build Coastguard Worker vmovl_s16(vget_low_s16(vreinterpretq_s16_m128i(a))));
4518*3f1979aaSAndroid Build Coastguard Worker }
4519*3f1979aaSAndroid Build Coastguard Worker
4520*3f1979aaSAndroid Build Coastguard Worker // Converts the two signed 16-bit integers in the lower 32 bits two signed
4521*3f1979aaSAndroid Build Coastguard Worker // 32-bit integers.
_mm_cvtepi16_epi64(__m128i a)4522*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_cvtepi16_epi64(__m128i a)
4523*3f1979aaSAndroid Build Coastguard Worker {
4524*3f1979aaSAndroid Build Coastguard Worker int16x8_t s16x8 = vreinterpretq_s16_m128i(a); /* xxxx xxxx xxxx 0B0A */
4525*3f1979aaSAndroid Build Coastguard Worker int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
4526*3f1979aaSAndroid Build Coastguard Worker int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
4527*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s64(s64x2);
4528*3f1979aaSAndroid Build Coastguard Worker }
4529*3f1979aaSAndroid Build Coastguard Worker
4530*3f1979aaSAndroid Build Coastguard Worker // Converts the four unsigned 16-bit integers in the lower 64 bits to four
4531*3f1979aaSAndroid Build Coastguard Worker // unsigned 32-bit integers.
_mm_cvtepu16_epi32(__m128i a)4532*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_cvtepu16_epi32(__m128i a)
4533*3f1979aaSAndroid Build Coastguard Worker {
4534*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_u32(
4535*3f1979aaSAndroid Build Coastguard Worker vmovl_u16(vget_low_u16(vreinterpretq_u16_m128i(a))));
4536*3f1979aaSAndroid Build Coastguard Worker }
4537*3f1979aaSAndroid Build Coastguard Worker
4538*3f1979aaSAndroid Build Coastguard Worker // Converts the two unsigned 16-bit integers in the lower 32 bits to two
4539*3f1979aaSAndroid Build Coastguard Worker // unsigned 64-bit integers.
_mm_cvtepu16_epi64(__m128i a)4540*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_cvtepu16_epi64(__m128i a)
4541*3f1979aaSAndroid Build Coastguard Worker {
4542*3f1979aaSAndroid Build Coastguard Worker uint16x8_t u16x8 = vreinterpretq_u16_m128i(a); /* xxxx xxxx xxxx 0B0A */
4543*3f1979aaSAndroid Build Coastguard Worker uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
4544*3f1979aaSAndroid Build Coastguard Worker uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
4545*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_u64(u64x2);
4546*3f1979aaSAndroid Build Coastguard Worker }
4547*3f1979aaSAndroid Build Coastguard Worker
4548*3f1979aaSAndroid Build Coastguard Worker // Converts the two unsigned 32-bit integers in the lower 64 bits to two
4549*3f1979aaSAndroid Build Coastguard Worker // unsigned 64-bit integers.
_mm_cvtepu32_epi64(__m128i a)4550*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_cvtepu32_epi64(__m128i a)
4551*3f1979aaSAndroid Build Coastguard Worker {
4552*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_u64(
4553*3f1979aaSAndroid Build Coastguard Worker vmovl_u32(vget_low_u32(vreinterpretq_u32_m128i(a))));
4554*3f1979aaSAndroid Build Coastguard Worker }
4555*3f1979aaSAndroid Build Coastguard Worker
4556*3f1979aaSAndroid Build Coastguard Worker // Converts the two signed 32-bit integers in the lower 64 bits to two signed
4557*3f1979aaSAndroid Build Coastguard Worker // 64-bit integers.
_mm_cvtepi32_epi64(__m128i a)4558*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_cvtepi32_epi64(__m128i a)
4559*3f1979aaSAndroid Build Coastguard Worker {
4560*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s64(
4561*3f1979aaSAndroid Build Coastguard Worker vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a))));
4562*3f1979aaSAndroid Build Coastguard Worker }
4563*3f1979aaSAndroid Build Coastguard Worker
4564*3f1979aaSAndroid Build Coastguard Worker // Converts the four single-precision, floating-point values of a to signed
4565*3f1979aaSAndroid Build Coastguard Worker // 32-bit integer values.
4566*3f1979aaSAndroid Build Coastguard Worker //
4567*3f1979aaSAndroid Build Coastguard Worker // r0 := (int) a0
4568*3f1979aaSAndroid Build Coastguard Worker // r1 := (int) a1
4569*3f1979aaSAndroid Build Coastguard Worker // r2 := (int) a2
4570*3f1979aaSAndroid Build Coastguard Worker // r3 := (int) a3
4571*3f1979aaSAndroid Build Coastguard Worker //
4572*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/vstudio/xdc42k5e(v=vs.100).aspx
4573*3f1979aaSAndroid Build Coastguard Worker // *NOTE*. The default rounding mode on SSE is 'round to even', which ARMv7-A
4574*3f1979aaSAndroid Build Coastguard Worker // does not support! It is supported on ARMv8-A however.
_mm_cvtps_epi32(__m128 a)4575*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a)
4576*3f1979aaSAndroid Build Coastguard Worker {
4577*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
4578*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s32(vcvtnq_s32_f32(a));
4579*3f1979aaSAndroid Build Coastguard Worker #else
4580*3f1979aaSAndroid Build Coastguard Worker uint32x4_t signmask = vdupq_n_u32(0x80000000);
4581*3f1979aaSAndroid Build Coastguard Worker float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a),
4582*3f1979aaSAndroid Build Coastguard Worker vdupq_n_f32(0.5f)); /* +/- 0.5 */
4583*3f1979aaSAndroid Build Coastguard Worker int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32(
4584*3f1979aaSAndroid Build Coastguard Worker vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/
4585*3f1979aaSAndroid Build Coastguard Worker int32x4_t r_trunc =
4586*3f1979aaSAndroid Build Coastguard Worker vcvtq_s32_f32(vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */
4587*3f1979aaSAndroid Build Coastguard Worker int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32(
4588*3f1979aaSAndroid Build Coastguard Worker vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */
4589*3f1979aaSAndroid Build Coastguard Worker int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
4590*3f1979aaSAndroid Build Coastguard Worker vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */
4591*3f1979aaSAndroid Build Coastguard Worker float32x4_t delta = vsubq_f32(
4592*3f1979aaSAndroid Build Coastguard Worker vreinterpretq_f32_m128(a),
4593*3f1979aaSAndroid Build Coastguard Worker vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */
4594*3f1979aaSAndroid Build Coastguard Worker uint32x4_t is_delta_half = vceqq_f32(delta, half); /* delta == +/- 0.5 */
4595*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s32(vbslq_s32(is_delta_half, r_even, r_normal));
4596*3f1979aaSAndroid Build Coastguard Worker #endif
4597*3f1979aaSAndroid Build Coastguard Worker }
4598*3f1979aaSAndroid Build Coastguard Worker
4599*3f1979aaSAndroid Build Coastguard Worker // Copy the lower 32-bit integer in a to dst.
4600*3f1979aaSAndroid Build Coastguard Worker //
4601*3f1979aaSAndroid Build Coastguard Worker // dst[31:0] := a[31:0]
4602*3f1979aaSAndroid Build Coastguard Worker //
4603*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si32
_mm_cvtsi128_si32(__m128i a)4604*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE int _mm_cvtsi128_si32(__m128i a)
4605*3f1979aaSAndroid Build Coastguard Worker {
4606*3f1979aaSAndroid Build Coastguard Worker return vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0);
4607*3f1979aaSAndroid Build Coastguard Worker }
4608*3f1979aaSAndroid Build Coastguard Worker
4609*3f1979aaSAndroid Build Coastguard Worker // Copy the lower 64-bit integer in a to dst.
4610*3f1979aaSAndroid Build Coastguard Worker //
4611*3f1979aaSAndroid Build Coastguard Worker // dst[63:0] := a[63:0]
4612*3f1979aaSAndroid Build Coastguard Worker //
4613*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64
_mm_cvtsi128_si64(__m128i a)4614*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE int64_t _mm_cvtsi128_si64(__m128i a)
4615*3f1979aaSAndroid Build Coastguard Worker {
4616*3f1979aaSAndroid Build Coastguard Worker return vgetq_lane_s64(vreinterpretq_s64_m128i(a), 0);
4617*3f1979aaSAndroid Build Coastguard Worker }
4618*3f1979aaSAndroid Build Coastguard Worker
// Copy the lower 64-bit integer in a to dst.
//
//   dst[63:0] := a[63:0]
//
// Function-like alias: expands to a call of _mm_cvtsi128_si64 with the same
// argument.
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64x
#define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a)
4625*3f1979aaSAndroid Build Coastguard Worker
4626*3f1979aaSAndroid Build Coastguard Worker // Moves 32-bit integer a to the least significant 32 bits of an __m128 object,
4627*3f1979aaSAndroid Build Coastguard Worker // zero extending the upper bits.
4628*3f1979aaSAndroid Build Coastguard Worker //
4629*3f1979aaSAndroid Build Coastguard Worker // r0 := a
4630*3f1979aaSAndroid Build Coastguard Worker // r1 := 0x0
4631*3f1979aaSAndroid Build Coastguard Worker // r2 := 0x0
4632*3f1979aaSAndroid Build Coastguard Worker // r3 := 0x0
4633*3f1979aaSAndroid Build Coastguard Worker //
4634*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/ct3539ha%28v=vs.90%29.aspx
_mm_cvtsi32_si128(int a)4635*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_cvtsi32_si128(int a)
4636*3f1979aaSAndroid Build Coastguard Worker {
4637*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s32(vsetq_lane_s32(a, vdupq_n_s32(0), 0));
4638*3f1979aaSAndroid Build Coastguard Worker }
4639*3f1979aaSAndroid Build Coastguard Worker
4640*3f1979aaSAndroid Build Coastguard Worker // Moves 64-bit integer a to the least significant 64 bits of an __m128 object,
4641*3f1979aaSAndroid Build Coastguard Worker // zero extending the upper bits.
4642*3f1979aaSAndroid Build Coastguard Worker //
4643*3f1979aaSAndroid Build Coastguard Worker // r0 := a
4644*3f1979aaSAndroid Build Coastguard Worker // r1 := 0x0
_mm_cvtsi64_si128(int64_t a)4645*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_cvtsi64_si128(int64_t a)
4646*3f1979aaSAndroid Build Coastguard Worker {
4647*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s64(vsetq_lane_s64(a, vdupq_n_s64(0), 0));
4648*3f1979aaSAndroid Build Coastguard Worker }
4649*3f1979aaSAndroid Build Coastguard Worker
4650*3f1979aaSAndroid Build Coastguard Worker // Cast vector of type __m128 to type __m128d. This intrinsic is only used for
4651*3f1979aaSAndroid Build Coastguard Worker // compilation and does not generate any instructions, thus it has zero latency.
4652*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castps_pd
_mm_castps_pd(__m128 a)4653*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128d _mm_castps_pd(__m128 a)
4654*3f1979aaSAndroid Build Coastguard Worker {
4655*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128d_s32(vreinterpretq_s32_m128(a));
4656*3f1979aaSAndroid Build Coastguard Worker }
4657*3f1979aaSAndroid Build Coastguard Worker
4658*3f1979aaSAndroid Build Coastguard Worker // Applies a type cast to reinterpret four 32-bit floating point values passed
4659*3f1979aaSAndroid Build Coastguard Worker // in as a 128-bit parameter as packed 32-bit integers.
4660*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/bb514099.aspx
_mm_castps_si128(__m128 a)4661*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_castps_si128(__m128 a)
4662*3f1979aaSAndroid Build Coastguard Worker {
4663*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s32(vreinterpretq_s32_m128(a));
4664*3f1979aaSAndroid Build Coastguard Worker }
4665*3f1979aaSAndroid Build Coastguard Worker
4666*3f1979aaSAndroid Build Coastguard Worker // Applies a type cast to reinterpret four 32-bit integers passed in as a
4667*3f1979aaSAndroid Build Coastguard Worker // 128-bit parameter as packed 32-bit floating point values.
4668*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/bb514029.aspx
_mm_castsi128_ps(__m128i a)4669*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a)
4670*3f1979aaSAndroid Build Coastguard Worker {
4671*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128_s32(vreinterpretq_s32_m128i(a));
4672*3f1979aaSAndroid Build Coastguard Worker }
4673*3f1979aaSAndroid Build Coastguard Worker
4674*3f1979aaSAndroid Build Coastguard Worker // Loads 128-bit value. :
4675*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/atzzad1h(v=vs.80).aspx
_mm_load_si128(const __m128i * p)4676*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
4677*3f1979aaSAndroid Build Coastguard Worker {
4678*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
4679*3f1979aaSAndroid Build Coastguard Worker }
4680*3f1979aaSAndroid Build Coastguard Worker
4681*3f1979aaSAndroid Build Coastguard Worker // Load a double-precision (64-bit) floating-point element from memory into both
4682*3f1979aaSAndroid Build Coastguard Worker // elements of dst.
4683*3f1979aaSAndroid Build Coastguard Worker //
4684*3f1979aaSAndroid Build Coastguard Worker // dst[63:0] := MEM[mem_addr+63:mem_addr]
4685*3f1979aaSAndroid Build Coastguard Worker // dst[127:64] := MEM[mem_addr+63:mem_addr]
4686*3f1979aaSAndroid Build Coastguard Worker //
4687*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load1_pd
_mm_load1_pd(const double * p)4688*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128d _mm_load1_pd(const double *p)
4689*3f1979aaSAndroid Build Coastguard Worker {
4690*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
4691*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128d_f64(vld1q_dup_f64(p));
4692*3f1979aaSAndroid Build Coastguard Worker #else
4693*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128d_s64(vdupq_n_s64(*(const int64_t *) p));
4694*3f1979aaSAndroid Build Coastguard Worker #endif
4695*3f1979aaSAndroid Build Coastguard Worker }
4696*3f1979aaSAndroid Build Coastguard Worker
4697*3f1979aaSAndroid Build Coastguard Worker // Load a double-precision (64-bit) floating-point element from memory into the
4698*3f1979aaSAndroid Build Coastguard Worker // upper element of dst, and copy the lower element from a to dst. mem_addr does
4699*3f1979aaSAndroid Build Coastguard Worker // not need to be aligned on any particular boundary.
4700*3f1979aaSAndroid Build Coastguard Worker //
4701*3f1979aaSAndroid Build Coastguard Worker // dst[63:0] := a[63:0]
4702*3f1979aaSAndroid Build Coastguard Worker // dst[127:64] := MEM[mem_addr+63:mem_addr]
4703*3f1979aaSAndroid Build Coastguard Worker //
4704*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadh_pd
_mm_loadh_pd(__m128d a,const double * p)4705*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128d _mm_loadh_pd(__m128d a, const double *p)
4706*3f1979aaSAndroid Build Coastguard Worker {
4707*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
4708*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128d_f64(
4709*3f1979aaSAndroid Build Coastguard Worker vcombine_f64(vget_low_f64(vreinterpretq_f64_m128d(a)), vld1_f64(p)));
4710*3f1979aaSAndroid Build Coastguard Worker #else
4711*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128d_f32(vcombine_f32(
4712*3f1979aaSAndroid Build Coastguard Worker vget_low_f32(vreinterpretq_f32_m128d(a)), vld1_f32((const float *) p)));
4713*3f1979aaSAndroid Build Coastguard Worker #endif
4714*3f1979aaSAndroid Build Coastguard Worker }
4715*3f1979aaSAndroid Build Coastguard Worker
// Load a double-precision (64-bit) floating-point element from memory into both
// elements of dst.
//
//   dst[63:0] := MEM[mem_addr+63:mem_addr]
//   dst[127:64] := MEM[mem_addr+63:mem_addr]
//
// Plain name alias for _mm_load1_pd.
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd1
#define _mm_load_pd1 _mm_load1_pd
4724*3f1979aaSAndroid Build Coastguard Worker
// Load a double-precision (64-bit) floating-point element from memory into both
// elements of dst.
//
//   dst[63:0] := MEM[mem_addr+63:mem_addr]
//   dst[127:64] := MEM[mem_addr+63:mem_addr]
//
// Plain name alias for _mm_load1_pd.
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loaddup_pd
#define _mm_loaddup_pd _mm_load1_pd
4733*3f1979aaSAndroid Build Coastguard Worker
4734*3f1979aaSAndroid Build Coastguard Worker // Loads 128-bit value. :
4735*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/zh-cn/library/f4k12ae8(v=vs.90).aspx
_mm_loadu_si128(const __m128i * p)4736*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p)
4737*3f1979aaSAndroid Build Coastguard Worker {
4738*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
4739*3f1979aaSAndroid Build Coastguard Worker }
4740*3f1979aaSAndroid Build Coastguard Worker
4741*3f1979aaSAndroid Build Coastguard Worker // Load unaligned 32-bit integer from memory into the first element of dst.
4742*3f1979aaSAndroid Build Coastguard Worker //
4743*3f1979aaSAndroid Build Coastguard Worker // dst[31:0] := MEM[mem_addr+31:mem_addr]
4744*3f1979aaSAndroid Build Coastguard Worker // dst[MAX:32] := 0
4745*3f1979aaSAndroid Build Coastguard Worker //
4746*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si32
_mm_loadu_si32(const void * p)4747*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_loadu_si32(const void *p)
4748*3f1979aaSAndroid Build Coastguard Worker {
4749*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s32(
4750*3f1979aaSAndroid Build Coastguard Worker vsetq_lane_s32(*(const int32_t *) p, vdupq_n_s32(0), 0));
4751*3f1979aaSAndroid Build Coastguard Worker }
4752*3f1979aaSAndroid Build Coastguard Worker
4753*3f1979aaSAndroid Build Coastguard Worker // Convert packed double-precision (64-bit) floating-point elements in a to
4754*3f1979aaSAndroid Build Coastguard Worker // packed single-precision (32-bit) floating-point elements, and store the
4755*3f1979aaSAndroid Build Coastguard Worker // results in dst.
4756*3f1979aaSAndroid Build Coastguard Worker //
4757*3f1979aaSAndroid Build Coastguard Worker // FOR j := 0 to 1
4758*3f1979aaSAndroid Build Coastguard Worker // i := 32*j
4759*3f1979aaSAndroid Build Coastguard Worker // k := 64*j
4760*3f1979aaSAndroid Build Coastguard Worker // dst[i+31:i] := Convert_FP64_To_FP32(a[k+64:k])
4761*3f1979aaSAndroid Build Coastguard Worker // ENDFOR
4762*3f1979aaSAndroid Build Coastguard Worker // dst[127:64] := 0
4763*3f1979aaSAndroid Build Coastguard Worker //
4764*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_ps
_mm_cvtpd_ps(__m128d a)4765*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a)
4766*3f1979aaSAndroid Build Coastguard Worker {
4767*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
4768*3f1979aaSAndroid Build Coastguard Worker float32x2_t tmp = vcvt_f32_f64(vreinterpretq_f64_m128d(a));
4769*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128_f32(vcombine_f32(tmp, vdup_n_f32(0)));
4770*3f1979aaSAndroid Build Coastguard Worker #else
4771*3f1979aaSAndroid Build Coastguard Worker float a0 = (float) ((double *) &a)[0];
4772*3f1979aaSAndroid Build Coastguard Worker float a1 = (float) ((double *) &a)[1];
4773*3f1979aaSAndroid Build Coastguard Worker return _mm_set_ps(0, 0, a1, a0);
4774*3f1979aaSAndroid Build Coastguard Worker #endif
4775*3f1979aaSAndroid Build Coastguard Worker }
4776*3f1979aaSAndroid Build Coastguard Worker
4777*3f1979aaSAndroid Build Coastguard Worker // Copy the lower double-precision (64-bit) floating-point element of a to dst.
4778*3f1979aaSAndroid Build Coastguard Worker //
4779*3f1979aaSAndroid Build Coastguard Worker // dst[63:0] := a[63:0]
4780*3f1979aaSAndroid Build Coastguard Worker //
4781*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_f64
_mm_cvtsd_f64(__m128d a)4782*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE double _mm_cvtsd_f64(__m128d a)
4783*3f1979aaSAndroid Build Coastguard Worker {
4784*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
4785*3f1979aaSAndroid Build Coastguard Worker return (double) vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0);
4786*3f1979aaSAndroid Build Coastguard Worker #else
4787*3f1979aaSAndroid Build Coastguard Worker return ((double *) &a)[0];
4788*3f1979aaSAndroid Build Coastguard Worker #endif
4789*3f1979aaSAndroid Build Coastguard Worker }
4790*3f1979aaSAndroid Build Coastguard Worker
4791*3f1979aaSAndroid Build Coastguard Worker // Convert packed single-precision (32-bit) floating-point elements in a to
4792*3f1979aaSAndroid Build Coastguard Worker // packed double-precision (64-bit) floating-point elements, and store the
4793*3f1979aaSAndroid Build Coastguard Worker // results in dst.
4794*3f1979aaSAndroid Build Coastguard Worker //
4795*3f1979aaSAndroid Build Coastguard Worker // FOR j := 0 to 1
4796*3f1979aaSAndroid Build Coastguard Worker // i := 64*j
4797*3f1979aaSAndroid Build Coastguard Worker // k := 32*j
4798*3f1979aaSAndroid Build Coastguard Worker // dst[i+63:i] := Convert_FP32_To_FP64(a[k+31:k])
4799*3f1979aaSAndroid Build Coastguard Worker // ENDFOR
4800*3f1979aaSAndroid Build Coastguard Worker //
4801*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pd
_mm_cvtps_pd(__m128 a)4802*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128d _mm_cvtps_pd(__m128 a)
4803*3f1979aaSAndroid Build Coastguard Worker {
4804*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
4805*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128d_f64(
4806*3f1979aaSAndroid Build Coastguard Worker vcvt_f64_f32(vget_low_f32(vreinterpretq_f32_m128(a))));
4807*3f1979aaSAndroid Build Coastguard Worker #else
4808*3f1979aaSAndroid Build Coastguard Worker double a0 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
4809*3f1979aaSAndroid Build Coastguard Worker double a1 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 1);
4810*3f1979aaSAndroid Build Coastguard Worker return _mm_set_pd(a1, a0);
4811*3f1979aaSAndroid Build Coastguard Worker #endif
4812*3f1979aaSAndroid Build Coastguard Worker }
4813*3f1979aaSAndroid Build Coastguard Worker
4814*3f1979aaSAndroid Build Coastguard Worker // Cast vector of type __m128d to type __m128i. This intrinsic is only used for
4815*3f1979aaSAndroid Build Coastguard Worker // compilation and does not generate any instructions, thus it has zero latency.
4816*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castpd_si128
_mm_castpd_si128(__m128d a)4817*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_castpd_si128(__m128d a)
4818*3f1979aaSAndroid Build Coastguard Worker {
4819*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s64(vreinterpretq_s64_m128d(a));
4820*3f1979aaSAndroid Build Coastguard Worker }
4821*3f1979aaSAndroid Build Coastguard Worker
4822*3f1979aaSAndroid Build Coastguard Worker // Blend packed single-precision (32-bit) floating-point elements from a and b
4823*3f1979aaSAndroid Build Coastguard Worker // using mask, and store the results in dst.
4824*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_ps
_mm_blendv_ps(__m128 a,__m128 b,__m128 mask)4825*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_blendv_ps(__m128 a, __m128 b, __m128 mask)
4826*3f1979aaSAndroid Build Coastguard Worker {
4827*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128_f32(vbslq_f32(vreinterpretq_u32_m128(mask),
4828*3f1979aaSAndroid Build Coastguard Worker vreinterpretq_f32_m128(b),
4829*3f1979aaSAndroid Build Coastguard Worker vreinterpretq_f32_m128(a)));
4830*3f1979aaSAndroid Build Coastguard Worker }
4831*3f1979aaSAndroid Build Coastguard Worker
4832*3f1979aaSAndroid Build Coastguard Worker // Round the packed single-precision (32-bit) floating-point elements in a using
4833*3f1979aaSAndroid Build Coastguard Worker // the rounding parameter, and store the results as packed single-precision
4834*3f1979aaSAndroid Build Coastguard Worker // floating-point elements in dst.
4835*3f1979aaSAndroid Build Coastguard Worker // software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ps
_mm_round_ps(__m128 a,int rounding)4836*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_round_ps(__m128 a, int rounding)
4837*3f1979aaSAndroid Build Coastguard Worker {
4838*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
4839*3f1979aaSAndroid Build Coastguard Worker switch (rounding) {
4840*3f1979aaSAndroid Build Coastguard Worker case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
4841*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128_f32(vrndnq_f32(vreinterpretq_f32_m128(a)));
4842*3f1979aaSAndroid Build Coastguard Worker case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC):
4843*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128_f32(vrndmq_f32(vreinterpretq_f32_m128(a)));
4844*3f1979aaSAndroid Build Coastguard Worker case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC):
4845*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128_f32(vrndpq_f32(vreinterpretq_f32_m128(a)));
4846*3f1979aaSAndroid Build Coastguard Worker case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC):
4847*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128_f32(vrndq_f32(vreinterpretq_f32_m128(a)));
4848*3f1979aaSAndroid Build Coastguard Worker default: //_MM_FROUND_CUR_DIRECTION
4849*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128_f32(vrndiq_f32(vreinterpretq_f32_m128(a)));
4850*3f1979aaSAndroid Build Coastguard Worker }
4851*3f1979aaSAndroid Build Coastguard Worker #else
4852*3f1979aaSAndroid Build Coastguard Worker float *v_float = (float *) &a;
4853*3f1979aaSAndroid Build Coastguard Worker __m128 zero, neg_inf, pos_inf;
4854*3f1979aaSAndroid Build Coastguard Worker
4855*3f1979aaSAndroid Build Coastguard Worker switch (rounding) {
4856*3f1979aaSAndroid Build Coastguard Worker case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
4857*3f1979aaSAndroid Build Coastguard Worker return _mm_cvtepi32_ps(_mm_cvtps_epi32(a));
4858*3f1979aaSAndroid Build Coastguard Worker case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC):
4859*3f1979aaSAndroid Build Coastguard Worker return (__m128){floorf(v_float[0]), floorf(v_float[1]),
4860*3f1979aaSAndroid Build Coastguard Worker floorf(v_float[2]), floorf(v_float[3])};
4861*3f1979aaSAndroid Build Coastguard Worker case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC):
4862*3f1979aaSAndroid Build Coastguard Worker return (__m128){ceilf(v_float[0]), ceilf(v_float[1]), ceilf(v_float[2]),
4863*3f1979aaSAndroid Build Coastguard Worker ceilf(v_float[3])};
4864*3f1979aaSAndroid Build Coastguard Worker case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC):
4865*3f1979aaSAndroid Build Coastguard Worker zero = _mm_set_ps(0.0f, 0.0f, 0.0f, 0.0f);
4866*3f1979aaSAndroid Build Coastguard Worker neg_inf = _mm_set_ps(floorf(v_float[0]), floorf(v_float[1]),
4867*3f1979aaSAndroid Build Coastguard Worker floorf(v_float[2]), floorf(v_float[3]));
4868*3f1979aaSAndroid Build Coastguard Worker pos_inf = _mm_set_ps(ceilf(v_float[0]), ceilf(v_float[1]),
4869*3f1979aaSAndroid Build Coastguard Worker ceilf(v_float[2]), ceilf(v_float[3]));
4870*3f1979aaSAndroid Build Coastguard Worker return _mm_blendv_ps(pos_inf, neg_inf, _mm_cmple_ps(a, zero));
4871*3f1979aaSAndroid Build Coastguard Worker default: //_MM_FROUND_CUR_DIRECTION
4872*3f1979aaSAndroid Build Coastguard Worker return (__m128){roundf(v_float[0]), roundf(v_float[1]),
4873*3f1979aaSAndroid Build Coastguard Worker roundf(v_float[2]), roundf(v_float[3])};
4874*3f1979aaSAndroid Build Coastguard Worker }
4875*3f1979aaSAndroid Build Coastguard Worker #endif
4876*3f1979aaSAndroid Build Coastguard Worker }
4877*3f1979aaSAndroid Build Coastguard Worker
4878*3f1979aaSAndroid Build Coastguard Worker // Round the packed single-precision (32-bit) floating-point elements in a up to
4879*3f1979aaSAndroid Build Coastguard Worker // an integer value, and store the results as packed single-precision
4880*3f1979aaSAndroid Build Coastguard Worker // floating-point elements in dst.
4881*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_ps
_mm_ceil_ps(__m128 a)4882*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_ceil_ps(__m128 a)
4883*3f1979aaSAndroid Build Coastguard Worker {
4884*3f1979aaSAndroid Build Coastguard Worker return _mm_round_ps(a, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
4885*3f1979aaSAndroid Build Coastguard Worker }
4886*3f1979aaSAndroid Build Coastguard Worker
4887*3f1979aaSAndroid Build Coastguard Worker // Round the packed single-precision (32-bit) floating-point elements in a down
4888*3f1979aaSAndroid Build Coastguard Worker // to an integer value, and store the results as packed single-precision
4889*3f1979aaSAndroid Build Coastguard Worker // floating-point elements in dst.
4890*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ps
_mm_floor_ps(__m128 a)4891*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_floor_ps(__m128 a)
4892*3f1979aaSAndroid Build Coastguard Worker {
4893*3f1979aaSAndroid Build Coastguard Worker return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
4894*3f1979aaSAndroid Build Coastguard Worker }
4895*3f1979aaSAndroid Build Coastguard Worker
4896*3f1979aaSAndroid Build Coastguard Worker
// Load 128 bits of integer data from unaligned memory into dst. On x86 this
// intrinsic may perform better than _mm_loadu_si128 when the data crosses a
// cache line boundary; here it is simply an alias of _mm_loadu_si128.
//
//   dst[127:0] := MEM[mem_addr+127:mem_addr]
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_lddqu_si128
#define _mm_lddqu_si128 _mm_loadu_si128
4905*3f1979aaSAndroid Build Coastguard Worker
4906*3f1979aaSAndroid Build Coastguard Worker /* Miscellaneous Operations */
4907*3f1979aaSAndroid Build Coastguard Worker
4908*3f1979aaSAndroid Build Coastguard Worker // Shifts the 8 signed 16-bit integers in a right by count bits while shifting
4909*3f1979aaSAndroid Build Coastguard Worker // in the sign bit.
4910*3f1979aaSAndroid Build Coastguard Worker //
4911*3f1979aaSAndroid Build Coastguard Worker // r0 := a0 >> count
4912*3f1979aaSAndroid Build Coastguard Worker // r1 := a1 >> count
4913*3f1979aaSAndroid Build Coastguard Worker // ...
4914*3f1979aaSAndroid Build Coastguard Worker // r7 := a7 >> count
4915*3f1979aaSAndroid Build Coastguard Worker //
4916*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/3c9997dk(v%3dvs.90).aspx
_mm_sra_epi16(__m128i a,__m128i count)4917*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_sra_epi16(__m128i a, __m128i count)
4918*3f1979aaSAndroid Build Coastguard Worker {
4919*3f1979aaSAndroid Build Coastguard Worker int64_t c = (int64_t) vget_low_s64((int64x2_t) count);
4920*3f1979aaSAndroid Build Coastguard Worker if (c > 15)
4921*3f1979aaSAndroid Build Coastguard Worker return _mm_cmplt_epi16(a, _mm_setzero_si128());
4922*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s16(vshlq_s16((int16x8_t) a, vdupq_n_s16(-c)));
4923*3f1979aaSAndroid Build Coastguard Worker }
4924*3f1979aaSAndroid Build Coastguard Worker
4925*3f1979aaSAndroid Build Coastguard Worker // Shifts the 4 signed 32-bit integers in a right by count bits while shifting
4926*3f1979aaSAndroid Build Coastguard Worker // in the sign bit.
4927*3f1979aaSAndroid Build Coastguard Worker //
4928*3f1979aaSAndroid Build Coastguard Worker // r0 := a0 >> count
4929*3f1979aaSAndroid Build Coastguard Worker // r1 := a1 >> count
4930*3f1979aaSAndroid Build Coastguard Worker // r2 := a2 >> count
4931*3f1979aaSAndroid Build Coastguard Worker // r3 := a3 >> count
4932*3f1979aaSAndroid Build Coastguard Worker //
4933*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/ce40009e(v%3dvs.100).aspx
_mm_sra_epi32(__m128i a,__m128i count)4934*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_sra_epi32(__m128i a, __m128i count)
4935*3f1979aaSAndroid Build Coastguard Worker {
4936*3f1979aaSAndroid Build Coastguard Worker int64_t c = (int64_t) vget_low_s64((int64x2_t) count);
4937*3f1979aaSAndroid Build Coastguard Worker if (c > 31)
4938*3f1979aaSAndroid Build Coastguard Worker return _mm_cmplt_epi32(a, _mm_setzero_si128());
4939*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s32(vshlq_s32((int32x4_t) a, vdupq_n_s32(-c)));
4940*3f1979aaSAndroid Build Coastguard Worker }
4941*3f1979aaSAndroid Build Coastguard Worker
4942*3f1979aaSAndroid Build Coastguard Worker // Packs the 16 signed 16-bit integers from a and b into 8-bit integers and
4943*3f1979aaSAndroid Build Coastguard Worker // saturates.
4944*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/k4y4f7w5%28v=vs.90%29.aspx
_mm_packs_epi16(__m128i a,__m128i b)4945*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b)
4946*3f1979aaSAndroid Build Coastguard Worker {
4947*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s8(
4948*3f1979aaSAndroid Build Coastguard Worker vcombine_s8(vqmovn_s16(vreinterpretq_s16_m128i(a)),
4949*3f1979aaSAndroid Build Coastguard Worker vqmovn_s16(vreinterpretq_s16_m128i(b))));
4950*3f1979aaSAndroid Build Coastguard Worker }
4951*3f1979aaSAndroid Build Coastguard Worker
4952*3f1979aaSAndroid Build Coastguard Worker // Packs the 16 signed 16 - bit integers from a and b into 8 - bit unsigned
4953*3f1979aaSAndroid Build Coastguard Worker // integers and saturates.
4954*3f1979aaSAndroid Build Coastguard Worker //
4955*3f1979aaSAndroid Build Coastguard Worker // r0 := UnsignedSaturate(a0)
4956*3f1979aaSAndroid Build Coastguard Worker // r1 := UnsignedSaturate(a1)
4957*3f1979aaSAndroid Build Coastguard Worker // ...
4958*3f1979aaSAndroid Build Coastguard Worker // r7 := UnsignedSaturate(a7)
4959*3f1979aaSAndroid Build Coastguard Worker // r8 := UnsignedSaturate(b0)
4960*3f1979aaSAndroid Build Coastguard Worker // r9 := UnsignedSaturate(b1)
4961*3f1979aaSAndroid Build Coastguard Worker // ...
4962*3f1979aaSAndroid Build Coastguard Worker // r15 := UnsignedSaturate(b7)
4963*3f1979aaSAndroid Build Coastguard Worker //
4964*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/07ad1wx4(v=vs.100).aspx
_mm_packus_epi16(const __m128i a,const __m128i b)4965*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b)
4966*3f1979aaSAndroid Build Coastguard Worker {
4967*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_u8(
4968*3f1979aaSAndroid Build Coastguard Worker vcombine_u8(vqmovun_s16(vreinterpretq_s16_m128i(a)),
4969*3f1979aaSAndroid Build Coastguard Worker vqmovun_s16(vreinterpretq_s16_m128i(b))));
4970*3f1979aaSAndroid Build Coastguard Worker }
4971*3f1979aaSAndroid Build Coastguard Worker
4972*3f1979aaSAndroid Build Coastguard Worker // Packs the 8 signed 32-bit integers from a and b into signed 16-bit integers
4973*3f1979aaSAndroid Build Coastguard Worker // and saturates.
4974*3f1979aaSAndroid Build Coastguard Worker //
4975*3f1979aaSAndroid Build Coastguard Worker // r0 := SignedSaturate(a0)
4976*3f1979aaSAndroid Build Coastguard Worker // r1 := SignedSaturate(a1)
4977*3f1979aaSAndroid Build Coastguard Worker // r2 := SignedSaturate(a2)
4978*3f1979aaSAndroid Build Coastguard Worker // r3 := SignedSaturate(a3)
4979*3f1979aaSAndroid Build Coastguard Worker // r4 := SignedSaturate(b0)
4980*3f1979aaSAndroid Build Coastguard Worker // r5 := SignedSaturate(b1)
4981*3f1979aaSAndroid Build Coastguard Worker // r6 := SignedSaturate(b2)
4982*3f1979aaSAndroid Build Coastguard Worker // r7 := SignedSaturate(b3)
4983*3f1979aaSAndroid Build Coastguard Worker //
4984*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/393t56f9%28v=vs.90%29.aspx
_mm_packs_epi32(__m128i a,__m128i b)4985*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b)
4986*3f1979aaSAndroid Build Coastguard Worker {
4987*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s16(
4988*3f1979aaSAndroid Build Coastguard Worker vcombine_s16(vqmovn_s32(vreinterpretq_s32_m128i(a)),
4989*3f1979aaSAndroid Build Coastguard Worker vqmovn_s32(vreinterpretq_s32_m128i(b))));
4990*3f1979aaSAndroid Build Coastguard Worker }
4991*3f1979aaSAndroid Build Coastguard Worker
4992*3f1979aaSAndroid Build Coastguard Worker // Packs the 8 unsigned 32-bit integers from a and b into unsigned 16-bit
4993*3f1979aaSAndroid Build Coastguard Worker // integers and saturates.
4994*3f1979aaSAndroid Build Coastguard Worker //
4995*3f1979aaSAndroid Build Coastguard Worker // r0 := UnsignedSaturate(a0)
4996*3f1979aaSAndroid Build Coastguard Worker // r1 := UnsignedSaturate(a1)
4997*3f1979aaSAndroid Build Coastguard Worker // r2 := UnsignedSaturate(a2)
4998*3f1979aaSAndroid Build Coastguard Worker // r3 := UnsignedSaturate(a3)
4999*3f1979aaSAndroid Build Coastguard Worker // r4 := UnsignedSaturate(b0)
5000*3f1979aaSAndroid Build Coastguard Worker // r5 := UnsignedSaturate(b1)
5001*3f1979aaSAndroid Build Coastguard Worker // r6 := UnsignedSaturate(b2)
5002*3f1979aaSAndroid Build Coastguard Worker // r7 := UnsignedSaturate(b3)
_mm_packus_epi32(__m128i a,__m128i b)5003*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b)
5004*3f1979aaSAndroid Build Coastguard Worker {
5005*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_u16(
5006*3f1979aaSAndroid Build Coastguard Worker vcombine_u16(vqmovun_s32(vreinterpretq_s32_m128i(a)),
5007*3f1979aaSAndroid Build Coastguard Worker vqmovun_s32(vreinterpretq_s32_m128i(b))));
5008*3f1979aaSAndroid Build Coastguard Worker }
5009*3f1979aaSAndroid Build Coastguard Worker
5010*3f1979aaSAndroid Build Coastguard Worker // Interleaves the lower 8 signed or unsigned 8-bit integers in a with the lower
5011*3f1979aaSAndroid Build Coastguard Worker // 8 signed or unsigned 8-bit integers in b.
5012*3f1979aaSAndroid Build Coastguard Worker //
5013*3f1979aaSAndroid Build Coastguard Worker // r0 := a0
5014*3f1979aaSAndroid Build Coastguard Worker // r1 := b0
5015*3f1979aaSAndroid Build Coastguard Worker // r2 := a1
5016*3f1979aaSAndroid Build Coastguard Worker // r3 := b1
5017*3f1979aaSAndroid Build Coastguard Worker // ...
5018*3f1979aaSAndroid Build Coastguard Worker // r14 := a7
5019*3f1979aaSAndroid Build Coastguard Worker // r15 := b7
5020*3f1979aaSAndroid Build Coastguard Worker //
5021*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/xf7k860c%28v=vs.90%29.aspx
_mm_unpacklo_epi8(__m128i a,__m128i b)5022*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b)
5023*3f1979aaSAndroid Build Coastguard Worker {
5024*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
5025*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s8(
5026*3f1979aaSAndroid Build Coastguard Worker vzip1q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
5027*3f1979aaSAndroid Build Coastguard Worker #else
5028*3f1979aaSAndroid Build Coastguard Worker int8x8_t a1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(a)));
5029*3f1979aaSAndroid Build Coastguard Worker int8x8_t b1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(b)));
5030*3f1979aaSAndroid Build Coastguard Worker int8x8x2_t result = vzip_s8(a1, b1);
5031*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));
5032*3f1979aaSAndroid Build Coastguard Worker #endif
5033*3f1979aaSAndroid Build Coastguard Worker }
5034*3f1979aaSAndroid Build Coastguard Worker
5035*3f1979aaSAndroid Build Coastguard Worker // Interleaves the lower 4 signed or unsigned 16-bit integers in a with the
5036*3f1979aaSAndroid Build Coastguard Worker // lower 4 signed or unsigned 16-bit integers in b.
5037*3f1979aaSAndroid Build Coastguard Worker //
5038*3f1979aaSAndroid Build Coastguard Worker // r0 := a0
5039*3f1979aaSAndroid Build Coastguard Worker // r1 := b0
5040*3f1979aaSAndroid Build Coastguard Worker // r2 := a1
5041*3f1979aaSAndroid Build Coastguard Worker // r3 := b1
5042*3f1979aaSAndroid Build Coastguard Worker // r4 := a2
5043*3f1979aaSAndroid Build Coastguard Worker // r5 := b2
5044*3f1979aaSAndroid Build Coastguard Worker // r6 := a3
5045*3f1979aaSAndroid Build Coastguard Worker // r7 := b3
5046*3f1979aaSAndroid Build Coastguard Worker //
5047*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/btxb17bw%28v=vs.90%29.aspx
_mm_unpacklo_epi16(__m128i a,__m128i b)5048*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b)
5049*3f1979aaSAndroid Build Coastguard Worker {
5050*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
5051*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s16(
5052*3f1979aaSAndroid Build Coastguard Worker vzip1q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
5053*3f1979aaSAndroid Build Coastguard Worker #else
5054*3f1979aaSAndroid Build Coastguard Worker int16x4_t a1 = vget_low_s16(vreinterpretq_s16_m128i(a));
5055*3f1979aaSAndroid Build Coastguard Worker int16x4_t b1 = vget_low_s16(vreinterpretq_s16_m128i(b));
5056*3f1979aaSAndroid Build Coastguard Worker int16x4x2_t result = vzip_s16(a1, b1);
5057*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
5058*3f1979aaSAndroid Build Coastguard Worker #endif
5059*3f1979aaSAndroid Build Coastguard Worker }
5060*3f1979aaSAndroid Build Coastguard Worker
5061*3f1979aaSAndroid Build Coastguard Worker // Interleaves the lower 2 signed or unsigned 32 - bit integers in a with the
5062*3f1979aaSAndroid Build Coastguard Worker // lower 2 signed or unsigned 32 - bit integers in b.
5063*3f1979aaSAndroid Build Coastguard Worker //
5064*3f1979aaSAndroid Build Coastguard Worker // r0 := a0
5065*3f1979aaSAndroid Build Coastguard Worker // r1 := b0
5066*3f1979aaSAndroid Build Coastguard Worker // r2 := a1
5067*3f1979aaSAndroid Build Coastguard Worker // r3 := b1
5068*3f1979aaSAndroid Build Coastguard Worker //
5069*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/x8atst9d(v=vs.100).aspx
_mm_unpacklo_epi32(__m128i a,__m128i b)5070*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b)
5071*3f1979aaSAndroid Build Coastguard Worker {
5072*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
5073*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s32(
5074*3f1979aaSAndroid Build Coastguard Worker vzip1q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
5075*3f1979aaSAndroid Build Coastguard Worker #else
5076*3f1979aaSAndroid Build Coastguard Worker int32x2_t a1 = vget_low_s32(vreinterpretq_s32_m128i(a));
5077*3f1979aaSAndroid Build Coastguard Worker int32x2_t b1 = vget_low_s32(vreinterpretq_s32_m128i(b));
5078*3f1979aaSAndroid Build Coastguard Worker int32x2x2_t result = vzip_s32(a1, b1);
5079*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));
5080*3f1979aaSAndroid Build Coastguard Worker #endif
5081*3f1979aaSAndroid Build Coastguard Worker }
5082*3f1979aaSAndroid Build Coastguard Worker
_mm_unpacklo_epi64(__m128i a,__m128i b)5083*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_unpacklo_epi64(__m128i a, __m128i b)
5084*3f1979aaSAndroid Build Coastguard Worker {
5085*3f1979aaSAndroid Build Coastguard Worker int64x1_t a_l = vget_low_s64(vreinterpretq_s64_m128i(a));
5086*3f1979aaSAndroid Build Coastguard Worker int64x1_t b_l = vget_low_s64(vreinterpretq_s64_m128i(b));
5087*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s64(vcombine_s64(a_l, b_l));
5088*3f1979aaSAndroid Build Coastguard Worker }
5089*3f1979aaSAndroid Build Coastguard Worker
5090*3f1979aaSAndroid Build Coastguard Worker // Selects and interleaves the lower two single-precision, floating-point values
5091*3f1979aaSAndroid Build Coastguard Worker // from a and b.
5092*3f1979aaSAndroid Build Coastguard Worker //
5093*3f1979aaSAndroid Build Coastguard Worker // r0 := a0
5094*3f1979aaSAndroid Build Coastguard Worker // r1 := b0
5095*3f1979aaSAndroid Build Coastguard Worker // r2 := a1
5096*3f1979aaSAndroid Build Coastguard Worker // r3 := b1
5097*3f1979aaSAndroid Build Coastguard Worker //
5098*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/25st103b%28v=vs.90%29.aspx
_mm_unpacklo_ps(__m128 a,__m128 b)5099*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b)
5100*3f1979aaSAndroid Build Coastguard Worker {
5101*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
5102*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128_f32(
5103*3f1979aaSAndroid Build Coastguard Worker vzip1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
5104*3f1979aaSAndroid Build Coastguard Worker #else
5105*3f1979aaSAndroid Build Coastguard Worker float32x2_t a1 = vget_low_f32(vreinterpretq_f32_m128(a));
5106*3f1979aaSAndroid Build Coastguard Worker float32x2_t b1 = vget_low_f32(vreinterpretq_f32_m128(b));
5107*3f1979aaSAndroid Build Coastguard Worker float32x2x2_t result = vzip_f32(a1, b1);
5108*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));
5109*3f1979aaSAndroid Build Coastguard Worker #endif
5110*3f1979aaSAndroid Build Coastguard Worker }
5111*3f1979aaSAndroid Build Coastguard Worker
5112*3f1979aaSAndroid Build Coastguard Worker // Selects and interleaves the upper two single-precision, floating-point values
5113*3f1979aaSAndroid Build Coastguard Worker // from a and b.
5114*3f1979aaSAndroid Build Coastguard Worker //
5115*3f1979aaSAndroid Build Coastguard Worker // r0 := a2
5116*3f1979aaSAndroid Build Coastguard Worker // r1 := b2
5117*3f1979aaSAndroid Build Coastguard Worker // r2 := a3
5118*3f1979aaSAndroid Build Coastguard Worker // r3 := b3
5119*3f1979aaSAndroid Build Coastguard Worker //
5120*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/skccxx7d%28v=vs.90%29.aspx
_mm_unpackhi_ps(__m128 a,__m128 b)5121*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b)
5122*3f1979aaSAndroid Build Coastguard Worker {
5123*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
5124*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128_f32(
5125*3f1979aaSAndroid Build Coastguard Worker vzip2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
5126*3f1979aaSAndroid Build Coastguard Worker #else
5127*3f1979aaSAndroid Build Coastguard Worker float32x2_t a1 = vget_high_f32(vreinterpretq_f32_m128(a));
5128*3f1979aaSAndroid Build Coastguard Worker float32x2_t b1 = vget_high_f32(vreinterpretq_f32_m128(b));
5129*3f1979aaSAndroid Build Coastguard Worker float32x2x2_t result = vzip_f32(a1, b1);
5130*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));
5131*3f1979aaSAndroid Build Coastguard Worker #endif
5132*3f1979aaSAndroid Build Coastguard Worker }
5133*3f1979aaSAndroid Build Coastguard Worker
5134*3f1979aaSAndroid Build Coastguard Worker // Interleaves the upper 8 signed or unsigned 8-bit integers in a with the upper
5135*3f1979aaSAndroid Build Coastguard Worker // 8 signed or unsigned 8-bit integers in b.
5136*3f1979aaSAndroid Build Coastguard Worker //
5137*3f1979aaSAndroid Build Coastguard Worker // r0 := a8
5138*3f1979aaSAndroid Build Coastguard Worker // r1 := b8
5139*3f1979aaSAndroid Build Coastguard Worker // r2 := a9
5140*3f1979aaSAndroid Build Coastguard Worker // r3 := b9
5141*3f1979aaSAndroid Build Coastguard Worker // ...
5142*3f1979aaSAndroid Build Coastguard Worker // r14 := a15
5143*3f1979aaSAndroid Build Coastguard Worker // r15 := b15
5144*3f1979aaSAndroid Build Coastguard Worker //
5145*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/t5h7783k(v=vs.100).aspx
_mm_unpackhi_epi8(__m128i a,__m128i b)5146*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b)
5147*3f1979aaSAndroid Build Coastguard Worker {
5148*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
5149*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s8(
5150*3f1979aaSAndroid Build Coastguard Worker vzip2q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
5151*3f1979aaSAndroid Build Coastguard Worker #else
5152*3f1979aaSAndroid Build Coastguard Worker int8x8_t a1 =
5153*3f1979aaSAndroid Build Coastguard Worker vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(a)));
5154*3f1979aaSAndroid Build Coastguard Worker int8x8_t b1 =
5155*3f1979aaSAndroid Build Coastguard Worker vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(b)));
5156*3f1979aaSAndroid Build Coastguard Worker int8x8x2_t result = vzip_s8(a1, b1);
5157*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));
5158*3f1979aaSAndroid Build Coastguard Worker #endif
5159*3f1979aaSAndroid Build Coastguard Worker }
5160*3f1979aaSAndroid Build Coastguard Worker
5161*3f1979aaSAndroid Build Coastguard Worker // Interleaves the upper 4 signed or unsigned 16-bit integers in a with the
5162*3f1979aaSAndroid Build Coastguard Worker // upper 4 signed or unsigned 16-bit integers in b.
5163*3f1979aaSAndroid Build Coastguard Worker //
5164*3f1979aaSAndroid Build Coastguard Worker // r0 := a4
5165*3f1979aaSAndroid Build Coastguard Worker // r1 := b4
5166*3f1979aaSAndroid Build Coastguard Worker // r2 := a5
5167*3f1979aaSAndroid Build Coastguard Worker // r3 := b5
5168*3f1979aaSAndroid Build Coastguard Worker // r4 := a6
5169*3f1979aaSAndroid Build Coastguard Worker // r5 := b6
5170*3f1979aaSAndroid Build Coastguard Worker // r6 := a7
5171*3f1979aaSAndroid Build Coastguard Worker // r7 := b7
5172*3f1979aaSAndroid Build Coastguard Worker //
5173*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/03196cz7(v=vs.100).aspx
_mm_unpackhi_epi16(__m128i a,__m128i b)5174*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b)
5175*3f1979aaSAndroid Build Coastguard Worker {
5176*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
5177*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s16(
5178*3f1979aaSAndroid Build Coastguard Worker vzip2q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
5179*3f1979aaSAndroid Build Coastguard Worker #else
5180*3f1979aaSAndroid Build Coastguard Worker int16x4_t a1 = vget_high_s16(vreinterpretq_s16_m128i(a));
5181*3f1979aaSAndroid Build Coastguard Worker int16x4_t b1 = vget_high_s16(vreinterpretq_s16_m128i(b));
5182*3f1979aaSAndroid Build Coastguard Worker int16x4x2_t result = vzip_s16(a1, b1);
5183*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
5184*3f1979aaSAndroid Build Coastguard Worker #endif
5185*3f1979aaSAndroid Build Coastguard Worker }
5186*3f1979aaSAndroid Build Coastguard Worker
5187*3f1979aaSAndroid Build Coastguard Worker // Interleaves the upper 2 signed or unsigned 32-bit integers in a with the
5188*3f1979aaSAndroid Build Coastguard Worker // upper 2 signed or unsigned 32-bit integers in b.
5189*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/65sa7cbs(v=vs.100).aspx
_mm_unpackhi_epi32(__m128i a,__m128i b)5190*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b)
5191*3f1979aaSAndroid Build Coastguard Worker {
5192*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
5193*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s32(
5194*3f1979aaSAndroid Build Coastguard Worker vzip2q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
5195*3f1979aaSAndroid Build Coastguard Worker #else
5196*3f1979aaSAndroid Build Coastguard Worker int32x2_t a1 = vget_high_s32(vreinterpretq_s32_m128i(a));
5197*3f1979aaSAndroid Build Coastguard Worker int32x2_t b1 = vget_high_s32(vreinterpretq_s32_m128i(b));
5198*3f1979aaSAndroid Build Coastguard Worker int32x2x2_t result = vzip_s32(a1, b1);
5199*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));
5200*3f1979aaSAndroid Build Coastguard Worker #endif
5201*3f1979aaSAndroid Build Coastguard Worker }
5202*3f1979aaSAndroid Build Coastguard Worker
5203*3f1979aaSAndroid Build Coastguard Worker // Interleaves the upper signed or unsigned 64-bit integer in a with the
5204*3f1979aaSAndroid Build Coastguard Worker // upper signed or unsigned 64-bit integer in b.
5205*3f1979aaSAndroid Build Coastguard Worker //
5206*3f1979aaSAndroid Build Coastguard Worker // r0 := a1
5207*3f1979aaSAndroid Build Coastguard Worker // r1 := b1
_mm_unpackhi_epi64(__m128i a,__m128i b)5208*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_unpackhi_epi64(__m128i a, __m128i b)
5209*3f1979aaSAndroid Build Coastguard Worker {
5210*3f1979aaSAndroid Build Coastguard Worker int64x1_t a_h = vget_high_s64(vreinterpretq_s64_m128i(a));
5211*3f1979aaSAndroid Build Coastguard Worker int64x1_t b_h = vget_high_s64(vreinterpretq_s64_m128i(b));
5212*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s64(vcombine_s64(a_h, b_h));
5213*3f1979aaSAndroid Build Coastguard Worker }
5214*3f1979aaSAndroid Build Coastguard Worker
5215*3f1979aaSAndroid Build Coastguard Worker // Horizontally compute the minimum amongst the packed unsigned 16-bit integers
5216*3f1979aaSAndroid Build Coastguard Worker // in a, store the minimum and index in dst, and zero the remaining bits in dst.
5217*3f1979aaSAndroid Build Coastguard Worker //
5218*3f1979aaSAndroid Build Coastguard Worker // index[2:0] := 0
5219*3f1979aaSAndroid Build Coastguard Worker // min[15:0] := a[15:0]
5220*3f1979aaSAndroid Build Coastguard Worker // FOR j := 0 to 7
5221*3f1979aaSAndroid Build Coastguard Worker // i := j*16
5222*3f1979aaSAndroid Build Coastguard Worker // IF a[i+15:i] < min[15:0]
5223*3f1979aaSAndroid Build Coastguard Worker // index[2:0] := j
5224*3f1979aaSAndroid Build Coastguard Worker // min[15:0] := a[i+15:i]
5225*3f1979aaSAndroid Build Coastguard Worker // FI
5226*3f1979aaSAndroid Build Coastguard Worker // ENDFOR
5227*3f1979aaSAndroid Build Coastguard Worker // dst[15:0] := min[15:0]
5228*3f1979aaSAndroid Build Coastguard Worker // dst[18:16] := index[2:0]
5229*3f1979aaSAndroid Build Coastguard Worker // dst[127:19] := 0
5230*3f1979aaSAndroid Build Coastguard Worker //
5231*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_minpos_epu16
_mm_minpos_epu16(__m128i a)5232*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_minpos_epu16(__m128i a)
5233*3f1979aaSAndroid Build Coastguard Worker {
5234*3f1979aaSAndroid Build Coastguard Worker __m128i dst;
5235*3f1979aaSAndroid Build Coastguard Worker uint16_t min, idx = 0;
5236*3f1979aaSAndroid Build Coastguard Worker // Find the minimum value
5237*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
5238*3f1979aaSAndroid Build Coastguard Worker min = vminvq_u16(vreinterpretq_u16_m128i(a));
5239*3f1979aaSAndroid Build Coastguard Worker #else
5240*3f1979aaSAndroid Build Coastguard Worker __m64 tmp;
5241*3f1979aaSAndroid Build Coastguard Worker tmp = vreinterpret_m64_u16(
5242*3f1979aaSAndroid Build Coastguard Worker vmin_u16(vget_low_u16(vreinterpretq_u16_m128i(a)),
5243*3f1979aaSAndroid Build Coastguard Worker vget_high_u16(vreinterpretq_u16_m128i(a))));
5244*3f1979aaSAndroid Build Coastguard Worker tmp = vreinterpret_m64_u16(
5245*3f1979aaSAndroid Build Coastguard Worker vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp)));
5246*3f1979aaSAndroid Build Coastguard Worker tmp = vreinterpret_m64_u16(
5247*3f1979aaSAndroid Build Coastguard Worker vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp)));
5248*3f1979aaSAndroid Build Coastguard Worker min = vget_lane_u16(vreinterpret_u16_m64(tmp), 0);
5249*3f1979aaSAndroid Build Coastguard Worker #endif
5250*3f1979aaSAndroid Build Coastguard Worker // Get the index of the minimum value
5251*3f1979aaSAndroid Build Coastguard Worker int i;
5252*3f1979aaSAndroid Build Coastguard Worker for (i = 0; i < 8; i++) {
5253*3f1979aaSAndroid Build Coastguard Worker if (min == vgetq_lane_u16(vreinterpretq_u16_m128i(a), 0)) {
5254*3f1979aaSAndroid Build Coastguard Worker idx = (uint16_t) i;
5255*3f1979aaSAndroid Build Coastguard Worker break;
5256*3f1979aaSAndroid Build Coastguard Worker }
5257*3f1979aaSAndroid Build Coastguard Worker a = _mm_srli_si128(a, 2);
5258*3f1979aaSAndroid Build Coastguard Worker }
5259*3f1979aaSAndroid Build Coastguard Worker // Generate result
5260*3f1979aaSAndroid Build Coastguard Worker dst = _mm_setzero_si128();
5261*3f1979aaSAndroid Build Coastguard Worker dst = vreinterpretq_m128i_u16(
5262*3f1979aaSAndroid Build Coastguard Worker vsetq_lane_u16(min, vreinterpretq_u16_m128i(dst), 0));
5263*3f1979aaSAndroid Build Coastguard Worker dst = vreinterpretq_m128i_u16(
5264*3f1979aaSAndroid Build Coastguard Worker vsetq_lane_u16(idx, vreinterpretq_u16_m128i(dst), 1));
5265*3f1979aaSAndroid Build Coastguard Worker return dst;
5266*3f1979aaSAndroid Build Coastguard Worker }
5267*3f1979aaSAndroid Build Coastguard Worker
// Treats a (upper 16 bytes) and b (lower 16 bytes) as one 32-byte value and
// returns the 16 bytes that start imm bytes above the bottom of b, i.e. the
// concatenation shifted right by imm bytes.
// https://msdn.microsoft.com/en-us/library/bb514041(v=vs.120).aspx
// http://blog.csdn.net/hemmingway/article/details/44828303
// This has to be a macro: Clang is extremely picky about the byte offset
// being a literal.
// NOTE(review): imm is handed straight to vextq_s8, so only offsets that
// vextq_s8 itself accepts are supported; confirm callers never rely on the
// larger offsets SSE allows.
#define _mm_alignr_epi8(a, b, imm) \
    ((__m128i) vextq_s8((int8x16_t)(b), (int8x16_t)(a), (imm)))
5275*3f1979aaSAndroid Build Coastguard Worker
5276*3f1979aaSAndroid Build Coastguard Worker // Compute the bitwise AND of 128 bits (representing integer data) in a and b,
5277*3f1979aaSAndroid Build Coastguard Worker // and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
5278*3f1979aaSAndroid Build Coastguard Worker // bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
5279*3f1979aaSAndroid Build Coastguard Worker // otherwise set CF to 0. Return the CF value.
5280*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testc_si128
_mm_testc_si128(__m128i a,__m128i b)5281*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE int _mm_testc_si128(__m128i a, __m128i b)
5282*3f1979aaSAndroid Build Coastguard Worker {
5283*3f1979aaSAndroid Build Coastguard Worker int64x2_t s64 =
5284*3f1979aaSAndroid Build Coastguard Worker vandq_s64(vreinterpretq_s64_s32(vmvnq_s32(vreinterpretq_s32_m128i(a))),
5285*3f1979aaSAndroid Build Coastguard Worker vreinterpretq_s64_m128i(b));
5286*3f1979aaSAndroid Build Coastguard Worker return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
5287*3f1979aaSAndroid Build Coastguard Worker }
5288*3f1979aaSAndroid Build Coastguard Worker
5289*3f1979aaSAndroid Build Coastguard Worker // Compute the bitwise AND of 128 bits (representing integer data) in a and b,
5290*3f1979aaSAndroid Build Coastguard Worker // and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
5291*3f1979aaSAndroid Build Coastguard Worker // bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
5292*3f1979aaSAndroid Build Coastguard Worker // otherwise set CF to 0. Return the ZF value.
5293*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testz_si128
_mm_testz_si128(__m128i a,__m128i b)5294*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE int _mm_testz_si128(__m128i a, __m128i b)
5295*3f1979aaSAndroid Build Coastguard Worker {
5296*3f1979aaSAndroid Build Coastguard Worker int64x2_t s64 =
5297*3f1979aaSAndroid Build Coastguard Worker vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b));
5298*3f1979aaSAndroid Build Coastguard Worker return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
5299*3f1979aaSAndroid Build Coastguard Worker }
5300*3f1979aaSAndroid Build Coastguard Worker
// Reads the 8-bit lane of a selected by the immediate. The lane is read as
// unsigned, so the value is zero-extended.
// FORCE_INLINE int _mm_extract_epi8(__m128i a, __constrange(0,16) int imm)
#define _mm_extract_epi8(vec, lane) \
    vgetq_lane_u8(vreinterpretq_u8_m128i(vec), (lane))
5305*3f1979aaSAndroid Build Coastguard Worker
// Returns a copy of a in which the 8-bit lane selected by the immediate is
// replaced by the least significant 8 bits of b.
// FORCE_INLINE __m128i _mm_insert_epi8(__m128i a, int b,
//                                      __constrange(0,16) int imm)
#define _mm_insert_epi8(vec, value, lane)                                  \
    __extension__({                                                        \
        vreinterpretq_m128i_s8(                                            \
            vsetq_lane_s8((value), vreinterpretq_s8_m128i(vec), (lane)));  \
    })
5315*3f1979aaSAndroid Build Coastguard Worker
// Reads the 16-bit lane of a selected by the immediate. The lane is read as
// unsigned, so the value is zero-extended.
// https://msdn.microsoft.com/en-us/library/6dceta0c(v=vs.100).aspx
// FORCE_INLINE int _mm_extract_epi16(__m128i a, __constrange(0,8) int imm)
#define _mm_extract_epi16(vec, lane) \
    vgetq_lane_u16(vreinterpretq_u16_m128i(vec), (lane))
5322*3f1979aaSAndroid Build Coastguard Worker
// Returns a copy of a in which the 16-bit lane selected by the immediate is
// replaced by the least significant 16 bits of b.
// https://msdn.microsoft.com/en-us/library/kaze8hz1%28v=vs.100%29.aspx
// FORCE_INLINE __m128i _mm_insert_epi16(__m128i a, int b,
//                                       __constrange(0,8) int imm)
#define _mm_insert_epi16(vec, value, lane)                                   \
    __extension__({                                                          \
        vreinterpretq_m128i_s16(                                             \
            vsetq_lane_s16((value), vreinterpretq_s16_m128i(vec), (lane)));  \
    })
5333*3f1979aaSAndroid Build Coastguard Worker
// Reads the 32-bit lane of a selected by the immediate and yields it as a
// signed 32-bit integer.
// FORCE_INLINE int _mm_extract_epi32(__m128i a, __constrange(0,4) int imm)
#define _mm_extract_epi32(vec, lane) \
    vgetq_lane_s32(vreinterpretq_s32_m128i(vec), (lane))
5339*3f1979aaSAndroid Build Coastguard Worker
// Reads the single-precision (32-bit) floating-point lane of a selected by
// the immediate. The lane's raw bit pattern is yielded as a 32-bit integer;
// no float-to-int conversion takes place.
// FORCE_INLINE int _mm_extract_ps(__m128 a, __constrange(0,4) int imm)
#define _mm_extract_ps(vec, lane) \
    vgetq_lane_s32(vreinterpretq_s32_m128(vec), (lane))
5343*3f1979aaSAndroid Build Coastguard Worker
// Returns a copy of a in which the 32-bit lane selected by the immediate is
// replaced by the least significant 32 bits of b.
// FORCE_INLINE __m128i _mm_insert_epi32(__m128i a, int b,
//                                       __constrange(0,4) int imm)
#define _mm_insert_epi32(vec, value, lane)                                   \
    __extension__({                                                          \
        vreinterpretq_m128i_s32(                                             \
            vsetq_lane_s32((value), vreinterpretq_s32_m128i(vec), (lane)));  \
    })
5353*3f1979aaSAndroid Build Coastguard Worker
// Reads the 64-bit lane of a selected by the immediate and yields it as a
// signed 64-bit integer.
// FORCE_INLINE __int64 _mm_extract_epi64(__m128i a, __constrange(0,2) int imm)
#define _mm_extract_epi64(vec, lane) \
    vgetq_lane_s64(vreinterpretq_s64_m128i(vec), (lane))
5359*3f1979aaSAndroid Build Coastguard Worker
// Returns a copy of a in which the 64-bit lane selected by the immediate is
// replaced by the 64-bit value b.
// FORCE_INLINE __m128i _mm_insert_epi64(__m128i a, __int64 b,
//                                       __constrange(0,2) int imm)
#define _mm_insert_epi64(vec, value, lane)                                   \
    __extension__({                                                          \
        vreinterpretq_m128i_s64(                                             \
            vsetq_lane_s64((value), vreinterpretq_s64_m128i(vec), (lane)));  \
    })
5369*3f1979aaSAndroid Build Coastguard Worker
5370*3f1979aaSAndroid Build Coastguard Worker // Count the number of bits set to 1 in unsigned 32-bit integer a, and
5371*3f1979aaSAndroid Build Coastguard Worker // return that count in dst.
5372*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u32
_mm_popcnt_u32(unsigned int a)5373*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE int _mm_popcnt_u32(unsigned int a)
5374*3f1979aaSAndroid Build Coastguard Worker {
5375*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
5376*3f1979aaSAndroid Build Coastguard Worker #if __has_builtin(__builtin_popcount)
5377*3f1979aaSAndroid Build Coastguard Worker return __builtin_popcount(a);
5378*3f1979aaSAndroid Build Coastguard Worker #else
5379*3f1979aaSAndroid Build Coastguard Worker return (int) vaddlv_u8(vcnt_u8(vcreate_u8((uint64_t) a)));
5380*3f1979aaSAndroid Build Coastguard Worker #endif
5381*3f1979aaSAndroid Build Coastguard Worker #else
5382*3f1979aaSAndroid Build Coastguard Worker uint32_t count = 0;
5383*3f1979aaSAndroid Build Coastguard Worker uint8x8_t input_val, count8x8_val;
5384*3f1979aaSAndroid Build Coastguard Worker uint16x4_t count16x4_val;
5385*3f1979aaSAndroid Build Coastguard Worker uint32x2_t count32x2_val;
5386*3f1979aaSAndroid Build Coastguard Worker
5387*3f1979aaSAndroid Build Coastguard Worker input_val = vld1_u8((uint8_t *) &a);
5388*3f1979aaSAndroid Build Coastguard Worker count8x8_val = vcnt_u8(input_val);
5389*3f1979aaSAndroid Build Coastguard Worker count16x4_val = vpaddl_u8(count8x8_val);
5390*3f1979aaSAndroid Build Coastguard Worker count32x2_val = vpaddl_u16(count16x4_val);
5391*3f1979aaSAndroid Build Coastguard Worker
5392*3f1979aaSAndroid Build Coastguard Worker vst1_u32(&count, count32x2_val);
5393*3f1979aaSAndroid Build Coastguard Worker return count;
5394*3f1979aaSAndroid Build Coastguard Worker #endif
5395*3f1979aaSAndroid Build Coastguard Worker }
5396*3f1979aaSAndroid Build Coastguard Worker
5397*3f1979aaSAndroid Build Coastguard Worker // Count the number of bits set to 1 in unsigned 64-bit integer a, and
5398*3f1979aaSAndroid Build Coastguard Worker // return that count in dst.
5399*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u64
_mm_popcnt_u64(uint64_t a)5400*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a)
5401*3f1979aaSAndroid Build Coastguard Worker {
5402*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
5403*3f1979aaSAndroid Build Coastguard Worker #if __has_builtin(__builtin_popcountll)
5404*3f1979aaSAndroid Build Coastguard Worker return __builtin_popcountll(a);
5405*3f1979aaSAndroid Build Coastguard Worker #else
5406*3f1979aaSAndroid Build Coastguard Worker return (int64_t) vaddlv_u8(vcnt_u8(vcreate_u8(a)));
5407*3f1979aaSAndroid Build Coastguard Worker #endif
5408*3f1979aaSAndroid Build Coastguard Worker #else
5409*3f1979aaSAndroid Build Coastguard Worker uint64_t count = 0;
5410*3f1979aaSAndroid Build Coastguard Worker uint8x8_t input_val, count8x8_val;
5411*3f1979aaSAndroid Build Coastguard Worker uint16x4_t count16x4_val;
5412*3f1979aaSAndroid Build Coastguard Worker uint32x2_t count32x2_val;
5413*3f1979aaSAndroid Build Coastguard Worker uint64x1_t count64x1_val;
5414*3f1979aaSAndroid Build Coastguard Worker
5415*3f1979aaSAndroid Build Coastguard Worker input_val = vld1_u8((uint8_t *) &a);
5416*3f1979aaSAndroid Build Coastguard Worker count8x8_val = vcnt_u8(input_val);
5417*3f1979aaSAndroid Build Coastguard Worker count16x4_val = vpaddl_u8(count8x8_val);
5418*3f1979aaSAndroid Build Coastguard Worker count32x2_val = vpaddl_u16(count16x4_val);
5419*3f1979aaSAndroid Build Coastguard Worker count64x1_val = vpaddl_u32(count32x2_val);
5420*3f1979aaSAndroid Build Coastguard Worker vst1_u64(&count, count64x1_val);
5421*3f1979aaSAndroid Build Coastguard Worker return count;
5422*3f1979aaSAndroid Build Coastguard Worker #endif
5423*3f1979aaSAndroid Build Coastguard Worker }
5424*3f1979aaSAndroid Build Coastguard Worker
// Macro: transposes, in place, the 4x4 matrix whose rows are the four
// single-precision (32-bit) floating-point vectors row0..row3. On exit row0
// holds column 0, row1 holds column 1, and so on. All four arguments must be
// assignable.
//
// vtrnq_f32 transposes each 2x2 sub-block of a row pair; the low and high
// halves of those results are then recombined into the output rows.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=MM_TRANSPOSE4_PS
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3)                     \
    do {                                                              \
        float32x4x2_t sse2neon_trn01_ = vtrnq_f32(row0, row1);        \
        float32x4x2_t sse2neon_trn23_ = vtrnq_f32(row2, row3);        \
        row0 = vcombine_f32(vget_low_f32(sse2neon_trn01_.val[0]),     \
                            vget_low_f32(sse2neon_trn23_.val[0]));    \
        row1 = vcombine_f32(vget_low_f32(sse2neon_trn01_.val[1]),     \
                            vget_low_f32(sse2neon_trn23_.val[1]));    \
        row2 = vcombine_f32(vget_high_f32(sse2neon_trn01_.val[0]),    \
                            vget_high_f32(sse2neon_trn23_.val[0]));   \
        row3 = vcombine_f32(vget_high_f32(sse2neon_trn01_.val[1]),    \
                            vget_high_f32(sse2neon_trn23_.val[1]));   \
    } while (0)
5442*3f1979aaSAndroid Build Coastguard Worker
5443*3f1979aaSAndroid Build Coastguard Worker /* Crypto Extensions */
5444*3f1979aaSAndroid Build Coastguard Worker
5445*3f1979aaSAndroid Build Coastguard Worker #if defined(__ARM_FEATURE_CRYPTO)
5446*3f1979aaSAndroid Build Coastguard Worker // Wraps vmull_p64
_sse2neon_vmull_p64(uint64x1_t _a,uint64x1_t _b)5447*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
5448*3f1979aaSAndroid Build Coastguard Worker {
5449*3f1979aaSAndroid Build Coastguard Worker poly64_t a = vget_lane_p64(vreinterpret_p64_u64(_a), 0);
5450*3f1979aaSAndroid Build Coastguard Worker poly64_t b = vget_lane_p64(vreinterpret_p64_u64(_b), 0);
5451*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_u64_p128(vmull_p64(a, b));
5452*3f1979aaSAndroid Build Coastguard Worker }
5453*3f1979aaSAndroid Build Coastguard Worker #else // ARMv7 polyfill
5454*3f1979aaSAndroid Build Coastguard Worker // ARMv7/some A64 lacks vmull_p64, but it has vmull_p8.
5455*3f1979aaSAndroid Build Coastguard Worker //
5456*3f1979aaSAndroid Build Coastguard Worker // vmull_p8 calculates 8 8-bit->16-bit polynomial multiplies, but we need a
5457*3f1979aaSAndroid Build Coastguard Worker // 64-bit->128-bit polynomial multiply.
5458*3f1979aaSAndroid Build Coastguard Worker //
5459*3f1979aaSAndroid Build Coastguard Worker // It needs some work and is somewhat slow, but it is still faster than all
5460*3f1979aaSAndroid Build Coastguard Worker // known scalar methods.
5461*3f1979aaSAndroid Build Coastguard Worker //
5462*3f1979aaSAndroid Build Coastguard Worker // Algorithm adapted to C from
5463*3f1979aaSAndroid Build Coastguard Worker // https://www.workofard.com/2017/07/ghash-for-low-end-cores/, which is adapted
5464*3f1979aaSAndroid Build Coastguard Worker // from "Fast Software Polynomial Multiplication on ARM Processors Using the
5465*3f1979aaSAndroid Build Coastguard Worker // NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and Ricardo Dahab
5466*3f1979aaSAndroid Build Coastguard Worker // (https://hal.inria.fr/hal-01506572)
_sse2neon_vmull_p64(uint64x1_t _a,uint64x1_t _b)5467*3f1979aaSAndroid Build Coastguard Worker static uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
5468*3f1979aaSAndroid Build Coastguard Worker {
5469*3f1979aaSAndroid Build Coastguard Worker poly8x8_t a = vreinterpret_p8_u64(_a);
5470*3f1979aaSAndroid Build Coastguard Worker poly8x8_t b = vreinterpret_p8_u64(_b);
5471*3f1979aaSAndroid Build Coastguard Worker
5472*3f1979aaSAndroid Build Coastguard Worker // Masks
5473*3f1979aaSAndroid Build Coastguard Worker uint8x16_t k48_32 = vcombine_u8(vcreate_u8(0x0000ffffffffffff),
5474*3f1979aaSAndroid Build Coastguard Worker vcreate_u8(0x00000000ffffffff));
5475*3f1979aaSAndroid Build Coastguard Worker uint8x16_t k16_00 = vcombine_u8(vcreate_u8(0x000000000000ffff),
5476*3f1979aaSAndroid Build Coastguard Worker vcreate_u8(0x0000000000000000));
5477*3f1979aaSAndroid Build Coastguard Worker
5478*3f1979aaSAndroid Build Coastguard Worker // Do the multiplies, rotating with vext to get all combinations
5479*3f1979aaSAndroid Build Coastguard Worker uint8x16_t d = vreinterpretq_u8_p16(vmull_p8(a, b)); // D = A0 * B0
5480*3f1979aaSAndroid Build Coastguard Worker uint8x16_t e =
5481*3f1979aaSAndroid Build Coastguard Worker vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 1))); // E = A0 * B1
5482*3f1979aaSAndroid Build Coastguard Worker uint8x16_t f =
5483*3f1979aaSAndroid Build Coastguard Worker vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 1), b)); // F = A1 * B0
5484*3f1979aaSAndroid Build Coastguard Worker uint8x16_t g =
5485*3f1979aaSAndroid Build Coastguard Worker vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 2))); // G = A0 * B2
5486*3f1979aaSAndroid Build Coastguard Worker uint8x16_t h =
5487*3f1979aaSAndroid Build Coastguard Worker vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 2), b)); // H = A2 * B0
5488*3f1979aaSAndroid Build Coastguard Worker uint8x16_t i =
5489*3f1979aaSAndroid Build Coastguard Worker vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 3))); // I = A0 * B3
5490*3f1979aaSAndroid Build Coastguard Worker uint8x16_t j =
5491*3f1979aaSAndroid Build Coastguard Worker vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 3), b)); // J = A3 * B0
5492*3f1979aaSAndroid Build Coastguard Worker uint8x16_t k =
5493*3f1979aaSAndroid Build Coastguard Worker vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 4))); // L = A0 * B4
5494*3f1979aaSAndroid Build Coastguard Worker
5495*3f1979aaSAndroid Build Coastguard Worker // Add cross products
5496*3f1979aaSAndroid Build Coastguard Worker uint8x16_t l = veorq_u8(e, f); // L = E + F
5497*3f1979aaSAndroid Build Coastguard Worker uint8x16_t m = veorq_u8(g, h); // M = G + H
5498*3f1979aaSAndroid Build Coastguard Worker uint8x16_t n = veorq_u8(i, j); // N = I + J
5499*3f1979aaSAndroid Build Coastguard Worker
5500*3f1979aaSAndroid Build Coastguard Worker // Interleave. Using vzip1 and vzip2 prevents Clang from emitting TBL
5501*3f1979aaSAndroid Build Coastguard Worker // instructions.
5502*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
5503*3f1979aaSAndroid Build Coastguard Worker uint8x16_t lm_p0 = vreinterpretq_u8_u64(
5504*3f1979aaSAndroid Build Coastguard Worker vzip1q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
5505*3f1979aaSAndroid Build Coastguard Worker uint8x16_t lm_p1 = vreinterpretq_u8_u64(
5506*3f1979aaSAndroid Build Coastguard Worker vzip2q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
5507*3f1979aaSAndroid Build Coastguard Worker uint8x16_t nk_p0 = vreinterpretq_u8_u64(
5508*3f1979aaSAndroid Build Coastguard Worker vzip1q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
5509*3f1979aaSAndroid Build Coastguard Worker uint8x16_t nk_p1 = vreinterpretq_u8_u64(
5510*3f1979aaSAndroid Build Coastguard Worker vzip2q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
5511*3f1979aaSAndroid Build Coastguard Worker #else
5512*3f1979aaSAndroid Build Coastguard Worker uint8x16_t lm_p0 = vcombine_u8(vget_low_u8(l), vget_low_u8(m));
5513*3f1979aaSAndroid Build Coastguard Worker uint8x16_t lm_p1 = vcombine_u8(vget_high_u8(l), vget_high_u8(m));
5514*3f1979aaSAndroid Build Coastguard Worker uint8x16_t nk_p0 = vcombine_u8(vget_low_u8(n), vget_low_u8(k));
5515*3f1979aaSAndroid Build Coastguard Worker uint8x16_t nk_p1 = vcombine_u8(vget_high_u8(n), vget_high_u8(k));
5516*3f1979aaSAndroid Build Coastguard Worker #endif
5517*3f1979aaSAndroid Build Coastguard Worker // t0 = (L) (P0 + P1) << 8
5518*3f1979aaSAndroid Build Coastguard Worker // t1 = (M) (P2 + P3) << 16
5519*3f1979aaSAndroid Build Coastguard Worker uint8x16_t t0t1_tmp = veorq_u8(lm_p0, lm_p1);
5520*3f1979aaSAndroid Build Coastguard Worker uint8x16_t t0t1_h = vandq_u8(lm_p1, k48_32);
5521*3f1979aaSAndroid Build Coastguard Worker uint8x16_t t0t1_l = veorq_u8(t0t1_tmp, t0t1_h);
5522*3f1979aaSAndroid Build Coastguard Worker
5523*3f1979aaSAndroid Build Coastguard Worker // t2 = (N) (P4 + P5) << 24
5524*3f1979aaSAndroid Build Coastguard Worker // t3 = (K) (P6 + P7) << 32
5525*3f1979aaSAndroid Build Coastguard Worker uint8x16_t t2t3_tmp = veorq_u8(nk_p0, nk_p1);
5526*3f1979aaSAndroid Build Coastguard Worker uint8x16_t t2t3_h = vandq_u8(nk_p1, k16_00);
5527*3f1979aaSAndroid Build Coastguard Worker uint8x16_t t2t3_l = veorq_u8(t2t3_tmp, t2t3_h);
5528*3f1979aaSAndroid Build Coastguard Worker
5529*3f1979aaSAndroid Build Coastguard Worker // De-interleave
5530*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
5531*3f1979aaSAndroid Build Coastguard Worker uint8x16_t t0 = vreinterpretq_u8_u64(
5532*3f1979aaSAndroid Build Coastguard Worker vuzp1q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
5533*3f1979aaSAndroid Build Coastguard Worker uint8x16_t t1 = vreinterpretq_u8_u64(
5534*3f1979aaSAndroid Build Coastguard Worker vuzp2q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
5535*3f1979aaSAndroid Build Coastguard Worker uint8x16_t t2 = vreinterpretq_u8_u64(
5536*3f1979aaSAndroid Build Coastguard Worker vuzp1q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
5537*3f1979aaSAndroid Build Coastguard Worker uint8x16_t t3 = vreinterpretq_u8_u64(
5538*3f1979aaSAndroid Build Coastguard Worker vuzp2q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
5539*3f1979aaSAndroid Build Coastguard Worker #else
5540*3f1979aaSAndroid Build Coastguard Worker uint8x16_t t1 = vcombine_u8(vget_high_u8(t0t1_l), vget_high_u8(t0t1_h));
5541*3f1979aaSAndroid Build Coastguard Worker uint8x16_t t0 = vcombine_u8(vget_low_u8(t0t1_l), vget_low_u8(t0t1_h));
5542*3f1979aaSAndroid Build Coastguard Worker uint8x16_t t3 = vcombine_u8(vget_high_u8(t2t3_l), vget_high_u8(t2t3_h));
5543*3f1979aaSAndroid Build Coastguard Worker uint8x16_t t2 = vcombine_u8(vget_low_u8(t2t3_l), vget_low_u8(t2t3_h));
5544*3f1979aaSAndroid Build Coastguard Worker #endif
5545*3f1979aaSAndroid Build Coastguard Worker // Shift the cross products
5546*3f1979aaSAndroid Build Coastguard Worker uint8x16_t t0_shift = vextq_u8(t0, t0, 15); // t0 << 8
5547*3f1979aaSAndroid Build Coastguard Worker uint8x16_t t1_shift = vextq_u8(t1, t1, 14); // t1 << 16
5548*3f1979aaSAndroid Build Coastguard Worker uint8x16_t t2_shift = vextq_u8(t2, t2, 13); // t2 << 24
5549*3f1979aaSAndroid Build Coastguard Worker uint8x16_t t3_shift = vextq_u8(t3, t3, 12); // t3 << 32
5550*3f1979aaSAndroid Build Coastguard Worker
5551*3f1979aaSAndroid Build Coastguard Worker // Accumulate the products
5552*3f1979aaSAndroid Build Coastguard Worker uint8x16_t cross1 = veorq_u8(t0_shift, t1_shift);
5553*3f1979aaSAndroid Build Coastguard Worker uint8x16_t cross2 = veorq_u8(t2_shift, t3_shift);
5554*3f1979aaSAndroid Build Coastguard Worker uint8x16_t mix = veorq_u8(d, cross1);
5555*3f1979aaSAndroid Build Coastguard Worker uint8x16_t r = veorq_u8(mix, cross2);
5556*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_u64_u8(r);
5557*3f1979aaSAndroid Build Coastguard Worker }
5558*3f1979aaSAndroid Build Coastguard Worker #endif // ARMv7 polyfill
5559*3f1979aaSAndroid Build Coastguard Worker
_mm_clmulepi64_si128(__m128i _a,__m128i _b,const int imm)5560*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_clmulepi64_si128(__m128i _a, __m128i _b, const int imm)
5561*3f1979aaSAndroid Build Coastguard Worker {
5562*3f1979aaSAndroid Build Coastguard Worker uint64x2_t a = vreinterpretq_u64_m128i(_a);
5563*3f1979aaSAndroid Build Coastguard Worker uint64x2_t b = vreinterpretq_u64_m128i(_b);
5564*3f1979aaSAndroid Build Coastguard Worker switch (imm & 0x11) {
5565*3f1979aaSAndroid Build Coastguard Worker case 0x00:
5566*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_u64(
5567*3f1979aaSAndroid Build Coastguard Worker _sse2neon_vmull_p64(vget_low_u64(a), vget_low_u64(b)));
5568*3f1979aaSAndroid Build Coastguard Worker case 0x01:
5569*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_u64(
5570*3f1979aaSAndroid Build Coastguard Worker _sse2neon_vmull_p64(vget_high_u64(a), vget_low_u64(b)));
5571*3f1979aaSAndroid Build Coastguard Worker case 0x10:
5572*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_u64(
5573*3f1979aaSAndroid Build Coastguard Worker _sse2neon_vmull_p64(vget_low_u64(a), vget_high_u64(b)));
5574*3f1979aaSAndroid Build Coastguard Worker case 0x11:
5575*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_u64(
5576*3f1979aaSAndroid Build Coastguard Worker _sse2neon_vmull_p64(vget_high_u64(a), vget_high_u64(b)));
5577*3f1979aaSAndroid Build Coastguard Worker default:
5578*3f1979aaSAndroid Build Coastguard Worker abort();
5579*3f1979aaSAndroid Build Coastguard Worker }
5580*3f1979aaSAndroid Build Coastguard Worker }
5581*3f1979aaSAndroid Build Coastguard Worker
5582*3f1979aaSAndroid Build Coastguard Worker #if !defined(__ARM_FEATURE_CRYPTO)
/* clang-format off */
/* The 256-entry AES substitution box, written as an X macro: the expansion is
 * a brace-enclosed initializer list in which every byte value is wrapped as
 * fn(value). Entries appear in index order, eight per row, so row r starts at
 * index 8 * r.
 * See https://en.wikipedia.org/wiki/X_Macro
 */
#define SSE2NEON_AES_DATA(fn)                                                \
    {                                                                        \
        fn(0x63), fn(0x7c), fn(0x77), fn(0x7b), fn(0xf2), fn(0x6b), fn(0x6f), fn(0xc5), \
        fn(0x30), fn(0x01), fn(0x67), fn(0x2b), fn(0xfe), fn(0xd7), fn(0xab), fn(0x76), \
        fn(0xca), fn(0x82), fn(0xc9), fn(0x7d), fn(0xfa), fn(0x59), fn(0x47), fn(0xf0), \
        fn(0xad), fn(0xd4), fn(0xa2), fn(0xaf), fn(0x9c), fn(0xa4), fn(0x72), fn(0xc0), \
        fn(0xb7), fn(0xfd), fn(0x93), fn(0x26), fn(0x36), fn(0x3f), fn(0xf7), fn(0xcc), \
        fn(0x34), fn(0xa5), fn(0xe5), fn(0xf1), fn(0x71), fn(0xd8), fn(0x31), fn(0x15), \
        fn(0x04), fn(0xc7), fn(0x23), fn(0xc3), fn(0x18), fn(0x96), fn(0x05), fn(0x9a), \
        fn(0x07), fn(0x12), fn(0x80), fn(0xe2), fn(0xeb), fn(0x27), fn(0xb2), fn(0x75), \
        fn(0x09), fn(0x83), fn(0x2c), fn(0x1a), fn(0x1b), fn(0x6e), fn(0x5a), fn(0xa0), \
        fn(0x52), fn(0x3b), fn(0xd6), fn(0xb3), fn(0x29), fn(0xe3), fn(0x2f), fn(0x84), \
        fn(0x53), fn(0xd1), fn(0x00), fn(0xed), fn(0x20), fn(0xfc), fn(0xb1), fn(0x5b), \
        fn(0x6a), fn(0xcb), fn(0xbe), fn(0x39), fn(0x4a), fn(0x4c), fn(0x58), fn(0xcf), \
        fn(0xd0), fn(0xef), fn(0xaa), fn(0xfb), fn(0x43), fn(0x4d), fn(0x33), fn(0x85), \
        fn(0x45), fn(0xf9), fn(0x02), fn(0x7f), fn(0x50), fn(0x3c), fn(0x9f), fn(0xa8), \
        fn(0x51), fn(0xa3), fn(0x40), fn(0x8f), fn(0x92), fn(0x9d), fn(0x38), fn(0xf5), \
        fn(0xbc), fn(0xb6), fn(0xda), fn(0x21), fn(0x10), fn(0xff), fn(0xf3), fn(0xd2), \
        fn(0xcd), fn(0x0c), fn(0x13), fn(0xec), fn(0x5f), fn(0x97), fn(0x44), fn(0x17), \
        fn(0xc4), fn(0xa7), fn(0x7e), fn(0x3d), fn(0x64), fn(0x5d), fn(0x19), fn(0x73), \
        fn(0x60), fn(0x81), fn(0x4f), fn(0xdc), fn(0x22), fn(0x2a), fn(0x90), fn(0x88), \
        fn(0x46), fn(0xee), fn(0xb8), fn(0x14), fn(0xde), fn(0x5e), fn(0x0b), fn(0xdb), \
        fn(0xe0), fn(0x32), fn(0x3a), fn(0x0a), fn(0x49), fn(0x06), fn(0x24), fn(0x5c), \
        fn(0xc2), fn(0xd3), fn(0xac), fn(0x62), fn(0x91), fn(0x95), fn(0xe4), fn(0x79), \
        fn(0xe7), fn(0xc8), fn(0x37), fn(0x6d), fn(0x8d), fn(0xd5), fn(0x4e), fn(0xa9), \
        fn(0x6c), fn(0x56), fn(0xf4), fn(0xea), fn(0x65), fn(0x7a), fn(0xae), fn(0x08), \
        fn(0xba), fn(0x78), fn(0x25), fn(0x2e), fn(0x1c), fn(0xa6), fn(0xb4), fn(0xc6), \
        fn(0xe8), fn(0xdd), fn(0x74), fn(0x1f), fn(0x4b), fn(0xbd), fn(0x8b), fn(0x8a), \
        fn(0x70), fn(0x3e), fn(0xb5), fn(0x66), fn(0x48), fn(0x03), fn(0xf6), fn(0x0e), \
        fn(0x61), fn(0x35), fn(0x57), fn(0xb9), fn(0x86), fn(0xc1), fn(0x1d), fn(0x9e), \
        fn(0xe1), fn(0xf8), fn(0x98), fn(0x11), fn(0x69), fn(0xd9), fn(0x8e), fn(0x94), \
        fn(0x9b), fn(0x1e), fn(0x87), fn(0xe9), fn(0xce), fn(0x55), fn(0x28), fn(0xdf), \
        fn(0x8c), fn(0xa1), fn(0x89), fn(0x0d), fn(0xbf), fn(0xe6), fn(0x42), fn(0x68), \
        fn(0x41), fn(0x99), fn(0x2d), fn(0x0f), fn(0xb0), fn(0x54), fn(0xbb), fn(0x16)  \
    }
/* clang-format on */

/* Instantiate the raw S-box: the identity wrapper leaves each byte as-is. */
#define SSE2NEON_AES_H0(x) (x)
static const uint8_t SSE2NEON_sbox[256] = SSE2NEON_AES_DATA(SSE2NEON_AES_H0);
#undef SSE2NEON_AES_H0
5630*3f1979aaSAndroid Build Coastguard Worker
5631*3f1979aaSAndroid Build Coastguard Worker // In the absence of crypto extensions, implement aesenc using regular neon
5632*3f1979aaSAndroid Build Coastguard Worker // intrinsics instead. See:
5633*3f1979aaSAndroid Build Coastguard Worker // https://www.workofard.com/2017/01/accelerated-aes-for-the-arm64-linux-kernel/
5634*3f1979aaSAndroid Build Coastguard Worker // https://www.workofard.com/2017/07/ghash-for-low-end-cores/ and
5635*3f1979aaSAndroid Build Coastguard Worker // https://github.com/ColinIanKing/linux-next-mirror/blob/b5f466091e130caaf0735976648f72bd5e09aa84/crypto/aegis128-neon-inner.c#L52
5636*3f1979aaSAndroid Build Coastguard Worker // for more information Reproduced with permission of the author.
_mm_aesenc_si128(__m128i EncBlock,__m128i RoundKey)5637*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_aesenc_si128(__m128i EncBlock, __m128i RoundKey)
5638*3f1979aaSAndroid Build Coastguard Worker {
5639*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__)
5640*3f1979aaSAndroid Build Coastguard Worker static const uint8_t shift_rows[] = {0x0, 0x5, 0xa, 0xf, 0x4, 0x9,
5641*3f1979aaSAndroid Build Coastguard Worker 0xe, 0x3, 0x8, 0xd, 0x2, 0x7,
5642*3f1979aaSAndroid Build Coastguard Worker 0xc, 0x1, 0x6, 0xb};
5643*3f1979aaSAndroid Build Coastguard Worker static const uint8_t ror32by8[] = {0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
5644*3f1979aaSAndroid Build Coastguard Worker 0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc};
5645*3f1979aaSAndroid Build Coastguard Worker
5646*3f1979aaSAndroid Build Coastguard Worker uint8x16_t v;
5647*3f1979aaSAndroid Build Coastguard Worker uint8x16_t w = vreinterpretq_u8_m128i(EncBlock);
5648*3f1979aaSAndroid Build Coastguard Worker
5649*3f1979aaSAndroid Build Coastguard Worker // shift rows
5650*3f1979aaSAndroid Build Coastguard Worker w = vqtbl1q_u8(w, vld1q_u8(shift_rows));
5651*3f1979aaSAndroid Build Coastguard Worker
5652*3f1979aaSAndroid Build Coastguard Worker // sub bytes
5653*3f1979aaSAndroid Build Coastguard Worker v = vqtbl4q_u8(vld1q_u8_x4(SSE2NEON_sbox), w);
5654*3f1979aaSAndroid Build Coastguard Worker v = vqtbx4q_u8(v, vld1q_u8_x4(SSE2NEON_sbox + 0x40), w - 0x40);
5655*3f1979aaSAndroid Build Coastguard Worker v = vqtbx4q_u8(v, vld1q_u8_x4(SSE2NEON_sbox + 0x80), w - 0x80);
5656*3f1979aaSAndroid Build Coastguard Worker v = vqtbx4q_u8(v, vld1q_u8_x4(SSE2NEON_sbox + 0xc0), w - 0xc0);
5657*3f1979aaSAndroid Build Coastguard Worker
5658*3f1979aaSAndroid Build Coastguard Worker // mix columns
5659*3f1979aaSAndroid Build Coastguard Worker w = (v << 1) ^ (uint8x16_t)(((int8x16_t) v >> 7) & 0x1b);
5660*3f1979aaSAndroid Build Coastguard Worker w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
5661*3f1979aaSAndroid Build Coastguard Worker w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
5662*3f1979aaSAndroid Build Coastguard Worker
5663*3f1979aaSAndroid Build Coastguard Worker // add round key
5664*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_u8(w) ^ RoundKey;
5665*3f1979aaSAndroid Build Coastguard Worker
5666*3f1979aaSAndroid Build Coastguard Worker #else /* ARMv7-A NEON implementation */
5667*3f1979aaSAndroid Build Coastguard Worker #define SSE2NEON_AES_B2W(b0, b1, b2, b3) \
5668*3f1979aaSAndroid Build Coastguard Worker (((uint32_t)(b3) << 24) | ((uint32_t)(b2) << 16) | ((uint32_t)(b1) << 8) | \
5669*3f1979aaSAndroid Build Coastguard Worker (b0))
5670*3f1979aaSAndroid Build Coastguard Worker #define SSE2NEON_AES_F2(x) ((x << 1) ^ (((x >> 7) & 1) * 0x011b /* WPOLY */))
5671*3f1979aaSAndroid Build Coastguard Worker #define SSE2NEON_AES_F3(x) (SSE2NEON_AES_F2(x) ^ x)
5672*3f1979aaSAndroid Build Coastguard Worker #define SSE2NEON_AES_U0(p) \
5673*3f1979aaSAndroid Build Coastguard Worker SSE2NEON_AES_B2W(SSE2NEON_AES_F2(p), p, p, SSE2NEON_AES_F3(p))
5674*3f1979aaSAndroid Build Coastguard Worker #define SSE2NEON_AES_U1(p) \
5675*3f1979aaSAndroid Build Coastguard Worker SSE2NEON_AES_B2W(SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p, p)
5676*3f1979aaSAndroid Build Coastguard Worker #define SSE2NEON_AES_U2(p) \
5677*3f1979aaSAndroid Build Coastguard Worker SSE2NEON_AES_B2W(p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p)
5678*3f1979aaSAndroid Build Coastguard Worker #define SSE2NEON_AES_U3(p) \
5679*3f1979aaSAndroid Build Coastguard Worker SSE2NEON_AES_B2W(p, p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p))
5680*3f1979aaSAndroid Build Coastguard Worker static const uint32_t ALIGN_STRUCT(16) aes_table[4][256] = {
5681*3f1979aaSAndroid Build Coastguard Worker SSE2NEON_AES_DATA(SSE2NEON_AES_U0),
5682*3f1979aaSAndroid Build Coastguard Worker SSE2NEON_AES_DATA(SSE2NEON_AES_U1),
5683*3f1979aaSAndroid Build Coastguard Worker SSE2NEON_AES_DATA(SSE2NEON_AES_U2),
5684*3f1979aaSAndroid Build Coastguard Worker SSE2NEON_AES_DATA(SSE2NEON_AES_U3),
5685*3f1979aaSAndroid Build Coastguard Worker };
5686*3f1979aaSAndroid Build Coastguard Worker #undef SSE2NEON_AES_B2W
5687*3f1979aaSAndroid Build Coastguard Worker #undef SSE2NEON_AES_F2
5688*3f1979aaSAndroid Build Coastguard Worker #undef SSE2NEON_AES_F3
5689*3f1979aaSAndroid Build Coastguard Worker #undef SSE2NEON_AES_U0
5690*3f1979aaSAndroid Build Coastguard Worker #undef SSE2NEON_AES_U1
5691*3f1979aaSAndroid Build Coastguard Worker #undef SSE2NEON_AES_U2
5692*3f1979aaSAndroid Build Coastguard Worker #undef SSE2NEON_AES_U3
5693*3f1979aaSAndroid Build Coastguard Worker
5694*3f1979aaSAndroid Build Coastguard Worker uint32_t x0 = _mm_cvtsi128_si32(EncBlock);
5695*3f1979aaSAndroid Build Coastguard Worker uint32_t x1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0x55));
5696*3f1979aaSAndroid Build Coastguard Worker uint32_t x2 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0xAA));
5697*3f1979aaSAndroid Build Coastguard Worker uint32_t x3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0xFF));
5698*3f1979aaSAndroid Build Coastguard Worker
5699*3f1979aaSAndroid Build Coastguard Worker __m128i out = _mm_set_epi32(
5700*3f1979aaSAndroid Build Coastguard Worker (aes_table[0][x3 & 0xff] ^ aes_table[1][(x0 >> 8) & 0xff] ^
5701*3f1979aaSAndroid Build Coastguard Worker aes_table[2][(x1 >> 16) & 0xff] ^ aes_table[3][x2 >> 24]),
5702*3f1979aaSAndroid Build Coastguard Worker (aes_table[0][x2 & 0xff] ^ aes_table[1][(x3 >> 8) & 0xff] ^
5703*3f1979aaSAndroid Build Coastguard Worker aes_table[2][(x0 >> 16) & 0xff] ^ aes_table[3][x1 >> 24]),
5704*3f1979aaSAndroid Build Coastguard Worker (aes_table[0][x1 & 0xff] ^ aes_table[1][(x2 >> 8) & 0xff] ^
5705*3f1979aaSAndroid Build Coastguard Worker aes_table[2][(x3 >> 16) & 0xff] ^ aes_table[3][x0 >> 24]),
5706*3f1979aaSAndroid Build Coastguard Worker (aes_table[0][x0 & 0xff] ^ aes_table[1][(x1 >> 8) & 0xff] ^
5707*3f1979aaSAndroid Build Coastguard Worker aes_table[2][(x2 >> 16) & 0xff] ^ aes_table[3][x3 >> 24]));
5708*3f1979aaSAndroid Build Coastguard Worker
5709*3f1979aaSAndroid Build Coastguard Worker return _mm_xor_si128(out, RoundKey);
5710*3f1979aaSAndroid Build Coastguard Worker #endif
5711*3f1979aaSAndroid Build Coastguard Worker }
5712*3f1979aaSAndroid Build Coastguard Worker
_mm_aesenclast_si128(__m128i a,__m128i RoundKey)5713*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
5714*3f1979aaSAndroid Build Coastguard Worker {
5715*3f1979aaSAndroid Build Coastguard Worker /* FIXME: optimized for NEON */
5716*3f1979aaSAndroid Build Coastguard Worker uint8_t v[4][4] = {
5717*3f1979aaSAndroid Build Coastguard Worker [0] = {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 0)],
5718*3f1979aaSAndroid Build Coastguard Worker SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 5)],
5719*3f1979aaSAndroid Build Coastguard Worker SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 10)],
5720*3f1979aaSAndroid Build Coastguard Worker SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 15)]},
5721*3f1979aaSAndroid Build Coastguard Worker [1] = {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 4)],
5722*3f1979aaSAndroid Build Coastguard Worker SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 9)],
5723*3f1979aaSAndroid Build Coastguard Worker SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 14)],
5724*3f1979aaSAndroid Build Coastguard Worker SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 3)]},
5725*3f1979aaSAndroid Build Coastguard Worker [2] = {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 8)],
5726*3f1979aaSAndroid Build Coastguard Worker SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 13)],
5727*3f1979aaSAndroid Build Coastguard Worker SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 2)],
5728*3f1979aaSAndroid Build Coastguard Worker SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 7)]},
5729*3f1979aaSAndroid Build Coastguard Worker [3] = {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 12)],
5730*3f1979aaSAndroid Build Coastguard Worker SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 1)],
5731*3f1979aaSAndroid Build Coastguard Worker SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 6)],
5732*3f1979aaSAndroid Build Coastguard Worker SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 11)]},
5733*3f1979aaSAndroid Build Coastguard Worker };
5734*3f1979aaSAndroid Build Coastguard Worker for (int i = 0; i < 16; i++)
5735*3f1979aaSAndroid Build Coastguard Worker vreinterpretq_nth_u8_m128i(a, i) =
5736*3f1979aaSAndroid Build Coastguard Worker v[i / 4][i % 4] ^ vreinterpretq_nth_u8_m128i(RoundKey, i);
5737*3f1979aaSAndroid Build Coastguard Worker return a;
5738*3f1979aaSAndroid Build Coastguard Worker }
5739*3f1979aaSAndroid Build Coastguard Worker
5740*3f1979aaSAndroid Build Coastguard Worker // Emits the Advanced Encryption Standard (AES) instruction aeskeygenassist.
5741*3f1979aaSAndroid Build Coastguard Worker // This instruction generates a round key for AES encryption. See
5742*3f1979aaSAndroid Build Coastguard Worker // https://kazakov.life/2017/11/01/cryptocurrency-mining-on-ios-devices/
5743*3f1979aaSAndroid Build Coastguard Worker // for details.
5744*3f1979aaSAndroid Build Coastguard Worker //
5745*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/cc714138(v=vs.120).aspx
_mm_aeskeygenassist_si128(__m128i key,const int rcon)5746*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i key, const int rcon)
5747*3f1979aaSAndroid Build Coastguard Worker {
5748*3f1979aaSAndroid Build Coastguard Worker uint32_t X1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0x55));
5749*3f1979aaSAndroid Build Coastguard Worker uint32_t X3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0xFF));
5750*3f1979aaSAndroid Build Coastguard Worker for (int i = 0; i < 4; ++i) {
5751*3f1979aaSAndroid Build Coastguard Worker ((uint8_t *) &X1)[i] = SSE2NEON_sbox[((uint8_t *) &X1)[i]];
5752*3f1979aaSAndroid Build Coastguard Worker ((uint8_t *) &X3)[i] = SSE2NEON_sbox[((uint8_t *) &X3)[i]];
5753*3f1979aaSAndroid Build Coastguard Worker }
5754*3f1979aaSAndroid Build Coastguard Worker return _mm_set_epi32(((X3 >> 8) | (X3 << 24)) ^ rcon, X3,
5755*3f1979aaSAndroid Build Coastguard Worker ((X1 >> 8) | (X1 << 24)) ^ rcon, X1);
5756*3f1979aaSAndroid Build Coastguard Worker }
5757*3f1979aaSAndroid Build Coastguard Worker #undef SSE2NEON_AES_DATA
5758*3f1979aaSAndroid Build Coastguard Worker
5759*3f1979aaSAndroid Build Coastguard Worker #else /* __ARM_FEATURE_CRYPTO */
5760*3f1979aaSAndroid Build Coastguard Worker // Implements equivalent of 'aesenc' by combining AESE (with an empty key) and
5761*3f1979aaSAndroid Build Coastguard Worker // AESMC and then manually applying the real key as an xor operation. This
5762*3f1979aaSAndroid Build Coastguard Worker // unfortunately means an additional xor op; the compiler should be able to
5763*3f1979aaSAndroid Build Coastguard Worker // optimize this away for repeated calls however. See
5764*3f1979aaSAndroid Build Coastguard Worker // https://blog.michaelbrase.com/2018/05/08/emulating-x86-aes-intrinsics-on-armv8-a
5765*3f1979aaSAndroid Build Coastguard Worker // for more details.
_mm_aesenc_si128(__m128i a,__m128i b)5766*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i b)
5767*3f1979aaSAndroid Build Coastguard Worker {
5768*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_u8(
5769*3f1979aaSAndroid Build Coastguard Worker vaesmcq_u8(vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))) ^
5770*3f1979aaSAndroid Build Coastguard Worker vreinterpretq_u8_m128i(b));
5771*3f1979aaSAndroid Build Coastguard Worker }
5772*3f1979aaSAndroid Build Coastguard Worker
5773*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_aesenclast_si128
_mm_aesenclast_si128(__m128i a,__m128i RoundKey)5774*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
5775*3f1979aaSAndroid Build Coastguard Worker {
5776*3f1979aaSAndroid Build Coastguard Worker return _mm_xor_si128(vreinterpretq_m128i_u8(vaeseq_u8(
5777*3f1979aaSAndroid Build Coastguard Worker vreinterpretq_u8_m128i(a), vdupq_n_u8(0))),
5778*3f1979aaSAndroid Build Coastguard Worker RoundKey);
5779*3f1979aaSAndroid Build Coastguard Worker }
5780*3f1979aaSAndroid Build Coastguard Worker
_mm_aeskeygenassist_si128(__m128i a,const int rcon)5781*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
5782*3f1979aaSAndroid Build Coastguard Worker {
5783*3f1979aaSAndroid Build Coastguard Worker // AESE does ShiftRows and SubBytes on A
5784*3f1979aaSAndroid Build Coastguard Worker uint8x16_t u8 = vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0));
5785*3f1979aaSAndroid Build Coastguard Worker
5786*3f1979aaSAndroid Build Coastguard Worker uint8x16_t dest = {
5787*3f1979aaSAndroid Build Coastguard Worker // Undo ShiftRows step from AESE and extract X1 and X3
5788*3f1979aaSAndroid Build Coastguard Worker u8[0x4], u8[0x1], u8[0xE], u8[0xB], // SubBytes(X1)
5789*3f1979aaSAndroid Build Coastguard Worker u8[0x1], u8[0xE], u8[0xB], u8[0x4], // ROT(SubBytes(X1))
5790*3f1979aaSAndroid Build Coastguard Worker u8[0xC], u8[0x9], u8[0x6], u8[0x3], // SubBytes(X3)
5791*3f1979aaSAndroid Build Coastguard Worker u8[0x9], u8[0x6], u8[0x3], u8[0xC], // ROT(SubBytes(X3))
5792*3f1979aaSAndroid Build Coastguard Worker };
5793*3f1979aaSAndroid Build Coastguard Worker uint32x4_t r = {0, (unsigned) rcon, 0, (unsigned) rcon};
5794*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_u8(dest) ^ vreinterpretq_m128i_u32(r);
5795*3f1979aaSAndroid Build Coastguard Worker }
5796*3f1979aaSAndroid Build Coastguard Worker #endif
5797*3f1979aaSAndroid Build Coastguard Worker
5798*3f1979aaSAndroid Build Coastguard Worker /* Streaming Extensions */
5799*3f1979aaSAndroid Build Coastguard Worker
5800*3f1979aaSAndroid Build Coastguard Worker // Guarantees that every preceding store is globally visible before any
5801*3f1979aaSAndroid Build Coastguard Worker // subsequent store.
5802*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/5h2w73d1%28v=vs.90%29.aspx
_mm_sfence(void)5803*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE void _mm_sfence(void)
5804*3f1979aaSAndroid Build Coastguard Worker {
5805*3f1979aaSAndroid Build Coastguard Worker __sync_synchronize();
5806*3f1979aaSAndroid Build Coastguard Worker }
5807*3f1979aaSAndroid Build Coastguard Worker
5808*3f1979aaSAndroid Build Coastguard Worker // Store 128-bits (composed of 4 packed single-precision (32-bit) floating-
5809*3f1979aaSAndroid Build Coastguard Worker // point elements) from a into memory using a non-temporal memory hint.
5810*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_ps
_mm_stream_ps(float * p,__m128 a)5811*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE void _mm_stream_ps(float *p, __m128 a)
5812*3f1979aaSAndroid Build Coastguard Worker {
5813*3f1979aaSAndroid Build Coastguard Worker #if __has_builtin(__builtin_nontemporal_store)
5814*3f1979aaSAndroid Build Coastguard Worker __builtin_nontemporal_store(a, (float32x4_t *) p);
5815*3f1979aaSAndroid Build Coastguard Worker #else
5816*3f1979aaSAndroid Build Coastguard Worker vst1q_f32(p, vreinterpretq_f32_m128(a));
5817*3f1979aaSAndroid Build Coastguard Worker #endif
5818*3f1979aaSAndroid Build Coastguard Worker }
5819*3f1979aaSAndroid Build Coastguard Worker
5820*3f1979aaSAndroid Build Coastguard Worker // Stores the data in a to the address p without polluting the caches. If the
5821*3f1979aaSAndroid Build Coastguard Worker // cache line containing address p is already in the cache, the cache will be
5822*3f1979aaSAndroid Build Coastguard Worker // updated.
5823*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/ba08y07y%28v=vs.90%29.aspx
_mm_stream_si128(__m128i * p,__m128i a)5824*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE void _mm_stream_si128(__m128i *p, __m128i a)
5825*3f1979aaSAndroid Build Coastguard Worker {
5826*3f1979aaSAndroid Build Coastguard Worker #if __has_builtin(__builtin_nontemporal_store)
5827*3f1979aaSAndroid Build Coastguard Worker __builtin_nontemporal_store(a, p);
5828*3f1979aaSAndroid Build Coastguard Worker #else
5829*3f1979aaSAndroid Build Coastguard Worker vst1q_s64((int64_t *) p, vreinterpretq_s64_m128i(a));
5830*3f1979aaSAndroid Build Coastguard Worker #endif
5831*3f1979aaSAndroid Build Coastguard Worker }
5832*3f1979aaSAndroid Build Coastguard Worker
5833*3f1979aaSAndroid Build Coastguard Worker // Load 128-bits of integer data from memory into dst using a non-temporal
5834*3f1979aaSAndroid Build Coastguard Worker // memory hint. mem_addr must be aligned on a 16-byte boundary or a
5835*3f1979aaSAndroid Build Coastguard Worker // general-protection exception may be generated.
5836*3f1979aaSAndroid Build Coastguard Worker //
5837*3f1979aaSAndroid Build Coastguard Worker // dst[127:0] := MEM[mem_addr+127:mem_addr]
5838*3f1979aaSAndroid Build Coastguard Worker //
5839*3f1979aaSAndroid Build Coastguard Worker // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_load_si128
_mm_stream_load_si128(__m128i * p)5840*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128i _mm_stream_load_si128(__m128i *p)
5841*3f1979aaSAndroid Build Coastguard Worker {
5842*3f1979aaSAndroid Build Coastguard Worker #if __has_builtin(__builtin_nontemporal_store)
5843*3f1979aaSAndroid Build Coastguard Worker return __builtin_nontemporal_load(p);
5844*3f1979aaSAndroid Build Coastguard Worker #else
5845*3f1979aaSAndroid Build Coastguard Worker return vreinterpretq_m128i_s64(vld1q_s64((int64_t *) p));
5846*3f1979aaSAndroid Build Coastguard Worker #endif
5847*3f1979aaSAndroid Build Coastguard Worker }
5848*3f1979aaSAndroid Build Coastguard Worker
5849*3f1979aaSAndroid Build Coastguard Worker // Cache line containing p is flushed and invalidated from all caches in the
5850*3f1979aaSAndroid Build Coastguard Worker // coherency domain. :
5851*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/ba08y07y(v=vs.100).aspx
_mm_clflush(void const * p)5852*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE void _mm_clflush(void const *p)
5853*3f1979aaSAndroid Build Coastguard Worker {
5854*3f1979aaSAndroid Build Coastguard Worker (void) p;
5855*3f1979aaSAndroid Build Coastguard Worker // no corollary for Neon?
5856*3f1979aaSAndroid Build Coastguard Worker }
5857*3f1979aaSAndroid Build Coastguard Worker
// Allocate size bytes aligned to align bytes. Returns NULL when the
// allocation fails; release the block with _mm_free().
// https://software.intel.com/en-us/
// cpp-compiler-developer-guide-and-reference-allocating-and-freeing-aligned-memory-blocks
FORCE_INLINE void *_mm_malloc(size_t size, size_t align)
{
    // Byte alignment is satisfied by any allocation.
    if (align == 1)
        return malloc(size);

    // Raise alignments smaller than a pointer up to pointer size before
    // handing them to posix_memalign().
    const int below_pointer_size =
        (align == 2) || (sizeof(void *) == 8 && align == 4);
    if (below_pointer_size)
        align = sizeof(void *);

    void *block;
    return posix_memalign(&block, align, size) == 0 ? block : NULL;
}
5872*3f1979aaSAndroid Build Coastguard Worker
_mm_free(void * addr)5873*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE void _mm_free(void *addr)
5874*3f1979aaSAndroid Build Coastguard Worker {
5875*3f1979aaSAndroid Build Coastguard Worker free(addr);
5876*3f1979aaSAndroid Build Coastguard Worker }
5877*3f1979aaSAndroid Build Coastguard Worker
// Fold the unsigned 8-bit integer v into the running CRC32 value crc and
// return the updated value. Uses the crc32cb instruction when AArch64 CRC32
// support is available, otherwise a bit-by-bit software loop.
// https://msdn.microsoft.com/en-us/library/bb514036(v=vs.100)
FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v)
{
#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
    __asm__ __volatile__("crc32cb %w[c], %w[c], %w[v]\n\t"
                         : [c] "+r"(crc)
                         : [v] "r"(v));
#else
    crc ^= v;
    for (int remaining = 8; remaining > 0; remaining--) {
        // All ones when the low bit is set, zero otherwise, so the
        // polynomial is xored in only for a set bit.
        uint32_t low_bit_mask = (uint32_t) 0 - (crc & 1);
        crc = (crc >> 1) ^ (UINT32_C(0x82f63b78) & low_bit_mask);
    }
#endif
    return crc;
}
5898*3f1979aaSAndroid Build Coastguard Worker
5899*3f1979aaSAndroid Build Coastguard Worker // Starting with the initial value in crc, accumulates a CRC32 value for
5900*3f1979aaSAndroid Build Coastguard Worker // unsigned 16-bit integer v.
5901*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/bb531411(v=vs.100)
_mm_crc32_u16(uint32_t crc,uint16_t v)5902*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE uint32_t _mm_crc32_u16(uint32_t crc, uint16_t v)
5903*3f1979aaSAndroid Build Coastguard Worker {
5904*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
5905*3f1979aaSAndroid Build Coastguard Worker __asm__ __volatile__("crc32ch %w[c], %w[c], %w[v]\n\t"
5906*3f1979aaSAndroid Build Coastguard Worker : [c] "+r"(crc)
5907*3f1979aaSAndroid Build Coastguard Worker : [v] "r"(v));
5908*3f1979aaSAndroid Build Coastguard Worker #else
5909*3f1979aaSAndroid Build Coastguard Worker crc = _mm_crc32_u8(crc, v & 0xff);
5910*3f1979aaSAndroid Build Coastguard Worker crc = _mm_crc32_u8(crc, (v >> 8) & 0xff);
5911*3f1979aaSAndroid Build Coastguard Worker #endif
5912*3f1979aaSAndroid Build Coastguard Worker return crc;
5913*3f1979aaSAndroid Build Coastguard Worker }
5914*3f1979aaSAndroid Build Coastguard Worker
5915*3f1979aaSAndroid Build Coastguard Worker // Starting with the initial value in crc, accumulates a CRC32 value for
5916*3f1979aaSAndroid Build Coastguard Worker // unsigned 32-bit integer v.
5917*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/bb531394(v=vs.100)
_mm_crc32_u32(uint32_t crc,uint32_t v)5918*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE uint32_t _mm_crc32_u32(uint32_t crc, uint32_t v)
5919*3f1979aaSAndroid Build Coastguard Worker {
5920*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
5921*3f1979aaSAndroid Build Coastguard Worker __asm__ __volatile__("crc32cw %w[c], %w[c], %w[v]\n\t"
5922*3f1979aaSAndroid Build Coastguard Worker : [c] "+r"(crc)
5923*3f1979aaSAndroid Build Coastguard Worker : [v] "r"(v));
5924*3f1979aaSAndroid Build Coastguard Worker #else
5925*3f1979aaSAndroid Build Coastguard Worker crc = _mm_crc32_u16(crc, v & 0xffff);
5926*3f1979aaSAndroid Build Coastguard Worker crc = _mm_crc32_u16(crc, (v >> 16) & 0xffff);
5927*3f1979aaSAndroid Build Coastguard Worker #endif
5928*3f1979aaSAndroid Build Coastguard Worker return crc;
5929*3f1979aaSAndroid Build Coastguard Worker }
5930*3f1979aaSAndroid Build Coastguard Worker
5931*3f1979aaSAndroid Build Coastguard Worker // Starting with the initial value in crc, accumulates a CRC32 value for
5932*3f1979aaSAndroid Build Coastguard Worker // unsigned 64-bit integer v.
5933*3f1979aaSAndroid Build Coastguard Worker // https://msdn.microsoft.com/en-us/library/bb514033(v=vs.100)
_mm_crc32_u64(uint64_t crc,uint64_t v)5934*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE uint64_t _mm_crc32_u64(uint64_t crc, uint64_t v)
5935*3f1979aaSAndroid Build Coastguard Worker {
5936*3f1979aaSAndroid Build Coastguard Worker #if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
5937*3f1979aaSAndroid Build Coastguard Worker __asm__ __volatile__("crc32cx %w[c], %w[c], %x[v]\n\t"
5938*3f1979aaSAndroid Build Coastguard Worker : [c] "+r"(crc)
5939*3f1979aaSAndroid Build Coastguard Worker : [v] "r"(v));
5940*3f1979aaSAndroid Build Coastguard Worker #else
5941*3f1979aaSAndroid Build Coastguard Worker crc = _mm_crc32_u32((uint32_t)(crc), v & 0xffffffff);
5942*3f1979aaSAndroid Build Coastguard Worker crc = _mm_crc32_u32((uint32_t)(crc), (v >> 32) & 0xffffffff);
5943*3f1979aaSAndroid Build Coastguard Worker #endif
5944*3f1979aaSAndroid Build Coastguard Worker return crc;
5945*3f1979aaSAndroid Build Coastguard Worker }
5946*3f1979aaSAndroid Build Coastguard Worker
5947*3f1979aaSAndroid Build Coastguard Worker #if defined(__GNUC__) || defined(__clang__)
5948*3f1979aaSAndroid Build Coastguard Worker #pragma pop_macro("ALIGN_STRUCT")
5949*3f1979aaSAndroid Build Coastguard Worker #pragma pop_macro("FORCE_INLINE")
5950*3f1979aaSAndroid Build Coastguard Worker #endif
5951*3f1979aaSAndroid Build Coastguard Worker
5952*3f1979aaSAndroid Build Coastguard Worker #if defined(__GNUC__)
5953*3f1979aaSAndroid Build Coastguard Worker #pragma GCC pop_options
5954*3f1979aaSAndroid Build Coastguard Worker #endif
5955*3f1979aaSAndroid Build Coastguard Worker
5956*3f1979aaSAndroid Build Coastguard Worker #endif
5957