//created by Victoria Zhislina, the Senior Application Engineer, Intel Corporation,  [email protected]

//*** Copyright (C) 2012-2018 Intel Corporation.  All rights reserved.

//IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.

//By downloading, copying, installing or using the software you agree to this license.
//If you do not agree to this license, do not download, install, copy or use the software.

//                              License Agreement
//Redistribution and use in source and binary forms, with or without modification,
//are permitted provided that the following conditions are met:

//  * Redistributions of source code must retain the above copyright notice,
//    this list of conditions and the following disclaimer.

//  * The name of the copyright holders may not be used to endorse or promote products
//    derived from this software without specific prior written permission.

//This software is provided by the copyright holders and contributors "as is" and
//any express or implied warranties, including, but not limited to, the implied
//warranties of merchantability and fitness for a particular purpose are disclaimed.
//In no event shall the Intel Corporation or contributors be liable for any direct,
//indirect, incidental, special, exemplary, or consequential damages
//(including, but not limited to, procurement of substitute goods or services;
//loss of use, data, or profits; or business interruption) however caused
//and on any theory of liability, whether in contract, strict liability,
//or tort (including negligence or otherwise) arising in any way out of
//the use of this software, even if advised of the possibility of such damage.

//*****************************************************************************************
// This file is intended to simplify ARM->IA32 porting.
// It establishes the correspondence between ARM NEON intrinsics (as defined in "arm_neon.h")
// and x86 SSE (up to SSE4.2) intrinsic functions as defined in the header files below.
//The MMX instruction set is not used because it is unavailable on x64 systems,
//carries a performance overhead and requires the EMMS instruction (_mm_empty()) for MMX-x87 floating point switching.
//*****************************************************************************************

//!!!!!!!!!!!!!!  To use this file, include it in your project instead of "arm_neon.h" wherever ARM NEON intrinsics are used and compile as usual,
//!!!!!!!!!!!!!!  but please pay attention to #define USE_SSE4 below - you might need to define it manually on the newest Intel Atom or any Intel Core platform for greater performance.

#ifndef NEON2SSE_H
#define NEON2SSE_H

/*********************************************************************************************************************/
//!!!!!!!!!!!!!!
//If USE_SSE4 is defined, some functions use SSE4 instructions instead of earlier SSE versions; when it is undefined, only SIMD up to SSSE3 is used.
//For older devices without SSE4 support it should stay undefined; for newer devices define it, manually if necessary, in case your compiler does not set the __SSE4_2__ predefine.
#ifndef USE_SSE4
#   if defined(__SSE4_2__)
#       define USE_SSE4
#   endif
#endif
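//A minimal porting sketch (an assumption about typical usage, not part of the header itself):
//    //#include <arm_neon.h>              //what the ARM build used to include
//    //#define USE_SSE4                   //optionally, before the include, for SSE4.2-capable targets (or pass -msse4.2 so __SSE4_2__ is predefined)
//    #include "NEON_2_SSE.h"              //x86 build: the NEON intrinsics below are emulated with SSE
//
//    float32x4_t add4(float32x4_t a, float32x4_t b) { return vaddq_f32(a, b); } //compiles to an SSE addition on x86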
/*********************************************************************************************************************/

#include <xmmintrin.h>     //SSE
#include <emmintrin.h>     //SSE2
#include <pmmintrin.h>     //SSE3
#include <tmmintrin.h>     //SSSE3
#ifdef USE_SSE4
#   include <smmintrin.h> //SSE4.1
#   include <nmmintrin.h> //SSE4.2
#endif

#include <math.h>

//***************  functions and data attributes, compiler dependent  *********************************
//***********************************************************************************
#ifdef __GNUC__
#   define _GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
#   define _NEON2SSESTORAGE static
#   define _NEON2SSE_ALIGN_16  __attribute__((aligned(16)))
#   define _NEON2SSE_INLINE _NEON2SSESTORAGE inline __attribute__((__gnu_inline__, __always_inline__, __artificial__))
#   ifndef NEON2SSE_DISABLE_PERFORMANCE_WARNING
#       if _GCC_VERSION <  40500
#           define _NEON2SSE_PERFORMANCE_WARNING(function, explanation)   __attribute__((deprecated)) function
#       else
#           define _NEON2SSE_PERFORMANCE_WARNING(function, explanation)   __attribute__((deprecated(explanation))) function
#       endif
#   else
#       define _NEON2SSE_PERFORMANCE_WARNING(function, explanation)  function
#   endif
#   if defined(__x86_64__)
#       define _NEON2SSE_64BIT  __x86_64__
#   endif
#else
#   define _NEON2SSESTORAGE static
#   define _NEON2SSE_ALIGN_16  __declspec(align(16))
#   define _NEON2SSE_INLINE _NEON2SSESTORAGE __inline
#   if (defined(_MSC_VER) || defined (__INTEL_COMPILER)) && !defined(NEON2SSE_DISABLE_PERFORMANCE_WARNING)
#       define _NEON2SSE_PERFORMANCE_WARNING(function, EXPLANATION) __declspec(deprecated(EXPLANATION)) function
#       if defined(_M_X64)
#           define _NEON2SSE_64BIT  _M_X64
#       endif
#   else
#       define _NEON2SSE_PERFORMANCE_WARNING(function, explanation)  function
#   endif
#endif

#if defined  (_NEON2SSE_64BIT) && defined (USE_SSE4)
#   define _NEON2SSE_64BIT_SSE4
#endif

/*********************************************************************************************************************/
//    data types conversion
/*********************************************************************************************************************/
#if defined(_MSC_VER) && (_MSC_VER < 1300)
    typedef signed char int8_t;
    typedef unsigned char uint8_t;
    typedef signed short int16_t;
    typedef unsigned short uint16_t;
    typedef signed int int32_t;
    typedef unsigned int uint32_t;
    typedef signed long long int64_t;
    typedef unsigned long long uint64_t;
#elif defined(_MSC_VER)
    typedef signed __int8 int8_t;
    typedef unsigned __int8 uint8_t;
    typedef signed __int16 int16_t;
    typedef unsigned __int16 uint16_t;
    typedef signed __int32 int32_t;
    typedef unsigned __int32 uint32_t;

    typedef signed long long int64_t;
    typedef unsigned long long uint64_t;
#else
#   include <stdint.h>
#   include <limits.h>
#endif

typedef union   __m64_128 {
    uint64_t m64_u64[1];
    float m64_f32[2];
    int8_t m64_i8[8];
    int16_t m64_i16[4];
    int32_t m64_i32[2];
    int64_t m64_i64[1];
    uint8_t m64_u8[8];
    uint16_t m64_u16[4];
    uint32_t m64_u32[2];
} __m64_128;

typedef __m64_128 int8x8_t;
typedef __m64_128 uint8x8_t;
typedef __m64_128 int16x4_t;
typedef __m64_128 uint16x4_t;
typedef __m64_128 int32x2_t;
typedef __m64_128 uint32x2_t;
typedef __m64_128 int64x1_t;
typedef __m64_128 uint64x1_t;
typedef __m64_128 poly8x8_t;
typedef __m64_128 poly16x4_t;

typedef __m64_128 float32x2_t;
typedef __m128 float32x4_t;

typedef __m128 float16x4_t; //not supported by IA, for compatibility
typedef __m128 float16x8_t; //not supported by IA, for compatibility

typedef __m64_128 float64x1_t;
typedef __m128d float64x2_t;

typedef __m128i int8x16_t;
typedef __m128i int16x8_t;
typedef __m128i int32x4_t;
typedef __m128i int64x2_t;
typedef __m128i uint8x16_t;
typedef __m128i uint16x8_t;
typedef __m128i uint32x4_t;
typedef __m128i uint64x2_t;
typedef __m128i poly8x16_t;
typedef __m128i poly16x8_t;
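//An illustrative sketch of the emulation layout (an assumption about typical usage, not a required pattern):
//    int16x4_t d;                  //64-bit "d-register" type - in this emulation it is the __m64_128 union above
//    d.m64_i16[0] = 1;             //its lanes are plain array members and can be addressed directly
//    int16x8_t q;                  //128-bit "q-register" type - a plain __m128i, usable directly with SSE intrinsics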

#if defined(_MSC_VER)
#   define SINT_MIN     (-2147483647 - 1) /* min signed int value */
#   define SINT_MAX       2147483647 /* max signed int value */
#else
#   define SINT_MIN     INT_MIN /* min signed int value */
#   define SINT_MAX     INT_MAX /* max signed int value */
#endif

typedef   float float32_t;
#if !defined(__clang__)
typedef   float __fp16;
#endif

typedef   double float64_t;


typedef  uint8_t poly8_t;
typedef  uint16_t poly16_t;


//MSVC compilers (tested up to the VS 2012 version) don't allow structures or arrays of __m128x types as function arguments, resulting in
//error C2719: 'src': formal parameter with __declspec(align('16')) won't be aligned.  To avoid it we need a special trick for functions that use these types.
struct int8x16x2_t {
    int8x16_t val[2];
};
struct int16x8x2_t {
    int16x8_t val[2];
};
struct int32x4x2_t {
    int32x4_t val[2];
};
struct int64x2x2_t {
    int64x2_t val[2];
};
//Unfortunately we are unable to merge the two 64-bit halves into one 128-bit register because the user should be able to access the val[n] members explicitly!!!
struct int8x8x2_t {
    int8x8_t val[2];
};
struct int16x4x2_t {
    int16x4_t val[2];
};
struct int32x2x2_t {
    int32x2_t val[2];
};
struct int64x1x2_t {
    int64x1_t val[2];
};

typedef struct int8x16x2_t int8x16x2_t; //for C compilers to make them happy
typedef struct int16x8x2_t int16x8x2_t; //for C compilers to make them happy
typedef struct int32x4x2_t int32x4x2_t; //for C compilers to make them happy
typedef struct int64x2x2_t int64x2x2_t; //for C compilers to make them happy

typedef struct int8x8x2_t int8x8x2_t; //for C compilers to make them happy
typedef struct int16x4x2_t int16x4x2_t; //for C compilers to make them happy
typedef struct int32x2x2_t int32x2x2_t; //for C compilers to make them happy
typedef struct int64x1x2_t int64x1x2_t; //for C compilers to make them happy

/* to avoid pointer conversions the following unsigned integer structures are defined via the corresponding signed integer structures above */
typedef struct int8x16x2_t uint8x16x2_t;
typedef struct int16x8x2_t uint16x8x2_t;
typedef struct int32x4x2_t uint32x4x2_t;
typedef struct int64x2x2_t uint64x2x2_t;
typedef struct int8x16x2_t poly8x16x2_t;
typedef struct int16x8x2_t poly16x8x2_t;

typedef struct int8x8x2_t uint8x8x2_t;
typedef struct int16x4x2_t uint16x4x2_t;
typedef struct int32x2x2_t uint32x2x2_t;
typedef struct int64x1x2_t uint64x1x2_t;
typedef struct int8x8x2_t poly8x8x2_t;
typedef struct int16x4x2_t poly16x4x2_t;

//float
struct float32x4x2_t {
    float32x4_t val[2];
};
struct float16x8x2_t {
    float16x8_t val[2];
};
struct float32x2x2_t {
    float32x2_t val[2];
};

typedef struct float32x4x2_t float32x4x2_t; //for C compilers to make them happy
typedef struct float16x8x2_t float16x8x2_t; //for C compilers to make them happy
typedef struct float32x2x2_t float32x2x2_t; //for C compilers to make them happy
typedef float16x8x2_t float16x4x2_t;

//4
struct int8x16x4_t {
    int8x16_t val[4];
};
struct int16x8x4_t {
    int16x8_t val[4];
};
struct int32x4x4_t {
    int32x4_t val[4];
};
struct int64x2x4_t {
    int64x2_t val[4];
};

struct int8x8x4_t {
    int8x8_t val[4];
};
struct int16x4x4_t {
    int16x4_t val[4];
};
struct int32x2x4_t {
    int32x2_t val[4];
};
struct int64x1x4_t {
    int64x1_t val[4];
};

typedef struct int8x16x4_t int8x16x4_t; //for C compilers to make them happy
typedef struct int16x8x4_t int16x8x4_t; //for C compilers to make them happy
typedef struct int32x4x4_t int32x4x4_t; //for C compilers to make them happy
typedef struct int64x2x4_t int64x2x4_t; //for C compilers to make them happy

typedef struct int8x8x4_t int8x8x4_t; //for C compilers to make them happy
typedef struct int16x4x4_t int16x4x4_t; //for C compilers to make them happy
typedef struct int32x2x4_t int32x2x4_t; //for C compilers to make them happy
typedef struct int64x1x4_t int64x1x4_t; //for C compilers to make them happy

/* to avoid pointer conversions the following unsigned integer structures are defined via the corresponding signed integer structures above: */
typedef struct int8x8x4_t uint8x8x4_t;
typedef struct int16x4x4_t uint16x4x4_t;
typedef struct int32x2x4_t uint32x2x4_t;
typedef struct int64x1x4_t uint64x1x4_t;
typedef struct int8x8x4_t poly8x8x4_t;
typedef struct int16x4x4_t poly16x4x4_t;

typedef struct int8x16x4_t uint8x16x4_t;
typedef struct int16x8x4_t uint16x8x4_t;
typedef struct int32x4x4_t uint32x4x4_t;
typedef struct int64x2x4_t uint64x2x4_t;
typedef struct int8x16x4_t poly8x16x4_t;
typedef struct int16x8x4_t poly16x8x4_t;

struct float32x4x4_t {
    float32x4_t val[4];
};
struct float16x8x4_t {
    float16x8_t val[4];
};
struct float32x2x4_t {
    float32x2_t val[4];
};

typedef struct float32x4x4_t float32x4x4_t; //for C compilers to make them happy
typedef struct float16x8x4_t float16x8x4_t; //for C compilers to make them happy
typedef struct float32x2x4_t float32x2x4_t; //for C compilers to make them happy
typedef float16x8x4_t float16x4x4_t;

//3
struct int16x8x3_t {
    int16x8_t val[3];
};
struct int32x4x3_t {
    int32x4_t val[3];
};
struct int64x2x3_t {
    int64x2_t val[3];
};
struct int8x16x3_t {
    int8x16_t val[3];
};

struct int16x4x3_t {
    int16x4_t val[3];
};
struct int32x2x3_t {
    int32x2_t val[3];
};
struct int64x1x3_t {
    int64x1_t val[3];
};
struct int8x8x3_t {
    int8x8_t val[3];
};
typedef struct int16x8x3_t int16x8x3_t; //for C compilers to make them happy
typedef struct int32x4x3_t int32x4x3_t; //for C compilers to make them happy
typedef struct int64x2x3_t int64x2x3_t; //for C compilers to make them happy
typedef struct int8x16x3_t int8x16x3_t; //for C compilers to make them happy

typedef struct int8x8x3_t int8x8x3_t; //for C compilers to make them happy
typedef struct int16x4x3_t int16x4x3_t; //for C compilers to make them happy
typedef struct int32x2x3_t int32x2x3_t; //for C compilers to make them happy
typedef struct int64x1x3_t int64x1x3_t; //for C compilers to make them happy


/* to avoid pointer conversions the following unsigned integer structures are defined via the corresponding signed integer structures above: */
typedef struct int8x16x3_t uint8x16x3_t;
typedef struct int16x8x3_t uint16x8x3_t;
typedef struct int32x4x3_t uint32x4x3_t;
typedef struct int64x2x3_t uint64x2x3_t;
typedef struct int8x16x3_t poly8x16x3_t;
typedef struct int16x8x3_t poly16x8x3_t;
typedef struct int8x8x3_t uint8x8x3_t;
typedef struct int16x4x3_t uint16x4x3_t;
typedef struct int32x2x3_t uint32x2x3_t;
typedef struct int64x1x3_t uint64x1x3_t;
typedef struct int8x8x3_t poly8x8x3_t;
typedef struct int16x4x3_t poly16x4x3_t;

//float
struct float32x4x3_t {
    float32x4_t val[3];
};
struct float32x2x3_t {
    float32x2_t val[3];
};
struct float16x8x3_t {
    float16x8_t val[3];
};

typedef struct float32x4x3_t float32x4x3_t; //for C compilers to make them happy
typedef struct float16x8x3_t float16x8x3_t; //for C compilers to make them happy
typedef struct float32x2x3_t float32x2x3_t; //for C compilers to make them happy
typedef float16x8x3_t float16x4x3_t;


//****************************************************************************
//****** Porting auxiliary macros ********************************************

//** floating point related macros **
#define _M128i(a) _mm_castps_si128(a)
#define _M128(a) _mm_castsi128_ps(a)
//here the most effective implementation performance-wise depends on the compiler and on the 32/64-bit build
#if defined (_NEON2SSE_64BIT) || (defined (__INTEL_COMPILER) && (__INTEL_COMPILER  >= 1500) )
#   define _pM128i(a) _mm_cvtsi64_si128(*(int64_t*)(&(a)))
#   define _M64(out, inp) out.m64_i64[0] = _mm_cvtsi128_si64 (inp);
#   define _M64f(out, inp) out.m64_i64[0] = _mm_cvtsi128_si64 (_M128i(inp));
#else
   //for 32-bit gcc and Microsoft compiler builds
#   define _pM128i(a) _mm_loadl_epi64((__m128i*)&(a))
#   define _M64(out, inp)  _mm_storel_epi64 ((__m128i*)&(out), inp)
#   define _M64f(out, inp)  _mm_storel_epi64 ((__m128i*)&(out), _M128i(inp))
#endif
#define _pM128(a) _mm_castsi128_ps(_pM128i(a))

#define return64(a)  _M64(res64,a); return res64;
#define return64f(a)  _M64f(res64,a); return res64;
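//An illustrative sketch of how these helpers combine inside the emulated intrinsics
//(a hypothetical 64-bit add shown for explanation only, assuming the SSE2 _mm_add_epi8 path):
//    _NEON2SSE_INLINE int8x8_t vadd_s8_example(int8x8_t a, int8x8_t b)
//    {
//        int8x8_t res64;
//        return64(_mm_add_epi8(_pM128i(a), _pM128i(b))); //promote both halves to __m128i, add, write the low 64 bits back
//    }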

#define _Ui64(a) (*(uint64_t*)&(a))
#define _UNSIGNED_T(a) u ## a

#define _SIGNBIT64 ((uint64_t)1 << 63)
#define _SWAP_HI_LOW32  (2 | (3 << 2) | (0 << 4) | (1 << 6))
#define _INSERTPS_NDX(srcField, dstField) (((srcField) << 6) | ((dstField) << 4) )

#define  _NEON2SSE_REASON_SLOW_SERIAL "The function may be very slow due to the serial implementation, please try to avoid it"
#define  _NEON2SSE_REASON_SLOW_UNEFFECTIVE "The function may be slow due to inefficient x86 SIMD implementation, please try to avoid it"

//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#define __constrange(min,max)  const
#define __transfersize(size)
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

//&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&& mask constants used in porting &&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
_NEON2SSE_ALIGN_16 static const int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7,  9, 11, 13, 15 };
_NEON2SSE_ALIGN_16 static const int8_t mask8_32_even_odd[16] = { 0, 1, 4, 5, 8,  9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 };
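//An illustrative sketch of how such masks are used (an SSSE3 byte shuffle; the variable names are assumptions):
//    __m128i deinterleaved = _mm_shuffle_epi8(v, *(__m128i*)mask8_16_even_odd);
//    //the low 8 bytes now hold the even-indexed bytes of v and the high 8 bytes hold the odd-indexed ones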
//&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&

//*************************************************************************
//*************************************************************************
//*********  Function declarations as in the original arm_neon.h  *********
//*************************************************************************
//Vector add: vadd -> Vr[i]:=Va[i]+Vb[i], Vr, Va, Vb have equal lane sizes.
_NEON2SSESTORAGE int8x8_t vadd_s8(int8x8_t a, int8x8_t b); // VADD.I8 d0,d0,d0
_NEON2SSESTORAGE int16x4_t vadd_s16(int16x4_t a, int16x4_t b); // VADD.I16 d0,d0,d0
_NEON2SSESTORAGE int32x2_t vadd_s32(int32x2_t a, int32x2_t b); // VADD.I32 d0,d0,d0
_NEON2SSESTORAGE int64x1_t vadd_s64(int64x1_t a, int64x1_t b); // VADD.I64 d0,d0,d0
_NEON2SSESTORAGE float32x2_t vadd_f32(float32x2_t a, float32x2_t b); // VADD.F32 d0,d0,d0
_NEON2SSESTORAGE uint8x8_t vadd_u8(uint8x8_t a, uint8x8_t b); // VADD.I8 d0,d0,d0
_NEON2SSESTORAGE uint16x4_t vadd_u16(uint16x4_t a, uint16x4_t b); // VADD.I16 d0,d0,d0
_NEON2SSESTORAGE uint32x2_t vadd_u32(uint32x2_t a, uint32x2_t b); // VADD.I32 d0,d0,d0
_NEON2SSESTORAGE uint64x1_t vadd_u64(uint64x1_t a, uint64x1_t b); // VADD.I64 d0,d0,d0
_NEON2SSESTORAGE int8x16_t vaddq_s8(int8x16_t a, int8x16_t b); // VADD.I8 q0,q0,q0
_NEON2SSESTORAGE int16x8_t vaddq_s16(int16x8_t a, int16x8_t b); // VADD.I16 q0,q0,q0
_NEON2SSESTORAGE int32x4_t vaddq_s32(int32x4_t a, int32x4_t b); // VADD.I32 q0,q0,q0
_NEON2SSESTORAGE int64x2_t vaddq_s64(int64x2_t a, int64x2_t b); // VADD.I64 q0,q0,q0
_NEON2SSESTORAGE float32x4_t vaddq_f32(float32x4_t a, float32x4_t b); // VADD.F32 q0,q0,q0
_NEON2SSESTORAGE uint8x16_t vaddq_u8(uint8x16_t a, uint8x16_t b); // VADD.I8 q0,q0,q0
_NEON2SSESTORAGE uint16x8_t vaddq_u16(uint16x8_t a, uint16x8_t b); // VADD.I16 q0,q0,q0
_NEON2SSESTORAGE uint32x4_t vaddq_u32(uint32x4_t a, uint32x4_t b); // VADD.I32 q0,q0,q0
_NEON2SSESTORAGE uint64x2_t vaddq_u64(uint64x2_t a, uint64x2_t b); // VADD.I64 q0,q0,q0
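//An equivalence sketch (illustrative assumption: the q-form integer adds map to the corresponding SSE2 adds):
//    int32x4_t r1 = vaddq_s32(a, b);       //NEON spelling
//    __m128i   r2 = _mm_add_epi32(a, b);   //plain SSE2 spelling of the same operation (int32x4_t is a __m128i here)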
//Vector long add: vaddl -> Vr[i]:=Va[i]+Vb[i], Va, Vb have equal lane sizes, result is a 128 bit vector of lanes that are twice the width.
_NEON2SSESTORAGE int16x8_t vaddl_s8(int8x8_t a, int8x8_t b); // VADDL.S8 q0,d0,d0
_NEON2SSESTORAGE int32x4_t vaddl_s16(int16x4_t a, int16x4_t b); // VADDL.S16 q0,d0,d0
_NEON2SSESTORAGE int64x2_t vaddl_s32(int32x2_t a, int32x2_t b); // VADDL.S32 q0,d0,d0
_NEON2SSESTORAGE uint16x8_t vaddl_u8(uint8x8_t a, uint8x8_t b); // VADDL.U8 q0,d0,d0
_NEON2SSESTORAGE uint32x4_t vaddl_u16(uint16x4_t a, uint16x4_t b); // VADDL.U16 q0,d0,d0
_NEON2SSESTORAGE uint64x2_t vaddl_u32(uint32x2_t a, uint32x2_t b); // VADDL.U32 q0,d0,d0
//Vector wide add: vaddw -> Vr[i]:=Va[i]+Vb[i]
_NEON2SSESTORAGE int16x8_t vaddw_s8(int16x8_t a, int8x8_t b); // VADDW.S8 q0,q0,d0
_NEON2SSESTORAGE int32x4_t vaddw_s16(int32x4_t a, int16x4_t b); // VADDW.S16 q0,q0,d0
_NEON2SSESTORAGE int64x2_t vaddw_s32(int64x2_t a, int32x2_t b); // VADDW.S32 q0,q0,d0
_NEON2SSESTORAGE uint16x8_t vaddw_u8(uint16x8_t a, uint8x8_t b); // VADDW.U8 q0,q0,d0
_NEON2SSESTORAGE uint32x4_t vaddw_u16(uint32x4_t a, uint16x4_t b); // VADDW.U16 q0,q0,d0
_NEON2SSESTORAGE uint64x2_t vaddw_u32(uint64x2_t a, uint32x2_t b); // VADDW.U32 q0,q0,d0
//Vector halving add: vhadd -> Vr[i]:=(Va[i]+Vb[i])>>1
_NEON2SSESTORAGE int8x8_t vhadd_s8(int8x8_t a, int8x8_t b); // VHADD.S8 d0,d0,d0
_NEON2SSESTORAGE int16x4_t vhadd_s16(int16x4_t a, int16x4_t b); // VHADD.S16 d0,d0,d0
_NEON2SSESTORAGE int32x2_t vhadd_s32(int32x2_t a, int32x2_t b); // VHADD.S32 d0,d0,d0
_NEON2SSESTORAGE uint8x8_t vhadd_u8(uint8x8_t a, uint8x8_t b); // VHADD.U8 d0,d0,d0
_NEON2SSESTORAGE uint16x4_t vhadd_u16(uint16x4_t a, uint16x4_t b); // VHADD.U16 d0,d0,d0
_NEON2SSESTORAGE uint32x2_t vhadd_u32(uint32x2_t a, uint32x2_t b); // VHADD.U32 d0,d0,d0
_NEON2SSESTORAGE int8x16_t vhaddq_s8(int8x16_t a, int8x16_t b); // VHADD.S8 q0,q0,q0
_NEON2SSESTORAGE int16x8_t vhaddq_s16(int16x8_t a, int16x8_t b); // VHADD.S16 q0,q0,q0
_NEON2SSESTORAGE int32x4_t vhaddq_s32(int32x4_t a, int32x4_t b); // VHADD.S32 q0,q0,q0
_NEON2SSESTORAGE uint8x16_t vhaddq_u8(uint8x16_t a, uint8x16_t b); // VHADD.U8 q0,q0,q0
_NEON2SSESTORAGE uint16x8_t vhaddq_u16(uint16x8_t a, uint16x8_t b); // VHADD.U16 q0,q0,q0
_NEON2SSESTORAGE uint32x4_t vhaddq_u32(uint32x4_t a, uint32x4_t b); // VHADD.U32 q0,q0,q0
//Vector rounding halving add: vrhadd -> Vr[i]:=(Va[i]+Vb[i]+1)>>1
_NEON2SSESTORAGE int8x8_t vrhadd_s8(int8x8_t a, int8x8_t b); // VRHADD.S8 d0,d0,d0
_NEON2SSESTORAGE int16x4_t vrhadd_s16(int16x4_t a, int16x4_t b); // VRHADD.S16 d0,d0,d0
_NEON2SSESTORAGE int32x2_t vrhadd_s32(int32x2_t a, int32x2_t b); // VRHADD.S32 d0,d0,d0
_NEON2SSESTORAGE uint8x8_t vrhadd_u8(uint8x8_t a, uint8x8_t b); // VRHADD.U8 d0,d0,d0
_NEON2SSESTORAGE uint16x4_t vrhadd_u16(uint16x4_t a, uint16x4_t b); // VRHADD.U16 d0,d0,d0
_NEON2SSESTORAGE uint32x2_t vrhadd_u32(uint32x2_t a, uint32x2_t b); // VRHADD.U32 d0,d0,d0
_NEON2SSESTORAGE int8x16_t vrhaddq_s8(int8x16_t a, int8x16_t b); // VRHADD.S8 q0,q0,q0
_NEON2SSESTORAGE int16x8_t vrhaddq_s16(int16x8_t a, int16x8_t b); // VRHADD.S16 q0,q0,q0
_NEON2SSESTORAGE int32x4_t vrhaddq_s32(int32x4_t a, int32x4_t b); // VRHADD.S32 q0,q0,q0
_NEON2SSESTORAGE uint8x16_t vrhaddq_u8(uint8x16_t a, uint8x16_t b); // VRHADD.U8 q0,q0,q0
_NEON2SSESTORAGE uint16x8_t vrhaddq_u16(uint16x8_t a, uint16x8_t b); // VRHADD.U16 q0,q0,q0
_NEON2SSESTORAGE uint32x4_t vrhaddq_u32(uint32x4_t a, uint32x4_t b); // VRHADD.U32 q0,q0,q0
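//A mapping sketch for the unsigned 8-bit rounding halving add (illustrative only): SSE2 already provides exactly this rounded average:
//    uint8x16_t r1 = vrhaddq_u8(a, b);     //Vr[i]:=(Va[i]+Vb[i]+1)>>1
//    __m128i    r2 = _mm_avg_epu8(a, b);   //the native SSE2 instruction computes the same rounded average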
//Vector saturating add: vqadd -> Vr[i]:=sat<size>(Va[i]+Vb[i])
_NEON2SSESTORAGE int8x8_t vqadd_s8(int8x8_t a, int8x8_t b); // VQADD.S8 d0,d0,d0
_NEON2SSESTORAGE int16x4_t vqadd_s16(int16x4_t a, int16x4_t b); // VQADD.S16 d0,d0,d0
_NEON2SSESTORAGE int32x2_t vqadd_s32(int32x2_t a, int32x2_t b); // VQADD.S32 d0,d0,d0
_NEON2SSESTORAGE int64x1_t vqadd_s64(int64x1_t a, int64x1_t b); // VQADD.S64 d0,d0,d0
_NEON2SSESTORAGE uint8x8_t vqadd_u8(uint8x8_t a, uint8x8_t b); // VQADD.U8 d0,d0,d0
_NEON2SSESTORAGE uint16x4_t vqadd_u16(uint16x4_t a, uint16x4_t b); // VQADD.U16 d0,d0,d0
_NEON2SSESTORAGE uint32x2_t vqadd_u32(uint32x2_t a, uint32x2_t b); // VQADD.U32 d0,d0,d0
_NEON2SSESTORAGE uint64x1_t vqadd_u64(uint64x1_t a, uint64x1_t b); // VQADD.U64 d0,d0,d0
_NEON2SSESTORAGE int8x16_t vqaddq_s8(int8x16_t a, int8x16_t b); // VQADD.S8 q0,q0,q0
_NEON2SSESTORAGE int16x8_t vqaddq_s16(int16x8_t a, int16x8_t b); // VQADD.S16 q0,q0,q0
_NEON2SSESTORAGE int32x4_t vqaddq_s32(int32x4_t a, int32x4_t b); // VQADD.S32 q0,q0,q0
_NEON2SSESTORAGE int64x2_t vqaddq_s64(int64x2_t a, int64x2_t b); // VQADD.S64 q0,q0,q0
_NEON2SSESTORAGE uint8x16_t vqaddq_u8(uint8x16_t a, uint8x16_t b); // VQADD.U8 q0,q0,q0
_NEON2SSESTORAGE uint16x8_t vqaddq_u16(uint16x8_t a, uint16x8_t b); // VQADD.U16 q0,q0,q0
_NEON2SSESTORAGE uint32x4_t vqaddq_u32(uint32x4_t a, uint32x4_t b); // VQADD.U32 q0,q0,q0
_NEON2SSESTORAGE uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b); // VQADD.U64 q0,q0,q0
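//A mapping sketch for the 8/16-bit saturating adds (illustrative only): they have direct SSE2 counterparts, e.g.:
//    int16x8_t r1 = vqaddq_s16(a, b);      //NEON saturating add
//    __m128i   r2 = _mm_adds_epi16(a, b);  //SSE2 saturating add with the same per-lane behaviour
//(the 32- and 64-bit variants have no single SSE instruction and need a longer emulation)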
//Vector add high half: vaddhn -> Vr[i]:=high_half(Va[i]+Vb[i])
_NEON2SSESTORAGE int8x8_t vaddhn_s16(int16x8_t a, int16x8_t b); // VADDHN.I16 d0,q0,q0
_NEON2SSESTORAGE int16x4_t vaddhn_s32(int32x4_t a, int32x4_t b); // VADDHN.I32 d0,q0,q0
_NEON2SSESTORAGE int32x2_t vaddhn_s64(int64x2_t a, int64x2_t b); // VADDHN.I64 d0,q0,q0
_NEON2SSESTORAGE uint8x8_t vaddhn_u16(uint16x8_t a, uint16x8_t b); // VADDHN.I16 d0,q0,q0
_NEON2SSESTORAGE uint16x4_t vaddhn_u32(uint32x4_t a, uint32x4_t b); // VADDHN.I32 d0,q0,q0
_NEON2SSESTORAGE uint32x2_t vaddhn_u64(uint64x2_t a, uint64x2_t b); // VADDHN.I64 d0,q0,q0
//Vector rounding add high half: vraddhn
_NEON2SSESTORAGE int8x8_t vraddhn_s16(int16x8_t a, int16x8_t b); // VRADDHN.I16 d0,q0,q0
_NEON2SSESTORAGE int16x4_t vraddhn_s32(int32x4_t a, int32x4_t b); // VRADDHN.I32 d0,q0,q0
_NEON2SSESTORAGE int32x2_t vraddhn_s64(int64x2_t a, int64x2_t b); // VRADDHN.I64 d0,q0,q0
_NEON2SSESTORAGE uint8x8_t vraddhn_u16(uint16x8_t a, uint16x8_t b); // VRADDHN.I16 d0,q0,q0
_NEON2SSESTORAGE uint16x4_t vraddhn_u32(uint32x4_t a, uint32x4_t b); // VRADDHN.I32 d0,q0,q0
_NEON2SSESTORAGE uint32x2_t vraddhn_u64(uint64x2_t a, uint64x2_t b); // VRADDHN.I64 d0,q0,q0
//Multiplication
//Vector multiply: vmul -> Vr[i] := Va[i] * Vb[i]
_NEON2SSESTORAGE int8x8_t vmul_s8(int8x8_t a, int8x8_t b); // VMUL.I8 d0,d0,d0
_NEON2SSESTORAGE int16x4_t vmul_s16(int16x4_t a, int16x4_t b); // VMUL.I16 d0,d0,d0
_NEON2SSESTORAGE int32x2_t vmul_s32(int32x2_t a, int32x2_t b); // VMUL.I32 d0,d0,d0
_NEON2SSESTORAGE float32x2_t vmul_f32(float32x2_t a, float32x2_t b); // VMUL.F32 d0,d0,d0
_NEON2SSESTORAGE uint8x8_t vmul_u8(uint8x8_t a, uint8x8_t b); // VMUL.I8 d0,d0,d0
_NEON2SSESTORAGE uint16x4_t vmul_u16(uint16x4_t a, uint16x4_t b); // VMUL.I16 d0,d0,d0
_NEON2SSESTORAGE uint32x2_t vmul_u32(uint32x2_t a, uint32x2_t b); // VMUL.I32 d0,d0,d0
_NEON2SSESTORAGE poly8x8_t vmul_p8(poly8x8_t a, poly8x8_t b); // VMUL.P8 d0,d0,d0
_NEON2SSESTORAGE int8x16_t vmulq_s8(int8x16_t a, int8x16_t b); // VMUL.I8 q0,q0,q0
_NEON2SSESTORAGE int16x8_t vmulq_s16(int16x8_t a, int16x8_t b); // VMUL.I16 q0,q0,q0
_NEON2SSESTORAGE int32x4_t vmulq_s32(int32x4_t a, int32x4_t b); // VMUL.I32 q0,q0,q0
_NEON2SSESTORAGE float32x4_t vmulq_f32(float32x4_t a, float32x4_t b); // VMUL.F32 q0,q0,q0
_NEON2SSESTORAGE uint8x16_t vmulq_u8(uint8x16_t a, uint8x16_t b); // VMUL.I8 q0,q0,q0
_NEON2SSESTORAGE uint16x8_t vmulq_u16(uint16x8_t a, uint16x8_t b); // VMUL.I16 q0,q0,q0
_NEON2SSESTORAGE uint32x4_t vmulq_u32(uint32x4_t a, uint32x4_t b); // VMUL.I32 q0,q0,q0
_NEON2SSESTORAGE poly8x16_t vmulq_p8(poly8x16_t a, poly8x16_t b); // VMUL.P8 q0,q0,q0
//multiply lane
_NEON2SSESTORAGE int16x4_t vmul_lane_s16 (int16x4_t a, int16x4_t b, __constrange(0,3) int c);
_NEON2SSESTORAGE int32x2_t vmul_lane_s32 (int32x2_t a, int32x2_t b, __constrange(0,1) int c);
_NEON2SSESTORAGE float32x2_t vmul_lane_f32 (float32x2_t a, float32x2_t b, __constrange(0,1) int c);
_NEON2SSESTORAGE uint16x4_t vmul_lane_u16 (uint16x4_t a, uint16x4_t b, __constrange(0,3) int c);
_NEON2SSESTORAGE uint32x2_t vmul_lane_u32 (uint32x2_t a, uint32x2_t b, __constrange(0,1) int c);
_NEON2SSESTORAGE int16x8_t vmulq_lane_s16 (int16x8_t a, int16x4_t b, __constrange(0,3) int c);
_NEON2SSESTORAGE int32x4_t vmulq_lane_s32 (int32x4_t a, int32x2_t b, __constrange(0,1) int c);
_NEON2SSESTORAGE float32x4_t vmulq_lane_f32 (float32x4_t a, float32x2_t b, __constrange(0,1) int c);
_NEON2SSESTORAGE uint16x8_t vmulq_lane_u16 (uint16x8_t a, uint16x4_t b, __constrange(0,3) int c);
_NEON2SSESTORAGE uint32x4_t vmulq_lane_u32 (uint32x4_t a, uint32x2_t b, __constrange(0,1) int c);
//Vector multiply accumulate: vmla -> Vr[i] := Va[i] + Vb[i] * Vc[i]
_NEON2SSESTORAGE int8x8_t vmla_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VMLA.I8 d0,d0,d0
_NEON2SSESTORAGE int16x4_t vmla_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VMLA.I16 d0,d0,d0
_NEON2SSESTORAGE int32x2_t vmla_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VMLA.I32 d0,d0,d0
_NEON2SSESTORAGE float32x2_t vmla_f32(float32x2_t a, float32x2_t b, float32x2_t c); // VMLA.F32 d0,d0,d0
_NEON2SSESTORAGE uint8x8_t vmla_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VMLA.I8 d0,d0,d0
_NEON2SSESTORAGE uint16x4_t vmla_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VMLA.I16 d0,d0,d0
_NEON2SSESTORAGE uint32x2_t vmla_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VMLA.I32 d0,d0,d0
_NEON2SSESTORAGE int8x16_t vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VMLA.I8 q0,q0,q0
_NEON2SSESTORAGE int16x8_t vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VMLA.I16 q0,q0,q0
_NEON2SSESTORAGE int32x4_t vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VMLA.I32 q0,q0,q0
_NEON2SSESTORAGE float32x4_t vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c); // VMLA.F32 q0,q0,q0
_NEON2SSESTORAGE uint8x16_t vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VMLA.I8 q0,q0,q0
_NEON2SSESTORAGE uint16x8_t vmlaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VMLA.I16 q0,q0,q0
_NEON2SSESTORAGE uint32x4_t vmlaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VMLA.I32 q0,q0,q0
//Vector multiply accumulate long: vmlal -> Vr[i] := Va[i] + Vb[i] * Vc[i]
_NEON2SSESTORAGE int16x8_t vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VMLAL.S8 q0,d0,d0
_NEON2SSESTORAGE int32x4_t vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VMLAL.S16 q0,d0,d0
_NEON2SSESTORAGE int64x2_t vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VMLAL.S32 q0,d0,d0
_NEON2SSESTORAGE uint16x8_t vmlal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VMLAL.U8 q0,d0,d0
_NEON2SSESTORAGE uint32x4_t vmlal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VMLAL.U16 q0,d0,d0
_NEON2SSESTORAGE uint64x2_t vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VMLAL.U32 q0,d0,d0
//Vector multiply subtract: vmls -> Vr[i] := Va[i] - Vb[i] * Vc[i]
_NEON2SSESTORAGE int8x8_t vmls_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VMLS.I8 d0,d0,d0
_NEON2SSESTORAGE int16x4_t vmls_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VMLS.I16 d0,d0,d0
_NEON2SSESTORAGE int32x2_t vmls_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VMLS.I32 d0,d0,d0
_NEON2SSESTORAGE float32x2_t vmls_f32(float32x2_t a, float32x2_t b, float32x2_t c); // VMLS.F32 d0,d0,d0
_NEON2SSESTORAGE uint8x8_t vmls_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VMLS.I8 d0,d0,d0
_NEON2SSESTORAGE uint16x4_t vmls_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VMLS.I16 d0,d0,d0
_NEON2SSESTORAGE uint32x2_t vmls_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VMLS.I32 d0,d0,d0
_NEON2SSESTORAGE int8x16_t vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VMLS.I8 q0,q0,q0
_NEON2SSESTORAGE int16x8_t vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VMLS.I16 q0,q0,q0
_NEON2SSESTORAGE int32x4_t vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VMLS.I32 q0,q0,q0
_NEON2SSESTORAGE float32x4_t vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c); // VMLS.F32 q0,q0,q0
_NEON2SSESTORAGE uint8x16_t vmlsq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VMLS.I8 q0,q0,q0
_NEON2SSESTORAGE uint16x8_t vmlsq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VMLS.I16 q0,q0,q0
_NEON2SSESTORAGE uint32x4_t vmlsq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VMLS.I32 q0,q0,q0
//Vector multiply subtract long
_NEON2SSESTORAGE int16x8_t vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VMLSL.S8 q0,d0,d0
_NEON2SSESTORAGE int32x4_t vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VMLSL.S16 q0,d0,d0
_NEON2SSESTORAGE int64x2_t vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VMLSL.S32 q0,d0,d0
_NEON2SSESTORAGE uint16x8_t vmlsl_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VMLSL.U8 q0,d0,d0
_NEON2SSESTORAGE uint32x4_t vmlsl_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VMLSL.U16 q0,d0,d0
_NEON2SSESTORAGE uint64x2_t vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VMLSL.U32 q0,d0,d0
//Vector saturating doubling multiply high
_NEON2SSESTORAGE int16x4_t vqdmulh_s16(int16x4_t a, int16x4_t b); // VQDMULH.S16 d0,d0,d0
_NEON2SSESTORAGE int32x2_t vqdmulh_s32(int32x2_t a, int32x2_t b); // VQDMULH.S32 d0,d0,d0
_NEON2SSESTORAGE int16x8_t vqdmulhq_s16(int16x8_t a, int16x8_t b); // VQDMULH.S16 q0,q0,q0
_NEON2SSESTORAGE int32x4_t vqdmulhq_s32(int32x4_t a, int32x4_t b); // VQDMULH.S32 q0,q0,q0
//Vector saturating rounding doubling multiply high
_NEON2SSESTORAGE int16x4_t vqrdmulh_s16(int16x4_t a, int16x4_t b); // VQRDMULH.S16 d0,d0,d0
_NEON2SSESTORAGE int32x2_t vqrdmulh_s32(int32x2_t a, int32x2_t b); // VQRDMULH.S32 d0,d0,d0
_NEON2SSESTORAGE int16x8_t vqrdmulhq_s16(int16x8_t a, int16x8_t b); // VQRDMULH.S16 q0,q0,q0
_NEON2SSESTORAGE int32x4_t vqrdmulhq_s32(int32x4_t a, int32x4_t b); // VQRDMULH.S32 q0,q0,q0
//Vector saturating doubling multiply accumulate long
_NEON2SSESTORAGE int32x4_t vqdmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VQDMLAL.S16 q0,d0,d0
_NEON2SSESTORAGE int64x2_t vqdmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VQDMLAL.S32 q0,d0,d0
//Vector saturating doubling multiply subtract long
_NEON2SSESTORAGE int32x4_t vqdmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VQDMLSL.S16 q0,d0,d0
_NEON2SSESTORAGE int64x2_t vqdmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VQDMLSL.S32 q0,d0,d0
//Vector long multiply
_NEON2SSESTORAGE int16x8_t vmull_s8(int8x8_t a, int8x8_t b); // VMULL.S8 q0,d0,d0
_NEON2SSESTORAGE int32x4_t vmull_s16(int16x4_t a, int16x4_t b); // VMULL.S16 q0,d0,d0
_NEON2SSESTORAGE int64x2_t vmull_s32(int32x2_t a, int32x2_t b); // VMULL.S32 q0,d0,d0
_NEON2SSESTORAGE uint16x8_t vmull_u8(uint8x8_t a, uint8x8_t b); // VMULL.U8 q0,d0,d0
_NEON2SSESTORAGE uint32x4_t vmull_u16(uint16x4_t a, uint16x4_t b); // VMULL.U16 q0,d0,d0
_NEON2SSESTORAGE uint64x2_t vmull_u32(uint32x2_t a, uint32x2_t b); // VMULL.U32 q0,d0,d0
_NEON2SSESTORAGE poly16x8_t vmull_p8(poly8x8_t a, poly8x8_t b); // VMULL.P8 q0,d0,d0
//Vector saturating doubling long multiply
_NEON2SSESTORAGE int32x4_t vqdmull_s16(int16x4_t a, int16x4_t b); // VQDMULL.S16 q0,d0,d0
_NEON2SSESTORAGE int64x2_t vqdmull_s32(int32x2_t a, int32x2_t b); // VQDMULL.S32 q0,d0,d0
//Subtraction
//Vector subtract
_NEON2SSESTORAGE int8x8_t vsub_s8(int8x8_t a, int8x8_t b); // VSUB.I8 d0,d0,d0
_NEON2SSESTORAGE int16x4_t vsub_s16(int16x4_t a, int16x4_t b); // VSUB.I16 d0,d0,d0
_NEON2SSESTORAGE int32x2_t vsub_s32(int32x2_t a, int32x2_t b); // VSUB.I32 d0,d0,d0
_NEON2SSESTORAGE int64x1_t vsub_s64(int64x1_t a, int64x1_t b); // VSUB.I64 d0,d0,d0
_NEON2SSESTORAGE float32x2_t vsub_f32(float32x2_t a, float32x2_t b); // VSUB.F32 d0,d0,d0
_NEON2SSESTORAGE uint8x8_t vsub_u8(uint8x8_t a, uint8x8_t b); // VSUB.I8 d0,d0,d0
_NEON2SSESTORAGE uint16x4_t vsub_u16(uint16x4_t a, uint16x4_t b); // VSUB.I16 d0,d0,d0
_NEON2SSESTORAGE uint32x2_t vsub_u32(uint32x2_t a, uint32x2_t b); // VSUB.I32 d0,d0,d0
_NEON2SSESTORAGE uint64x1_t vsub_u64(uint64x1_t a, uint64x1_t b); // VSUB.I64 d0,d0,d0
_NEON2SSESTORAGE int8x16_t vsubq_s8(int8x16_t a, int8x16_t b); // VSUB.I8 q0,q0,q0
_NEON2SSESTORAGE int16x8_t vsubq_s16(int16x8_t a, int16x8_t b); // VSUB.I16 q0,q0,q0
_NEON2SSESTORAGE int32x4_t vsubq_s32(int32x4_t a, int32x4_t b); // VSUB.I32 q0,q0,q0
_NEON2SSESTORAGE int64x2_t vsubq_s64(int64x2_t a, int64x2_t b); // VSUB.I64 q0,q0,q0
_NEON2SSESTORAGE float32x4_t vsubq_f32(float32x4_t a, float32x4_t b); // VSUB.F32 q0,q0,q0
_NEON2SSESTORAGE uint8x16_t vsubq_u8(uint8x16_t a, uint8x16_t b); // VSUB.I8 q0,q0,q0
_NEON2SSESTORAGE uint16x8_t vsubq_u16(uint16x8_t a, uint16x8_t b); // VSUB.I16 q0,q0,q0
_NEON2SSESTORAGE uint32x4_t vsubq_u32(uint32x4_t a, uint32x4_t b); // VSUB.I32 q0,q0,q0
_NEON2SSESTORAGE uint64x2_t vsubq_u64(uint64x2_t a, uint64x2_t b); // VSUB.I64 q0,q0,q0
//Vector long subtract: vsubl -> Vr[i]:=Va[i]-Vb[i]
_NEON2SSESTORAGE int16x8_t vsubl_s8(int8x8_t a, int8x8_t b); // VSUBL.S8 q0,d0,d0
_NEON2SSESTORAGE int32x4_t vsubl_s16(int16x4_t a, int16x4_t b); // VSUBL.S16 q0,d0,d0
_NEON2SSESTORAGE int64x2_t vsubl_s32(int32x2_t a, int32x2_t b); // VSUBL.S32 q0,d0,d0
_NEON2SSESTORAGE uint16x8_t vsubl_u8(uint8x8_t a, uint8x8_t b); // VSUBL.U8 q0,d0,d0
_NEON2SSESTORAGE uint32x4_t vsubl_u16(uint16x4_t a, uint16x4_t b); // VSUBL.U16 q0,d0,d0
_NEON2SSESTORAGE uint64x2_t vsubl_u32(uint32x2_t a, uint32x2_t b); // VSUBL.U32 q0,d0,d0
//Vector wide subtract: vsubw -> Vr[i]:=Va[i]-Vb[i]
_NEON2SSESTORAGE int16x8_t vsubw_s8(int16x8_t a, int8x8_t b); // VSUBW.S8 q0,q0,d0
_NEON2SSESTORAGE int32x4_t vsubw_s16(int32x4_t a, int16x4_t b); // VSUBW.S16 q0,q0,d0
_NEON2SSESTORAGE int64x2_t vsubw_s32(int64x2_t a, int32x2_t b); // VSUBW.S32 q0,q0,d0
_NEON2SSESTORAGE uint16x8_t vsubw_u8(uint16x8_t a, uint8x8_t b); // VSUBW.U8 q0,q0,d0
_NEON2SSESTORAGE uint32x4_t vsubw_u16(uint32x4_t a, uint16x4_t b); // VSUBW.U16 q0,q0,d0
_NEON2SSESTORAGE uint64x2_t vsubw_u32(uint64x2_t a, uint32x2_t b); // VSUBW.U32 q0,q0,d0
//Vector saturating subtract
_NEON2SSESTORAGE int8x8_t vqsub_s8(int8x8_t a, int8x8_t b); // VQSUB.S8 d0,d0,d0
_NEON2SSESTORAGE int16x4_t vqsub_s16(int16x4_t a, int16x4_t b); // VQSUB.S16 d0,d0,d0
_NEON2SSESTORAGE int32x2_t vqsub_s32(int32x2_t a, int32x2_t b); // VQSUB.S32 d0,d0,d0
_NEON2SSESTORAGE int64x1_t vqsub_s64(int64x1_t a, int64x1_t b); // VQSUB.S64 d0,d0,d0
_NEON2SSESTORAGE uint8x8_t vqsub_u8(uint8x8_t a, uint8x8_t b); // VQSUB.U8 d0,d0,d0
_NEON2SSESTORAGE uint16x4_t vqsub_u16(uint16x4_t a, uint16x4_t b); // VQSUB.U16 d0,d0,d0
_NEON2SSESTORAGE uint32x2_t vqsub_u32(uint32x2_t a, uint32x2_t b); // VQSUB.U32 d0,d0,d0
_NEON2SSESTORAGE uint64x1_t vqsub_u64(uint64x1_t a, uint64x1_t b); // VQSUB.U64 d0,d0,d0
_NEON2SSESTORAGE int8x16_t vqsubq_s8(int8x16_t a, int8x16_t b); // VQSUB.S8 q0,q0,q0
_NEON2SSESTORAGE int16x8_t vqsubq_s16(int16x8_t a, int16x8_t b); // VQSUB.S16 q0,q0,q0
_NEON2SSESTORAGE int32x4_t vqsubq_s32(int32x4_t a, int32x4_t b); // VQSUB.S32 q0,q0,q0
_NEON2SSESTORAGE int64x2_t vqsubq_s64(int64x2_t a, int64x2_t b); // VQSUB.S64 q0,q0,q0
_NEON2SSESTORAGE uint8x16_t vqsubq_u8(uint8x16_t a, uint8x16_t b); // VQSUB.U8 q0,q0,q0
_NEON2SSESTORAGE uint16x8_t vqsubq_u16(uint16x8_t a, uint16x8_t b); // VQSUB.U16 q0,q0,q0
_NEON2SSESTORAGE uint32x4_t vqsubq_u32(uint32x4_t a, uint32x4_t b); // VQSUB.U32 q0,q0,q0
_NEON2SSESTORAGE uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b); // VQSUB.U64 q0,q0,q0
//Vector halving subtract
_NEON2SSESTORAGE int8x8_t vhsub_s8(int8x8_t a, int8x8_t b); // VHSUB.S8 d0,d0,d0
_NEON2SSESTORAGE int16x4_t vhsub_s16(int16x4_t a, int16x4_t b); // VHSUB.S16 d0,d0,d0
_NEON2SSESTORAGE int32x2_t vhsub_s32(int32x2_t a, int32x2_t b); // VHSUB.S32 d0,d0,d0
_NEON2SSESTORAGE uint8x8_t vhsub_u8(uint8x8_t a, uint8x8_t b); // VHSUB.U8 d0,d0,d0
_NEON2SSESTORAGE uint16x4_t vhsub_u16(uint16x4_t a, uint16x4_t b); // VHSUB.U16 d0,d0,d0
_NEON2SSESTORAGE uint32x2_t vhsub_u32(uint32x2_t a, uint32x2_t b); // VHSUB.U32 d0,d0,d0
_NEON2SSESTORAGE int8x16_t vhsubq_s8(int8x16_t a, int8x16_t b); // VHSUB.S8 q0,q0,q0
_NEON2SSESTORAGE int16x8_t vhsubq_s16(int16x8_t a, int16x8_t b); // VHSUB.S16 q0,q0,q0
_NEON2SSESTORAGE int32x4_t vhsubq_s32(int32x4_t a, int32x4_t b); // VHSUB.S32 q0,q0,q0
_NEON2SSESTORAGE uint8x16_t vhsubq_u8(uint8x16_t a, uint8x16_t b); // VHSUB.U8 q0,q0,q0
_NEON2SSESTORAGE uint16x8_t vhsubq_u16(uint16x8_t a, uint16x8_t b); // VHSUB.U16 q0,q0,q0
_NEON2SSESTORAGE uint32x4_t vhsubq_u32(uint32x4_t a, uint32x4_t b); // VHSUB.U32 q0,q0,q0
698*80a68eefSBob Badour //Vector subtract high half
699*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vsubhn_s16(int16x8_t a, int16x8_t b); // VSUBHN.I16 d0,q0,q0
700*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vsubhn_s32(int32x4_t a, int32x4_t b); // VSUBHN.I32 d0,q0,q0
701*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vsubhn_s64(int64x2_t a, int64x2_t b); // VSUBHN.I64 d0,q0,q0
702*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vsubhn_u16(uint16x8_t a, uint16x8_t b); // VSUBHN.I16 d0,q0,q0
703*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vsubhn_u32(uint32x4_t a, uint32x4_t b); // VSUBHN.I32 d0,q0,q0
704*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vsubhn_u64(uint64x2_t a, uint64x2_t b); // VSUBHN.I64 d0,q0,q0
705*80a68eefSBob Badour //Vector rounding subtract high half
706*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vrsubhn_s16(int16x8_t a, int16x8_t b); // VRSUBHN.I16 d0,q0,q0
707*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vrsubhn_s32(int32x4_t a, int32x4_t b); // VRSUBHN.I32 d0,q0,q0
708*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vrsubhn_s64(int64x2_t a, int64x2_t b); // VRSUBHN.I64 d0,q0,q0
709*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vrsubhn_u16(uint16x8_t a, uint16x8_t b); // VRSUBHN.I16 d0,q0,q0
710*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vrsubhn_u32(uint32x4_t a, uint32x4_t b); // VRSUBHN.I32 d0,q0,q0
711*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vrsubhn_u64(uint64x2_t a, uint64x2_t b); // VRSUBHN.I64 d0,q0,q0
712*80a68eefSBob Badour //Comparison
713*80a68eefSBob Badour //Vector compare equal
714*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vceq_s8(int8x8_t a, int8x8_t b); // VCEQ.I8 d0, d0, d0
715*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vceq_s16(int16x4_t a, int16x4_t b); // VCEQ.I16 d0, d0, d0
716*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vceq_s32(int32x2_t a, int32x2_t b); // VCEQ.I32 d0, d0, d0
717*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vceq_f32(float32x2_t a, float32x2_t b); // VCEQ.F32 d0, d0, d0
718*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vceq_u8(uint8x8_t a, uint8x8_t b); // VCEQ.I8 d0, d0, d0
719*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vceq_u16(uint16x4_t a, uint16x4_t b); // VCEQ.I16 d0, d0, d0
720*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vceq_u32(uint32x2_t a, uint32x2_t b); // VCEQ.I32 d0, d0, d0
721*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vceq_p8(poly8x8_t a, poly8x8_t b); // VCEQ.I8 d0, d0, d0
722*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vceqq_s8(int8x16_t a, int8x16_t b); // VCEQ.I8 q0, q0, q0
723*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vceqq_s16(int16x8_t a, int16x8_t b); // VCEQ.I16 q0, q0, q0
724*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vceqq_s32(int32x4_t a, int32x4_t b); // VCEQ.I32 q0, q0, q0
725*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vceqq_f32(float32x4_t a, float32x4_t b); // VCEQ.F32 q0, q0, q0
726*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vceqq_u8(uint8x16_t a, uint8x16_t b); // VCEQ.I8 q0, q0, q0
727*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vceqq_u16(uint16x8_t a, uint16x8_t b); // VCEQ.I16 q0, q0, q0
728*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vceqq_u32(uint32x4_t a, uint32x4_t b); // VCEQ.I32 q0, q0, q0
729*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vceqq_p8(poly8x16_t a, poly8x16_t b); // VCEQ.I8 q0, q0, q0
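//Illustrative sketch (not part of the original header, excluded from the build): the compare intrinsics
//return per-lane masks - all ones where the condition holds, all zeros elsewhere - which is why their result
//types are unsigned; vst1q_u32 is assumed to be declared elsewhere in this header.
#if 0
static void neon2sse_example_vceq(void) //hypothetical name, written as user code that includes this header
{
    static const int32_t va[4] = { 1, 2, 3, 4 };
    static const int32_t vb[4] = { 1, 0, 3, 0 };
    uint32_t mask[4];
    vst1q_u32(mask, vceqq_s32(vld1q_s32(va), vld1q_s32(vb)));
    //mask == { 0xFFFFFFFF, 0, 0xFFFFFFFF, 0 }: suitable for bitwise selection (e.g. with vbslq_s32)
}
#endif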
730*80a68eefSBob Badour //Vector compare greater-than or equal
731*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vcge_s8(int8x8_t a, int8x8_t b); // VCGE.S8 d0, d0, d0
732*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vcge_s16(int16x4_t a, int16x4_t b); // VCGE.S16 d0, d0, d0
733*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vcge_s32(int32x2_t a, int32x2_t b); // VCGE.S32 d0, d0, d0
734*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vcge_f32(float32x2_t a, float32x2_t b); // VCGE.F32 d0, d0, d0
735*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vcge_u8(uint8x8_t a, uint8x8_t b); // VCGE.U8 d0, d0, d0
736*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vcge_u16(uint16x4_t a, uint16x4_t b); // VCGE.U16 d0, d0, d0
737*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vcge_u32(uint32x2_t a, uint32x2_t b); // VCGE.U32 d0, d0, d0
738*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vcgeq_s8(int8x16_t a, int8x16_t b); // VCGE.S8 q0, q0, q0
739*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vcgeq_s16(int16x8_t a, int16x8_t b); // VCGE.S16 q0, q0, q0
740*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vcgeq_s32(int32x4_t a, int32x4_t b); // VCGE.S32 q0, q0, q0
741*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vcgeq_f32(float32x4_t a, float32x4_t b); // VCGE.F32 q0, q0, q0
742*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vcgeq_u8(uint8x16_t a, uint8x16_t b); // VCGE.U8 q0, q0, q0
743*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vcgeq_u16(uint16x8_t a, uint16x8_t b); // VCGE.U16 q0, q0, q0
744*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vcgeq_u32(uint32x4_t a, uint32x4_t b); // VCGE.U32 q0, q0, q0
745*80a68eefSBob Badour //Vector compare less-than or equal
746*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vcle_s8(int8x8_t a, int8x8_t b); // VCGE.S8 d0, d0, d0
747*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vcle_s16(int16x4_t a, int16x4_t b); // VCGE.S16 d0, d0, d0
748*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vcle_s32(int32x2_t a, int32x2_t b); // VCGE.S32 d0, d0, d0
749*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vcle_f32(float32x2_t a, float32x2_t b); // VCGE.F32 d0, d0, d0
750*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vcle_u8(uint8x8_t a, uint8x8_t b); // VCGE.U8 d0, d0, d0
751*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vcle_u16(uint16x4_t a, uint16x4_t b); // VCGE.U16 d0, d0, d0
752*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vcle_u32(uint32x2_t a, uint32x2_t b); // VCGE.U32 d0, d0, d0
753*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vcleq_s8(int8x16_t a, int8x16_t b); // VCGE.S8 q0, q0, q0
754*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vcleq_s16(int16x8_t a, int16x8_t b); // VCGE.S16 q0, q0, q0
755*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vcleq_s32(int32x4_t a, int32x4_t b); // VCGE.S32 q0, q0, q0
756*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vcleq_f32(float32x4_t a, float32x4_t b); // VCGE.F32 q0, q0, q0
757*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vcleq_u8(uint8x16_t a, uint8x16_t b); // VCGE.U8 q0, q0, q0
758*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vcleq_u16(uint16x8_t a, uint16x8_t b); // VCGE.U16 q0, q0, q0
759*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vcleq_u32(uint32x4_t a, uint32x4_t b); // VCGE.U32 q0, q0, q0
760*80a68eefSBob Badour //Vector compare greater-than
761*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vcgt_s8(int8x8_t a, int8x8_t b); // VCGT.S8 d0, d0, d0
762*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vcgt_s16(int16x4_t a, int16x4_t b); // VCGT.S16 d0, d0, d0
763*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vcgt_s32(int32x2_t a, int32x2_t b); // VCGT.S32 d0, d0, d0
764*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vcgt_f32(float32x2_t a, float32x2_t b); // VCGT.F32 d0, d0, d0
765*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vcgt_u8(uint8x8_t a, uint8x8_t b); // VCGT.U8 d0, d0, d0
766*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vcgt_u16(uint16x4_t a, uint16x4_t b); // VCGT.U16 d0, d0, d0
767*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vcgt_u32(uint32x2_t a, uint32x2_t b); // VCGT.U32 d0, d0, d0
768*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vcgtq_s8(int8x16_t a, int8x16_t b); // VCGT.S8 q0, q0, q0
769*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vcgtq_s16(int16x8_t a, int16x8_t b); // VCGT.S16 q0, q0, q0
770*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vcgtq_s32(int32x4_t a, int32x4_t b); // VCGT.S32 q0, q0, q0
771*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vcgtq_f32(float32x4_t a, float32x4_t b); // VCGT.F32 q0, q0, q0
772*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vcgtq_u8(uint8x16_t a, uint8x16_t b); // VCGT.U8 q0, q0, q0
773*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vcgtq_u16(uint16x8_t a, uint16x8_t b); // VCGT.U16 q0, q0, q0
774*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vcgtq_u32(uint32x4_t a, uint32x4_t b); // VCGT.U32 q0, q0, q0
775*80a68eefSBob Badour //Vector compare less-than
776*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vclt_s8(int8x8_t a, int8x8_t b); // VCGT.S8 d0, d0, d0
777*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vclt_s16(int16x4_t a, int16x4_t b); // VCGT.S16 d0, d0, d0
778*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vclt_s32(int32x2_t a, int32x2_t b); // VCGT.S32 d0, d0, d0
779*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vclt_f32(float32x2_t a, float32x2_t b); // VCGT.F32 d0, d0, d0
780*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vclt_u8(uint8x8_t a, uint8x8_t b); // VCGT.U8 d0, d0, d0
781*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vclt_u16(uint16x4_t a, uint16x4_t b); // VCGT.U16 d0, d0, d0
782*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vclt_u32(uint32x2_t a, uint32x2_t b); // VCGT.U32 d0, d0, d0
783*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vcltq_s8(int8x16_t a, int8x16_t b); // VCGT.S8 q0, q0, q0
784*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vcltq_s16(int16x8_t a, int16x8_t b); // VCGT.S16 q0, q0, q0
785*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vcltq_s32(int32x4_t a, int32x4_t b); // VCGT.S32 q0, q0, q0
786*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vcltq_f32(float32x4_t a, float32x4_t b); // VCGT.F32 q0, q0, q0
787*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vcltq_u8(uint8x16_t a, uint8x16_t b); // VCGT.U8 q0, q0, q0
788*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vcltq_u16(uint16x8_t a, uint16x8_t b); // VCGT.U16 q0, q0, q0
789*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vcltq_u32(uint32x4_t a, uint32x4_t b); // VCGT.U32 q0, q0, q0
790*80a68eefSBob Badour //Vector compare absolute greater-than or equal
791*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vcage_f32(float32x2_t a, float32x2_t b); // VACGE.F32 d0, d0, d0
792*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vcageq_f32(float32x4_t a, float32x4_t b); // VACGE.F32 q0, q0, q0
793*80a68eefSBob Badour //Vector compare absolute less-than or equal
794*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vcale_f32(float32x2_t a, float32x2_t b); // VACGE.F32 d0, d0, d0
795*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vcaleq_f32(float32x4_t a, float32x4_t b); // VACGE.F32 q0, q0, q0
796*80a68eefSBob Badour //Vector compare absolute greater-than
797*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vcagt_f32(float32x2_t a, float32x2_t b); // VACGT.F32 d0, d0, d0
798*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vcagtq_f32(float32x4_t a, float32x4_t b); // VACGT.F32 q0, q0, q0
799*80a68eefSBob Badour //Vector compare absolute less-than
800*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vcalt_f32(float32x2_t a, float32x2_t b); // VACGT.F32 d0, d0, d0
801*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vcaltq_f32(float32x4_t a, float32x4_t b); // VACGT.F32 q0, q0, q0
802*80a68eefSBob Badour //Vector test bits
803*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vtst_s8(int8x8_t a, int8x8_t b); // VTST.8 d0, d0, d0
804*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vtst_s16(int16x4_t a, int16x4_t b); // VTST.16 d0, d0, d0
805*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vtst_s32(int32x2_t a, int32x2_t b); // VTST.32 d0, d0, d0
806*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vtst_u8(uint8x8_t a, uint8x8_t b); // VTST.8 d0, d0, d0
807*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vtst_u16(uint16x4_t a, uint16x4_t b); // VTST.16 d0, d0, d0
808*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vtst_u32(uint32x2_t a, uint32x2_t b); // VTST.32 d0, d0, d0
809*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vtst_p8(poly8x8_t a, poly8x8_t b); // VTST.8 d0, d0, d0
810*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vtstq_s8(int8x16_t a, int8x16_t b); // VTST.8 q0, q0, q0
811*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vtstq_s16(int16x8_t a, int16x8_t b); // VTST.16 q0, q0, q0
812*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vtstq_s32(int32x4_t a, int32x4_t b); // VTST.32 q0, q0, q0
813*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vtstq_u8(uint8x16_t a, uint8x16_t b); // VTST.8 q0, q0, q0
814*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vtstq_u16(uint16x8_t a, uint16x8_t b); // VTST.16 q0, q0, q0
815*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vtstq_u32(uint32x4_t a, uint32x4_t b); // VTST.32 q0, q0, q0
816*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vtstq_p8(poly8x16_t a, poly8x16_t b); // VTST.8 q0, q0, q0
817*80a68eefSBob Badour //Absolute difference
818*80a68eefSBob Badour //Absolute difference between the arguments: Vr[i] = | Va[i] - Vb[i] |
819*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vabd_s8(int8x8_t a, int8x8_t b); // VABD.S8 d0,d0,d0
820*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vabd_s16(int16x4_t a, int16x4_t b); // VABD.S16 d0,d0,d0
821*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vabd_s32(int32x2_t a, int32x2_t b); // VABD.S32 d0,d0,d0
822*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vabd_u8(uint8x8_t a, uint8x8_t b); // VABD.U8 d0,d0,d0
823*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vabd_u16(uint16x4_t a, uint16x4_t b); // VABD.U16 d0,d0,d0
824*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vabd_u32(uint32x2_t a, uint32x2_t b); // VABD.U32 d0,d0,d0
825*80a68eefSBob Badour _NEON2SSESTORAGE float32x2_t vabd_f32(float32x2_t a, float32x2_t b); // VABD.F32 d0,d0,d0
826*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t vabdq_s8(int8x16_t a, int8x16_t b); // VABD.S8 q0,q0,q0
827*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vabdq_s16(int16x8_t a, int16x8_t b); // VABD.S16 q0,q0,q0
828*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vabdq_s32(int32x4_t a, int32x4_t b); // VABD.S32 q0,q0,q0
829*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vabdq_u8(uint8x16_t a, uint8x16_t b); // VABD.U8 q0,q0,q0
830*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vabdq_u16(uint16x8_t a, uint16x8_t b); // VABD.U16 q0,q0,q0
831*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vabdq_u32(uint32x4_t a, uint32x4_t b); // VABD.U32 q0,q0,q0
832*80a68eefSBob Badour _NEON2SSESTORAGE float32x4_t vabdq_f32(float32x4_t a, float32x4_t b); // VABD.F32 q0,q0,q0
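//Illustrative sketch (not part of the original header, excluded from the build): vabd takes the element-wise
//absolute difference, so the result is symmetric in a and b and never overflows the element type;
//vst1_u8 is assumed to be declared elsewhere in this header.
#if 0
static void neon2sse_example_vabd(void) //hypothetical name, written as user code that includes this header
{
    static const uint8_t va[8] = { 10, 200, 0, 255, 7, 7, 7, 7 };
    static const uint8_t vb[8] = { 200, 10, 255, 0, 7, 7, 7, 7 };
    uint8_t vr[8];
    vst1_u8(vr, vabd_u8(vld1_u8(va), vld1_u8(vb)));
    //vr == { 190, 190, 255, 255, 0, 0, 0, 0 }
}
#endif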
833*80a68eefSBob Badour //Absolute difference - long
834*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vabdl_s8(int8x8_t a, int8x8_t b); // VABDL.S8 q0,d0,d0
835*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vabdl_s16(int16x4_t a, int16x4_t b); // VABDL.S16 q0,d0,d0
836*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vabdl_s32(int32x2_t a, int32x2_t b); // VABDL.S32 q0,d0,d0
837*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vabdl_u8(uint8x8_t a, uint8x8_t b); // VABDL.U8 q0,d0,d0
838*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vabdl_u16(uint16x4_t a, uint16x4_t b); // VABDL.U16 q0,d0,d0
839*80a68eefSBob Badour _NEON2SSESTORAGE uint64x2_t vabdl_u32(uint32x2_t a, uint32x2_t b); // VABDL.U32 q0,d0,d0
840*80a68eefSBob Badour //Absolute difference and accumulate: Vr[i] = Va[i] + | Vb[i] - Vc[i] |
841*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vaba_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VABA.S8 d0,d0,d0
842*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vaba_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VABA.S16 d0,d0,d0
843*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vaba_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VABA.S32 d0,d0,d0
844*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vaba_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VABA.U8 d0,d0,d0
845*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vaba_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VABA.U16 d0,d0,d0
846*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vaba_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VABA.U32 d0,d0,d0
847*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t vabaq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VABA.S8 q0,q0,q0
848*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vabaq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VABA.S16 q0,q0,q0
849*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vabaq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VABA.S32 q0,q0,q0
850*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vabaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VABA.U8 q0,q0,q0
851*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vabaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VABA.U16 q0,q0,q0
852*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vabaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VABA.U32 q0,q0,q0
853*80a68eefSBob Badour //Absolute difference and accumulate - long
854*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vabal_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VABAL.S8 q0,d0,d0
855*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vabal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VABAL.S16 q0,d0,d0
856*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vabal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VABAL.S32 q0,d0,d0
857*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vabal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VABAL.U8 q0,d0,d0
858*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vabal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VABAL.U16 q0,d0,d0
859*80a68eefSBob Badour _NEON2SSESTORAGE uint64x2_t vabal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VABAL.U32 q0,d0,d0
860*80a68eefSBob Badour //Max/Min
861*80a68eefSBob Badour //vmax -> Vr[i] := (Va[i] >= Vb[i]) ? Va[i] : Vb[i]
862*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vmax_s8(int8x8_t a, int8x8_t b); // VMAX.S8 d0,d0,d0
863*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vmax_s16(int16x4_t a, int16x4_t b); // VMAX.S16 d0,d0,d0
864*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vmax_s32(int32x2_t a, int32x2_t b); // VMAX.S32 d0,d0,d0
865*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vmax_u8(uint8x8_t a, uint8x8_t b); // VMAX.U8 d0,d0,d0
866*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vmax_u16(uint16x4_t a, uint16x4_t b); // VMAX.U16 d0,d0,d0
867*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vmax_u32(uint32x2_t a, uint32x2_t b); // VMAX.U32 d0,d0,d0
868*80a68eefSBob Badour _NEON2SSESTORAGE float32x2_t vmax_f32(float32x2_t a, float32x2_t b); // VMAX.F32 d0,d0,d0
869*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t vmaxq_s8(int8x16_t a, int8x16_t b); // VMAX.S8 q0,q0,q0
870*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vmaxq_s16(int16x8_t a, int16x8_t b); // VMAX.S16 q0,q0,q0
871*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vmaxq_s32(int32x4_t a, int32x4_t b); // VMAX.S32 q0,q0,q0
872*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vmaxq_u8(uint8x16_t a, uint8x16_t b); // VMAX.U8 q0,q0,q0
873*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vmaxq_u16(uint16x8_t a, uint16x8_t b); // VMAX.U16 q0,q0,q0
874*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vmaxq_u32(uint32x4_t a, uint32x4_t b); // VMAX.U32 q0,q0,q0
875*80a68eefSBob Badour _NEON2SSESTORAGE float32x4_t vmaxq_f32(float32x4_t a, float32x4_t b); // VMAX.F32 q0,q0,q0
876*80a68eefSBob Badour 
877*80a68eefSBob Badour _NEON2SSESTORAGE float64x2_t vmaxq_f64(float64x2_t a, float64x2_t b); // VMAX.F64 q0,q0,q0
878*80a68eefSBob Badour 
879*80a68eefSBob Badour //vmin -> Vr[i] := (Va[i] >= Vb[i]) ? Vb[i] : Va[i]
880*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vmin_s8(int8x8_t a, int8x8_t b); // VMIN.S8 d0,d0,d0
881*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vmin_s16(int16x4_t a, int16x4_t b); // VMIN.S16 d0,d0,d0
882*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vmin_s32(int32x2_t a, int32x2_t b); // VMIN.S32 d0,d0,d0
883*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vmin_u8(uint8x8_t a, uint8x8_t b); // VMIN.U8 d0,d0,d0
884*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vmin_u16(uint16x4_t a, uint16x4_t b); // VMIN.U16 d0,d0,d0
885*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vmin_u32(uint32x2_t a, uint32x2_t b); // VMIN.U32 d0,d0,d0
886*80a68eefSBob Badour _NEON2SSESTORAGE float32x2_t vmin_f32(float32x2_t a, float32x2_t b); // VMIN.F32 d0,d0,d0
887*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t vminq_s8(int8x16_t a, int8x16_t b); // VMIN.S8 q0,q0,q0
888*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vminq_s16(int16x8_t a, int16x8_t b); // VMIN.S16 q0,q0,q0
889*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vminq_s32(int32x4_t a, int32x4_t b); // VMIN.S32 q0,q0,q0
890*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vminq_u8(uint8x16_t a, uint8x16_t b); // VMIN.U8 q0,q0,q0
891*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vminq_u16(uint16x8_t a, uint16x8_t b); // VMIN.U16 q0,q0,q0
892*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vminq_u32(uint32x4_t a, uint32x4_t b); // VMIN.U32 q0,q0,q0
893*80a68eefSBob Badour _NEON2SSESTORAGE float32x4_t vminq_f32(float32x4_t a, float32x4_t b); // VMIN.F32 q0,q0,q0
894*80a68eefSBob Badour 
895*80a68eefSBob Badour _NEON2SSESTORAGE float64x2_t vminq_f64(float64x2_t a, float64x2_t b); // VMIN.F64 q0,q0,q0
896*80a68eefSBob Badour 
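//Illustrative sketch (not part of the original header, excluded from the build): combining the element-wise
//max and min above gives a branch-free per-lane clamp; vst1q_f32 is assumed to be declared elsewhere in this header.
#if 0
static void neon2sse_example_clamp(void) //hypothetical name, written as user code that includes this header
{
    static const float32_t vin[4] = { -2.0f, 0.25f, 0.75f, 3.0f };
    static const float32_t vlo[4] = {  0.0f, 0.0f,  0.0f,  0.0f };
    static const float32_t vhi[4] = {  1.0f, 1.0f,  1.0f,  1.0f };
    float32_t vr[4];
    vst1q_f32(vr, vminq_f32(vmaxq_f32(vld1q_f32(vin), vld1q_f32(vlo)), vld1q_f32(vhi)));
    //vr == { 0.0f, 0.25f, 0.75f, 1.0f }: each lane clamped to [0, 1]
}
#endif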
897*80a68eefSBob Badour //Pairwise addition
898*80a68eefSBob Badour //Pairwise add
899*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vpadd_s8(int8x8_t a, int8x8_t b); // VPADD.I8 d0,d0,d0
900*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vpadd_s16(int16x4_t a, int16x4_t b); // VPADD.I16 d0,d0,d0
901*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vpadd_s32(int32x2_t a, int32x2_t b); // VPADD.I32 d0,d0,d0
902*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vpadd_u8(uint8x8_t a, uint8x8_t b); // VPADD.I8 d0,d0,d0
903*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vpadd_u16(uint16x4_t a, uint16x4_t b); // VPADD.I16 d0,d0,d0
904*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vpadd_u32(uint32x2_t a, uint32x2_t b); // VPADD.I32 d0,d0,d0
905*80a68eefSBob Badour _NEON2SSESTORAGE float32x2_t vpadd_f32(float32x2_t a, float32x2_t b); // VPADD.F32 d0,d0,d0
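//Illustrative sketch (not part of the original header, excluded from the build): vpadd sums adjacent pairs,
//with pairs from the first operand filling the low half of the result and pairs from the second operand the
//high half; vst1_s16 is assumed to be declared elsewhere in this header.
#if 0
static void neon2sse_example_vpadd(void) //hypothetical name, written as user code that includes this header
{
    static const int16_t va[4] = { 1, 2, 3, 4 };
    static const int16_t vb[4] = { 10, 20, 30, 40 };
    int16_t vr[4];
    vst1_s16(vr, vpadd_s16(vld1_s16(va), vld1_s16(vb)));
    //vr == { 3, 7, 30, 70 }
}
#endif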
906*80a68eefSBob Badour //Long pairwise add
907*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vpaddl_s8(int8x8_t a); // VPADDL.S8 d0,d0
908*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vpaddl_s16(int16x4_t a); // VPADDL.S16 d0,d0
909*80a68eefSBob Badour _NEON2SSESTORAGE int64x1_t vpaddl_s32(int32x2_t a); // VPADDL.S32 d0,d0
910*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vpaddl_u8(uint8x8_t a); // VPADDL.U8 d0,d0
911*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vpaddl_u16(uint16x4_t a); // VPADDL.U16 d0,d0
912*80a68eefSBob Badour _NEON2SSESTORAGE uint64x1_t vpaddl_u32(uint32x2_t a); // VPADDL.U32 d0,d0
913*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vpaddlq_s8(int8x16_t a); // VPADDL.S8 q0,q0
914*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vpaddlq_s16(int16x8_t a); // VPADDL.S16 q0,q0
915*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vpaddlq_s32(int32x4_t a); // VPADDL.S32 q0,q0
916*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vpaddlq_u8(uint8x16_t a); // VPADDL.U8 q0,q0
917*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vpaddlq_u16(uint16x8_t a); // VPADDL.U16 q0,q0
918*80a68eefSBob Badour _NEON2SSESTORAGE uint64x2_t vpaddlq_u32(uint32x4_t a); // VPADDL.U32 q0,q0
919*80a68eefSBob Badour //Long pairwise add and accumulate
920*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vpadal_s8(int16x4_t a, int8x8_t b); // VPADAL.S8 d0,d0
921*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vpadal_s16(int32x2_t a, int16x4_t b); // VPADAL.S16 d0,d0
922*80a68eefSBob Badour _NEON2SSESTORAGE int64x1_t vpadal_s32(int64x1_t a, int32x2_t b); // VPADAL.S32 d0,d0
923*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vpadal_u8(uint16x4_t a, uint8x8_t b); // VPADAL.U8 d0,d0
924*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vpadal_u16(uint32x2_t a, uint16x4_t b); // VPADAL.U16 d0,d0
925*80a68eefSBob Badour _NEON2SSESTORAGE uint64x1_t vpadal_u32(uint64x1_t a, uint32x2_t b); // VPADAL.U32 d0,d0
926*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vpadalq_s8(int16x8_t a, int8x16_t b); // VPADAL.S8 q0,q0
927*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vpadalq_s16(int32x4_t a, int16x8_t b); // VPADAL.S16 q0,q0
928*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vpadalq_s32(int64x2_t a, int32x4_t b); // VPADAL.S32 q0,q0
929*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vpadalq_u8(uint16x8_t a, uint8x16_t b); // VPADAL.U8 q0,q0
930*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vpadalq_u16(uint32x4_t a, uint16x8_t b); // VPADAL.U16 q0,q0
931*80a68eefSBob Badour _NEON2SSESTORAGE uint64x2_t vpadalq_u32(uint64x2_t a, uint32x4_t b); // VPADAL.U32 q0,q0
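//Illustrative sketch (not part of the original header, excluded from the build): vpaddl widens while summing
//adjacent pairs and vpadal accumulates further pair sums into the widened lanes - the usual way to total many
//bytes without overflow; vst1q_u16 is assumed to be declared elsewhere in this header.
#if 0
static void neon2sse_example_vpadal(void) //hypothetical name, written as user code that includes this header
{
    static const uint8_t bytes[16] = { 255, 255, 255, 255, 255, 255, 255, 255,
                                       255, 255, 255, 255, 255, 255, 255, 255 };
    uint16_t sums[8];
    uint16x8_t acc = vpaddlq_u8(vld1q_u8(bytes));  //eight 16-bit pair sums, 510 each
    acc = vpadalq_u8(acc, vld1q_u8(bytes));        //add another 16 bytes into the same 16-bit lanes
    vst1q_u16(sums, acc);                          //sums[i] == 1020
}
#endif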
932*80a68eefSBob Badour //Folding maximum: vpmax takes the maximum of each adjacent pair of elements; pairs from Va fill the low half of the result, pairs from Vb the high half
933*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vpmax_s8(int8x8_t a, int8x8_t b); // VPMAX.S8 d0,d0,d0
934*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vpmax_s16(int16x4_t a, int16x4_t b); // VPMAX.S16 d0,d0,d0
935*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vpmax_s32(int32x2_t a, int32x2_t b); // VPMAX.S32 d0,d0,d0
936*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vpmax_u8(uint8x8_t a, uint8x8_t b); // VPMAX.U8 d0,d0,d0
937*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vpmax_u16(uint16x4_t a, uint16x4_t b); // VPMAX.U16 d0,d0,d0
938*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vpmax_u32(uint32x2_t a, uint32x2_t b); // VPMAX.U32 d0,d0,d0
939*80a68eefSBob Badour _NEON2SSESTORAGE float32x2_t vpmax_f32(float32x2_t a, float32x2_t b); // VPMAX.F32 d0,d0,d0
940*80a68eefSBob Badour //Folding minimum: vpmin takes the minimum of each adjacent pair of elements; pairs from Va fill the low half of the result, pairs from Vb the high half
941*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vpmin_s8(int8x8_t a, int8x8_t b); // VPMIN.S8 d0,d0,d0
942*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vpmin_s16(int16x4_t a, int16x4_t b); // VPMIN.S16 d0,d0,d0
943*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vpmin_s32(int32x2_t a, int32x2_t b); // VPMIN.S32 d0,d0,d0
944*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vpmin_u8(uint8x8_t a, uint8x8_t b); // VPMIN.U8 d0,d0,d0
945*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vpmin_u16(uint16x4_t a, uint16x4_t b); // VPMIN.U16 d0,d0,d0
946*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vpmin_u32(uint32x2_t a, uint32x2_t b); // VPMIN.U32 d0,d0,d0
947*80a68eefSBob Badour _NEON2SSESTORAGE float32x2_t vpmin_f32(float32x2_t a, float32x2_t b); // VPMIN.F32 d0,d0,d0
948*80a68eefSBob Badour //Reciprocal/Sqrt
949*80a68eefSBob Badour _NEON2SSESTORAGE float32x2_t vrecps_f32(float32x2_t a, float32x2_t b); // VRECPS.F32 d0, d0, d0
950*80a68eefSBob Badour _NEON2SSESTORAGE float32x4_t vrecpsq_f32(float32x4_t a, float32x4_t b); // VRECPS.F32 q0, q0, q0
951*80a68eefSBob Badour _NEON2SSESTORAGE float32x2_t vrsqrts_f32(float32x2_t a, float32x2_t b); // VRSQRTS.F32 d0, d0, d0
952*80a68eefSBob Badour _NEON2SSESTORAGE float32x4_t vrsqrtsq_f32(float32x4_t a, float32x4_t b); // VRSQRTS.F32 q0, q0, q0
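//Illustrative sketch (not part of the original header, excluded from the build): vrecps returns the
//Newton-Raphson correction term (2 - a*b), so one refinement step of a reciprocal estimate is x = x*(2 - a*x),
//roughly doubling the number of accurate bits; vrecpe_f32, vmul_f32 and vst1_f32 are assumed to be declared
//elsewhere in this header.
#if 0
static void neon2sse_example_recip(void) //hypothetical name, written as user code that includes this header
{
    static const float32_t va[2] = { 3.0f, 8.0f };
    float32_t vr[2];
    float32x2_t a = vld1_f32(va);
    float32x2_t x = vrecpe_f32(a);      //rough 1/a estimate
    x = vmul_f32(x, vrecps_f32(a, x));  //one Newton-Raphson step
    vst1_f32(vr, x);                    //vr[0] close to 1/3, vr[1] close to 1/8
}
#endif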
953*80a68eefSBob Badour //Shifts by signed variable
954*80a68eefSBob Badour //Vector shift left: Vr[i] := Va[i] << Vb[i] (negative values shift right)
955*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vshl_s8(int8x8_t a, int8x8_t b); // VSHL.S8 d0,d0,d0
956*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vshl_s16(int16x4_t a, int16x4_t b); // VSHL.S16 d0,d0,d0
957*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vshl_s32(int32x2_t a, int32x2_t b); // VSHL.S32 d0,d0,d0
958*80a68eefSBob Badour _NEON2SSESTORAGE int64x1_t vshl_s64(int64x1_t a, int64x1_t b); // VSHL.S64 d0,d0,d0
959*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vshl_u8(uint8x8_t a, int8x8_t b); // VSHL.U8 d0,d0,d0
960*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vshl_u16(uint16x4_t a, int16x4_t b); // VSHL.U16 d0,d0,d0
961*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vshl_u32(uint32x2_t a, int32x2_t b); // VSHL.U32 d0,d0,d0
962*80a68eefSBob Badour _NEON2SSESTORAGE uint64x1_t vshl_u64(uint64x1_t a, int64x1_t b); // VSHL.U64 d0,d0,d0
963*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t vshlq_s8(int8x16_t a, int8x16_t b); // VSHL.S8 q0,q0,q0
964*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vshlq_s16(int16x8_t a, int16x8_t b); // VSHL.S16 q0,q0,q0
965*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vshlq_s32(int32x4_t a, int32x4_t b); // VSHL.S32 q0,q0,q0
966*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vshlq_s64(int64x2_t a, int64x2_t b); // VSHL.S64 q0,q0,q0
967*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vshlq_u8(uint8x16_t a, int8x16_t b); // VSHL.U8 q0,q0,q0
968*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vshlq_u16(uint16x8_t a, int16x8_t b); // VSHL.U16 q0,q0,q0
969*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vshlq_u32(uint32x4_t a, int32x4_t b); // VSHL.U32 q0,q0,q0
970*80a68eefSBob Badour _NEON2SSESTORAGE uint64x2_t vshlq_u64(uint64x2_t a, int64x2_t b); // VSHL.U64 q0,q0,q0
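//Illustrative sketch (not part of the original header, excluded from the build): the per-lane shift count is
//signed, so one vshl call can shift different lanes left or right (arithmetic right shifts for the signed
//forms); vst1q_s32 is assumed to be declared elsewhere in this header.
#if 0
static void neon2sse_example_vshl(void) //hypothetical name, written as user code that includes this header
{
    static const int32_t va[4]  = { 16, 16, -16, -16 };
    static const int32_t vsh[4] = {  2, -2,   2,  -2 };
    int32_t vr[4];
    vst1q_s32(vr, vshlq_s32(vld1q_s32(va), vld1q_s32(vsh)));
    //vr == { 64, 4, -64, -4 }
}
#endif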
971*80a68eefSBob Badour //Vector saturating shift left (negative values shift right)
972*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vqshl_s8(int8x8_t a, int8x8_t b); // VQSHL.S8 d0,d0,d0
973*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vqshl_s16(int16x4_t a, int16x4_t b); // VQSHL.S16 d0,d0,d0
974*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vqshl_s32(int32x2_t a, int32x2_t b); // VQSHL.S32 d0,d0,d0
975*80a68eefSBob Badour _NEON2SSESTORAGE int64x1_t vqshl_s64(int64x1_t a, int64x1_t b); // VQSHL.S64 d0,d0,d0
976*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vqshl_u8(uint8x8_t a, int8x8_t b); // VQSHL.U8 d0,d0,d0
977*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vqshl_u16(uint16x4_t a, int16x4_t b); // VQSHL.U16 d0,d0,d0
978*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vqshl_u32(uint32x2_t a, int32x2_t b); // VQSHL.U32 d0,d0,d0
979*80a68eefSBob Badour _NEON2SSESTORAGE uint64x1_t vqshl_u64(uint64x1_t a, int64x1_t b); // VQSHL.U64 d0,d0,d0
980*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t vqshlq_s8(int8x16_t a, int8x16_t b); // VQSHL.S8 q0,q0,q0
981*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vqshlq_s16(int16x8_t a, int16x8_t b); // VQSHL.S16 q0,q0,q0
982*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vqshlq_s32(int32x4_t a, int32x4_t b); // VQSHL.S32 q0,q0,q0
983*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vqshlq_s64(int64x2_t a, int64x2_t b); // VQSHL.S64 q0,q0,q0
984*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vqshlq_u8(uint8x16_t a, int8x16_t b); // VQSHL.U8 q0,q0,q0
985*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vqshlq_u16(uint16x8_t a, int16x8_t b); // VQSHL.U16 q0,q0,q0
986*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vqshlq_u32(uint32x4_t a, int32x4_t b); // VQSHL.U32 q0,q0,q0
987*80a68eefSBob Badour _NEON2SSESTORAGE uint64x2_t vqshlq_u64(uint64x2_t a, int64x2_t b); // VQSHL.U64 q0,q0,q0
988*80a68eefSBob Badour //Vector rounding shift left (negative values shift right)
989*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vrshl_s8(int8x8_t a, int8x8_t b); // VRSHL.S8 d0,d0,d0
990*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vrshl_s16(int16x4_t a, int16x4_t b); // VRSHL.S16 d0,d0,d0
991*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vrshl_s32(int32x2_t a, int32x2_t b); // VRSHL.S32 d0,d0,d0
992*80a68eefSBob Badour _NEON2SSESTORAGE int64x1_t vrshl_s64(int64x1_t a, int64x1_t b); // VRSHL.S64 d0,d0,d0
993*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vrshl_u8(uint8x8_t a, int8x8_t b); // VRSHL.U8 d0,d0,d0
994*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vrshl_u16(uint16x4_t a, int16x4_t b); // VRSHL.U16 d0,d0,d0
995*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vrshl_u32(uint32x2_t a, int32x2_t b); // VRSHL.U32 d0,d0,d0
996*80a68eefSBob Badour _NEON2SSESTORAGE uint64x1_t vrshl_u64(uint64x1_t a, int64x1_t b); // VRSHL.U64 d0,d0,d0
997*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t vrshlq_s8(int8x16_t a, int8x16_t b); // VRSHL.S8 q0,q0,q0
998*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vrshlq_s16(int16x8_t a, int16x8_t b); // VRSHL.S16 q0,q0,q0
999*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vrshlq_s32(int32x4_t a, int32x4_t b); // VRSHL.S32 q0,q0,q0
1000*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vrshlq_s64(int64x2_t a, int64x2_t b); // VRSHL.S64 q0,q0,q0
1001*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vrshlq_u8(uint8x16_t a, int8x16_t b); // VRSHL.U8 q0,q0,q0
1002*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vrshlq_u16(uint16x8_t a, int16x8_t b); // VRSHL.U16 q0,q0,q0
1003*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vrshlq_u32(uint32x4_t a, int32x4_t b); // VRSHL.U32 q0,q0,q0
1004*80a68eefSBob Badour _NEON2SSESTORAGE uint64x2_t vrshlq_u64(uint64x2_t a, int64x2_t b); // VRSHL.U64 q0,q0,q0
1005*80a68eefSBob Badour //Vector saturating rounding shift left (negative values shift right)
1006*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vqrshl_s8(int8x8_t a, int8x8_t b); // VQRSHL.S8 d0,d0,d0
1007*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vqrshl_s16(int16x4_t a, int16x4_t b); // VQRSHL.S16 d0,d0,d0
1008*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vqrshl_s32(int32x2_t a, int32x2_t b); // VQRSHL.S32 d0,d0,d0
1009*80a68eefSBob Badour _NEON2SSESTORAGE int64x1_t vqrshl_s64(int64x1_t a, int64x1_t b); // VQRSHL.S64 d0,d0,d0
1010*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vqrshl_u8(uint8x8_t a, int8x8_t b); // VQRSHL.U8 d0,d0,d0
1011*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vqrshl_u16(uint16x4_t a, int16x4_t b); // VQRSHL.U16 d0,d0,d0
1012*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vqrshl_u32(uint32x2_t a, int32x2_t b); // VQRSHL.U32 d0,d0,d0
1013*80a68eefSBob Badour _NEON2SSESTORAGE uint64x1_t vqrshl_u64(uint64x1_t a, int64x1_t b); // VQRSHL.U64 d0,d0,d0
1014*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t vqrshlq_s8(int8x16_t a, int8x16_t b); // VQRSHL.S8 q0,q0,q0
1015*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vqrshlq_s16(int16x8_t a, int16x8_t b); // VQRSHL.S16 q0,q0,q0
1016*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vqrshlq_s32(int32x4_t a, int32x4_t b); // VQRSHL.S32 q0,q0,q0
1017*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vqrshlq_s64(int64x2_t a, int64x2_t b); // VQRSHL.S64 q0,q0,q0
1018*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vqrshlq_u8(uint8x16_t a, int8x16_t b); // VQRSHL.U8 q0,q0,q0
1019*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vqrshlq_u16(uint16x8_t a, int16x8_t b); // VQRSHL.U16 q0,q0,q0
1020*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vqrshlq_u32(uint32x4_t a, int32x4_t b); // VQRSHL.U32 q0,q0,q0
1021*80a68eefSBob Badour _NEON2SSESTORAGE uint64x2_t vqrshlq_u64(uint64x2_t a, int64x2_t b); // VQRSHL.U64 q0,q0,q0
1022*80a68eefSBob Badour //Shifts by a constant
1023*80a68eefSBob Badour //Vector shift right by constant
1024*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vshr_n_s8(int8x8_t a, __constrange(1,8) int b); // VSHR.S8 d0,d0,#8
1025*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vshr_n_s16(int16x4_t a, __constrange(1,16) int b); // VSHR.S16 d0,d0,#16
1026*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vshr_n_s32(int32x2_t a, __constrange(1,32) int b); // VSHR.S32 d0,d0,#32
1027*80a68eefSBob Badour _NEON2SSESTORAGE int64x1_t vshr_n_s64(int64x1_t a, __constrange(1,64) int b); // VSHR.S64 d0,d0,#64
1028*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vshr_n_u8(uint8x8_t a, __constrange(1,8) int b); // VSHR.U8 d0,d0,#8
1029*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vshr_n_u16(uint16x4_t a, __constrange(1,16) int b); // VSHR.U16 d0,d0,#16
1030*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vshr_n_u32(uint32x2_t a, __constrange(1,32) int b); // VSHR.U32 d0,d0,#32
1031*80a68eefSBob Badour _NEON2SSESTORAGE uint64x1_t vshr_n_u64(uint64x1_t a, __constrange(1,64) int b); // VSHR.U64 d0,d0,#64
1032*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t vshrq_n_s8(int8x16_t a, __constrange(1,8) int b); // VSHR.S8 q0,q0,#8
1033*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vshrq_n_s16(int16x8_t a, __constrange(1,16) int b); // VSHR.S16 q0,q0,#16
1034*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vshrq_n_s32(int32x4_t a, __constrange(1,32) int b); // VSHR.S32 q0,q0,#32
1035*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vshrq_n_s64(int64x2_t a, __constrange(1,64) int b); // VSHR.S64 q0,q0,#64
1036*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vshrq_n_u8(uint8x16_t a, __constrange(1,8) int b); // VSHR.U8 q0,q0,#8
1037*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vshrq_n_u16(uint16x8_t a, __constrange(1,16) int b); // VSHR.U16 q0,q0,#16
1038*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vshrq_n_u32(uint32x4_t a, __constrange(1,32) int b); // VSHR.U32 q0,q0,#32
1039*80a68eefSBob Badour _NEON2SSESTORAGE uint64x2_t vshrq_n_u64(uint64x2_t a, __constrange(1,64) int b); // VSHR.U64 q0,q0,#64
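//Illustrative sketch (not part of the original header, excluded from the build): the shift count must be a
//compile-time constant in the stated __constrange, and the signed forms shift arithmetically (sign bits are
//replicated); vst1q_s16 is assumed to be declared elsewhere in this header.
#if 0
static void neon2sse_example_vshr_n(void) //hypothetical name, written as user code that includes this header
{
    static const int16_t va[8] = { -32768, 32767, -4, 4, 0, 0, 0, 0 };
    int16_t vr[8];
    vst1q_s16(vr, vshrq_n_s16(vld1q_s16(va), 8));
    //vr[0] == -128, vr[1] == 127, vr[2] == -1, vr[3] == 0
}
#endif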
1040*80a68eefSBob Badour //Vector shift left by constant
1041*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vshl_n_s8(int8x8_t a, __constrange(0,7) int b); // VSHL.I8 d0,d0,#0
1042*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vshl_n_s16(int16x4_t a, __constrange(0,15) int b); // VSHL.I16 d0,d0,#0
1043*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vshl_n_s32(int32x2_t a, __constrange(0,31) int b); // VSHL.I32 d0,d0,#0
1044*80a68eefSBob Badour _NEON2SSESTORAGE int64x1_t vshl_n_s64(int64x1_t a, __constrange(0,63) int b); // VSHL.I64 d0,d0,#0
1045*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vshl_n_u8(uint8x8_t a, __constrange(0,7) int b); // VSHL.I8 d0,d0,#0
1046*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vshl_n_u16(uint16x4_t a, __constrange(0,15) int b); // VSHL.I16 d0,d0,#0
1047*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vshl_n_u32(uint32x2_t a, __constrange(0,31) int b); // VSHL.I32 d0,d0,#0
1048*80a68eefSBob Badour _NEON2SSESTORAGE uint64x1_t vshl_n_u64(uint64x1_t a, __constrange(0,63) int b); // VSHL.I64 d0,d0,#0
1049*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t vshlq_n_s8(int8x16_t a, __constrange(0,7) int b); // VSHL.I8 q0,q0,#0
1050*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vshlq_n_s16(int16x8_t a, __constrange(0,15) int b); // VSHL.I16 q0,q0,#0
1051*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vshlq_n_s32(int32x4_t a, __constrange(0,31) int b); // VSHL.I32 q0,q0,#0
1052*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vshlq_n_s64(int64x2_t a, __constrange(0,63) int b); // VSHL.I64 q0,q0,#0
1053*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vshlq_n_u8(uint8x16_t a, __constrange(0,7) int b); // VSHL.I8 q0,q0,#0
1054*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vshlq_n_u16(uint16x8_t a, __constrange(0,15) int b); // VSHL.I16 q0,q0,#0
1055*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vshlq_n_u32(uint32x4_t a, __constrange(0,31) int b); // VSHL.I32 q0,q0,#0
1056*80a68eefSBob Badour _NEON2SSESTORAGE uint64x2_t vshlq_n_u64(uint64x2_t a, __constrange(0,63) int b); // VSHL.I64 q0,q0,#0
1057*80a68eefSBob Badour //Vector rounding shift right by constant
1058*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vrshr_n_s8(int8x8_t a, __constrange(1,8) int b); // VRSHR.S8 d0,d0,#8
1059*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vrshr_n_s16(int16x4_t a, __constrange(1,16) int b); // VRSHR.S16 d0,d0,#16
1060*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vrshr_n_s32(int32x2_t a, __constrange(1,32) int b); // VRSHR.S32 d0,d0,#32
1061*80a68eefSBob Badour _NEON2SSESTORAGE int64x1_t vrshr_n_s64(int64x1_t a, __constrange(1,64) int b); // VRSHR.S64 d0,d0,#64
1062*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vrshr_n_u8(uint8x8_t a, __constrange(1,8) int b); // VRSHR.U8 d0,d0,#8
1063*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vrshr_n_u16(uint16x4_t a, __constrange(1,16) int b); // VRSHR.U16 d0,d0,#16
1064*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vrshr_n_u32(uint32x2_t a, __constrange(1,32) int b); // VRSHR.U32 d0,d0,#32
1065*80a68eefSBob Badour _NEON2SSESTORAGE uint64x1_t vrshr_n_u64(uint64x1_t a, __constrange(1,64) int b); // VRSHR.U64 d0,d0,#64
1066*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t vrshrq_n_s8(int8x16_t a, __constrange(1,8) int b); // VRSHR.S8 q0,q0,#8
1067*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vrshrq_n_s16(int16x8_t a, __constrange(1,16) int b); // VRSHR.S16 q0,q0,#16
1068*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vrshrq_n_s32(int32x4_t a, __constrange(1,32) int b); // VRSHR.S32 q0,q0,#32
1069*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vrshrq_n_s64(int64x2_t a, __constrange(1,64) int b); // VRSHR.S64 q0,q0,#64
1070*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vrshrq_n_u8(uint8x16_t a, __constrange(1,8) int b); // VRSHR.U8 q0,q0,#8
1071*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vrshrq_n_u16(uint16x8_t a, __constrange(1,16) int b); // VRSHR.U16 q0,q0,#16
1072*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vrshrq_n_u32(uint32x4_t a, __constrange(1,32) int b); // VRSHR.U32 q0,q0,#32
1073*80a68eefSBob Badour _NEON2SSESTORAGE uint64x2_t vrshrq_n_u64(uint64x2_t a, __constrange(1,64) int b); // VRSHR.U64 q0,q0,#64
1074*80a68eefSBob Badour //Vector shift right by constant and accumulate
1075*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VSRA.S8 d0,d0,#8
1076*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VSRA.S16 d0,d0,#16
1077*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VSRA.S32 d0,d0,#32
1078*80a68eefSBob Badour _NEON2SSESTORAGE int64x1_t vsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VSRA.S64 d0,d0,#64
1079*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VSRA.U8 d0,d0,#8
1080*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VSRA.U16 d0,d0,#16
1081*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VSRA.U32 d0,d0,#32
1082*80a68eefSBob Badour _NEON2SSESTORAGE uint64x1_t vsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VSRA.U64 d0,d0,#64
1083*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t vsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VSRA.S8 q0,q0,#8
1084*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VSRA.S16 q0,q0,#16
1085*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VSRA.S32 q0,q0,#32
1086*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VSRA.S64 q0,q0,#64
1087*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VSRA.U8 q0,q0,#8
1088*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VSRA.U16 q0,q0,#16
1089*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VSRA.U32 q0,q0,#32
1090*80a68eefSBob Badour _NEON2SSESTORAGE uint64x2_t vsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VSRA.U64 q0,q0,#64
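//Illustrative sketch (not part of the original header, excluded from the build): vsra shifts the second
//operand right by the constant and adds it to the first, a common idiom for accumulating scaled-down terms;
//vst1q_u32 is assumed to be declared elsewhere in this header.
#if 0
static void neon2sse_example_vsra(void) //hypothetical name, written as user code that includes this header
{
    static const uint32_t acc[4] = { 1000, 1000, 1000, 1000 };
    static const uint32_t vb[4]  = {  256,  512, 1024, 2048 };
    uint32_t vr[4];
    vst1q_u32(vr, vsraq_n_u32(vld1q_u32(acc), vld1q_u32(vb), 8));
    //vr == { 1001, 1002, 1004, 1008 }: acc[i] + (vb[i] >> 8)
}
#endif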
1091*80a68eefSBob Badour //Vector rounding shift right by constant and accumulate
1092*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vrsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VRSRA.S8 d0,d0,#8
1093*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vrsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VRSRA.S16 d0,d0,#16
1094*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vrsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VRSRA.S32 d0,d0,#32
1095*80a68eefSBob Badour _NEON2SSESTORAGE int64x1_t vrsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VRSRA.S64 d0,d0,#64
1096*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vrsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VRSRA.U8 d0,d0,#8
1097*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vrsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VRSRA.U16 d0,d0,#16
1098*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vrsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VRSRA.U32 d0,d0,#32
1099*80a68eefSBob Badour _NEON2SSESTORAGE uint64x1_t vrsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VRSRA.U64 d0,d0,#64
1100*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t vrsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VRSRA.S8 q0,q0,#8
1101*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vrsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VRSRA.S16 q0,q0,#16
1102*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vrsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VRSRA.S32 q0,q0,#32
1103*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vrsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VRSRA.S64 q0,q0,#64
1104*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vrsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VRSRA.U8 q0,q0,#8
1105*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vrsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VRSRA.U16 q0,q0,#16
1106*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vrsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VRSRA.U32 q0,q0,#32
1107*80a68eefSBob Badour _NEON2SSESTORAGE uint64x2_t vrsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VRSRA.U64 q0,q0,#64
1108*80a68eefSBob Badour //Vector saturating shift left by constant
1109*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vqshl_n_s8(int8x8_t a, __constrange(0,7) int b); // VQSHL.S8 d0,d0,#0
1110*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vqshl_n_s16(int16x4_t a, __constrange(0,15) int b); // VQSHL.S16 d0,d0,#0
1111*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vqshl_n_s32(int32x2_t a, __constrange(0,31) int b); // VQSHL.S32 d0,d0,#0
1112*80a68eefSBob Badour _NEON2SSESTORAGE int64x1_t vqshl_n_s64(int64x1_t a, __constrange(0,63) int b); // VQSHL.S64 d0,d0,#0
1113*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vqshl_n_u8(uint8x8_t a, __constrange(0,7) int b); // VQSHL.U8 d0,d0,#0
1114*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vqshl_n_u16(uint16x4_t a, __constrange(0,15) int b); // VQSHL.U16 d0,d0,#0
1115*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vqshl_n_u32(uint32x2_t a, __constrange(0,31) int b); // VQSHL.U32 d0,d0,#0
1116*80a68eefSBob Badour _NEON2SSESTORAGE uint64x1_t vqshl_n_u64(uint64x1_t a, __constrange(0,63) int b); // VQSHL.U64 d0,d0,#0
1117*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t vqshlq_n_s8(int8x16_t a, __constrange(0,7) int b); // VQSHL.S8 q0,q0,#0
1118*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vqshlq_n_s16(int16x8_t a, __constrange(0,15) int b); // VQSHL.S16 q0,q0,#0
1119*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vqshlq_n_s32(int32x4_t a, __constrange(0,31) int b); // VQSHL.S32 q0,q0,#0
1120*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vqshlq_n_s64(int64x2_t a, __constrange(0,63) int b); // VQSHL.S64 q0,q0,#0
1121*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vqshlq_n_u8(uint8x16_t a, __constrange(0,7) int b); // VQSHL.U8 q0,q0,#0
1122*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vqshlq_n_u16(uint16x8_t a, __constrange(0,15) int b); // VQSHL.U16 q0,q0,#0
1123*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vqshlq_n_u32(uint32x4_t a, __constrange(0,31) int b); // VQSHL.U32 q0,q0,#0
1124*80a68eefSBob Badour _NEON2SSESTORAGE uint64x2_t vqshlq_n_u64(uint64x2_t a, __constrange(0,63) int b); // VQSHL.U64 q0,q0,#0
1125*80a68eefSBob Badour //Vector signed->unsigned saturating shift left by constant
1126*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vqshlu_n_s8(int8x8_t a, __constrange(0,7) int b); // VQSHLU.S8 d0,d0,#0
1127*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vqshlu_n_s16(int16x4_t a, __constrange(0,15) int b); // VQSHLU.S16 d0,d0,#0
1128*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vqshlu_n_s32(int32x2_t a, __constrange(0,31) int b); // VQSHLU.S32 d0,d0,#0
1129*80a68eefSBob Badour _NEON2SSESTORAGE uint64x1_t vqshlu_n_s64(int64x1_t a, __constrange(0,63) int b); // VQSHLU.S64 d0,d0,#0
1130*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vqshluq_n_s8(int8x16_t a, __constrange(0,7) int b); // VQSHLU.S8 q0,q0,#0
1131*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vqshluq_n_s16(int16x8_t a, __constrange(0,15) int b); // VQSHLU.S16 q0,q0,#0
1132*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vqshluq_n_s32(int32x4_t a, __constrange(0,31) int b); // VQSHLU.S32 q0,q0,#0
1133*80a68eefSBob Badour _NEON2SSESTORAGE uint64x2_t vqshluq_n_s64(int64x2_t a, __constrange(0,63) int b); // VQSHLU.S64 q0,q0,#0
1134*80a68eefSBob Badour //Vector narrowing shift right by constant
1135*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VSHRN.I16 d0,q0,#8
1136*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VSHRN.I32 d0,q0,#16
1137*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VSHRN.I64 d0,q0,#32
1138*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VSHRN.I16 d0,q0,#8
1139*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VSHRN.I32 d0,q0,#16
1140*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VSHRN.I64 d0,q0,#32
1141*80a68eefSBob Badour //Vector signed->unsigned narrowing saturating shift right by constant
1142*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vqshrun_n_s16(int16x8_t a, __constrange(1,8) int b); // VQSHRUN.S16 d0,q0,#8
1143*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vqshrun_n_s32(int32x4_t a, __constrange(1,16) int b); // VQSHRUN.S32 d0,q0,#16
1144*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vqshrun_n_s64(int64x2_t a, __constrange(1,32) int b); // VQSHRUN.S64 d0,q0,#32
1145*80a68eefSBob Badour //Vector signed->unsigned rounding narrowing saturating shift right by constant
1146*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vqrshrun_n_s16(int16x8_t a, __constrange(1,8) int b); // VQRSHRUN.S16 d0,q0,#8
1147*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vqrshrun_n_s32(int32x4_t a, __constrange(1,16) int b); // VQRSHRUN.S32 d0,q0,#16
1148*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vqrshrun_n_s64(int64x2_t a, __constrange(1,32) int b); // VQRSHRUN.S64 d0,q0,#32
1149*80a68eefSBob Badour //Vector narrowing saturating shift right by constant
1150*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vqshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VQSHRN.S16 d0,q0,#8
1151*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vqshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VQSHRN.S32 d0,q0,#16
1152*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vqshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VQSHRN.S64 d0,q0,#32
1153*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vqshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VQSHRN.U16 d0,q0,#8
1154*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vqshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VQSHRN.U32 d0,q0,#16
1155*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vqshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VQSHRN.U64 d0,q0,#32
1156*80a68eefSBob Badour //Vector rounding narrowing shift right by constant
1157*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vrshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VRSHRN.I16 d0,q0,#8
1158*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vrshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VRSHRN.I32 d0,q0,#16
1159*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vrshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VRSHRN.I64 d0,q0,#32
1160*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VRSHRN.I16 d0,q0,#8
1161*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VRSHRN.I32 d0,q0,#16
1162*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VRSHRN.I64 d0,q0,#32
1163*80a68eefSBob Badour //Vector rounding narrowing saturating shift right by constant
1164*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vqrshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VQRSHRN.S16 d0,q0,#8
1165*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vqrshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VQRSHRN.S32 d0,q0,#16
1166*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vqrshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VQRSHRN.S64 d0,q0,#32
1167*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vqrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VQRSHRN.U16 d0,q0,#8
1168*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vqrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VQRSHRN.U32 d0,q0,#16
1169*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vqrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VQRSHRN.U64 d0,q0,#32
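//Illustrative sketch (not part of the original header, excluded from the build): the narrowing shifts above
//are the usual way to bring widened fixed-point intermediates back to a narrow type; the saturating unsigned
//forms also clamp out-of-range lanes, as in this Q7 -> 8-bit pixel conversion. vst1_u8 is assumed to be
//declared elsewhere in this header.
#if 0
static void neon2sse_example_vqrshrun(void) //hypothetical name, written as user code that includes this header
{
    static const int16_t q7[8] = { -128, 0, 64, 12800, 16384, 32767, 100, 200 };
    uint8_t px[8];
    vst1_u8(px, vqrshrun_n_s16(vld1q_s16(q7), 7)); //round, shift right by 7, saturate to [0, 255]
    //px == { 0, 0, 1, 100, 128, 255, 1, 2 }
}
#endif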
1170*80a68eefSBob Badour //Vector widening shift left by constant
1171*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vshll_n_s8(int8x8_t a, __constrange(0,8) int b); // VSHLL.S8 q0,d0,#0
1172*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vshll_n_s16(int16x4_t a, __constrange(0,16) int b); // VSHLL.S16 q0,d0,#0
1173*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vshll_n_s32(int32x2_t a, __constrange(0,32) int b); // VSHLL.S32 q0,d0,#0
1174*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vshll_n_u8(uint8x8_t a, __constrange(0,8) int b); // VSHLL.U8 q0,d0,#0
1175*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vshll_n_u16(uint16x4_t a, __constrange(0,16) int b); // VSHLL.U16 q0,d0,#0
1176*80a68eefSBob Badour _NEON2SSESTORAGE uint64x2_t vshll_n_u32(uint32x2_t a, __constrange(0,32) int b); // VSHLL.U32 q0,d0,#0
1177*80a68eefSBob Badour //Shifts with insert
1178*80a68eefSBob Badour //Vector shift right and insert
1179*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vsri_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
1180*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vsri_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
1181*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vsri_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VSRI.32 d0,d0,#32
1182*80a68eefSBob Badour _NEON2SSESTORAGE int64x1_t vsri_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VSRI.64 d0,d0,#64
1183*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vsri_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
1184*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vsri_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
1185*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vsri_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VSRI.32 d0,d0,#32
1186*80a68eefSBob Badour _NEON2SSESTORAGE uint64x1_t vsri_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VSRI.64 d0,d0,#64
1187*80a68eefSBob Badour _NEON2SSESTORAGE poly8x8_t vsri_n_p8(poly8x8_t a, poly8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
1188*80a68eefSBob Badour _NEON2SSESTORAGE poly16x4_t vsri_n_p16(poly16x4_t a, poly16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
1189*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t vsriq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
1190*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vsriq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
1191*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vsriq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VSRI.32 q0,q0,#32
1192*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vsriq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VSRI.64 q0,q0,#64
1193*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vsriq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
1194*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vsriq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
1195*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vsriq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VSRI.32 q0,q0,#32
1196*80a68eefSBob Badour _NEON2SSESTORAGE uint64x2_t vsriq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VSRI.64 q0,q0,#64
1197*80a68eefSBob Badour _NEON2SSESTORAGE poly8x16_t vsriq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
1198*80a68eefSBob Badour _NEON2SSESTORAGE poly16x8_t vsriq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
1199*80a68eefSBob Badour //Vector shift left and insert
1200*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vsli_n_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
1201*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vsli_n_s16(int16x4_t a, int16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
1202*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vsli_n_s32(int32x2_t a, int32x2_t b, __constrange(0,31) int c); // VSLI.32 d0,d0,#0
1203*80a68eefSBob Badour _NEON2SSESTORAGE int64x1_t vsli_n_s64(int64x1_t a, int64x1_t b, __constrange(0,63) int c); // VSLI.64 d0,d0,#0
1204*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vsli_n_u8(uint8x8_t a, uint8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
1205*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vsli_n_u16(uint16x4_t a, uint16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
1206*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vsli_n_u32(uint32x2_t a, uint32x2_t b, __constrange(0,31) int c); // VSLI.32 d0,d0,#0
1207*80a68eefSBob Badour _NEON2SSESTORAGE uint64x1_t vsli_n_u64(uint64x1_t a, uint64x1_t b, __constrange(0,63) int c); // VSLI.64 d0,d0,#0
1208*80a68eefSBob Badour _NEON2SSESTORAGE poly8x8_t vsli_n_p8(poly8x8_t a, poly8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
1209*80a68eefSBob Badour _NEON2SSESTORAGE poly16x4_t vsli_n_p16(poly16x4_t a, poly16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
1210*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t vsliq_n_s8(int8x16_t a, int8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
1211*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vsliq_n_s16(int16x8_t a, int16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
1212*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vsliq_n_s32(int32x4_t a, int32x4_t b, __constrange(0,31) int c); // VSLI.32 q0,q0,#0
1213*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vsliq_n_s64(int64x2_t a, int64x2_t b, __constrange(0,63) int c); // VSLI.64 q0,q0,#0
1214*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vsliq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
1215*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vsliq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
1216*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vsliq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(0,31) int c); // VSLI.32 q0,q0,#0
1217*80a68eefSBob Badour _NEON2SSESTORAGE uint64x2_t vsliq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(0,63) int c); // VSLI.64 q0,q0,#0
1218*80a68eefSBob Badour _NEON2SSESTORAGE poly8x16_t vsliq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
1219*80a68eefSBob Badour _NEON2SSESTORAGE poly16x8_t vsliq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
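//Usage sketch (illustrative only): vsri_n keeps the top 'c' bits of each element of 'a' and fills the
//rest with 'b' shifted right by 'c'; vsli_n keeps the low 'c' bits of 'a' and fills the rest with 'b'
//shifted left by 'c'. A classic use is packing bit-fields, e.g. building RGB565 pixels; the variable
//names below ('b5', 'g6', 'r5', 'pix') are hypothetical:
//    uint16x8_t pix = b5;                   // 5-bit blue already in bits 0..4
//    pix = vsliq_n_u16(pix, g6, 5);         // bits 5..15 <- g6 << 5, bits 0..4 of pix kept
//    pix = vsliq_n_u16(pix, r5, 11);        // bits 11..15 <- r5 << 11, bits 0..10 of pix kept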
1220*80a68eefSBob Badour //Loads and stores of a single vector or lane. The intrinsics below load a single vector, or a single lane, of some type; the matching stores follow further down.
1221*80a68eefSBob Badour //Load a single vector from memory
1222*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vld1q_u8(__transfersize(16) uint8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
1223*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vld1q_u16(__transfersize(8) uint16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
1224*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vld1q_u32(__transfersize(4) uint32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
1225*80a68eefSBob Badour _NEON2SSESTORAGE uint64x2_t vld1q_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
1226*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t vld1q_s8(__transfersize(16) int8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
1227*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vld1q_s16(__transfersize(8) int16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
1228*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vld1q_s32(__transfersize(4) int32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
1229*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vld1q_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
1230*80a68eefSBob Badour _NEON2SSESTORAGE float16x8_t vld1q_f16(__transfersize(8) __fp16 const * ptr); // VLD1.16 {d0, d1}, [r0]
1231*80a68eefSBob Badour _NEON2SSESTORAGE float32x4_t vld1q_f32(__transfersize(4) float32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
1232*80a68eefSBob Badour _NEON2SSESTORAGE poly8x16_t vld1q_p8(__transfersize(16) poly8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
1233*80a68eefSBob Badour _NEON2SSESTORAGE poly16x8_t vld1q_p16(__transfersize(8) poly16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
1234*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vld1_u8(__transfersize(8) uint8_t const * ptr); // VLD1.8 {d0}, [r0]
1235*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vld1_u16(__transfersize(4) uint16_t const * ptr); // VLD1.16 {d0}, [r0]
1236*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vld1_u32(__transfersize(2) uint32_t const * ptr); // VLD1.32 {d0}, [r0]
1237*80a68eefSBob Badour _NEON2SSESTORAGE uint64x1_t vld1_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
1238*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vld1_s8(__transfersize(8) int8_t const * ptr); // VLD1.8 {d0}, [r0]
1239*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vld1_s16(__transfersize(4) int16_t const * ptr); // VLD1.16 {d0}, [r0]
1240*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vld1_s32(__transfersize(2) int32_t const * ptr); // VLD1.32 {d0}, [r0]
1241*80a68eefSBob Badour _NEON2SSESTORAGE int64x1_t vld1_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
1242*80a68eefSBob Badour _NEON2SSESTORAGE float16x4_t vld1_f16(__transfersize(4) __fp16 const * ptr); // VLD1.16 {d0}, [r0]
1243*80a68eefSBob Badour _NEON2SSESTORAGE float32x2_t vld1_f32(__transfersize(2) float32_t const * ptr); // VLD1.32 {d0}, [r0]
1244*80a68eefSBob Badour _NEON2SSESTORAGE poly8x8_t vld1_p8(__transfersize(8) poly8_t const * ptr); // VLD1.8 {d0}, [r0]
1245*80a68eefSBob Badour _NEON2SSESTORAGE poly16x4_t vld1_p16(__transfersize(4) poly16_t const * ptr); // VLD1.16 {d0}, [r0]
1246*80a68eefSBob Badour 
1247*80a68eefSBob Badour _NEON2SSESTORAGE float64x2_t vld1q_f64(__transfersize(2) float64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
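//Usage sketch (illustrative only): vld1q_*/vld1_* load one full 128-bit (q) or 64-bit (d) vector from
//memory, like VLD1 on ARM. 'buf' below is a hypothetical caller-provided array:
//    uint8_t buf[16] = {0};
//    uint8x16_t q = vld1q_u8(buf);    // 16 bytes -> one 128-bit vector
//    uint8x8_t  d = vld1_u8(buf);     // first 8 bytes -> one 64-bit vector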
1248*80a68eefSBob Badour 
1249*80a68eefSBob Badour //Load a single lane from memory
1250*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vld1q_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x16_t vec, __constrange(0,15) int lane); //VLD1.8 {d0[0]}, [r0]
1251*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vld1q_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
1252*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vld1q_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0]
1253*80a68eefSBob Badour _NEON2SSESTORAGE uint64x2_t vld1q_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x2_t vec, __constrange(0,1) int lane); // VLD1.64 {d0}, [r0]
1254*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t vld1q_lane_s8(__transfersize(1) int8_t const * ptr, int8x16_t vec, __constrange(0,15) int lane); //VLD1.8 {d0[0]}, [r0]
1255*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vld1q_lane_s16(__transfersize(1) int16_t const * ptr, int16x8_t vec, __constrange(0,7) int lane); //VLD1.16 {d0[0]}, [r0]
1256*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vld1q_lane_s32(__transfersize(1) int32_t const * ptr, int32x4_t vec, __constrange(0,3) int lane); //VLD1.32 {d0[0]}, [r0]
1257*80a68eefSBob Badour _NEON2SSESTORAGE float16x8_t vld1q_lane_f16(__transfersize(1) __fp16 const * ptr, float16x8_t vec, __constrange(0,7) int lane); //VLD1.16 {d0[0]}, [r0]
1258*80a68eefSBob Badour _NEON2SSESTORAGE float32x4_t vld1q_lane_f32(__transfersize(1) float32_t const * ptr, float32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0]
1259*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vld1q_lane_s64(__transfersize(1) int64_t const * ptr, int64x2_t vec, __constrange(0,1) int lane); //VLD1.64 {d0}, [r0]
1260*80a68eefSBob Badour _NEON2SSESTORAGE poly8x16_t vld1q_lane_p8(__transfersize(1) poly8_t const * ptr, poly8x16_t vec, __constrange(0,15) int lane); //VLD1.8 {d0[0]}, [r0]
1261*80a68eefSBob Badour _NEON2SSESTORAGE poly16x8_t vld1q_lane_p16(__transfersize(1) poly16_t const * ptr, poly16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
1262*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vld1_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x8_t vec, __constrange(0,7) int lane); //VLD1.8 {d0[0]}, [r0]
1263*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vld1_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x4_t vec, __constrange(0,3) int lane); //VLD1.16 {d0[0]}, [r0]
1264*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vld1_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x2_t vec, __constrange(0,1) int lane); //VLD1.32 {d0[0]}, [r0]
1265*80a68eefSBob Badour _NEON2SSESTORAGE uint64x1_t vld1_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x1_t vec, __constrange(0,0) int lane); //VLD1.64 {d0}, [r0]
1266*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vld1_lane_s8(__transfersize(1) int8_t const * ptr, int8x8_t vec, __constrange(0,7) int lane); // VLD1.8{d0[0]}, [r0]
1267*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vld1_lane_s16(__transfersize(1) int16_t const * ptr, int16x4_t vec, __constrange(0,3) int lane); //VLD1.16 {d0[0]}, [r0]
1268*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vld1_lane_s32(__transfersize(1) int32_t const * ptr, int32x2_t vec, __constrange(0,1) int lane); //VLD1.32 {d0[0]}, [r0]
1269*80a68eefSBob Badour _NEON2SSESTORAGE float16x4_t vld1_lane_f16(__transfersize(1) __fp16 const * ptr, float16x4_t vec, __constrange(0,3) int lane); //VLD1.16 {d0[0]}, [r0]
1270*80a68eefSBob Badour _NEON2SSESTORAGE float32x2_t vld1_lane_f32(__transfersize(1) float32_t const * ptr, float32x2_t vec, __constrange(0,1) int lane); // VLD1.32 {d0[0]}, [r0]
1271*80a68eefSBob Badour _NEON2SSESTORAGE int64x1_t vld1_lane_s64(__transfersize(1) int64_t const * ptr, int64x1_t vec, __constrange(0,0) int lane); //VLD1.64 {d0}, [r0]
1272*80a68eefSBob Badour _NEON2SSESTORAGE poly8x8_t vld1_lane_p8(__transfersize(1) poly8_t const * ptr, poly8x8_t vec, __constrange(0,7) int lane); //VLD1.8 {d0[0]}, [r0]
1273*80a68eefSBob Badour _NEON2SSESTORAGE poly16x4_t vld1_lane_p16(__transfersize(1) poly16_t const * ptr, poly16x4_t vec, __constrange(0,3) int lane); //VLD1.16 {d0[0]}, [r0]
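//Usage sketch (illustrative only): the *_lane loads read one element from memory into the given lane of
//an existing vector and return the updated vector; all other lanes are passed through unchanged. 'v' and
//'x' below are hypothetical:
//    uint32_t   x = 42;
//    uint32x4_t v = vdupq_n_u32(0);
//    v = vld1q_lane_u32(&x, v, 2);    // lane 2 becomes 42, lanes 0, 1 and 3 stay 0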
1274*80a68eefSBob Badour //Load all lanes of vector with same value from memory
1275*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vld1q_dup_u8(__transfersize(1) uint8_t const * ptr); // VLD1.8 {d0[]}, [r0]
1276*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vld1q_dup_u16(__transfersize(1) uint16_t const * ptr); // VLD1.16 {d0[]}, [r0]
1277*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vld1q_dup_u32(__transfersize(1) uint32_t const * ptr); // VLD1.32 {d0[]}, [r0]
1278*80a68eefSBob Badour _NEON2SSESTORAGE uint64x2_t vld1q_dup_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
1279*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t vld1q_dup_s8(__transfersize(1) int8_t const * ptr); // VLD1.8 {d0[]}, [r0]
1280*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vld1q_dup_s16(__transfersize(1) int16_t const * ptr); // VLD1.16 {d0[]}, [r0]
1281*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vld1q_dup_s32(__transfersize(1) int32_t const * ptr); // VLD1.32 {d0[]}, [r0]
1282*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vld1q_dup_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
1283*80a68eefSBob Badour _NEON2SSESTORAGE float16x8_t vld1q_dup_f16(__transfersize(1) __fp16 const * ptr); // VLD1.16 {d0[]}, [r0]
1284*80a68eefSBob Badour _NEON2SSESTORAGE float32x4_t vld1q_dup_f32(__transfersize(1) float32_t const * ptr); // VLD1.32 {d0[]}, [r0]
1285*80a68eefSBob Badour _NEON2SSESTORAGE poly8x16_t vld1q_dup_p8(__transfersize(1) poly8_t const * ptr); // VLD1.8 {d0[]}, [r0]
1286*80a68eefSBob Badour _NEON2SSESTORAGE poly16x8_t vld1q_dup_p16(__transfersize(1) poly16_t const * ptr); // VLD1.16 {d0[]}, [r0]
1287*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vld1_dup_u8(__transfersize(1) uint8_t const * ptr); // VLD1.8 {d0[]}, [r0]
1288*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vld1_dup_u16(__transfersize(1) uint16_t const * ptr); // VLD1.16 {d0[]}, [r0]
1289*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vld1_dup_u32(__transfersize(1) uint32_t const * ptr); // VLD1.32 {d0[]}, [r0]
1290*80a68eefSBob Badour _NEON2SSESTORAGE uint64x1_t vld1_dup_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
1291*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vld1_dup_s8(__transfersize(1) int8_t const * ptr); // VLD1.8 {d0[]}, [r0]
1292*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vld1_dup_s16(__transfersize(1) int16_t const * ptr); // VLD1.16 {d0[]}, [r0]
1293*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vld1_dup_s32(__transfersize(1) int32_t const * ptr); // VLD1.32 {d0[]}, [r0]
1294*80a68eefSBob Badour _NEON2SSESTORAGE int64x1_t vld1_dup_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
1295*80a68eefSBob Badour _NEON2SSESTORAGE float16x4_t vld1_dup_f16(__transfersize(1) __fp16 const * ptr); // VLD1.16 {d0[]}, [r0]
1296*80a68eefSBob Badour _NEON2SSESTORAGE float32x2_t vld1_dup_f32(__transfersize(1) float32_t const * ptr); // VLD1.32 {d0[]}, [r0]
1297*80a68eefSBob Badour _NEON2SSESTORAGE poly8x8_t vld1_dup_p8(__transfersize(1) poly8_t const * ptr); // VLD1.8 {d0[]}, [r0]
1298*80a68eefSBob Badour _NEON2SSESTORAGE poly16x4_t vld1_dup_p16(__transfersize(1) poly16_t const * ptr); // VLD1.16 {d0[]}, [r0]
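//Usage sketch (illustrative only): the *_dup loads read a single element and broadcast it to every lane
//of the result, the NEON counterpart of an SSE set1/broadcast. 'k' below is hypothetical:
//    float32_t k = 1.5f;
//    float32x4_t vk = vld1q_dup_f32(&k);   // all four lanes hold 1.5f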
1299*80a68eefSBob Badour //Store a single vector or lane. Stores all lanes or a single lane of a vector.
1300*80a68eefSBob Badour //Store a single vector into memory
1301*80a68eefSBob Badour _NEON2SSESTORAGE void vst1q_u8(__transfersize(16) uint8_t * ptr, uint8x16_t val); // VST1.8 {d0, d1}, [r0]
1302*80a68eefSBob Badour _NEON2SSESTORAGE void vst1q_u16(__transfersize(8) uint16_t * ptr, uint16x8_t val); // VST1.16 {d0, d1}, [r0]
1303*80a68eefSBob Badour _NEON2SSESTORAGE void vst1q_u32(__transfersize(4) uint32_t * ptr, uint32x4_t val); // VST1.32 {d0, d1}, [r0]
1304*80a68eefSBob Badour _NEON2SSESTORAGE void vst1q_u64(__transfersize(2) uint64_t * ptr, uint64x2_t val); // VST1.64 {d0, d1}, [r0]
1305*80a68eefSBob Badour _NEON2SSESTORAGE void vst1q_s8(__transfersize(16) int8_t * ptr, int8x16_t val); // VST1.8 {d0, d1}, [r0]
1306*80a68eefSBob Badour _NEON2SSESTORAGE void vst1q_s16(__transfersize(8) int16_t * ptr, int16x8_t val); // VST1.16 {d0, d1}, [r0]
1307*80a68eefSBob Badour _NEON2SSESTORAGE void vst1q_s32(__transfersize(4) int32_t * ptr, int32x4_t val); // VST1.32 {d0, d1}, [r0]
1308*80a68eefSBob Badour _NEON2SSESTORAGE void vst1q_s64(__transfersize(2) int64_t * ptr, int64x2_t val); // VST1.64 {d0, d1}, [r0]
1309*80a68eefSBob Badour _NEON2SSESTORAGE void vst1q_f16(__transfersize(8) __fp16 * ptr, float16x8_t val); // VST1.16 {d0, d1}, [r0]
1310*80a68eefSBob Badour _NEON2SSESTORAGE void vst1q_f32(__transfersize(4) float32_t * ptr, float32x4_t val); // VST1.32 {d0, d1}, [r0]
1311*80a68eefSBob Badour _NEON2SSESTORAGE void vst1q_p8(__transfersize(16) poly8_t * ptr, poly8x16_t val); // VST1.8 {d0, d1}, [r0]
1312*80a68eefSBob Badour _NEON2SSESTORAGE void vst1q_p16(__transfersize(8) poly16_t * ptr, poly16x8_t val); // VST1.16 {d0, d1}, [r0]
1313*80a68eefSBob Badour _NEON2SSESTORAGE void vst1_u8(__transfersize(8) uint8_t * ptr, uint8x8_t val); // VST1.8 {d0}, [r0]
1314*80a68eefSBob Badour _NEON2SSESTORAGE void vst1_u16(__transfersize(4) uint16_t * ptr, uint16x4_t val); // VST1.16 {d0}, [r0]
1315*80a68eefSBob Badour _NEON2SSESTORAGE void vst1_u32(__transfersize(2) uint32_t * ptr, uint32x2_t val); // VST1.32 {d0}, [r0]
1316*80a68eefSBob Badour _NEON2SSESTORAGE void vst1_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val); // VST1.64 {d0}, [r0]
1317*80a68eefSBob Badour _NEON2SSESTORAGE void vst1_s8(__transfersize(8) int8_t * ptr, int8x8_t val); // VST1.8 {d0}, [r0]
1318*80a68eefSBob Badour _NEON2SSESTORAGE void vst1_s16(__transfersize(4) int16_t * ptr, int16x4_t val); // VST1.16 {d0}, [r0]
1319*80a68eefSBob Badour _NEON2SSESTORAGE void vst1_s32(__transfersize(2) int32_t * ptr, int32x2_t val); // VST1.32 {d0}, [r0]
1320*80a68eefSBob Badour _NEON2SSESTORAGE void vst1_s64(__transfersize(1) int64_t * ptr, int64x1_t val); // VST1.64 {d0}, [r0]
1321*80a68eefSBob Badour _NEON2SSESTORAGE void vst1_f16(__transfersize(4) __fp16 * ptr, float16x4_t val); // VST1.16 {d0}, [r0]
1322*80a68eefSBob Badour _NEON2SSESTORAGE void vst1_f32(__transfersize(2) float32_t * ptr, float32x2_t val); // VST1.32 {d0}, [r0]
1323*80a68eefSBob Badour _NEON2SSESTORAGE void vst1_p8(__transfersize(8) poly8_t * ptr, poly8x8_t val); // VST1.8 {d0}, [r0]
1324*80a68eefSBob Badour _NEON2SSESTORAGE void vst1_p16(__transfersize(4) poly16_t * ptr, poly16x4_t val); // VST1.16 {d0}, [r0]
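//Usage sketch (illustrative only): vst1q_*/vst1_* write a whole vector back to memory, mirroring the
//loads above. A simple 16-bytes-at-a-time copy over hypothetical buffers 'src' and 'dst', with 'n' a
//multiple of 16:
//    for (int i = 0; i < n; i += 16)
//        vst1q_u8(dst + i, vld1q_u8(src + i));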
1325*80a68eefSBob Badour //Store a lane of a vector into memory
1326*80a68eefSBob Badour //Loads of an N-element structure
1327*80a68eefSBob Badour //Load N-element structure from memory
1328*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16x2_t vld2q_u8(__transfersize(32) uint8_t const * ptr); // VLD2.8 {d0, d2}, [r0]
1329*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8x2_t vld2q_u16(__transfersize(16) uint16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
1330*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4x2_t vld2q_u32(__transfersize(8) uint32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
1331*80a68eefSBob Badour _NEON2SSESTORAGE int8x16x2_t vld2q_s8(__transfersize(32) int8_t const * ptr); // VLD2.8 {d0, d2}, [r0]
1332*80a68eefSBob Badour _NEON2SSESTORAGE int16x8x2_t vld2q_s16(__transfersize(16) int16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
1333*80a68eefSBob Badour _NEON2SSESTORAGE int32x4x2_t vld2q_s32(__transfersize(8) int32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
1334*80a68eefSBob Badour _NEON2SSESTORAGE float16x8x2_t vld2q_f16(__transfersize(16) __fp16 const * ptr); // VLD2.16 {d0, d2}, [r0]
1335*80a68eefSBob Badour _NEON2SSESTORAGE float32x4x2_t vld2q_f32(__transfersize(8) float32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
1336*80a68eefSBob Badour _NEON2SSESTORAGE poly8x16x2_t vld2q_p8(__transfersize(32) poly8_t const * ptr); // VLD2.8 {d0, d2}, [r0]
1337*80a68eefSBob Badour _NEON2SSESTORAGE poly16x8x2_t vld2q_p16(__transfersize(16) poly16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
1338*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8x2_t vld2_u8(__transfersize(16) uint8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
1339*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4x2_t vld2_u16(__transfersize(8) uint16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
1340*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2x2_t vld2_u32(__transfersize(4) uint32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
1341*80a68eefSBob Badour _NEON2SSESTORAGE uint64x1x2_t vld2_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
1342*80a68eefSBob Badour _NEON2SSESTORAGE int8x8x2_t vld2_s8(__transfersize(16) int8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
1343*80a68eefSBob Badour _NEON2SSESTORAGE int16x4x2_t vld2_s16(__transfersize(8) int16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
1344*80a68eefSBob Badour _NEON2SSESTORAGE int32x2x2_t vld2_s32(__transfersize(4) int32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
1345*80a68eefSBob Badour _NEON2SSESTORAGE int64x1x2_t vld2_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
1346*80a68eefSBob Badour //float16x4x2_t vld2_f16(__transfersize(8) __fp16 const * ptr); // VLD2.16 {d0, d1}, [r0]
1347*80a68eefSBob Badour _NEON2SSESTORAGE float32x2x2_t vld2_f32(__transfersize(4) float32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
1348*80a68eefSBob Badour _NEON2SSESTORAGE poly8x8x2_t vld2_p8(__transfersize(16) poly8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
1349*80a68eefSBob Badour _NEON2SSESTORAGE poly16x4x2_t vld2_p16(__transfersize(8) poly16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
1350*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16x3_t vld3q_u8(__transfersize(48) uint8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
1351*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8x3_t vld3q_u16(__transfersize(24) uint16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
1352*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4x3_t vld3q_u32(__transfersize(12) uint32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
1353*80a68eefSBob Badour _NEON2SSESTORAGE int8x16x3_t vld3q_s8(__transfersize(48) int8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
1354*80a68eefSBob Badour _NEON2SSESTORAGE int16x8x3_t vld3q_s16(__transfersize(24) int16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
1355*80a68eefSBob Badour _NEON2SSESTORAGE int32x4x3_t vld3q_s32(__transfersize(12) int32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
1356*80a68eefSBob Badour _NEON2SSESTORAGE float16x8x3_t vld3q_f16(__transfersize(24) __fp16 const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
1357*80a68eefSBob Badour _NEON2SSESTORAGE float32x4x3_t vld3q_f32(__transfersize(12) float32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
1358*80a68eefSBob Badour _NEON2SSESTORAGE poly8x16x3_t vld3q_p8(__transfersize(48) poly8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
1359*80a68eefSBob Badour _NEON2SSESTORAGE poly16x8x3_t vld3q_p16(__transfersize(24) poly16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
1360*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8x3_t vld3_u8(__transfersize(24) uint8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
1361*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4x3_t vld3_u16(__transfersize(12) uint16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
1362*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2x3_t vld3_u32(__transfersize(6) uint32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
1363*80a68eefSBob Badour _NEON2SSESTORAGE uint64x1x3_t vld3_u64(__transfersize(3) uint64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
1364*80a68eefSBob Badour _NEON2SSESTORAGE int8x8x3_t vld3_s8(__transfersize(24) int8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
1365*80a68eefSBob Badour _NEON2SSESTORAGE int16x4x3_t vld3_s16(__transfersize(12) int16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
1366*80a68eefSBob Badour _NEON2SSESTORAGE int32x2x3_t vld3_s32(__transfersize(6) int32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
1367*80a68eefSBob Badour _NEON2SSESTORAGE int64x1x3_t vld3_s64(__transfersize(3) int64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
1368*80a68eefSBob Badour _NEON2SSESTORAGE float16x4x3_t vld3_f16(__transfersize(12) __fp16 const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
1369*80a68eefSBob Badour _NEON2SSESTORAGE float32x2x3_t vld3_f32(__transfersize(6) float32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
1370*80a68eefSBob Badour _NEON2SSESTORAGE poly8x8x3_t vld3_p8(__transfersize(24) poly8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
1371*80a68eefSBob Badour _NEON2SSESTORAGE poly16x4x3_t vld3_p16(__transfersize(12) poly16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
1372*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16x4_t vld4q_u8(__transfersize(64) uint8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
1373*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8x4_t vld4q_u16(__transfersize(32) uint16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
1374*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4x4_t vld4q_u32(__transfersize(16) uint32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
1375*80a68eefSBob Badour _NEON2SSESTORAGE int8x16x4_t vld4q_s8(__transfersize(64) int8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
1376*80a68eefSBob Badour _NEON2SSESTORAGE int16x8x4_t vld4q_s16(__transfersize(32) int16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
1377*80a68eefSBob Badour _NEON2SSESTORAGE int32x4x4_t vld4q_s32(__transfersize(16) int32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
1378*80a68eefSBob Badour _NEON2SSESTORAGE float16x8x4_t vld4q_f16(__transfersize(32) __fp16 const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
1379*80a68eefSBob Badour _NEON2SSESTORAGE float32x4x4_t vld4q_f32(__transfersize(16) float32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
1380*80a68eefSBob Badour _NEON2SSESTORAGE poly8x16x4_t vld4q_p8(__transfersize(64) poly8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
1381*80a68eefSBob Badour _NEON2SSESTORAGE poly16x8x4_t vld4q_p16(__transfersize(32) poly16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
1382*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8x4_t vld4_u8(__transfersize(32) uint8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
1383*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4x4_t vld4_u16(__transfersize(16) uint16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
1384*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2x4_t vld4_u32(__transfersize(8) uint32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
1385*80a68eefSBob Badour _NEON2SSESTORAGE uint64x1x4_t vld4_u64(__transfersize(4) uint64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
1386*80a68eefSBob Badour _NEON2SSESTORAGE int8x8x4_t vld4_s8(__transfersize(32) int8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
1387*80a68eefSBob Badour _NEON2SSESTORAGE int16x4x4_t vld4_s16(__transfersize(16) int16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
1388*80a68eefSBob Badour _NEON2SSESTORAGE int32x2x4_t vld4_s32(__transfersize(8) int32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
1389*80a68eefSBob Badour _NEON2SSESTORAGE int64x1x4_t vld4_s64(__transfersize(4) int64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
1390*80a68eefSBob Badour _NEON2SSESTORAGE float16x4x4_t vld4_f16(__transfersize(16) __fp16 const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
1391*80a68eefSBob Badour _NEON2SSESTORAGE float32x2x4_t vld4_f32(__transfersize(8) float32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
1392*80a68eefSBob Badour _NEON2SSESTORAGE poly8x8x4_t vld4_p8(__transfersize(32) poly8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
1393*80a68eefSBob Badour _NEON2SSESTORAGE poly16x4x4_t vld4_p16(__transfersize(16) poly16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
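//Usage sketch (illustrative only): vld2/vld3/vld4 de-interleave while loading: element 0 of every
//structure goes to val[0], element 1 to val[1], and so on. Splitting 8 packed RGB pixels (24 bytes)
//into colour planes, with a hypothetical 'rgb' buffer:
//    uint8x8x3_t p = vld3_u8(rgb);    // p.val[0] = 8 R bytes, p.val[1] = 8 G bytes, p.val[2] = 8 B bytes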
1394*80a68eefSBob Badour //Load all lanes of N-element structure with same value from memory
1395*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8x2_t vld2_dup_u8(__transfersize(2) uint8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
1396*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4x2_t vld2_dup_u16(__transfersize(2) uint16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
1397*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2x2_t vld2_dup_u32(__transfersize(2) uint32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
1398*80a68eefSBob Badour _NEON2SSESTORAGE uint64x1x2_t vld2_dup_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
1399*80a68eefSBob Badour _NEON2SSESTORAGE int8x8x2_t vld2_dup_s8(__transfersize(2) int8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
1400*80a68eefSBob Badour _NEON2SSESTORAGE int16x4x2_t vld2_dup_s16(__transfersize(2) int16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
1401*80a68eefSBob Badour _NEON2SSESTORAGE int32x2x2_t vld2_dup_s32(__transfersize(2) int32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
1402*80a68eefSBob Badour _NEON2SSESTORAGE int64x1x2_t vld2_dup_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
1403*80a68eefSBob Badour //float16x4x2_t vld2_dup_f16(__transfersize(2) __fp16 const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
1404*80a68eefSBob Badour _NEON2SSESTORAGE float32x2x2_t vld2_dup_f32(__transfersize(2) float32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
1405*80a68eefSBob Badour _NEON2SSESTORAGE poly8x8x2_t vld2_dup_p8(__transfersize(2) poly8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
1406*80a68eefSBob Badour _NEON2SSESTORAGE poly16x4x2_t vld2_dup_p16(__transfersize(2) poly16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
1407*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8x3_t vld3_dup_u8(__transfersize(3) uint8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
1408*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4x3_t vld3_dup_u16(__transfersize(3) uint16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
1409*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2x3_t vld3_dup_u32(__transfersize(3) uint32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
1410*80a68eefSBob Badour _NEON2SSESTORAGE uint64x1x3_t vld3_dup_u64(__transfersize(3) uint64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
1411*80a68eefSBob Badour _NEON2SSESTORAGE int8x8x3_t vld3_dup_s8(__transfersize(3) int8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
1412*80a68eefSBob Badour _NEON2SSESTORAGE int16x4x3_t vld3_dup_s16(__transfersize(3) int16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
1413*80a68eefSBob Badour _NEON2SSESTORAGE int32x2x3_t vld3_dup_s32(__transfersize(3) int32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
1414*80a68eefSBob Badour _NEON2SSESTORAGE int64x1x3_t vld3_dup_s64(__transfersize(3) int64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
1415*80a68eefSBob Badour _NEON2SSESTORAGE float16x4x3_t vld3_dup_f16(__transfersize(3) __fp16 const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
1416*80a68eefSBob Badour _NEON2SSESTORAGE float32x2x3_t vld3_dup_f32(__transfersize(3) float32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
1417*80a68eefSBob Badour _NEON2SSESTORAGE poly8x8x3_t vld3_dup_p8(__transfersize(3) poly8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
1418*80a68eefSBob Badour _NEON2SSESTORAGE poly16x4x3_t vld3_dup_p16(__transfersize(3) poly16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
1419*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8x4_t vld4_dup_u8(__transfersize(4) uint8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
1420*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4x4_t vld4_dup_u16(__transfersize(4) uint16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
1421*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2x4_t vld4_dup_u32(__transfersize(4) uint32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
1422*80a68eefSBob Badour _NEON2SSESTORAGE uint64x1x4_t vld4_dup_u64(__transfersize(4) uint64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
1423*80a68eefSBob Badour _NEON2SSESTORAGE int8x8x4_t vld4_dup_s8(__transfersize(4) int8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
1424*80a68eefSBob Badour _NEON2SSESTORAGE int16x4x4_t vld4_dup_s16(__transfersize(4) int16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
1425*80a68eefSBob Badour _NEON2SSESTORAGE int32x2x4_t vld4_dup_s32(__transfersize(4) int32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
1426*80a68eefSBob Badour _NEON2SSESTORAGE int64x1x4_t vld4_dup_s64(__transfersize(4) int64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
1427*80a68eefSBob Badour _NEON2SSESTORAGE float16x4x4_t vld4_dup_f16(__transfersize(4) __fp16 const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
1428*80a68eefSBob Badour _NEON2SSESTORAGE float32x2x4_t vld4_dup_f32(__transfersize(4) float32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
1429*80a68eefSBob Badour _NEON2SSESTORAGE poly8x8x4_t vld4_dup_p8(__transfersize(4) poly8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
1430*80a68eefSBob Badour _NEON2SSESTORAGE poly16x4x4_t vld4_dup_p16(__transfersize(4) poly16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
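//Usage sketch (illustrative only): the vldN_dup forms read one whole structure of 2, 3 or 4 consecutive
//elements and broadcast each member across its own vector, e.g. splatting a single RGBA pixel ('px' is
//hypothetical):
//    uint8_t px[4] = {10, 20, 30, 255};
//    uint8x8x4_t c = vld4_dup_u8(px);   // c.val[0] = 8 x 10, c.val[1] = 8 x 20, etc.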
1431*80a68eefSBob Badour //Load a single lane of N-element structure from memory
1432*80a68eefSBob Badour //the functions below with the '_ptr' suffix take 'src' by pointer to work around MSVC error C2719: 'src': formal parameter with __declspec(align('16')) won't be aligned; see the usage sketch after these declarations
1433*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8x2_t vld2q_lane_u16_ptr(__transfersize(2) uint16_t const * ptr, uint16x8x2_t * src, __constrange(0,7) int lane); // VLD2.16 {d0[0], d2[0]}, [r0]
1434*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4x2_t vld2q_lane_u32_ptr(__transfersize(2) uint32_t const * ptr, uint32x4x2_t * src, __constrange(0,3) int lane); // VLD2.32 {d0[0], d2[0]}, [r0]
1435*80a68eefSBob Badour _NEON2SSESTORAGE int16x8x2_t vld2q_lane_s16_ptr(__transfersize(2) int16_t const * ptr, int16x8x2_t * src, __constrange(0,7) int lane); // VLD2.16 {d0[0], d2[0]}, [r0]
1436*80a68eefSBob Badour _NEON2SSESTORAGE int32x4x2_t vld2q_lane_s32_ptr(__transfersize(2) int32_t const * ptr, int32x4x2_t * src, __constrange(0,3) int lane); // VLD2.32 {d0[0], d2[0]}, [r0]
1437*80a68eefSBob Badour _NEON2SSESTORAGE float16x8x2_t vld2q_lane_f16_ptr(__transfersize(2) __fp16 const * ptr, float16x8x2_t * src, __constrange(0,7) int lane); // VLD2.16 {d0[0], d2[0]}, [r0]
1438*80a68eefSBob Badour _NEON2SSESTORAGE float32x4x2_t vld2q_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x4x2_t * src, __constrange(0,3) int lane); // VLD2.32 {d0[0], d2[0]}, [r0]
1439*80a68eefSBob Badour _NEON2SSESTORAGE poly16x8x2_t vld2q_lane_p16_ptr(__transfersize(2) poly16_t const * ptr, poly16x8x2_t * src, __constrange(0,7) int lane); // VLD2.16 {d0[0], d2[0]}, [r0]
1440*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8x2_t vld2_lane_u8(__transfersize(2) uint8_t const * ptr, uint8x8x2_t src, __constrange(0,7) int lane); //VLD2.8 {d0[0], d1[0]}, [r0]
1441*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4x2_t vld2_lane_u16(__transfersize(2) uint16_t const * ptr, uint16x4x2_t src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
1442*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2x2_t vld2_lane_u32(__transfersize(2) uint32_t const * ptr, uint32x2x2_t src, __constrange(0,1) int lane); // VLD2.32 {d0[0], d1[0]}, [r0]
1443*80a68eefSBob Badour _NEON2SSESTORAGE int8x8x2_t vld2_lane_s8(__transfersize(2) int8_t const * ptr, int8x8x2_t src, __constrange(0,7) int lane); //VLD2.8 {d0[0], d1[0]}, [r0]
1444*80a68eefSBob Badour _NEON2SSESTORAGE int16x4x2_t vld2_lane_s16(__transfersize(2) int16_t const * ptr, int16x4x2_t src, __constrange(0,3) int lane); //VLD2.16 {d0[0], d1[0]}, [r0]
1445*80a68eefSBob Badour _NEON2SSESTORAGE int32x2x2_t vld2_lane_s32(__transfersize(2) int32_t const * ptr, int32x2x2_t src, __constrange(0,1) int lane); //VLD2.32 {d0[0], d1[0]}, [r0]
1446*80a68eefSBob Badour //float16x4x2_t vld2_lane_f16_ptr(__transfersize(2) __fp16 const * ptr, float16x4x2_t * src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
1447*80a68eefSBob Badour _NEON2SSESTORAGE float32x2x2_t vld2_lane_f32(__transfersize(2) float32_t const * ptr, float32x2x2_t  src, __constrange(0,1) int lane); // VLD2.32 {d0[0], d1[0]}, [r0]
1448*80a68eefSBob Badour _NEON2SSESTORAGE poly8x8x2_t vld2_lane_p8(__transfersize(2) poly8_t const * ptr, poly8x8x2_t  src, __constrange(0,7) int lane); //VLD2.8 {d0[0], d1[0]}, [r0]
1449*80a68eefSBob Badour _NEON2SSESTORAGE poly16x4x2_t vld2_lane_p16(__transfersize(2) poly16_t const * ptr, poly16x4x2_t  src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
1450*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8x3_t vld3q_lane_u16_ptr(__transfersize(3) uint16_t const * ptr, uint16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
1451*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4x3_t vld3q_lane_u32_ptr(__transfersize(3) uint32_t const * ptr, uint32x4x3_t * src, __constrange(0,3) int lane); // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
1452*80a68eefSBob Badour _NEON2SSESTORAGE int16x8x3_t vld3q_lane_s16_ptr(__transfersize(3) int16_t const * ptr, int16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
1453*80a68eefSBob Badour _NEON2SSESTORAGE int32x4x3_t vld3q_lane_s32_ptr(__transfersize(3) int32_t const * ptr, int32x4x3_t * src, __constrange(0,3) int lane); // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
1454*80a68eefSBob Badour _NEON2SSESTORAGE float16x8x3_t vld3q_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
1455*80a68eefSBob Badour _NEON2SSESTORAGE float32x4x3_t vld3q_lane_f32_ptr(__transfersize(3) float32_t const * ptr, float32x4x3_t * src, __constrange(0,3) int lane); // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
1456*80a68eefSBob Badour _NEON2SSESTORAGE poly16x8x3_t vld3q_lane_p16_ptr(__transfersize(3) poly16_t const * ptr, poly16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
1457*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8x3_t vld3_lane_u8(__transfersize(3) uint8_t const * ptr, uint8x8x3_t src, __constrange(0,7) int lane); //VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
1458*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4x3_t vld3_lane_u16(__transfersize(3) uint16_t const * ptr, uint16x4x3_t src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
1459*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2x3_t vld3_lane_u32(__transfersize(3) uint32_t const * ptr, uint32x2x3_t src, __constrange(0,1) int lane); // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
1460*80a68eefSBob Badour _NEON2SSESTORAGE int8x8x3_t vld3_lane_s8(__transfersize(3) int8_t const * ptr, int8x8x3_t src, __constrange(0,7) int lane); //VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
1461*80a68eefSBob Badour _NEON2SSESTORAGE int16x4x3_t vld3_lane_s16(__transfersize(3) int16_t const * ptr, int16x4x3_t src, __constrange(0,3) int lane); //VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
1462*80a68eefSBob Badour _NEON2SSESTORAGE int32x2x3_t vld3_lane_s32(__transfersize(3) int32_t const * ptr, int32x2x3_t src, __constrange(0,1) int lane); //VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
1463*80a68eefSBob Badour _NEON2SSESTORAGE float16x4x3_t vld3_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x4x3_t * src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
1464*80a68eefSBob Badour _NEON2SSESTORAGE float32x2x3_t vld3_lane_f32(__transfersize(3) float32_t const * ptr, float32x2x3_t src, __constrange(0,1) int lane); // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
1465*80a68eefSBob Badour _NEON2SSESTORAGE poly8x8x3_t vld3_lane_p8(__transfersize(3) poly8_t const * ptr, poly8x8x3_t src, __constrange(0,7) int lane); //VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
1466*80a68eefSBob Badour _NEON2SSESTORAGE poly16x4x3_t vld3_lane_p16(__transfersize(3) poly16_t const * ptr, poly16x4x3_t src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
1467*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8x4_t vld4q_lane_u16_ptr(__transfersize(4) uint16_t const * ptr, uint16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
1468*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4x4_t vld4q_lane_u32_ptr(__transfersize(4) uint32_t const * ptr, uint32x4x4_t * src, __constrange(0,3) int lane); // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
1469*80a68eefSBob Badour _NEON2SSESTORAGE int16x8x4_t vld4q_lane_s16_ptr(__transfersize(4) int16_t const * ptr, int16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
1470*80a68eefSBob Badour _NEON2SSESTORAGE int32x4x4_t vld4q_lane_s32_ptr(__transfersize(4) int32_t const * ptr, int32x4x4_t * src, __constrange(0,3) int lane); // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
1471*80a68eefSBob Badour _NEON2SSESTORAGE float16x8x4_t vld4q_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
1472*80a68eefSBob Badour _NEON2SSESTORAGE float32x4x4_t vld4q_lane_f32_ptr(__transfersize(4) float32_t const * ptr, float32x4x4_t * src, __constrange(0,3) int lane); // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
1473*80a68eefSBob Badour _NEON2SSESTORAGE poly16x8x4_t vld4q_lane_p16_ptr(__transfersize(4) poly16_t const * ptr, poly16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
1474*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8x4_t vld4_lane_u8(__transfersize(4) uint8_t const * ptr, uint8x8x4_t src, __constrange(0,7) int lane); //VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
1475*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4x4_t vld4_lane_u16(__transfersize(4) uint16_t const * ptr, uint16x4x4_t src, __constrange(0,3) int lane); // VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
1476*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2x4_t vld4_lane_u32(__transfersize(4) uint32_t const * ptr, uint32x2x4_t src, __constrange(0,1) int lane); // VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
1477*80a68eefSBob Badour _NEON2SSESTORAGE int8x8x4_t vld4_lane_s8(__transfersize(4) int8_t const * ptr, int8x8x4_t src, __constrange(0,7) int lane); //VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
1478*80a68eefSBob Badour _NEON2SSESTORAGE int16x4x4_t vld4_lane_s16(__transfersize(4) int16_t const * ptr, int16x4x4_t src, __constrange(0,3) int lane); //VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
1479*80a68eefSBob Badour _NEON2SSESTORAGE int32x2x4_t vld4_lane_s32(__transfersize(4) int32_t const * ptr, int32x2x4_t src, __constrange(0,1) int lane); //VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
1480*80a68eefSBob Badour _NEON2SSESTORAGE float16x4x4_t vld4_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x4x4_t * src, __constrange(0,3) int lane); // VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
1481*80a68eefSBob Badour _NEON2SSESTORAGE float32x2x4_t vld4_lane_f32(__transfersize(4) float32_t const * ptr, float32x2x4_t src, __constrange(0,1) int lane); // VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
1482*80a68eefSBob Badour _NEON2SSESTORAGE poly8x8x4_t vld4_lane_p8(__transfersize(4) poly8_t const * ptr, poly8x8x4_t src, __constrange(0,7) int lane); //VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
1483*80a68eefSBob Badour _NEON2SSESTORAGE poly16x4x4_t vld4_lane_p16(__transfersize(4) poly16_t const * ptr, poly16x4x4_t src, __constrange(0,3) int lane); // VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
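//Usage sketch (illustrative only): the lane loads read one complete structure from memory into the given
//lane of each member vector; the q-register '_ptr' variants differ from arm_neon.h only in passing 'src'
//by pointer (see the note above). Hypothetical example using the d-register form:
//    uint16_t two[2] = {1, 2};
//    uint16x4x2_t s = vld2_dup_u16(two);
//    s = vld2_lane_u16(two, s, 3);    // lane 3 of s.val[0] <- 1, lane 3 of s.val[1] <- 2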
1484*80a68eefSBob Badour //Store N-element structure to memory
1485*80a68eefSBob Badour _NEON2SSESTORAGE void vst2q_u8_ptr(__transfersize(32) uint8_t * ptr, uint8x16x2_t * val); // VST2.8 {d0, d2}, [r0]
1486*80a68eefSBob Badour _NEON2SSESTORAGE void vst2q_u16_ptr(__transfersize(16) uint16_t * ptr, uint16x8x2_t * val); // VST2.16 {d0, d2}, [r0]
1487*80a68eefSBob Badour _NEON2SSESTORAGE void vst2q_u32_ptr(__transfersize(8) uint32_t * ptr, uint32x4x2_t * val); // VST2.32 {d0, d2}, [r0]
1488*80a68eefSBob Badour _NEON2SSESTORAGE void vst2q_s8_ptr(__transfersize(32) int8_t * ptr, int8x16x2_t * val); // VST2.8 {d0, d2}, [r0]
1489*80a68eefSBob Badour _NEON2SSESTORAGE void vst2q_s16_ptr(__transfersize(16) int16_t * ptr, int16x8x2_t * val); // VST2.16 {d0, d2}, [r0]
1490*80a68eefSBob Badour _NEON2SSESTORAGE void vst2q_s32_ptr(__transfersize(8) int32_t * ptr, int32x4x2_t * val); // VST2.32 {d0, d2}, [r0]
1491*80a68eefSBob Badour _NEON2SSESTORAGE void vst2q_f16_ptr(__transfersize(16) __fp16 * ptr, float16x8x2_t * val); // VST2.16 {d0, d2}, [r0]
1492*80a68eefSBob Badour _NEON2SSESTORAGE void vst2q_f32_ptr(__transfersize(8) float32_t * ptr, float32x4x2_t * val); // VST2.32 {d0, d2}, [r0]
1493*80a68eefSBob Badour _NEON2SSESTORAGE void vst2q_p8_ptr(__transfersize(32) poly8_t * ptr, poly8x16x2_t * val); // VST2.8 {d0, d2}, [r0]
1494*80a68eefSBob Badour _NEON2SSESTORAGE void vst2q_p16_ptr(__transfersize(16) poly16_t * ptr, poly16x8x2_t * val); // VST2.16 {d0, d2}, [r0]
1495*80a68eefSBob Badour _NEON2SSESTORAGE void vst2_u8(__transfersize(16) uint8_t * ptr, uint8x8x2_t val); // VST2.8 {d0, d1}, [r0]
1496*80a68eefSBob Badour _NEON2SSESTORAGE void vst2_u16(__transfersize(8) uint16_t * ptr, uint16x4x2_t val); // VST2.16 {d0, d1}, [r0]
1497*80a68eefSBob Badour _NEON2SSESTORAGE void vst2_u32(__transfersize(4) uint32_t * ptr, uint32x2x2_t val); // VST2.32 {d0, d1}, [r0]
1498*80a68eefSBob Badour _NEON2SSESTORAGE void vst2_u64(__transfersize(2) uint64_t * ptr, uint64x1x2_t val); // VST1.64 {d0, d1}, [r0]
1499*80a68eefSBob Badour _NEON2SSESTORAGE void vst2_s8(__transfersize(16) int8_t * ptr, int8x8x2_t val); // VST2.8 {d0, d1}, [r0]
1500*80a68eefSBob Badour _NEON2SSESTORAGE void vst2_s16(__transfersize(8) int16_t * ptr, int16x4x2_t val); // VST2.16 {d0, d1}, [r0]
1501*80a68eefSBob Badour _NEON2SSESTORAGE void vst2_s32(__transfersize(4) int32_t * ptr, int32x2x2_t val); // VST2.32 {d0, d1}, [r0]
1502*80a68eefSBob Badour _NEON2SSESTORAGE void vst2_s64(__transfersize(2) int64_t * ptr, int64x1x2_t val); // VST1.64 {d0, d1}, [r0]
1503*80a68eefSBob Badour //void vst2_f16_ptr(__transfersize(8) __fp16 * ptr, float16x4x2_t * val); // VST2.16 {d0, d1}, [r0]
1504*80a68eefSBob Badour _NEON2SSESTORAGE void vst2_f32_ptr(__transfersize(4) float32_t * ptr, float32x2x2_t * val); // VST2.32 {d0, d1}, [r0]
1505*80a68eefSBob Badour _NEON2SSESTORAGE void vst2_p8(__transfersize(16) poly8_t * ptr, poly8x8x2_t val); // VST2.8 {d0, d1}, [r0]
1506*80a68eefSBob Badour _NEON2SSESTORAGE void vst2_p16(__transfersize(8) poly16_t * ptr, poly16x4x2_t val); // VST2.16 {d0, d1}, [r0]
1507*80a68eefSBob Badour _NEON2SSESTORAGE void vst3q_u8_ptr(__transfersize(48) uint8_t * ptr, uint8x16x3_t * val); // VST3.8 {d0, d2, d4}, [r0]
1508*80a68eefSBob Badour _NEON2SSESTORAGE void vst3q_u16_ptr(__transfersize(24) uint16_t * ptr, uint16x8x3_t * val); // VST3.16 {d0, d2, d4}, [r0]
1509*80a68eefSBob Badour _NEON2SSESTORAGE void vst3q_u32_ptr(__transfersize(12) uint32_t * ptr, uint32x4x3_t * val); // VST3.32 {d0, d2, d4}, [r0]
1510*80a68eefSBob Badour _NEON2SSESTORAGE void vst3q_s8_ptr(__transfersize(48) int8_t * ptr, int8x16x3_t * val); // VST3.8 {d0, d2, d4}, [r0]
1511*80a68eefSBob Badour _NEON2SSESTORAGE void vst3q_s16_ptr(__transfersize(24) int16_t * ptr, int16x8x3_t * val); // VST3.16 {d0, d2, d4}, [r0]
1512*80a68eefSBob Badour _NEON2SSESTORAGE void vst3q_s32_ptr(__transfersize(12) int32_t * ptr, int32x4x3_t * val); // VST3.32 {d0, d2, d4}, [r0]
1513*80a68eefSBob Badour _NEON2SSESTORAGE void vst3q_f16_ptr(__transfersize(24) __fp16 * ptr, float16x8x3_t * val); // VST3.16 {d0, d2, d4}, [r0]
1514*80a68eefSBob Badour _NEON2SSESTORAGE void vst3q_f32_ptr(__transfersize(12) float32_t * ptr, float32x4x3_t * val); // VST3.32 {d0, d2, d4}, [r0]
1515*80a68eefSBob Badour _NEON2SSESTORAGE void vst3q_p8_ptr(__transfersize(48) poly8_t * ptr, poly8x16x3_t * val); // VST3.8 {d0, d2, d4}, [r0]
1516*80a68eefSBob Badour _NEON2SSESTORAGE void vst3q_p16_ptr(__transfersize(24) poly16_t * ptr, poly16x8x3_t * val); // VST3.16 {d0, d2, d4}, [r0]
1517*80a68eefSBob Badour _NEON2SSESTORAGE void vst3_u8(__transfersize(24) uint8_t * ptr, uint8x8x3_t val); // VST3.8 {d0, d1, d2}, [r0]
1518*80a68eefSBob Badour _NEON2SSESTORAGE void vst3_u16(__transfersize(12) uint16_t * ptr, uint16x4x3_t val); // VST3.16 {d0, d1, d2}, [r0]
1519*80a68eefSBob Badour _NEON2SSESTORAGE void vst3_u32(__transfersize(6) uint32_t * ptr, uint32x2x3_t val); // VST3.32 {d0, d1, d2}, [r0]
1520*80a68eefSBob Badour _NEON2SSESTORAGE void vst3_u64(__transfersize(3) uint64_t * ptr, uint64x1x3_t val); // VST1.64 {d0, d1, d2}, [r0]
1521*80a68eefSBob Badour _NEON2SSESTORAGE void vst3_s8(__transfersize(24) int8_t * ptr, int8x8x3_t val); // VST3.8 {d0, d1, d2}, [r0]
1522*80a68eefSBob Badour _NEON2SSESTORAGE void vst3_s16(__transfersize(12) int16_t * ptr, int16x4x3_t val); // VST3.16 {d0, d1, d2}, [r0]
1523*80a68eefSBob Badour _NEON2SSESTORAGE void vst3_s32(__transfersize(6) int32_t * ptr, int32x2x3_t val); // VST3.32 {d0, d1, d2}, [r0]
1524*80a68eefSBob Badour _NEON2SSESTORAGE void vst3_s64(__transfersize(3) int64_t * ptr, int64x1x3_t val); // VST1.64 {d0, d1, d2}, [r0]
1525*80a68eefSBob Badour _NEON2SSESTORAGE void vst3_f16_ptr(__transfersize(12) __fp16 * ptr, float16x4x3_t * val); // VST3.16 {d0, d1, d2}, [r0]
1526*80a68eefSBob Badour _NEON2SSESTORAGE void vst3_f32(__transfersize(6) float32_t * ptr, float32x2x3_t val); // VST3.32 {d0, d1, d2}, [r0]
1527*80a68eefSBob Badour _NEON2SSESTORAGE void vst3_p8(__transfersize(24) poly8_t * ptr, poly8x8x3_t val); // VST3.8 {d0, d1, d2}, [r0]
1528*80a68eefSBob Badour _NEON2SSESTORAGE void vst3_p16(__transfersize(12) poly16_t * ptr, poly16x4x3_t val); // VST3.16 {d0, d1, d2}, [r0]
1529*80a68eefSBob Badour _NEON2SSESTORAGE void vst4q_u8_ptr(__transfersize(64) uint8_t * ptr, uint8x16x4_t * val); // VST4.8 {d0, d2, d4, d6}, [r0]
1530*80a68eefSBob Badour _NEON2SSESTORAGE void vst4q_u16_ptr(__transfersize(32) uint16_t * ptr, uint16x8x4_t * val); // VST4.16 {d0, d2, d4, d6}, [r0]
1531*80a68eefSBob Badour _NEON2SSESTORAGE void vst4q_u32_ptr(__transfersize(16) uint32_t * ptr, uint32x4x4_t * val); // VST4.32 {d0, d2, d4, d6}, [r0]
1532*80a68eefSBob Badour _NEON2SSESTORAGE void vst4q_s8_ptr(__transfersize(64) int8_t * ptr, int8x16x4_t * val); // VST4.8 {d0, d2, d4, d6}, [r0]
1533*80a68eefSBob Badour _NEON2SSESTORAGE void vst4q_s16_ptr(__transfersize(32) int16_t * ptr, int16x8x4_t * val); // VST4.16 {d0, d2, d4, d6}, [r0]
1534*80a68eefSBob Badour _NEON2SSESTORAGE void vst4q_s32_ptr(__transfersize(16) int32_t * ptr, int32x4x4_t * val); // VST4.32 {d0, d2, d4, d6}, [r0]
1535*80a68eefSBob Badour _NEON2SSESTORAGE void vst4q_f16_ptr(__transfersize(32) __fp16 * ptr, float16x8x4_t * val); // VST4.16 {d0, d2, d4, d6}, [r0]
1536*80a68eefSBob Badour _NEON2SSESTORAGE void vst4q_f32_ptr(__transfersize(16) float32_t * ptr, float32x4x4_t * val); // VST4.32 {d0, d2, d4, d6}, [r0]
1537*80a68eefSBob Badour _NEON2SSESTORAGE void vst4q_p8_ptr(__transfersize(64) poly8_t * ptr, poly8x16x4_t * val); // VST4.8 {d0, d2, d4, d6}, [r0]
1538*80a68eefSBob Badour _NEON2SSESTORAGE void vst4q_p16_ptr(__transfersize(32) poly16_t * ptr, poly16x8x4_t * val); // VST4.16 {d0, d2, d4, d6}, [r0]
1539*80a68eefSBob Badour _NEON2SSESTORAGE void vst4_u8(__transfersize(32) uint8_t * ptr, uint8x8x4_t val); // VST4.8 {d0, d1, d2, d3}, [r0]
1540*80a68eefSBob Badour _NEON2SSESTORAGE void vst4_u16(__transfersize(16) uint16_t * ptr, uint16x4x4_t val); // VST4.16 {d0, d1, d2, d3}, [r0]
1541*80a68eefSBob Badour _NEON2SSESTORAGE void vst4_u32(__transfersize(8) uint32_t * ptr, uint32x2x4_t val); // VST4.32 {d0, d1, d2, d3}, [r0]
1542*80a68eefSBob Badour _NEON2SSESTORAGE void vst4_u64(__transfersize(4) uint64_t * ptr, uint64x1x4_t val); // VST1.64 {d0, d1, d2, d3}, [r0]
1543*80a68eefSBob Badour _NEON2SSESTORAGE void vst4_s8(__transfersize(32) int8_t * ptr, int8x8x4_t val); // VST4.8 {d0, d1, d2, d3}, [r0]
1544*80a68eefSBob Badour _NEON2SSESTORAGE void vst4_s16(__transfersize(16) int16_t * ptr, int16x4x4_t val); // VST4.16 {d0, d1, d2, d3}, [r0]
1545*80a68eefSBob Badour _NEON2SSESTORAGE void vst4_s32(__transfersize(8) int32_t * ptr, int32x2x4_t val); // VST4.32 {d0, d1, d2, d3}, [r0]
1546*80a68eefSBob Badour _NEON2SSESTORAGE void vst4_s64(__transfersize(4) int64_t * ptr, int64x1x4_t val); // VST1.64 {d0, d1, d2, d3}, [r0]
1547*80a68eefSBob Badour _NEON2SSESTORAGE void vst4_f16_ptr(__transfersize(16) __fp16 * ptr, float16x4x4_t * val); // VST4.16 {d0, d1, d2, d3}, [r0]
1548*80a68eefSBob Badour _NEON2SSESTORAGE void vst4_f32(__transfersize(8) float32_t * ptr, float32x2x4_t val); // VST4.32 {d0, d1, d2, d3}, [r0]
1549*80a68eefSBob Badour _NEON2SSESTORAGE void vst4_p8(__transfersize(32) poly8_t * ptr, poly8x8x4_t val); // VST4.8 {d0, d1, d2, d3}, [r0]
1550*80a68eefSBob Badour _NEON2SSESTORAGE void vst4_p16(__transfersize(16) poly16_t * ptr, poly16x4x4_t val); // VST4.16 {d0, d1, d2, d3}, [r0]
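//Usage sketch (illustrative only): vst2/vst3/vst4 are the interleaving counterparts of the structure
//loads: they take separate per-component vectors and write them to memory interleaved. Re-packing the
//RGB planes from the vld3_u8 sketch above, with a hypothetical 'out' buffer:
//    uint8x8x3_t p;      // p.val[0] = R plane, p.val[1] = G plane, p.val[2] = B plane, filled elsewhere
//    vst3_u8(out, p);    // writes R0 G0 B0 R1 G1 B1 ... (24 bytes)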
1551*80a68eefSBob Badour //Store a single lane of N-element structure to memory
1552*80a68eefSBob Badour _NEON2SSESTORAGE void vst2q_lane_u16_ptr(__transfersize(2) uint16_t * ptr, uint16x8x2_t * val, __constrange(0,7) int lane); // VST2.16{d0[0], d2[0]}, [r0]
1553*80a68eefSBob Badour _NEON2SSESTORAGE void vst2q_lane_u32_ptr(__transfersize(2) uint32_t * ptr, uint32x4x2_t * val, __constrange(0,3) int lane); // VST2.32{d0[0], d2[0]}, [r0]
1554*80a68eefSBob Badour _NEON2SSESTORAGE void vst2q_lane_s16_ptr(__transfersize(2) int16_t * ptr, int16x8x2_t * val, __constrange(0,7) int lane); // VST2.16{d0[0], d2[0]}, [r0]
1555*80a68eefSBob Badour _NEON2SSESTORAGE void vst2q_lane_s32_ptr(__transfersize(2) int32_t * ptr, int32x4x2_t * val, __constrange(0,3) int lane); // VST2.32{d0[0], d2[0]}, [r0]
1556*80a68eefSBob Badour _NEON2SSESTORAGE void vst2q_lane_f16_ptr(__transfersize(2) __fp16 * ptr, float16x8x2_t * val, __constrange(0,7) int lane); // VST2.16{d0[0], d2[0]}, [r0]
1557*80a68eefSBob Badour _NEON2SSESTORAGE void vst2q_lane_f32_ptr(__transfersize(2) float32_t * ptr, float32x4x2_t * val, __constrange(0,3) int lane); //VST2.32 {d0[0], d2[0]}, [r0]
1558*80a68eefSBob Badour _NEON2SSESTORAGE void vst2q_lane_p16_ptr(__transfersize(2) poly16_t * ptr, poly16x8x2_t * val, __constrange(0,7) int lane); // VST2.16{d0[0], d2[0]}, [r0]
1559*80a68eefSBob Badour _NEON2SSESTORAGE void vst2_lane_u8(__transfersize(2) uint8_t * ptr, uint8x8x2_t val, __constrange(0,7) int lane); // VST2.8{d0[0], d1[0]}, [r0]
1560*80a68eefSBob Badour _NEON2SSESTORAGE void vst2_lane_u16(__transfersize(2) uint16_t * ptr, uint16x4x2_t val, __constrange(0,3) int lane); // VST2.16{d0[0], d1[0]}, [r0]
1561*80a68eefSBob Badour _NEON2SSESTORAGE void vst2_lane_u32(__transfersize(2) uint32_t * ptr, uint32x2x2_t val, __constrange(0,1) int lane); // VST2.32{d0[0], d1[0]}, [r0]
1562*80a68eefSBob Badour _NEON2SSESTORAGE void vst2_lane_s8(__transfersize(2) int8_t * ptr, int8x8x2_t val, __constrange(0,7) int lane); // VST2.8 {d0[0],d1[0]}, [r0]
1563*80a68eefSBob Badour _NEON2SSESTORAGE void vst2_lane_s16(__transfersize(2) int16_t * ptr, int16x4x2_t val, __constrange(0,3) int lane); // VST2.16{d0[0], d1[0]}, [r0]
1564*80a68eefSBob Badour _NEON2SSESTORAGE void vst2_lane_s32(__transfersize(2) int32_t * ptr, int32x2x2_t val, __constrange(0,1) int lane); // VST2.32{d0[0], d1[0]}, [r0]
1565*80a68eefSBob Badour _NEON2SSESTORAGE void vst2_lane_f16_ptr(__transfersize(2) __fp16 * ptr, float16x4x2_t * val, __constrange(0,3) int lane); // VST2.16{d0[0], d1[0]}, [r0]
1566*80a68eefSBob Badour _NEON2SSESTORAGE void vst2_lane_f32(__transfersize(2) float32_t * ptr, float32x2x2_t val, __constrange(0,1) int lane); // VST2.32{d0[0], d1[0]}, [r0]
1567*80a68eefSBob Badour _NEON2SSESTORAGE void vst2_lane_p8(__transfersize(2) poly8_t * ptr, poly8x8x2_t val, __constrange(0,7) int lane); // VST2.8{d0[0], d1[0]}, [r0]
1568*80a68eefSBob Badour _NEON2SSESTORAGE void vst2_lane_p16(__transfersize(2) poly16_t * ptr, poly16x4x2_t val, __constrange(0,3) int lane); // VST2.16{d0[0], d1[0]}, [r0]
1569*80a68eefSBob Badour _NEON2SSESTORAGE void vst3q_lane_u16_ptr(__transfersize(3) uint16_t * ptr, uint16x8x3_t * val, __constrange(0,7) int lane); // VST3.16{d0[0], d2[0], d4[0]}, [r0]
1570*80a68eefSBob Badour _NEON2SSESTORAGE void vst3q_lane_u32_ptr(__transfersize(3) uint32_t * ptr, uint32x4x3_t * val, __constrange(0,3) int lane); // VST3.32{d0[0], d2[0], d4[0]}, [r0]
1571*80a68eefSBob Badour _NEON2SSESTORAGE void vst3q_lane_s16_ptr(__transfersize(3) int16_t * ptr, int16x8x3_t * val, __constrange(0,7) int lane); // VST3.16{d0[0], d2[0], d4[0]}, [r0]
1572*80a68eefSBob Badour _NEON2SSESTORAGE void vst3q_lane_s32_ptr(__transfersize(3) int32_t * ptr, int32x4x3_t * val, __constrange(0,3) int lane); // VST3.32{d0[0], d2[0], d4[0]}, [r0]
1573*80a68eefSBob Badour _NEON2SSESTORAGE void vst3q_lane_f16_ptr(__transfersize(3) __fp16 * ptr, float16x8x3_t * val, __constrange(0,7) int lane); // VST3.16{d0[0], d2[0], d4[0]}, [r0]
1574*80a68eefSBob Badour _NEON2SSESTORAGE void vst3q_lane_f32_ptr(__transfersize(3) float32_t * ptr, float32x4x3_t * val, __constrange(0,3) int lane); //VST3.32 {d0[0], d2[0], d4[0]}, [r0]
1575*80a68eefSBob Badour _NEON2SSESTORAGE void vst3q_lane_p16_ptr(__transfersize(3) poly16_t * ptr, poly16x8x3_t * val, __constrange(0,7) int lane); // VST3.16{d0[0], d2[0], d4[0]}, [r0]
1576*80a68eefSBob Badour _NEON2SSESTORAGE void vst3_lane_u8(__transfersize(3) uint8_t * ptr, uint8x8x3_t val, __constrange(0,7) int lane); // VST3.8{d0[0], d1[0], d2[0]}, [r0]
1577*80a68eefSBob Badour _NEON2SSESTORAGE void vst3_lane_u16(__transfersize(3) uint16_t * ptr, uint16x4x3_t val, __constrange(0,3) int lane); // VST3.16{d0[0], d1[0], d2[0]}, [r0]
1578*80a68eefSBob Badour _NEON2SSESTORAGE void vst3_lane_u32(__transfersize(3) uint32_t * ptr, uint32x2x3_t val, __constrange(0,1) int lane); // VST3.32{d0[0], d1[0], d2[0]}, [r0]
1579*80a68eefSBob Badour _NEON2SSESTORAGE void vst3_lane_s8(__transfersize(3) int8_t * ptr, int8x8x3_t val, __constrange(0,7) int lane); // VST3.8 {d0[0],d1[0], d2[0]}, [r0]
1580*80a68eefSBob Badour _NEON2SSESTORAGE void vst3_lane_s16(__transfersize(3) int16_t * ptr, int16x4x3_t val, __constrange(0,3) int lane); // VST3.16{d0[0], d1[0], d2[0]}, [r0]
1581*80a68eefSBob Badour _NEON2SSESTORAGE void vst3_lane_s32(__transfersize(3) int32_t * ptr, int32x2x3_t val, __constrange(0,1) int lane); // VST3.32{d0[0], d1[0], d2[0]}, [r0]
1582*80a68eefSBob Badour _NEON2SSESTORAGE void vst3_lane_f16_ptr(__transfersize(3) __fp16 * ptr, float16x4x3_t * val, __constrange(0,3) int lane); // VST3.16{d0[0], d1[0], d2[0]}, [r0]
1583*80a68eefSBob Badour _NEON2SSESTORAGE void vst3_lane_f32(__transfersize(3) float32_t * ptr, float32x2x3_t val, __constrange(0,1) int lane); // VST3.32{d0[0], d1[0], d2[0]}, [r0]
1584*80a68eefSBob Badour _NEON2SSESTORAGE void vst3_lane_p8(__transfersize(3) poly8_t * ptr, poly8x8x3_t val, __constrange(0,7) int lane); // VST3.8{d0[0], d1[0], d2[0]}, [r0]
1585*80a68eefSBob Badour _NEON2SSESTORAGE void vst3_lane_p16(__transfersize(3) poly16_t * ptr, poly16x4x3_t val, __constrange(0,3) int lane); // VST3.16{d0[0], d1[0], d2[0]}, [r0]
1586*80a68eefSBob Badour _NEON2SSESTORAGE void vst4q_lane_u16_ptr(__transfersize(4) uint16_t * ptr, uint16x8x4_t * val, __constrange(0,7) int lane); // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0]
1587*80a68eefSBob Badour _NEON2SSESTORAGE void vst4q_lane_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x4x4_t * val, __constrange(0,3) int lane); // VST4.32{d0[0], d2[0], d4[0], d6[0]}, [r0]
1588*80a68eefSBob Badour _NEON2SSESTORAGE void vst4q_lane_s16_ptr(__transfersize(4) int16_t * ptr, int16x8x4_t * val, __constrange(0,7) int lane); // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0]
1589*80a68eefSBob Badour _NEON2SSESTORAGE void vst4q_lane_s32_ptr(__transfersize(4) int32_t * ptr, int32x4x4_t * val, __constrange(0,3) int lane); // VST4.32{d0[0], d2[0], d4[0], d6[0]}, [r0]
1590*80a68eefSBob Badour _NEON2SSESTORAGE void vst4q_lane_f16_ptr(__transfersize(4) __fp16 * ptr, float16x8x4_t * val, __constrange(0,7) int lane); // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0]
1591*80a68eefSBob Badour _NEON2SSESTORAGE void vst4q_lane_f32_ptr(__transfersize(4) float32_t * ptr, float32x4x4_t * val, __constrange(0,3) int lane); //VST4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
1592*80a68eefSBob Badour _NEON2SSESTORAGE void vst4q_lane_p16_ptr(__transfersize(4) poly16_t * ptr, poly16x8x4_t * val, __constrange(0,7) int lane); // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0]
1593*80a68eefSBob Badour _NEON2SSESTORAGE void vst4_lane_u8(__transfersize(4) uint8_t * ptr, uint8x8x4_t val, __constrange(0,7) int lane); // VST4.8{d0[0], d1[0], d2[0], d3[0]}, [r0]
1594*80a68eefSBob Badour _NEON2SSESTORAGE void vst4_lane_u16(__transfersize(4) uint16_t * ptr, uint16x4x4_t val, __constrange(0,3) int lane); // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0]
1595*80a68eefSBob Badour _NEON2SSESTORAGE void vst4_lane_u32(__transfersize(4) uint32_t * ptr, uint32x2x4_t val, __constrange(0,1) int lane); // VST4.32{d0[0], d1[0], d2[0], d3[0]}, [r0]
1596*80a68eefSBob Badour _NEON2SSESTORAGE void vst4_lane_s8(__transfersize(4) int8_t * ptr, int8x8x4_t val, __constrange(0,7) int lane); // VST4.8 {d0[0],d1[0], d2[0], d3[0]}, [r0]
1597*80a68eefSBob Badour _NEON2SSESTORAGE void vst4_lane_s16(__transfersize(4) int16_t * ptr, int16x4x4_t val, __constrange(0,3) int lane); // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0]
1598*80a68eefSBob Badour _NEON2SSESTORAGE void vst4_lane_s32(__transfersize(4) int32_t * ptr, int32x2x4_t val, __constrange(0,1) int lane); // VST4.32{d0[0], d1[0], d2[0], d3[0]}, [r0]
1599*80a68eefSBob Badour _NEON2SSESTORAGE void vst4_lane_f16_ptr(__transfersize(4) __fp16 * ptr, float16x4x4_t * val, __constrange(0,3) int lane); // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0]
1600*80a68eefSBob Badour _NEON2SSESTORAGE void vst4_lane_f32(__transfersize(4) float32_t * ptr, float32x2x4_t val, __constrange(0,1) int lane); // VST4.32{d0[0], d1[0], d2[0], d3[0]}, [r0]
1601*80a68eefSBob Badour _NEON2SSESTORAGE void vst4_lane_p8(__transfersize(4) poly8_t * ptr, poly8x8x4_t val, __constrange(0,7) int lane); // VST4.8{d0[0], d1[0], d2[0], d3[0]}, [r0]
1602*80a68eefSBob Badour _NEON2SSESTORAGE void vst4_lane_p16(__transfersize(4) poly16_t * ptr, poly16x4x4_t val, __constrange(0,3) int lane); // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0]
1603*80a68eefSBob Badour //Extract lanes from a vector and put into a register. These intrinsics extract a single lane (element) from a vector.
1604*80a68eefSBob Badour _NEON2SSESTORAGE uint8_t vget_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VMOV.U8 r0, d0[0]
1605*80a68eefSBob Badour _NEON2SSESTORAGE uint16_t vget_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VMOV.U16 r0, d0[0]
1606*80a68eefSBob Badour _NEON2SSESTORAGE uint32_t vget_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0]
1607*80a68eefSBob Badour _NEON2SSESTORAGE int8_t vget_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VMOV.S8 r0, d0[0]
1608*80a68eefSBob Badour _NEON2SSESTORAGE int16_t vget_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VMOV.S16 r0, d0[0]
1609*80a68eefSBob Badour _NEON2SSESTORAGE int32_t vget_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0]
1610*80a68eefSBob Badour _NEON2SSESTORAGE poly8_t vget_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VMOV.U8 r0, d0[0]
1611*80a68eefSBob Badour _NEON2SSESTORAGE poly16_t vget_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VMOV.U16 r0, d0[0]
1612*80a68eefSBob Badour _NEON2SSESTORAGE float32_t vget_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0]
1613*80a68eefSBob Badour _NEON2SSESTORAGE uint8_t vgetq_lane_u8(uint8x16_t vec, __constrange(0,15) int lane); // VMOV.U8 r0, d0[0]
1614*80a68eefSBob Badour _NEON2SSESTORAGE uint16_t vgetq_lane_u16(uint16x8_t vec, __constrange(0,7) int lane); // VMOV.U16 r0, d0[0]
1615*80a68eefSBob Badour _NEON2SSESTORAGE uint32_t vgetq_lane_u32(uint32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0]
1616*80a68eefSBob Badour _NEON2SSESTORAGE int8_t vgetq_lane_s8(int8x16_t vec, __constrange(0,15) int lane); // VMOV.S8 r0, d0[0]
1617*80a68eefSBob Badour _NEON2SSESTORAGE int16_t vgetq_lane_s16(int16x8_t vec, __constrange(0,7) int lane); // VMOV.S16 r0, d0[0]
1618*80a68eefSBob Badour _NEON2SSESTORAGE int32_t vgetq_lane_s32(int32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0]
1619*80a68eefSBob Badour _NEON2SSESTORAGE poly8_t vgetq_lane_p8(poly8x16_t vec, __constrange(0,15) int lane); // VMOV.U8 r0, d0[0]
1620*80a68eefSBob Badour _NEON2SSESTORAGE poly16_t vgetq_lane_p16(poly16x8_t vec, __constrange(0,7) int lane); // VMOV.U16 r0, d0[0]
1621*80a68eefSBob Badour _NEON2SSESTORAGE float32_t vgetq_lane_f32(float32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0]
1622*80a68eefSBob Badour _NEON2SSESTORAGE int64_t vget_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV r0,r0,d0
1623*80a68eefSBob Badour _NEON2SSESTORAGE uint64_t vget_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV r0,r0,d0
1624*80a68eefSBob Badour _NEON2SSESTORAGE int64_t vgetq_lane_s64(int64x2_t vec, __constrange(0,1) int lane); // VMOV r0,r0,d0
1625*80a68eefSBob Badour _NEON2SSESTORAGE uint64_t vgetq_lane_u64(uint64x2_t vec, __constrange(0,1) int lane); // VMOV r0,r0,d0
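//A minimal usage sketch for the lane-extraction intrinsics above (illustrative addition, not part of the
//original header). The lane argument must be a compile-time constant within the documented range:
//    uint8x8_t v = vdup_n_u8(42);
//    uint8_t   x = vget_lane_u8(v, 3);       /* x == 42 */
//    int64x2_t w = vdupq_n_s64(-1);
//    int64_t   y = vgetq_lane_s64(w, 1);     /* y == -1 */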
1626*80a68eefSBob Badour //Set a single lane of a vector to a value. These intrinsics set a single lane (element) within a vector; all other lanes are left unchanged.
1627*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vset_lane_u8(uint8_t value, uint8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0
1628*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vset_lane_u16(uint16_t value, uint16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0
1629*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vset_lane_u32(uint32_t value, uint32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0
1630*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vset_lane_s8(int8_t value, int8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0
1631*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vset_lane_s16(int16_t value, int16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0
1632*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vset_lane_s32(int32_t value, int32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0
1633*80a68eefSBob Badour _NEON2SSESTORAGE poly8x8_t vset_lane_p8(poly8_t value, poly8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0
1634*80a68eefSBob Badour _NEON2SSESTORAGE poly16x4_t vset_lane_p16(poly16_t value, poly16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0
1635*80a68eefSBob Badour _NEON2SSESTORAGE float32x2_t vset_lane_f32(float32_t value, float32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0
1636*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vsetq_lane_u8(uint8_t value, uint8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0
1637*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vsetq_lane_u16(uint16_t value, uint16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0
1638*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vsetq_lane_u32(uint32_t value, uint32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0
1639*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t vsetq_lane_s8(int8_t value, int8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0
1640*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vsetq_lane_s16(int16_t value, int16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0
1641*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vsetq_lane_s32(int32_t value, int32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0
1642*80a68eefSBob Badour _NEON2SSESTORAGE poly8x16_t vsetq_lane_p8(poly8_t value, poly8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0
1643*80a68eefSBob Badour _NEON2SSESTORAGE poly16x8_t vsetq_lane_p16(poly16_t value, poly16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0
1644*80a68eefSBob Badour _NEON2SSESTORAGE float32x4_t vsetq_lane_f32(float32_t value, float32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0
1645*80a68eefSBob Badour _NEON2SSESTORAGE int64x1_t vset_lane_s64(int64_t value, int64x1_t vec, __constrange(0,0) int lane); // VMOV d0,r0,r0
1646*80a68eefSBob Badour _NEON2SSESTORAGE uint64x1_t vset_lane_u64(uint64_t value, uint64x1_t vec, __constrange(0,0) int lane); // VMOV d0,r0,r0
1647*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vsetq_lane_s64(int64_t value, int64x2_t vec, __constrange(0,1) int lane); // VMOV d0,r0,r0
1648*80a68eefSBob Badour _NEON2SSESTORAGE uint64x2_t vsetq_lane_u64(uint64_t value, uint64x2_t vec, __constrange(0,1) int lane); // VMOV d0,r0,r0
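//A minimal usage sketch for the lane-set intrinsics above (illustrative addition). Only the selected lane
//is modified; every other lane keeps its previous value:
//    int32x4_t v = vdupq_n_s32(0);
//    v = vsetq_lane_s32(7, v, 2);            /* v is now {0, 0, 7, 0} */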
1649*80a68eefSBob Badour //Initialize a vector from a literal bit pattern.
1650*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vcreate_s8(uint64_t a); // VMOV d0,r0,r0
1651*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vcreate_s16(uint64_t a); // VMOV d0,r0,r0
1652*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vcreate_s32(uint64_t a); // VMOV d0,r0,r0
1653*80a68eefSBob Badour _NEON2SSESTORAGE float16x4_t vcreate_f16(uint64_t a); // VMOV d0,r0,r0
1654*80a68eefSBob Badour _NEON2SSESTORAGE float32x2_t vcreate_f32(uint64_t a); // VMOV d0,r0,r0
1655*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vcreate_u8(uint64_t a); // VMOV d0,r0,r0
1656*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vcreate_u16(uint64_t a); // VMOV d0,r0,r0
1657*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vcreate_u32(uint64_t a); // VMOV d0,r0,r0
1658*80a68eefSBob Badour _NEON2SSESTORAGE uint64x1_t vcreate_u64(uint64_t a); // VMOV d0,r0,r0
1659*80a68eefSBob Badour _NEON2SSESTORAGE poly8x8_t vcreate_p8(uint64_t a); // VMOV d0,r0,r0
1660*80a68eefSBob Badour _NEON2SSESTORAGE poly16x4_t vcreate_p16(uint64_t a); // VMOV d0,r0,r0
1661*80a68eefSBob Badour _NEON2SSESTORAGE int64x1_t vcreate_s64(uint64_t a); // VMOV d0,r0,r0
1662*80a68eefSBob Badour //Set all lanes to the same value
1663*80a68eefSBob Badour //Load all lanes of the vector with the same literal value
1664*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vdup_n_u8(uint8_t value); // VDUP.8 d0,r0
1665*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vdup_n_u16(uint16_t value); // VDUP.16 d0,r0
1666*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vdup_n_u32(uint32_t value); // VDUP.32 d0,r0
1667*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vdup_n_s8(int8_t value); // VDUP.8 d0,r0
1668*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vdup_n_s16(int16_t value); // VDUP.16 d0,r0
1669*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vdup_n_s32(int32_t value); // VDUP.32 d0,r0
1670*80a68eefSBob Badour _NEON2SSESTORAGE poly8x8_t vdup_n_p8(poly8_t value); // VDUP.8 d0,r0
1671*80a68eefSBob Badour _NEON2SSESTORAGE poly16x4_t vdup_n_p16(poly16_t value); // VDUP.16 d0,r0
1672*80a68eefSBob Badour _NEON2SSESTORAGE float32x2_t vdup_n_f32(float32_t value); // VDUP.32 d0,r0
1673*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vdupq_n_u8(uint8_t value); // VDUP.8 q0,r0
1674*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vdupq_n_u16(uint16_t value); // VDUP.16 q0,r0
1675*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vdupq_n_u32(uint32_t value); // VDUP.32 q0,r0
1676*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t vdupq_n_s8(int8_t value); // VDUP.8 q0,r0
1677*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vdupq_n_s16(int16_t value); // VDUP.16 q0,r0
1678*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vdupq_n_s32(int32_t value); // VDUP.32 q0,r0
1679*80a68eefSBob Badour _NEON2SSESTORAGE poly8x16_t vdupq_n_p8(poly8_t value); // VDUP.8 q0,r0
1680*80a68eefSBob Badour _NEON2SSESTORAGE poly16x8_t vdupq_n_p16(poly16_t value); // VDUP.16 q0,r0
1681*80a68eefSBob Badour _NEON2SSESTORAGE float32x4_t vdupq_n_f32(float32_t value); // VDUP.32 q0,r0
1682*80a68eefSBob Badour _NEON2SSESTORAGE int64x1_t vdup_n_s64(int64_t value); // VMOV d0,r0,r0
1683*80a68eefSBob Badour _NEON2SSESTORAGE uint64x1_t vdup_n_u64(uint64_t value); // VMOV d0,r0,r0
1684*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vdupq_n_s64(int64_t value); // VMOV d0,r0,r0
1685*80a68eefSBob Badour _NEON2SSESTORAGE uint64x2_t vdupq_n_u64(uint64_t value); // VMOV d0,r0,r0
1686*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vmov_n_u8(uint8_t value); // VDUP.8 d0,r0
1687*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vmov_n_u16(uint16_t value); // VDUP.16 d0,r0
1688*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vmov_n_u32(uint32_t value); // VDUP.32 d0,r0
1689*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vmov_n_s8(int8_t value); // VDUP.8 d0,r0
1690*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vmov_n_s16(int16_t value); // VDUP.16 d0,r0
1691*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vmov_n_s32(int32_t value); // VDUP.32 d0,r0
1692*80a68eefSBob Badour _NEON2SSESTORAGE poly8x8_t vmov_n_p8(poly8_t value); // VDUP.8 d0,r0
1693*80a68eefSBob Badour _NEON2SSESTORAGE poly16x4_t vmov_n_p16(poly16_t value); // VDUP.16 d0,r0
1694*80a68eefSBob Badour _NEON2SSESTORAGE float32x2_t vmov_n_f32(float32_t value); // VDUP.32 d0,r0
1695*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vmovq_n_u8(uint8_t value); // VDUP.8 q0,r0
1696*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vmovq_n_u16(uint16_t value); // VDUP.16 q0,r0
1697*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vmovq_n_u32(uint32_t value); // VDUP.32 q0,r0
1698*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t vmovq_n_s8(int8_t value); // VDUP.8 q0,r0
1699*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vmovq_n_s16(int16_t value); // VDUP.16 q0,r0
1700*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vmovq_n_s32(int32_t value); // VDUP.32 q0,r0
1701*80a68eefSBob Badour _NEON2SSESTORAGE poly8x16_t vmovq_n_p8(poly8_t value); // VDUP.8 q0,r0
1702*80a68eefSBob Badour _NEON2SSESTORAGE poly16x8_t vmovq_n_p16(poly16_t value); // VDUP.16 q0,r0
1703*80a68eefSBob Badour _NEON2SSESTORAGE float32x4_t vmovq_n_f32(float32_t value); // VDUP.32 q0,r0
1704*80a68eefSBob Badour _NEON2SSESTORAGE int64x1_t vmov_n_s64(int64_t value); // VMOV d0,r0,r0
1705*80a68eefSBob Badour _NEON2SSESTORAGE uint64x1_t vmov_n_u64(uint64_t value); // VMOV d0,r0,r0
1706*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vmovq_n_s64(int64_t value); // VMOV d0,r0,r0
1707*80a68eefSBob Badour _NEON2SSESTORAGE uint64x2_t vmovq_n_u64(uint64_t value); // VMOV d0,r0,r0
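//A minimal usage sketch for the vdup_n/vmov_n intrinsics above (illustrative addition). The vmov_n_*
//intrinsics behave the same as the corresponding vdup_n_* ones: both broadcast the scalar into every lane:
//    float32x4_t ones  = vdupq_n_f32(1.0f);  /* {1.0, 1.0, 1.0, 1.0} */
//    uint16x4_t  fives = vmov_n_u16(5);      /* {5, 5, 5, 5} */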
1708*80a68eefSBob Badour //Load all lanes of the vector with the value of a single lane of a vector
1709*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vdup_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0]
1710*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vdup_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0]
1711*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vdup_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0]
1712*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vdup_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0]
1713*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vdup_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0]
1714*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vdup_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0]
1715*80a68eefSBob Badour _NEON2SSESTORAGE poly8x8_t vdup_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0]
1716*80a68eefSBob Badour _NEON2SSESTORAGE poly16x4_t vdup_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0]
1717*80a68eefSBob Badour _NEON2SSESTORAGE float32x2_t vdup_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0]
1718*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vdupq_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0]
1719*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vdupq_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0]
1720*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vdupq_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0]
1721*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t vdupq_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0]
1722*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vdupq_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0]
1723*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vdupq_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0]
1724*80a68eefSBob Badour _NEON2SSESTORAGE poly8x16_t vdupq_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0]
1725*80a68eefSBob Badour _NEON2SSESTORAGE poly16x8_t vdupq_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0]
1726*80a68eefSBob Badour _NEON2SSESTORAGE float32x4_t vdupq_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0]
1727*80a68eefSBob Badour _NEON2SSESTORAGE int64x1_t vdup_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV d0,d0
1728*80a68eefSBob Badour _NEON2SSESTORAGE uint64x1_t vdup_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV d0,d0
1729*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vdupq_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV q0,q0
1730*80a68eefSBob Badour _NEON2SSESTORAGE uint64x2_t vdupq_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV q0,q0
1731*80a68eefSBob Badour //Combining vectors. These intrinsics join two 64-bit vectors into a single 128-bit vector.
1732*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t vcombine_s8(int8x8_t low, int8x8_t high); // VMOV d0,d0
1733*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vcombine_s16(int16x4_t low, int16x4_t high); // VMOV d0,d0
1734*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vcombine_s32(int32x2_t low, int32x2_t high); // VMOV d0,d0
1735*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vcombine_s64(int64x1_t low, int64x1_t high); // VMOV d0,d0
1736*80a68eefSBob Badour _NEON2SSESTORAGE float16x8_t vcombine_f16(float16x4_t low, float16x4_t high); // VMOV d0,d0
1737*80a68eefSBob Badour _NEON2SSESTORAGE float32x4_t vcombine_f32(float32x2_t low, float32x2_t high); // VMOV d0,d0
1738*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vcombine_u8(uint8x8_t low, uint8x8_t high); // VMOV d0,d0
1739*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vcombine_u16(uint16x4_t low, uint16x4_t high); // VMOV d0,d0
1740*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vcombine_u32(uint32x2_t low, uint32x2_t high); // VMOV d0,d0
1741*80a68eefSBob Badour _NEON2SSESTORAGE uint64x2_t vcombine_u64(uint64x1_t low, uint64x1_t high); // VMOV d0,d0
1742*80a68eefSBob Badour _NEON2SSESTORAGE poly8x16_t vcombine_p8(poly8x8_t low, poly8x8_t high); // VMOV d0,d0
1743*80a68eefSBob Badour _NEON2SSESTORAGE poly16x8_t vcombine_p16(poly16x4_t low, poly16x4_t high); // VMOV d0,d0
1744*80a68eefSBob Badour //Splitting vectors. These intrinsics split a 128-bit vector into its two component 64-bit vectors.
1745*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vget_high_s8(int8x16_t a); // VMOV d0,d0
1746*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vget_high_s16(int16x8_t a); // VMOV d0,d0
1747*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vget_high_s32(int32x4_t a); // VMOV d0,d0
1748*80a68eefSBob Badour _NEON2SSESTORAGE int64x1_t vget_high_s64(int64x2_t a); // VMOV d0,d0
1749*80a68eefSBob Badour _NEON2SSESTORAGE float16x4_t vget_high_f16(float16x8_t a); // VMOV d0,d0
1750*80a68eefSBob Badour _NEON2SSESTORAGE float32x2_t vget_high_f32(float32x4_t a); // VMOV d0,d0
1751*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vget_high_u8(uint8x16_t a); // VMOV d0,d0
1752*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vget_high_u16(uint16x8_t a); // VMOV d0,d0
1753*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vget_high_u32(uint32x4_t a); // VMOV d0,d0
1754*80a68eefSBob Badour _NEON2SSESTORAGE uint64x1_t vget_high_u64(uint64x2_t a); // VMOV d0,d0
1755*80a68eefSBob Badour _NEON2SSESTORAGE poly8x8_t vget_high_p8(poly8x16_t a); // VMOV d0,d0
1756*80a68eefSBob Badour _NEON2SSESTORAGE poly16x4_t vget_high_p16(poly16x8_t a); // VMOV d0,d0
1757*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vget_low_s8(int8x16_t a); // VMOV d0,d0
1758*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vget_low_s16(int16x8_t a); // VMOV d0,d0
1759*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vget_low_s32(int32x4_t a); // VMOV d0,d0
1760*80a68eefSBob Badour _NEON2SSESTORAGE int64x1_t vget_low_s64(int64x2_t a); // VMOV d0,d0
1761*80a68eefSBob Badour _NEON2SSESTORAGE float16x4_t vget_low_f16(float16x8_t a); // VMOV d0,d0
1762*80a68eefSBob Badour _NEON2SSESTORAGE float32x2_t vget_low_f32(float32x4_t a); // VMOV d0,d0
1763*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vget_low_u8(uint8x16_t a); // VMOV d0,d0
1764*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vget_low_u16(uint16x8_t a); // VMOV d0,d0
1765*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vget_low_u32(uint32x4_t a); // VMOV d0,d0
1766*80a68eefSBob Badour _NEON2SSESTORAGE uint64x1_t vget_low_u64(uint64x2_t a); // VMOV d0,d0
1767*80a68eefSBob Badour _NEON2SSESTORAGE poly8x8_t vget_low_p8(poly8x16_t a); // VMOV d0,d0
1768*80a68eefSBob Badour _NEON2SSESTORAGE poly16x4_t vget_low_p16(poly16x8_t a); // VMOV d0,d0
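//A minimal usage sketch for the combining/splitting intrinsics above (illustrative addition). vcombine
//joins two 64-bit halves into one 128-bit vector; vget_low/vget_high recover those halves:
//    int16x4_t lo = vdup_n_s16(1);
//    int16x4_t hi = vdup_n_s16(2);
//    int16x8_t q  = vcombine_s16(lo, hi);    /* {1,1,1,1, 2,2,2,2} */
//    int16x4_t l2 = vget_low_s16(q);         /* equals lo */
//    int16x4_t h2 = vget_high_s16(q);        /* equals hi */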
1769*80a68eefSBob Badour //Converting vectors. These intrinsics are used to convert vectors.
1770*80a68eefSBob Badour //Convert from float
1771*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vcvt_s32_f32(float32x2_t a); // VCVT.S32.F32 d0, d0
1772*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vcvt_u32_f32(float32x2_t a); // VCVT.U32.F32 d0, d0
1773*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vcvtq_s32_f32(float32x4_t a); // VCVT.S32.F32 q0, q0
1774*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vcvtq_u32_f32(float32x4_t a); // VCVT.U32.F32 q0, q0
1775*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vcvt_n_s32_f32(float32x2_t a, __constrange(1,32) int b); // VCVT.S32.F32 d0, d0, #32
1776*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vcvt_n_u32_f32(float32x2_t a, __constrange(1,32) int b); // VCVT.U32.F32 d0, d0, #32
1777*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vcvtq_n_s32_f32(float32x4_t a, __constrange(1,32) int b); // VCVT.S32.F32 q0, q0, #32
1778*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vcvtq_n_u32_f32(float32x4_t a, __constrange(1,32) int b); // VCVT.U32.F32 q0, q0, #32
1779*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vcvtnq_s32_f32(float32x4_t a); // VCVTN.S32.F32 q0, q0
1780*80a68eefSBob Badour //Convert to float
1781*80a68eefSBob Badour _NEON2SSESTORAGE float32x2_t vcvt_f32_s32(int32x2_t a); // VCVT.F32.S32 d0, d0
1782*80a68eefSBob Badour _NEON2SSESTORAGE float32x2_t vcvt_f32_u32(uint32x2_t a); // VCVT.F32.U32 d0, d0
1783*80a68eefSBob Badour _NEON2SSESTORAGE float32x4_t vcvtq_f32_s32(int32x4_t a); // VCVT.F32.S32 q0, q0
1784*80a68eefSBob Badour _NEON2SSESTORAGE float32x4_t vcvtq_f32_u32(uint32x4_t a); // VCVT.F32.U32 q0, q0
1785*80a68eefSBob Badour _NEON2SSESTORAGE float32x2_t vcvt_n_f32_s32(int32x2_t a, __constrange(1,32) int b); // VCVT.F32.S32 d0, d0, #32
1786*80a68eefSBob Badour _NEON2SSESTORAGE float32x2_t vcvt_n_f32_u32(uint32x2_t a, __constrange(1,32) int b); // VCVT.F32.U32 d0, d0, #32
1787*80a68eefSBob Badour _NEON2SSESTORAGE float32x4_t vcvtq_n_f32_s32(int32x4_t a, __constrange(1,32) int b); // VCVT.F32.S32 q0, q0, #32
1788*80a68eefSBob Badour _NEON2SSESTORAGE float32x4_t vcvtq_n_f32_u32(uint32x4_t a, __constrange(1,32) int b); // VCVT.F32.U32 q0, q0, #32
1789*80a68eefSBob Badour //Convert between floats
1790*80a68eefSBob Badour _NEON2SSESTORAGE float16x4_t vcvt_f16_f32(float32x4_t a); // VCVT.F16.F32 d0, q0
1791*80a68eefSBob Badour _NEON2SSESTORAGE float32x4_t vcvt_f32_f16(float16x4_t a); // VCVT.F32.F16 q0, d0
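//A minimal usage sketch for the conversion intrinsics above (illustrative addition). The plain
//float-to-integer conversions truncate toward zero, vcvtnq_s32_f32 rounds to nearest, and the _n_
//variants treat the integer as a fixed-point number with b fractional bits (i.e. scale by 2^b):
//    float32x4_t f = vdupq_n_f32(1.75f);
//    int32x4_t   t = vcvtq_s32_f32(f);       /* {1,1,1,1}     - truncated        */
//    int32x4_t   n = vcvtnq_s32_f32(f);      /* {2,2,2,2}     - round to nearest */
//    int32x4_t   q = vcvtq_n_s32_f32(f, 4);  /* {28,28,28,28} - 1.75 * 2^4       */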
1792*80a68eefSBob Badour //Vector narrow integer
1793*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vmovn_s16(int16x8_t a); // VMOVN.I16 d0,q0
1794*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vmovn_s32(int32x4_t a); // VMOVN.I32 d0,q0
1795*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vmovn_s64(int64x2_t a); // VMOVN.I64 d0,q0
1796*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vmovn_u16(uint16x8_t a); // VMOVN.I16 d0,q0
1797*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vmovn_u32(uint32x4_t a); // VMOVN.I32 d0,q0
1798*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vmovn_u64(uint64x2_t a); // VMOVN.I64 d0,q0
1799*80a68eefSBob Badour //Vector long move
1800*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vmovl_s8(int8x8_t a); // VMOVL.S8 q0,d0
1801*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vmovl_s16(int16x4_t a); // VMOVL.S16 q0,d0
1802*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vmovl_s32(int32x2_t a); // VMOVL.S32 q0,d0
1803*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vmovl_u8(uint8x8_t a); // VMOVL.U8 q0,d0
1804*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vmovl_u16(uint16x4_t a); // VMOVL.U16 q0,d0
1805*80a68eefSBob Badour _NEON2SSESTORAGE uint64x2_t vmovl_u32(uint32x2_t a); // VMOVL.U32 q0,d0
1806*80a68eefSBob Badour //Vector saturating narrow integer
1807*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vqmovn_s16(int16x8_t a); // VQMOVN.S16 d0,q0
1808*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vqmovn_s32(int32x4_t a); // VQMOVN.S32 d0,q0
1809*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vqmovn_s64(int64x2_t a); // VQMOVN.S64 d0,q0
1810*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vqmovn_u16(uint16x8_t a); // VQMOVN.U16 d0,q0
1811*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vqmovn_u32(uint32x4_t a); // VQMOVN.U32 d0,q0
1812*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vqmovn_u64(uint64x2_t a); // VQMOVN.U64 d0,q0
1813*80a68eefSBob Badour //Vector saturating narrow integer signed->unsigned
1814*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vqmovun_s16(int16x8_t a); // VQMOVUN.S16 d0,q0
1815*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vqmovun_s32(int32x4_t a); // VQMOVUN.S32 d0,q0
1816*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vqmovun_s64(int64x2_t a); // VQMOVUN.S64 d0,q0
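//A minimal usage sketch for the narrowing/widening move intrinsics above (illustrative addition).
//vmovn keeps the low half of each element, vqmovn saturates to the narrower signed range, vqmovun
//saturates a signed source to an unsigned result, and vmovl widens with sign or zero extension:
//    int16x8_t a = vdupq_n_s16(300);
//    int8x8_t  n = vmovn_s16(a);             /* 300 & 0xFF = 44 (plain truncation) */
//    int8x8_t  s = vqmovn_s16(a);            /* 127  (saturated to int8_t)         */
//    uint8x8_t u = vqmovun_s16(a);           /* 255  (saturated to uint8_t)        */
//    int16x8_t w = vmovl_s8(s);              /* each 16-bit lane holds 127         */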
1817*80a68eefSBob Badour //Table look up
1818*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vtbl1_u8(uint8x8_t a, uint8x8_t b); // VTBL.8 d0, {d0}, d0
1819*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vtbl1_s8(int8x8_t a, int8x8_t b); // VTBL.8 d0, {d0}, d0
1820*80a68eefSBob Badour _NEON2SSESTORAGE poly8x8_t vtbl1_p8(poly8x8_t a, uint8x8_t b); // VTBL.8 d0, {d0}, d0
1821*80a68eefSBob Badour //Extended table look up intrinsics
1822*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vtbx1_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VTBX.8 d0, {d0}, d0
1823*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vtbx1_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VTBX.8 d0, {d0}, d0
1824*80a68eefSBob Badour _NEON2SSESTORAGE poly8x8_t vtbx1_p8(poly8x8_t a, poly8x8_t b, uint8x8_t c); // VTBX.8 d0, {d0}, d0
1825*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vtbx2_u8(uint8x8_t a, uint8x8x2_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1}, d0
1826*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vtbx2_s8(int8x8_t a, int8x8x2_t b, int8x8_t c); // VTBX.8 d0, {d0, d1}, d0
1827*80a68eefSBob Badour _NEON2SSESTORAGE poly8x8_t vtbx2_p8(poly8x8_t a, poly8x8x2_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1}, d0
1828*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vtbx3_u8(uint8x8_t a, uint8x8x3_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0
1829*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vtbx3_s8(int8x8_t a, int8x8x3_t b, int8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0
1830*80a68eefSBob Badour _NEON2SSESTORAGE poly8x8_t vtbx3_p8(poly8x8_t a, poly8x8x3_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0
1831*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vtbx4_u8(uint8x8_t a, uint8x8x4_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0
1832*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vtbx4_s8(int8x8_t a, int8x8x4_t b, int8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0
1833*80a68eefSBob Badour _NEON2SSESTORAGE poly8x8_t vtbx4_p8(poly8x8_t a, poly8x8x4_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0
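//A minimal usage sketch for the table look up intrinsics above (illustrative addition). vtbl1 selects
//bytes of the table 'a' using the indices in 'b'; an out-of-range index produces 0. vtbx1 is the same
//except that an out-of-range index leaves the corresponding byte of the first operand unchanged:
//    uint8x8_t table = vcreate_u8(0x0706050403020100ULL);  /* bytes 0..7          */
//    uint8x8_t idx   = vcreate_u8(0xFF00000000000003ULL);  /* {3,0,0,0,0,0,0,255} */
//    uint8x8_t r     = vtbl1_u8(table, idx);               /* {3,0,0,0,0,0,0,0}   */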
1834*80a68eefSBob Badour //Operations with a scalar value
1835*80a68eefSBob Badour //Vector multiply accumulate with scalar
1836*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLA.I16 d0, d0,d0[0]
1837*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLA.I32 d0, d0,d0[0]
1838*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vmla_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLA.I16 d0, d0,d0[0]
1839*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vmla_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLA.I32 d0, d0,d0[0]
1840*80a68eefSBob Badour _NEON2SSESTORAGE float32x2_t vmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l); // VMLA.F32 d0,d0, d0[0]
1841*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l); // VMLA.I16 q0, q0,d0[0]
1842*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l); // VMLA.I32 q0, q0,d0[0]
1843*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vmlaq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l); // VMLA.I16 q0,q0, d0[0]
1844*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vmlaq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l); // VMLA.I32 q0,q0, d0[0]
1845*80a68eefSBob Badour _NEON2SSESTORAGE float32x4_t vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l); // VMLA.F32 q0,q0, d0[0]
1846*80a68eefSBob Badour //Vector widening multiply accumulate with scalar
1847*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); //VMLAL.S16 q0, d0,d0[0]
1848*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); //VMLAL.S32 q0, d0,d0[0]
1849*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vmlal_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLAL.U16 q0,d0, d0[0]
1850*80a68eefSBob Badour _NEON2SSESTORAGE uint64x2_t vmlal_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLAL.U32 q0,d0, d0[0]
1851*80a68eefSBob Badour //Vector widening saturating doubling multiply accumulate with scalar
1852*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VQDMLAL.S16 q0,d0, d0[0]
1853*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VQDMLAL.S32 q0,d0, d0[0]
1854*80a68eefSBob Badour //Vector multiply subtract with scalar
1855*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLS.I16 d0, d0,d0[0]
1856*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLS.I32 d0, d0,d0[0]
1857*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLS.I16 d0, d0,d0[0]
1858*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLS.I32 d0, d0,d0[0]
1859*80a68eefSBob Badour _NEON2SSESTORAGE float32x2_t vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l); // VMLS.F32 d0,d0, d0[0]
1860*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l); // VMLS.I16 q0, q0,d0[0]
1861*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l); // VMLS.I32 q0, q0,d0[0]
1862*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l); // VMLS.I16 q0,q0, d0[0]
1863*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l); // VMLS.I32 q0,q0, d0[0]
1864*80a68eefSBob Badour _NEON2SSESTORAGE float32x4_t vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l); // VMLS.F32 q0,q0, d0[0]
1865*80a68eefSBob Badour //Vector widening multiply subtract with scalar
1866*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLSL.S16 q0, d0,d0[0]
1867*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLSL.S32 q0, d0,d0[0]
1868*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vmlsl_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLSL.U16 q0,d0, d0[0]
1869*80a68eefSBob Badour _NEON2SSESTORAGE uint64x2_t vmlsl_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLSL.U32 q0,d0, d0[0]
1870*80a68eefSBob Badour //Vector widening saturating doubling multiply subtract with scalar
1871*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VQDMLSL.S16 q0,d0, d0[0]
1872*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VQDMLSL.S32 q0,d0, d0[0]
1873*80a68eefSBob Badour //Vector multiply by scalar
1874*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vmul_n_s16(int16x4_t a, int16_t b); // VMUL.I16 d0,d0,d0[0]
1875*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vmul_n_s32(int32x2_t a, int32_t b); // VMUL.I32 d0,d0,d0[0]
1876*80a68eefSBob Badour _NEON2SSESTORAGE float32x2_t vmul_n_f32(float32x2_t a, float32_t b); // VMUL.F32 d0,d0,d0[0]
1877*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vmul_n_u16(uint16x4_t a, uint16_t b); // VMUL.I16 d0,d0,d0[0]
1878*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vmul_n_u32(uint32x2_t a, uint32_t b); // VMUL.I32 d0,d0,d0[0]
1879*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vmulq_n_s16(int16x8_t a, int16_t b); // VMUL.I16 q0,q0,d0[0]
1880*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vmulq_n_s32(int32x4_t a, int32_t b); // VMUL.I32 q0,q0,d0[0]
1881*80a68eefSBob Badour _NEON2SSESTORAGE float32x4_t vmulq_n_f32(float32x4_t a, float32_t b); // VMUL.F32 q0,q0,d0[0]
1882*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vmulq_n_u16(uint16x8_t a, uint16_t b); // VMUL.I16 q0,q0,d0[0]
1883*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vmulq_n_u32(uint32x4_t a, uint32_t b); // VMUL.I32 q0,q0,d0[0]
1884*80a68eefSBob Badour //Vector long multiply with scalar
1885*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vmull_n_s16(int16x4_t vec1, int16_t val2); // VMULL.S16 q0,d0,d0[0]
1886*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vmull_n_s32(int32x2_t vec1, int32_t val2); // VMULL.S32 q0,d0,d0[0]
1887*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vmull_n_u16(uint16x4_t vec1, uint16_t val2); // VMULL.U16 q0,d0,d0[0]
1888*80a68eefSBob Badour _NEON2SSESTORAGE uint64x2_t vmull_n_u32(uint32x2_t vec1, uint32_t val2); // VMULL.U32 q0,d0,d0[0]
1889*80a68eefSBob Badour //Vector long multiply by scalar
1890*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VMULL.S16 q0,d0,d0[0]
1891*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VMULL.S32 q0,d0,d0[0]
1892*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vmull_lane_u16(uint16x4_t vec1, uint16x4_t val2, __constrange(0, 3) int val3); // VMULL.U16 q0,d0,d0[0]
1893*80a68eefSBob Badour _NEON2SSESTORAGE uint64x2_t vmull_lane_u32(uint32x2_t vec1, uint32x2_t val2, __constrange(0, 1) int val3); // VMULL.U32 q0,d0,d0[0]
1894*80a68eefSBob Badour //Vector saturating doubling long multiply with scalar
1895*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vqdmull_n_s16(int16x4_t vec1, int16_t val2); // VQDMULL.S16 q0,d0,d0[0]
1896*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vqdmull_n_s32(int32x2_t vec1, int32_t val2); // VQDMULL.S32 q0,d0,d0[0]
1897*80a68eefSBob Badour //Vector saturating doubling long multiply by scalar
1898*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vqdmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULL.S16 q0,d0,d0[0]
1899*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vqdmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULL.S32 q0,d0,d0[0]
1900*80a68eefSBob Badour //Vector saturating doubling multiply high with scalar
1901*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vqdmulh_n_s16(int16x4_t vec1, int16_t val2); // VQDMULH.S16 d0,d0,d0[0]
1902*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vqdmulh_n_s32(int32x2_t vec1, int32_t val2); // VQDMULH.S32 d0,d0,d0[0]
1903*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vqdmulhq_n_s16(int16x8_t vec1, int16_t val2); // VQDMULH.S16 q0,q0,d0[0]
1904*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vqdmulhq_n_s32(int32x4_t vec1, int32_t val2); // VQDMULH.S32 q0,q0,d0[0]
1905*80a68eefSBob Badour //Vector saturating doubling multiply high by scalar
1906*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vqdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULH.S16 d0,d0,d0[0]
1907*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vqdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULH.S32 d0,d0,d0[0]
1908*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vqdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULH.S16 q0,q0,d0[0]
1909*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vqdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULH.S32 q0,q0,d0[0]
1910*80a68eefSBob Badour //Vector saturating rounding doubling multiply high with scalar
1911*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vqrdmulh_n_s16(int16x4_t vec1, int16_t val2); // VQRDMULH.S16 d0,d0,d0[0]
1912*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vqrdmulh_n_s32(int32x2_t vec1, int32_t val2); // VQRDMULH.S32 d0,d0,d0[0]
1913*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vqrdmulhq_n_s16(int16x8_t vec1, int16_t val2); // VQRDMULH.S16 q0,q0,d0[0]
1914*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vqrdmulhq_n_s32(int32x4_t vec1, int32_t val2); // VQRDMULH.S32 q0,q0,d0[0]
1915*80a68eefSBob Badour //Vector rounding saturating doubling multiply high by scalar
1916*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vqrdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQRDMULH.S16 d0,d0,d0[0]
1917*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vqrdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQRDMULH.S32 d0,d0,d0[0]
1918*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vqrdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQRDMULH.S16 q0,q0,d0[0]
1919*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vqrdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQRDMULH.S32 q0,q0,d0[0]
1920*80a68eefSBob Badour //Vector multiply accumulate with scalar
1921*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vmla_n_s16(int16x4_t a, int16x4_t b, int16_t c); // VMLA.I16 d0, d0, d0[0]
1922*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vmla_n_s32(int32x2_t a, int32x2_t b, int32_t c); // VMLA.I32 d0, d0, d0[0]
1923*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vmla_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c); // VMLA.I16 d0, d0, d0[0]
1924*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vmla_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c); // VMLA.I32 d0, d0, d0[0]
1925*80a68eefSBob Badour _NEON2SSESTORAGE float32x2_t vmla_n_f32(float32x2_t a, float32x2_t b, float32_t c); // VMLA.F32 d0, d0, d0[0]
1926*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c); // VMLA.I16 q0, q0, d0[0]
1927*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c); // VMLA.I32 q0, q0, d0[0]
1928*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vmlaq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c); // VMLA.I16 q0, q0, d0[0]
1929*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vmlaq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c); // VMLA.I32 q0, q0, d0[0]
1930*80a68eefSBob Badour _NEON2SSESTORAGE float32x4_t vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c); // VMLA.F32 q0, q0, d0[0]
1931*80a68eefSBob Badour //Vector widening multiply accumulate with scalar
1932*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VMLAL.S16 q0, d0, d0[0]
1933*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VMLAL.S32 q0, d0, d0[0]
1934*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c); // VMLAL.U16 q0, d0, d0[0]
1935*80a68eefSBob Badour _NEON2SSESTORAGE uint64x2_t vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c); // VMLAL.U32 q0, d0, d0[0]
1936*80a68eefSBob Badour //Vector widening saturating doubling multiply accumulate with scalar
1937*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VQDMLAL.S16 q0, d0, d0[0]
1938*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VQDMLAL.S32 q0, d0, d0[0]
1939*80a68eefSBob Badour //Vector multiply subtract with scalar
1940*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vmls_n_s16(int16x4_t a, int16x4_t b, int16_t c); // VMLS.I16 d0, d0, d0[0]
1941*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vmls_n_s32(int32x2_t a, int32x2_t b, int32_t c); // VMLS.I32 d0, d0, d0[0]
1942*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c); // VMLS.I16 d0, d0, d0[0]
1943*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c); // VMLS.I32 d0, d0, d0[0]
1944*80a68eefSBob Badour _NEON2SSESTORAGE float32x2_t vmls_n_f32(float32x2_t a, float32x2_t b, float32_t c); // VMLS.F32 d0, d0, d0[0]
1945*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c); // VMLS.I16 q0, q0, d0[0]
1946*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c); // VMLS.I32 q0, q0, d0[0]
1947*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c); // VMLS.I16 q0, q0, d0[0]
1948*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c); // VMLS.I32 q0, q0, d0[0]
1949*80a68eefSBob Badour _NEON2SSESTORAGE float32x4_t vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c); // VMLS.F32 q0, q0, d0[0]
1950*80a68eefSBob Badour //Vector widening multiply subtract with scalar
1951*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VMLSL.S16 q0, d0, d0[0]
1952*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VMLSL.S32 q0, d0, d0[0]
1953*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c); // VMLSL.U16 q0, d0, d0[0]
1954*80a68eefSBob Badour _NEON2SSESTORAGE uint64x2_t vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c); // VMLSL.U32 q0, d0, d0[0]
1955*80a68eefSBob Badour //Vector widening saturating doubling multiply subtract with scalar
1956*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VQDMLSL.S16 q0, d0, d0[0]
1957*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VQDMLSL.S32 q0, d0, d0[0]
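//A minimal usage sketch for the scalar-operand intrinsics above (illustrative addition). The _n_ forms
//take the scalar value directly; the _lane_ forms take it from the selected lane of a vector:
//    int16x4_t a = vdup_n_s16(2);
//    int16x4_t b = vdup_n_s16(3);
//    int16x4_t v = vcreate_s16(0x0004000300020001ULL);   /* lanes {1,2,3,4}                      */
//    int16x4_t m = vmul_n_s16(b, 10);                    /* {30,30,30,30}                        */
//    int16x4_t f = vmla_lane_s16(a, b, v, 2);            /* a + b*v[2] = 2 + 3*3 = {11,11,11,11} */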
1958*80a68eefSBob Badour //Vector extract
1959*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vext_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0
1960*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vext_u8(uint8x8_t a, uint8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0
1961*80a68eefSBob Badour _NEON2SSESTORAGE poly8x8_t vext_p8(poly8x8_t a, poly8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0
1962*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vext_s16(int16x4_t a, int16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0
1963*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vext_u16(uint16x4_t a, uint16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0
1964*80a68eefSBob Badour _NEON2SSESTORAGE poly16x4_t vext_p16(poly16x4_t a, poly16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0
1965*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vext_s32(int32x2_t a, int32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0
1966*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vext_u32(uint32x2_t a, uint32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0
1967*80a68eefSBob Badour _NEON2SSESTORAGE int64x1_t vext_s64(int64x1_t a, int64x1_t b, __constrange(0,0) int c); // VEXT.64 d0,d0,d0,#0
1968*80a68eefSBob Badour _NEON2SSESTORAGE uint64x1_t vext_u64(uint64x1_t a, uint64x1_t b, __constrange(0,0) int c); // VEXT.64 d0,d0,d0,#0
1969*80a68eefSBob Badour _NEON2SSESTORAGE float32x2_t vext_f32(float32x2_t a, float32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0
1970*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t vextq_s8(int8x16_t a, int8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
1971*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vextq_u8(uint8x16_t a, uint8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
1972*80a68eefSBob Badour _NEON2SSESTORAGE poly8x16_t vextq_p8(poly8x16_t a, poly8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
1973*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vextq_s16(int16x8_t a, int16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
1974*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vextq_u16(uint16x8_t a, uint16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
1975*80a68eefSBob Badour _NEON2SSESTORAGE poly16x8_t vextq_p16(poly16x8_t a, poly16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
1976*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vextq_s32(int32x4_t a, int32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0
1977*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vextq_u32(uint32x4_t a, uint32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0
1978*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vextq_s64(int64x2_t a, int64x2_t b, __constrange(0,1) int c); // VEXT.64 q0,q0,q0,#0
1979*80a68eefSBob Badour _NEON2SSESTORAGE uint64x2_t vextq_u64(uint64x2_t a, uint64x2_t b, __constrange(0,1) int c); // VEXT.64 q0,q0,q0,#0
1980*80a68eefSBob Badour _NEON2SSESTORAGE float32x4_t vextq_f32(float32x4_t a, float32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0
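//A minimal usage sketch for the vector extract intrinsics above (illustrative addition). vext
//concatenates the two operands and extracts a full vector starting at element 'c' of the first one,
//i.e. the top (lanes-c) elements of 'a' followed by the low c elements of 'b':
//    uint8x8_t a = vcreate_u8(0x0706050403020100ULL);    /* {0,1,2,3,4,5,6,7}       */
//    uint8x8_t b = vcreate_u8(0x0F0E0D0C0B0A0908ULL);    /* {8,9,10,11,12,13,14,15} */
//    uint8x8_t r = vext_u8(a, b, 3);                     /* {3,4,5,6,7,8,9,10}      */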
1981*80a68eefSBob Badour //Reverse vector elements (swap endianness). VREVn.m reverses the order of the m-bit lanes within a set that is n bits wide.
1982*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vrev64_s8(int8x8_t vec); // VREV64.8 d0,d0
1983*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vrev64_s16(int16x4_t vec); // VREV64.16 d0,d0
1984*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vrev64_s32(int32x2_t vec); // VREV64.32 d0,d0
1985*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vrev64_u8(uint8x8_t vec); // VREV64.8 d0,d0
1986*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vrev64_u16(uint16x4_t vec); // VREV64.16 d0,d0
1987*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vrev64_u32(uint32x2_t vec); // VREV64.32 d0,d0
1988*80a68eefSBob Badour _NEON2SSESTORAGE poly8x8_t vrev64_p8(poly8x8_t vec); // VREV64.8 d0,d0
1989*80a68eefSBob Badour _NEON2SSESTORAGE poly16x4_t vrev64_p16(poly16x4_t vec); // VREV64.16 d0,d0
1990*80a68eefSBob Badour _NEON2SSESTORAGE float32x2_t vrev64_f32(float32x2_t vec); // VREV64.32 d0,d0
1991*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t vrev64q_s8(int8x16_t vec); // VREV64.8 q0,q0
1992*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vrev64q_s16(int16x8_t vec); // VREV64.16 q0,q0
1993*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vrev64q_s32(int32x4_t vec); // VREV64.32 q0,q0
1994*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vrev64q_u8(uint8x16_t vec); // VREV64.8 q0,q0
1995*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vrev64q_u16(uint16x8_t vec); // VREV64.16 q0,q0
1996*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vrev64q_u32(uint32x4_t vec); // VREV64.32 q0,q0
1997*80a68eefSBob Badour _NEON2SSESTORAGE poly8x16_t vrev64q_p8(poly8x16_t vec); // VREV64.8 q0,q0
1998*80a68eefSBob Badour _NEON2SSESTORAGE poly16x8_t vrev64q_p16(poly16x8_t vec); // VREV64.16 q0,q0
1999*80a68eefSBob Badour _NEON2SSESTORAGE float32x4_t vrev64q_f32(float32x4_t vec); // VREV64.32 q0,q0
2000*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vrev32_s8(int8x8_t vec); // VREV32.8 d0,d0
2001*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vrev32_s16(int16x4_t vec); // VREV32.16 d0,d0
2002*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vrev32_u8(uint8x8_t vec); // VREV32.8 d0,d0
2003*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vrev32_u16(uint16x4_t vec); // VREV32.16 d0,d0
2004*80a68eefSBob Badour _NEON2SSESTORAGE poly8x8_t vrev32_p8(poly8x8_t vec); // VREV32.8 d0,d0
2005*80a68eefSBob Badour _NEON2SSESTORAGE poly16x4_t vrev32_p16(poly16x4_t vec); // VREV32.16 d0,d0
2006*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t vrev32q_s8(int8x16_t vec); // VREV32.8 q0,q0
2007*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vrev32q_s16(int16x8_t vec); // VREV32.16 q0,q0
2008*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vrev32q_u8(uint8x16_t vec); // VREV32.8 q0,q0
2009*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vrev32q_u16(uint16x8_t vec); // VREV32.16 q0,q0
2010*80a68eefSBob Badour _NEON2SSESTORAGE poly8x16_t vrev32q_p8(poly8x16_t vec); // VREV32.8 q0,q0
2011*80a68eefSBob Badour _NEON2SSESTORAGE poly16x8_t vrev32q_p16(poly16x8_t vec); // VREV32.16 q0,q0
2012*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vrev16_s8(int8x8_t vec); // VREV16.8 d0,d0
2013*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vrev16_u8(uint8x8_t vec); // VREV16.8 d0,d0
2014*80a68eefSBob Badour _NEON2SSESTORAGE poly8x8_t vrev16_p8(poly8x8_t vec); // VREV16.8 d0,d0
2015*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t vrev16q_s8(int8x16_t vec); // VREV16.8 q0,q0
2016*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vrev16q_u8(uint8x16_t vec); // VREV16.8 q0,q0
2017*80a68eefSBob Badour _NEON2SSESTORAGE poly8x16_t vrev16q_p8(poly8x16_t vec); // VREV16.8 q0,q0
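//A minimal usage sketch for the reverse intrinsics above (illustrative addition). vrev64 reverses the
//element order within each 64-bit group, vrev32 within each 32-bit group, vrev16 within each 16-bit group:
//    uint8x8_t v   = vcreate_u8(0x0706050403020100ULL);  /* {0,1,2,3,4,5,6,7} */
//    uint8x8_t r64 = vrev64_u8(v);                       /* {7,6,5,4,3,2,1,0} */
//    uint8x8_t r32 = vrev32_u8(v);                       /* {3,2,1,0,7,6,5,4} */
//    uint8x8_t r16 = vrev16_u8(v);                       /* {1,0,3,2,5,4,7,6} */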
2018*80a68eefSBob Badour //Other single operand arithmetic
2019*80a68eefSBob Badour //Absolute: Vd[i] = |Va[i]|
2020*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vabs_s8(int8x8_t a); // VABS.S8 d0,d0
2021*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vabs_s16(int16x4_t a); // VABS.S16 d0,d0
2022*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vabs_s32(int32x2_t a); // VABS.S32 d0,d0
2023*80a68eefSBob Badour _NEON2SSESTORAGE float32x2_t vabs_f32(float32x2_t a); // VABS.F32 d0,d0
2024*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t vabsq_s8(int8x16_t a); // VABS.S8 q0,q0
2025*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vabsq_s16(int16x8_t a); // VABS.S16 q0,q0
2026*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vabsq_s32(int32x4_t a); // VABS.S32 q0,q0
2027*80a68eefSBob Badour _NEON2SSESTORAGE float32x4_t vabsq_f32(float32x4_t a); // VABS.F32 q0,q0
2028*80a68eefSBob Badour 
2029*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vabsq_s64(int64x2_t a); // VABS.S64 q0,q0
2030*80a68eefSBob Badour _NEON2SSESTORAGE float64x2_t vabsq_f64(float64x2_t a); // VABS.F64 q0,q0
2031*80a68eefSBob Badour 
2032*80a68eefSBob Badour //Saturating absolute: Vd[i] = sat(|Va[i]|)
2033*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vqabs_s8(int8x8_t a); // VQABS.S8 d0,d0
2034*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vqabs_s16(int16x4_t a); // VQABS.S16 d0,d0
2035*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vqabs_s32(int32x2_t a); // VQABS.S32 d0,d0
2036*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t vqabsq_s8(int8x16_t a); // VQABS.S8 q0,q0
2037*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vqabsq_s16(int16x8_t a); // VQABS.S16 q0,q0
2038*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vqabsq_s32(int32x4_t a); // VQABS.S32 q0,q0
2039*80a68eefSBob Badour //Negate: Vd[i] = - Va[i]
2040*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vneg_s8(int8x8_t a); // VNEG.S8 d0,d0
2041*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vneg_s16(int16x4_t a); // VNEG.S16 d0,d0
2042*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vneg_s32(int32x2_t a); // VNEG.S32 d0,d0
2043*80a68eefSBob Badour _NEON2SSESTORAGE float32x2_t vneg_f32(float32x2_t a); // VNEG.F32 d0,d0
2044*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t vnegq_s8(int8x16_t a); // VNEG.S8 q0,q0
2045*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vnegq_s16(int16x8_t a); // VNEG.S16 q0,q0
2046*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vnegq_s32(int32x4_t a); // VNEG.S32 q0,q0
2047*80a68eefSBob Badour _NEON2SSESTORAGE float32x4_t vnegq_f32(float32x4_t a); // VNEG.F32 q0,q0
2048*80a68eefSBob Badour //Saturating negate: Vd[i] = sat(- Va[i])
2049*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vqneg_s8(int8x8_t a); // VQNEG.S8 d0,d0
2050*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vqneg_s16(int16x4_t a); // VQNEG.S16 d0,d0
2051*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vqneg_s32(int32x2_t a); // VQNEG.S32 d0,d0
2052*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t vqnegq_s8(int8x16_t a); // VQNEG.S8 q0,q0
2053*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vqnegq_s16(int16x8_t a); // VQNEG.S16 q0,q0
2054*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vqnegq_s32(int32x4_t a); // VQNEG.S32 q0,q0
2055*80a68eefSBob Badour //Count leading sign bits
2056*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vcls_s8(int8x8_t a); // VCLS.S8 d0,d0
2057*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vcls_s16(int16x4_t a); // VCLS.S16 d0,d0
2058*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vcls_s32(int32x2_t a); // VCLS.S32 d0,d0
2059*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t vclsq_s8(int8x16_t a); // VCLS.S8 q0,q0
2060*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vclsq_s16(int16x8_t a); // VCLS.S16 q0,q0
2061*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vclsq_s32(int32x4_t a); // VCLS.S32 q0,q0
2062*80a68eefSBob Badour //Count leading zeros
2063*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vclz_s8(int8x8_t a); // VCLZ.I8 d0,d0
2064*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vclz_s16(int16x4_t a); // VCLZ.I16 d0,d0
2065*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vclz_s32(int32x2_t a); // VCLZ.I32 d0,d0
2066*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vclz_u8(uint8x8_t a); // VCLZ.I8 d0,d0
2067*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vclz_u16(uint16x4_t a); // VCLZ.I16 d0,d0
2068*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vclz_u32(uint32x2_t a); // VCLZ.I32 d0,d0
2069*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t vclzq_s8(int8x16_t a); // VCLZ.I8 q0,q0
2070*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vclzq_s16(int16x8_t a); // VCLZ.I16 q0,q0
2071*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vclzq_s32(int32x4_t a); // VCLZ.I32 q0,q0
2072*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vclzq_u8(uint8x16_t a); // VCLZ.I8 q0,q0
2073*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vclzq_u16(uint16x8_t a); // VCLZ.I16 q0,q0
2074*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vclzq_u32(uint32x4_t a); // VCLZ.I32 q0,q0
2075*80a68eefSBob Badour //Count number of set bits
2076*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vcnt_u8(uint8x8_t a); // VCNT.8 d0,d0
2077*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vcnt_s8(int8x8_t a); // VCNT.8 d0,d0
2078*80a68eefSBob Badour _NEON2SSESTORAGE poly8x8_t vcnt_p8(poly8x8_t a); // VCNT.8 d0,d0
2079*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vcntq_u8(uint8x16_t a); // VCNT.8 q0,q0
2080*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t vcntq_s8(int8x16_t a); // VCNT.8 q0,q0
2081*80a68eefSBob Badour _NEON2SSESTORAGE poly8x16_t vcntq_p8(poly8x16_t a); // VCNT.8 q0,q0
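//A minimal usage sketch for the bit-counting intrinsics above (illustrative addition). vcls counts the
//consecutive bits below the sign bit that equal it, vclz counts leading zeros, and vcnt counts the set
//bits in each byte:
//    int8x8_t  a = vdup_n_s8(0x0F);             /* 00001111b                           */
//    int8x8_t  s = vcls_s8(a);                  /* {3,...} - three bits match the sign */
//    int8x8_t  z = vclz_s8(a);                  /* {4,...} - four leading zeros        */
//    uint8x8_t c = vcnt_u8(vdup_n_u8(0xF0));    /* {4,...} - four set bits per byte    */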
2082*80a68eefSBob Badour //Reciprocal estimate
2083*80a68eefSBob Badour _NEON2SSESTORAGE float32x2_t vrecpe_f32(float32x2_t a); // VRECPE.F32 d0,d0
2084*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vrecpe_u32(uint32x2_t a); // VRECPE.U32 d0,d0
2085*80a68eefSBob Badour _NEON2SSESTORAGE float32x4_t vrecpeq_f32(float32x4_t a); // VRECPE.F32 q0,q0
2086*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vrecpeq_u32(uint32x4_t a); // VRECPE.U32 q0,q0
2087*80a68eefSBob Badour //Reciprocal square root estimate
2088*80a68eefSBob Badour _NEON2SSESTORAGE float32x2_t vrsqrte_f32(float32x2_t a); // VRSQRTE.F32 d0,d0
2089*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vrsqrte_u32(uint32x2_t a); // VRSQRTE.U32 d0,d0
2090*80a68eefSBob Badour _NEON2SSESTORAGE float32x4_t vrsqrteq_f32(float32x4_t a); // VRSQRTE.F32 q0,q0
2091*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vrsqrteq_u32(uint32x4_t a); // VRSQRTE.U32 q0,q0
2092*80a68eefSBob Badour //Logical operations
2093*80a68eefSBob Badour //Bitwise not
2094*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vmvn_s8(int8x8_t a); // VMVN d0,d0
2095*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vmvn_s16(int16x4_t a); // VMVN d0,d0
2096*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vmvn_s32(int32x2_t a); // VMVN d0,d0
2097*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vmvn_u8(uint8x8_t a); // VMVN d0,d0
2098*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vmvn_u16(uint16x4_t a); // VMVN d0,d0
2099*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vmvn_u32(uint32x2_t a); // VMVN d0,d0
2100*80a68eefSBob Badour _NEON2SSESTORAGE poly8x8_t vmvn_p8(poly8x8_t a); // VMVN d0,d0
2101*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t vmvnq_s8(int8x16_t a); // VMVN q0,q0
2102*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vmvnq_s16(int16x8_t a); // VMVN q0,q0
2103*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vmvnq_s32(int32x4_t a); // VMVN q0,q0
2104*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vmvnq_u8(uint8x16_t a); // VMVN q0,q0
2105*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vmvnq_u16(uint16x8_t a); // VMVN q0,q0
2106*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vmvnq_u32(uint32x4_t a); // VMVN q0,q0
2107*80a68eefSBob Badour _NEON2SSESTORAGE poly8x16_t vmvnq_p8(poly8x16_t a); // VMVN q0,q0
2108*80a68eefSBob Badour //Bitwise and
2109*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vand_s8(int8x8_t a, int8x8_t b); // VAND d0,d0,d0
2110*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vand_s16(int16x4_t a, int16x4_t b); // VAND d0,d0,d0
2111*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vand_s32(int32x2_t a, int32x2_t b); // VAND d0,d0,d0
2112*80a68eefSBob Badour _NEON2SSESTORAGE int64x1_t vand_s64(int64x1_t a, int64x1_t b); // VAND d0,d0,d0
2113*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vand_u8(uint8x8_t a, uint8x8_t b); // VAND d0,d0,d0
2114*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vand_u16(uint16x4_t a, uint16x4_t b); // VAND d0,d0,d0
2115*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vand_u32(uint32x2_t a, uint32x2_t b); // VAND d0,d0,d0
2116*80a68eefSBob Badour _NEON2SSESTORAGE uint64x1_t vand_u64(uint64x1_t a, uint64x1_t b); // VAND d0,d0,d0
2117*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t vandq_s8(int8x16_t a, int8x16_t b); // VAND q0,q0,q0
2118*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vandq_s16(int16x8_t a, int16x8_t b); // VAND q0,q0,q0
2119*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vandq_s32(int32x4_t a, int32x4_t b); // VAND q0,q0,q0
2120*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vandq_s64(int64x2_t a, int64x2_t b); // VAND q0,q0,q0
2121*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vandq_u8(uint8x16_t a, uint8x16_t b); // VAND q0,q0,q0
2122*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vandq_u16(uint16x8_t a, uint16x8_t b); // VAND q0,q0,q0
2123*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vandq_u32(uint32x4_t a, uint32x4_t b); // VAND q0,q0,q0
2124*80a68eefSBob Badour _NEON2SSESTORAGE uint64x2_t vandq_u64(uint64x2_t a, uint64x2_t b); // VAND q0,q0,q0
2125*80a68eefSBob Badour //Bitwise or
2126*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vorr_s8(int8x8_t a, int8x8_t b); // VORR d0,d0,d0
2127*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vorr_s16(int16x4_t a, int16x4_t b); // VORR d0,d0,d0
2128*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vorr_s32(int32x2_t a, int32x2_t b); // VORR d0,d0,d0
2129*80a68eefSBob Badour _NEON2SSESTORAGE int64x1_t vorr_s64(int64x1_t a, int64x1_t b); // VORR d0,d0,d0
2130*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vorr_u8(uint8x8_t a, uint8x8_t b); // VORR d0,d0,d0
2131*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vorr_u16(uint16x4_t a, uint16x4_t b); // VORR d0,d0,d0
2132*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vorr_u32(uint32x2_t a, uint32x2_t b); // VORR d0,d0,d0
2133*80a68eefSBob Badour _NEON2SSESTORAGE uint64x1_t vorr_u64(uint64x1_t a, uint64x1_t b); // VORR d0,d0,d0
2134*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t vorrq_s8(int8x16_t a, int8x16_t b); // VORR q0,q0,q0
2135*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vorrq_s16(int16x8_t a, int16x8_t b); // VORR q0,q0,q0
2136*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vorrq_s32(int32x4_t a, int32x4_t b); // VORR q0,q0,q0
2137*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vorrq_s64(int64x2_t a, int64x2_t b); // VORR q0,q0,q0
2138*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vorrq_u8(uint8x16_t a, uint8x16_t b); // VORR q0,q0,q0
2139*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vorrq_u16(uint16x8_t a, uint16x8_t b); // VORR q0,q0,q0
2140*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vorrq_u32(uint32x4_t a, uint32x4_t b); // VORR q0,q0,q0
2141*80a68eefSBob Badour _NEON2SSESTORAGE uint64x2_t vorrq_u64(uint64x2_t a, uint64x2_t b); // VORR q0,q0,q0
2142*80a68eefSBob Badour //Bitwise exclusive or (EOR or XOR)
2143*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t veor_s8(int8x8_t a, int8x8_t b); // VEOR d0,d0,d0
2144*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t veor_s16(int16x4_t a, int16x4_t b); // VEOR d0,d0,d0
2145*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t veor_s32(int32x2_t a, int32x2_t b); // VEOR d0,d0,d0
2146*80a68eefSBob Badour _NEON2SSESTORAGE int64x1_t veor_s64(int64x1_t a, int64x1_t b); // VEOR d0,d0,d0
2147*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t veor_u8(uint8x8_t a, uint8x8_t b); // VEOR d0,d0,d0
2148*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t veor_u16(uint16x4_t a, uint16x4_t b); // VEOR d0,d0,d0
2149*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t veor_u32(uint32x2_t a, uint32x2_t b); // VEOR d0,d0,d0
2150*80a68eefSBob Badour _NEON2SSESTORAGE uint64x1_t veor_u64(uint64x1_t a, uint64x1_t b); // VEOR d0,d0,d0
2151*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t veorq_s8(int8x16_t a, int8x16_t b); // VEOR q0,q0,q0
2152*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t veorq_s16(int16x8_t a, int16x8_t b); // VEOR q0,q0,q0
2153*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t veorq_s32(int32x4_t a, int32x4_t b); // VEOR q0,q0,q0
2154*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t veorq_s64(int64x2_t a, int64x2_t b); // VEOR q0,q0,q0
2155*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t veorq_u8(uint8x16_t a, uint8x16_t b); // VEOR q0,q0,q0
2156*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t veorq_u16(uint16x8_t a, uint16x8_t b); // VEOR q0,q0,q0
2157*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t veorq_u32(uint32x4_t a, uint32x4_t b); // VEOR q0,q0,q0
2158*80a68eefSBob Badour _NEON2SSESTORAGE uint64x2_t veorq_u64(uint64x2_t a, uint64x2_t b); // VEOR q0,q0,q0
2159*80a68eefSBob Badour //Bit Clear
2160*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vbic_s8(int8x8_t a, int8x8_t b); // VBIC d0,d0,d0
2161*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vbic_s16(int16x4_t a, int16x4_t b); // VBIC d0,d0,d0
2162*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vbic_s32(int32x2_t a, int32x2_t b); // VBIC d0,d0,d0
2163*80a68eefSBob Badour _NEON2SSESTORAGE int64x1_t vbic_s64(int64x1_t a, int64x1_t b); // VBIC d0,d0,d0
2164*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vbic_u8(uint8x8_t a, uint8x8_t b); // VBIC d0,d0,d0
2165*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vbic_u16(uint16x4_t a, uint16x4_t b); // VBIC d0,d0,d0
2166*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vbic_u32(uint32x2_t a, uint32x2_t b); // VBIC d0,d0,d0
2167*80a68eefSBob Badour _NEON2SSESTORAGE uint64x1_t vbic_u64(uint64x1_t a, uint64x1_t b); // VBIC d0,d0,d0
2168*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t vbicq_s8(int8x16_t a, int8x16_t b); // VBIC q0,q0,q0
2169*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vbicq_s16(int16x8_t a, int16x8_t b); // VBIC q0,q0,q0
2170*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vbicq_s32(int32x4_t a, int32x4_t b); // VBIC q0,q0,q0
2171*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vbicq_s64(int64x2_t a, int64x2_t b); // VBIC q0,q0,q0
2172*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vbicq_u8(uint8x16_t a, uint8x16_t b); // VBIC q0,q0,q0
2173*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vbicq_u16(uint16x8_t a, uint16x8_t b); // VBIC q0,q0,q0
2174*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vbicq_u32(uint32x4_t a, uint32x4_t b); // VBIC q0,q0,q0
2175*80a68eefSBob Badour _NEON2SSESTORAGE uint64x2_t vbicq_u64(uint64x2_t a, uint64x2_t b); // VBIC q0,q0,q0
2176*80a68eefSBob Badour //Bitwise OR complement
2177*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vorn_s8(int8x8_t a, int8x8_t b); // VORN d0,d0,d0
2178*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vorn_s16(int16x4_t a, int16x4_t b); // VORN d0,d0,d0
2179*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vorn_s32(int32x2_t a, int32x2_t b); // VORN d0,d0,d0
2180*80a68eefSBob Badour _NEON2SSESTORAGE int64x1_t vorn_s64(int64x1_t a, int64x1_t b); // VORN d0,d0,d0
2181*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vorn_u8(uint8x8_t a, uint8x8_t b); // VORN d0,d0,d0
2182*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vorn_u16(uint16x4_t a, uint16x4_t b); // VORN d0,d0,d0
2183*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vorn_u32(uint32x2_t a, uint32x2_t b); // VORN d0,d0,d0
2184*80a68eefSBob Badour _NEON2SSESTORAGE uint64x1_t vorn_u64(uint64x1_t a, uint64x1_t b); // VORN d0,d0,d0
2185*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t vornq_s8(int8x16_t a, int8x16_t b); // VORN q0,q0,q0
2186*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vornq_s16(int16x8_t a, int16x8_t b); // VORN q0,q0,q0
2187*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vornq_s32(int32x4_t a, int32x4_t b); // VORN q0,q0,q0
2188*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vornq_s64(int64x2_t a, int64x2_t b); // VORN q0,q0,q0
2189*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vornq_u8(uint8x16_t a, uint8x16_t b); // VORN q0,q0,q0
2190*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vornq_u16(uint16x8_t a, uint16x8_t b); // VORN q0,q0,q0
2191*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vornq_u32(uint32x4_t a, uint32x4_t b); // VORN q0,q0,q0
2192*80a68eefSBob Badour _NEON2SSESTORAGE uint64x2_t vornq_u64(uint64x2_t a, uint64x2_t b); // VORN q0,q0,q0
2193*80a68eefSBob Badour //Bitwise Select
2194*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vbsl_s8(uint8x8_t a, int8x8_t b, int8x8_t c); // VBSL d0,d0,d0
2195*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vbsl_s16(uint16x4_t a, int16x4_t b, int16x4_t c); // VBSL d0,d0,d0
2196*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vbsl_s32(uint32x2_t a, int32x2_t b, int32x2_t c); // VBSL d0,d0,d0
2197*80a68eefSBob Badour _NEON2SSESTORAGE int64x1_t vbsl_s64(uint64x1_t a, int64x1_t b, int64x1_t c); // VBSL d0,d0,d0
2198*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vbsl_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VBSL d0,d0,d0
2199*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vbsl_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VBSL d0,d0,d0
2200*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vbsl_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VBSL d0,d0,d0
2201*80a68eefSBob Badour _NEON2SSESTORAGE uint64x1_t vbsl_u64(uint64x1_t a, uint64x1_t b, uint64x1_t c); // VBSL d0,d0,d0
2202*80a68eefSBob Badour _NEON2SSESTORAGE float32x2_t vbsl_f32(uint32x2_t a, float32x2_t b, float32x2_t c); // VBSL d0,d0,d0
2203*80a68eefSBob Badour _NEON2SSESTORAGE poly8x8_t vbsl_p8(uint8x8_t a, poly8x8_t b, poly8x8_t c); // VBSL d0,d0,d0
2204*80a68eefSBob Badour _NEON2SSESTORAGE poly16x4_t vbsl_p16(uint16x4_t a, poly16x4_t b, poly16x4_t c); // VBSL d0,d0,d0
2205*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c); // VBSL q0,q0,q0
2206*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vbslq_s16(uint16x8_t a, int16x8_t b, int16x8_t c); // VBSL q0,q0,q0
2207*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vbslq_s32(uint32x4_t a, int32x4_t b, int32x4_t c); // VBSL q0,q0,q0
2208*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vbslq_s64(uint64x2_t a, int64x2_t b, int64x2_t c); // VBSL q0,q0,q0
2209*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vbslq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VBSL q0,q0,q0
2210*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vbslq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VBSL q0,q0,q0
2211*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vbslq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VBSL q0,q0,q0
2212*80a68eefSBob Badour _NEON2SSESTORAGE uint64x2_t vbslq_u64(uint64x2_t a, uint64x2_t b, uint64x2_t c); // VBSL q0,q0,q0
2213*80a68eefSBob Badour _NEON2SSESTORAGE float32x4_t vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c); // VBSL q0,q0,q0
2214*80a68eefSBob Badour _NEON2SSESTORAGE poly8x16_t vbslq_p8(uint8x16_t a, poly8x16_t b, poly8x16_t c); // VBSL q0,q0,q0
2215*80a68eefSBob Badour _NEON2SSESTORAGE poly16x8_t vbslq_p16(uint16x8_t a, poly16x8_t b, poly16x8_t c); // VBSL q0,q0,q0
2216*80a68eefSBob Badour //Transposition operations
2217*80a68eefSBob Badour //Transpose elements
2218*80a68eefSBob Badour _NEON2SSESTORAGE int8x8x2_t vtrn_s8(int8x8_t a, int8x8_t b); // VTRN.8 d0,d0
2219*80a68eefSBob Badour _NEON2SSESTORAGE int16x4x2_t vtrn_s16(int16x4_t a, int16x4_t b); // VTRN.16 d0,d0
2220*80a68eefSBob Badour _NEON2SSESTORAGE int32x2x2_t vtrn_s32(int32x2_t a, int32x2_t b); // VTRN.32 d0,d0
2221*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8x2_t vtrn_u8(uint8x8_t a, uint8x8_t b); // VTRN.8 d0,d0
2222*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4x2_t vtrn_u16(uint16x4_t a, uint16x4_t b); // VTRN.16 d0,d0
2223*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2x2_t vtrn_u32(uint32x2_t a, uint32x2_t b); // VTRN.32 d0,d0
2224*80a68eefSBob Badour _NEON2SSESTORAGE float32x2x2_t vtrn_f32(float32x2_t a, float32x2_t b); // VTRN.32 d0,d0
2225*80a68eefSBob Badour _NEON2SSESTORAGE poly8x8x2_t vtrn_p8(poly8x8_t a, poly8x8_t b); // VTRN.8 d0,d0
2226*80a68eefSBob Badour _NEON2SSESTORAGE poly16x4x2_t vtrn_p16(poly16x4_t a, poly16x4_t b); // VTRN.16 d0,d0
2227*80a68eefSBob Badour _NEON2SSESTORAGE int8x16x2_t vtrnq_s8(int8x16_t a, int8x16_t b); // VTRN.8 q0,q0
2228*80a68eefSBob Badour _NEON2SSESTORAGE int16x8x2_t vtrnq_s16(int16x8_t a, int16x8_t b); // VTRN.16 q0,q0
2229*80a68eefSBob Badour _NEON2SSESTORAGE int32x4x2_t vtrnq_s32(int32x4_t a, int32x4_t b); // VTRN.32 q0,q0
2230*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16x2_t vtrnq_u8(uint8x16_t a, uint8x16_t b); // VTRN.8 q0,q0
2231*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8x2_t vtrnq_u16(uint16x8_t a, uint16x8_t b); // VTRN.16 q0,q0
2232*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4x2_t vtrnq_u32(uint32x4_t a, uint32x4_t b); // VTRN.32 q0,q0
2233*80a68eefSBob Badour _NEON2SSESTORAGE float32x4x2_t vtrnq_f32(float32x4_t a, float32x4_t b); // VTRN.32 q0,q0
2234*80a68eefSBob Badour _NEON2SSESTORAGE poly8x16x2_t vtrnq_p8(poly8x16_t a, poly8x16_t b); // VTRN.8 q0,q0
2235*80a68eefSBob Badour _NEON2SSESTORAGE poly16x8x2_t vtrnq_p16(poly16x8_t a, poly16x8_t b); // VTRN.16 q0,q0
2236*80a68eefSBob Badour //Interleave elements
2237*80a68eefSBob Badour _NEON2SSESTORAGE int8x8x2_t vzip_s8(int8x8_t a, int8x8_t b); // VZIP.8 d0,d0
2238*80a68eefSBob Badour _NEON2SSESTORAGE int16x4x2_t vzip_s16(int16x4_t a, int16x4_t b); // VZIP.16 d0,d0
2239*80a68eefSBob Badour _NEON2SSESTORAGE int32x2x2_t vzip_s32(int32x2_t a, int32x2_t b); // VZIP.32 d0,d0
2240*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8x2_t vzip_u8(uint8x8_t a, uint8x8_t b); // VZIP.8 d0,d0
2241*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4x2_t vzip_u16(uint16x4_t a, uint16x4_t b); // VZIP.16 d0,d0
2242*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2x2_t vzip_u32(uint32x2_t a, uint32x2_t b); // VZIP.32 d0,d0
2243*80a68eefSBob Badour _NEON2SSESTORAGE float32x2x2_t vzip_f32(float32x2_t a, float32x2_t b); // VZIP.32 d0,d0
2244*80a68eefSBob Badour _NEON2SSESTORAGE poly8x8x2_t vzip_p8(poly8x8_t a, poly8x8_t b); // VZIP.8 d0,d0
2245*80a68eefSBob Badour _NEON2SSESTORAGE poly16x4x2_t vzip_p16(poly16x4_t a, poly16x4_t b); // VZIP.16 d0,d0
2246*80a68eefSBob Badour _NEON2SSESTORAGE int8x16x2_t vzipq_s8(int8x16_t a, int8x16_t b); // VZIP.8 q0,q0
2247*80a68eefSBob Badour _NEON2SSESTORAGE int16x8x2_t vzipq_s16(int16x8_t a, int16x8_t b); // VZIP.16 q0,q0
2248*80a68eefSBob Badour _NEON2SSESTORAGE int32x4x2_t vzipq_s32(int32x4_t a, int32x4_t b); // VZIP.32 q0,q0
2249*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16x2_t vzipq_u8(uint8x16_t a, uint8x16_t b); // VZIP.8 q0,q0
2250*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8x2_t vzipq_u16(uint16x8_t a, uint16x8_t b); // VZIP.16 q0,q0
2251*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4x2_t vzipq_u32(uint32x4_t a, uint32x4_t b); // VZIP.32 q0,q0
2252*80a68eefSBob Badour _NEON2SSESTORAGE float32x4x2_t vzipq_f32(float32x4_t a, float32x4_t b); // VZIP.32 q0,q0
2253*80a68eefSBob Badour _NEON2SSESTORAGE poly8x16x2_t vzipq_p8(poly8x16_t a, poly8x16_t b); // VZIP.8 q0,q0
2254*80a68eefSBob Badour _NEON2SSESTORAGE poly16x8x2_t vzipq_p16(poly16x8_t a, poly16x8_t b); // VZIP.16 q0,q0
2255*80a68eefSBob Badour //De-Interleave elements
2256*80a68eefSBob Badour _NEON2SSESTORAGE int8x8x2_t vuzp_s8(int8x8_t a, int8x8_t b); // VUZP.8 d0,d0
2257*80a68eefSBob Badour _NEON2SSESTORAGE int16x4x2_t vuzp_s16(int16x4_t a, int16x4_t b); // VUZP.16 d0,d0
2258*80a68eefSBob Badour _NEON2SSESTORAGE int32x2x2_t vuzp_s32(int32x2_t a, int32x2_t b); // VUZP.32 d0,d0
2259*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8x2_t vuzp_u8(uint8x8_t a, uint8x8_t b); // VUZP.8 d0,d0
2260*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4x2_t vuzp_u16(uint16x4_t a, uint16x4_t b); // VUZP.16 d0,d0
2261*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2x2_t vuzp_u32(uint32x2_t a, uint32x2_t b); // VUZP.32 d0,d0
2262*80a68eefSBob Badour _NEON2SSESTORAGE float32x2x2_t vuzp_f32(float32x2_t a, float32x2_t b); // VUZP.32 d0,d0
2263*80a68eefSBob Badour _NEON2SSESTORAGE poly8x8x2_t vuzp_p8(poly8x8_t a, poly8x8_t b); // VUZP.8 d0,d0
2264*80a68eefSBob Badour _NEON2SSESTORAGE poly16x4x2_t vuzp_p16(poly16x4_t a, poly16x4_t b); // VUZP.16 d0,d0
2265*80a68eefSBob Badour _NEON2SSESTORAGE int8x16x2_t vuzpq_s8(int8x16_t a, int8x16_t b); // VUZP.8 q0,q0
2266*80a68eefSBob Badour _NEON2SSESTORAGE int16x8x2_t vuzpq_s16(int16x8_t a, int16x8_t b); // VUZP.16 q0,q0
2267*80a68eefSBob Badour _NEON2SSESTORAGE int32x4x2_t vuzpq_s32(int32x4_t a, int32x4_t b); // VUZP.32 q0,q0
2268*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16x2_t vuzpq_u8(uint8x16_t a, uint8x16_t b); // VUZP.8 q0,q0
2269*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8x2_t vuzpq_u16(uint16x8_t a, uint16x8_t b); // VUZP.16 q0,q0
2270*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4x2_t vuzpq_u32(uint32x4_t a, uint32x4_t b); // VUZP.32 q0,q0
2271*80a68eefSBob Badour _NEON2SSESTORAGE float32x4x2_t vuzpq_f32(float32x4_t a, float32x4_t b); // VUZP.32 q0,q0
2272*80a68eefSBob Badour _NEON2SSESTORAGE poly8x16x2_t vuzpq_p8(poly8x16_t a, poly8x16_t b); // VUZP.8 q0,q0
2273*80a68eefSBob Badour _NEON2SSESTORAGE poly16x8x2_t vuzpq_p16(poly16x8_t a, poly16x8_t b); // VUZP.16 q0,q0
2274*80a68eefSBob Badour 
2275*80a68eefSBob Badour _NEON2SSESTORAGE float32x4_t vrndnq_f32(float32x4_t a); // VRND.F32 q0,q0
2276*80a68eefSBob Badour 
2277*80a68eefSBob Badour _NEON2SSESTORAGE float64x2_t vrndnq_f64(float64x2_t a); // VRND.F64 q0,q0
2278*80a68eefSBob Badour 
2279*80a68eefSBob Badour //Sqrt
2280*80a68eefSBob Badour _NEON2SSESTORAGE float32x4_t vsqrtq_f32(float32x4_t a); // VSQRT.F32 q0,q0
2281*80a68eefSBob Badour 
2282*80a68eefSBob Badour _NEON2SSESTORAGE float64x2_t vsqrtq_f64(float64x2_t a); // VSQRT.F64 q0,q0
2283*80a68eefSBob Badour 
2284*80a68eefSBob Badour 
2285*80a68eefSBob Badour 
2286*80a68eefSBob Badour //^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
2287*80a68eefSBob Badour // the following macros solve the problem of the "immediate parameters requirement" for some x86 intrinsics.
2288*80a68eefSBob Badour // we need it to compile the code unless the "Intrinsic parameter must be an immediate value" error is our goal
2289*80a68eefSBob Badour //
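// Illustrative note (not in the original header): these wrappers let the lane index be an ordinary
// function parameter. For example, a helper such as
//     int16_t get_lane16(__m128i v, int lane) { return _MM_EXTRACT_EPI16(v, lane); }
// maps to the bare intrinsic on the compilers selected below, and elsewhere expands to the
// switch-based inline functions further down, where every case passes a literal lane number so the
// immediate-operand requirement is still met.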
2290*80a68eefSBob Badour #if  ( defined (__INTEL_COMPILER)  || defined (__GNUC__) && !defined(__llvm__) )
2291*80a68eefSBob Badour #   define _MM_ALIGNR_EPI8 _mm_alignr_epi8
2292*80a68eefSBob Badour #   define _MM_EXTRACT_EPI16  (int16_t) _mm_extract_epi16
2293*80a68eefSBob Badour #   define _MM_INSERT_EPI16 _mm_insert_epi16
2294*80a68eefSBob Badour #   ifdef USE_SSE4
2295*80a68eefSBob Badour #       define _MM_EXTRACT_EPI8  _mm_extract_epi8
2296*80a68eefSBob Badour #       define _MM_EXTRACT_EPI32  _mm_extract_epi32
2297*80a68eefSBob Badour #       define _MM_EXTRACT_PS  _mm_extract_ps
2298*80a68eefSBob Badour #       define _MM_INSERT_EPI8  _mm_insert_epi8
2299*80a68eefSBob Badour #       define _MM_INSERT_EPI32 _mm_insert_epi32
2300*80a68eefSBob Badour #       define _MM_INSERT_PS    _mm_insert_ps
2301*80a68eefSBob Badour #       ifdef  _NEON2SSE_64BIT
2302*80a68eefSBob Badour #           define _MM_INSERT_EPI64 _mm_insert_epi64
2303*80a68eefSBob Badour #           define _MM_EXTRACT_EPI64 _mm_extract_epi64
2304*80a68eefSBob Badour #       endif
2305*80a68eefSBob Badour #   endif //SSE4
2306*80a68eefSBob Badour #else
2307*80a68eefSBob Badour #   define _NEON2SSE_COMMA ,
2308*80a68eefSBob Badour #   define _NEON2SSE_SWITCH16(NAME, a, b, LANE) \
2309*80a68eefSBob Badour         switch(LANE)         \
2310*80a68eefSBob Badour         {                \
2311*80a68eefSBob Badour         case 0:     return NAME(a b, 0); \
2312*80a68eefSBob Badour         case 1:     return NAME(a b, 1); \
2313*80a68eefSBob Badour         case 2:     return NAME(a b, 2); \
2314*80a68eefSBob Badour         case 3:     return NAME(a b, 3); \
2315*80a68eefSBob Badour         case 4:     return NAME(a b, 4); \
2316*80a68eefSBob Badour         case 5:     return NAME(a b, 5); \
2317*80a68eefSBob Badour         case 6:     return NAME(a b, 6); \
2318*80a68eefSBob Badour         case 7:     return NAME(a b, 7); \
2319*80a68eefSBob Badour         case 8:     return NAME(a b, 8); \
2320*80a68eefSBob Badour         case 9:     return NAME(a b, 9); \
2321*80a68eefSBob Badour         case 10:    return NAME(a b, 10); \
2322*80a68eefSBob Badour         case 11:    return NAME(a b, 11); \
2323*80a68eefSBob Badour         case 12:    return NAME(a b, 12); \
2324*80a68eefSBob Badour         case 13:    return NAME(a b, 13); \
2325*80a68eefSBob Badour         case 14:    return NAME(a b, 14); \
2326*80a68eefSBob Badour         case 15:    return NAME(a b, 15); \
2327*80a68eefSBob Badour         default:    return NAME(a b, 0); \
2328*80a68eefSBob Badour         }
2329*80a68eefSBob Badour 
2330*80a68eefSBob Badour #   define _NEON2SSE_SWITCH8(NAME, vec, LANE, p) \
2331*80a68eefSBob Badour         switch(LANE)              \
2332*80a68eefSBob Badour         {                          \
2333*80a68eefSBob Badour         case 0:  return NAME(vec p,0); \
2334*80a68eefSBob Badour         case 1:  return NAME(vec p,1); \
2335*80a68eefSBob Badour         case 2:  return NAME(vec p,2); \
2336*80a68eefSBob Badour         case 3:  return NAME(vec p,3); \
2337*80a68eefSBob Badour         case 4:  return NAME(vec p,4); \
2338*80a68eefSBob Badour         case 5:  return NAME(vec p,5); \
2339*80a68eefSBob Badour         case 6:  return NAME(vec p,6); \
2340*80a68eefSBob Badour         case 7:  return NAME(vec p,7); \
2341*80a68eefSBob Badour         default: return NAME(vec p,0); \
2342*80a68eefSBob Badour         }
2343*80a68eefSBob Badour 
2344*80a68eefSBob Badour #   define _NEON2SSE_SWITCH4(NAME, case0, case1, case2, case3, vec, LANE, p) \
2345*80a68eefSBob Badour         switch(LANE)              \
2346*80a68eefSBob Badour         {                          \
2347*80a68eefSBob Badour         case case0:  return NAME(vec p,case0); \
2348*80a68eefSBob Badour         case case1:  return NAME(vec p,case1); \
2349*80a68eefSBob Badour         case case2:  return NAME(vec p,case2); \
2350*80a68eefSBob Badour         case case3:  return NAME(vec p,case3); \
2351*80a68eefSBob Badour         default:     return NAME(vec p,case0); \
2352*80a68eefSBob Badour         }
2353*80a68eefSBob Badour 
2354*80a68eefSBob Badour     _NEON2SSE_INLINE __m128i _MM_ALIGNR_EPI8(__m128i a, __m128i b, int LANE)
2355*80a68eefSBob Badour     {
2356*80a68eefSBob Badour         _NEON2SSE_SWITCH16(_mm_alignr_epi8, a, _NEON2SSE_COMMA b, LANE)
2357*80a68eefSBob Badour     }
2358*80a68eefSBob Badour 
2359*80a68eefSBob Badour     _NEON2SSE_INLINE __m128i  _MM_INSERT_EPI16(__m128i vec, int p, const int LANE)
2360*80a68eefSBob Badour     {
2361*80a68eefSBob Badour         _NEON2SSE_SWITCH8(_mm_insert_epi16, vec, LANE, _NEON2SSE_COMMA p)
2362*80a68eefSBob Badour     }
2363*80a68eefSBob Badour 
2364*80a68eefSBob Badour     _NEON2SSE_INLINE int16_t _MM_EXTRACT_EPI16(__m128i vec, const int LANE)
2365*80a68eefSBob Badour     {
2366*80a68eefSBob Badour         _NEON2SSE_SWITCH8(_mm_extract_epi16, vec, LANE,)
2367*80a68eefSBob Badour     }
2368*80a68eefSBob Badour 
2369*80a68eefSBob Badour #ifdef USE_SSE4
2370*80a68eefSBob Badour         _NEON2SSE_INLINE int _MM_EXTRACT_EPI32(__m128i vec, const int LANE)
2371*80a68eefSBob Badour         {
2372*80a68eefSBob Badour             _NEON2SSE_SWITCH4(_mm_extract_epi32, 0,1,2,3, vec, LANE,)
2373*80a68eefSBob Badour         }
2374*80a68eefSBob Badour 
2375*80a68eefSBob Badour         _NEON2SSE_INLINE int _MM_EXTRACT_PS(__m128 vec, const int LANE)
2376*80a68eefSBob Badour         {
2377*80a68eefSBob Badour             _NEON2SSE_SWITCH4(_mm_extract_ps, 0,1,2,3, vec, LANE,)
2378*80a68eefSBob Badour         }
2379*80a68eefSBob Badour 
2380*80a68eefSBob Badour         _NEON2SSE_INLINE int _MM_EXTRACT_EPI8(__m128i vec, const int LANE)
2381*80a68eefSBob Badour         {
2382*80a68eefSBob Badour             _NEON2SSE_SWITCH16(_mm_extract_epi8, vec, , LANE)
2383*80a68eefSBob Badour         }
2384*80a68eefSBob Badour 
2385*80a68eefSBob Badour         _NEON2SSE_INLINE __m128i  _MM_INSERT_EPI32(__m128i vec, int p, const int LANE)
2386*80a68eefSBob Badour         {
2387*80a68eefSBob Badour             _NEON2SSE_SWITCH4(_mm_insert_epi32, 0, 1, 2, 3, vec, LANE, _NEON2SSE_COMMA p)
2388*80a68eefSBob Badour         }
2389*80a68eefSBob Badour 
2390*80a68eefSBob Badour         _NEON2SSE_INLINE __m128i  _MM_INSERT_EPI8(__m128i vec, int p, const int LANE)
2391*80a68eefSBob Badour         {
2392*80a68eefSBob Badour             _NEON2SSE_SWITCH16(_mm_insert_epi8, vec, _NEON2SSE_COMMA p, LANE)
2393*80a68eefSBob Badour         }
2394*80a68eefSBob Badour 
2395*80a68eefSBob Badour #ifdef  _NEON2SSE_64BIT
2396*80a68eefSBob Badour             //the special case of functions available only for SSE4 and 64-bit build.
2397*80a68eefSBob Badour             _NEON2SSE_INLINE __m128i  _MM_INSERT_EPI64(__m128i vec, int p, const int LANE)
2398*80a68eefSBob Badour             {
2399*80a68eefSBob Badour                 switch(LANE) {
2400*80a68eefSBob Badour                 case 0:
2401*80a68eefSBob Badour                     return _mm_insert_epi64(vec,  p, 0);
2402*80a68eefSBob Badour                 case 1:
2403*80a68eefSBob Badour                     return _mm_insert_epi64(vec,  p, 1);
2404*80a68eefSBob Badour                 default:
2405*80a68eefSBob Badour                     return _mm_insert_epi64(vec,  p, 0);
2406*80a68eefSBob Badour                 }
2407*80a68eefSBob Badour             }
2408*80a68eefSBob Badour 
2409*80a68eefSBob Badour             _NEON2SSE_INLINE int64_t _MM_EXTRACT_EPI64(__m128i val, const int LANE)
2410*80a68eefSBob Badour             {
2411*80a68eefSBob Badour                 if (LANE ==0) return _mm_extract_epi64(val, 0);
2412*80a68eefSBob Badour                 else return _mm_extract_epi64(val, 1);
2413*80a68eefSBob Badour             }
2414*80a68eefSBob Badour #endif
2415*80a68eefSBob Badour 
2416*80a68eefSBob Badour         _NEON2SSE_INLINE __m128 _MM_INSERT_PS(__m128 vec, __m128 p, const int LANE)
2417*80a68eefSBob Badour         {
2418*80a68eefSBob Badour             _NEON2SSE_SWITCH4(_mm_insert_ps, 0, 16, 32, 48, vec, LANE, _NEON2SSE_COMMA p)
2419*80a68eefSBob Badour         }
2420*80a68eefSBob Badour 
2421*80a68eefSBob Badour #endif //USE_SSE4
2422*80a68eefSBob Badour 
2423*80a68eefSBob Badour #endif     //#if defined (__INTEL_COMPILER) || defined (__GNUC__) && !defined(__llvm__)
2424*80a68eefSBob Badour 
2425*80a68eefSBob Badour //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
2426*80a68eefSBob Badour // Below are some helper functions used either to "emulate" SSE4 intrinsics on devices limited to SSSE3,
2427*80a68eefSBob Badour // or to implement some specific, commonly used operations that are missing in SSE
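// Illustrative example (not in the original header): both paths give the same result, e.g.
//     __m128i v   = _mm_setr_epi8((char)0xFA, 1, 2, 3, 0,0,0,0, 0,0,0,0, 0,0,0,0);
//     __m128i w16 = _MM_CVTEPU8_EPI16(v);   // low 8 lanes zero-extended: 250, 1, 2, 3, 0, ...
// With USE_SSE4 defined this is the single _mm_cvtepu8_epi16 call; without it the fallback below
// interleaves the bytes with zero, producing an identical vector.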
2428*80a68eefSBob Badour #ifdef USE_SSE4
2429*80a68eefSBob Badour #   define _MM_CVTEPU8_EPI16  _mm_cvtepu8_epi16
2430*80a68eefSBob Badour #   define _MM_CVTEPU16_EPI32 _mm_cvtepu16_epi32
2431*80a68eefSBob Badour #   define _MM_CVTEPU32_EPI64  _mm_cvtepu32_epi64
2432*80a68eefSBob Badour 
2433*80a68eefSBob Badour #   define _MM_CVTEPI8_EPI16  _mm_cvtepi8_epi16
2434*80a68eefSBob Badour #   define _MM_CVTEPI16_EPI32 _mm_cvtepi16_epi32
2435*80a68eefSBob Badour #   define _MM_CVTEPI32_EPI64  _mm_cvtepi32_epi64
2436*80a68eefSBob Badour 
2437*80a68eefSBob Badour #   define _MM_MAX_EPI8  _mm_max_epi8
2438*80a68eefSBob Badour #   define _MM_MAX_EPI32 _mm_max_epi32
2439*80a68eefSBob Badour #   define _MM_MAX_EPU16 _mm_max_epu16
2440*80a68eefSBob Badour #   define _MM_MAX_EPU32 _mm_max_epu32
2441*80a68eefSBob Badour 
2442*80a68eefSBob Badour #   define _MM_MIN_EPI8  _mm_min_epi8
2443*80a68eefSBob Badour #   define _MM_MIN_EPI32 _mm_min_epi32
2444*80a68eefSBob Badour #   define _MM_MIN_EPU16 _mm_min_epu16
2445*80a68eefSBob Badour #   define _MM_MIN_EPU32 _mm_min_epu32
2446*80a68eefSBob Badour 
2447*80a68eefSBob Badour #   define _MM_BLENDV_EPI8 _mm_blendv_epi8
2448*80a68eefSBob Badour #   define _MM_PACKUS_EPI32 _mm_packus_epi32
2449*80a68eefSBob Badour #   define _MM_PACKUS1_EPI32(a) _mm_packus_epi32(a, a)
2450*80a68eefSBob Badour 
2451*80a68eefSBob Badour #   define _MM_MULLO_EPI32 _mm_mullo_epi32
2452*80a68eefSBob Badour #   define _MM_MUL_EPI32  _mm_mul_epi32
2453*80a68eefSBob Badour 
2454*80a68eefSBob Badour #   define _MM_CMPEQ_EPI64 _mm_cmpeq_epi64
2455*80a68eefSBob Badour #else     //no SSE4 !!!!!!
2456*80a68eefSBob Badour     _NEON2SSE_INLINE __m128i _MM_CVTEPU8_EPI16(__m128i a)
2457*80a68eefSBob Badour     {
2458*80a68eefSBob Badour         __m128i zero = _mm_setzero_si128();
2459*80a68eefSBob Badour         return _mm_unpacklo_epi8(a, zero);
2460*80a68eefSBob Badour     }
2461*80a68eefSBob Badour 
2462*80a68eefSBob Badour     _NEON2SSE_INLINE __m128i _MM_CVTEPU16_EPI32(__m128i a)
2463*80a68eefSBob Badour     {
2464*80a68eefSBob Badour         __m128i zero = _mm_setzero_si128();
2465*80a68eefSBob Badour         return _mm_unpacklo_epi16(a, zero);
2466*80a68eefSBob Badour     }
2467*80a68eefSBob Badour 
2468*80a68eefSBob Badour     _NEON2SSE_INLINE __m128i _MM_CVTEPU32_EPI64(__m128i a)
2469*80a68eefSBob Badour     {
2470*80a68eefSBob Badour         __m128i zero = _mm_setzero_si128();
2471*80a68eefSBob Badour         return _mm_unpacklo_epi32(a, zero);
2472*80a68eefSBob Badour     }
2473*80a68eefSBob Badour 
2474*80a68eefSBob Badour     _NEON2SSE_INLINE __m128i _MM_CVTEPI8_EPI16(__m128i a)
2475*80a68eefSBob Badour     {
2476*80a68eefSBob Badour         __m128i zero = _mm_setzero_si128();
2477*80a68eefSBob Badour         __m128i sign = _mm_cmpgt_epi8(zero, a);
2478*80a68eefSBob Badour         return _mm_unpacklo_epi8(a, sign);
2479*80a68eefSBob Badour     }
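    // Worked example of the sign-extension trick above (illustrative): a byte lane holding -3 (0xFD)
    // makes _mm_cmpgt_epi8(zero, a) produce 0xFF for that lane, and interleaving the data byte with
    // this "sign" byte yields the 16-bit value 0xFFFD == -3; a non-negative byte gets a 0x00 sign
    // byte and is simply zero-extended.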
2480*80a68eefSBob Badour 
2481*80a68eefSBob Badour     _NEON2SSE_INLINE __m128i _MM_CVTEPI16_EPI32(__m128i a)
2482*80a68eefSBob Badour     {
2483*80a68eefSBob Badour         __m128i zero = _mm_setzero_si128();
2484*80a68eefSBob Badour         __m128i sign = _mm_cmpgt_epi16(zero, a);
2485*80a68eefSBob Badour         return _mm_unpacklo_epi16(a, sign);
2486*80a68eefSBob Badour     }
2487*80a68eefSBob Badour 
2488*80a68eefSBob Badour     _NEON2SSE_INLINE __m128i _MM_CVTEPI32_EPI64(__m128i a)
2489*80a68eefSBob Badour     {
2490*80a68eefSBob Badour         __m128i zero = _mm_setzero_si128();
2491*80a68eefSBob Badour         __m128i sign = _mm_cmpgt_epi32(zero, a);
2492*80a68eefSBob Badour         return _mm_unpacklo_epi32(a, sign);
2493*80a68eefSBob Badour     }
2494*80a68eefSBob Badour 
2495*80a68eefSBob Badour     _NEON2SSE_INLINE int _MM_EXTRACT_EPI32(__m128i vec, const int LANE)
2496*80a68eefSBob Badour     {
2497*80a68eefSBob Badour         _NEON2SSE_ALIGN_16 int32_t tmp[4];
2498*80a68eefSBob Badour         _mm_store_si128((__m128i*)tmp, vec);
2499*80a68eefSBob Badour         return tmp[LANE];
2500*80a68eefSBob Badour     }
2501*80a68eefSBob Badour 
2502*80a68eefSBob Badour     _NEON2SSE_INLINE int _MM_EXTRACT_EPI8(__m128i vec, const int LANE)
2503*80a68eefSBob Badour     {
2504*80a68eefSBob Badour         _NEON2SSE_ALIGN_16 int8_t tmp[16];
2505*80a68eefSBob Badour         _mm_store_si128((__m128i*)tmp, vec);
2506*80a68eefSBob Badour         return (int)tmp[LANE];
2507*80a68eefSBob Badour     }
2508*80a68eefSBob Badour 
2509*80a68eefSBob Badour     _NEON2SSE_INLINE int _MM_EXTRACT_PS(__m128 vec, const int LANE)
2510*80a68eefSBob Badour     {
2511*80a68eefSBob Badour         _NEON2SSE_ALIGN_16 int32_t tmp[4];
2512*80a68eefSBob Badour         _mm_store_si128((__m128i*)tmp, _M128i(vec));
2513*80a68eefSBob Badour         return tmp[LANE];
2514*80a68eefSBob Badour     }
2515*80a68eefSBob Badour 
2516*80a68eefSBob Badour     _NEON2SSE_INLINE __m128i  _MM_INSERT_EPI32(__m128i vec, int p, const int LANE)
2517*80a68eefSBob Badour     {
2518*80a68eefSBob Badour         _NEON2SSE_ALIGN_16 int32_t pvec[4] = {0,0,0,0};
2519*80a68eefSBob Badour         _NEON2SSE_ALIGN_16 uint32_t mask[4] = {0xffffffff,0xffffffff,0xffffffff,0xffffffff};
2520*80a68eefSBob Badour         __m128i vec_masked, p_masked;
2521*80a68eefSBob Badour         pvec[LANE] = p;
2522*80a68eefSBob Badour         mask[LANE] = 0x0;
2523*80a68eefSBob Badour         vec_masked = _mm_and_si128 (*(__m128i*)mask,vec); //ready for p
2524*80a68eefSBob Badour         p_masked = _mm_andnot_si128 (*(__m128i*)mask,*(__m128i*)pvec); //ready for vec
2525*80a68eefSBob Badour         return _mm_or_si128(vec_masked, p_masked);
2526*80a68eefSBob Badour     }
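    // Worked example (illustrative): inserting p = 7 into lane 2 builds pvec = {0,0,7,0} and
    // mask = {~0,~0,0,~0}; the AND keeps the three untouched lanes of vec, the ANDNOT isolates the
    // new value, and the OR merges them - the same result SSE4.1's _mm_insert_epi32(vec, 7, 2)
    // returns in a single instruction.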
2527*80a68eefSBob Badour 
2528*80a68eefSBob Badour     _NEON2SSE_INLINE __m128i  _MM_INSERT_EPI8(__m128i vec, int p, const int LANE)
2529*80a68eefSBob Badour     {
2530*80a68eefSBob Badour         _NEON2SSE_ALIGN_16 int8_t pvec[16] = {0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0};
2531*80a68eefSBob Badour         _NEON2SSE_ALIGN_16 uint8_t mask[16] = {0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff};
2532*80a68eefSBob Badour         __m128i vec_masked, p_masked;
2533*80a68eefSBob Badour         pvec[LANE] = (int8_t)p;
2534*80a68eefSBob Badour         mask[LANE] = 0x0;
2535*80a68eefSBob Badour         vec_masked = _mm_and_si128 (*(__m128i*)mask,vec); //ready for p
2536*80a68eefSBob Badour         p_masked = _mm_andnot_si128  (*(__m128i*)mask,*(__m128i*)pvec); //ready for vec
2537*80a68eefSBob Badour         return _mm_or_si128(vec_masked, p_masked);
2538*80a68eefSBob Badour     }
2539*80a68eefSBob Badour 
2540*80a68eefSBob Badour     _NEON2SSE_INLINE __m128 _MM_INSERT_PS(__m128 vec, __m128 p, const int LANE)
2541*80a68eefSBob Badour     {
2542*80a68eefSBob Badour         _NEON2SSE_ALIGN_16 uint32_t mask[4] = {0xffffffff,0xffffffff,0xffffffff,0xffffffff};
2543*80a68eefSBob Badour         __m128 tmp, vec_masked, p_masked;
2544*80a68eefSBob Badour         mask[LANE >> 4] = 0x0; //here LANE is the _mm_insert_ps immediate (lane << 4), not the actual lane number, so it is decoded with >> 4
2545*80a68eefSBob Badour         vec_masked = _mm_and_ps (*(__m128*)mask,vec); //ready for p
2546*80a68eefSBob Badour         p_masked = _mm_andnot_ps (*(__m128*)mask, p); //ready for vec
2547*80a68eefSBob Badour         tmp = _mm_or_ps(vec_masked, p_masked);
2548*80a68eefSBob Badour         return tmp;
2549*80a68eefSBob Badour     }
2550*80a68eefSBob Badour 
2551*80a68eefSBob Badour     _NEON2SSE_INLINE __m128i _MM_MAX_EPI8(__m128i a, __m128i b)
2552*80a68eefSBob Badour     {
2553*80a68eefSBob Badour         __m128i cmp, resa, resb;
2554*80a68eefSBob Badour         cmp = _mm_cmpgt_epi8 (a, b);
2555*80a68eefSBob Badour         resa = _mm_and_si128 (cmp, a);
2556*80a68eefSBob Badour         resb = _mm_andnot_si128 (cmp,b);
2557*80a68eefSBob Badour         return _mm_or_si128(resa, resb);
2558*80a68eefSBob Badour     }
2559*80a68eefSBob Badour 
2560*80a68eefSBob Badour     _NEON2SSE_INLINE __m128i _MM_MAX_EPI32(__m128i a, __m128i b)
2561*80a68eefSBob Badour     {
2562*80a68eefSBob Badour         __m128i cmp, resa, resb;
2563*80a68eefSBob Badour         cmp = _mm_cmpgt_epi32(a, b);
2564*80a68eefSBob Badour         resa = _mm_and_si128 (cmp, a);
2565*80a68eefSBob Badour         resb = _mm_andnot_si128 (cmp,b);
2566*80a68eefSBob Badour         return _mm_or_si128(resa, resb);
2567*80a68eefSBob Badour     }
2568*80a68eefSBob Badour 
2569*80a68eefSBob Badour     _NEON2SSE_INLINE __m128i _MM_MAX_EPU16(__m128i a, __m128i b)
2570*80a68eefSBob Badour     {
2571*80a68eefSBob Badour         __m128i c8000, b_s, a_s, cmp;
2572*80a68eefSBob Badour         c8000 = _mm_cmpeq_epi16 (a,a); //0xffff
2573*80a68eefSBob Badour         c8000 = _mm_slli_epi16 (c8000, 15); //0x8000
2574*80a68eefSBob Badour         b_s = _mm_sub_epi16 (b, c8000);
2575*80a68eefSBob Badour         a_s = _mm_sub_epi16 (a, c8000);
2576*80a68eefSBob Badour         cmp = _mm_cmpgt_epi16 (a_s, b_s); //no unsigned comparison, need to go to signed
2577*80a68eefSBob Badour         a_s = _mm_and_si128 (cmp,a);
2578*80a68eefSBob Badour         b_s = _mm_andnot_si128 (cmp,b);
2579*80a68eefSBob Badour         return _mm_or_si128(a_s, b_s);
2580*80a68eefSBob Badour     }
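    // Worked example of the bias trick above (illustrative): for a = 0xFFFF and b = 0x0001,
    // subtracting 0x8000 gives a_s = 0x7FFF and b_s = 0x8001 (-32767), so the signed compare selects
    // a = 0xFFFF - the correct unsigned maximum, which a direct signed compare of a (-1) and b (+1)
    // would have gotten wrong.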
2581*80a68eefSBob Badour 
2582*80a68eefSBob Badour     _NEON2SSE_INLINE __m128i _MM_MAX_EPU32(__m128i a, __m128i b)
2583*80a68eefSBob Badour     {
2584*80a68eefSBob Badour         __m128i c80000000, b_s, a_s, cmp;
2585*80a68eefSBob Badour         c80000000 = _mm_cmpeq_epi32 (a,a); //0xffffffff
2586*80a68eefSBob Badour         c80000000 = _mm_slli_epi32 (c80000000, 31); //0x80000000
2587*80a68eefSBob Badour         b_s = _mm_sub_epi32 (b, c80000000);
2588*80a68eefSBob Badour         a_s = _mm_sub_epi32 (a, c80000000);
2589*80a68eefSBob Badour         cmp = _mm_cmpgt_epi32 (a_s, b_s); //no unsigned comparison, need to go to signed
2590*80a68eefSBob Badour         a_s = _mm_and_si128 (cmp,a);
2591*80a68eefSBob Badour         b_s = _mm_andnot_si128 (cmp,b);
2592*80a68eefSBob Badour         return _mm_or_si128(a_s, b_s);
2593*80a68eefSBob Badour     }
2594*80a68eefSBob Badour 
2595*80a68eefSBob Badour     _NEON2SSE_INLINE __m128i _MM_MIN_EPI8(__m128i a, __m128i b)
2596*80a68eefSBob Badour     {
2597*80a68eefSBob Badour         __m128i cmp, resa, resb;
2598*80a68eefSBob Badour         cmp = _mm_cmpgt_epi8 (b, a);
2599*80a68eefSBob Badour         resa = _mm_and_si128 (cmp, a);
2600*80a68eefSBob Badour         resb = _mm_andnot_si128 (cmp,b);
2601*80a68eefSBob Badour         return _mm_or_si128(resa, resb);
2602*80a68eefSBob Badour     }
2603*80a68eefSBob Badour 
2604*80a68eefSBob Badour     _NEON2SSE_INLINE __m128i _MM_MIN_EPI32(__m128i a, __m128i b)
2605*80a68eefSBob Badour     {
2606*80a68eefSBob Badour         __m128i cmp, resa, resb;
2607*80a68eefSBob Badour         cmp = _mm_cmpgt_epi32(b, a);
2608*80a68eefSBob Badour         resa = _mm_and_si128 (cmp, a);
2609*80a68eefSBob Badour         resb = _mm_andnot_si128 (cmp,b);
2610*80a68eefSBob Badour         return _mm_or_si128(resa, resb);
2611*80a68eefSBob Badour     }
2612*80a68eefSBob Badour 
2613*80a68eefSBob Badour     _NEON2SSE_INLINE __m128i _MM_MIN_EPU16(__m128i a, __m128i b)
2614*80a68eefSBob Badour     {
2615*80a68eefSBob Badour         __m128i c8000, b_s, a_s, cmp;
2616*80a68eefSBob Badour         c8000 = _mm_cmpeq_epi16 (a,a); //0xffff
2617*80a68eefSBob Badour         c8000 = _mm_slli_epi16 (c8000, 15); //0x8000
2618*80a68eefSBob Badour         b_s = _mm_sub_epi16 (b, c8000);
2619*80a68eefSBob Badour         a_s = _mm_sub_epi16 (a, c8000);
2620*80a68eefSBob Badour         cmp = _mm_cmpgt_epi16 (b_s, a_s); //no unsigned comparison, need to go to signed
2621*80a68eefSBob Badour         a_s = _mm_and_si128 (cmp,a);
2622*80a68eefSBob Badour         b_s = _mm_andnot_si128 (cmp,b);
2623*80a68eefSBob Badour         return _mm_or_si128(a_s, b_s);
2624*80a68eefSBob Badour     }
2625*80a68eefSBob Badour 
2626*80a68eefSBob Badour     _NEON2SSE_INLINE __m128i _MM_MIN_EPU32(__m128i a, __m128i b)
2627*80a68eefSBob Badour     {
2628*80a68eefSBob Badour         __m128i c80000000, b_s, a_s, cmp;
2629*80a68eefSBob Badour         c80000000 = _mm_cmpeq_epi32 (a,a); //0xffffffff
2630*80a68eefSBob Badour         c80000000 = _mm_slli_epi32 (c80000000, 31); //0x80000000
2631*80a68eefSBob Badour         b_s = _mm_sub_epi32 (b, c80000000);
2632*80a68eefSBob Badour         a_s = _mm_sub_epi32 (a, c80000000);
2633*80a68eefSBob Badour         cmp = _mm_cmpgt_epi32 (b_s, a_s); //no unsigned comparison, need to go to signed
2634*80a68eefSBob Badour         a_s = _mm_and_si128 (cmp,a);
2635*80a68eefSBob Badour         b_s = _mm_andnot_si128 (cmp,b);
2636*80a68eefSBob Badour         return _mm_or_si128(a_s, b_s);
2637*80a68eefSBob Badour     }
2638*80a68eefSBob Badour 
2639*80a68eefSBob Badour     _NEON2SSE_INLINE __m128i  _MM_BLENDV_EPI8(__m128i a, __m128i b, __m128i mask) //this is NOT an exact implementation of _mm_blendv_epi8 - please see below
2640*80a68eefSBob Badour     {
2641*80a68eefSBob Badour         //it assumes every mask byte is either 0xff or 0 (as in all use cases below), while for the original _mm_blendv_epi8 only the MSB of each mask byte matters.
2642*80a68eefSBob Badour         __m128i a_masked, b_masked;
2643*80a68eefSBob Badour         b_masked = _mm_and_si128 (mask,b); //use b if mask 0xff
2644*80a68eefSBob Badour         a_masked = _mm_andnot_si128 (mask,a);
2645*80a68eefSBob Badour         return _mm_or_si128(a_masked, b_masked);
2646*80a68eefSBob Badour     }
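    // Usage note (illustrative): callers below always build the mask with a compare so each byte is
    // 0xFF or 0x00, e.g.
    //     __m128i cmp = _mm_cmpgt_epi8(a, b);        // 0xFF where a > b, else 0x00
    //     __m128i mx  = _MM_BLENDV_EPI8(b, a, cmp);  // per-byte max(a, b)
    // A partial mask byte such as 0x80 would make the real _mm_blendv_epi8 take the whole byte from
    // the second operand, while this emulation would return a bit-wise mix of both inputs.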
2647*80a68eefSBob Badour 
2648*80a68eefSBob Badour     _NEON2SSE_INLINE __m128i _MM_PACKUS_EPI32(__m128i a, __m128i b)
2649*80a68eefSBob Badour     {
2650*80a68eefSBob Badour         __m128i a16, b16, res, reshi,cmp, zero;
2651*80a68eefSBob Badour         zero = _mm_setzero_si128();
2652*80a68eefSBob Badour         a16 = _mm_shuffle_epi8 (a, *(__m128i*) mask8_32_even_odd);
2653*80a68eefSBob Badour         b16 = _mm_shuffle_epi8 (b, *(__m128i*) mask8_32_even_odd);
2654*80a68eefSBob Badour         res = _mm_unpacklo_epi64(a16, b16); //result without saturation
2655*80a68eefSBob Badour         reshi = _mm_unpackhi_epi64(a16, b16); //hi part of result used for saturation
2656*80a68eefSBob Badour         cmp = _mm_cmpgt_epi16(zero, reshi); //all ones where the high half is negative, i.e. the result should be zero
2657*80a68eefSBob Badour         res = _mm_andnot_si128(cmp,res); //zero those lanes, leave the rest unchanged
2658*80a68eefSBob Badour         cmp = _mm_cmpgt_epi16(reshi,zero); //all ones where the high half is positive, i.e. the value is out of 16-bit range
2659*80a68eefSBob Badour         return _mm_or_si128(res, cmp); //saturate those lanes to 0xffff
2660*80a68eefSBob Badour     }
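    // Worked example (illustrative): packing the signed 32-bit lanes {-5, 1000, 70000, 3} of one
    // input to unsigned 16 bits should give {0, 1000, 65535, 3}. After the byte shuffle their low
    // halves are {0xFFFB, 0x03E8, 0x1170, 0x0003} and their high halves {0xFFFF, 0, 1, 0}; the first
    // compare zeroes the lane with a negative high half (-5 -> 0) and the second saturates the lane
    // with a positive high half (70000 -> 0xFFFF), matching _mm_packus_epi32 for those lanes.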
2661*80a68eefSBob Badour 
2662*80a68eefSBob Badour     _NEON2SSE_INLINE __m128i _MM_PACKUS1_EPI32(__m128i a)
2663*80a68eefSBob Badour     {
2664*80a68eefSBob Badour         __m128i a16, res, reshi,cmp, zero;
2665*80a68eefSBob Badour         zero = _mm_setzero_si128();
2666*80a68eefSBob Badour         a16 = _mm_shuffle_epi8 (a, *(__m128i*)mask8_32_even_odd);
2667*80a68eefSBob Badour         reshi = _mm_unpackhi_epi64(a16, a16); //hi part of result used for saturation
2668*80a68eefSBob Badour         cmp = _mm_cmpgt_epi16(zero, reshi); //all ones where the high half is negative, i.e. the result should be zero
2669*80a68eefSBob Badour         res = _mm_andnot_si128(cmp, a16); //zero those lanes, leave the rest unchanged
2670*80a68eefSBob Badour         cmp = _mm_cmpgt_epi16(reshi,zero); //all ones where the high half is positive, i.e. the value is out of 16-bit range
2671*80a68eefSBob Badour         return _mm_or_si128(res, cmp); //saturate those lanes to 0xffff
2672*80a68eefSBob Badour     }
2673*80a68eefSBob Badour 
2674*80a68eefSBob Badour 
2675*80a68eefSBob Badour     _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(__m128i _MM_MULLO_EPI32(__m128i a, __m128i b), _NEON2SSE_REASON_SLOW_SERIAL)
2676*80a68eefSBob Badour     {
2677*80a68eefSBob Badour         _NEON2SSE_ALIGN_16 int32_t atmp[4], btmp[4], res[4];
2678*80a68eefSBob Badour         int64_t res64;
2679*80a68eefSBob Badour         int i;
2680*80a68eefSBob Badour         _mm_store_si128((__m128i*)atmp, a);
2681*80a68eefSBob Badour         _mm_store_si128((__m128i*)btmp, b);
2682*80a68eefSBob Badour         for (i = 0; i<4; i++) {
2683*80a68eefSBob Badour             res64 = (int64_t)atmp[i] * btmp[i]; //widen before multiplying to avoid signed 32-bit overflow
2684*80a68eefSBob Badour             res[i] = (int)(res64 & 0xffffffff);
2685*80a68eefSBob Badour         }
2686*80a68eefSBob Badour         return _mm_load_si128((__m128i*)res);
2687*80a68eefSBob Badour     }
2688*80a68eefSBob Badour 
2689*80a68eefSBob Badour     _NEON2SSE_INLINE __m128i _MM_MUL_EPI32(__m128i a, __m128i b)
2690*80a68eefSBob Badour     {
2691*80a68eefSBob Badour         __m128i sign, zero,  mul_us, a_neg, b_neg, mul_us_neg;
2692*80a68eefSBob Badour         sign = _mm_xor_si128 (a, b);
2693*80a68eefSBob Badour         sign =  _mm_srai_epi32 (sign, 31); //promote the sign bit to all fields: all ones if negative, all 0 if positive
2694*80a68eefSBob Badour         sign = _mm_shuffle_epi32(sign, _MM_SHUFFLE(2, 2, 0, 0)); //promote the sign bit to the 3rd and 1st data lanes
2695*80a68eefSBob Badour         zero = _mm_setzero_si128();
2696*80a68eefSBob Badour         a_neg = _mm_abs_epi32 (a); //absolute value of a
2697*80a68eefSBob Badour         b_neg = _mm_abs_epi32 (b); //absolute value of b
2698*80a68eefSBob Badour         mul_us = _mm_mul_epu32 (a_neg, b_neg); //uses the 0th and 2nd data lanes (absolute values); the multiplication gives a 64-bit result
2699*80a68eefSBob Badour         mul_us_neg = _mm_sub_epi64(zero, mul_us);
2700*80a68eefSBob Badour         mul_us_neg = _mm_and_si128(sign, mul_us_neg);
2701*80a68eefSBob Badour         mul_us = _mm_andnot_si128(sign, mul_us);
2702*80a68eefSBob Badour         return _mm_or_si128 (mul_us, mul_us_neg);
2703*80a68eefSBob Badour     }
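    // Worked example (illustrative): with lane 0 of a holding -3 and lane 0 of b holding 7, the sign
    // is taken from a XOR b (negative), the unsigned multiply of the absolute values gives 21, and
    // the final select returns 0 - 21 = -21 as the 64-bit product - the same value SSE4.1's
    // _mm_mul_epi32 produces. As in the original, only lanes 0 and 2 contribute.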
2704*80a68eefSBob Badour 
2705*80a68eefSBob Badour     _NEON2SSE_INLINE __m128i _MM_CMPEQ_EPI64(__m128i a, __m128i b)
2706*80a68eefSBob Badour     {
2707*80a68eefSBob Badour         __m128i res;
2708*80a68eefSBob Badour         res = _mm_cmpeq_epi32 (a, b);
2709*80a68eefSBob Badour         return _mm_shuffle_epi32 (res, 1 | (1 << 2) | (3 << 4) | (3 << 6)); //copy the information from hi to low part of the 64 bit data
2710*80a68eefSBob Badour     }
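    // Note (illustrative, not in the original header): _mm_cmpeq_epi32 compares the two 32-bit halves
    // of each 64-bit lane separately, and the shuffle broadcasts the high-half comparison across the
    // whole lane. Fully equal lanes therefore give all ones and lanes differing in the high 32 bits
    // give zero; a difference confined to the low 32 bits alone is not detected by this emulation.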
2711*80a68eefSBob Badour #endif     //SSE4
2712*80a68eefSBob Badour 
2713*80a68eefSBob Badour //the special case: fallback versions used for 32-bit builds or when SSE4 is not available
2714*80a68eefSBob Badour _NEON2SSE_INLINE __m128i  _MM_INSERT_EPI64_32(__m128i vec, int p, const int LANE)
2715*80a68eefSBob Badour {
2716*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 uint64_t pvec[2] = {0,0};
2717*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 uint64_t mask[2] = {0xffffffffffffffff, 0xffffffffffffffff};
2718*80a68eefSBob Badour     __m128i vec_masked, p_masked;
2719*80a68eefSBob Badour     pvec[LANE] = p;
2720*80a68eefSBob Badour     mask[LANE] = 0x0;
2721*80a68eefSBob Badour     vec_masked = _mm_and_si128 (*(__m128i*)mask,vec); //ready for p
2722*80a68eefSBob Badour     p_masked = _mm_andnot_si128 (*(__m128i*)mask,*(__m128i*)pvec); //ready for vec
2723*80a68eefSBob Badour     return _mm_or_si128(vec_masked, p_masked);
2724*80a68eefSBob Badour }
2725*80a68eefSBob Badour 
2726*80a68eefSBob Badour _NEON2SSE_INLINE int64_t _MM_EXTRACT_EPI64_32(__m128i val, const int LANE)
2727*80a68eefSBob Badour {
2728*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 int64_t tmp[2];
2729*80a68eefSBob Badour     _mm_store_si128((__m128i*)tmp, val);
2730*80a68eefSBob Badour     return tmp[LANE];
2731*80a68eefSBob Badour }
2732*80a68eefSBob Badour 
2733*80a68eefSBob Badour #ifndef _NEON2SSE_64BIT_SSE4
2734*80a68eefSBob Badour #   define _MM_INSERT_EPI64 _MM_INSERT_EPI64_32
2735*80a68eefSBob Badour #   define _MM_EXTRACT_EPI64 _MM_EXTRACT_EPI64_32
2736*80a68eefSBob Badour #endif
2737*80a68eefSBob Badour 
2738*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t  vqd_s32(int32x4_t a); //Doubling saturation for signed ints
2739*80a68eefSBob Badour _NEON2SSE_INLINE int32x4_t  vqd_s32(int32x4_t a)
2740*80a68eefSBob Badour {
2741*80a68eefSBob Badour     //Overflow happens only if a and the doubled result have opposite signs
2742*80a68eefSBob Badour     __m128i c7fffffff, res, res_sat, res_xor_a;
2743*80a68eefSBob Badour     c7fffffff = _mm_set1_epi32(0x7fffffff);
2744*80a68eefSBob Badour     res = _mm_slli_epi32 (a, 1); // res = a*2
2745*80a68eefSBob Badour     res_sat = _mm_srli_epi32(a, 31); //1 for negative a, 0 otherwise
2746*80a68eefSBob Badour     res_sat = _mm_add_epi32(res_sat, c7fffffff); //saturation value: 0x7fffffff for a >= 0, 0x80000000 for a < 0
2747*80a68eefSBob Badour     res_xor_a = _mm_xor_si128(res, a);
2748*80a68eefSBob Badour     res_xor_a = _mm_srai_epi32(res_xor_a,31); //propagate the sign bit: all ones if res and a have different signs (overflow), all zeros otherwise
2749*80a68eefSBob Badour     res_sat = _mm_and_si128(res_xor_a, res_sat);
2750*80a68eefSBob Badour     res = _mm_andnot_si128(res_xor_a, res);
2751*80a68eefSBob Badour     return _mm_or_si128(res, res_sat);
2752*80a68eefSBob Badour }
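// Worked example (illustrative): a lane holding 0x60000000 doubles to 0xC0000000, whose sign differs
// from a, so the lane is replaced by the saturation value 0x7FFFFFFF (a was positive); a lane holding
// -0x60000000 doubles to 0x40000000, again flipping sign, and saturates to 0x80000000 instead.
// Lanes whose doubled value keeps the sign of a are returned as plain 2*a.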
2753*80a68eefSBob Badour 
2754*80a68eefSBob Badour 
2755*80a68eefSBob Badour //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
2756*80a68eefSBob Badour //*************************************************************************
2757*80a68eefSBob Badour //*************************************************************************
2758*80a68eefSBob Badour //*****************  Functions redefinition/implementation starts here *****
2759*80a68eefSBob Badour //*************************************************************************
2760*80a68eefSBob Badour //*************************************************************************
2761*80a68eefSBob Badour //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
2762*80a68eefSBob Badour 
2763*80a68eefSBob Badour /*If a unified intrinsics solution is needed, please define your SSE intrinsics wrappers here, as in the following sample:
2764*80a68eefSBob Badour #ifdef ARM
2765*80a68eefSBob Badour #define vector_addq_s32 vaddq_s32
2766*80a68eefSBob Badour #else //if we have IA
2767*80a68eefSBob Badour #define vector_addq_s32 _mm_add_epi32
2768*80a68eefSBob Badour #endif
2769*80a68eefSBob Badour 
2770*80a68eefSBob Badour ********************************************************************************************
2771*80a68eefSBob Badour Functions below are organised in the following way:
2772*80a68eefSBob Badour 
2773*80a68eefSBob Badour Each NEON intrinsic function has one of the following options:
2774*80a68eefSBob Badour 1.  a fully equivalent x86 SSE intrinsic - in this case the x86 version simply follows the NEON declaration under the corresponding #define statement
2775*80a68eefSBob Badour 2.  an x86 implementation using more than one x86 intrinsic. In this case it is shaped as an inlined C function with a return statement
2776*80a68eefSBob Badour 3.  a reference to another NEON function that returns the same result and is implemented for x86 as above. In this case it is shaped as a matching NEON function definition
2777*80a68eefSBob Badour 4.  for about 5% of the functions, where the corresponding x86 SIMD support is unavailable or too inefficient in terms of performance,
2778*80a68eefSBob Badour the serial implementation is provided along with the corresponding compiler warning. If these functions are on your app's critical path
2779*80a68eefSBob Badour - please consider removing them from your code.
2780*80a68eefSBob Badour */
2781*80a68eefSBob Badour 
2782*80a68eefSBob Badour //***********************************************************************
2783*80a68eefSBob Badour //************************      Vector add   *****************************
2784*80a68eefSBob Badour //***********************************************************************
2785*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vadd_s8(int8x8_t a, int8x8_t b); // VADD.I8 d0,d0,d0
2786*80a68eefSBob Badour _NEON2SSE_INLINE int8x8_t vadd_s8(int8x8_t a, int8x8_t b)
2787*80a68eefSBob Badour {
2788*80a68eefSBob Badour     int8x8_t res64;
2789*80a68eefSBob Badour     return64(_mm_add_epi8(_pM128i(a),_pM128i(b)));
2790*80a68eefSBob Badour }
2791*80a68eefSBob Badour 
2792*80a68eefSBob Badour 
2793*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vadd_s16(int16x4_t a, int16x4_t b); // VADD.I16 d0,d0,d0
2794*80a68eefSBob Badour _NEON2SSE_INLINE int16x4_t vadd_s16(int16x4_t a, int16x4_t b)
2795*80a68eefSBob Badour {
2796*80a68eefSBob Badour     int16x4_t res64;
2797*80a68eefSBob Badour     return64(_mm_add_epi16(_pM128i(a),_pM128i(b)));
2798*80a68eefSBob Badour }
2799*80a68eefSBob Badour 
2800*80a68eefSBob Badour 
2801*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vadd_s32(int32x2_t a, int32x2_t b); // VADD.I32 d0,d0,d0
2802*80a68eefSBob Badour _NEON2SSE_INLINE int32x2_t vadd_s32(int32x2_t a, int32x2_t b)
2803*80a68eefSBob Badour {
2804*80a68eefSBob Badour     int32x2_t res64;
2805*80a68eefSBob Badour     return64(_mm_add_epi32(_pM128i(a),_pM128i(b)));
2806*80a68eefSBob Badour }
2807*80a68eefSBob Badour 
2808*80a68eefSBob Badour 
2809*80a68eefSBob Badour _NEON2SSESTORAGE int64x1_t  vadd_s64(int64x1_t a,  int64x1_t b); // VADD.I64 d0,d0,d0
2810*80a68eefSBob Badour _NEON2SSE_INLINE int64x1_t  vadd_s64(int64x1_t a,  int64x1_t b)
2811*80a68eefSBob Badour {
2812*80a68eefSBob Badour     int64x1_t res64;
2813*80a68eefSBob Badour     res64.m64_i64[0] = a.m64_i64[0] + b.m64_i64[0];
2814*80a68eefSBob Badour     return res64;
2815*80a68eefSBob Badour }
2816*80a68eefSBob Badour 
2817*80a68eefSBob Badour 
2818*80a68eefSBob Badour _NEON2SSESTORAGE float32x2_t vadd_f32(float32x2_t a, float32x2_t b); // VADD.F32 d0,d0,d0
2819*80a68eefSBob Badour _NEON2SSE_INLINE float32x2_t vadd_f32(float32x2_t a, float32x2_t b)
2820*80a68eefSBob Badour {
2821*80a68eefSBob Badour     __m128 res;
2822*80a68eefSBob Badour     __m64_128 res64;
2823*80a68eefSBob Badour     res = _mm_add_ps(_pM128(a),_pM128(b)); //SSE, use only low 64 bits
2824*80a68eefSBob Badour     _M64f(res64, res);
2825*80a68eefSBob Badour     return res64;
2826*80a68eefSBob Badour }
2827*80a68eefSBob Badour 
2828*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t  vadd_u8(uint8x8_t a, uint8x8_t b); // VADD.I8 d0,d0,d0
2829*80a68eefSBob Badour #define vadd_u8 vadd_s8
2830*80a68eefSBob Badour 
2831*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t  vadd_u16(uint16x4_t a, uint16x4_t b); // VADD.I16 d0,d0,d0
2832*80a68eefSBob Badour #define vadd_u16 vadd_s16
2833*80a68eefSBob Badour 
2834*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t  vadd_u32(uint32x2_t a, uint32x2_t b); // VADD.I32 d0,d0,d0
2835*80a68eefSBob Badour #define vadd_u32 vadd_s32
2836*80a68eefSBob Badour 
2837*80a68eefSBob Badour _NEON2SSESTORAGE uint64x1_t vadd_u64(uint64x1_t a,  uint64x1_t b); // VADD.I64 d0,d0,d0
2838*80a68eefSBob Badour _NEON2SSE_INLINE uint64x1_t vadd_u64(uint64x1_t a,  uint64x1_t b)
2839*80a68eefSBob Badour {
2840*80a68eefSBob Badour     uint64x1_t res64;
2841*80a68eefSBob Badour     res64.m64_u64[0] = a.m64_u64[0] + b.m64_u64[0];
2842*80a68eefSBob Badour     return res64;
2843*80a68eefSBob Badour }
2844*80a68eefSBob Badour 
2845*80a68eefSBob Badour 
2846*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t   vaddq_s8(int8x16_t a, int8x16_t b); // VADD.I8 q0,q0,q0
2847*80a68eefSBob Badour #define vaddq_s8 _mm_add_epi8
2848*80a68eefSBob Badour 
2849*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t   vaddq_s16(int16x8_t a, int16x8_t b); // VADD.I16 q0,q0,q0
2850*80a68eefSBob Badour #define vaddq_s16 _mm_add_epi16
2851*80a68eefSBob Badour 
2852*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t   vaddq_s32(int32x4_t a, int32x4_t b); // VADD.I32 q0,q0,q0
2853*80a68eefSBob Badour #define vaddq_s32 _mm_add_epi32
2854*80a68eefSBob Badour 
2855*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t   vaddq_s64(int64x2_t a, int64x2_t b); // VADD.I64 q0,q0,q0
2856*80a68eefSBob Badour #define vaddq_s64 _mm_add_epi64
2857*80a68eefSBob Badour 
2858*80a68eefSBob Badour _NEON2SSESTORAGE float32x4_t vaddq_f32(float32x4_t a, float32x4_t b); // VADD.F32 q0,q0,q0
2859*80a68eefSBob Badour #define vaddq_f32 _mm_add_ps
2860*80a68eefSBob Badour 
2861*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t   vaddq_u8(uint8x16_t a, uint8x16_t b); // VADD.I8 q0,q0,q0
2862*80a68eefSBob Badour #define vaddq_u8 _mm_add_epi8
2863*80a68eefSBob Badour 
2864*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t   vaddq_u16(uint16x8_t a, uint16x8_t b); // VADD.I16 q0,q0,q0
2865*80a68eefSBob Badour #define vaddq_u16 _mm_add_epi16
2866*80a68eefSBob Badour 
2867*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t   vaddq_u32(uint32x4_t a, uint32x4_t b); // VADD.I32 q0,q0,q0
2868*80a68eefSBob Badour #define vaddq_u32 _mm_add_epi32
2869*80a68eefSBob Badour 
2870*80a68eefSBob Badour _NEON2SSESTORAGE uint64x2_t   vaddq_u64(uint64x2_t a, uint64x2_t b); // VADD.I64 q0,q0,q0
2871*80a68eefSBob Badour #define vaddq_u64 _mm_add_epi64
2872*80a68eefSBob Badour 
2873*80a68eefSBob Badour //**************************** Vector long add *****************************:
2874*80a68eefSBob Badour //***********************************************************************
2875*80a68eefSBob Badour //Va, Vb have equal lane sizes, result is a 128 bit vector of lanes that are twice the width.
2876*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t  vaddl_s8(int8x8_t a, int8x8_t b); // VADDL.S8 q0,d0,d0
2877*80a68eefSBob Badour _NEON2SSE_INLINE int16x8_t  vaddl_s8(int8x8_t a, int8x8_t b) // VADDL.S8 q0,d0,d0
2878*80a68eefSBob Badour {
2879*80a68eefSBob Badour     __m128i a16, b16;
2880*80a68eefSBob Badour     a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE4.1,
2881*80a68eefSBob Badour     b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1,
2882*80a68eefSBob Badour     return _mm_add_epi16 (a16, b16);
2883*80a68eefSBob Badour }
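
//Editor's illustration (hypothetical helper, not part of the original header): a scalar model of what
//vaddl_s8 above computes per lane - both operands are widened to 16 bits before the add, so the sum
//can never overflow the result lane.
_NEON2SSE_INLINE int16_t _neon2sse_example_addl_s8_lane(int8_t x, int8_t y)
{
    return (int16_t)((int16_t)x + (int16_t)y); //widen first, then add
}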
2884*80a68eefSBob Badour 
2885*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t  vaddl_s16(int16x4_t a, int16x4_t b); // VADDL.S16 q0,d0,d0
2886*80a68eefSBob Badour _NEON2SSE_INLINE int32x4_t  vaddl_s16(int16x4_t a, int16x4_t b) // VADDL.S16 q0,d0,d0
2887*80a68eefSBob Badour {
2888*80a68eefSBob Badour     __m128i a32, b32;
2889*80a68eefSBob Badour     a32 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE4.1
2890*80a68eefSBob Badour     b32 = _MM_CVTEPI16_EPI32 (_pM128i(b)); //SSE4.1
2891*80a68eefSBob Badour     return _mm_add_epi32 (a32, b32);
2892*80a68eefSBob Badour }
2893*80a68eefSBob Badour 
2894*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t  vaddl_s32(int32x2_t a, int32x2_t b); // VADDL.S32 q0,d0,d0
2895*80a68eefSBob Badour _NEON2SSE_INLINE int64x2_t  vaddl_s32(int32x2_t a, int32x2_t b) // VADDL.S32 q0,d0,d0
2896*80a68eefSBob Badour {
2897*80a68eefSBob Badour     //may not be optimal
2898*80a68eefSBob Badour     __m128i a64, b64;
2899*80a68eefSBob Badour     a64 = _MM_CVTEPI32_EPI64 (_pM128i(a)); //SSE4.1
2900*80a68eefSBob Badour     b64 = _MM_CVTEPI32_EPI64 (_pM128i(b)); //SSE4.1
2901*80a68eefSBob Badour     return _mm_add_epi64 ( a64, b64);
2902*80a68eefSBob Badour }
2903*80a68eefSBob Badour 
2904*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vaddl_u8(uint8x8_t a, uint8x8_t b); // VADDL.U8 q0,d0,d0
2905*80a68eefSBob Badour _NEON2SSE_INLINE uint16x8_t vaddl_u8(uint8x8_t a, uint8x8_t b) // VADDL.U8 q0,d0,d0
2906*80a68eefSBob Badour {
2907*80a68eefSBob Badour     __m128i a16, b16;
2908*80a68eefSBob Badour     a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE4.1
2909*80a68eefSBob Badour     b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1
2910*80a68eefSBob Badour     return _mm_add_epi16 (a16, b16);
2911*80a68eefSBob Badour }
2912*80a68eefSBob Badour 
2913*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vaddl_u16(uint16x4_t a, uint16x4_t b); // VADDL.s16 q0,d0,d0
2914*80a68eefSBob Badour _NEON2SSE_INLINE uint32x4_t vaddl_u16(uint16x4_t a, uint16x4_t b) // VADDL.s16 q0,d0,d0
2915*80a68eefSBob Badour {
2916*80a68eefSBob Badour     __m128i a32, b32;
2917*80a68eefSBob Badour     a32 = _MM_CVTEPU16_EPI32 (_pM128i(a)); //SSE4.1
2918*80a68eefSBob Badour     b32 = _MM_CVTEPU16_EPI32 (_pM128i(b)); //SSE4.1
2919*80a68eefSBob Badour     return _mm_add_epi32 (a32, b32);
2920*80a68eefSBob Badour }
2921*80a68eefSBob Badour 
2922*80a68eefSBob Badour _NEON2SSESTORAGE uint64x2_t vaddl_u32(uint32x2_t a, uint32x2_t b); // VADDL.U32 q0,d0,d0
2923*80a68eefSBob Badour _NEON2SSE_INLINE uint64x2_t vaddl_u32(uint32x2_t a, uint32x2_t b) // VADDL.U32 q0,d0,d0
2924*80a68eefSBob Badour {
2925*80a68eefSBob Badour     //may not be optimal
2926*80a68eefSBob Badour     __m128i a64, b64;
2927*80a68eefSBob Badour     a64 = _MM_CVTEPU32_EPI64 (_pM128i(a)); //SSE4.1
2928*80a68eefSBob Badour     b64 = _MM_CVTEPU32_EPI64 (_pM128i(b)); //SSE4.1
2929*80a68eefSBob Badour     return _mm_add_epi64 (a64, b64);
2930*80a68eefSBob Badour }
2931*80a68eefSBob Badour 
2932*80a68eefSBob Badour //***************   Vector wide add: vaddw_<type>. Vr[i]:=Va[i]+Vb[i] ******************
2933*80a68eefSBob Badour //*************** *********************************************************************
2934*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t  vaddw_s8(int16x8_t a, int8x8_t b); // VADDW.S8 q0,q0,d0
2935*80a68eefSBob Badour _NEON2SSE_INLINE int16x8_t  vaddw_s8(int16x8_t a, int8x8_t b) // VADDW.S8 q0,q0,d0
2936*80a68eefSBob Badour {
2937*80a68eefSBob Badour     __m128i b16;
2938*80a68eefSBob Badour     b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1,
2939*80a68eefSBob Badour     return _mm_add_epi16 (a, b16);
2940*80a68eefSBob Badour }
2941*80a68eefSBob Badour 
2942*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t  vaddw_s16(int32x4_t a, int16x4_t b); // VADDW.S16 q0,q0,d0
2943*80a68eefSBob Badour _NEON2SSE_INLINE int32x4_t  vaddw_s16(int32x4_t a, int16x4_t b) // VADDW.S16 q0,q0,d0
2944*80a68eefSBob Badour {
2945*80a68eefSBob Badour     __m128i b32;
2946*80a68eefSBob Badour     b32 =  _MM_CVTEPI16_EPI32(_pM128i(b)); //SSE4.1,
2947*80a68eefSBob Badour     return _mm_add_epi32 (a, b32);
2948*80a68eefSBob Badour }
2949*80a68eefSBob Badour 
2950*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t  vaddw_s32(int64x2_t a, int32x2_t b); // VADDW.S32 q0,q0,d0
2951*80a68eefSBob Badour _NEON2SSE_INLINE int64x2_t  vaddw_s32(int64x2_t a, int32x2_t b) // VADDW.S32 q0,q0,d0
2952*80a68eefSBob Badour {
2953*80a68eefSBob Badour     __m128i b64;
2954*80a68eefSBob Badour     b64 = _MM_CVTEPI32_EPI64 (_pM128i(b)); //SSE4.1
2955*80a68eefSBob Badour     return _mm_add_epi64 (a, b64);
2956*80a68eefSBob Badour }
2957*80a68eefSBob Badour 
2958*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vaddw_u8(uint16x8_t a, uint8x8_t b); // VADDW.U8 q0,q0,d0
2959*80a68eefSBob Badour _NEON2SSE_INLINE uint16x8_t vaddw_u8(uint16x8_t a, uint8x8_t b) // VADDW.U8 q0,q0,d0
2960*80a68eefSBob Badour {
2961*80a68eefSBob Badour     __m128i b16;
2962*80a68eefSBob Badour     b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1
2963*80a68eefSBob Badour     return _mm_add_epi16 (a, b16);
2964*80a68eefSBob Badour }
2965*80a68eefSBob Badour 
2966*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vaddw_u16(uint32x4_t a, uint16x4_t b); // VADDW.s16 q0,q0,d0
2967*80a68eefSBob Badour _NEON2SSE_INLINE uint32x4_t vaddw_u16(uint32x4_t a, uint16x4_t b) // VADDW.s16 q0,q0,d0
2968*80a68eefSBob Badour {
2969*80a68eefSBob Badour     __m128i b32;
2970*80a68eefSBob Badour     b32 = _MM_CVTEPU16_EPI32 (_pM128i(b)); //SSE4.1
2971*80a68eefSBob Badour     return _mm_add_epi32 (a, b32);
2972*80a68eefSBob Badour }
2973*80a68eefSBob Badour 
2974*80a68eefSBob Badour _NEON2SSESTORAGE uint64x2_t vaddw_u32(uint64x2_t a, uint32x2_t b); // VADDW.U32 q0,q0,d0
2975*80a68eefSBob Badour _NEON2SSE_INLINE uint64x2_t vaddw_u32(uint64x2_t a, uint32x2_t b) // VADDW.U32 q0,q0,d0
2976*80a68eefSBob Badour {
2977*80a68eefSBob Badour     __m128i b64;
2978*80a68eefSBob Badour     b64 = _MM_CVTEPU32_EPI64 (_pM128i(b)); //SSE4.1
2979*80a68eefSBob Badour     return _mm_add_epi64 (a, b64);
2980*80a68eefSBob Badour }
2981*80a68eefSBob Badour 
2982*80a68eefSBob Badour //******************************Vector halving add: vhadd -> Vr[i]:=(Va[i]+Vb[i])>>1 ,  result truncated *******************************
2983*80a68eefSBob Badour //*************************************************************************************************************************
2984*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vhadd_s8(int8x8_t a,  int8x8_t b); // VHADD.S8 d0,d0,d0
2985*80a68eefSBob Badour _NEON2SSE_INLINE int8x8_t vhadd_s8(int8x8_t a,  int8x8_t b)
2986*80a68eefSBob Badour {
2987*80a68eefSBob Badour     int8x8_t res64;
2988*80a68eefSBob Badour     return64(vhaddq_s8(_pM128i(a), _pM128i(b)));
2989*80a68eefSBob Badour }
2990*80a68eefSBob Badour 
2991*80a68eefSBob Badour 
2992*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vhadd_s16(int16x4_t a,  int16x4_t b); // VHADD.S16 d0,d0,d0
2993*80a68eefSBob Badour _NEON2SSE_INLINE int16x4_t vhadd_s16(int16x4_t a,  int16x4_t b)
2994*80a68eefSBob Badour {
2995*80a68eefSBob Badour     int16x4_t res64;
2996*80a68eefSBob Badour     return64( vhaddq_s16(_pM128i(a), _pM128i(b)));
2997*80a68eefSBob Badour }
2998*80a68eefSBob Badour 
2999*80a68eefSBob Badour 
3000*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vhadd_s32(int32x2_t a,  int32x2_t b); // VHADD.S32 d0,d0,d0
3001*80a68eefSBob Badour _NEON2SSE_INLINE int32x2_t vhadd_s32(int32x2_t a,  int32x2_t b)
3002*80a68eefSBob Badour {
3003*80a68eefSBob Badour     int32x2_t res64;
3004*80a68eefSBob Badour     return64( vhaddq_s32(_pM128i(a), _pM128i(b)));
3005*80a68eefSBob Badour }
3006*80a68eefSBob Badour 
3007*80a68eefSBob Badour 
3008*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vhadd_u8(uint8x8_t a,  uint8x8_t b); // VHADD.U8 d0,d0,d0
3009*80a68eefSBob Badour _NEON2SSE_INLINE uint8x8_t vhadd_u8(uint8x8_t a,  uint8x8_t b)
3010*80a68eefSBob Badour {
3011*80a68eefSBob Badour     uint8x8_t res64;
3012*80a68eefSBob Badour     return64( vhaddq_u8(_pM128i(a), _pM128i(b)));
3013*80a68eefSBob Badour }
3014*80a68eefSBob Badour 
3015*80a68eefSBob Badour 
3016*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vhadd_u16(uint16x4_t a,  uint16x4_t b); // VHADD.s16 d0,d0,d0
3017*80a68eefSBob Badour _NEON2SSE_INLINE uint16x4_t vhadd_u16(uint16x4_t a,  uint16x4_t b)
3018*80a68eefSBob Badour {
3019*80a68eefSBob Badour     uint16x4_t res64;
3020*80a68eefSBob Badour     return64( vhaddq_u16(_pM128i(a), _pM128i(b)));
3021*80a68eefSBob Badour }
3022*80a68eefSBob Badour 
3023*80a68eefSBob Badour 
3024*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vhadd_u32(uint32x2_t a,  uint32x2_t b); // VHADD.U32 d0,d0,d0
3025*80a68eefSBob Badour _NEON2SSE_INLINE uint32x2_t vhadd_u32(uint32x2_t a,  uint32x2_t b)
3026*80a68eefSBob Badour {
3027*80a68eefSBob Badour     uint32x2_t res64;
3028*80a68eefSBob Badour     return64( vhaddq_u32(_pM128i(a), _pM128i(b)));
3029*80a68eefSBob Badour }
3030*80a68eefSBob Badour 
3031*80a68eefSBob Badour 
3032*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t vhaddq_s8(int8x16_t a, int8x16_t b); // VHADD.S8 q0,q0,q0
3033*80a68eefSBob Badour _NEON2SSE_INLINE int8x16_t vhaddq_s8(int8x16_t a, int8x16_t b)
3034*80a68eefSBob Badour {
3035*80a68eefSBob Badour     //need to avoid internal overflow: use the identity (x+y)>>1 = (x&y)+((x^y)>>1).
3036*80a68eefSBob Badour     __m128i tmp1, tmp2;
3037*80a68eefSBob Badour     tmp1 = _mm_and_si128(a,b);
3038*80a68eefSBob Badour     tmp2 = _mm_xor_si128(a,b);
3039*80a68eefSBob Badour     tmp2 = vshrq_n_s8(tmp2,1);
3040*80a68eefSBob Badour     return _mm_add_epi8(tmp1,tmp2);
3041*80a68eefSBob Badour }
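
//Editor's illustration (hypothetical helper, not part of the original header): the overflow-free
//halving-add identity used by the vhaddq_s8/s16/s32 implementations above. x + y = 2*(x & y) + (x ^ y),
//so (x + y) >> 1 equals (x & y) + ((x ^ y) >> 1) without ever forming the possibly overflowing sum.
//Assumes arithmetic right shift of negative values, as the SIMD code gets from vshrq_n_s8/_mm_srai.
_NEON2SSE_INLINE int8_t _neon2sse_example_hadd_s8_lane(int8_t x, int8_t y)
{
    return (int8_t)((x & y) + ((x ^ y) >> 1)); //truncating halving add, no 8-bit overflow possible
}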
3042*80a68eefSBob Badour 
3043*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vhaddq_s16(int16x8_t a, int16x8_t b); // VHADD.S16 q0,q0,q0
3044*80a68eefSBob Badour _NEON2SSE_INLINE int16x8_t vhaddq_s16(int16x8_t a, int16x8_t b)
3045*80a68eefSBob Badour {
3046*80a68eefSBob Badour     //need to avoid internal overflow: use the identity (x+y)>>1 = (x&y)+((x^y)>>1).
3047*80a68eefSBob Badour     __m128i tmp1, tmp2;
3048*80a68eefSBob Badour     tmp1 = _mm_and_si128(a,b);
3049*80a68eefSBob Badour     tmp2 = _mm_xor_si128(a,b);
3050*80a68eefSBob Badour     tmp2 = _mm_srai_epi16(tmp2,1);
3051*80a68eefSBob Badour     return _mm_add_epi16(tmp1,tmp2);
3052*80a68eefSBob Badour }
3053*80a68eefSBob Badour 
3054*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vhaddq_s32(int32x4_t a, int32x4_t b); // VHADD.S32 q0,q0,q0
3055*80a68eefSBob Badour _NEON2SSE_INLINE int32x4_t vhaddq_s32(int32x4_t a, int32x4_t b) // VHADD.S32 q0,q0,q0
3056*80a68eefSBob Badour {
3057*80a68eefSBob Badour     //need to avoid internal overflow: use the identity (x+y)>>1 = (x&y)+((x^y)>>1).
3058*80a68eefSBob Badour     __m128i tmp1, tmp2;
3059*80a68eefSBob Badour     tmp1 = _mm_and_si128(a,b);
3060*80a68eefSBob Badour     tmp2 = _mm_xor_si128(a,b);
3061*80a68eefSBob Badour     tmp2 = _mm_srai_epi32(tmp2,1);
3062*80a68eefSBob Badour     return _mm_add_epi32(tmp1,tmp2);
3063*80a68eefSBob Badour }
3064*80a68eefSBob Badour 
3065*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vhaddq_u8(uint8x16_t a, uint8x16_t b); // VHADD.U8 q0,q0,q0
3066*80a68eefSBob Badour _NEON2SSE_INLINE uint8x16_t vhaddq_u8(uint8x16_t a, uint8x16_t b) // VHADD.U8 q0,q0,q0
3067*80a68eefSBob Badour {
3068*80a68eefSBob Badour     __m128i c1, sum, res;
3069*80a68eefSBob Badour     c1 = _mm_set1_epi8(1);
3070*80a68eefSBob Badour     sum = _mm_avg_epu8(a, b); //result is rounded, need to compensate it
3071*80a68eefSBob Badour     res = _mm_xor_si128(a, b); //for rounding compensation
3072*80a68eefSBob Badour     res = _mm_and_si128(res,c1); //for rounding compensation
3073*80a68eefSBob Badour     return _mm_sub_epi8 (sum, res); //actual rounding compensation
3074*80a68eefSBob Badour }
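
//Editor's illustration (hypothetical helper, not part of the original header): vhaddq_u8 above builds the
//truncating halving add from _mm_avg_epu8, which rounds up: it returns (x + y + 1) >> 1. The sum x + y is
//odd exactly when the low bit of (x ^ y) is set, so subtracting that bit removes the unwanted round-up.
_NEON2SSE_INLINE uint8_t _neon2sse_example_hadd_u8_lane(uint8_t x, uint8_t y)
{
    uint8_t avg_rounded = (uint8_t)(((unsigned)x + (unsigned)y + 1) >> 1); //what _mm_avg_epu8 does per lane
    return (uint8_t)(avg_rounded - ((x ^ y) & 1)); //compensate the rounding when x + y is odd
}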
3075*80a68eefSBob Badour 
3076*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vhaddq_u16(uint16x8_t a, uint16x8_t b); // VHADD.s16 q0,q0,q0
3077*80a68eefSBob Badour _NEON2SSE_INLINE uint16x8_t vhaddq_u16(uint16x8_t a, uint16x8_t b) // VHADD.s16 q0,q0,q0
3078*80a68eefSBob Badour {
3079*80a68eefSBob Badour     __m128i sum, res;
3080*80a68eefSBob Badour     sum = _mm_avg_epu16(a, b); //result is rounded, need to compensate it
3081*80a68eefSBob Badour     res = _mm_xor_si128(a, b); //for rounding compensation
3082*80a68eefSBob Badour     res = _mm_slli_epi16 (res,15); //shift left  then back right to
3083*80a68eefSBob Badour     res = _mm_srli_epi16 (res,15); //get 1 or zero
3084*80a68eefSBob Badour     return _mm_sub_epi16 (sum, res); //actual rounding compensation
3085*80a68eefSBob Badour }
3086*80a68eefSBob Badour 
3087*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vhaddq_u32(uint32x4_t a, uint32x4_t b); // VHADD.U32 q0,q0,q0
3088*80a68eefSBob Badour _NEON2SSE_INLINE uint32x4_t vhaddq_u32(uint32x4_t a, uint32x4_t b) // VHADD.U32 q0,q0,q0
3089*80a68eefSBob Badour {
3090*80a68eefSBob Badour     //need to avoid internal overflow: use the identity (x+y)>>1 = (x&y)+((x^y)>>1).
3091*80a68eefSBob Badour     __m128i tmp1, tmp2;
3092*80a68eefSBob Badour     tmp1 = _mm_and_si128(a,b);
3093*80a68eefSBob Badour     tmp2 = _mm_xor_si128(a,b);
3094*80a68eefSBob Badour     tmp2 = _mm_srli_epi32(tmp2,1);
3095*80a68eefSBob Badour     return _mm_add_epi32(tmp1,tmp2);
3096*80a68eefSBob Badour }
3097*80a68eefSBob Badour 
3098*80a68eefSBob Badour //************************Vector rounding halving add: vrhadd{q}_<type>. Vr[i]:=(Va[i]+Vb[i]+1)>>1   ***************************
3099*80a68eefSBob Badour //*****************************************************************************************************************************
3100*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vrhadd_s8(int8x8_t a,  int8x8_t b); // VRHADD.S8 d0,d0,d0
3101*80a68eefSBob Badour _NEON2SSE_INLINE int8x8_t vrhadd_s8(int8x8_t a,  int8x8_t b)
3102*80a68eefSBob Badour {
3103*80a68eefSBob Badour     int8x8_t res64;
3104*80a68eefSBob Badour     return64(vrhaddq_s8(_pM128i(a), _pM128i(b)));
3105*80a68eefSBob Badour }
3106*80a68eefSBob Badour 
3107*80a68eefSBob Badour 
3108*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vrhadd_s16(int16x4_t a,  int16x4_t b); // VRHADD.S16 d0,d0,d0
3109*80a68eefSBob Badour _NEON2SSE_INLINE int16x4_t vrhadd_s16(int16x4_t a,  int16x4_t b)
3110*80a68eefSBob Badour {
3111*80a68eefSBob Badour     int16x4_t res64;
3112*80a68eefSBob Badour     return64(vrhaddq_s16(_pM128i(a), _pM128i(b)));
3113*80a68eefSBob Badour }
3114*80a68eefSBob Badour 
3115*80a68eefSBob Badour 
3116*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vrhadd_s32(int32x2_t a,  int32x2_t b); // VRHADD.S32 d0,d0,d0
3117*80a68eefSBob Badour _NEON2SSE_INLINE int32x2_t vrhadd_s32(int32x2_t a,  int32x2_t b)
3118*80a68eefSBob Badour {
3119*80a68eefSBob Badour     int32x2_t res64;
3120*80a68eefSBob Badour     return64(vrhaddq_s32(_pM128i(a), _pM128i(b)));
3121*80a68eefSBob Badour }
3122*80a68eefSBob Badour 
3123*80a68eefSBob Badour 
3124*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vrhadd_u8(uint8x8_t a, uint8x8_t b); // VRHADD.U8 d0,d0,d0
3125*80a68eefSBob Badour _NEON2SSE_INLINE uint8x8_t vrhadd_u8(uint8x8_t a, uint8x8_t b)
3126*80a68eefSBob Badour {
3127*80a68eefSBob Badour     uint8x8_t res64;
3128*80a68eefSBob Badour     return64(_mm_avg_epu8(_pM128i(a),_pM128i(b))); //SSE, result rounding!!!
3129*80a68eefSBob Badour }
3130*80a68eefSBob Badour 
3131*80a68eefSBob Badour 
3132*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vrhadd_u16(uint16x4_t a, uint16x4_t b); // VRHADD.s16 d0,d0,d0
3133*80a68eefSBob Badour _NEON2SSE_INLINE uint16x4_t vrhadd_u16(uint16x4_t a, uint16x4_t b)
3134*80a68eefSBob Badour {
3135*80a68eefSBob Badour     uint16x4_t res64;
3136*80a68eefSBob Badour     return64(_mm_avg_epu16(_pM128i(a),_pM128i(b))); //SSE, result rounding!!!
3137*80a68eefSBob Badour }
3138*80a68eefSBob Badour 
3139*80a68eefSBob Badour 
3140*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vrhadd_u32(uint32x2_t a,  uint32x2_t b); // VRHADD.U32 d0,d0,d0
3141*80a68eefSBob Badour _NEON2SSE_INLINE uint32x2_t vrhadd_u32(uint32x2_t a,  uint32x2_t b)
3142*80a68eefSBob Badour {
3143*80a68eefSBob Badour     uint32x2_t res64;
3144*80a68eefSBob Badour     return64(vrhaddq_u32(_pM128i(a), _pM128i(b)));
3145*80a68eefSBob Badour }
3146*80a68eefSBob Badour 
3147*80a68eefSBob Badour 
3148*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t  vrhaddq_s8(int8x16_t a, int8x16_t b); // VRHADD.S8 q0,q0,q0
3149*80a68eefSBob Badour _NEON2SSE_INLINE int8x16_t  vrhaddq_s8(int8x16_t a, int8x16_t b) // VRHADD.S8 q0,q0,q0
3150*80a68eefSBob Badour {
3151*80a68eefSBob Badour     //no signed average in x86 SIMD, go to unsigned
3152*80a68eefSBob Badour     __m128i c128, au, bu, sum;
3153*80a68eefSBob Badour     c128 = _mm_set1_epi8((int8_t)0x80); //-128
3154*80a68eefSBob Badour     au = _mm_sub_epi8(a, c128); //add 128
3155*80a68eefSBob Badour     bu = _mm_sub_epi8(b, c128); //add 128
3156*80a68eefSBob Badour     sum = _mm_avg_epu8(au, bu);
3157*80a68eefSBob Badour     return _mm_add_epi8 (sum, c128); //sub 128
3158*80a68eefSBob Badour }
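
//Editor's illustration (hypothetical helper, not part of the original header): vrhaddq_s8 above has no signed
//average instruction to lean on, so it biases both operands into unsigned range, uses the unsigned rounding
//average, and removes the bias again; adding the same constant to both inputs adds that constant to their
//average, so the bias cancels exactly.
_NEON2SSE_INLINE int8_t _neon2sse_example_rhadd_s8_lane(int8_t x, int8_t y)
{
    unsigned xu = (unsigned)(x + 128); //bias into [0, 255]
    unsigned yu = (unsigned)(y + 128);
    unsigned avg = (xu + yu + 1) >> 1; //unsigned rounding average, as _mm_avg_epu8 provides
    return (int8_t)((int)avg - 128);   //remove the bias
}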
3159*80a68eefSBob Badour 
3160*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t  vrhaddq_s16(int16x8_t a, int16x8_t b); // VRHADD.S16 q0,q0,q0
3161*80a68eefSBob Badour _NEON2SSE_INLINE int16x8_t  vrhaddq_s16(int16x8_t a, int16x8_t b) // VRHADD.S16 q0,q0,q0
3162*80a68eefSBob Badour {
3163*80a68eefSBob Badour     //no signed average in x86 SIMD, go to unsigned
3164*80a68eefSBob Badour     __m128i cx8000, au, bu, sum;
3165*80a68eefSBob Badour     cx8000 = _mm_set1_epi16((int16_t)0x8000); // - 32768
3166*80a68eefSBob Badour     au = _mm_sub_epi16(a, cx8000); //add 32768
3167*80a68eefSBob Badour     bu = _mm_sub_epi16(b, cx8000); //add 32768
3168*80a68eefSBob Badour     sum = _mm_avg_epu16(au, bu);
3169*80a68eefSBob Badour     return _mm_add_epi16 (sum, cx8000); //sub 32768
3170*80a68eefSBob Badour }
3171*80a68eefSBob Badour 
3172*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t  vrhaddq_s32(int32x4_t a, int32x4_t b); // VRHADD.S32 q0,q0,q0
3173*80a68eefSBob Badour _NEON2SSE_INLINE int32x4_t  vrhaddq_s32(int32x4_t a, int32x4_t b)
3174*80a68eefSBob Badour {
3175*80a68eefSBob Badour     //need to avoid overflow
3176*80a68eefSBob Badour     __m128i a2, b2, res, sum;
3177*80a68eefSBob Badour     a2 = _mm_srai_epi32(a,1); //a2=a/2;
3178*80a68eefSBob Badour     b2 = _mm_srai_epi32(b,1); // b2=b/2;
3179*80a68eefSBob Badour     res = _mm_or_si128(a,b); //for rounding
3180*80a68eefSBob Badour     res = _mm_slli_epi32 (res,31); //shift left  then back right to
3181*80a68eefSBob Badour     res = _mm_srli_epi32 (res,31); //get 1 or zero
3182*80a68eefSBob Badour     sum = _mm_add_epi32(a2,b2);
3183*80a68eefSBob Badour     return _mm_add_epi32(sum,res);
3184*80a68eefSBob Badour }
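
//Editor's illustration (hypothetical helper, not part of the original header): vrhaddq_s32 above cannot use an
//unsigned average for 32-bit lanes, so it halves each operand first and adds back the rounding bit:
//(x + y + 1) >> 1 == (x >> 1) + (y >> 1) + ((x | y) & 1). Assumes arithmetic right shift of negative values,
//matching _mm_srai_epi32 in the code above.
_NEON2SSE_INLINE int32_t _neon2sse_example_rhadd_s32_lane(int32_t x, int32_t y)
{
    return (x >> 1) + (y >> 1) + ((x | y) & 1); //no intermediate value can overflow
}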
3185*80a68eefSBob Badour 
3186*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t   vrhaddq_u8(uint8x16_t a, uint8x16_t b); // VRHADD.U8 q0,q0,q0
3187*80a68eefSBob Badour #define vrhaddq_u8 _mm_avg_epu8 //SSE2, results rounded
3188*80a68eefSBob Badour 
3189*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t   vrhaddq_u16(uint16x8_t a, uint16x8_t b); // VRHADD.s16 q0,q0,q0
3190*80a68eefSBob Badour #define vrhaddq_u16 _mm_avg_epu16 //SSE2, results rounded
3191*80a68eefSBob Badour 
3192*80a68eefSBob Badour 
3193*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vrhaddq_u32(uint32x4_t a, uint32x4_t b); // VRHADD.U32 q0,q0,q0
3194*80a68eefSBob Badour _NEON2SSE_INLINE uint32x4_t vrhaddq_u32(uint32x4_t a, uint32x4_t b) // VRHADD.U32 q0,q0,q0
3195*80a68eefSBob Badour {
3196*80a68eefSBob Badour     //need to avoid overflow
3197*80a68eefSBob Badour     __m128i a2, b2, res, sum;
3198*80a68eefSBob Badour     a2 = _mm_srli_epi32(a,1); //a2=a/2;
3199*80a68eefSBob Badour     b2 = _mm_srli_epi32(b,1); // b2=b/2;
3200*80a68eefSBob Badour     res = _mm_or_si128(a,b); //for rounding
3201*80a68eefSBob Badour     res = _mm_slli_epi32 (res,31); //shift left  then back right to
3202*80a68eefSBob Badour     res = _mm_srli_epi32 (res,31); //get 1 or zero
3203*80a68eefSBob Badour     sum = _mm_add_epi32(a2,b2);
3204*80a68eefSBob Badour     return _mm_add_epi32(sum,res);
3205*80a68eefSBob Badour }
3206*80a68eefSBob Badour 
3207*80a68eefSBob Badour //****************** VQADD: Vector saturating add ************************
3208*80a68eefSBob Badour //************************************************************************
3209*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vqadd_s8(int8x8_t a, int8x8_t b); // VQADD.S8 d0,d0,d0
3210*80a68eefSBob Badour _NEON2SSE_INLINE int8x8_t vqadd_s8(int8x8_t a, int8x8_t b)
3211*80a68eefSBob Badour {
3212*80a68eefSBob Badour     int8x8_t res64;
3213*80a68eefSBob Badour     return64(_mm_adds_epi8(_pM128i(a),_pM128i(b)));
3214*80a68eefSBob Badour }
3215*80a68eefSBob Badour 
3216*80a68eefSBob Badour 
3217*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vqadd_s16(int16x4_t a, int16x4_t b); // VQADD.S16 d0,d0,d0
3218*80a68eefSBob Badour _NEON2SSE_INLINE int16x4_t vqadd_s16(int16x4_t a, int16x4_t b)
3219*80a68eefSBob Badour {
3220*80a68eefSBob Badour     int16x4_t res64;
3221*80a68eefSBob Badour     return64(_mm_adds_epi16(_pM128i(a),_pM128i(b)));
3222*80a68eefSBob Badour }
3223*80a68eefSBob Badour 
3224*80a68eefSBob Badour 
3225*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vqadd_s32(int32x2_t a,  int32x2_t b); // VQADD.S32 d0,d0,d0
3226*80a68eefSBob Badour _NEON2SSE_INLINE int32x2_t vqadd_s32(int32x2_t a,  int32x2_t b)
3227*80a68eefSBob Badour {
3228*80a68eefSBob Badour     int32x2_t res64;
3229*80a68eefSBob Badour     return64(vqaddq_s32(_pM128i(a), _pM128i(b)));
3230*80a68eefSBob Badour }
3231*80a68eefSBob Badour 
3232*80a68eefSBob Badour 
3233*80a68eefSBob Badour _NEON2SSESTORAGE int64x1_t  vqadd_s64(int64x1_t a, int64x1_t b); // VQADD.S64 d0,d0,d0
3234*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqadd_s64(int64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
3235*80a68eefSBob Badour {
3236*80a68eefSBob Badour     int64x1_t res;
3237*80a68eefSBob Badour     uint64_t a64, b64;
3238*80a68eefSBob Badour     a64 = a.m64_u64[0];
3239*80a68eefSBob Badour     b64 = b.m64_u64[0];
3240*80a68eefSBob Badour     res.m64_u64[0] = a64 + b64;
3241*80a68eefSBob Badour     a64 = (a64 >> 63) + (~_SIGNBIT64);
3242*80a68eefSBob Badour     if ((int64_t)((b64 ^ a64) | ~(res.m64_u64[0] ^ b64))>=0) {
3243*80a68eefSBob Badour         res.m64_u64[0] = a64;
3244*80a68eefSBob Badour     }
3245*80a68eefSBob Badour     return res;
3246*80a68eefSBob Badour }
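
//Editor's illustration (hypothetical helper, not part of the original header): the branchless saturation scheme
//used by vqadd_s64 above, written out as scalar code. Signed addition overflows exactly when both operands share
//a sign and the wrapped sum has the opposite sign; the saturated value is INT64_MAX for a non-negative first
//operand and INT64_MIN otherwise. Assumes the usual two's-complement conversion between int64_t and uint64_t.
_NEON2SSE_INLINE int64_t _neon2sse_example_qadd_s64(int64_t x, int64_t y)
{
    uint64_t ux = (uint64_t)x, uy = (uint64_t)y;
    uint64_t sum = ux + uy;                                 //wrapping add, no undefined behavior
    uint64_t saturated = (ux >> 63) + (~(uint64_t)0 >> 1);  //INT64_MAX if x >= 0, INT64_MIN otherwise
    uint64_t overflow = ~(ux ^ uy) & (ux ^ sum);            //sign bit set iff the signed add overflowed
    return (int64_t)((overflow >> 63) ? saturated : sum);
}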
3247*80a68eefSBob Badour 
3248*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vqadd_u8(uint8x8_t a, uint8x8_t b); // VQADD.U8 d0,d0,d0
3249*80a68eefSBob Badour _NEON2SSE_INLINE uint8x8_t vqadd_u8(uint8x8_t a, uint8x8_t b)
3250*80a68eefSBob Badour {
3251*80a68eefSBob Badour     uint8x8_t res64;
3252*80a68eefSBob Badour     return64(_mm_adds_epu8(_pM128i(a),_pM128i(b)));
3253*80a68eefSBob Badour }
3254*80a68eefSBob Badour 
3255*80a68eefSBob Badour 
3256*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vqadd_u16(uint16x4_t a, uint16x4_t b); // VQADD.s16 d0,d0,d0
3257*80a68eefSBob Badour _NEON2SSE_INLINE uint16x4_t vqadd_u16(uint16x4_t a, uint16x4_t b)
3258*80a68eefSBob Badour {
3259*80a68eefSBob Badour     uint16x4_t res64;
3260*80a68eefSBob Badour     return64(_mm_adds_epu16(_pM128i(a),_pM128i(b)));
3261*80a68eefSBob Badour }
3262*80a68eefSBob Badour 
3263*80a68eefSBob Badour 
3264*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vqadd_u32(uint32x2_t a,  uint32x2_t b); // VQADD.U32 d0,d0,d0
3265*80a68eefSBob Badour _NEON2SSE_INLINE uint32x2_t vqadd_u32(uint32x2_t a,  uint32x2_t b)
3266*80a68eefSBob Badour {
3267*80a68eefSBob Badour     uint32x2_t res64;
3268*80a68eefSBob Badour     return64(vqaddq_u32(_pM128i(a), _pM128i(b)));
3269*80a68eefSBob Badour }
3270*80a68eefSBob Badour 
3271*80a68eefSBob Badour 
3272*80a68eefSBob Badour _NEON2SSESTORAGE uint64x1_t vqadd_u64(uint64x1_t a, uint64x1_t b); // VQADD.U64 d0,d0,d0
3273*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqadd_u64(uint64x1_t a, uint64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
3274*80a68eefSBob Badour {
3275*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 uint64_t a64, b64;
3276*80a68eefSBob Badour     uint64x1_t res;
3277*80a68eefSBob Badour     a64 = a.m64_u64[0];
3278*80a68eefSBob Badour     b64 = b.m64_u64[0];
3279*80a68eefSBob Badour     res.m64_u64[0] = a64 + b64;
3280*80a68eefSBob Badour     if (res.m64_u64[0] < a64) {
3281*80a68eefSBob Badour         res.m64_u64[0] = ~(uint64_t)0;
3282*80a68eefSBob Badour     }
3283*80a68eefSBob Badour     return res;
3284*80a68eefSBob Badour }
3285*80a68eefSBob Badour 
3286*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t   vqaddq_s8(int8x16_t a, int8x16_t b); // VQADD.S8 q0,q0,q0
3287*80a68eefSBob Badour #define vqaddq_s8 _mm_adds_epi8
3288*80a68eefSBob Badour 
3289*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t   vqaddq_s16(int16x8_t a, int16x8_t b); // VQADD.S16 q0,q0,q0
3290*80a68eefSBob Badour #define vqaddq_s16 _mm_adds_epi16
3291*80a68eefSBob Badour 
3292*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t  vqaddq_s32(int32x4_t a, int32x4_t b); // VQADD.S32 q0,q0,q0
3293*80a68eefSBob Badour _NEON2SSE_INLINE int32x4_t  vqaddq_s32(int32x4_t a, int32x4_t b)
3294*80a68eefSBob Badour {
3295*80a68eefSBob Badour     //no corresponding x86 SIMD solution, special tricks are necessary. Overflow happens only if a and b have the same sign and the sum has the opposite sign
3296*80a68eefSBob Badour     __m128i c7fffffff, res, res_sat, res_xor_a, b_xor_a_;
3297*80a68eefSBob Badour     c7fffffff = _mm_set1_epi32(0x7fffffff);
3298*80a68eefSBob Badour     res = _mm_add_epi32(a, b);
3299*80a68eefSBob Badour     res_sat = _mm_srli_epi32(a, 31);
3300*80a68eefSBob Badour     res_sat = _mm_add_epi32(res_sat, c7fffffff);
3301*80a68eefSBob Badour     res_xor_a = _mm_xor_si128(res, a);
3302*80a68eefSBob Badour     b_xor_a_ = _mm_xor_si128(b, a);
3303*80a68eefSBob Badour     res_xor_a = _mm_andnot_si128(b_xor_a_, res_xor_a);
3304*80a68eefSBob Badour     res_xor_a = _mm_srai_epi32(res_xor_a,31); //propagate the sign bit: all ones if negative, all zeros otherwise
3305*80a68eefSBob Badour     res_sat = _mm_and_si128(res_xor_a, res_sat);
3306*80a68eefSBob Badour     res = _mm_andnot_si128(res_xor_a, res);
3307*80a68eefSBob Badour     return _mm_or_si128(res, res_sat);
3308*80a68eefSBob Badour }
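
//Editor's illustration (hypothetical helper, not part of the original header): a scalar model of the mask-and-blend
//scheme vqaddq_s32 uses above. The overflow test ~(x ^ y) & (x ^ sum) is stretched into an all-ones/all-zeros mask
//so the saturated value can be selected without branches, which is what _mm_andnot_si128/_mm_or_si128 do per lane.
_NEON2SSE_INLINE int32_t _neon2sse_example_qadd_s32(int32_t x, int32_t y)
{
    uint32_t ux = (uint32_t)x, uy = (uint32_t)y;
    uint32_t sum = ux + uy;                                          //wrapping add
    uint32_t saturated = (ux >> 31) + 0x7fffffff;                    //INT32_MAX if x >= 0, INT32_MIN otherwise
    uint32_t mask = (uint32_t)0 - ((~(ux ^ uy) & (ux ^ sum)) >> 31); //all ones on overflow, else zero
    return (int32_t)((saturated & mask) | (sum & ~mask));            //branch-free select
}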
3309*80a68eefSBob Badour 
3310*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t  vqaddq_s64(int64x2_t a, int64x2_t b); // VQADD.S64 q0,q0,q0
3311*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqaddq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
3312*80a68eefSBob Badour {
3313*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 uint64_t atmp[2], btmp[2], res[2];
3314*80a68eefSBob Badour     _mm_store_si128((__m128i*)atmp, a);
3315*80a68eefSBob Badour     _mm_store_si128((__m128i*)btmp, b);
3316*80a68eefSBob Badour     res[0] = atmp[0] + btmp[0];
3317*80a68eefSBob Badour     res[1] = atmp[1] + btmp[1];
3318*80a68eefSBob Badour 
3319*80a68eefSBob Badour     atmp[0] = (atmp[0] >> 63) + (~_SIGNBIT64);
3320*80a68eefSBob Badour     atmp[1] = (atmp[1] >> 63) + (~_SIGNBIT64);
3321*80a68eefSBob Badour 
3322*80a68eefSBob Badour     if ((int64_t)((btmp[0] ^ atmp[0]) | ~(res[0] ^ btmp[0]))>=0) {
3323*80a68eefSBob Badour         res[0] = atmp[0];
3324*80a68eefSBob Badour     }
3325*80a68eefSBob Badour     if ((int64_t)((btmp[1] ^ atmp[1]) | ~(res[1] ^ btmp[1]))>=0) {
3326*80a68eefSBob Badour         res[1] = atmp[1];
3327*80a68eefSBob Badour     }
3328*80a68eefSBob Badour     return _mm_load_si128((__m128i*)res);
3329*80a68eefSBob Badour }
3330*80a68eefSBob Badour 
3331*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t   vqaddq_u8(uint8x16_t a, uint8x16_t b); // VQADD.U8 q0,q0,q0
3332*80a68eefSBob Badour #define vqaddq_u8 _mm_adds_epu8
3333*80a68eefSBob Badour 
3334*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t   vqaddq_u16(uint16x8_t a, uint16x8_t b); // VQADD.s16 q0,q0,q0
3335*80a68eefSBob Badour #define vqaddq_u16 _mm_adds_epu16
3336*80a68eefSBob Badour 
3337*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vqaddq_u32(uint32x4_t a, uint32x4_t b); // VQADD.U32 q0,q0,q0
3338*80a68eefSBob Badour _NEON2SSE_INLINE uint32x4_t vqaddq_u32(uint32x4_t a, uint32x4_t b)
3339*80a68eefSBob Badour {
3340*80a68eefSBob Badour     __m128i c80000000, cmp, subsum, suba, sum;
3341*80a68eefSBob Badour     c80000000 = _mm_set1_epi32 (0x80000000);
3342*80a68eefSBob Badour     sum = _mm_add_epi32 (a, b);
3343*80a68eefSBob Badour     subsum = _mm_sub_epi32 (sum, c80000000);
3344*80a68eefSBob Badour     suba = _mm_sub_epi32 (a, c80000000);
3345*80a68eefSBob Badour     cmp = _mm_cmpgt_epi32 ( suba, subsum); //no unsigned comparison, need to go to signed
3346*80a68eefSBob Badour     return _mm_or_si128 (sum, cmp); //saturation
3347*80a68eefSBob Badour }
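
//Editor's illustration (hypothetical helper, not part of the original header): the idea behind vqaddq_u32 above.
//SSE has no unsigned 32-bit compare, but subtracting 0x80000000 from both values maps unsigned order onto signed
//order. The wrapped sum is below the original operand exactly when the add overflowed, and OR-ing with the
//resulting all-ones mask saturates the lane to 0xffffffff.
_NEON2SSE_INLINE uint32_t _neon2sse_example_qadd_u32(uint32_t x, uint32_t y)
{
    uint32_t sum = x + y;                           //wrapping add
    int32_t sx = (int32_t)(x - 0x80000000u);        //biased so a signed compare matches the unsigned order
    int32_t ssum = (int32_t)(sum - 0x80000000u);
    uint32_t mask = (sx > ssum) ? 0xffffffffu : 0u; //what _mm_cmpgt_epi32 produces per lane
    return sum | mask;                              //saturate on overflow
}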
3348*80a68eefSBob Badour 
3349*80a68eefSBob Badour _NEON2SSESTORAGE uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b); // VQADD.U64 q0,q0,q0
3350*80a68eefSBob Badour #ifdef USE_SSE4
3351*80a68eefSBob Badour     _NEON2SSE_INLINE uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b)
3352*80a68eefSBob Badour     {
3353*80a68eefSBob Badour         __m128i c80000000, sum, cmp, suba, subsum;
3354*80a68eefSBob Badour         c80000000 = _mm_set_epi32 (0x80000000, 0x0, 0x80000000, 0x0);
3355*80a68eefSBob Badour         sum = _mm_add_epi64 (a, b);
3356*80a68eefSBob Badour         subsum = _mm_sub_epi64 (sum, c80000000);
3357*80a68eefSBob Badour         suba = _mm_sub_epi64 (a, c80000000);
3358*80a68eefSBob Badour         cmp = _mm_cmpgt_epi64 ( suba, subsum); //no unsigned comparison, need to go to signed, SSE4.2!!!
3359*80a68eefSBob Badour         return _mm_or_si128 (sum, cmp); //saturation
3360*80a68eefSBob Badour     }
3361*80a68eefSBob Badour #else
3362*80a68eefSBob Badour     _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
3363*80a68eefSBob Badour     {
3364*80a68eefSBob Badour         _NEON2SSE_ALIGN_16 uint64_t atmp[2], btmp[2], res[2];
3365*80a68eefSBob Badour         _mm_store_si128((__m128i*)atmp, a);
3366*80a68eefSBob Badour         _mm_store_si128((__m128i*)btmp, b);
3367*80a68eefSBob Badour         res[0] = atmp[0] + btmp[0];
3368*80a68eefSBob Badour         res[1] = atmp[1] + btmp[1];
3369*80a68eefSBob Badour         if (res[0] < atmp[0]) res[0] = ~(uint64_t)0;
3370*80a68eefSBob Badour         if (res[1] < atmp[1]) res[1] = ~(uint64_t)0;
3371*80a68eefSBob Badour         return _mm_load_si128((__m128i*)(res));
3372*80a68eefSBob Badour     }
3373*80a68eefSBob Badour #endif
3374*80a68eefSBob Badour 
3375*80a68eefSBob Badour 
3376*80a68eefSBob Badour //******************* Vector add high half (truncated)  ******************
3377*80a68eefSBob Badour //************************************************************************
3378*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t   vaddhn_s16(int16x8_t a, int16x8_t b); // VADDHN.I16 d0,q0,q0
3379*80a68eefSBob Badour _NEON2SSE_INLINE int8x8_t   vaddhn_s16(int16x8_t a, int16x8_t b) // VADDHN.I16 d0,q0,q0
3380*80a68eefSBob Badour {
3381*80a68eefSBob Badour     int8x8_t res64;
3382*80a68eefSBob Badour     __m128i sum;
3383*80a68eefSBob Badour     sum = _mm_add_epi16 (a, b);
3384*80a68eefSBob Badour     sum = _mm_srai_epi16 (sum, 8);
3385*80a68eefSBob Badour     sum = _mm_packs_epi16 (sum, sum); //use 64 low bits only
3386*80a68eefSBob Badour     return64(sum);
3387*80a68eefSBob Badour }
3388*80a68eefSBob Badour 
3389*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t  vaddhn_s32(int32x4_t a, int32x4_t b); // VADDHN.I32 d0,q0,q0
3390*80a68eefSBob Badour _NEON2SSE_INLINE int16x4_t  vaddhn_s32(int32x4_t a, int32x4_t b) // VADDHN.I32 d0,q0,q0
3391*80a68eefSBob Badour {
3392*80a68eefSBob Badour     int16x4_t res64;
3393*80a68eefSBob Badour     __m128i sum;
3394*80a68eefSBob Badour     sum = _mm_add_epi32 (a, b);
3395*80a68eefSBob Badour     sum = _mm_srai_epi32(sum, 16);
3396*80a68eefSBob Badour     sum = _mm_packs_epi32 (sum, sum); //use 64 low bits only
3397*80a68eefSBob Badour     return64(sum);
3398*80a68eefSBob Badour }
3399*80a68eefSBob Badour 
3400*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t  vaddhn_s64(int64x2_t a, int64x2_t b); // VADDHN.I64 d0,q0,q0
3401*80a68eefSBob Badour _NEON2SSE_INLINE int32x2_t  vaddhn_s64(int64x2_t a, int64x2_t b)
3402*80a68eefSBob Badour {
3403*80a68eefSBob Badour     int32x2_t res64;
3404*80a68eefSBob Badour     __m128i sum;
3405*80a68eefSBob Badour     sum = _mm_add_epi64 (a, b);
3406*80a68eefSBob Badour     sum = _mm_shuffle_epi32(sum,  1 | (3 << 2) | (0 << 4) | (2 << 6));
3407*80a68eefSBob Badour     return64(sum);
3408*80a68eefSBob Badour }
3409*80a68eefSBob Badour 
3410*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t  vaddhn_u16(uint16x8_t a, uint16x8_t b); // VADDHN.I16 d0,q0,q0
3411*80a68eefSBob Badour _NEON2SSE_INLINE uint8x8_t  vaddhn_u16(uint16x8_t a, uint16x8_t b) // VADDHN.I16 d0,q0,q0
3412*80a68eefSBob Badour {
3413*80a68eefSBob Badour     uint8x8_t res64;
3414*80a68eefSBob Badour     __m128i sum;
3415*80a68eefSBob Badour     sum = _mm_add_epi16 (a, b);
3416*80a68eefSBob Badour     sum = _mm_srli_epi16 (sum, 8);
3417*80a68eefSBob Badour     sum = _mm_packus_epi16 (sum,sum); //use 64 low bits only
3418*80a68eefSBob Badour     return64(sum);
3419*80a68eefSBob Badour }
3420*80a68eefSBob Badour 
3421*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vaddhn_u32(uint32x4_t a, uint32x4_t b); // VADDHN.I32 d0,q0,q0
3422*80a68eefSBob Badour _NEON2SSE_INLINE uint16x4_t vaddhn_u32(uint32x4_t a, uint32x4_t b) // VADDHN.I32 d0,q0,q0
3423*80a68eefSBob Badour {
3424*80a68eefSBob Badour     uint16x4_t res64;
3425*80a68eefSBob Badour      __m128i sum;
3426*80a68eefSBob Badour     sum = _mm_add_epi32 (a, b);
3427*80a68eefSBob Badour     sum = _mm_srli_epi32 (sum, 16);
3428*80a68eefSBob Badour #ifdef USE_SSE4
3429*80a68eefSBob Badour     sum = _MM_PACKUS1_EPI32 (sum); //use 64 low bits only
3430*80a68eefSBob Badour #else
3431*80a68eefSBob Badour     sum = _mm_shuffle_epi8 (sum, *(__m128i*) mask8_32_even_odd); //go to 16 bits
3432*80a68eefSBob Badour #endif
3433*80a68eefSBob Badour     return64(sum);
3434*80a68eefSBob Badour }
3435*80a68eefSBob Badour 
3436*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vaddhn_u64(uint64x2_t a, uint64x2_t b); // VADDHN.I64 d0,q0,q0
3437*80a68eefSBob Badour #define vaddhn_u64 vaddhn_s64
3438*80a68eefSBob Badour 
3439*80a68eefSBob Badour //*********** Vector rounding add high half: vraddhn_<type> ******************.
3440*80a68eefSBob Badour //***************************************************************************
3441*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t   vraddhn_s16(int16x8_t a, int16x8_t b); // VRADDHN.I16 d0,q0,q0
3442*80a68eefSBob Badour _NEON2SSE_INLINE int8x8_t   vraddhn_s16(int16x8_t a, int16x8_t b) // VRADDHN.I16 d0,q0,q0
3443*80a68eefSBob Badour {
3444*80a68eefSBob Badour     int8x8_t res64;
3445*80a68eefSBob Badour     __m128i sum, mask1;
3446*80a68eefSBob Badour     sum = _mm_add_epi16 (a, b);
3447*80a68eefSBob Badour     mask1 = _mm_slli_epi16(sum, 9); //shift left then back right to
3448*80a68eefSBob Badour     mask1 = _mm_srli_epi16(mask1, 15); //get  7-th bit 1 or zero
3449*80a68eefSBob Badour     sum = _mm_srai_epi16 (sum, 8); //get high half
3450*80a68eefSBob Badour     sum = _mm_add_epi16 (sum, mask1); //actual rounding
3451*80a68eefSBob Badour     sum = _mm_packs_epi16 (sum, sum);
3452*80a68eefSBob Badour     return64(sum);
3453*80a68eefSBob Badour }
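
//Editor's illustration (hypothetical helper, not part of the original header): the rounding trick used by
//vraddhn_s16 above, per lane. The narrowed result is the high byte of the wrapped 16-bit sum, rounded by adding
//bit 7 - the most significant of the discarded bits. Assumes arithmetic right shift of negative values, as
//_mm_srai_epi16 provides in the SIMD version.
_NEON2SSE_INLINE int8_t _neon2sse_example_raddhn_s16_lane(int16_t x, int16_t y)
{
    int16_t sum = (int16_t)(x + y);            //wrapping 16-bit add
    int16_t round = (int16_t)((sum >> 7) & 1); //the highest discarded bit
    return (int8_t)((sum >> 8) + round);       //high half plus rounding bit
}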
3454*80a68eefSBob Badour 
3455*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t  vraddhn_s32(int32x4_t a, int32x4_t b); // VRADDHN.I32 d0,q0,q0
3456*80a68eefSBob Badour _NEON2SSE_INLINE int16x4_t  vraddhn_s32(int32x4_t a, int32x4_t b) // VRADDHN.I32 d0,q0,q0
3457*80a68eefSBob Badour {
3458*80a68eefSBob Badour     //SIMD may not be optimal, a serial version may be faster
3459*80a68eefSBob Badour     int16x4_t res64;
3460*80a68eefSBob Badour     __m128i sum, mask1;
3461*80a68eefSBob Badour     sum = _mm_add_epi32 (a, b);
3462*80a68eefSBob Badour     mask1 = _mm_slli_epi32(sum, 17); //shift left then back right to
3463*80a68eefSBob Badour     mask1 = _mm_srli_epi32(mask1,31); //get  15-th bit 1 or zero
3464*80a68eefSBob Badour     sum = _mm_srai_epi32 (sum, 16); //get high half
3465*80a68eefSBob Badour     sum = _mm_add_epi32 (sum, mask1); //actual rounding
3466*80a68eefSBob Badour     sum = _mm_packs_epi32 (sum, sum);
3467*80a68eefSBob Badour     return64(sum);
3468*80a68eefSBob Badour }
3469*80a68eefSBob Badour 
3470*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t  vraddhn_s64(int64x2_t a, int64x2_t b); // VRADDHN.I64 d0,q0,q0
3471*80a68eefSBob Badour _NEON2SSE_INLINE int32x2_t vraddhn_s64(int64x2_t a, int64x2_t b)
3472*80a68eefSBob Badour {
3473*80a68eefSBob Badour     //SIMD may not be optimal, a serial version may be faster
3474*80a68eefSBob Badour     int32x2_t res64;
3475*80a68eefSBob Badour     __m128i sum, mask1;
3476*80a68eefSBob Badour     sum = _mm_add_epi64 (a, b);
3477*80a68eefSBob Badour     mask1 = _mm_slli_epi64(sum, 33); //shift left then back right to
3478*80a68eefSBob Badour     mask1 = _mm_srli_epi64(mask1,32); //get  31-th bit 1 or zero
3479*80a68eefSBob Badour     sum = _mm_add_epi64 (sum, mask1); //actual high half rounding
3480*80a68eefSBob Badour     sum = _mm_shuffle_epi32(sum,  1 | (3 << 2) | (1 << 4) | (3 << 6));
3481*80a68eefSBob Badour     return64(sum);
3482*80a68eefSBob Badour }
3483*80a68eefSBob Badour 
3484*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t  vraddhn_u16(uint16x8_t a, uint16x8_t b); // VRADDHN.I16 d0,q0,q0
3485*80a68eefSBob Badour _NEON2SSE_INLINE uint8x8_t  vraddhn_u16(uint16x8_t a, uint16x8_t b) // VRADDHN.I16 d0,q0,q0
3486*80a68eefSBob Badour {
3487*80a68eefSBob Badour     uint8x8_t res64;
3488*80a68eefSBob Badour     __m128i sum, mask1;
3489*80a68eefSBob Badour     sum = _mm_add_epi16 (a, b);
3490*80a68eefSBob Badour     mask1 = _mm_slli_epi16(sum, 9); //shift left then back right to
3491*80a68eefSBob Badour     mask1 = _mm_srli_epi16(mask1, 15); //get  7-th bit 1 or zero
3492*80a68eefSBob Badour     sum = _mm_srai_epi16 (sum, 8); //get high half
3493*80a68eefSBob Badour     sum = _mm_add_epi16 (sum, mask1); //actual rounding
3494*80a68eefSBob Badour     sum = _mm_packus_epi16 (sum, sum);
3495*80a68eefSBob Badour     return64(sum);
3496*80a68eefSBob Badour }
3497*80a68eefSBob Badour 
3498*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vraddhn_u32(uint32x4_t a, uint32x4_t b); // VRADDHN.I32 d0,q0,q0
3499*80a68eefSBob Badour _NEON2SSE_INLINE uint16x4_t vraddhn_u32(uint32x4_t a, uint32x4_t b)
3500*80a68eefSBob Badour {
3501*80a68eefSBob Badour     //SIMD may not be optimal, a serial version may be faster
3502*80a68eefSBob Badour     uint16x4_t res64;
3503*80a68eefSBob Badour     __m128i sum, mask1;
3504*80a68eefSBob Badour     sum = _mm_add_epi32 (a, b);
3505*80a68eefSBob Badour     mask1 = _mm_slli_epi32(sum, 17); //shift left then back right to
3506*80a68eefSBob Badour     mask1 = _mm_srli_epi32(mask1,31); //get  15-th bit 1 or zero
3507*80a68eefSBob Badour     sum = _mm_srai_epi32 (sum, 16); //get high half
3508*80a68eefSBob Badour     sum = _mm_add_epi32 (sum, mask1); //actual rounding
3509*80a68eefSBob Badour     sum = _MM_PACKUS1_EPI32 (sum);
3510*80a68eefSBob Badour     return64(sum);
3511*80a68eefSBob Badour }
3512*80a68eefSBob Badour 
3513*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vraddhn_u64(uint64x2_t a, uint64x2_t b); // VRADDHN.I64 d0,q0,q0
3514*80a68eefSBob Badour #define vraddhn_u64 vraddhn_s64
3515*80a68eefSBob Badour 
3516*80a68eefSBob Badour //**********************************************************************************
3517*80a68eefSBob Badour //*********             Multiplication            *************************************
3518*80a68eefSBob Badour //**************************************************************************************
3519*80a68eefSBob Badour 
3520*80a68eefSBob Badour //Vector multiply: vmul -> Vr[i] := Va[i] * Vb[i]
3521*80a68eefSBob Badour //As we don't widen the result, these functions are equivalent to "multiply low" in x86
3522*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vmul_s8(int8x8_t a, int8x8_t b); // VMUL.I8 d0,d0,d0
3523*80a68eefSBob Badour _NEON2SSE_INLINE int8x8_t vmul_s8(int8x8_t a, int8x8_t b) // VMUL.I8 d0,d0,d0
3524*80a68eefSBob Badour {
3525*80a68eefSBob Badour     // no 8 bit simd multiply, need to go to 16 bits in SSE
3526*80a68eefSBob Badour     int8x8_t res64;
3527*80a68eefSBob Badour     __m128i a128, b128, res;
3528*80a68eefSBob Badour     a128 = _MM_CVTEPI8_EPI16 (_pM128i(a)); // SSE 4.1 use low 64 bits
3529*80a68eefSBob Badour     b128 = _MM_CVTEPI8_EPI16 (_pM128i(b)); // SSE 4.1 use low 64 bits
3530*80a68eefSBob Badour     res = _mm_mullo_epi16 (a128, b128);
3531*80a68eefSBob Badour     res = _mm_shuffle_epi8 (res, *(__m128i*) mask8_16_even_odd); //return to 8 bit from 16, use 64 low bits only
3532*80a68eefSBob Badour     return64(res);
3533*80a68eefSBob Badour }
3534*80a68eefSBob Badour 
3535*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vmul_s16(int16x4_t a,  int16x4_t b); // VMUL.I16 d0,d0,d0
3536*80a68eefSBob Badour #define vmul_s16 vmul_u16
3537*80a68eefSBob Badour 
3538*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vmul_s32(int32x2_t a,  int32x2_t b); // VMUL.I32 d0,d0,d0
3539*80a68eefSBob Badour #define vmul_s32 vmul_u32
3540*80a68eefSBob Badour 
3541*80a68eefSBob Badour _NEON2SSESTORAGE float32x2_t vmul_f32(float32x2_t a, float32x2_t b); // VMUL.F32 d0,d0,d0
3542*80a68eefSBob Badour _NEON2SSE_INLINE float32x2_t vmul_f32(float32x2_t a, float32x2_t b)
3543*80a68eefSBob Badour {
3544*80a68eefSBob Badour     float32x4_t tmp;
3545*80a68eefSBob Badour     __m64_128 res64;
3546*80a68eefSBob Badour     tmp =  _mm_mul_ps(_pM128(a),_pM128(b));
3547*80a68eefSBob Badour     _M64f(res64, tmp); //use low 64 bits
3548*80a68eefSBob Badour     return res64;
3549*80a68eefSBob Badour }
3550*80a68eefSBob Badour 
3551*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vmul_u8(uint8x8_t a, uint8x8_t b); // VMUL.I8 d0,d0,d0
3552*80a68eefSBob Badour _NEON2SSE_INLINE uint8x8_t vmul_u8(uint8x8_t a, uint8x8_t b) // VMUL.I8 d0,d0,d0
3553*80a68eefSBob Badour {
3554*80a68eefSBob Badour     // no 8 bit simd multiply, need to go to 16 bits in SSE
3555*80a68eefSBob Badour     uint8x8_t res64;
3556*80a68eefSBob Badour     __m128i mask, a128, b128, res;
3557*80a68eefSBob Badour     mask = _mm_set1_epi16(0xff);
3558*80a68eefSBob Badour     a128 = _MM_CVTEPU8_EPI16 (_pM128i(a));
3559*80a68eefSBob Badour     b128 = _MM_CVTEPU8_EPI16 (_pM128i(b));
3560*80a68eefSBob Badour     res = _mm_mullo_epi16 (a128, b128);
3561*80a68eefSBob Badour     res = _mm_and_si128(res, mask); //to avoid saturation
3562*80a68eefSBob Badour     res = _mm_packus_epi16 (res,res); //use only low 64 bits
3563*80a68eefSBob Badour     return64(res);
3564*80a68eefSBob Badour }
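
//Editor's illustration (hypothetical helper, not part of the original header): what vmul_u8 above keeps from each
//16-bit product. Masking the product with 0x00ff first makes every lane fit in [0, 255], so the saturating pack
//(_mm_packus_epi16) returns the exact low byte instead of clamping.
_NEON2SSE_INLINE uint8_t _neon2sse_example_mul_u8_lane(uint8_t x, uint8_t y)
{
    uint16_t prod = (uint16_t)((uint16_t)x * (uint16_t)y); //widen to 16 bits, SSE has no 8-bit multiply
    return (uint8_t)(prod & 0xff);                         //NEON vmul_u8 keeps only the low 8 bits
}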
3565*80a68eefSBob Badour 
3566*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vmul_u16(uint16x4_t a, uint16x4_t b); // VMUL.I16 d0,d0,d0
3567*80a68eefSBob Badour _NEON2SSE_INLINE uint16x4_t vmul_u16(uint16x4_t a, uint16x4_t b)
3568*80a68eefSBob Badour {
3569*80a68eefSBob Badour     uint16x4_t res64;
3570*80a68eefSBob Badour     return64(_mm_mullo_epi16(_pM128i(a),_pM128i(b)));
3571*80a68eefSBob Badour }
3572*80a68eefSBob Badour 
3573*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t   vmul_u32(uint32x2_t a, uint32x2_t b); // VMUL.I32 d0,d0,d0
3574*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING( uint32x2_t   vmul_u32(uint32x2_t a, uint32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
3575*80a68eefSBob Badour {
3576*80a68eefSBob Badour     uint32x2_t res;
3577*80a68eefSBob Badour     res.m64_u32[0] = a.m64_u32[0] * b.m64_u32[0];
3578*80a68eefSBob Badour     res.m64_u32[1] = a.m64_u32[1] * b.m64_u32[1];
3579*80a68eefSBob Badour     return res;
3580*80a68eefSBob Badour }
3581*80a68eefSBob Badour 
3582*80a68eefSBob Badour _NEON2SSESTORAGE poly8x8_t vmul_p8(poly8x8_t a, poly8x8_t b); // VMUL.P8 d0,d0,d0
3583*80a68eefSBob Badour _NEON2SSE_INLINE poly8x8_t vmul_p8(poly8x8_t a, poly8x8_t b)
3584*80a68eefSBob Badour {
3585*80a68eefSBob Badour     //may be optimized
3586*80a68eefSBob Badour     poly8x8_t res64;
3587*80a68eefSBob Badour     __m128i a64, b64, c1, res, tmp, bmasked;
3588*80a68eefSBob Badour     int i;
3589*80a68eefSBob Badour     a64 = _pM128i(a);
3590*80a68eefSBob Badour     b64 = _pM128i(b);
3591*80a68eefSBob Badour     c1 = _mm_cmpeq_epi8 (a64,a64); //all ones 0xff....
3592*80a68eefSBob Badour     c1 = vshrq_n_u8(c1,7); //0x1
3593*80a68eefSBob Badour     bmasked = _mm_and_si128(b64, c1); //0x1
3594*80a68eefSBob Badour     res = vmulq_u8(a64, bmasked);
3595*80a68eefSBob Badour     for(i = 1; i<8; i++) {
3596*80a68eefSBob Badour         c1 = _mm_slli_epi16(c1,1); //shift mask left by 1, 16 bit shift is OK here
3597*80a68eefSBob Badour         bmasked = _mm_and_si128(b64, c1); //0x1
3598*80a68eefSBob Badour         tmp = vmulq_u8(a64, bmasked);
3599*80a68eefSBob Badour         res = _mm_xor_si128(res, tmp);
3600*80a68eefSBob Badour     }
3601*80a68eefSBob Badour     return64 (res);
3602*80a68eefSBob Badour }
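
//Editor's illustration (hypothetical helper, not part of the original header): what vmul_p8 above computes per
//lane - a carry-less (GF(2) polynomial) multiply, keeping only the low 8 bits. Each set bit of y contributes x
//shifted by that bit's position, and the partial products are combined with XOR instead of addition; the loop
//over the mask c1 in the SIMD code above processes exactly one such bit of b per iteration.
_NEON2SSE_INLINE uint8_t _neon2sse_example_mul_p8_lane(uint8_t x, uint8_t y)
{
    uint8_t res = 0;
    int i;
    for (i = 0; i < 8; i++) {
        if (y & (1 << i))
            res = (uint8_t)(res ^ (uint8_t)(x << i)); //carry-less partial product, truncated to 8 bits
    }
    return res;
}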
3603*80a68eefSBob Badour 
3604*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t vmulq_s8(int8x16_t a, int8x16_t b); // VMUL.I8 q0,q0,q0
3605*80a68eefSBob Badour _NEON2SSE_INLINE int8x16_t vmulq_s8(int8x16_t a, int8x16_t b) // VMUL.I8 q0,q0,q0
3606*80a68eefSBob Badour {
3607*80a68eefSBob Badour     // no 8 bit simd multiply, need to go to 16 bits
3608*80a68eefSBob Badour     //solution may not be optimal
3609*80a68eefSBob Badour     __m128i a16, b16, r16_1, r16_2;
3610*80a68eefSBob Badour     a16 = _MM_CVTEPI8_EPI16 (a); // SSE 4.1
3611*80a68eefSBob Badour     b16 = _MM_CVTEPI8_EPI16 (b); // SSE 4.1
3612*80a68eefSBob Badour     r16_1 = _mm_mullo_epi16 (a16, b16);
3613*80a68eefSBob Badour     //swap hi and low part of a and b to process the remaining data
3614*80a68eefSBob Badour     a16 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
3615*80a68eefSBob Badour     b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
3616*80a68eefSBob Badour     a16 = _MM_CVTEPI8_EPI16 (a16); // SSE 4.1
3617*80a68eefSBob Badour     b16 = _MM_CVTEPI8_EPI16 (b16); // SSE 4.1
3618*80a68eefSBob Badour 
3619*80a68eefSBob Badour     r16_2 = _mm_mullo_epi16 (a16, b16);
3620*80a68eefSBob Badour     r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*)mask8_16_even_odd); //return to 8 bit
3621*80a68eefSBob Badour     r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*)mask8_16_even_odd); //return to 8 bit
3622*80a68eefSBob Badour 
3623*80a68eefSBob Badour     return _mm_unpacklo_epi64(r16_1,  r16_2);
3624*80a68eefSBob Badour }
3625*80a68eefSBob Badour 
3626*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t   vmulq_s16(int16x8_t a, int16x8_t b); // VMUL.I16 q0,q0,q0
3627*80a68eefSBob Badour #define vmulq_s16 _mm_mullo_epi16
3628*80a68eefSBob Badour 
3629*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t   vmulq_s32(int32x4_t a, int32x4_t b); // VMUL.I32 q0,q0,q0
3630*80a68eefSBob Badour #define vmulq_s32 _MM_MULLO_EPI32 //SSE4.1
3631*80a68eefSBob Badour 
3632*80a68eefSBob Badour _NEON2SSESTORAGE float32x4_t vmulq_f32(float32x4_t a, float32x4_t b); // VMUL.F32 q0,q0,q0
3633*80a68eefSBob Badour #define vmulq_f32 _mm_mul_ps
3634*80a68eefSBob Badour 
3635*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vmulq_u8(uint8x16_t a, uint8x16_t b); // VMUL.I8 q0,q0,q0
3636*80a68eefSBob Badour _NEON2SSE_INLINE uint8x16_t vmulq_u8(uint8x16_t a, uint8x16_t b) // VMUL.I8 q0,q0,q0
3637*80a68eefSBob Badour {
3638*80a68eefSBob Badour     // no 8 bit simd multiply, need to go to 16 bits
3639*80a68eefSBob Badour     //solution may not be optimal
3640*80a68eefSBob Badour     __m128i maskff, a16, b16, r16_1, r16_2;
3641*80a68eefSBob Badour     maskff = _mm_set1_epi16(0xff);
3642*80a68eefSBob Badour     a16 = _MM_CVTEPU8_EPI16 (a); // SSE 4.1
3643*80a68eefSBob Badour     b16 = _MM_CVTEPU8_EPI16 (b); // SSE 4.1
3644*80a68eefSBob Badour     r16_1 = _mm_mullo_epi16 (a16, b16);
3645*80a68eefSBob Badour     r16_1 = _mm_and_si128(r16_1, maskff); //to avoid saturation
3646*80a68eefSBob Badour     //swap hi and low part of a and b to process the remaining data
3647*80a68eefSBob Badour     a16 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
3648*80a68eefSBob Badour     b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
3649*80a68eefSBob Badour     a16 = _MM_CVTEPI8_EPI16 (a16); // SSE 4.1
3650*80a68eefSBob Badour     b16 = _MM_CVTEPI8_EPI16 (b16); // SSE 4.1
3651*80a68eefSBob Badour 
3652*80a68eefSBob Badour     r16_2 = _mm_mullo_epi16 (a16, b16);
3653*80a68eefSBob Badour     r16_2 = _mm_and_si128(r16_2, maskff); //to avoid saturation
3654*80a68eefSBob Badour     return _mm_packus_epi16 (r16_1,  r16_2);
3655*80a68eefSBob Badour }
3656*80a68eefSBob Badour 
3657*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t   vmulq_u16(uint16x8_t a, uint16x8_t b); // VMUL.I16 q0,q0,q0
3658*80a68eefSBob Badour #define vmulq_u16 _mm_mullo_epi16
3659*80a68eefSBob Badour 
3660*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t   vmulq_u32(uint32x4_t a, uint32x4_t b); // VMUL.I32 q0,q0,q0
3661*80a68eefSBob Badour #define vmulq_u32 _MM_MULLO_EPI32 //SSE4.1
3662*80a68eefSBob Badour 
3663*80a68eefSBob Badour _NEON2SSESTORAGE poly8x16_t vmulq_p8(poly8x16_t a, poly8x16_t b); // VMUL.P8 q0,q0,q0
3664*80a68eefSBob Badour _NEON2SSE_INLINE poly8x16_t vmulq_p8(poly8x16_t a, poly8x16_t b)
3665*80a68eefSBob Badour {
3666*80a68eefSBob Badour     //may be optimized
3667*80a68eefSBob Badour     __m128i c1, res, tmp, bmasked;
3668*80a68eefSBob Badour     int i;
3669*80a68eefSBob Badour     c1 = _mm_cmpeq_epi8 (a,a); //all ones 0xff....
3670*80a68eefSBob Badour     c1 = vshrq_n_u8(c1,7); //0x1
3671*80a68eefSBob Badour     bmasked = _mm_and_si128(b, c1); //0x1
3672*80a68eefSBob Badour     res = vmulq_u8(a, bmasked);
3673*80a68eefSBob Badour     for(i = 1; i<8; i++) {
3674*80a68eefSBob Badour         c1 = _mm_slli_epi16(c1,1); //shift mask left by 1, 16 bit shift is OK here
3675*80a68eefSBob Badour         bmasked = _mm_and_si128(b, c1); //0x1
3676*80a68eefSBob Badour         tmp = vmulq_u8(a, bmasked);
3677*80a68eefSBob Badour         res = _mm_xor_si128(res, tmp);
3678*80a68eefSBob Badour     }
3679*80a68eefSBob Badour     return res;
3680*80a68eefSBob Badour }
3681*80a68eefSBob Badour 
3682*80a68eefSBob Badour //************************* Vector long multiply ***********************************
3683*80a68eefSBob Badour //****************************************************************************
3684*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vmull_s8(int8x8_t a, int8x8_t b); // VMULL.S8 q0,d0,d0
3685*80a68eefSBob Badour _NEON2SSE_INLINE int16x8_t vmull_s8(int8x8_t a, int8x8_t b) // VMULL.S8 q0,d0,d0
3686*80a68eefSBob Badour {
3687*80a68eefSBob Badour     //no 8 bit simd multiply, need to go to 16 bits
3688*80a68eefSBob Badour     __m128i a16, b16;
3689*80a68eefSBob Badour     a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); // SSE 4.1
3690*80a68eefSBob Badour     b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); // SSE 4.1
3691*80a68eefSBob Badour     return _mm_mullo_epi16 (a16, b16); //should fit into 16 bit
3692*80a68eefSBob Badour }
3693*80a68eefSBob Badour 
3694*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vmull_s16(int16x4_t a, int16x4_t b); // VMULL.S16 q0,d0,d0
3695*80a68eefSBob Badour _NEON2SSE_INLINE int32x4_t vmull_s16(int16x4_t a, int16x4_t b) // VMULL.S16 q0,d0,d0
3696*80a68eefSBob Badour {
3697*80a68eefSBob Badour #ifdef USE_SSE4
3698*80a68eefSBob Badour     __m128i a16, b16;
3699*80a68eefSBob Badour     a16 = _MM_CVTEPI16_EPI32 (_pM128i(a)); // SSE 4.1
3700*80a68eefSBob Badour     b16 = _MM_CVTEPI16_EPI32 (_pM128i(b)); // SSE 4.1
3701*80a68eefSBob Badour     return _MM_MULLO_EPI32 (a16, b16); // SSE 4.1
3702*80a68eefSBob Badour #else
3703*80a68eefSBob Badour     __m128i low, hi, a128,b128;
3704*80a68eefSBob Badour     a128 = _pM128i(a);
3705*80a68eefSBob Badour     b128 = _pM128i(b);
3706*80a68eefSBob Badour     low =  _mm_mullo_epi16(a128,b128);
3707*80a68eefSBob Badour     hi =   _mm_mulhi_epi16(a128,b128);
3708*80a68eefSBob Badour     return _mm_unpacklo_epi16(low,hi);
3709*80a68eefSBob Badour #endif
3710*80a68eefSBob Badour }
3711*80a68eefSBob Badour 
3712*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vmull_s32(int32x2_t a, int32x2_t b); // VMULL.S32 q0,d0,d0
3713*80a68eefSBob Badour _NEON2SSE_INLINE int64x2_t vmull_s32(int32x2_t a, int32x2_t b) // VMULL.S32 q0,d0,d0
3714*80a68eefSBob Badour {
3715*80a68eefSBob Badour     __m128i ab, ba, a128, b128;
3716*80a68eefSBob Badour     a128 = _pM128i(a);
3717*80a68eefSBob Badour     b128 = _pM128i(b);
3718*80a68eefSBob Badour     ab = _mm_unpacklo_epi32 (a128, b128); //a0, b0, a1,b1
3719*80a68eefSBob Badour     ba = _mm_unpacklo_epi32 (b128, a128); //b0, a0, b1,a1
3720*80a68eefSBob Badour     return _MM_MUL_EPI32(ab, ba); //uses the 1st and 3rd data lanes, the multiplication gives a 64 bit result
3721*80a68eefSBob Badour }
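//Note on the interleaving above: _MM_MUL_EPI32 (pmuldq) multiplies only 32-bit lanes 0 and 2 of its operands, so with
//    ab = {a0, b0, a1, b1}  and  ba = {b0, a0, b1, a1}
//the single instruction yields the two full 64-bit products a0*b0 and a1*b1, exactly the layout VMULL.S32 expects.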
3722*80a68eefSBob Badour 
3723*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vmull_u8(uint8x8_t a, uint8x8_t b); // VMULL.U8 q0,d0,d0
3724*80a68eefSBob Badour _NEON2SSE_INLINE uint16x8_t vmull_u8(uint8x8_t a, uint8x8_t b) // VMULL.U8 q0,d0,d0
3725*80a68eefSBob Badour {
3726*80a68eefSBob Badour     //no 8 bit simd multiply, need to go to 16 bits
3727*80a68eefSBob Badour     __m128i a16, b16;
3728*80a68eefSBob Badour     a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); // SSE 4.1
3729*80a68eefSBob Badour     b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); // SSE 4.1
3730*80a68eefSBob Badour     return _mm_mullo_epi16 (a16, b16); //should fit into 16 bit
3731*80a68eefSBob Badour }
3732*80a68eefSBob Badour 
3733*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vmull_u16(uint16x4_t a, uint16x4_t b); // VMULL.s16 q0,d0,d0
3734*80a68eefSBob Badour _NEON2SSE_INLINE uint32x4_t vmull_u16(uint16x4_t a, uint16x4_t b) // VMULL.s16 q0,d0,d0
3735*80a68eefSBob Badour {
3736*80a68eefSBob Badour #ifdef USE_SSE4
3737*80a68eefSBob Badour     __m128i a16, b16;
3738*80a68eefSBob Badour     a16 = _MM_CVTEPU16_EPI32 (_pM128i(a)); // SSE 4.1
3739*80a68eefSBob Badour     b16 = _MM_CVTEPU16_EPI32 (_pM128i(b)); // SSE 4.1
3740*80a68eefSBob Badour     return _MM_MULLO_EPI32 (a16, b16); // SSE 4.1
3741*80a68eefSBob Badour #else
3742*80a68eefSBob Badour     __m128i a128,b128,low, hi;
3743*80a68eefSBob Badour     a128 = _pM128i(a);
3744*80a68eefSBob Badour     b128 = _pM128i(b);
3745*80a68eefSBob Badour     low =  _mm_mullo_epi16(a128,b128);
3746*80a68eefSBob Badour     hi =   _mm_mulhi_epu16(a128,b128);
3747*80a68eefSBob Badour     return _mm_unpacklo_epi16(low,hi);
3748*80a68eefSBob Badour #endif
3749*80a68eefSBob Badour }
3750*80a68eefSBob Badour 
3751*80a68eefSBob Badour _NEON2SSESTORAGE uint64x2_t vmull_u32(uint32x2_t a, uint32x2_t b); // VMULL.U32 q0,d0,d0
3752*80a68eefSBob Badour _NEON2SSE_INLINE uint64x2_t vmull_u32(uint32x2_t a, uint32x2_t b) // VMULL.U32 q0,d0,d0
3753*80a68eefSBob Badour {
3754*80a68eefSBob Badour     //may not be optimal compared with a serial implementation
3755*80a68eefSBob Badour     __m128i ab, ba, a128, b128;
3756*80a68eefSBob Badour     a128 = _pM128i(a);
3757*80a68eefSBob Badour     b128 = _pM128i(b);
3758*80a68eefSBob Badour     ab = _mm_unpacklo_epi32 (a128, b128); //a0, b0, a1,b1
3759*80a68eefSBob Badour     ba = _mm_unpacklo_epi32 (b128, a128); //b0, a0, b1,a1
3760*80a68eefSBob Badour     return _mm_mul_epu32 (ab, ba); //uses the 1st and 3rd data lanes, the multiplication gives a 64 bit result
3761*80a68eefSBob Badour }
3762*80a68eefSBob Badour 
3763*80a68eefSBob Badour _NEON2SSESTORAGE poly16x8_t vmull_p8(poly8x8_t a, poly8x8_t b); // VMULL.P8 q0,d0,d0
3764*80a68eefSBob Badour _NEON2SSE_INLINE poly16x8_t vmull_p8(poly8x8_t a, poly8x8_t b)
3765*80a68eefSBob Badour {
3766*80a68eefSBob Badour     //may be optimized
3767*80a68eefSBob Badour     __m128i a128,b128, c1, a128_16, bmasked_16, res, tmp, bmasked;
3768*80a68eefSBob Badour     int i;
3769*80a68eefSBob Badour     a128 = _pM128i(a);
3770*80a68eefSBob Badour     b128 = _pM128i(b);
3771*80a68eefSBob Badour     c1 = _mm_cmpeq_epi8 (a128,a128); //all ones 0xff....
3772*80a68eefSBob Badour     c1 = vshrq_n_u8(c1,7); //0x1
3773*80a68eefSBob Badour     bmasked = _mm_and_si128(b128, c1); //0x1
3774*80a68eefSBob Badour 
3775*80a68eefSBob Badour     a128_16 = _MM_CVTEPU8_EPI16 (a128); // SSE 4.1
3776*80a68eefSBob Badour     bmasked_16 = _MM_CVTEPU8_EPI16 (bmasked); // SSE 4.1
3777*80a68eefSBob Badour     res = _mm_mullo_epi16 (a128_16, bmasked_16); //should fit into 16 bit
3778*80a68eefSBob Badour     for(i = 1; i<8; i++) {
3779*80a68eefSBob Badour         c1 = _mm_slli_epi16(c1,1); //shift mask left by 1, 16 bit shift is OK here
3780*80a68eefSBob Badour         bmasked = _mm_and_si128(b128, c1); //0x1
3781*80a68eefSBob Badour         bmasked_16 = _MM_CVTEPU8_EPI16 (bmasked); // SSE 4.1
3782*80a68eefSBob Badour         tmp = _mm_mullo_epi16 (a128_16, bmasked_16); //should fit into 16 bit, vmull_u8(a, bmasked);
3783*80a68eefSBob Badour         res = _mm_xor_si128(res, tmp);
3784*80a68eefSBob Badour     }
3785*80a68eefSBob Badour     return res;
3786*80a68eefSBob Badour }
3787*80a68eefSBob Badour 
3788*80a68eefSBob Badour //****************Vector saturating doubling long multiply **************************
3789*80a68eefSBob Badour //*****************************************************************
3790*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vqdmull_s16(int16x4_t a, int16x4_t b); // VQDMULL.S16 q0,d0,d0
3791*80a68eefSBob Badour _NEON2SSE_INLINE int32x4_t vqdmull_s16(int16x4_t a, int16x4_t b)
3792*80a68eefSBob Badour {
3793*80a68eefSBob Badour     //the serial solution may be faster due to saturation
3794*80a68eefSBob Badour     __m128i res;
3795*80a68eefSBob Badour     res = vmull_s16(a, b);
3796*80a68eefSBob Badour     return vqd_s32(res);
3797*80a68eefSBob Badour }
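//Scalar model of one vqdmull_s16 lane, for illustration only (hypothetical helper, not part of the library):
//the doubled 32-bit product can only overflow for a == b == -32768.
#if 0
static int32_t vqdmull_s16_lane_example(int16_t a, int16_t b)
{
    int64_t prod2 = 2 * (int64_t)a * (int64_t)b; //doubled widening product
    if (prod2 > 0x7fffffff) return 0x7fffffff;   //only reachable for a == b == -32768
    return (int32_t)prod2;                       //can never underflow INT32_MIN
}
#endif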
3798*80a68eefSBob Badour 
3799*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vqdmull_s32(int32x2_t a, int32x2_t b); // VQDMULL.S32 q0,d0,d0
3800*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmull_s32(int32x2_t a, int32x2_t b),_NEON2SSE_REASON_SLOW_SERIAL)
3801*80a68eefSBob Badour {
3802*80a68eefSBob Badour     //the serial solution may be faster due to saturation
3803*80a68eefSBob Badour     __m128i res;
3804*80a68eefSBob Badour     res = vmull_s32(a,b);
3805*80a68eefSBob Badour     return vqaddq_s64(res,res); //slow serial function!!!!
3806*80a68eefSBob Badour }
3807*80a68eefSBob Badour 
3808*80a68eefSBob Badour //********************* Vector multiply accumulate: vmla -> Vr[i] := Va[i] + Vb[i] * Vc[i]  ************************
3809*80a68eefSBob Badour //******************************************************************************************
3810*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vmla_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VMLA.I8 d0,d0,d0
3811*80a68eefSBob Badour _NEON2SSE_INLINE int8x8_t vmla_s8(int8x8_t a, int8x8_t b, int8x8_t c) // VMLA.I8 d0,d0,d0
3812*80a68eefSBob Badour {
3813*80a68eefSBob Badour     // no 8 bit x86 simd multiply, need to go to 16 bits,  and use the low 64 bits
3814*80a68eefSBob Badour     int8x8_t res64;
3815*80a68eefSBob Badour     __m128i b128, c128, res;
3816*80a68eefSBob Badour     b128 = _MM_CVTEPI8_EPI16 (_pM128i(b)); // SSE 4.1 use low 64 bits
3817*80a68eefSBob Badour     c128 = _MM_CVTEPI8_EPI16 (_pM128i(c)); // SSE 4.1 use low 64 bits
3818*80a68eefSBob Badour     res = _mm_mullo_epi16 (c128, b128);
3819*80a68eefSBob Badour     res  =  _mm_shuffle_epi8 (res, *(__m128i*) mask8_16_even_odd);
3820*80a68eefSBob Badour     res  = _mm_add_epi8 (res, _pM128i(a)); //use the low 64 bits
3821*80a68eefSBob Badour     return64(res);
3822*80a68eefSBob Badour }
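//Note on the shuffle above: after widening to 16 bits, the low byte of every 16-bit product already holds the
//NEON result (b[i]*c[i] modulo 256, the wrap-around behaviour). The shuffle with mask8_16_even_odd (defined
//earlier in this file) is used here to gather those low bytes before the final 8-bit add, e.g. a 16-bit partial
//product 0x1234 contributes only 0x34 to the packed result.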
3823*80a68eefSBob Badour 
3824*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vmla_s16(int16x4_t a,  int16x4_t b, int16x4_t c); // VMLA.I16 d0,d0,d0
3825*80a68eefSBob Badour _NEON2SSE_INLINE int16x4_t vmla_s16(int16x4_t a,  int16x4_t b, int16x4_t c)
3826*80a68eefSBob Badour {
3827*80a68eefSBob Badour     int16x4_t res64;
3828*80a68eefSBob Badour     return64(vmlaq_s16(_pM128i(a),_pM128i(b), _pM128i(c)));
3829*80a68eefSBob Badour }
3830*80a68eefSBob Badour 
3831*80a68eefSBob Badour 
3832*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vmla_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VMLA.I32 d0,d0,d0
3833*80a68eefSBob Badour _NEON2SSE_INLINE int32x2_t vmla_s32(int32x2_t a, int32x2_t b, int32x2_t c) // VMLA.I32 d0,d0,d0
3834*80a68eefSBob Badour {
3835*80a68eefSBob Badour     int32x2_t res64;
3836*80a68eefSBob Badour     __m128i res;
3837*80a68eefSBob Badour     res = _MM_MULLO_EPI32 (_pM128i(b), _pM128i(c)); //SSE4.1
3838*80a68eefSBob Badour     res = _mm_add_epi32 (res, _pM128i(a)); //use the low 64 bits
3839*80a68eefSBob Badour     return64(res);
3840*80a68eefSBob Badour }
3841*80a68eefSBob Badour 
3842*80a68eefSBob Badour _NEON2SSESTORAGE float32x2_t vmla_f32(float32x2_t a, float32x2_t b, float32x2_t c); // VMLA.F32 d0,d0,d0
3843*80a68eefSBob Badour _NEON2SSE_INLINE float32x2_t vmla_f32(float32x2_t a, float32x2_t b, float32x2_t c)
3844*80a68eefSBob Badour {
3845*80a68eefSBob Badour     //fma is coming soon, but right now:
3846*80a68eefSBob Badour     __m128 res;
3847*80a68eefSBob Badour     __m64_128 res64;
3848*80a68eefSBob Badour     res = _mm_mul_ps (_pM128(c), _pM128(b));
3849*80a68eefSBob Badour     res = _mm_add_ps (_pM128(a), res);
3850*80a68eefSBob Badour     _M64f(res64, res);
3851*80a68eefSBob Badour     return res64;
3852*80a68eefSBob Badour }
3853*80a68eefSBob Badour 
3854*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vmla_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VMLA.I8 d0,d0,d0
3855*80a68eefSBob Badour _NEON2SSE_INLINE uint8x8_t vmla_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) // VMLA.I8 d0,d0,d0
3856*80a68eefSBob Badour {
3857*80a68eefSBob Badour     // no 8 bit x86 simd multiply, need to go to 16 bits,  and use the low 64 bits
3858*80a68eefSBob Badour     uint8x8_t res64;
3859*80a68eefSBob Badour     __m128i mask, b128, c128, res;
3860*80a68eefSBob Badour     mask = _mm_set1_epi16(0xff);
3861*80a68eefSBob Badour     b128 = _MM_CVTEPU8_EPI16 (_pM128i(b)); // SSE 4.1 use low 64 bits
3862*80a68eefSBob Badour     c128 = _MM_CVTEPU8_EPI16 (_pM128i(c)); // SSE 4.1 use low 64 bits
3863*80a68eefSBob Badour     res = _mm_mullo_epi16 (c128, b128);
3864*80a68eefSBob Badour     res = _mm_and_si128(res, mask); //to avoid saturation
3865*80a68eefSBob Badour     res = _mm_packus_epi16 (res, res);
3866*80a68eefSBob Badour     res =  _mm_add_epi8 (res, _pM128i(a)); //use the low 64 bits
3867*80a68eefSBob Badour     return64(res);
3868*80a68eefSBob Badour }
3869*80a68eefSBob Badour 
3870*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vmla_u16(uint16x4_t a,  uint16x4_t b, uint16x4_t c); // VMLA.I16 d0,d0,d0
3871*80a68eefSBob Badour #define vmla_u16 vmla_s16
3872*80a68eefSBob Badour 
3873*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vmla_u32(uint32x2_t a,  uint32x2_t b, uint32x2_t c); // VMLA.I32 d0,d0,d0
3874*80a68eefSBob Badour #define vmla_u32 vmla_s32
3875*80a68eefSBob Badour 
3876*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VMLA.I8 q0,q0,q0
3877*80a68eefSBob Badour _NEON2SSE_INLINE int8x16_t vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c) // VMLA.I8 q0,q0,q0
3878*80a68eefSBob Badour {
3879*80a68eefSBob Badour     //solution may be not optimal
3880*80a68eefSBob Badour     // no 8 bit simd multiply, need to go to 16 bits
3881*80a68eefSBob Badour     __m128i b16, c16, r16_1, a_2,r16_2;
3882*80a68eefSBob Badour     b16 = _MM_CVTEPI8_EPI16 (b); // SSE 4.1
3883*80a68eefSBob Badour     c16 = _MM_CVTEPI8_EPI16 (c); // SSE 4.1
3884*80a68eefSBob Badour     r16_1 = _mm_mullo_epi16 (b16, c16);
3885*80a68eefSBob Badour     r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd); //return to 8 bits
3886*80a68eefSBob Badour     r16_1 = _mm_add_epi8 (r16_1, a);
3887*80a68eefSBob Badour     //swap hi and low part of a, b and c to process the remaining data
3888*80a68eefSBob Badour     a_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
3889*80a68eefSBob Badour     b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
3890*80a68eefSBob Badour     c16 = _mm_shuffle_epi32 (c, _SWAP_HI_LOW32);
3891*80a68eefSBob Badour     b16 = _MM_CVTEPI8_EPI16 (b16); // SSE 4.1
3892*80a68eefSBob Badour     c16 = _MM_CVTEPI8_EPI16 (c16); // SSE 4.1
3893*80a68eefSBob Badour 
3894*80a68eefSBob Badour     r16_2 = _mm_mullo_epi16 (b16, c16);
3895*80a68eefSBob Badour     r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*) mask8_16_even_odd);
3896*80a68eefSBob Badour     r16_2 = _mm_add_epi8(r16_2, a_2);
3897*80a68eefSBob Badour     return _mm_unpacklo_epi64(r16_1,r16_2);
3898*80a68eefSBob Badour }
3899*80a68eefSBob Badour 
3900*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VMLA.I16 q0,q0,q0
3901*80a68eefSBob Badour _NEON2SSE_INLINE int16x8_t vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c) // VMLA.I16 q0,q0,q0
3902*80a68eefSBob Badour {
3903*80a68eefSBob Badour     __m128i res;
3904*80a68eefSBob Badour     res = _mm_mullo_epi16 (c, b);
3905*80a68eefSBob Badour     return _mm_add_epi16 (res, a);
3906*80a68eefSBob Badour }
3907*80a68eefSBob Badour 
3908*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VMLA.I32 q0,q0,q0
3909*80a68eefSBob Badour _NEON2SSE_INLINE int32x4_t vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c) // VMLA.I32 q0,q0,q0
3910*80a68eefSBob Badour {
3911*80a68eefSBob Badour     __m128i res;
3912*80a68eefSBob Badour     res = _MM_MULLO_EPI32 (c,  b); //SSE4.1
3913*80a68eefSBob Badour     return _mm_add_epi32 (res, a);
3914*80a68eefSBob Badour }
3915*80a68eefSBob Badour 
3916*80a68eefSBob Badour _NEON2SSESTORAGE float32x4_t vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c); // VMLA.F32 q0,q0,q0
3917*80a68eefSBob Badour _NEON2SSE_INLINE float32x4_t vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c) // VMLA.F32 q0,q0,q0
3918*80a68eefSBob Badour {
3919*80a68eefSBob Badour     //fma is coming soon, but right now:
3920*80a68eefSBob Badour     __m128 res;
3921*80a68eefSBob Badour     res = _mm_mul_ps (c, b);
3922*80a68eefSBob Badour     return _mm_add_ps (a, res);
3923*80a68eefSBob Badour }
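//Note: the two-instruction sequence above rounds the product before the add, which matches the non-fused ARM
//VMLA.F32 behaviour; a true x86 FMA (where available) would fuse the operations and could differ in the last bit.
//A usage sketch, for illustration only (assumes the vdupq_n_f32 wrapper provided elsewhere in this file):
#if 0
static float32x4_t vmlaq_f32_usage_example(void)
{
    float32x4_t acc = vdupq_n_f32(1.0f);
    float32x4_t x   = vdupq_n_f32(2.0f);
    float32x4_t y   = vdupq_n_f32(3.0f);
    return vmlaq_f32(acc, x, y); //every lane becomes 1 + 2*3 = 7
}
#endif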
3924*80a68eefSBob Badour 
3925*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VMLA.I8 q0,q0,q0
3926*80a68eefSBob Badour _NEON2SSE_INLINE uint8x16_t vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) // VMLA.I8 q0,q0,q0
3927*80a68eefSBob Badour {
3928*80a68eefSBob Badour     //solution may be not optimal
3929*80a68eefSBob Badour     // no 8 bit simd multiply, need to go to 16 bits
3930*80a68eefSBob Badour     __m128i b16, c16, r16_1, a_2, r16_2;
3931*80a68eefSBob Badour     b16 = _MM_CVTEPU8_EPI16 (b); // SSE 4.1
3932*80a68eefSBob Badour     c16 = _MM_CVTEPU8_EPI16 (c); // SSE 4.1
3933*80a68eefSBob Badour     r16_1 = _mm_mullo_epi16 (b16, c16);
3934*80a68eefSBob Badour     r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd); //return to 8 bits
3935*80a68eefSBob Badour     r16_1 = _mm_add_epi8 (r16_1, a);
3936*80a68eefSBob Badour     //swap hi and low part of a, b and c to process the remaining data
3937*80a68eefSBob Badour     a_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
3938*80a68eefSBob Badour     b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
3939*80a68eefSBob Badour     c16 = _mm_shuffle_epi32 (c, _SWAP_HI_LOW32);
3940*80a68eefSBob Badour     b16 = _MM_CVTEPU8_EPI16 (b16); // SSE 4.1
3941*80a68eefSBob Badour     c16 = _MM_CVTEPU8_EPI16 (c16); // SSE 4.1
3942*80a68eefSBob Badour 
3943*80a68eefSBob Badour     r16_2 = _mm_mullo_epi16 (b16, c16);
3944*80a68eefSBob Badour     r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*) mask8_16_even_odd);
3945*80a68eefSBob Badour     r16_2 = _mm_add_epi8(r16_2, a_2);
3946*80a68eefSBob Badour     return _mm_unpacklo_epi64(r16_1,r16_2);
3947*80a68eefSBob Badour }
3948*80a68eefSBob Badour 
3949*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vmlaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VMLA.I16 q0,q0,q0
3950*80a68eefSBob Badour #define vmlaq_u16 vmlaq_s16
3951*80a68eefSBob Badour 
3952*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vmlaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VMLA.I32 q0,q0,q0
3953*80a68eefSBob Badour #define vmlaq_u32 vmlaq_s32
3954*80a68eefSBob Badour 
3955*80a68eefSBob Badour //**********************  Vector widening multiply accumulate (long multiply accumulate):
3956*80a68eefSBob Badour //                          vmlal -> Vr[i] := Va[i] + Vb[i] * Vc[i]  **************
3957*80a68eefSBob Badour //********************************************************************************************
3958*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VMLAL.S8 q0,d0,d0
3959*80a68eefSBob Badour _NEON2SSE_INLINE int16x8_t vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c) // VMLAL.S8 q0,d0,d0
3960*80a68eefSBob Badour {
3961*80a68eefSBob Badour     int16x8_t res;
3962*80a68eefSBob Badour     res = vmull_s8(b, c);
3963*80a68eefSBob Badour     return _mm_add_epi16 (res, a);
3964*80a68eefSBob Badour }
3965*80a68eefSBob Badour 
3966*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VMLAL.S16 q0,d0,d0
3967*80a68eefSBob Badour _NEON2SSE_INLINE int32x4_t vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) // VMLAL.S16 q0,d0,d0
3968*80a68eefSBob Badour {
3969*80a68eefSBob Badour     //may not be optimal compared with a serial implementation
3970*80a68eefSBob Badour     int32x4_t res;
3971*80a68eefSBob Badour     res = vmull_s16(b,  c);
3972*80a68eefSBob Badour     return _mm_add_epi32 (res, a);
3973*80a68eefSBob Badour }
3974*80a68eefSBob Badour 
3975*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VMLAL.S32 q0,d0,d0
3976*80a68eefSBob Badour _NEON2SSE_INLINE int64x2_t vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c) // VMLAL.S32 q0,d0,d0
3977*80a68eefSBob Badour {
3978*80a68eefSBob Badour     //may not be optimal compared with a serial implementation
3979*80a68eefSBob Badour     int64x2_t res;
3980*80a68eefSBob Badour     res = vmull_s32( b, c);
3981*80a68eefSBob Badour     return _mm_add_epi64 (res, a);
3982*80a68eefSBob Badour }
3983*80a68eefSBob Badour 
3984*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vmlal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VMLAL.U8 q0,d0,d0
3985*80a68eefSBob Badour _NEON2SSE_INLINE uint16x8_t vmlal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) // VMLAL.U8 q0,d0,d0
3986*80a68eefSBob Badour {
3987*80a68eefSBob Badour     uint16x8_t res;
3988*80a68eefSBob Badour     res = vmull_u8(b, c);
3989*80a68eefSBob Badour     return _mm_add_epi16 (res, a);
3990*80a68eefSBob Badour }
3991*80a68eefSBob Badour 
3992*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vmlal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VMLAL.s16 q0,d0,d0
3993*80a68eefSBob Badour _NEON2SSE_INLINE uint32x4_t vmlal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) // VMLAL.s16 q0,d0,d0
3994*80a68eefSBob Badour {
3995*80a68eefSBob Badour     //may not be optimal compared with a serial implementation
3996*80a68eefSBob Badour     uint32x4_t res;
3997*80a68eefSBob Badour     res = vmull_u16(b, c);
3998*80a68eefSBob Badour     return _mm_add_epi32 (res, a);
3999*80a68eefSBob Badour }
4000*80a68eefSBob Badour 
4001*80a68eefSBob Badour _NEON2SSESTORAGE uint64x2_t vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VMLAL.U32 q0,d0,d0
4002*80a68eefSBob Badour _NEON2SSE_INLINE uint64x2_t vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) // VMLAL.U32 q0,d0,d0
4003*80a68eefSBob Badour {
4004*80a68eefSBob Badour     //may not be optimal compared with a serial implementation
4005*80a68eefSBob Badour     uint64x2_t res;
4006*80a68eefSBob Badour     res = vmull_u32( b,c);
4007*80a68eefSBob Badour     return _mm_add_epi64 (res, a);
4008*80a68eefSBob Badour }
4009*80a68eefSBob Badour 
4010*80a68eefSBob Badour //******************** Vector multiply subtract: vmls -> Vr[i] := Va[i] - Vb[i] * Vc[i] ***************************************
4011*80a68eefSBob Badour //********************************************************************************************
4012*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vmls_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VMLS.I8 d0,d0,d0
4013*80a68eefSBob Badour _NEON2SSE_INLINE int8x8_t vmls_s8(int8x8_t a, int8x8_t b, int8x8_t c) // VMLS.I8 d0,d0,d0
4014*80a68eefSBob Badour {
4015*80a68eefSBob Badour     // no 8 bit simd multiply, need to go to 16 bits -  and use the low 64 bits
4016*80a68eefSBob Badour     int8x8_t res64;
4017*80a68eefSBob Badour     __m128i res;
4018*80a68eefSBob Badour     res64 = vmul_s8(b,c);
4019*80a68eefSBob Badour     res = _mm_sub_epi8 (_pM128i(a), _pM128i(res64));
4020*80a68eefSBob Badour     return64(res);
4021*80a68eefSBob Badour }
4022*80a68eefSBob Badour 
4023*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vmls_s16(int16x4_t a,  int16x4_t b, int16x4_t c); // VMLS.I16 d0,d0,d0
4024*80a68eefSBob Badour _NEON2SSE_INLINE int16x4_t vmls_s16(int16x4_t a,  int16x4_t b, int16x4_t c)
4025*80a68eefSBob Badour {
4026*80a68eefSBob Badour     int16x4_t res64;
4027*80a68eefSBob Badour     return64(vmlsq_s16(_pM128i(a),_pM128i(b), _pM128i(c)));
4028*80a68eefSBob Badour }
4029*80a68eefSBob Badour 
4030*80a68eefSBob Badour 
4031*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vmls_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VMLS.I32 d0,d0,d0
4032*80a68eefSBob Badour _NEON2SSE_INLINE int32x2_t vmls_s32(int32x2_t a, int32x2_t b, int32x2_t c) // VMLS.I32 d0,d0,d0
4033*80a68eefSBob Badour {
4034*80a68eefSBob Badour     int32x2_t res64;
4035*80a68eefSBob Badour     __m128i res;
4036*80a68eefSBob Badour     res = _MM_MULLO_EPI32 (_pM128i(c),_pM128i( b)); //SSE4.1
4037*80a68eefSBob Badour     res =  _mm_sub_epi32 (_pM128i(a),res); //use low 64 bits only
4038*80a68eefSBob Badour     return64(res);
4039*80a68eefSBob Badour }
4040*80a68eefSBob Badour 
4041*80a68eefSBob Badour _NEON2SSESTORAGE float32x2_t vmls_f32(float32x2_t a, float32x2_t b, float32x2_t c); // VMLS.F32 d0,d0,d0
4042*80a68eefSBob Badour _NEON2SSE_INLINE float32x2_t vmls_f32(float32x2_t a, float32x2_t b, float32x2_t c)
4043*80a68eefSBob Badour {
4044*80a68eefSBob Badour     __m128 res;
4045*80a68eefSBob Badour     __m64_128 res64;
4046*80a68eefSBob Badour     res = _mm_mul_ps (_pM128(c), _pM128(b));
4047*80a68eefSBob Badour     res = _mm_sub_ps (_pM128(a), res);
4048*80a68eefSBob Badour     _M64f(res64, res);
4049*80a68eefSBob Badour     return res64;
4050*80a68eefSBob Badour }
4051*80a68eefSBob Badour 
4052*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vmls_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VMLS.I8 d0,d0,d0
4053*80a68eefSBob Badour _NEON2SSE_INLINE uint8x8_t vmls_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c)
4054*80a68eefSBob Badour {
4055*80a68eefSBob Badour     // no 8 bit simd multiply, need to go to 16 bits -  and use the low 64 bits
4056*80a68eefSBob Badour     uint8x8_t res64;
4057*80a68eefSBob Badour     __m128i res;
4058*80a68eefSBob Badour     res64 = vmul_u8(b,c);
4059*80a68eefSBob Badour     res = _mm_sub_epi8 (_pM128i(a), _pM128i(res64));
4060*80a68eefSBob Badour     return64(res);
4061*80a68eefSBob Badour }
4062*80a68eefSBob Badour 
4063*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vmls_u16(uint16x4_t a,  uint16x4_t b, uint16x4_t c); // VMLS.I16 d0,d0,d0
4064*80a68eefSBob Badour #define vmls_u16 vmls_s16
4065*80a68eefSBob Badour 
4066*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vmls_u32(uint32x2_t a,  uint32x2_t b, uint32x2_t c); // VMLS.I32 d0,d0,d0
4067*80a68eefSBob Badour #define vmls_u32 vmls_s32
4068*80a68eefSBob Badour 
4069*80a68eefSBob Badour 
4070*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VMLS.I8 q0,q0,q0
4071*80a68eefSBob Badour _NEON2SSE_INLINE int8x16_t vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c) // VMLS.I8 q0,q0,q0
4072*80a68eefSBob Badour {
4073*80a68eefSBob Badour     //solution may be not optimal
4074*80a68eefSBob Badour     // no 8 bit simd multiply, need to go to 16 bits
4075*80a68eefSBob Badour     __m128i b16, c16, r16_1, a_2, r16_2;
4076*80a68eefSBob Badour     b16 = _MM_CVTEPI8_EPI16 (b); // SSE 4.1
4077*80a68eefSBob Badour     c16 = _MM_CVTEPI8_EPI16 (c); // SSE 4.1
4078*80a68eefSBob Badour     r16_1 = _mm_mullo_epi16 (b16, c16);
4079*80a68eefSBob Badour     r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd);
4080*80a68eefSBob Badour     r16_1 = _mm_sub_epi8 (a, r16_1);
4081*80a68eefSBob Badour     //swap hi and low part of a, b, c to process the remaining data
4082*80a68eefSBob Badour     a_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
4083*80a68eefSBob Badour     b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
4084*80a68eefSBob Badour     c16 = _mm_shuffle_epi32 (c, _SWAP_HI_LOW32);
4085*80a68eefSBob Badour     b16 = _MM_CVTEPI8_EPI16 (b16); // SSE 4.1
4086*80a68eefSBob Badour     c16 = _MM_CVTEPI8_EPI16 (c16); // SSE 4.1
4087*80a68eefSBob Badour 
4088*80a68eefSBob Badour     r16_2 = _mm_mullo_epi16 (b16, c16);
4089*80a68eefSBob Badour     r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*) mask8_16_even_odd);
4090*80a68eefSBob Badour     r16_2 = _mm_sub_epi8 (a_2, r16_2);
4091*80a68eefSBob Badour     return _mm_unpacklo_epi64(r16_1,r16_2);
4092*80a68eefSBob Badour }
4093*80a68eefSBob Badour 
4094*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VMLS.I16 q0,q0,q0
4095*80a68eefSBob Badour _NEON2SSE_INLINE int16x8_t vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c) // VMLS.I16 q0,q0,q0
4096*80a68eefSBob Badour {
4097*80a68eefSBob Badour     __m128i res;
4098*80a68eefSBob Badour     res = _mm_mullo_epi16 (c, b);
4099*80a68eefSBob Badour     return _mm_sub_epi16 (a, res);
4100*80a68eefSBob Badour }
4101*80a68eefSBob Badour 
4102*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VMLS.I32 q0,q0,q0
4103*80a68eefSBob Badour _NEON2SSE_INLINE int32x4_t vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c) // VMLS.I32 q0,q0,q0
4104*80a68eefSBob Badour {
4105*80a68eefSBob Badour     __m128i res;
4106*80a68eefSBob Badour     res = _MM_MULLO_EPI32 (c, b); //SSE4.1
4107*80a68eefSBob Badour     return _mm_sub_epi32 (a, res);
4108*80a68eefSBob Badour }
4109*80a68eefSBob Badour 
4110*80a68eefSBob Badour _NEON2SSESTORAGE float32x4_t vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c); // VMLS.F32 q0,q0,q0
4111*80a68eefSBob Badour _NEON2SSE_INLINE float32x4_t vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c) // VMLS.F32 q0,q0,q0
4112*80a68eefSBob Badour {
4113*80a68eefSBob Badour     __m128 res;
4114*80a68eefSBob Badour     res = _mm_mul_ps (c, b);
4115*80a68eefSBob Badour     return _mm_sub_ps (a, res);
4116*80a68eefSBob Badour }
4117*80a68eefSBob Badour 
4118*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vmlsq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VMLS.I8 q0,q0,q0
4119*80a68eefSBob Badour _NEON2SSE_INLINE uint8x16_t vmlsq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) // VMLS.I8 q0,q0,q0
4120*80a68eefSBob Badour {
4121*80a68eefSBob Badour     //solution may be not optimal
4122*80a68eefSBob Badour     // no 8 bit simd multiply, need to go to 16 bits
4123*80a68eefSBob Badour     __m128i b16, c16, r16_1, a_2, r16_2;
4124*80a68eefSBob Badour     b16 = _MM_CVTEPU8_EPI16 (b); // SSE 4.1
4125*80a68eefSBob Badour     c16 = _MM_CVTEPU8_EPI16 (c); // SSE 4.1
4126*80a68eefSBob Badour     r16_1 = _mm_mullo_epi16 (b16, c16);
4127*80a68eefSBob Badour     r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd); //return to 8 bits
4128*80a68eefSBob Badour     r16_1 = _mm_sub_epi8 (a, r16_1);
4129*80a68eefSBob Badour     //swap hi and low part of a, b and c to process the remaining data
4130*80a68eefSBob Badour     a_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
4131*80a68eefSBob Badour     b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
4132*80a68eefSBob Badour     c16 = _mm_shuffle_epi32 (c, _SWAP_HI_LOW32);
4133*80a68eefSBob Badour     b16 = _MM_CVTEPU8_EPI16 (b16); // SSE 4.1
4134*80a68eefSBob Badour     c16 = _MM_CVTEPU8_EPI16 (c16); // SSE 4.1
4135*80a68eefSBob Badour 
4136*80a68eefSBob Badour     r16_2 = _mm_mullo_epi16 (b16, c16);
4137*80a68eefSBob Badour     r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*) mask8_16_even_odd);
4138*80a68eefSBob Badour     r16_2 = _mm_sub_epi8(a_2, r16_2);
4139*80a68eefSBob Badour     return _mm_unpacklo_epi64(r16_1,r16_2);
4140*80a68eefSBob Badour }
4141*80a68eefSBob Badour 
4142*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vmlsq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VMLS.I16 q0,q0,q0
4143*80a68eefSBob Badour #define vmlsq_u16 vmlsq_s16
4144*80a68eefSBob Badour 
4145*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vmlsq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VMLS.I32 q0,q0,q0
4146*80a68eefSBob Badour #define vmlsq_u32 vmlsq_s32
4147*80a68eefSBob Badour 
4148*80a68eefSBob Badour //******************** Vector multiply subtract long (widening multiply subtract) ************************************
4149*80a68eefSBob Badour //*************************************************************************************************************
4150*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VMLSL.S8 q0,d0,d0
4151*80a68eefSBob Badour _NEON2SSE_INLINE int16x8_t vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c) // VMLSL.S8 q0,d0,d0
4152*80a68eefSBob Badour {
4153*80a68eefSBob Badour     int16x8_t res;
4154*80a68eefSBob Badour     res = vmull_s8(b, c);
4155*80a68eefSBob Badour     return _mm_sub_epi16 (a, res);
4156*80a68eefSBob Badour }
4157*80a68eefSBob Badour 
4158*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VMLSL.S16 q0,d0,d0
4159*80a68eefSBob Badour _NEON2SSE_INLINE int32x4_t vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c) // VMLSL.S16 q0,d0,d0
4160*80a68eefSBob Badour {
4161*80a68eefSBob Badour     //may not be optimal compared with a serial implementation
4162*80a68eefSBob Badour     int32x4_t res;
4163*80a68eefSBob Badour     res = vmull_s16(b,  c);
4164*80a68eefSBob Badour     return _mm_sub_epi32 (a, res);
4165*80a68eefSBob Badour }
4166*80a68eefSBob Badour 
4167*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VMLSL.S32 q0,d0,d0
4168*80a68eefSBob Badour _NEON2SSE_INLINE int64x2_t vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c) // VMLSL.S32 q0,d0,d0
4169*80a68eefSBob Badour {
4170*80a68eefSBob Badour     //may not be optimal compared with a serial implementation
4171*80a68eefSBob Badour     int64x2_t res;
4172*80a68eefSBob Badour     res = vmull_s32( b,c);
4173*80a68eefSBob Badour     return _mm_sub_epi64 (a, res);
4174*80a68eefSBob Badour }
4175*80a68eefSBob Badour 
4176*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vmlsl_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VMLSL.U8 q0,d0,d0
4177*80a68eefSBob Badour _NEON2SSE_INLINE uint16x8_t vmlsl_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) // VMLSL.U8 q0,d0,d0
4178*80a68eefSBob Badour {
4179*80a68eefSBob Badour     uint16x8_t res;
4180*80a68eefSBob Badour     res = vmull_u8(b, c);
4181*80a68eefSBob Badour     return _mm_sub_epi16 (a, res);
4182*80a68eefSBob Badour }
4183*80a68eefSBob Badour 
4184*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vmlsl_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VMLSL.s16 q0,d0,d0
4185*80a68eefSBob Badour _NEON2SSE_INLINE uint32x4_t vmlsl_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) // VMLSL.s16 q0,d0,d0
4186*80a68eefSBob Badour {
4187*80a68eefSBob Badour     //may not be optimal compared with a serial implementation
4188*80a68eefSBob Badour     uint32x4_t res;
4189*80a68eefSBob Badour     res = vmull_u16(b, c);
4190*80a68eefSBob Badour     return _mm_sub_epi32 (a, res);
4191*80a68eefSBob Badour }
4192*80a68eefSBob Badour 
4193*80a68eefSBob Badour _NEON2SSESTORAGE uint64x2_t vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VMLSL.U32 q0,d0,d0
4194*80a68eefSBob Badour _NEON2SSE_INLINE uint64x2_t vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) // VMLSL.U32 q0,d0,d0
4195*80a68eefSBob Badour {
4196*80a68eefSBob Badour     //may not be optimal compared with a serial implementation
4197*80a68eefSBob Badour     uint64x2_t res;
4198*80a68eefSBob Badour     res = vmull_u32( b,c);
4199*80a68eefSBob Badour     return _mm_sub_epi64 (a, res);
4200*80a68eefSBob Badour }
4201*80a68eefSBob Badour 
4202*80a68eefSBob Badour //******  Vector saturating doubling multiply high **********************
4203*80a68eefSBob Badour //*************************************************************************
4204*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vqdmulh_s16(int16x4_t a,  int16x4_t b); // VQDMULH.S16 d0,d0,d0
4205*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vqdmulh_s16(int16x4_t a,  int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
4206*80a68eefSBob Badour {
4207*80a68eefSBob Badour     int16x4_t res;
4208*80a68eefSBob Badour     int32_t a32, b32, i;
4209*80a68eefSBob Badour     for (i = 0; i<4; i++) {
4210*80a68eefSBob Badour         a32 = (int32_t) a.m64_i16[i];
4211*80a68eefSBob Badour         b32 = (int32_t) b.m64_i16[i];
4212*80a68eefSBob Badour         a32 = (a32 * b32) >> 15;
4213*80a68eefSBob Badour         res.m64_i16[i] = (a32 == 0x8000) ? 0x7fff : (int16_t) a32;
4214*80a68eefSBob Badour     }
4215*80a68eefSBob Badour     return res;
4216*80a68eefSBob Badour }
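//Worked example for the serial code above: (a32*b32) >> 15 is the same as (2*a*b) >> 16, so the only lane value
//that needs saturating is a == b == -32768 (0x8000): (-32768 * -32768) >> 15 gives 0x8000 (= +32768), which is
//clamped to 0x7fff.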
4217*80a68eefSBob Badour 
4218*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vqdmulh_s32(int32x2_t a, int32x2_t b); // VQDMULH.S32 d0,d0,d0
4219*80a68eefSBob Badour _NEON2SSE_INLINE int32x2_t vqdmulh_s32(int32x2_t a, int32x2_t b) // no multiply high 32 bit SIMD in IA32, so need to do some tricks, serial solution may be faster
4220*80a68eefSBob Badour {
4221*80a68eefSBob Badour     //may not be optimal compared with a serial solution
4222*80a68eefSBob Badour     int32x2_t res64;
4223*80a68eefSBob Badour     __m128i mask;
4224*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 static const uint32_t cmask32[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
4225*80a68eefSBob Badour     int64x2_t mul;
4226*80a68eefSBob Badour     mul = vmull_s32(a,b);
4227*80a68eefSBob Badour     mul = _mm_slli_epi64(mul,1); //double the result
4228*80a68eefSBob Badour     //at this point start treating 2 64-bit numbers as 4 32-bit
4229*80a68eefSBob Badour     mul = _mm_shuffle_epi32 (mul, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits
4230*80a68eefSBob Badour     mask = _mm_cmpeq_epi32 (mul, *(__m128i*)cmask32);
4231*80a68eefSBob Badour     mul = _mm_xor_si128 (mul,  mask); //res saturated for 0x80000000
4232*80a68eefSBob Badour     return64(mul);
4233*80a68eefSBob Badour }
4234*80a68eefSBob Badour 
4235*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vqdmulhq_s16(int16x8_t a, int16x8_t b); // VQDMULH.S16 q0,q0,q0
4236*80a68eefSBob Badour _NEON2SSE_INLINE int16x8_t vqdmulhq_s16(int16x8_t a, int16x8_t b) // VQDMULH.S16 q0,q0,q0
4237*80a68eefSBob Badour {
4238*80a68eefSBob Badour     __m128i res, res_lo, mask;
4239*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 static const uint16_t cmask[] = {0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000};
4240*80a68eefSBob Badour     res = _mm_mulhi_epi16 (a, b);
4241*80a68eefSBob Badour     res = _mm_slli_epi16 (res, 1); //double the result, don't care about saturation
4242*80a68eefSBob Badour     res_lo = _mm_mullo_epi16 (a, b);
4243*80a68eefSBob Badour     res_lo = _mm_srli_epi16(res_lo,15); //take the highest bit
4244*80a68eefSBob Badour     res = _mm_add_epi16(res, res_lo); //combine results
4245*80a68eefSBob Badour     mask = _mm_cmpeq_epi16 (res, *(__m128i*)cmask);
4246*80a68eefSBob Badour     return _mm_xor_si128 (res,  mask); //res saturated for 0x8000
4247*80a68eefSBob Badour }
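//Identity used above (per 16-bit lane, with p = (int32)a * (int32)b):
//    (2*p) >> 16  ==  2*(p >> 16) + ((p >> 15) & 1)
//_mm_mulhi_epi16 supplies p >> 16, the _mm_mullo_epi16 result shifted right by 15 supplies the missing bit,
//and the final cmpeq/xor pair turns the single overflow case 0x8000 into 0x7fff.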
4248*80a68eefSBob Badour 
4249*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vqdmulhq_s32(int32x4_t a, int32x4_t b); // VQDMULH.S32 q0,q0,q0
4250*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqdmulhq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
4251*80a68eefSBob Badour {
4252*80a68eefSBob Badour     // no 32-bit multiply-high SIMD in IA32, so this may not be optimal compared with a serial solution for the SSSE3 target
4253*80a68eefSBob Badour     __m128i ab, ba, mask, mul, mul1;
4254*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 static const uint32_t cmask32[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
4255*80a68eefSBob Badour     ab = _mm_unpacklo_epi32 (a, b); //a0, b0, a1,b1
4256*80a68eefSBob Badour     ba = _mm_unpacklo_epi32 (b, a); //b0, a0, b1,a1
4257*80a68eefSBob Badour     mul = _MM_MUL_EPI32(ab, ba); //uses the 1st and 3rd data lanes, the multiplication gives a 64 bit result
4258*80a68eefSBob Badour     mul = _mm_slli_epi64(mul,1); //double the result
4259*80a68eefSBob Badour     ab = _mm_unpackhi_epi32 (a, b); //a2, b2, a3,b3
4260*80a68eefSBob Badour     ba = _mm_unpackhi_epi32 (b, a); //b2, a2, b3,a3
4261*80a68eefSBob Badour     mul1 = _MM_MUL_EPI32(ab, ba); //uses the 1st and 3rd data lanes, the multiplication gives a 64 bit result
4262*80a68eefSBob Badour     mul1 = _mm_slli_epi64(mul1,1); //double the result
4263*80a68eefSBob Badour     mul = _mm_shuffle_epi32 (mul, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits
4264*80a68eefSBob Badour     mul1 = _mm_shuffle_epi32 (mul1, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits
4265*80a68eefSBob Badour     mul = _mm_unpacklo_epi64(mul, mul1);
4266*80a68eefSBob Badour     mask = _mm_cmpeq_epi32 (mul, *(__m128i*)cmask32);
4267*80a68eefSBob Badour     return _mm_xor_si128 (mul,  mask); //res saturated for 0x80000000
4268*80a68eefSBob Badour }
4269*80a68eefSBob Badour 
4270*80a68eefSBob Badour //********* Vector saturating rounding doubling multiply high ****************
4271*80a68eefSBob Badour //****************************************************************************
4272*80a68eefSBob Badour //If use _mm_mulhrs_xx functions  the result may differ from NEON one a little  due to different rounding rules and order
4273*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vqrdmulh_s16(int16x4_t a,  int16x4_t b); // VQRDMULH.S16 d0,d0,d0
4274*80a68eefSBob Badour _NEON2SSE_INLINE int16x4_t vqrdmulh_s16(int16x4_t a,  int16x4_t b)
4275*80a68eefSBob Badour {
4276*80a68eefSBob Badour     int16x4_t res64;
4277*80a68eefSBob Badour     return64(vqrdmulhq_s16(_pM128i(a), _pM128i(b)));
4278*80a68eefSBob Badour }
4279*80a68eefSBob Badour 
4280*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vqrdmulh_s32(int32x2_t a, int32x2_t b); // VQRDMULH.S32 d0,d0,d0
4281*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrdmulh_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
4282*80a68eefSBob Badour {
4283*80a68eefSBob Badour     //may not be optimal compared with a serial solution
4284*80a68eefSBob Badour     int32x2_t res64;
4285*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 static const uint32_t cmask32[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
4286*80a68eefSBob Badour     __m128i res_sat, mask, mask1;
4287*80a68eefSBob Badour     int64x2_t mul;
4288*80a68eefSBob Badour     mul = vmull_s32(a,b);
4289*80a68eefSBob Badour     res_sat = _mm_slli_epi64 (mul, 1); //double the result, saturation not considered
4290*80a68eefSBob Badour     mask1 = _mm_slli_epi64(res_sat, 32); //shift left then back right to
4291*80a68eefSBob Badour     mask1 = _mm_srli_epi64(mask1,31); //get the 31st bit, 1 or zero
4292*80a68eefSBob Badour     mul = _mm_add_epi32 (res_sat, mask1); //actual rounding
4293*80a68eefSBob Badour     //at this point start treating 2 64-bit numbers as 4 32-bit
4294*80a68eefSBob Badour     mul = _mm_shuffle_epi32 (mul, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits from each 64-bit
4295*80a68eefSBob Badour     mask = _mm_cmpeq_epi32 (mul, *(__m128i*)cmask32);
4296*80a68eefSBob Badour     mul = _mm_xor_si128 (mul,  mask); //res saturated for 0x80000000
4297*80a68eefSBob Badour     return64(mul);
4298*80a68eefSBob Badour }
4299*80a68eefSBob Badour 
4300*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vqrdmulhq_s16(int16x8_t a, int16x8_t b); // VQRDMULH.S16 q0,q0,q0
4301*80a68eefSBob Badour _NEON2SSE_INLINE int16x8_t vqrdmulhq_s16(int16x8_t a, int16x8_t b) // VQRDMULH.S16 q0,q0,q0
4302*80a68eefSBob Badour {
4303*80a68eefSBob Badour     __m128i mask, res;
4304*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 static const uint16_t cmask[] = {0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000};
4305*80a68eefSBob Badour     res = _mm_mulhrs_epi16 (a, b);
4306*80a68eefSBob Badour     mask = _mm_cmpeq_epi16 (res, *(__m128i*)cmask);
4307*80a68eefSBob Badour     return _mm_xor_si128 (res,  mask); //res saturated for 0x8000
4308*80a68eefSBob Badour }
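//Scalar reference for the NEON rounding (per 16-bit lane, with p = (int32)a * (int32)b):
//    vqrdmulh: saturate_s16((2*p + 0x8000) >> 16)
//_mm_mulhrs_epi16 computes ((p >> 14) + 1) >> 1, which performs the same round-to-nearest step for in-range
//results; the cmpeq/xor pair above then fixes the saturation case a == b == -32768.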
4309*80a68eefSBob Badour 
4310*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vqrdmulhq_s32(int32x4_t a, int32x4_t b); // VQRDMULH.S32 q0,q0,q0
4311*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqrdmulhq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
4312*80a68eefSBob Badour {
4313*80a68eefSBob Badour     // no 32-bit multiply-high SIMD in IA32, so this may not be optimal compared with a serial solution for the SSSE3 target
4314*80a68eefSBob Badour     __m128i ab, ba,  mask, mul, mul1, mask1;
4315*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 static const uint32_t cmask32[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
4316*80a68eefSBob Badour     ab = _mm_unpacklo_epi32 (a, b); //a0, b0, a1,b1
4317*80a68eefSBob Badour     ba = _mm_unpacklo_epi32 (b, a); //b0, a0, b1,a1
4318*80a68eefSBob Badour     mul = _MM_MUL_EPI32(ab, ba); //uses the 1st and 3rd data lanes, the multiplication gives a 64 bit result
4319*80a68eefSBob Badour     mul = _mm_slli_epi64 (mul, 1); //double the result, saturation not considered
4320*80a68eefSBob Badour     mask1 = _mm_slli_epi64(mul, 32); //shift left then back right to
4321*80a68eefSBob Badour     mask1 = _mm_srli_epi64(mask1,31); //get the 31st bit, 1 or zero
4322*80a68eefSBob Badour     mul = _mm_add_epi32 (mul, mask1); //actual rounding
4323*80a68eefSBob Badour 
4324*80a68eefSBob Badour     ab = _mm_unpackhi_epi32 (a, b); //a2, b2, a3,b3
4325*80a68eefSBob Badour     ba = _mm_unpackhi_epi32 (b, a); //b2, a2, b3,a3
4326*80a68eefSBob Badour     mul1 = _MM_MUL_EPI32(ab, ba); //uses the 1st and 3rd data lanes, the multiplication gives a 64 bit result
4327*80a68eefSBob Badour     mul1 = _mm_slli_epi64 (mul1, 1); //double the result, saturation not considered
4328*80a68eefSBob Badour     mask1 = _mm_slli_epi64(mul1, 32); //shift left then back right to
4329*80a68eefSBob Badour     mask1 = _mm_srli_epi64(mask1,31); //get the 31st bit, 1 or zero
4330*80a68eefSBob Badour     mul1 = _mm_add_epi32 (mul1, mask1); //actual rounding
4331*80a68eefSBob Badour     //at this point start treating 2 64-bit numbers as 4 32-bit
4332*80a68eefSBob Badour     mul = _mm_shuffle_epi32 (mul, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits from each 64-bit
4333*80a68eefSBob Badour     mul1 = _mm_shuffle_epi32 (mul1, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits from each 64-bit
4334*80a68eefSBob Badour     mul = _mm_unpacklo_epi64(mul, mul1);
4335*80a68eefSBob Badour     mask = _mm_cmpeq_epi32 (mul, *(__m128i*)cmask32);
4336*80a68eefSBob Badour     return _mm_xor_si128 (mul,  mask); //res saturated for 0x80000000
4337*80a68eefSBob Badour }
4338*80a68eefSBob Badour 
4339*80a68eefSBob Badour //*************Vector widening saturating doubling multiply accumulate (long saturating doubling multiply accumulate) *****
4340*80a68eefSBob Badour //*************************************************************************************************************************
4341*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vqdmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VQDMLAL.S16 q0,d0,d0
4342*80a68eefSBob Badour _NEON2SSE_INLINE int32x4_t vqdmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) // VQDMLAL.S16 q0,d0,d0
4343*80a68eefSBob Badour {
4344*80a68eefSBob Badour     //not an optimal SIMD solution, a serial one may be faster
4345*80a68eefSBob Badour     __m128i res32;
4346*80a68eefSBob Badour     res32 = vmull_s16(b,  c);
4347*80a68eefSBob Badour     res32 = vqd_s32(res32); //doubling & saturation; if no saturation were needed we could use _mm_slli_epi32 (res32, 1);
4348*80a68eefSBob Badour     return vqaddq_s32(res32, a); //saturation
4349*80a68eefSBob Badour }
4350*80a68eefSBob Badour 
4351*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vqdmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VQDMLAL.S32 q0,d0,d0
4352*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c),_NEON2SSE_REASON_SLOW_SERIAL)
4353*80a68eefSBob Badour {
4354*80a68eefSBob Badour     __m128i res64;
4355*80a68eefSBob Badour     res64 = vmull_s32(b,c);
4356*80a68eefSBob Badour     res64 = vqaddq_s64(res64, res64); //doubling & saturation; if no saturation were needed we could use _mm_slli_epi64 (res64, 1);
4357*80a68eefSBob Badour     return vqaddq_s64(res64, a); //saturation
4358*80a68eefSBob Badour }
4359*80a68eefSBob Badour 
4360*80a68eefSBob Badour //************************************************************************************
4361*80a68eefSBob Badour //******************  Vector subtract ***********************************************
4362*80a68eefSBob Badour //************************************************************************************
4363*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vsub_s8(int8x8_t a, int8x8_t b); // VSUB.I8 d0,d0,d0
4364*80a68eefSBob Badour _NEON2SSE_INLINE int8x8_t vsub_s8(int8x8_t a, int8x8_t b)
4365*80a68eefSBob Badour {
4366*80a68eefSBob Badour     int8x8_t res64;
4367*80a68eefSBob Badour     return64(_mm_sub_epi8(_pM128i(a),_pM128i(b)));
4368*80a68eefSBob Badour }
4369*80a68eefSBob Badour 
4370*80a68eefSBob Badour 
4371*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vsub_s16(int16x4_t a, int16x4_t b); // VSUB.I16 d0,d0,d0
4372*80a68eefSBob Badour _NEON2SSE_INLINE int16x4_t vsub_s16(int16x4_t a, int16x4_t b)
4373*80a68eefSBob Badour {
4374*80a68eefSBob Badour     int16x4_t res64;
4375*80a68eefSBob Badour     return64(_mm_sub_epi16(_pM128i(a),_pM128i(b)));
4376*80a68eefSBob Badour }
4377*80a68eefSBob Badour 
4378*80a68eefSBob Badour 
4379*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vsub_s32(int32x2_t a, int32x2_t b); // VSUB.I32 d0,d0,d0
4380*80a68eefSBob Badour _NEON2SSE_INLINE int32x2_t vsub_s32(int32x2_t a, int32x2_t b)
4381*80a68eefSBob Badour {
4382*80a68eefSBob Badour     int32x2_t res64;
4383*80a68eefSBob Badour     return64(_mm_sub_epi32(_pM128i(a),_pM128i(b)));
4384*80a68eefSBob Badour }
4385*80a68eefSBob Badour 
4386*80a68eefSBob Badour 
4387*80a68eefSBob Badour _NEON2SSESTORAGE int64x1_t vsub_s64(int64x1_t a,  int64x1_t b); // VSUB.I64 d0,d0,d0
4388*80a68eefSBob Badour _NEON2SSE_INLINE int64x1_t vsub_s64(int64x1_t a,  int64x1_t b)
4389*80a68eefSBob Badour {
4390*80a68eefSBob Badour     int64x1_t res64;
4391*80a68eefSBob Badour     res64.m64_i64[0] = a.m64_i64[0] - b.m64_i64[0];
4392*80a68eefSBob Badour     return res64;
4393*80a68eefSBob Badour }
4394*80a68eefSBob Badour 
4395*80a68eefSBob Badour 
4396*80a68eefSBob Badour _NEON2SSESTORAGE float32x2_t vsub_f32(float32x2_t a, float32x2_t b); // VSUB.F32 d0,d0,d0
4397*80a68eefSBob Badour _NEON2SSE_INLINE float32x2_t vsub_f32(float32x2_t a, float32x2_t b)
4398*80a68eefSBob Badour {
4399*80a68eefSBob Badour     float32x2_t res;
4400*80a68eefSBob Badour     res.m64_f32[0] = a.m64_f32[0] - b.m64_f32[0];
4401*80a68eefSBob Badour     res.m64_f32[1] = a.m64_f32[1] - b.m64_f32[1];
4402*80a68eefSBob Badour     return res;
4403*80a68eefSBob Badour }
4404*80a68eefSBob Badour 
4405*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vsub_u8(uint8x8_t a, uint8x8_t b); // VSUB.I8 d0,d0,d0
4406*80a68eefSBob Badour #define vsub_u8 vsub_s8
4407*80a68eefSBob Badour 
4408*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vsub_u16(uint16x4_t a, uint16x4_t b); // VSUB.I16 d0,d0,d0
4409*80a68eefSBob Badour #define vsub_u16 vsub_s16
4410*80a68eefSBob Badour 
4411*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vsub_u32(uint32x2_t a, uint32x2_t b); // VSUB.I32 d0,d0,d0
4412*80a68eefSBob Badour #define vsub_u32 vsub_s32
4413*80a68eefSBob Badour 
4414*80a68eefSBob Badour 
4415*80a68eefSBob Badour _NEON2SSESTORAGE uint64x1_t vsub_u64(uint64x1_t a,  uint64x1_t b); // VSUB.I64 d0,d0,d0
4416*80a68eefSBob Badour _NEON2SSE_INLINE uint64x1_t vsub_u64(uint64x1_t a,  uint64x1_t b)
4417*80a68eefSBob Badour {
4418*80a68eefSBob Badour     uint64x1_t res64;
4419*80a68eefSBob Badour     res64.m64_u64[0] = a.m64_u64[0] - b.m64_u64[0];
4420*80a68eefSBob Badour     return res64;
4421*80a68eefSBob Badour }
4422*80a68eefSBob Badour 
4423*80a68eefSBob Badour 
4424*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t   vsubq_s8(int8x16_t a, int8x16_t b); // VSUB.I8 q0,q0,q0
4425*80a68eefSBob Badour #define vsubq_s8 _mm_sub_epi8
4426*80a68eefSBob Badour 
4427*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t   vsubq_s16(int16x8_t a, int16x8_t b); // VSUB.I16 q0,q0,q0
4428*80a68eefSBob Badour #define vsubq_s16 _mm_sub_epi16
4429*80a68eefSBob Badour 
4430*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t   vsubq_s32(int32x4_t a, int32x4_t b); // VSUB.I32 q0,q0,q0
4431*80a68eefSBob Badour #define vsubq_s32 _mm_sub_epi32
4432*80a68eefSBob Badour 
4433*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t   vsubq_s64(int64x2_t a, int64x2_t b); // VSUB.I64 q0,q0,q0
4434*80a68eefSBob Badour #define vsubq_s64 _mm_sub_epi64
4435*80a68eefSBob Badour 
4436*80a68eefSBob Badour _NEON2SSESTORAGE float32x4_t vsubq_f32(float32x4_t a, float32x4_t b); // VSUB.F32 q0,q0,q0
4437*80a68eefSBob Badour #define vsubq_f32 _mm_sub_ps
4438*80a68eefSBob Badour 
4439*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t   vsubq_u8(uint8x16_t a, uint8x16_t b); // VSUB.I8 q0,q0,q0
4440*80a68eefSBob Badour #define vsubq_u8 _mm_sub_epi8
4441*80a68eefSBob Badour 
4442*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t   vsubq_u16(uint16x8_t a, uint16x8_t b); // VSUB.I16 q0,q0,q0
4443*80a68eefSBob Badour #define vsubq_u16 _mm_sub_epi16
4444*80a68eefSBob Badour 
4445*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t   vsubq_u32(uint32x4_t a, uint32x4_t b); // VSUB.I32 q0,q0,q0
4446*80a68eefSBob Badour #define vsubq_u32 _mm_sub_epi32
4447*80a68eefSBob Badour 
4448*80a68eefSBob Badour _NEON2SSESTORAGE uint64x2_t   vsubq_u64(uint64x2_t a, uint64x2_t b); // VSUB.I64 q0,q0,q0
4449*80a68eefSBob Badour #define vsubq_u64 _mm_sub_epi64
4450*80a68eefSBob Badour 
4451*80a68eefSBob Badour //***************Vector long subtract: vsubl -> Vr[i]:=Va[i]-Vb[i] ******************
4452*80a68eefSBob Badour //***********************************************************************************
4453*80a68eefSBob Badour //Va, Vb have equal lane sizes, result is a 128 bit vector of lanes that are twice the width.
4454*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vsubl_s8(int8x8_t a, int8x8_t b); // VSUBL.S8 q0,d0,d0
4455*80a68eefSBob Badour _NEON2SSE_INLINE int16x8_t vsubl_s8(int8x8_t a, int8x8_t b) // VSUBL.S8 q0,d0,d0
4456*80a68eefSBob Badour {
4457*80a68eefSBob Badour     __m128i a16, b16;
4458*80a68eefSBob Badour     a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE4.1,
4459*80a68eefSBob Badour     b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1,
4460*80a68eefSBob Badour     return _mm_sub_epi16 (a16, b16);
4461*80a68eefSBob Badour }
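//Usage sketch, for illustration only (assumes the vdup_n_s8 wrapper provided elsewhere in this file): the
//operands are widened before subtracting, so the result cannot wrap around.
#if 0
static int16x8_t vsubl_s8_usage_example(void)
{
    int8x8_t x = vdup_n_s8(-128);
    int8x8_t y = vdup_n_s8(127);
    return vsubl_s8(x, y); //every lane becomes -255, which would not fit in int8_t
}
#endif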
4462*80a68eefSBob Badour 
4463*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vsubl_s16(int16x4_t a, int16x4_t b); // VSUBL.S16 q0,d0,d0
4464*80a68eefSBob Badour _NEON2SSE_INLINE int32x4_t vsubl_s16(int16x4_t a, int16x4_t b) // VSUBL.S16 q0,d0,d0
4465*80a68eefSBob Badour {
4466*80a68eefSBob Badour     __m128i a32, b32;
4467*80a68eefSBob Badour     a32 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE4.1
4468*80a68eefSBob Badour     b32 = _MM_CVTEPI16_EPI32 (_pM128i(b)); //SSE4.1,
4469*80a68eefSBob Badour     return _mm_sub_epi32 (a32, b32);
4470*80a68eefSBob Badour }
4471*80a68eefSBob Badour 
4472*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vsubl_s32(int32x2_t a, int32x2_t b); // VSUBL.S32 q0,d0,d0
4473*80a68eefSBob Badour _NEON2SSE_INLINE int64x2_t vsubl_s32(int32x2_t a, int32x2_t b) // VSUBL.S32 q0,d0,d0
4474*80a68eefSBob Badour {
4475*80a68eefSBob Badour     //may not be optimal
4476*80a68eefSBob Badour     __m128i a64, b64;
4477*80a68eefSBob Badour     a64 = _MM_CVTEPI32_EPI64 (_pM128i(a)); //SSE4.1
4478*80a68eefSBob Badour     b64 = _MM_CVTEPI32_EPI64 (_pM128i(b)); //SSE4.1,
4479*80a68eefSBob Badour     return _mm_sub_epi64 (a64, b64);
4480*80a68eefSBob Badour }
4481*80a68eefSBob Badour 
4482*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vsubl_u8(uint8x8_t a, uint8x8_t b); // VSUBL.U8 q0,d0,d0
4483*80a68eefSBob Badour _NEON2SSE_INLINE uint16x8_t vsubl_u8(uint8x8_t a, uint8x8_t b) // VSUBL.U8 q0,d0,d0
4484*80a68eefSBob Badour {
4485*80a68eefSBob Badour     __m128i a16, b16;
4486*80a68eefSBob Badour     a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE4.1,
4487*80a68eefSBob Badour     b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1,
4488*80a68eefSBob Badour     return _mm_sub_epi16 (a16, b16);
4489*80a68eefSBob Badour }
4490*80a68eefSBob Badour 
4491*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vsubl_u16(uint16x4_t a, uint16x4_t b); // VSUBL.s16 q0,d0,d0
4492*80a68eefSBob Badour _NEON2SSE_INLINE uint32x4_t vsubl_u16(uint16x4_t a, uint16x4_t b) // VSUBL.s16 q0,d0,d0
4493*80a68eefSBob Badour {
4494*80a68eefSBob Badour     __m128i a32, b32;
4495*80a68eefSBob Badour     a32 = _MM_CVTEPU16_EPI32 (_pM128i(a)); //SSE4.1
4496*80a68eefSBob Badour     b32 = _MM_CVTEPU16_EPI32 (_pM128i(b)); //SSE4.1,
4497*80a68eefSBob Badour     return _mm_sub_epi32 (a32, b32);
4498*80a68eefSBob Badour }
4499*80a68eefSBob Badour 
4500*80a68eefSBob Badour _NEON2SSESTORAGE uint64x2_t vsubl_u32(uint32x2_t a, uint32x2_t b); // VSUBL.U32 q0,d0,d0
4501*80a68eefSBob Badour _NEON2SSE_INLINE uint64x2_t vsubl_u32(uint32x2_t a, uint32x2_t b) // VSUBL.U32 q0,d0,d0
4502*80a68eefSBob Badour {
4503*80a68eefSBob Badour     //may not be optimal
4504*80a68eefSBob Badour     __m128i a64, b64;
4505*80a68eefSBob Badour     a64 = _MM_CVTEPU32_EPI64 (_pM128i(a)); //SSE4.1
4506*80a68eefSBob Badour     b64 = _MM_CVTEPU32_EPI64 (_pM128i(b)); //SSE4.1,
4507*80a68eefSBob Badour     return _mm_sub_epi64 (a64, b64);
4508*80a68eefSBob Badour }
4509*80a68eefSBob Badour 
4510*80a68eefSBob Badour //***************** Vector wide subtract: vsub -> Vr[i]:=Va[i]-Vb[i] **********************************
4511*80a68eefSBob Badour //*****************************************************************************************************
4512*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vsubw_s8(int16x8_t a, int8x8_t b); // VSUBW.S8 q0,q0,d0
4513*80a68eefSBob Badour _NEON2SSE_INLINE int16x8_t vsubw_s8(int16x8_t a, int8x8_t b) // VSUBW.S8 q0,q0,d0
4514*80a68eefSBob Badour {
4515*80a68eefSBob Badour     __m128i b16;
4516*80a68eefSBob Badour     b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1,
4517*80a68eefSBob Badour     return _mm_sub_epi16 (a, b16);
4518*80a68eefSBob Badour }
4519*80a68eefSBob Badour 
4520*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vsubw_s16(int32x4_t a, int16x4_t b); // VSUBW.S16 q0,q0,d0
4521*80a68eefSBob Badour _NEON2SSE_INLINE int32x4_t vsubw_s16(int32x4_t a, int16x4_t b) // VSUBW.S16 q0,q0,d0
4522*80a68eefSBob Badour {
4523*80a68eefSBob Badour     __m128i b32;
4524*80a68eefSBob Badour     b32 = _MM_CVTEPI16_EPI32 (_pM128i(b)); //SSE4.1,
4525*80a68eefSBob Badour     return _mm_sub_epi32 (a, b32);
4526*80a68eefSBob Badour }
4527*80a68eefSBob Badour 
4528*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vsubw_s32(int64x2_t a, int32x2_t b); // VSUBW.S32 q0,q0,d0
4529*80a68eefSBob Badour _NEON2SSE_INLINE int64x2_t vsubw_s32(int64x2_t a, int32x2_t b) // VSUBW.S32 q0,q0,d0
4530*80a68eefSBob Badour {
4531*80a68eefSBob Badour     __m128i b64;
4532*80a68eefSBob Badour     b64 = _MM_CVTEPI32_EPI64 (_pM128i(b)); //SSE4.1
4533*80a68eefSBob Badour     return _mm_sub_epi64 (a, b64);
4534*80a68eefSBob Badour }
4535*80a68eefSBob Badour 
4536*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vsubw_u8(uint16x8_t a, uint8x8_t b); // VSUBW.U8 q0,q0,d0
4537*80a68eefSBob Badour _NEON2SSE_INLINE uint16x8_t vsubw_u8(uint16x8_t a, uint8x8_t b) // VSUBW.U8 q0,q0,d0
4538*80a68eefSBob Badour {
4539*80a68eefSBob Badour     __m128i b16;
4540*80a68eefSBob Badour     b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1,
4541*80a68eefSBob Badour     return _mm_sub_epi16 (a, b16);
4542*80a68eefSBob Badour }
4543*80a68eefSBob Badour 
4544*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vsubw_u16(uint32x4_t a, uint16x4_t b); // VSUBW.s16 q0,q0,d0
4545*80a68eefSBob Badour _NEON2SSE_INLINE uint32x4_t vsubw_u16(uint32x4_t a, uint16x4_t b) // VSUBW.s16 q0,q0,d0
4546*80a68eefSBob Badour {
4547*80a68eefSBob Badour     __m128i b32;
4548*80a68eefSBob Badour     b32 = _MM_CVTEPU16_EPI32 (_pM128i(b)); //SSE4.1,
4549*80a68eefSBob Badour     return _mm_sub_epi32 (a, b32);
4550*80a68eefSBob Badour }
4551*80a68eefSBob Badour 
4552*80a68eefSBob Badour _NEON2SSESTORAGE uint64x2_t vsubw_u32(uint64x2_t a, uint32x2_t b); // VSUBW.U32 q0,q0,d0
4553*80a68eefSBob Badour _NEON2SSE_INLINE uint64x2_t vsubw_u32(uint64x2_t a, uint32x2_t b) // VSUBW.U32 q0,q0,d0
4554*80a68eefSBob Badour {
4555*80a68eefSBob Badour     __m128i b64;
4556*80a68eefSBob Badour     b64 = _MM_CVTEPU32_EPI64 (_pM128i(b)); //SSE4.1
4557*80a68eefSBob Badour     return _mm_sub_epi64 (a, b64);
4558*80a68eefSBob Badour }
4559*80a68eefSBob Badour 
4560*80a68eefSBob Badour //************************Vector saturating subtract *********************************
4561*80a68eefSBob Badour //*************************************************************************************
4562*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vqsub_s8(int8x8_t a, int8x8_t b); // VQSUB.S8 d0,d0,d0
4563*80a68eefSBob Badour _NEON2SSE_INLINE int8x8_t vqsub_s8(int8x8_t a, int8x8_t b)
4564*80a68eefSBob Badour {
4565*80a68eefSBob Badour     int8x8_t res64;
4566*80a68eefSBob Badour     return64(_mm_subs_epi8(_pM128i(a),_pM128i(b)));
4567*80a68eefSBob Badour }
4568*80a68eefSBob Badour 
4569*80a68eefSBob Badour 
4570*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vqsub_s16(int16x4_t a, int16x4_t b); // VQSUB.S16 d0,d0,d0
4571*80a68eefSBob Badour _NEON2SSE_INLINE int16x4_t vqsub_s16(int16x4_t a, int16x4_t b)
4572*80a68eefSBob Badour {
4573*80a68eefSBob Badour     int16x4_t res64;
4574*80a68eefSBob Badour     return64(_mm_subs_epi16(_pM128i(a),_pM128i(b)));
4575*80a68eefSBob Badour }
4576*80a68eefSBob Badour 
4577*80a68eefSBob Badour 
4578*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vqsub_s32(int32x2_t a,  int32x2_t b); // VQSUB.S32 d0,d0,d0
4579*80a68eefSBob Badour _NEON2SSE_INLINE int32x2_t vqsub_s32(int32x2_t a,  int32x2_t b)
4580*80a68eefSBob Badour {
4581*80a68eefSBob Badour     int32x2_t res64;
4582*80a68eefSBob Badour     return64(vqsubq_s32(_pM128i(a), _pM128i(b)));
4583*80a68eefSBob Badour }
4584*80a68eefSBob Badour 
4585*80a68eefSBob Badour 
4586*80a68eefSBob Badour _NEON2SSESTORAGE int64x1_t vqsub_s64(int64x1_t a, int64x1_t b); // VQSUB.S64 d0,d0,d0
4587*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqsub_s64(int64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL) //no optimal SIMD solution
4588*80a68eefSBob Badour {
4589*80a68eefSBob Badour     uint64x1_t res;
4590*80a68eefSBob Badour     uint64_t a64,b64;
4591*80a68eefSBob Badour     a64 = a.m64_u64[0];
4592*80a68eefSBob Badour     b64 = b.m64_u64[0];
4593*80a68eefSBob Badour     res.m64_u64[0] = a64 - b64;
4594*80a68eefSBob Badour 
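    //a64 is reused below to hold the saturation value: (a64 >> 63) + 0x7fffffffffffffff gives INT64_MAX for a >= 0 and INT64_MIN for a < 0;
    //its sign bit still matches the original a, so the usual signed overflow test (a ^ b) & (a ^ res) < 0 stays valid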
4595*80a68eefSBob Badour     a64 =  (a64 >> 63) + (~_SIGNBIT64);
4596*80a68eefSBob Badour     if ((int64_t)((a64 ^ b64) & (a64 ^ res.m64_u64[0])) < 0) {
4597*80a68eefSBob Badour         res.m64_u64[0] = a64;
4598*80a68eefSBob Badour     }
4599*80a68eefSBob Badour     return res;
4600*80a68eefSBob Badour }
4601*80a68eefSBob Badour 
4602*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vqsub_u8(uint8x8_t a, uint8x8_t b); // VQSUB.U8 d0,d0,d0
4603*80a68eefSBob Badour _NEON2SSE_INLINE uint8x8_t vqsub_u8(uint8x8_t a, uint8x8_t b)
4604*80a68eefSBob Badour {
4605*80a68eefSBob Badour     uint8x8_t res64;
4606*80a68eefSBob Badour     return64(_mm_subs_epu8(_pM128i(a),_pM128i(b)));
4607*80a68eefSBob Badour }
4608*80a68eefSBob Badour 
4609*80a68eefSBob Badour 
4610*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vqsub_u16(uint16x4_t a, uint16x4_t b); // VQSUB.s16 d0,d0,d0
4611*80a68eefSBob Badour _NEON2SSE_INLINE uint16x4_t vqsub_u16(uint16x4_t a, uint16x4_t b)
4612*80a68eefSBob Badour {
4613*80a68eefSBob Badour     uint16x4_t res64;
4614*80a68eefSBob Badour     return64(_mm_subs_epu16(_pM128i(a),_pM128i(b)));
4615*80a68eefSBob Badour }
4616*80a68eefSBob Badour 
4617*80a68eefSBob Badour 
4618*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vqsub_u32(uint32x2_t a,  uint32x2_t b); // VQSUB.U32 d0,d0,d0
4619*80a68eefSBob Badour _NEON2SSE_INLINE uint32x2_t vqsub_u32(uint32x2_t a,  uint32x2_t b)
4620*80a68eefSBob Badour {
4621*80a68eefSBob Badour     uint32x2_t res64;
4622*80a68eefSBob Badour     return64(vqsubq_u32(_pM128i(a), _pM128i(b)));
4623*80a68eefSBob Badour }
4624*80a68eefSBob Badour 
4625*80a68eefSBob Badour 
4626*80a68eefSBob Badour _NEON2SSESTORAGE uint64x1_t vqsub_u64(uint64x1_t a, uint64x1_t b); // VQSUB.U64 d0,d0,d0
4627*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqsub_u64(uint64x1_t a, uint64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
4628*80a68eefSBob Badour {
4629*80a68eefSBob Badour     uint64x1_t res;
4630*80a68eefSBob Badour     uint64_t a64, b64;
4631*80a68eefSBob Badour     a64 = _Ui64(a);
4632*80a68eefSBob Badour     b64 = _Ui64(b);
4633*80a68eefSBob Badour     if (a64 > b64) {
4634*80a68eefSBob Badour         res.m64_u64[0] = a64 - b64;
4635*80a68eefSBob Badour     } else {
4636*80a68eefSBob Badour         res.m64_u64[0] = 0;
4637*80a68eefSBob Badour     }
4638*80a68eefSBob Badour     return res;
4639*80a68eefSBob Badour }
4640*80a68eefSBob Badour 
4641*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t   vqsubq_s8(int8x16_t a, int8x16_t b); // VQSUB.S8 q0,q0,q0
4642*80a68eefSBob Badour #define vqsubq_s8 _mm_subs_epi8
4643*80a68eefSBob Badour 
4644*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t   vqsubq_s16(int16x8_t a, int16x8_t b); // VQSUB.S16 q0,q0,q0
4645*80a68eefSBob Badour #define vqsubq_s16 _mm_subs_epi16
4646*80a68eefSBob Badour 
4647*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vqsubq_s32(int32x4_t a, int32x4_t b); // VQSUB.S32 q0,q0,q0
4648*80a68eefSBob Badour _NEON2SSE_INLINE int32x4_t vqsubq_s32(int32x4_t a, int32x4_t b)
4649*80a68eefSBob Badour {
4650*80a68eefSBob Badour     //no corresponding x86 SIMD solution, special tricks are necessary. Overflow is possible only if a and b have opposite signs and the result's sign differs from a's
4651*80a68eefSBob Badour     __m128i c7fffffff, res, res_sat, res_xor_a, b_xor_a;
4652*80a68eefSBob Badour     c7fffffff = _mm_set1_epi32(0x7fffffff);
4653*80a68eefSBob Badour     res = _mm_sub_epi32(a, b);
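    //res_sat is the per-lane saturation value: (a >> 31) + 0x7fffffff yields 0x7fffffff for a >= 0 and 0x80000000 for a < 0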
4654*80a68eefSBob Badour     res_sat = _mm_srli_epi32(a, 31);
4655*80a68eefSBob Badour     res_sat = _mm_add_epi32(res_sat, c7fffffff);
4656*80a68eefSBob Badour     res_xor_a = _mm_xor_si128(res, a);
4657*80a68eefSBob Badour     b_xor_a = _mm_xor_si128(b, a);
4658*80a68eefSBob Badour     res_xor_a = _mm_and_si128(b_xor_a, res_xor_a);
4659*80a68eefSBob Badour     res_xor_a = _mm_srai_epi32(res_xor_a,31); //propagate the sign bit: all ones if the overflow condition holds, all zeros otherwise
4660*80a68eefSBob Badour     res_sat = _mm_and_si128(res_xor_a, res_sat);
4661*80a68eefSBob Badour     res = _mm_andnot_si128(res_xor_a, res);
4662*80a68eefSBob Badour     return _mm_or_si128(res, res_sat);
4663*80a68eefSBob Badour }
4664*80a68eefSBob Badour 
4665*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vqsubq_s64(int64x2_t a, int64x2_t b); // VQSUB.S64 q0,q0,q0
4666*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqsubq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) //no optimal SIMD solution
4667*80a68eefSBob Badour {
4668*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 int64_t atmp[2], btmp[2];
4669*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 uint64_t res[2];
4670*80a68eefSBob Badour     _mm_store_si128((__m128i*)atmp, a);
4671*80a68eefSBob Badour     _mm_store_si128((__m128i*)btmp, b);
4672*80a68eefSBob Badour     res[0] = atmp[0] - btmp[0];
4673*80a68eefSBob Badour     res[1] = atmp[1] - btmp[1];
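    //signed overflow occurred if a and b have different signs and the raw difference's sign differs from a's;
    //in that case saturate to INT64_MAX (a >= 0) or INT64_MIN (a < 0), which is exactly (a >> 63) ^ ~_SIGNBIT64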
4674*80a68eefSBob Badour     if (((res[0] ^ atmp[0]) & _SIGNBIT64) && ((atmp[0] ^ btmp[0]) & _SIGNBIT64)) {
4675*80a68eefSBob Badour         res[0] = (atmp[0] >> 63) ^ ~_SIGNBIT64;
4676*80a68eefSBob Badour     }
4677*80a68eefSBob Badour     if (((res[1] ^ atmp[1]) & _SIGNBIT64) && ((atmp[1] ^ btmp[1]) & _SIGNBIT64)) {
4678*80a68eefSBob Badour         res[1] = (atmp[1] >> 63) ^ ~_SIGNBIT64;
4679*80a68eefSBob Badour     }
4680*80a68eefSBob Badour     return _mm_load_si128((__m128i*)res);
4681*80a68eefSBob Badour }
4682*80a68eefSBob Badour 
4683*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t   vqsubq_u8(uint8x16_t a, uint8x16_t b); // VQSUB.U8 q0,q0,q0
4684*80a68eefSBob Badour #define vqsubq_u8 _mm_subs_epu8
4685*80a68eefSBob Badour 
4686*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t   vqsubq_u16(uint16x8_t a, uint16x8_t b); // VQSUB.s16 q0,q0,q0
4687*80a68eefSBob Badour #define vqsubq_u16 _mm_subs_epu16
4688*80a68eefSBob Badour 
4689*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vqsubq_u32(uint32x4_t a, uint32x4_t b); // VQSUB.U32 q0,q0,q0
4690*80a68eefSBob Badour _NEON2SSE_INLINE uint32x4_t vqsubq_u32(uint32x4_t a, uint32x4_t b) // VQSUB.U32 q0,q0,q0
4691*80a68eefSBob Badour {
4692*80a68eefSBob Badour     __m128i min, mask, sub;
4693*80a68eefSBob Badour     min = _MM_MIN_EPU32(a, b); //SSE4.1
4694*80a68eefSBob Badour     mask = _mm_cmpeq_epi32 (min,  b);
4695*80a68eefSBob Badour     sub = _mm_sub_epi32 (a, b);
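    //mask is all ones where b <= a (no underflow); AND-ing it with the raw difference zeroes the lanes that would underflow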
4696*80a68eefSBob Badour     return _mm_and_si128 ( sub, mask);
4697*80a68eefSBob Badour }
4698*80a68eefSBob Badour 
4699*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL); // VQSUB.U64 q0,q0,q0
4700*80a68eefSBob Badour #ifdef USE_SSE4
4701*80a68eefSBob Badour     _NEON2SSE_INLINE uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b)
4702*80a68eefSBob Badour     {
4703*80a68eefSBob Badour         __m128i c80000000, subb, suba, cmp, sub;
4704*80a68eefSBob Badour         c80000000 = _mm_set_epi32 (0x80000000, 0x0, 0x80000000, 0x0);
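        //each 64-bit lane of c80000000 is 0x8000000000000000; subtracting it from both operands maps unsigned
        //order onto signed order, so the signed _mm_cmpgt_epi64 below emulates the unsigned a > b test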
4705*80a68eefSBob Badour         sub  = _mm_sub_epi64 (a, b);
4706*80a68eefSBob Badour         suba = _mm_sub_epi64 (a, c80000000);
4707*80a68eefSBob Badour         subb = _mm_sub_epi64 (b, c80000000);
4708*80a68eefSBob Badour         cmp = _mm_cmpgt_epi64 ( suba, subb); //no unsigned comparison, need to go to signed, SSE4.2!!!
4709*80a68eefSBob Badour         return _mm_and_si128 (sub, cmp); //saturation
4710*80a68eefSBob Badour     }
4711*80a68eefSBob Badour #else
4712*80a68eefSBob Badour     _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
4713*80a68eefSBob Badour     {
4714*80a68eefSBob Badour         _NEON2SSE_ALIGN_16 uint64_t atmp[2], btmp[2], res[2];
4715*80a68eefSBob Badour         _mm_store_si128((__m128i*)atmp, a);
4716*80a68eefSBob Badour         _mm_store_si128((__m128i*)btmp, b);
4717*80a68eefSBob Badour         res[0] = (atmp[0] > btmp[0]) ? atmp[0] -  btmp[0] : 0;
4718*80a68eefSBob Badour         res[1] = (atmp[1] > btmp[1]) ? atmp[1] -  btmp[1] : 0;
4719*80a68eefSBob Badour         return _mm_load_si128((__m128i*)(res));
4720*80a68eefSBob Badour     }
4721*80a68eefSBob Badour #endif
4722*80a68eefSBob Badour 
4723*80a68eefSBob Badour //**********Vector halving subtract Vr[i]:=(Va[i]-Vb[i])>>1  ******************************************************
4724*80a68eefSBob Badour //****************************************************************
4725*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vhsub_s8(int8x8_t a, int8x8_t b); // VHSUB.S8 d0,d0,d0
4726*80a68eefSBob Badour _NEON2SSE_INLINE int8x8_t vhsub_s8(int8x8_t a, int8x8_t b) // VHSUB.S8 d0,d0,d0
4727*80a68eefSBob Badour {
4728*80a68eefSBob Badour     //no 8-bit arithmetic shift available and internal overflow is possible, so widen to 16 bits
4729*80a68eefSBob Badour     int8x8_t res64;
4730*80a68eefSBob Badour     __m128i r16;
4731*80a68eefSBob Badour     int8x8_t r;
4732*80a68eefSBob Badour     r = vsub_s8 (a, b);
4733*80a68eefSBob Badour     r16 = _MM_CVTEPI8_EPI16 (_pM128i(r)); //SSE 4.1
4734*80a68eefSBob Badour     r16 = _mm_srai_epi16 (r16, 1); //SSE2
4735*80a68eefSBob Badour     r16 =  _mm_packs_epi16 (r16,r16); //use low 64 bits
4736*80a68eefSBob Badour     return64(r16);
4737*80a68eefSBob Badour }
4738*80a68eefSBob Badour 
4739*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vhsub_s16(int16x4_t a,  int16x4_t b); // VHSUB.S16 d0,d0,d0
4740*80a68eefSBob Badour _NEON2SSE_INLINE int16x4_t vhsub_s16(int16x4_t a,  int16x4_t b)
4741*80a68eefSBob Badour {
4742*80a68eefSBob Badour     int16x4_t res64;
4743*80a68eefSBob Badour     return64(vhsubq_s16(_pM128i(a), _pM128i(b)));
4744*80a68eefSBob Badour }
4745*80a68eefSBob Badour 
4746*80a68eefSBob Badour 
4747*80a68eefSBob Badour 
4748*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vhsub_s32(int32x2_t a,  int32x2_t b); // VHSUB.S32 d0,d0,d0
4749*80a68eefSBob Badour _NEON2SSE_INLINE int32x2_t vhsub_s32(int32x2_t a,  int32x2_t b)
4750*80a68eefSBob Badour {
4751*80a68eefSBob Badour     int32x2_t res64;
4752*80a68eefSBob Badour     return64(vhsubq_s32(_pM128i(a), _pM128i(b)));
4753*80a68eefSBob Badour }
4754*80a68eefSBob Badour 
4755*80a68eefSBob Badour 
4756*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vhsub_u8(uint8x8_t a,  uint8x8_t b); // VHSUB.U8 d0,d0,d0
4757*80a68eefSBob Badour _NEON2SSE_INLINE uint8x8_t vhsub_u8(uint8x8_t a,  uint8x8_t b)
4758*80a68eefSBob Badour {
4759*80a68eefSBob Badour     uint8x8_t res64;
4760*80a68eefSBob Badour     return64(vhsubq_u8(_pM128i(a), _pM128i(b)));
4761*80a68eefSBob Badour }
4762*80a68eefSBob Badour 
4763*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vhsub_u16(uint16x4_t a,  uint16x4_t b); // VHSUB.s16 d0,d0,d0
4764*80a68eefSBob Badour _NEON2SSE_INLINE uint16x4_t vhsub_u16(uint16x4_t a,  uint16x4_t b)
4765*80a68eefSBob Badour {
4766*80a68eefSBob Badour     uint16x4_t res64;
4767*80a68eefSBob Badour     return64(vhsubq_u16(_pM128i(a), _pM128i(b)));
4768*80a68eefSBob Badour }
4769*80a68eefSBob Badour 
4770*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vhsub_u32(uint32x2_t a,  uint32x2_t b); // VHSUB.U32 d0,d0,d0
4771*80a68eefSBob Badour _NEON2SSE_INLINE uint32x2_t vhsub_u32(uint32x2_t a,  uint32x2_t b)
4772*80a68eefSBob Badour {
4773*80a68eefSBob Badour     uint32x2_t res64;
4774*80a68eefSBob Badour     return64(vhsubq_u32(_pM128i(a), _pM128i(b)));
4775*80a68eefSBob Badour }
4776*80a68eefSBob Badour 
4777*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t vhsubq_s8(int8x16_t a, int8x16_t b); // VHSUB.S8 q0,q0,q0
4778*80a68eefSBob Badour _NEON2SSE_INLINE int8x16_t vhsubq_s8(int8x16_t a, int8x16_t b) // VHSUB.S8 q0,q0,q0
4779*80a68eefSBob Badour {
4780*80a68eefSBob Badour     //need to deal with the possibility of internal overflow
4781*80a68eefSBob Badour     __m128i c128, au,bu;
4782*80a68eefSBob Badour     c128 = _mm_set1_epi8((int8_t)128);
4783*80a68eefSBob Badour     au = _mm_add_epi8( a, c128);
4784*80a68eefSBob Badour     bu = _mm_add_epi8( b, c128);
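    //adding 0x80 biases both operands into the unsigned range without changing their difference,
    //so the unsigned halving subtract below produces the same bit pattern as the signed (a - b) >> 1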
4785*80a68eefSBob Badour     return vhsubq_u8(au,bu);
4786*80a68eefSBob Badour }
4787*80a68eefSBob Badour 
4788*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vhsubq_s16(int16x8_t a, int16x8_t b); // VHSUB.S16 q0,q0,q0
4789*80a68eefSBob Badour _NEON2SSE_INLINE int16x8_t vhsubq_s16(int16x8_t a, int16x8_t b) // VHSUB.S16 q0,q0,q0
4790*80a68eefSBob Badour {
4791*80a68eefSBob Badour     //need to deal with the possibility of internal overflow
4792*80a68eefSBob Badour     __m128i c8000, au,bu;
4793*80a68eefSBob Badour     c8000 = _mm_set1_epi16((int16_t)0x8000);
4794*80a68eefSBob Badour     au = _mm_add_epi16( a, c8000);
4795*80a68eefSBob Badour     bu = _mm_add_epi16( b, c8000);
4796*80a68eefSBob Badour     return vhsubq_u16(au,bu);
4797*80a68eefSBob Badour }
4798*80a68eefSBob Badour 
4799*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vhsubq_s32(int32x4_t a, int32x4_t b); // VHSUB.S32 q0,q0,q0
4800*80a68eefSBob Badour _NEON2SSE_INLINE int32x4_t vhsubq_s32(int32x4_t a, int32x4_t b) // VHSUB.S32 q0,q0,q0
4801*80a68eefSBob Badour {
4802*80a68eefSBob Badour     //need to deal with the possibility of internal overflow
4803*80a68eefSBob Badour     __m128i a2, b2,r, b_1;
4804*80a68eefSBob Badour     a2 = _mm_srai_epi32 (a,1);
4805*80a68eefSBob Badour     b2 = _mm_srai_epi32 (b,1);
4806*80a68eefSBob Badour     r = _mm_sub_epi32 (a2, b2);
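    //pre-shifting drops the low bits, making the result one too large exactly when a is even and b is odd,
    //i.e. when the low bit of (~a & b) is set; subtract that bit as a correction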
4807*80a68eefSBob Badour     b_1 = _mm_andnot_si128(a, b); //!a and b
4808*80a68eefSBob Badour     b_1 = _mm_slli_epi32 (b_1,31);
4809*80a68eefSBob Badour     b_1 = _mm_srli_epi32 (b_1,31); //0 or 1, last b bit
4810*80a68eefSBob Badour     return _mm_sub_epi32(r,b_1);
4811*80a68eefSBob Badour }
4812*80a68eefSBob Badour 
4813*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vhsubq_u8(uint8x16_t a, uint8x16_t b); // VHSUB.U8 q0,q0,q0
4814*80a68eefSBob Badour _NEON2SSE_INLINE uint8x16_t vhsubq_u8(uint8x16_t a, uint8x16_t b) // VHSUB.U8 q0,q0,q0
4815*80a68eefSBob Badour {
4816*80a68eefSBob Badour     __m128i avg;
4817*80a68eefSBob Badour     avg = _mm_avg_epu8 (a, b);
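    //_mm_avg_epu8 computes (a + b + 1) >> 1, and a - ((a + b + 1) >> 1) equals (a - b) >> 1,
    //so one rounding average plus a subtract implements the halving subtract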
4818*80a68eefSBob Badour     return _mm_sub_epi8(a, avg);
4819*80a68eefSBob Badour }
4820*80a68eefSBob Badour 
4821*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vhsubq_u16(uint16x8_t a, uint16x8_t b); // VHSUB.s16 q0,q0,q0
4822*80a68eefSBob Badour _NEON2SSE_INLINE uint16x8_t vhsubq_u16(uint16x8_t a, uint16x8_t b) // VHSUB.s16 q0,q0,q0
4823*80a68eefSBob Badour {
4824*80a68eefSBob Badour     __m128i avg;
4825*80a68eefSBob Badour     avg = _mm_avg_epu16 (a, b);
4826*80a68eefSBob Badour     return _mm_sub_epi16(a, avg);
4827*80a68eefSBob Badour }
4828*80a68eefSBob Badour 
4829*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vhsubq_u32(uint32x4_t a, uint32x4_t b); // VHSUB.U32 q0,q0,q0
4830*80a68eefSBob Badour _NEON2SSE_INLINE uint32x4_t vhsubq_u32(uint32x4_t a, uint32x4_t b) // VHSUB.U32 q0,q0,q0
4831*80a68eefSBob Badour {
4832*80a68eefSBob Badour     //need to deal with the possibility of internal overflow
4833*80a68eefSBob Badour     __m128i a2, b2,r, b_1;
4834*80a68eefSBob Badour     a2 = _mm_srli_epi32 (a,1);
4835*80a68eefSBob Badour     b2 = _mm_srli_epi32 (b,1);
4836*80a68eefSBob Badour     r = _mm_sub_epi32 (a2, b2);
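    //same borrow correction as in vhsubq_s32: the pre-shifted difference is one too large exactly when a is even and b is odd, which the low bit of (~a & b) detects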
4837*80a68eefSBob Badour     b_1 = _mm_andnot_si128(a, b); //!a and b
4838*80a68eefSBob Badour     b_1 = _mm_slli_epi32 (b_1,31);
4839*80a68eefSBob Badour     b_1 = _mm_srli_epi32 (b_1,31); //0 or 1, last b bit
4840*80a68eefSBob Badour     return _mm_sub_epi32(r,b_1);
4841*80a68eefSBob Badour }
4842*80a68eefSBob Badour 
4843*80a68eefSBob Badour //******* Vector subtract high half (truncated) ** ************
4844*80a68eefSBob Badour //************************************************************
4845*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vsubhn_s16(int16x8_t a, int16x8_t b); // VSUBHN.I16 d0,q0,q0
4846*80a68eefSBob Badour _NEON2SSE_INLINE int8x8_t vsubhn_s16(int16x8_t a, int16x8_t b) // VSUBHN.I16 d0,q0,q0
4847*80a68eefSBob Badour {
4848*80a68eefSBob Badour     int8x8_t res64;
4849*80a68eefSBob Badour     __m128i sum, sum8;
4850*80a68eefSBob Badour     sum = _mm_sub_epi16 (a, b);
4851*80a68eefSBob Badour     sum8 = _mm_srai_epi16 (sum, 8);
4852*80a68eefSBob Badour     sum8 = _mm_packs_epi16(sum8,sum8);
4853*80a68eefSBob Badour     return64(sum8);
4854*80a68eefSBob Badour }
4855*80a68eefSBob Badour 
4856*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vsubhn_s32(int32x4_t a, int32x4_t b); // VSUBHN.I32 d0,q0,q0
4857*80a68eefSBob Badour _NEON2SSE_INLINE int16x4_t vsubhn_s32(int32x4_t a, int32x4_t b) // VSUBHN.I32 d0,q0,q0
4858*80a68eefSBob Badour {
4859*80a68eefSBob Badour     int16x4_t res64;
4860*80a68eefSBob Badour     __m128i sum, sum16;
4861*80a68eefSBob Badour     sum = _mm_sub_epi32 (a, b);
4862*80a68eefSBob Badour     sum16 = _mm_srai_epi32 (sum, 16);
4863*80a68eefSBob Badour     sum16 = _mm_packs_epi32(sum16,sum16);
4864*80a68eefSBob Badour     return64(sum16);
4865*80a68eefSBob Badour }
4866*80a68eefSBob Badour 
4867*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vsubhn_s64(int64x2_t a, int64x2_t b); // VSUBHN.I64 d0,q0,q0
4868*80a68eefSBob Badour _NEON2SSE_INLINE int32x2_t vsubhn_s64(int64x2_t a, int64x2_t b)
4869*80a68eefSBob Badour {
4870*80a68eefSBob Badour     int32x2_t res64;
4871*80a68eefSBob Badour     __m128i sub;
4872*80a68eefSBob Badour     sub = _mm_sub_epi64 (a, b);
4873*80a68eefSBob Badour     sub = _mm_shuffle_epi32(sub,  1 | (3 << 2) | (0 << 4) | (2 << 6));
4874*80a68eefSBob Badour     return64(sub);
4875*80a68eefSBob Badour }
4876*80a68eefSBob Badour 
4877*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vsubhn_u16(uint16x8_t a, uint16x8_t b); // VSUBHN.I16 d0,q0,q0
4878*80a68eefSBob Badour _NEON2SSE_INLINE uint8x8_t vsubhn_u16(uint16x8_t a, uint16x8_t b) // VSUBHN.I16 d0,q0,q0
4879*80a68eefSBob Badour {
4880*80a68eefSBob Badour     uint8x8_t res64;
4881*80a68eefSBob Badour     __m128i sum, sum8;
4882*80a68eefSBob Badour     sum = _mm_sub_epi16 (a, b);
4883*80a68eefSBob Badour     sum8 = _mm_srli_epi16 (sum, 8);
4884*80a68eefSBob Badour     sum8 =  _mm_packus_epi16(sum8,sum8);
4885*80a68eefSBob Badour     return64(sum8);
4886*80a68eefSBob Badour }
4887*80a68eefSBob Badour 
4888*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vsubhn_u32(uint32x4_t a, uint32x4_t b); // VSUBHN.I32 d0,q0,q0
4889*80a68eefSBob Badour _NEON2SSE_INLINE uint16x4_t vsubhn_u32(uint32x4_t a, uint32x4_t b) // VSUBHN.I32 d0,q0,q0
4890*80a68eefSBob Badour {
4891*80a68eefSBob Badour     uint16x4_t res64;
4892*80a68eefSBob Badour      __m128i sum, sum16;
4893*80a68eefSBob Badour     sum = _mm_sub_epi32 (a, b);
4894*80a68eefSBob Badour     sum16 = _mm_srli_epi32 (sum, 16);
4895*80a68eefSBob Badour #ifdef USE_SSE4
4896*80a68eefSBob Badour     sum16 =  _MM_PACKUS1_EPI32(sum16);
4897*80a68eefSBob Badour #else
4898*80a68eefSBob Badour     sum16  = _mm_shuffle_epi8 (sum16, *(__m128i*) mask8_32_even_odd); //go to 16 bits
4899*80a68eefSBob Badour #endif
4900*80a68eefSBob Badour     return64(sum16);
4901*80a68eefSBob Badour }
4902*80a68eefSBob Badour 
4903*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vsubhn_u64(uint64x2_t a, uint64x2_t b); // VSUBHN.I64 d0,q0,q0
4904*80a68eefSBob Badour #define vsubhn_u64 vsubhn_s64
4905*80a68eefSBob Badour 
4906*80a68eefSBob Badour //************ Vector rounding subtract high half *********************
4907*80a68eefSBob Badour //*********************************************************************
4908*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vrsubhn_s16(int16x8_t a, int16x8_t b); // VRSUBHN.I16 d0,q0,q0
4909*80a68eefSBob Badour _NEON2SSE_INLINE int8x8_t vrsubhn_s16(int16x8_t a, int16x8_t b) // VRSUBHN.I16 d0,q0,q0
4910*80a68eefSBob Badour {
4911*80a68eefSBob Badour     int8x8_t res64;
4912*80a68eefSBob Badour     __m128i sub, mask1;
4913*80a68eefSBob Badour     sub = _mm_sub_epi16 (a, b);
4914*80a68eefSBob Badour     mask1 = _mm_slli_epi16(sub, 9); //shift left then back right to
4915*80a68eefSBob Badour     mask1 = _mm_srli_epi16(mask1, 15); //get  7-th bit 1 or zero
4916*80a68eefSBob Badour     sub = _mm_srai_epi16 (sub, 8); //get high half
4917*80a68eefSBob Badour     sub = _mm_add_epi16 (sub, mask1); //actual rounding
4918*80a68eefSBob Badour     sub =  _mm_packs_epi16 (sub, sub);
4919*80a68eefSBob Badour     return64(sub);
4920*80a68eefSBob Badour }
4921*80a68eefSBob Badour 
4922*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vrsubhn_s32(int32x4_t a, int32x4_t b); // VRSUBHN.I32 d0,q0,q0
4923*80a68eefSBob Badour _NEON2SSE_INLINE int16x4_t vrsubhn_s32(int32x4_t a, int32x4_t b) // VRSUBHN.I32 d0,q0,q0
4924*80a68eefSBob Badour {
4925*80a68eefSBob Badour     //SIMD may not be optimal; a serial version may be faster
4926*80a68eefSBob Badour     int16x4_t res64;
4927*80a68eefSBob Badour     __m128i sub, mask1;
4928*80a68eefSBob Badour     sub = _mm_sub_epi32 (a, b);
4929*80a68eefSBob Badour     mask1 = _mm_slli_epi32(sub, 17); //shift left then back right to
4930*80a68eefSBob Badour     mask1 = _mm_srli_epi32(mask1,31); //get  15-th bit 1 or zero
4931*80a68eefSBob Badour     sub = _mm_srai_epi32 (sub, 16); //get high half
4932*80a68eefSBob Badour     sub = _mm_add_epi32 (sub, mask1); //actual rounding
4933*80a68eefSBob Badour     sub = _mm_packs_epi32 (sub, sub);
4934*80a68eefSBob Badour     return64(sub);
4935*80a68eefSBob Badour }
4936*80a68eefSBob Badour 
4937*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vrsubhn_s64(int64x2_t a, int64x2_t b); // VRSUBHN.I64 d0,q0,q0
4938*80a68eefSBob Badour _NEON2SSE_INLINE int32x2_t vrsubhn_s64(int64x2_t a, int64x2_t b)
4939*80a68eefSBob Badour {
4940*80a68eefSBob Badour     //SIMD may not be optimal; a serial version may be faster
4941*80a68eefSBob Badour     int32x2_t res64;
4942*80a68eefSBob Badour     __m128i sub, mask1;
4943*80a68eefSBob Badour     sub = _mm_sub_epi64 (a, b);
4944*80a68eefSBob Badour     mask1 = _mm_slli_epi64(sub, 33); //shift left then back right to
4945*80a68eefSBob Badour     mask1 = _mm_srli_epi64(mask1,32); //get  31-th bit 1 or zero
4946*80a68eefSBob Badour     sub = _mm_add_epi64 (sub, mask1); //actual high half rounding
4947*80a68eefSBob Badour     sub = _mm_shuffle_epi32(sub,  1 | (3 << 2) | (0 << 4) | (2 << 6));
4948*80a68eefSBob Badour     return64(sub);
4949*80a68eefSBob Badour }
4950*80a68eefSBob Badour 
4951*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vrsubhn_u16(uint16x8_t a, uint16x8_t b); // VRSUBHN.I16 d0,q0,q0
4952*80a68eefSBob Badour _NEON2SSE_INLINE uint8x8_t vrsubhn_u16(uint16x8_t a, uint16x8_t b) // VRSUBHN.I16 d0,q0,q0
4953*80a68eefSBob Badour {
4954*80a68eefSBob Badour     uint8x8_t res64;
4955*80a68eefSBob Badour     __m128i sub, mask1;
4956*80a68eefSBob Badour     sub = _mm_sub_epi16 (a, b);
4957*80a68eefSBob Badour     mask1 = _mm_slli_epi16(sub, 9); //shift left then back right to
4958*80a68eefSBob Badour     mask1 = _mm_srli_epi16(mask1, 15); //get  7-th bit 1 or zero
4959*80a68eefSBob Badour     sub = _mm_srai_epi16 (sub, 8); //get high half
4960*80a68eefSBob Badour     sub = _mm_add_epi16 (sub, mask1); //actual rounding
4961*80a68eefSBob Badour     sub = _mm_packus_epi16 (sub, sub);
4962*80a68eefSBob Badour     return64(sub);
4963*80a68eefSBob Badour }
4964*80a68eefSBob Badour 
4965*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vrsubhn_u32(uint32x4_t a, uint32x4_t b); // VRSUBHN.I32 d0,q0,q0
4966*80a68eefSBob Badour _NEON2SSE_INLINE uint16x4_t vrsubhn_u32(uint32x4_t a, uint32x4_t b) // VRSUBHN.I32 d0,q0,q0
4967*80a68eefSBob Badour {
4968*80a68eefSBob Badour     //SIMD may not be optimal; a serial version may be faster
4969*80a68eefSBob Badour     uint16x4_t res64;
4970*80a68eefSBob Badour     __m128i sub, mask1;
4971*80a68eefSBob Badour     sub = _mm_sub_epi32 (a, b);
4972*80a68eefSBob Badour     mask1 = _mm_slli_epi32(sub, 17); //shift left then back right to
4973*80a68eefSBob Badour     mask1 = _mm_srli_epi32(mask1,31); //get  15-th bit 1 or zero
4974*80a68eefSBob Badour     sub = _mm_srai_epi32 (sub, 16); //get high half
4975*80a68eefSBob Badour     sub = _mm_add_epi32 (sub, mask1); //actual rounding
4976*80a68eefSBob Badour #ifdef USE_SSE4
4977*80a68eefSBob Badour     sub =  _MM_PACKUS1_EPI32 (sub);
4978*80a68eefSBob Badour #else
4979*80a68eefSBob Badour     sub = _mm_shuffle_epi8 (sub, *(__m128i*) mask8_32_even_odd); //go to 16 bits
4980*80a68eefSBob Badour #endif
4981*80a68eefSBob Badour     return64(sub);
4982*80a68eefSBob Badour }
4983*80a68eefSBob Badour 
4984*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vrsubhn_u64(uint64x2_t a, uint64x2_t b); // VRSUBHN.I64 d0,q0,q0
4985*80a68eefSBob Badour #define vrsubhn_u64 vrsubhn_s64
4986*80a68eefSBob Badour 
4987*80a68eefSBob Badour //*********** Vector saturating doubling multiply subtract long ********************
4988*80a68eefSBob Badour //************************************************************************************
4989*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vqdmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VQDMLSL.S16 q0,d0,d0
4990*80a68eefSBob Badour _NEON2SSE_INLINE int32x4_t vqdmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c)
4991*80a68eefSBob Badour {
4992*80a68eefSBob Badour     //not an optimal SIMD solution; a serial version may be faster
4993*80a68eefSBob Badour     __m128i res32, mask;
4994*80a68eefSBob Badour     int32x4_t res;
4995*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 static const uint32_t cmask[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
4996*80a68eefSBob Badour     res = vmull_s16(b,  c);
4997*80a68eefSBob Badour     res32 = _mm_slli_epi32 (res, 1); //double the result, saturation not considered
4998*80a68eefSBob Badour     mask = _mm_cmpeq_epi32 (res32, *(__m128i*)cmask);
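    //doubling can overflow only when the product is 0x40000000 (b == c == -32768): the doubled value is then 0x80000000,
    //and XOR-ing it with the all-ones compare mask flips it to the saturated 0x7fffffff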
4999*80a68eefSBob Badour     res32 = _mm_xor_si128 (res32,  mask); //res32 saturated for 0x80000000
5000*80a68eefSBob Badour     return vqsubq_s32(a, res32); //saturation
5001*80a68eefSBob Badour }
5002*80a68eefSBob Badour 
5003*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vqdmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VQDMLSL.S32 q0,d0,d0
5004*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c), _NEON2SSE_REASON_SLOW_SERIAL)
5005*80a68eefSBob Badour {
5006*80a68eefSBob Badour     __m128i res64, mask;
5007*80a68eefSBob Badour     int64x2_t res;
5008*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 static const uint64_t cmask[] = {0x8000000000000000, 0x8000000000000000};
5009*80a68eefSBob Badour     res = vmull_s32(b,  c);
5010*80a68eefSBob Badour     res64 = _mm_slli_epi64 (res, 1); //double the result, saturation not considered
5011*80a68eefSBob Badour     mask = _MM_CMPEQ_EPI64 (res64, *(__m128i*)cmask);
5012*80a68eefSBob Badour     res64 = _mm_xor_si128 (res64,  mask); //res64 saturated for 0x8000000000000000
5013*80a68eefSBob Badour     return vqsubq_s64(a, res64); //saturation
5014*80a68eefSBob Badour }
5015*80a68eefSBob Badour 
5016*80a68eefSBob Badour //******************  COMPARISON ***************************************
5017*80a68eefSBob Badour //******************* Vector compare equal *************************************
5018*80a68eefSBob Badour //****************************************************************************
5019*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vceq_s8(int8x8_t a, int8x8_t b); // VCEQ.I8 d0, d0, d0
5020*80a68eefSBob Badour _NEON2SSE_INLINE int8x8_t vceq_s8(int8x8_t a, int8x8_t b)
5021*80a68eefSBob Badour {
5022*80a68eefSBob Badour     int8x8_t res64;
5023*80a68eefSBob Badour     return64(_mm_cmpeq_epi8(_pM128i(a),_pM128i(b)));
5024*80a68eefSBob Badour }
5025*80a68eefSBob Badour 
5026*80a68eefSBob Badour 
5027*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vceq_s16(int16x4_t a, int16x4_t b); // VCEQ.I16 d0, d0, d0
5028*80a68eefSBob Badour _NEON2SSE_INLINE int16x4_t vceq_s16(int16x4_t a, int16x4_t b)
5029*80a68eefSBob Badour {
5030*80a68eefSBob Badour     int16x4_t res64;
5031*80a68eefSBob Badour     return64(_mm_cmpeq_epi16(_pM128i(a),_pM128i(b)));
5032*80a68eefSBob Badour }
5033*80a68eefSBob Badour 
5034*80a68eefSBob Badour 
5035*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vceq_s32(int32x2_t a, int32x2_t b); // VCEQ.I32 d0, d0, d0
5036*80a68eefSBob Badour _NEON2SSE_INLINE int32x2_t vceq_s32(int32x2_t a, int32x2_t b)
5037*80a68eefSBob Badour {
5038*80a68eefSBob Badour     int32x2_t res64;
5039*80a68eefSBob Badour     return64(_mm_cmpeq_epi32(_pM128i(a),_pM128i(b)));
5040*80a68eefSBob Badour }
5041*80a68eefSBob Badour 
5042*80a68eefSBob Badour 
5043*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vceq_f32(float32x2_t a, float32x2_t b); // VCEQ.F32 d0, d0, d0
5044*80a68eefSBob Badour _NEON2SSE_INLINE uint32x2_t vceq_f32(float32x2_t a, float32x2_t b)
5045*80a68eefSBob Badour {
5046*80a68eefSBob Badour     uint32x2_t res64;
5047*80a68eefSBob Badour     __m128 res;
5048*80a68eefSBob Badour     res = _mm_cmpeq_ps(_pM128(a), _pM128(b) );
5049*80a68eefSBob Badour     return64f(res);
5050*80a68eefSBob Badour }
5051*80a68eefSBob Badour 
5052*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vceq_u8(uint8x8_t a, uint8x8_t b); // VCEQ.I8 d0, d0, d0
5053*80a68eefSBob Badour _NEON2SSE_INLINE uint8x8_t vceq_u8(uint8x8_t a, uint8x8_t b)
5054*80a68eefSBob Badour {
5055*80a68eefSBob Badour     uint8x8_t res64;
5056*80a68eefSBob Badour     return64(_mm_cmpeq_epi8(_pM128i(a),_pM128i(b)));
5057*80a68eefSBob Badour }
5058*80a68eefSBob Badour 
5059*80a68eefSBob Badour 
5060*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vceq_u16(uint16x4_t a, uint16x4_t b); // VCEQ.I16 d0, d0, d0
5061*80a68eefSBob Badour _NEON2SSE_INLINE uint16x4_t vceq_u16(uint16x4_t a, uint16x4_t b)
5062*80a68eefSBob Badour {
5063*80a68eefSBob Badour     uint16x4_t res64;
5064*80a68eefSBob Badour     return64(_mm_cmpeq_epi16(_pM128i(a),_pM128i(b)));
5065*80a68eefSBob Badour }
5066*80a68eefSBob Badour 
5067*80a68eefSBob Badour 
5068*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vceq_u32(uint32x2_t a, uint32x2_t b); // VCEQ.I32 d0, d0, d0
5069*80a68eefSBob Badour _NEON2SSE_INLINE uint32x2_t vceq_u32(uint32x2_t a, uint32x2_t b)
5070*80a68eefSBob Badour {
5071*80a68eefSBob Badour     uint32x2_t res64;
5072*80a68eefSBob Badour     return64(_mm_cmpeq_epi32(_pM128i(a),_pM128i(b)));
5073*80a68eefSBob Badour }
5074*80a68eefSBob Badour 
5075*80a68eefSBob Badour 
5076*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t   vceq_p8(poly8x8_t a, poly8x8_t b); // VCEQ.I8 d0, d0, d0
5077*80a68eefSBob Badour #define vceq_p8 vceq_u8
5078*80a68eefSBob Badour 
5079*80a68eefSBob Badour 
5080*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t   vceqq_s8(int8x16_t a, int8x16_t b); // VCEQ.I8 q0, q0, q0
5081*80a68eefSBob Badour #define vceqq_s8 _mm_cmpeq_epi8
5082*80a68eefSBob Badour 
5083*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t   vceqq_s16(int16x8_t a, int16x8_t b); // VCEQ.I16 q0, q0, q0
5084*80a68eefSBob Badour #define vceqq_s16 _mm_cmpeq_epi16
5085*80a68eefSBob Badour 
5086*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t   vceqq_s32(int32x4_t a, int32x4_t b); // VCEQ.I32 q0, q0, q0
5087*80a68eefSBob Badour #define vceqq_s32 _mm_cmpeq_epi32
5088*80a68eefSBob Badour 
5089*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vceqq_f32(float32x4_t a, float32x4_t b); // VCEQ.F32 q0, q0, q0
5090*80a68eefSBob Badour _NEON2SSE_INLINE uint32x4_t vceqq_f32(float32x4_t a, float32x4_t b)
5091*80a68eefSBob Badour {
5092*80a68eefSBob Badour     __m128 res;
5093*80a68eefSBob Badour     res = _mm_cmpeq_ps(a,b);
5094*80a68eefSBob Badour     return _M128i(res);
5095*80a68eefSBob Badour }
5096*80a68eefSBob Badour 
5097*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t   vceqq_u8(uint8x16_t a, uint8x16_t b); // VCEQ.I8 q0, q0, q0
5098*80a68eefSBob Badour #define vceqq_u8 _mm_cmpeq_epi8
5099*80a68eefSBob Badour 
5100*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t   vceqq_u16(uint16x8_t a, uint16x8_t b); // VCEQ.I16 q0, q0, q0
5101*80a68eefSBob Badour #define vceqq_u16 _mm_cmpeq_epi16
5102*80a68eefSBob Badour 
5103*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t   vceqq_u32(uint32x4_t a, uint32x4_t b); // VCEQ.I32 q0, q0, q0
5104*80a68eefSBob Badour #define vceqq_u32 _mm_cmpeq_epi32
5105*80a68eefSBob Badour 
5106*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t   vceqq_p8(poly8x16_t a, poly8x16_t b); // VCEQ.I8 q0, q0, q0
5107*80a68eefSBob Badour #define vceqq_p8 _mm_cmpeq_epi8
5108*80a68eefSBob Badour 
5109*80a68eefSBob Badour //******************Vector compare greater-than or equal*************************
5110*80a68eefSBob Badour //*******************************************************************************
5111*80a68eefSBob Badour //in IA SIMD no greater-than-or-equal comparison for integers,
5112*80a68eefSBob Badour // there is greater-than available only, so we need the following tricks
5113*80a68eefSBob Badour 
5114*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vcge_s8(int8x8_t a,  int8x8_t b); // VCGE.S8 d0, d0, d0
5115*80a68eefSBob Badour _NEON2SSE_INLINE int8x8_t vcge_s8(int8x8_t a,  int8x8_t b)
5116*80a68eefSBob Badour {
5117*80a68eefSBob Badour     int8x8_t res64;
5118*80a68eefSBob Badour     return64(vcgeq_s8(_pM128i(a), _pM128i(b)));
5119*80a68eefSBob Badour }
5120*80a68eefSBob Badour 
5121*80a68eefSBob Badour 
5122*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vcge_s16(int16x4_t a,  int16x4_t b); // VCGE.S16 d0, d0, d0
5123*80a68eefSBob Badour _NEON2SSE_INLINE int16x4_t vcge_s16(int16x4_t a,  int16x4_t b)
5124*80a68eefSBob Badour {
5125*80a68eefSBob Badour     int16x4_t res64;
5126*80a68eefSBob Badour     return64(vcgeq_s16(_pM128i(a), _pM128i(b)));
5127*80a68eefSBob Badour }
5128*80a68eefSBob Badour 
5129*80a68eefSBob Badour 
5130*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vcge_s32(int32x2_t a,  int32x2_t b); // VCGE.S32 d0, d0, d0
5131*80a68eefSBob Badour _NEON2SSE_INLINE int32x2_t vcge_s32(int32x2_t a,  int32x2_t b)
5132*80a68eefSBob Badour {
5133*80a68eefSBob Badour     int32x2_t res64;
5134*80a68eefSBob Badour     return64(vcgeq_s32(_pM128i(a), _pM128i(b)));
5135*80a68eefSBob Badour }
5136*80a68eefSBob Badour 
5137*80a68eefSBob Badour 
5138*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vcge_f32(float32x2_t a, float32x2_t b); // VCGE.F32 d0, d0, d0
5139*80a68eefSBob Badour _NEON2SSE_INLINE uint32x2_t vcge_f32(float32x2_t a, float32x2_t b)
5140*80a68eefSBob Badour {
5141*80a68eefSBob Badour     uint32x2_t res64;
5142*80a68eefSBob Badour     __m128 res;
5143*80a68eefSBob Badour     res = _mm_cmpge_ps(_pM128(a),_pM128(b)); //use only the first 2 entries
5144*80a68eefSBob Badour     return64f(res);
5145*80a68eefSBob Badour }
5146*80a68eefSBob Badour 
5147*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vcge_u8(uint8x8_t a,  uint8x8_t b); // VCGE.U8 d0, d0, d0
5148*80a68eefSBob Badour _NEON2SSE_INLINE uint8x8_t vcge_u8(uint8x8_t a,  uint8x8_t b)
5149*80a68eefSBob Badour {
5150*80a68eefSBob Badour     uint8x8_t res64;
5151*80a68eefSBob Badour     return64(vcgeq_u8(_pM128i(a), _pM128i(b)));
5152*80a68eefSBob Badour }
5153*80a68eefSBob Badour 
5154*80a68eefSBob Badour 
5155*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vcge_u16(uint16x4_t a,  uint16x4_t b); // VCGE.s16 d0, d0, d0
5156*80a68eefSBob Badour _NEON2SSE_INLINE uint16x4_t vcge_u16(uint16x4_t a,  uint16x4_t b)
5157*80a68eefSBob Badour {
5158*80a68eefSBob Badour     uint16x4_t res64;
5159*80a68eefSBob Badour     return64(vcgeq_u16(_pM128i(a), _pM128i(b)));
5160*80a68eefSBob Badour }
5161*80a68eefSBob Badour 
5162*80a68eefSBob Badour 
5163*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vcge_u32(uint32x2_t a,  uint32x2_t b); // VCGE.U32 d0, d0, d0
5164*80a68eefSBob Badour _NEON2SSE_INLINE uint32x2_t vcge_u32(uint32x2_t a,  uint32x2_t b)
5165*80a68eefSBob Badour {
5166*80a68eefSBob Badour     //serial solution looks faster
5167*80a68eefSBob Badour     uint32x2_t res64;
5168*80a68eefSBob Badour     return64(vcgeq_u32 (_pM128i(a), _pM128i(b)));
5169*80a68eefSBob Badour }
5170*80a68eefSBob Badour 
5171*80a68eefSBob Badour 
5172*80a68eefSBob Badour 
5173*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vcgeq_s8(int8x16_t a, int8x16_t b); // VCGE.S8 q0, q0, q0
5174*80a68eefSBob Badour _NEON2SSE_INLINE uint8x16_t vcgeq_s8(int8x16_t a, int8x16_t b) // VCGE.S8 q0, q0, q0
5175*80a68eefSBob Badour {
5176*80a68eefSBob Badour     __m128i m1, m2;
5177*80a68eefSBob Badour     m1 = _mm_cmpgt_epi8 ( a, b);
5178*80a68eefSBob Badour     m2 = _mm_cmpeq_epi8 ( a, b);
5179*80a68eefSBob Badour     return _mm_or_si128  ( m1, m2);
5180*80a68eefSBob Badour }
5181*80a68eefSBob Badour 
5182*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vcgeq_s16(int16x8_t a, int16x8_t b); // VCGE.S16 q0, q0, q0
5183*80a68eefSBob Badour _NEON2SSE_INLINE uint16x8_t vcgeq_s16(int16x8_t a, int16x8_t b) // VCGE.S16 q0, q0, q0
5184*80a68eefSBob Badour {
5185*80a68eefSBob Badour     __m128i m1, m2;
5186*80a68eefSBob Badour     m1 = _mm_cmpgt_epi16 ( a, b);
5187*80a68eefSBob Badour     m2 = _mm_cmpeq_epi16 ( a, b);
5188*80a68eefSBob Badour     return _mm_or_si128   ( m1,m2);
5189*80a68eefSBob Badour }
5190*80a68eefSBob Badour 
5191*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vcgeq_s32(int32x4_t a, int32x4_t b); // VCGE.S32 q0, q0, q0
5192*80a68eefSBob Badour _NEON2SSE_INLINE uint32x4_t vcgeq_s32(int32x4_t a, int32x4_t b) // VCGE.S32 q0, q0, q0
5193*80a68eefSBob Badour {
5194*80a68eefSBob Badour     __m128i m1, m2;
5195*80a68eefSBob Badour     m1 = _mm_cmpgt_epi32 (a, b);
5196*80a68eefSBob Badour     m2 = _mm_cmpeq_epi32 (a, b);
5197*80a68eefSBob Badour     return _mm_or_si128   (m1, m2);
5198*80a68eefSBob Badour }
5199*80a68eefSBob Badour 
5200*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vcgeq_f32(float32x4_t a, float32x4_t b); // VCGE.F32 q0, q0, q0
5201*80a68eefSBob Badour _NEON2SSE_INLINE uint32x4_t vcgeq_f32(float32x4_t a, float32x4_t b)
5202*80a68eefSBob Badour {
5203*80a68eefSBob Badour     __m128 res;
5204*80a68eefSBob Badour     res = _mm_cmpge_ps(a,b);
5205*80a68eefSBob Badour     return *(__m128i*)&res;
5206*80a68eefSBob Badour }
5207*80a68eefSBob Badour 
5208*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vcgeq_u8(uint8x16_t a, uint8x16_t b); // VCGE.U8 q0, q0, q0
5209*80a68eefSBob Badour _NEON2SSE_INLINE uint8x16_t vcgeq_u8(uint8x16_t a, uint8x16_t b) // VCGE.U8 q0, q0, q0
5210*80a68eefSBob Badour {
5211*80a68eefSBob Badour     //no unsigned chars comparison, only signed available, so need the trick
5212*80a68eefSBob Badour     __m128i cmp;
5213*80a68eefSBob Badour     cmp = _mm_max_epu8(a, b);
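    //unsigned a >= b holds exactly when max(a, b) == a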
5214*80a68eefSBob Badour     return _mm_cmpeq_epi8(cmp, a); //a>=b
5215*80a68eefSBob Badour }
5216*80a68eefSBob Badour 
5217*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vcgeq_u16(uint16x8_t a, uint16x8_t b); // VCGE.s16 q0, q0, q0
5218*80a68eefSBob Badour _NEON2SSE_INLINE uint16x8_t vcgeq_u16(uint16x8_t a, uint16x8_t b) // VCGE.s16 q0, q0, q0
5219*80a68eefSBob Badour {
5220*80a68eefSBob Badour     //no unsigned shorts comparison, only signed available, so need the trick
5221*80a68eefSBob Badour #ifdef USE_SSE4
5222*80a68eefSBob Badour     __m128i cmp;
5223*80a68eefSBob Badour     cmp = _mm_max_epu16(a, b);
5224*80a68eefSBob Badour     return _mm_cmpeq_epi16(cmp, a); //a>=b
5225*80a68eefSBob Badour #else
5226*80a68eefSBob Badour     __m128i as, mask;
5227*80a68eefSBob Badour     __m128i zero = _mm_setzero_si128();
5229*80a68eefSBob Badour     as = _mm_subs_epu16(b,a); //saturating b - a is zero exactly when a >= b (unsigned)
5230*80a68eefSBob Badour     mask = _mm_cmpeq_epi16(as, zero); //a signed _mm_cmpgt_epi16 against zero would misread differences >= 0x8000
5231*80a68eefSBob Badour     return mask;
5232*80a68eefSBob Badour #endif
5233*80a68eefSBob Badour }
5234*80a68eefSBob Badour 
5235*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vcgeq_u32(uint32x4_t a, uint32x4_t b); // VCGE.U32 q0, q0, q0
5236*80a68eefSBob Badour _NEON2SSE_INLINE uint32x4_t vcgeq_u32(uint32x4_t a, uint32x4_t b) // VCGE.U32 q0, q0, q0
5237*80a68eefSBob Badour {
5238*80a68eefSBob Badour     //no unsigned ints comparison, only signed available, so need the trick
5239*80a68eefSBob Badour #ifdef USE_SSE4
5240*80a68eefSBob Badour     __m128i cmp;
5241*80a68eefSBob Badour     cmp = _mm_max_epu32(a, b);
5242*80a68eefSBob Badour     return _mm_cmpeq_epi32(cmp, a); //a>=b
5243*80a68eefSBob Badour #else
5244*80a68eefSBob Badour     //serial solution may be faster
5245*80a68eefSBob Badour     __m128i c80000000, as, bs, m1, m2;
5246*80a68eefSBob Badour     c80000000 = _mm_set1_epi32 (0x80000000);
5247*80a68eefSBob Badour     as = _mm_sub_epi32(a,c80000000);
5248*80a68eefSBob Badour     bs = _mm_sub_epi32(b,c80000000);
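    //subtracting 0x80000000 from both operands maps unsigned order onto signed order, so the signed compares below are valid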
5249*80a68eefSBob Badour     m1 = _mm_cmpgt_epi32 (as, bs);
5250*80a68eefSBob Badour     m2 = _mm_cmpeq_epi32 (as, bs);
5251*80a68eefSBob Badour     return _mm_or_si128 ( m1,  m2);
5252*80a68eefSBob Badour #endif
5253*80a68eefSBob Badour }
5254*80a68eefSBob Badour 
5255*80a68eefSBob Badour //**********************Vector compare less-than or equal******************************
5256*80a68eefSBob Badour //***************************************************************************************
5257*80a68eefSBob Badour //in IA SIMD no less-than-or-equal comparison for integers present, so we need the tricks
5258*80a68eefSBob Badour 
5259*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vcle_s8(int8x8_t a,  int8x8_t b); // VCGE.S8 d0, d0, d0
5260*80a68eefSBob Badour _NEON2SSE_INLINE int8x8_t vcle_s8(int8x8_t a,  int8x8_t b)
5261*80a68eefSBob Badour {
5262*80a68eefSBob Badour     int8x8_t res64;
5263*80a68eefSBob Badour     return64(vcleq_s8(_pM128i(a), _pM128i(b)));
5264*80a68eefSBob Badour }
5265*80a68eefSBob Badour 
5266*80a68eefSBob Badour 
5267*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vcle_s16(int16x4_t a,  int16x4_t b); // VCGE.S16 d0, d0, d0
5268*80a68eefSBob Badour _NEON2SSE_INLINE int16x4_t vcle_s16(int16x4_t a,  int16x4_t b)
5269*80a68eefSBob Badour {
5270*80a68eefSBob Badour     int16x4_t res64;
5271*80a68eefSBob Badour     return64(vcleq_s16(_pM128i(a), _pM128i(b)));
5272*80a68eefSBob Badour }
5273*80a68eefSBob Badour 
5274*80a68eefSBob Badour 
5275*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vcle_s32(int32x2_t a,  int32x2_t b); // VCGE.S32 d0, d0, d0
5276*80a68eefSBob Badour _NEON2SSE_INLINE int32x2_t vcle_s32(int32x2_t a,  int32x2_t b)
5277*80a68eefSBob Badour {
5278*80a68eefSBob Badour     int32x2_t res64;
5279*80a68eefSBob Badour     return64(vcleq_s32(_pM128i(a), _pM128i(b)));
5280*80a68eefSBob Badour }
5281*80a68eefSBob Badour 
5282*80a68eefSBob Badour 
5283*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vcle_f32(float32x2_t a, float32x2_t b); // VCGE.F32 d0, d0, d0?
5284*80a68eefSBob Badour _NEON2SSE_INLINE uint32x2_t vcle_f32(float32x2_t a, float32x2_t b)
5285*80a68eefSBob Badour {
5286*80a68eefSBob Badour     uint32x2_t res64;
5287*80a68eefSBob Badour     __m128 res;
5288*80a68eefSBob Badour     res = _mm_cmple_ps(_pM128(a),_pM128(b));
5289*80a68eefSBob Badour     return64f(res);
5290*80a68eefSBob Badour }
5291*80a68eefSBob Badour 
5292*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vcle_u8(uint8x8_t a,  uint8x8_t b); // VCGE.U8 d0, d0, d0
5293*80a68eefSBob Badour #define vcle_u8(a,b) vcge_u8(b,a)
5294*80a68eefSBob Badour 
5295*80a68eefSBob Badour 
5296*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vcle_u16(uint16x4_t a,  uint16x4_t b); // VCGE.s16 d0, d0, d0
5297*80a68eefSBob Badour #define vcle_u16(a,b) vcge_u16(b,a)
5298*80a68eefSBob Badour 
5299*80a68eefSBob Badour 
5300*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vcle_u32(uint32x2_t a,  uint32x2_t b); // VCGE.U32 d0, d0, d0
5301*80a68eefSBob Badour #define vcle_u32(a,b) vcge_u32(b,a)
5302*80a68eefSBob Badour 
5303*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vcleq_s8(int8x16_t a, int8x16_t b); // VCGE.S8 q0, q0, q0
5304*80a68eefSBob Badour _NEON2SSE_INLINE uint8x16_t vcleq_s8(int8x16_t a, int8x16_t b) // VCGE.S8 q0, q0, q0
5305*80a68eefSBob Badour {
5306*80a68eefSBob Badour     __m128i c1, res;
5307*80a68eefSBob Badour     c1 = _mm_cmpeq_epi8 (a,a); //all ones 0xff....
5308*80a68eefSBob Badour     res = _mm_cmpgt_epi8 ( a,  b);
5309*80a68eefSBob Badour     return _mm_andnot_si128 (res, c1); //invert the cmpgt result to get less-than-or-equal
5310*80a68eefSBob Badour }
5311*80a68eefSBob Badour 
5312*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vcleq_s16(int16x8_t a, int16x8_t b); // VCGE.S16 q0, q0, q0
5313*80a68eefSBob Badour _NEON2SSE_INLINE uint16x8_t vcleq_s16(int16x8_t a, int16x8_t b) // VCGE.S16 q0, q0, q0
5314*80a68eefSBob Badour {
5315*80a68eefSBob Badour     __m128i c1, res;
5316*80a68eefSBob Badour     c1 = _mm_cmpeq_epi16 (a,a); //all ones 0xff....
5317*80a68eefSBob Badour     res = _mm_cmpgt_epi16 ( a,  b);
5318*80a68eefSBob Badour     return _mm_andnot_si128 (res, c1);
5319*80a68eefSBob Badour }
5320*80a68eefSBob Badour 
5321*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vcleq_s32(int32x4_t a, int32x4_t b); // VCGE.S32 q0, q0, q0
5322*80a68eefSBob Badour _NEON2SSE_INLINE uint32x4_t vcleq_s32(int32x4_t a, int32x4_t b) // VCGE.S32 q0, q0, q0
5323*80a68eefSBob Badour {
5324*80a68eefSBob Badour     __m128i c1, res;
5325*80a68eefSBob Badour     c1 = _mm_cmpeq_epi32 (a,a); //all ones 0xff....
5326*80a68eefSBob Badour     res = _mm_cmpgt_epi32 ( a,  b);
5327*80a68eefSBob Badour     return _mm_andnot_si128 (res, c1);
5328*80a68eefSBob Badour }
5329*80a68eefSBob Badour 
5330*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vcleq_f32(float32x4_t a, float32x4_t b); // VCGE.F32 q0, q0, q0
5331*80a68eefSBob Badour _NEON2SSE_INLINE uint32x4_t vcleq_f32(float32x4_t a, float32x4_t b)
5332*80a68eefSBob Badour {
5333*80a68eefSBob Badour     __m128 res;
5334*80a68eefSBob Badour     res = _mm_cmple_ps(a,b);
5335*80a68eefSBob Badour     return *(__m128i*)&res;
5336*80a68eefSBob Badour }
5337*80a68eefSBob Badour 
5338*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vcleq_u8(uint8x16_t a, uint8x16_t b); // VCGE.U8 q0, q0, q0
5339*80a68eefSBob Badour #ifdef USE_SSE4
5340*80a68eefSBob Badour     _NEON2SSE_INLINE uint8x16_t vcleq_u8(uint8x16_t a, uint8x16_t b) // VCGE.U8 q0, q0, q0
5341*80a68eefSBob Badour     {
5342*80a68eefSBob Badour         //no unsigned chars comparison in SSE, only signed available, so need the trick
5343*80a68eefSBob Badour         __m128i cmp;
5344*80a68eefSBob Badour         cmp = _mm_min_epu8(a, b);
5345*80a68eefSBob Badour         return _mm_cmpeq_epi8(cmp, a); //a<=b
5346*80a68eefSBob Badour     }
5347*80a68eefSBob Badour #else
5348*80a68eefSBob Badour #   define vcleq_u8(a,b) vcgeq_u8(b,a)
5349*80a68eefSBob Badour #endif
5350*80a68eefSBob Badour 
5351*80a68eefSBob Badour 
5352*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vcleq_u16(uint16x8_t a, uint16x8_t b); // VCGE.s16 q0, q0, q0
5353*80a68eefSBob Badour #ifdef USE_SSE4
5354*80a68eefSBob Badour     _NEON2SSE_INLINE uint16x8_t vcleq_u16(uint16x8_t a, uint16x8_t b) // VCGE.s16 q0, q0, q0
5355*80a68eefSBob Badour     {
5356*80a68eefSBob Badour         //no unsigned shorts comparison in SSE, only signed available, so need the trick
5357*80a68eefSBob Badour         __m128i cmp;
5358*80a68eefSBob Badour         cmp = _mm_min_epu16(a, b);
5359*80a68eefSBob Badour         return _mm_cmpeq_epi16(cmp, a); //a<=b
5360*80a68eefSBob Badour     }
5361*80a68eefSBob Badour #else
5362*80a68eefSBob Badour #   define vcleq_u16(a,b) vcgeq_u16(b,a)
5363*80a68eefSBob Badour #endif
5364*80a68eefSBob Badour 
5365*80a68eefSBob Badour 
5366*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vcleq_u32(uint32x4_t a, uint32x4_t b); // VCGE.U32 q0, q0, q0
5367*80a68eefSBob Badour #ifdef USE_SSE4
5368*80a68eefSBob Badour     _NEON2SSE_INLINE uint32x4_t vcleq_u32(uint32x4_t a, uint32x4_t b) // VCGE.U32 q0, q0, q0
5369*80a68eefSBob Badour     {
5370*80a68eefSBob Badour         //no unsigned ints comparison in SSE, only signed available, so need the trick
5371*80a68eefSBob Badour         __m128i cmp;
5372*80a68eefSBob Badour         cmp = _mm_min_epu32(a, b);
5373*80a68eefSBob Badour         return _mm_cmpeq_epi32(cmp, a); //a<=b
5374*80a68eefSBob Badour     }
5375*80a68eefSBob Badour #else
5376*80a68eefSBob Badour //this solution may not be optimal compared with the serial one
5377*80a68eefSBob Badour #   define vcleq_u32(a,b) vcgeq_u32(b,a)
5378*80a68eefSBob Badour #endif
5379*80a68eefSBob Badour 
5380*80a68eefSBob Badour 
5381*80a68eefSBob Badour //****** Vector compare greater-than ******************************************
5382*80a68eefSBob Badour //**************************************************************************
5383*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vcgt_s8(int8x8_t a, int8x8_t b); // VCGT.S8 d0, d0, d0
5384*80a68eefSBob Badour _NEON2SSE_INLINE int8x8_t vcgt_s8(int8x8_t a, int8x8_t b)
5385*80a68eefSBob Badour {
5386*80a68eefSBob Badour     int8x8_t res64;
5387*80a68eefSBob Badour     return64(_mm_cmpgt_epi8(_pM128i(a),_pM128i(b)));
5388*80a68eefSBob Badour }
5389*80a68eefSBob Badour 
5390*80a68eefSBob Badour 
5391*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vcgt_s16(int16x4_t a, int16x4_t b); // VCGT.S16 d0, d0, d0
5392*80a68eefSBob Badour _NEON2SSE_INLINE int16x4_t vcgt_s16(int16x4_t a, int16x4_t b)
5393*80a68eefSBob Badour {
5394*80a68eefSBob Badour     int16x4_t res64;
5395*80a68eefSBob Badour     return64(_mm_cmpgt_epi16(_pM128i(a),_pM128i(b)));
5396*80a68eefSBob Badour }
5397*80a68eefSBob Badour 
5398*80a68eefSBob Badour 
5399*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vcgt_s32(int32x2_t a, int32x2_t b); // VCGT.S32 d0, d0, d0
5400*80a68eefSBob Badour _NEON2SSE_INLINE int32x2_t vcgt_s32(int32x2_t a, int32x2_t b)
5401*80a68eefSBob Badour {
5402*80a68eefSBob Badour     int32x2_t res64;
5403*80a68eefSBob Badour     return64(_mm_cmpgt_epi32(_pM128i(a),_pM128i(b)));
5404*80a68eefSBob Badour }
5405*80a68eefSBob Badour 
5406*80a68eefSBob Badour 
5407*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vcgt_f32(float32x2_t a, float32x2_t b); // VCGT.F32 d0, d0, d0
5408*80a68eefSBob Badour _NEON2SSE_INLINE uint32x2_t vcgt_f32(float32x2_t a, float32x2_t b)
5409*80a68eefSBob Badour {
5410*80a68eefSBob Badour     uint32x2_t res64;
5411*80a68eefSBob Badour     __m128 res;
5412*80a68eefSBob Badour     res = _mm_cmpgt_ps(_pM128(a),_pM128(b)); //only the first 2 entries are used
5413*80a68eefSBob Badour     return64f(res);
5414*80a68eefSBob Badour }
5415*80a68eefSBob Badour 
5416*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vcgt_u8(uint8x8_t a,  uint8x8_t b); // VCGT.U8 d0, d0, d0
5417*80a68eefSBob Badour _NEON2SSE_INLINE uint8x8_t vcgt_u8(uint8x8_t a,  uint8x8_t b)
5418*80a68eefSBob Badour {
5419*80a68eefSBob Badour     uint8x8_t res64;
5420*80a68eefSBob Badour     return64(vcgtq_u8(_pM128i(a), _pM128i(b)));
5421*80a68eefSBob Badour }
5422*80a68eefSBob Badour 
5423*80a68eefSBob Badour 
5424*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vcgt_u16(uint16x4_t a,  uint16x4_t b); // VCGT.s16 d0, d0, d0
5425*80a68eefSBob Badour _NEON2SSE_INLINE uint16x4_t vcgt_u16(uint16x4_t a,  uint16x4_t b)
5426*80a68eefSBob Badour {
5427*80a68eefSBob Badour     uint16x4_t res64;
5428*80a68eefSBob Badour     return64(vcgtq_u16(_pM128i(a), _pM128i(b)));
5429*80a68eefSBob Badour }
5430*80a68eefSBob Badour 
5431*80a68eefSBob Badour 
5432*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vcgt_u32(uint32x2_t a,  uint32x2_t b); // VCGT.U32 d0, d0, d0
5433*80a68eefSBob Badour _NEON2SSE_INLINE uint32x2_t vcgt_u32(uint32x2_t a,  uint32x2_t b)
5434*80a68eefSBob Badour {
5435*80a68eefSBob Badour     uint32x2_t res64;
5436*80a68eefSBob Badour     return64(vcgtq_u32(_pM128i(a), _pM128i(b)));
5437*80a68eefSBob Badour }
5438*80a68eefSBob Badour 
5439*80a68eefSBob Badour 
5440*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t   vcgtq_s8(int8x16_t a, int8x16_t b); // VCGT.S8 q0, q0, q0
5441*80a68eefSBob Badour #define vcgtq_s8 _mm_cmpgt_epi8
5442*80a68eefSBob Badour 
5443*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t   vcgtq_s16(int16x8_t a, int16x8_t b); // VCGT.S16 q0, q0, q0
5444*80a68eefSBob Badour #define vcgtq_s16 _mm_cmpgt_epi16
5445*80a68eefSBob Badour 
5446*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t   vcgtq_s32(int32x4_t a, int32x4_t b); // VCGT.S32 q0, q0, q0
5447*80a68eefSBob Badour #define vcgtq_s32 _mm_cmpgt_epi32
5448*80a68eefSBob Badour 
5449*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vcgtq_f32(float32x4_t a, float32x4_t b); // VCGT.F32 q0, q0, q0
5450*80a68eefSBob Badour _NEON2SSE_INLINE uint32x4_t vcgtq_f32(float32x4_t a, float32x4_t b)
5451*80a68eefSBob Badour {
5452*80a68eefSBob Badour     __m128 res;
5453*80a68eefSBob Badour     res = _mm_cmpgt_ps(a,b); //all 4 entries are used
5454*80a68eefSBob Badour     return *(__m128i*)&res;
5455*80a68eefSBob Badour }
5456*80a68eefSBob Badour 
5457*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vcgtq_u8(uint8x16_t a, uint8x16_t b); // VCGT.U8 q0, q0, q0
5458*80a68eefSBob Badour _NEON2SSE_INLINE uint8x16_t vcgtq_u8(uint8x16_t a, uint8x16_t b) // VCGT.U8 q0, q0, q0
5459*80a68eefSBob Badour {
5460*80a68eefSBob Badour     //no unsigned chars comparison, only signed is available, so we need the trick
5461*80a68eefSBob Badour     __m128i as;
5462*80a68eefSBob Badour     __m128i zero = _mm_setzero_si128();
5463*80a68eefSBob Badour     as = _mm_subs_epu8(a, b);
5464*80a68eefSBob Badour     return _mm_cmpgt_epi8(as, zero);
5465*80a68eefSBob Badour }
5466*80a68eefSBob Badour 
5467*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vcgtq_u16(uint16x8_t a, uint16x8_t b); // VCGT.s16 q0, q0, q0
5468*80a68eefSBob Badour _NEON2SSE_INLINE uint16x8_t vcgtq_u16(uint16x8_t a, uint16x8_t b) // VCGT.s16 q0, q0, q0
5469*80a68eefSBob Badour {
5470*80a68eefSBob Badour     //no unsigned short comparison, only signed is available, so we need the trick
5471*80a68eefSBob Badour     __m128i as;
5472*80a68eefSBob Badour     __m128i zero = _mm_setzero_si128();
5473*80a68eefSBob Badour     as = _mm_subs_epu16(a, b);
5474*80a68eefSBob Badour     return _mm_cmpgt_epi16(as, zero);
5475*80a68eefSBob Badour }
5476*80a68eefSBob Badour 
5477*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vcgtq_u32(uint32x4_t a, uint32x4_t b); // VCGT.U32 q0, q0, q0
5478*80a68eefSBob Badour _NEON2SSE_INLINE uint32x4_t vcgtq_u32(uint32x4_t a, uint32x4_t b) // VCGT.U32 q0, q0, q0
5479*80a68eefSBob Badour {
5480*80a68eefSBob Badour     //no unsigned int comparison, only signed is available, so we need the trick
5481*80a68eefSBob Badour     __m128i c80000000, as, bs;
5482*80a68eefSBob Badour     c80000000 = _mm_set1_epi32 (0x80000000);
5483*80a68eefSBob Badour     as = _mm_sub_epi32(a,c80000000);
5484*80a68eefSBob Badour     bs = _mm_sub_epi32(b,c80000000);
5485*80a68eefSBob Badour     return _mm_cmpgt_epi32 ( as, bs);
5486*80a68eefSBob Badour }
5487*80a68eefSBob Badour 
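//Note on vcgtq_u32 above: subtracting 0x80000000 from both operands (equivalently, flipping the
//top bit) maps the unsigned range monotonically onto the signed range, so a signed compare of the
//biased values gives the unsigned ordering. A scalar sketch of the same idea, for illustration only:
#if 0
#include <stdint.h>
static int unsigned_gt_via_bias_sketch(uint32_t a, uint32_t b)
{
    int32_t as = (int32_t)(a ^ 0x80000000u); //same as subtracting 0x80000000 modulo 2^32
    int32_t bs = (int32_t)(b ^ 0x80000000u);
    return as > bs; //equals (a > b) for the original unsigned values
}
#endif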
5488*80a68eefSBob Badour //********************* Vector compare less-than **************************
5489*80a68eefSBob Badour //*************************************************************************
5490*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t   vclt_s8(int8x8_t a, int8x8_t b); // VCGT.S8 d0, d0, d0
5491*80a68eefSBob Badour #define vclt_s8(a,b) vcgt_s8(b,a) //swap the arguments!!
5492*80a68eefSBob Badour 
5493*80a68eefSBob Badour 
5494*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t   vclt_s16(int16x4_t a, int16x4_t b); // VCGT.S16 d0, d0, d0
5495*80a68eefSBob Badour #define vclt_s16(a,b) vcgt_s16(b,a) //swap the arguments!!
5496*80a68eefSBob Badour 
5497*80a68eefSBob Badour 
5498*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t   vclt_s32(int32x2_t a, int32x2_t b); // VCGT.S32 d0, d0, d0
5499*80a68eefSBob Badour #define vclt_s32(a,b)  vcgt_s32(b,a) //swap the arguments!!
5500*80a68eefSBob Badour 
5501*80a68eefSBob Badour 
5502*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vclt_f32(float32x2_t a, float32x2_t b); // VCGT.F32 d0, d0, d0
5503*80a68eefSBob Badour #define vclt_f32(a,b) vcgt_f32(b, a) //swap the arguments!!
5504*80a68eefSBob Badour 
5505*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vclt_u8(uint8x8_t a, uint8x8_t b); // VCGT.U8 d0, d0, d0
5506*80a68eefSBob Badour #define vclt_u8(a,b) vcgt_u8(b,a) //swap the arguments!!
5507*80a68eefSBob Badour 
5508*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vclt_u16(uint16x4_t a, uint16x4_t b); // VCGT.s16 d0, d0, d0
5509*80a68eefSBob Badour #define vclt_u16(a,b) vcgt_u16(b,a) //swap the arguments!!
5510*80a68eefSBob Badour 
5511*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vclt_u32(uint32x2_t a, uint32x2_t b); // VCGT.U32 d0, d0, d0
5512*80a68eefSBob Badour #define vclt_u32(a,b) vcgt_u32(b,a) //swap the arguments!!
5513*80a68eefSBob Badour 
5514*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t   vcltq_s8(int8x16_t a, int8x16_t b); // VCGT.S8 q0, q0, q0
5515*80a68eefSBob Badour #define vcltq_s8(a,b) vcgtq_s8(b, a) //swap the arguments!!
5516*80a68eefSBob Badour 
5517*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t   vcltq_s16(int16x8_t a, int16x8_t b); // VCGT.S16 q0, q0, q0
5518*80a68eefSBob Badour #define vcltq_s16(a,b) vcgtq_s16(b, a) //swap the arguments!!
5519*80a68eefSBob Badour 
5520*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t   vcltq_s32(int32x4_t a, int32x4_t b); // VCGT.S32 q0, q0, q0
5521*80a68eefSBob Badour #define vcltq_s32(a,b) vcgtq_s32(b, a) //swap the arguments!!
5522*80a68eefSBob Badour 
5523*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vcltq_f32(float32x4_t a, float32x4_t b); // VCGT.F32 q0, q0, q0
5524*80a68eefSBob Badour #define vcltq_f32(a,b) vcgtq_f32(b, a) //swap the arguments!!
5525*80a68eefSBob Badour 
5526*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vcltq_u8(uint8x16_t a, uint8x16_t b); // VCGT.U8 q0, q0, q0
5527*80a68eefSBob Badour #define vcltq_u8(a,b) vcgtq_u8(b, a) //swap the arguments!!
5528*80a68eefSBob Badour 
5529*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vcltq_u16(uint16x8_t a, uint16x8_t b); // VCGT.s16 q0, q0, q0
5530*80a68eefSBob Badour #define vcltq_u16(a,b) vcgtq_u16(b, a) //swap the arguments!!
5531*80a68eefSBob Badour 
5532*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vcltq_u32(uint32x4_t a, uint32x4_t b); // VCGT.U32 q0, q0, q0
5533*80a68eefSBob Badour #define vcltq_u32(a,b) vcgtq_u32(b, a) //swap the arguments!!
5534*80a68eefSBob Badour 
5535*80a68eefSBob Badour //*****************Vector compare absolute greater-than or equal ************
5536*80a68eefSBob Badour //***************************************************************************
5537*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vcage_f32(float32x2_t a, float32x2_t b); // VACGE.F32 d0, d0, d0
5538*80a68eefSBob Badour _NEON2SSE_INLINE uint32x2_t vcage_f32(float32x2_t a, float32x2_t b)
5539*80a68eefSBob Badour {
5540*80a68eefSBob Badour     uint32x2_t res64;
5541*80a68eefSBob Badour     __m128i c7fffffff;
5542*80a68eefSBob Badour     __m128 a0, b0;
5543*80a68eefSBob Badour     c7fffffff = _mm_set1_epi32 (0x7fffffff);
5544*80a68eefSBob Badour     a0 = _mm_and_ps (_pM128(a), *(__m128*)&c7fffffff);
5545*80a68eefSBob Badour     b0 = _mm_and_ps (_pM128(b), *(__m128*)&c7fffffff);
5546*80a68eefSBob Badour     a0 = _mm_cmpge_ps ( a0, b0);
5547*80a68eefSBob Badour     return64f(a0);
5548*80a68eefSBob Badour }
5549*80a68eefSBob Badour 
5550*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vcageq_f32(float32x4_t a, float32x4_t b); // VACGE.F32 q0, q0, q0
5551*80a68eefSBob Badour _NEON2SSE_INLINE uint32x4_t vcageq_f32(float32x4_t a, float32x4_t b) // VACGE.F32 q0, q0, q0
5552*80a68eefSBob Badour {
5553*80a68eefSBob Badour     __m128i c7fffffff;
5554*80a68eefSBob Badour     __m128 a0, b0;
5555*80a68eefSBob Badour     c7fffffff = _mm_set1_epi32 (0x7fffffff);
5556*80a68eefSBob Badour     a0 = _mm_and_ps (a, *(__m128*)&c7fffffff);
5557*80a68eefSBob Badour     b0 = _mm_and_ps (b, *(__m128*)&c7fffffff);
5558*80a68eefSBob Badour     a0 = _mm_cmpge_ps ( a0, b0);
5559*80a68eefSBob Badour     return (*(__m128i*)&a0);
5560*80a68eefSBob Badour }
5561*80a68eefSBob Badour 
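//Note on the absolute compares: |x| is formed by AND-ing the float bits with 0x7fffffff, which
//just clears the IEEE-754 sign bit, and the ordinary compare is then reused. A minimal usage
//sketch, kept out of compilation and assuming the vdupq_n_f32 intrinsic defined elsewhere in
//this header:
#if 0
static void vcageq_f32_sketch(void)
{
    float32x4_t a = vdupq_n_f32(-3.0f);
    float32x4_t b = vdupq_n_f32( 2.0f);
    uint32x4_t  m = vcageq_f32(a, b); //every lane is 0xFFFFFFFF: |-3.0| >= |2.0|
    (void)m;
}
#endif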
5562*80a68eefSBob Badour //********Vector compare absolute less-than or equal ******************
5563*80a68eefSBob Badour //********************************************************************
5564*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vcale_f32(float32x2_t a, float32x2_t b); // VACGE.F32 d0, d0, d0
5565*80a68eefSBob Badour _NEON2SSE_INLINE uint32x2_t vcale_f32(float32x2_t a, float32x2_t b)
5566*80a68eefSBob Badour {
5567*80a68eefSBob Badour     uint32x2_t res64;
5568*80a68eefSBob Badour     __m128i c7fffffff;
5569*80a68eefSBob Badour     __m128 a0, b0;
5570*80a68eefSBob Badour     c7fffffff = _mm_set1_epi32 (0x7fffffff);
5571*80a68eefSBob Badour     a0 = _mm_and_ps (_pM128(a), *(__m128*)&c7fffffff);
5572*80a68eefSBob Badour     b0 = _mm_and_ps (_pM128(b), *(__m128*)&c7fffffff);
5573*80a68eefSBob Badour     a0 = _mm_cmple_ps (a0, b0);
5574*80a68eefSBob Badour     return64f(a0);
5575*80a68eefSBob Badour }
5576*80a68eefSBob Badour 
5577*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vcaleq_f32(float32x4_t a, float32x4_t b); // VACGE.F32 q0, q0, q0
5578*80a68eefSBob Badour _NEON2SSE_INLINE uint32x4_t vcaleq_f32(float32x4_t a, float32x4_t b) // VACGE.F32 q0, q0, q0
5579*80a68eefSBob Badour {
5580*80a68eefSBob Badour     __m128i c7fffffff;
5581*80a68eefSBob Badour     __m128 a0, b0;
5582*80a68eefSBob Badour     c7fffffff = _mm_set1_epi32 (0x7fffffff);
5583*80a68eefSBob Badour     a0 = _mm_and_ps (a, *(__m128*)&c7fffffff);
5584*80a68eefSBob Badour     b0 = _mm_and_ps (b, *(__m128*)&c7fffffff);
5585*80a68eefSBob Badour     a0 = _mm_cmple_ps (a0, b0);
5586*80a68eefSBob Badour     return (*(__m128i*)&a0);
5587*80a68eefSBob Badour }
5588*80a68eefSBob Badour 
5589*80a68eefSBob Badour //********  Vector compare absolute greater-than    ******************
5590*80a68eefSBob Badour //******************************************************************
5591*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vcagt_f32(float32x2_t a, float32x2_t b); // VACGT.F32 d0, d0, d0
5592*80a68eefSBob Badour _NEON2SSE_INLINE uint32x2_t vcagt_f32(float32x2_t a, float32x2_t b)
5593*80a68eefSBob Badour {
5594*80a68eefSBob Badour     uint32x2_t res64;
5595*80a68eefSBob Badour     __m128i c7fffffff;
5596*80a68eefSBob Badour     __m128 a0, b0;
5597*80a68eefSBob Badour     c7fffffff = _mm_set1_epi32 (0x7fffffff);
5598*80a68eefSBob Badour     a0 = _mm_and_ps (_pM128(a), *(__m128*)&c7fffffff);
5599*80a68eefSBob Badour     b0 = _mm_and_ps (_pM128(b), *(__m128*)&c7fffffff);
5600*80a68eefSBob Badour     a0 = _mm_cmpgt_ps (a0, b0);
5601*80a68eefSBob Badour     return64f(a0);
5602*80a68eefSBob Badour }
5603*80a68eefSBob Badour 
5604*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vcagtq_f32(float32x4_t a, float32x4_t b); // VACGT.F32 q0, q0, q0
5605*80a68eefSBob Badour _NEON2SSE_INLINE uint32x4_t vcagtq_f32(float32x4_t a, float32x4_t b) // VACGT.F32 q0, q0, q0
5606*80a68eefSBob Badour {
5607*80a68eefSBob Badour     __m128i c7fffffff;
5608*80a68eefSBob Badour     __m128 a0, b0;
5609*80a68eefSBob Badour     c7fffffff = _mm_set1_epi32 (0x7fffffff);
5610*80a68eefSBob Badour     a0 = _mm_and_ps (a, *(__m128*)&c7fffffff);
5611*80a68eefSBob Badour     b0 = _mm_and_ps (b, *(__m128*)&c7fffffff);
5612*80a68eefSBob Badour     a0 = _mm_cmpgt_ps (a0, b0);
5613*80a68eefSBob Badour     return (*(__m128i*)&a0);
5614*80a68eefSBob Badour }
5615*80a68eefSBob Badour 
5616*80a68eefSBob Badour //***************Vector compare absolute less-than  ***********************
5617*80a68eefSBob Badour //*************************************************************************
5618*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vcalt_f32(float32x2_t a, float32x2_t b); // VACGT.F32 d0, d0, d0
5619*80a68eefSBob Badour _NEON2SSE_INLINE uint32x2_t vcalt_f32(float32x2_t a, float32x2_t b)
5620*80a68eefSBob Badour {
5621*80a68eefSBob Badour     uint32x2_t res64;
5622*80a68eefSBob Badour     __m128i c7fffffff;
5623*80a68eefSBob Badour     __m128 a0, b0;
5624*80a68eefSBob Badour     c7fffffff = _mm_set1_epi32 (0x7fffffff);
5625*80a68eefSBob Badour     a0 = _mm_and_ps (_pM128(a), *(__m128*)&c7fffffff);
5626*80a68eefSBob Badour     b0 = _mm_and_ps (_pM128(b), *(__m128*)&c7fffffff);
5627*80a68eefSBob Badour     a0 = _mm_cmplt_ps (a0, b0);
5628*80a68eefSBob Badour     return64f(a0);
5629*80a68eefSBob Badour }
5630*80a68eefSBob Badour 
5631*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vcaltq_f32(float32x4_t a, float32x4_t b); // VACGT.F32 q0, q0, q0
5632*80a68eefSBob Badour _NEON2SSE_INLINE uint32x4_t vcaltq_f32(float32x4_t a, float32x4_t b) // VACGT.F32 q0, q0, q0
5633*80a68eefSBob Badour {
5634*80a68eefSBob Badour     __m128i c7fffffff;
5635*80a68eefSBob Badour     __m128 a0, b0;
5636*80a68eefSBob Badour     c7fffffff = _mm_set1_epi32 (0x7fffffff);
5637*80a68eefSBob Badour     a0 = _mm_and_ps (a, *(__m128*)&c7fffffff);
5638*80a68eefSBob Badour     b0 = _mm_and_ps (b, *(__m128*)&c7fffffff);
5639*80a68eefSBob Badour     a0 = _mm_cmplt_ps (a0, b0);
5640*80a68eefSBob Badour     return (*(__m128i*)&a0);
5641*80a68eefSBob Badour }
5642*80a68eefSBob Badour 
5643*80a68eefSBob Badour //*************************Vector test bits************************************
5644*80a68eefSBob Badour //*****************************************************************************
5645*80a68eefSBob Badour /*VTST (Vector Test Bits) takes each element in a vector, and bitwise logical ANDs them
5646*80a68eefSBob Badour with the corresponding element of a second vector. If the result is not zero, the
5647*80a68eefSBob Badour corresponding element in the destination vector is set to all ones. Otherwise, it is set to
5648*80a68eefSBob Badour all zeros. */
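//A minimal usage sketch of the VTST behaviour described above, kept out of compilation and
//assuming the vdupq_n_u8 intrinsic defined elsewhere in this header:
#if 0
static void vtstq_u8_sketch(void)
{
    uint8x16_t lo   = vdupq_n_u8(0x0F);
    uint8x16_t hi   = vdupq_n_u8(0xF0);
    uint8x16_t none = vtstq_u8(lo, hi); //0x0F & 0xF0 == 0  -> every lane is 0x00
    uint8x16_t all  = vtstq_u8(lo, lo); //0x0F & 0x0F != 0  -> every lane is 0xFF
    (void)none; (void)all;
}
#endif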
5649*80a68eefSBob Badour 
5650*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vtst_s8(int8x8_t a,  int8x8_t b); // VTST.8 d0, d0, d0
5651*80a68eefSBob Badour _NEON2SSE_INLINE uint8x8_t vtst_s8(int8x8_t a,  int8x8_t b)
5652*80a68eefSBob Badour {
5653*80a68eefSBob Badour     int8x8_t res64;
5654*80a68eefSBob Badour     return64(vtstq_s8(_pM128i(a), _pM128i(b)));
5655*80a68eefSBob Badour }
5656*80a68eefSBob Badour 
5657*80a68eefSBob Badour 
5658*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vtst_s16(int16x4_t a,  int16x4_t b); // VTST.16 d0, d0, d0
5659*80a68eefSBob Badour _NEON2SSE_INLINE uint16x4_t vtst_s16(int16x4_t a,  int16x4_t b)
5660*80a68eefSBob Badour {
5661*80a68eefSBob Badour     int16x4_t res64;
5662*80a68eefSBob Badour     return64(vtstq_s16(_pM128i(a), _pM128i(b)));
5663*80a68eefSBob Badour }
5664*80a68eefSBob Badour 
5665*80a68eefSBob Badour 
5666*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vtst_s32(int32x2_t a,  int32x2_t b); // VTST.32 d0, d0, d0
5667*80a68eefSBob Badour _NEON2SSE_INLINE uint32x2_t vtst_s32(int32x2_t a,  int32x2_t b)
5668*80a68eefSBob Badour {
5669*80a68eefSBob Badour     int32x2_t res64;
5670*80a68eefSBob Badour     return64(vtstq_s32(_pM128i(a), _pM128i(b)));
5671*80a68eefSBob Badour }
5672*80a68eefSBob Badour 
5673*80a68eefSBob Badour 
5674*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vtst_u8(uint8x8_t a,  uint8x8_t b); // VTST.8 d0, d0, d0
5675*80a68eefSBob Badour #define vtst_u8 vtst_s8
5676*80a68eefSBob Badour 
5677*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vtst_u16(uint16x4_t a,  uint16x4_t b); // VTST.16 d0, d0, d0
5678*80a68eefSBob Badour #define vtst_u16 vtst_s16
5679*80a68eefSBob Badour 
5680*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vtst_u32(uint32x2_t a,  uint32x2_t b); // VTST.32 d0, d0, d0
5681*80a68eefSBob Badour #define vtst_u32 vtst_s32
5682*80a68eefSBob Badour 
5683*80a68eefSBob Badour 
5684*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vtst_p8(poly8x8_t a, poly8x8_t b); // VTST.8 d0, d0, d0
5685*80a68eefSBob Badour #define vtst_p8 vtst_u8
5686*80a68eefSBob Badour 
5687*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vtstq_s8(int8x16_t a, int8x16_t b); // VTST.8 q0, q0, q0
5688*80a68eefSBob Badour _NEON2SSE_INLINE uint8x16_t vtstq_s8(int8x16_t a, int8x16_t b) // VTST.8 q0, q0, q0
5689*80a68eefSBob Badour {
5690*80a68eefSBob Badour     __m128i zero, one, res;
5691*80a68eefSBob Badour     zero = _mm_setzero_si128 ();
5692*80a68eefSBob Badour     one = _mm_cmpeq_epi8(zero,zero); //0xfff..ffff
5693*80a68eefSBob Badour     res = _mm_and_si128 (a, b);
5694*80a68eefSBob Badour     res =  _mm_cmpeq_epi8 (res, zero);
5695*80a68eefSBob Badour     return _mm_xor_si128(res, one); //invert result
5696*80a68eefSBob Badour }
5697*80a68eefSBob Badour 
5698*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vtstq_s16(int16x8_t a, int16x8_t b); // VTST.16 q0, q0, q0
5699*80a68eefSBob Badour _NEON2SSE_INLINE uint16x8_t vtstq_s16(int16x8_t a, int16x8_t b) // VTST.16 q0, q0, q0
5700*80a68eefSBob Badour {
5701*80a68eefSBob Badour     __m128i zero, one, res;
5702*80a68eefSBob Badour     zero = _mm_setzero_si128 ();
5703*80a68eefSBob Badour     one = _mm_cmpeq_epi8(zero,zero); //0xfff..ffff
5704*80a68eefSBob Badour     res = _mm_and_si128 (a, b);
5705*80a68eefSBob Badour     res =  _mm_cmpeq_epi16 (res, zero);
5706*80a68eefSBob Badour     return _mm_xor_si128(res, one); //invert result
5707*80a68eefSBob Badour }
5708*80a68eefSBob Badour 
5709*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vtstq_s32(int32x4_t a, int32x4_t b); // VTST.32 q0, q0, q0
5710*80a68eefSBob Badour _NEON2SSE_INLINE uint32x4_t vtstq_s32(int32x4_t a, int32x4_t b) // VTST.32 q0, q0, q0
5711*80a68eefSBob Badour {
5712*80a68eefSBob Badour     __m128i zero, one, res;
5713*80a68eefSBob Badour     zero = _mm_setzero_si128 ();
5714*80a68eefSBob Badour     one = _mm_cmpeq_epi8(zero,zero); //0xfff..ffff
5715*80a68eefSBob Badour     res = _mm_and_si128 (a, b);
5716*80a68eefSBob Badour     res =  _mm_cmpeq_epi32 (res, zero);
5717*80a68eefSBob Badour     return _mm_xor_si128(res, one); //invert result
5718*80a68eefSBob Badour }
5719*80a68eefSBob Badour 
5720*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vtstq_u8(uint8x16_t a, uint8x16_t b); // VTST.8 q0, q0, q0
5721*80a68eefSBob Badour #define vtstq_u8 vtstq_s8
5722*80a68eefSBob Badour 
5723*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vtstq_u16(uint16x8_t a, uint16x8_t b); // VTST.16 q0, q0, q0
5724*80a68eefSBob Badour #define vtstq_u16 vtstq_s16
5725*80a68eefSBob Badour 
5726*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vtstq_u32(uint32x4_t a, uint32x4_t b); // VTST.32 q0, q0, q0
5727*80a68eefSBob Badour #define vtstq_u32 vtstq_s32
5728*80a68eefSBob Badour 
5729*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vtstq_p8(poly8x16_t a, poly8x16_t b); // VTST.8 q0, q0, q0
5730*80a68eefSBob Badour #define vtstq_p8 vtstq_u8
5731*80a68eefSBob Badour 
5732*80a68eefSBob Badour //****************** Absolute difference ********************
5733*80a68eefSBob Badour //*** Absolute difference between the arguments: Vr[i] = | Va[i] - Vb[i] |*****
5734*80a68eefSBob Badour //************************************************************
5735*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vabd_s8(int8x8_t a,  int8x8_t b); // VABD.S8 d0,d0,d0
5736*80a68eefSBob Badour _NEON2SSE_INLINE int8x8_t vabd_s8(int8x8_t a,  int8x8_t b)
5737*80a68eefSBob Badour {
5738*80a68eefSBob Badour     int8x8_t res64;
5739*80a68eefSBob Badour     return64(vabdq_s8(_pM128i(a), _pM128i(b)));
5740*80a68eefSBob Badour }
5741*80a68eefSBob Badour 
5742*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vabd_s16(int16x4_t a,  int16x4_t b); // VABD.S16 d0,d0,d0
5743*80a68eefSBob Badour _NEON2SSE_INLINE int16x4_t vabd_s16(int16x4_t a,  int16x4_t b)
5744*80a68eefSBob Badour {
5745*80a68eefSBob Badour     int16x4_t res64;
5746*80a68eefSBob Badour     return64(vabdq_s16(_pM128i(a), _pM128i(b)));
5747*80a68eefSBob Badour }
5748*80a68eefSBob Badour 
5749*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vabd_s32(int32x2_t a,  int32x2_t b); // VABD.S32 d0,d0,d0
5750*80a68eefSBob Badour _NEON2SSE_INLINE int32x2_t vabd_s32(int32x2_t a,  int32x2_t b)
5751*80a68eefSBob Badour {//need to deal with an intermediate overflow
5752*80a68eefSBob Badour     int32x2_t res;
5753*80a68eefSBob Badour     res.m64_i32[0] = (a.m64_i32[0] > b.m64_i32[0]) ? a.m64_i32[0] -  b.m64_i32[0]: b.m64_i32[0] -  a.m64_i32[0];
5754*80a68eefSBob Badour     res.m64_i32[1] = (a.m64_i32[1] > b.m64_i32[1]) ? a.m64_i32[1] -  b.m64_i32[1]: b.m64_i32[1] -  a.m64_i32[1];
5755*80a68eefSBob Badour     return res;
5756*80a68eefSBob Badour }
5757*80a68eefSBob Badour 
5758*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vabd_u8(uint8x8_t a,  uint8x8_t b); // VABD.U8 d0,d0,d0
5759*80a68eefSBob Badour _NEON2SSE_INLINE uint8x8_t vabd_u8(uint8x8_t a,  uint8x8_t b)
5760*80a68eefSBob Badour {
5761*80a68eefSBob Badour     uint8x8_t res64;
5762*80a68eefSBob Badour     return64(vabdq_u8(_pM128i(a), _pM128i(b)));
5763*80a68eefSBob Badour }
5764*80a68eefSBob Badour 
5765*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vabd_u16(uint16x4_t a,  uint16x4_t b); // VABD.s16 d0,d0,d0
5766*80a68eefSBob Badour _NEON2SSE_INLINE uint16x4_t vabd_u16(uint16x4_t a,  uint16x4_t b)
5767*80a68eefSBob Badour {
5768*80a68eefSBob Badour     uint16x4_t res64;
5769*80a68eefSBob Badour     return64(vabdq_u16(_pM128i(a), _pM128i(b)));
5770*80a68eefSBob Badour }
5771*80a68eefSBob Badour 
5772*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vabd_u32(uint32x2_t a,  uint32x2_t b); // VABD.U32 d0,d0,d0
5773*80a68eefSBob Badour _NEON2SSE_INLINE uint32x2_t vabd_u32(uint32x2_t a,  uint32x2_t b)
5774*80a68eefSBob Badour {
5775*80a68eefSBob Badour     uint32x2_t res64;
5776*80a68eefSBob Badour     return64(vabdq_u32(_pM128i(a), _pM128i(b)));
5777*80a68eefSBob Badour }
5778*80a68eefSBob Badour 
5779*80a68eefSBob Badour _NEON2SSESTORAGE float32x2_t vabd_f32(float32x2_t a, float32x2_t b); // VABD.F32 d0,d0,d0
5780*80a68eefSBob Badour _NEON2SSE_INLINE float32x2_t vabd_f32(float32x2_t a, float32x2_t b)
5781*80a68eefSBob Badour {
5782*80a68eefSBob Badour     float32x4_t res;
5783*80a68eefSBob Badour     __m64_128 res64;
5784*80a68eefSBob Badour     res = vabdq_f32(_pM128(a), _pM128(b));
5785*80a68eefSBob Badour     _M64f(res64, res);
5786*80a68eefSBob Badour     return res64;
5787*80a68eefSBob Badour }
5788*80a68eefSBob Badour 
5789*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t vabdq_s8(int8x16_t a, int8x16_t b); // VABD.S8 q0,q0,q0
5790*80a68eefSBob Badour _NEON2SSE_INLINE int8x16_t vabdq_s8(int8x16_t a, int8x16_t b) // VABD.S8 q0,q0,q0
5791*80a68eefSBob Badour { //need to deal with an intermediate overflow
5792*80a68eefSBob Badour    __m128i cmp, difab, difba;
5793*80a68eefSBob Badour    cmp = vcgtq_s8(a,b);
5794*80a68eefSBob Badour    difab = _mm_sub_epi8(a,b);
5795*80a68eefSBob Badour    difba = _mm_sub_epi8(b,a);
5796*80a68eefSBob Badour    difab = _mm_and_si128(cmp, difab);
5797*80a68eefSBob Badour    difba = _mm_andnot_si128(cmp, difba);
5798*80a68eefSBob Badour    return _mm_or_si128(difab, difba);
5799*80a68eefSBob Badour }
5800*80a68eefSBob Badour 
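//Note on vabdq_s8 above (and the s16/s32 variants below): the difference a - b can overflow the
//signed element range, so both a - b and b - a are computed with wraparound and the compare mask
//selects the one equal to |a - b| per lane: (cmp & (a-b)) | (~cmp & (b-a)). A scalar sketch of
//that branchless select for a single 8-bit lane, for illustration only:
#if 0
#include <stdint.h>
static uint8_t absdiff_s8_sketch(int8_t a, int8_t b)
{
    uint8_t mask  = (a > b) ? 0xFFu : 0x00u;  //per-lane result of the greater-than compare
    uint8_t difab = (uint8_t)(a - b);         //wraps modulo 256, like _mm_sub_epi8
    uint8_t difba = (uint8_t)(b - a);
    return (uint8_t)((mask & difab) | ((uint8_t)~mask & difba)); //the non-negative difference
}
#endif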
5801*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vabdq_s16(int16x8_t a, int16x8_t b); // VABD.S16 q0,q0,q0
5802*80a68eefSBob Badour _NEON2SSE_INLINE int16x8_t vabdq_s16(int16x8_t a, int16x8_t b) // VABD.S16 q0,q0,q0
5803*80a68eefSBob Badour {//need to deal with an intermediate overflow
5804*80a68eefSBob Badour     __m128i cmp, difab, difba;
5805*80a68eefSBob Badour     cmp = vcgtq_s16(a,b);
5806*80a68eefSBob Badour     difab = _mm_sub_epi16(a,b);
5807*80a68eefSBob Badour     difba = _mm_sub_epi16 (b,a);
5808*80a68eefSBob Badour     difab = _mm_and_si128(cmp, difab);
5809*80a68eefSBob Badour     difba = _mm_andnot_si128(cmp, difba);
5810*80a68eefSBob Badour     return _mm_or_si128(difab, difba);
5811*80a68eefSBob Badour }
5812*80a68eefSBob Badour 
5813*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vabdq_s32(int32x4_t a, int32x4_t b); // VABD.S32 q0,q0,q0
5814*80a68eefSBob Badour _NEON2SSE_INLINE int32x4_t vabdq_s32(int32x4_t a, int32x4_t b) // VABD.S32 q0,q0,q0
5815*80a68eefSBob Badour {//need to deal with an intermediate overflow
5816*80a68eefSBob Badour     __m128i cmp, difab, difba;
5817*80a68eefSBob Badour     cmp = vcgtq_s32(a,b);
5818*80a68eefSBob Badour     difab = _mm_sub_epi32(a,b);
5819*80a68eefSBob Badour     difba = _mm_sub_epi32(b,a);
5820*80a68eefSBob Badour     difab = _mm_and_si128(cmp, difab);
5821*80a68eefSBob Badour     difba = _mm_andnot_si128(cmp, difba);
5822*80a68eefSBob Badour     return _mm_or_si128(difab, difba);
5823*80a68eefSBob Badour }
5824*80a68eefSBob Badour 
5825*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vabdq_u8(uint8x16_t a, uint8x16_t b); // VABD.U8 q0,q0,q0
5826*80a68eefSBob Badour _NEON2SSE_INLINE uint8x16_t vabdq_u8(uint8x16_t a, uint8x16_t b) //no abs for unsigned
5827*80a68eefSBob Badour {
5828*80a68eefSBob Badour     __m128i  difab, difba;
5829*80a68eefSBob Badour     difab = _mm_subs_epu8(a,b);
5830*80a68eefSBob Badour     difba = _mm_subs_epu8 (b,a);
5831*80a68eefSBob Badour     return _mm_or_si128(difab, difba);
5832*80a68eefSBob Badour }
5833*80a68eefSBob Badour 
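//Note on vabdq_u8 above (and vabdq_u16 below): no compare is needed because _mm_subs_epu8/16
//saturate at zero, so one of the two differences is always zero and OR-ing them yields |a - b|.
//A scalar sketch of the same idea, for illustration only:
#if 0
#include <stdint.h>
static uint8_t absdiff_u8_sketch(uint8_t a, uint8_t b)
{
    uint8_t d_ab = (a > b) ? (uint8_t)(a - b) : 0u; //saturating a - b, like _mm_subs_epu8
    uint8_t d_ba = (b > a) ? (uint8_t)(b - a) : 0u; //saturating b - a
    return (uint8_t)(d_ab | d_ba); //one term is always zero, the other is |a - b|
}
#endif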
5834*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vabdq_u16(uint16x8_t a, uint16x8_t b); // VABD.s16 q0,q0,q0
5835*80a68eefSBob Badour _NEON2SSE_INLINE uint16x8_t vabdq_u16(uint16x8_t a, uint16x8_t b)
5836*80a68eefSBob Badour {
5837*80a68eefSBob Badour     __m128i difab, difba;
5838*80a68eefSBob Badour     difab = _mm_subs_epu16(a,b);
5839*80a68eefSBob Badour     difba = _mm_subs_epu16 (b,a);
5840*80a68eefSBob Badour     return _mm_or_si128(difab, difba);
5841*80a68eefSBob Badour }
5842*80a68eefSBob Badour 
5843*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vabdq_u32(uint32x4_t a, uint32x4_t b); // VABD.U32 q0,q0,q0
5844*80a68eefSBob Badour _NEON2SSE_INLINE uint32x4_t vabdq_u32(uint32x4_t a, uint32x4_t b)
5845*80a68eefSBob Badour {
5846*80a68eefSBob Badour     __m128i cmp, difab, difba;
5847*80a68eefSBob Badour     cmp = vcgtq_u32(a,b);
5848*80a68eefSBob Badour     difab = _mm_sub_epi32(a,b);
5849*80a68eefSBob Badour     difba = _mm_sub_epi32 (b,a);
5850*80a68eefSBob Badour     difab = _mm_and_si128(cmp, difab);
5851*80a68eefSBob Badour     difba = _mm_andnot_si128(cmp, difba);
5852*80a68eefSBob Badour     return _mm_or_si128(difab, difba);
5853*80a68eefSBob Badour }
5854*80a68eefSBob Badour 
5855*80a68eefSBob Badour _NEON2SSESTORAGE float32x4_t vabdq_f32(float32x4_t a, float32x4_t b); // VABD.F32 q0,q0,q0
5856*80a68eefSBob Badour _NEON2SSE_INLINE float32x4_t vabdq_f32(float32x4_t a, float32x4_t b) // VABD.F32 q0,q0,q0
5857*80a68eefSBob Badour {
5858*80a68eefSBob Badour     __m128i c1;
5859*80a68eefSBob Badour     __m128 res;
5860*80a68eefSBob Badour     c1 =  _mm_set1_epi32(0x7fffffff);
5861*80a68eefSBob Badour     res = _mm_sub_ps (a, b);
5862*80a68eefSBob Badour     return _mm_and_ps (res, *(__m128*)&c1);
5863*80a68eefSBob Badour }
5864*80a68eefSBob Badour 
5865*80a68eefSBob Badour //************  Absolute difference - long **************************
5866*80a68eefSBob Badour //********************************************************************
5867*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vabdl_s8(int8x8_t a, int8x8_t b); // VABDL.S8 q0,d0,d0
5868*80a68eefSBob Badour _NEON2SSE_INLINE int16x8_t vabdl_s8(int8x8_t a, int8x8_t b) // VABDL.S8 q0,d0,d0
5869*80a68eefSBob Badour {
5870*80a68eefSBob Badour     __m128i a16, b16;
5871*80a68eefSBob Badour     a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE4.1,
5872*80a68eefSBob Badour     b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1,
5873*80a68eefSBob Badour     return vabdq_s16(a16, b16);
5874*80a68eefSBob Badour 
5875*80a68eefSBob Badour }
5876*80a68eefSBob Badour 
5877*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vabdl_s16(int16x4_t a, int16x4_t b); // VABDL.S16 q0,d0,d0
5878*80a68eefSBob Badour _NEON2SSE_INLINE int32x4_t vabdl_s16(int16x4_t a, int16x4_t b) // VABDL.S16 q0,d0,d0
5879*80a68eefSBob Badour {
5880*80a68eefSBob Badour     __m128i a32, b32;
5881*80a68eefSBob Badour     a32 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE4.1
5882*80a68eefSBob Badour     b32 = _MM_CVTEPI16_EPI32 (_pM128i(b)); //SSE4.1,
5883*80a68eefSBob Badour     return vabdq_s32(a32, b32);
5884*80a68eefSBob Badour }
5885*80a68eefSBob Badour 
5886*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vabdl_s32(int32x2_t a, int32x2_t b); // VABDL.S32 q0,d0,d0
5887*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING (int64x2_t vabdl_s32(int32x2_t a, int32x2_t b),_NEON2SSE_REASON_SLOW_SERIAL)
5888*80a68eefSBob Badour {
5889*80a68eefSBob Badour     //no optimal SIMD solution, serial looks faster
5890*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 int64_t res[2];
5891*80a68eefSBob Badour     if(a.m64_i32[0] > b.m64_i32[0]) res[0] = ( int64_t) a.m64_i32[0] - ( int64_t) b.m64_i32[0];
5892*80a68eefSBob Badour     else res[0] = ( int64_t) b.m64_i32[0] - ( int64_t) a.m64_i32[0];
5893*80a68eefSBob Badour     if(a.m64_i32[1] > b.m64_i32[1]) res[1] = ( int64_t) a.m64_i32[1] - ( int64_t) b.m64_i32[1];
5894*80a68eefSBob Badour     else res[1] = ( int64_t) b.m64_i32[1] - ( int64_t) a.m64_i32[1];
5895*80a68eefSBob Badour     return _mm_load_si128((__m128i*)res);
5896*80a68eefSBob Badour }
5897*80a68eefSBob Badour 
5898*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vabdl_u8(uint8x8_t a, uint8x8_t b); // VABDL.U8 q0,d0,d0
5899*80a68eefSBob Badour _NEON2SSE_INLINE uint16x8_t vabdl_u8(uint8x8_t a, uint8x8_t b)
5900*80a68eefSBob Badour {
5901*80a68eefSBob Badour     __m128i res;
5902*80a68eefSBob Badour     res = vsubl_u8(a,b);
5903*80a68eefSBob Badour     return _mm_abs_epi16(res);
5904*80a68eefSBob Badour }
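//Note on vabdl_u8 above: vsubl_u8 zero-extends both operands to 16 bits before subtracting, so
//each lane of the difference lies in [-255, 255]. That fits comfortably in a signed 16-bit lane,
//which is why _mm_abs_epi16 can be applied without overflow; the same reasoning applies to
//vabdl_u16 with _mm_abs_epi32 below.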
5905*80a68eefSBob Badour 
5906*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vabdl_u16(uint16x4_t a, uint16x4_t b); // VABDL.s16 q0,d0,d0
5907*80a68eefSBob Badour _NEON2SSE_INLINE uint32x4_t vabdl_u16(uint16x4_t a, uint16x4_t b)
5908*80a68eefSBob Badour {
5909*80a68eefSBob Badour     __m128i res;
5910*80a68eefSBob Badour     res = vsubl_u16(a,b);
5911*80a68eefSBob Badour     return _mm_abs_epi32(res);
5912*80a68eefSBob Badour }
5913*80a68eefSBob Badour 
5914*80a68eefSBob Badour _NEON2SSESTORAGE uint64x2_t vabdl_u32(uint32x2_t a, uint32x2_t b); // VABDL.U32 q0,d0,d0
5915*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING (uint64x2_t vabdl_u32(uint32x2_t a, uint32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
5916*80a68eefSBob Badour {
5917*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 uint64_t res[2];
5918*80a68eefSBob Badour     if(a.m64_u32[0] > b.m64_u32[0]) res[0] = ( uint64_t) a.m64_u32[0] - ( uint64_t) b.m64_u32[0];
5919*80a68eefSBob Badour     else res[0] = ( uint64_t) b.m64_u32[0] - ( uint64_t) a.m64_u32[0];
5920*80a68eefSBob Badour     if(a.m64_u32[1] > b.m64_u32[1]) res[1] = ( uint64_t) a.m64_u32[1] - ( uint64_t) b.m64_u32[1];
5921*80a68eefSBob Badour     else res[1] = ( uint64_t) b.m64_u32[1] - ( uint64_t) a.m64_u32[1];
5922*80a68eefSBob Badour     return _mm_load_si128((__m128i*)res);
5923*80a68eefSBob Badour }
5924*80a68eefSBob Badour 
5925*80a68eefSBob Badour //**********Absolute difference and accumulate: Vr[i] = Va[i] + | Vb[i] - Vc[i] | *************
5926*80a68eefSBob Badour //*********************************************************************************************
5927*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vaba_s8(int8x8_t a,  int8x8_t b, int8x8_t c); // VABA.S8 d0,d0,d0
5928*80a68eefSBob Badour _NEON2SSE_INLINE int8x8_t vaba_s8(int8x8_t a,  int8x8_t b, int8x8_t c)
5929*80a68eefSBob Badour {
5930*80a68eefSBob Badour     int8x8_t res64;
5931*80a68eefSBob Badour     return64(vabaq_s8(_pM128i(a),_pM128i(b), _pM128i(c)));
5932*80a68eefSBob Badour }
5933*80a68eefSBob Badour 
5934*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vaba_s16(int16x4_t a,  int16x4_t b, int16x4_t c); // VABA.S16 d0,d0,d0
5935*80a68eefSBob Badour _NEON2SSE_INLINE int16x4_t vaba_s16(int16x4_t a,  int16x4_t b, int16x4_t c)
5936*80a68eefSBob Badour {
5937*80a68eefSBob Badour     int16x4_t res64;
5938*80a68eefSBob Badour     return64(vabaq_s16(_pM128i(a), _pM128i(b), _pM128i(c)));
5939*80a68eefSBob Badour }
5940*80a68eefSBob Badour 
5941*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vaba_s32(int32x2_t a,  int32x2_t b, int32x2_t c); // VABA.S32 d0,d0,d0
5942*80a68eefSBob Badour _NEON2SSE_INLINE int32x2_t vaba_s32(int32x2_t a,  int32x2_t b, int32x2_t c)
5943*80a68eefSBob Badour {
5944*80a68eefSBob Badour     int32x2_t res64;
5945*80a68eefSBob Badour     return64(vabaq_s32(_pM128i(a), _pM128i(b), _pM128i(c)));
5946*80a68eefSBob Badour }
5947*80a68eefSBob Badour 
5948*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vaba_u8(uint8x8_t a,  uint8x8_t b, uint8x8_t c); // VABA.U8 d0,d0,d0
5949*80a68eefSBob Badour _NEON2SSE_INLINE uint8x8_t vaba_u8(uint8x8_t a,  uint8x8_t b, uint8x8_t c)
5950*80a68eefSBob Badour {
5951*80a68eefSBob Badour     int8x8_t res64;
5952*80a68eefSBob Badour     return64(vabaq_u8(_pM128i(a),_pM128i(b), _pM128i(c)));
5953*80a68eefSBob Badour }
5954*80a68eefSBob Badour 
5955*80a68eefSBob Badour 
5956*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vaba_u16(uint16x4_t a,  uint16x4_t b, uint16x4_t c); // VABA.s16 d0,d0,d0
5957*80a68eefSBob Badour _NEON2SSE_INLINE uint16x4_t vaba_u16(uint16x4_t a,  uint16x4_t b, uint16x4_t c)
5958*80a68eefSBob Badour {
5959*80a68eefSBob Badour     int16x4_t res64;
5960*80a68eefSBob Badour     return64(vabaq_u16(_pM128i(a), _pM128i(b), _pM128i(c)));
5961*80a68eefSBob Badour }
5962*80a68eefSBob Badour 
5963*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vaba_u32(uint32x2_t a,  uint32x2_t b, uint32x2_t c); // VABA.U32 d0,d0,d0
5964*80a68eefSBob Badour _NEON2SSE_INLINE uint32x2_t vaba_u32(uint32x2_t a,  uint32x2_t b, uint32x2_t c)
5965*80a68eefSBob Badour {
5966*80a68eefSBob Badour     uint32x2_t res64;
5967*80a68eefSBob Badour     return64(vabaq_u32(_pM128i(a), _pM128i(b), _pM128i(c)));
5968*80a68eefSBob Badour }
5969*80a68eefSBob Badour 
5970*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t vabaq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VABA.S8 q0,q0,q0
5971*80a68eefSBob Badour _NEON2SSE_INLINE int8x16_t vabaq_s8(int8x16_t a, int8x16_t b, int8x16_t c) // VABA.S8 q0,q0,q0
5972*80a68eefSBob Badour {
5973*80a68eefSBob Badour     int8x16_t sub;
5974*80a68eefSBob Badour     sub = vabdq_s8(b, c);
5975*80a68eefSBob Badour     return vaddq_s8( a, sub);
5976*80a68eefSBob Badour }
5977*80a68eefSBob Badour 
5978*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vabaq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VABA.S16 q0,q0,q0
5979*80a68eefSBob Badour _NEON2SSE_INLINE int16x8_t vabaq_s16(int16x8_t a, int16x8_t b, int16x8_t c) // VABA.S16 q0,q0,q0
5980*80a68eefSBob Badour {
5981*80a68eefSBob Badour     int16x8_t sub;
5982*80a68eefSBob Badour     sub = vabdq_s16(b, c);
5983*80a68eefSBob Badour     return vaddq_s16( a, sub);
5984*80a68eefSBob Badour }
5985*80a68eefSBob Badour 
5986*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vabaq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VABA.S32 q0,q0,q0
5987*80a68eefSBob Badour _NEON2SSE_INLINE int32x4_t vabaq_s32(int32x4_t a, int32x4_t b, int32x4_t c) // VABA.S32 q0,q0,q0
5988*80a68eefSBob Badour {
5989*80a68eefSBob Badour     int32x4_t sub;
5990*80a68eefSBob Badour     sub = vabdq_s32(b, c);
5991*80a68eefSBob Badour     return vaddq_s32( a, sub);
5992*80a68eefSBob Badour }
5993*80a68eefSBob Badour 
5994*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vabaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VABA.U8 q0,q0,q0
5995*80a68eefSBob Badour _NEON2SSE_INLINE uint8x16_t vabaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c)
5996*80a68eefSBob Badour {
5997*80a68eefSBob Badour     uint8x16_t sub;
5998*80a68eefSBob Badour     sub = vabdq_u8(b, c);
5999*80a68eefSBob Badour     return vaddq_u8( a, sub);
6000*80a68eefSBob Badour }
6001*80a68eefSBob Badour 
6002*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vabaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VABA.s16 q0,q0,q0
6003*80a68eefSBob Badour _NEON2SSE_INLINE uint16x8_t vabaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c)
6004*80a68eefSBob Badour {
6005*80a68eefSBob Badour     uint16x8_t sub;
6006*80a68eefSBob Badour     sub = vabdq_u16(b, c);
6007*80a68eefSBob Badour     return vaddq_u16( a, sub);
6008*80a68eefSBob Badour }
6009*80a68eefSBob Badour 
6010*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vabaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VABA.U32 q0,q0,q0
6011*80a68eefSBob Badour _NEON2SSE_INLINE uint32x4_t vabaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c)
6012*80a68eefSBob Badour {
6013*80a68eefSBob Badour     uint32x4_t sub;
6014*80a68eefSBob Badour     sub = vabdq_u32(b, c);
6015*80a68eefSBob Badour     return vaddq_u32( a, sub);
6016*80a68eefSBob Badour }
6017*80a68eefSBob Badour 
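//Note on the vaba/vabaq family above: it is simply vabd followed by an add, which makes it handy
//for sum-of-absolute-differences style loops. A minimal sketch, kept out of compilation and
//assuming the vdupq_n_u8 and vld1q_u8 intrinsics defined elsewhere in this header (the per-lane
//accumulation wraps modulo 256, so the block count must stay small or a wider type must be used):
#if 0
#include <stddef.h>
static uint8x16_t accumulate_absdiff_sketch(const uint8_t* p, const uint8_t* q, size_t blocks)
{
    uint8x16_t acc = vdupq_n_u8(0);
    size_t i;
    for (i = 0; i < blocks; i++) {
        uint8x16_t vb = vld1q_u8(p + 16 * i);
        uint8x16_t vc = vld1q_u8(q + 16 * i);
        acc = vabaq_u8(acc, vb, vc); //acc[j] += |vb[j] - vc[j]| for every lane j
    }
    return acc;
}
#endif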
6018*80a68eefSBob Badour //************** Absolute difference and accumulate - long ********************************
6019*80a68eefSBob Badour //*************************************************************************************
6020*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vabal_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VABAL.S8 q0,d0,d0
6021*80a68eefSBob Badour _NEON2SSE_INLINE int16x8_t vabal_s8(int16x8_t a, int8x8_t b, int8x8_t c) // VABAL.S8 q0,d0,d0
6022*80a68eefSBob Badour {
6023*80a68eefSBob Badour     __m128i b16, c16, res;
6024*80a68eefSBob Badour     b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1,
6025*80a68eefSBob Badour     c16 = _MM_CVTEPI8_EPI16 (_pM128i(c)); //SSE4.1,
6026*80a68eefSBob Badour     res = _mm_abs_epi16 (_mm_sub_epi16 (b16, c16) );
6027*80a68eefSBob Badour     return _mm_add_epi16 (a, res);
6028*80a68eefSBob Badour }
6029*80a68eefSBob Badour 
6030*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vabal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VABAL.S16 q0,d0,d0
6031*80a68eefSBob Badour _NEON2SSE_INLINE int32x4_t vabal_s16(int32x4_t a, int16x4_t b, int16x4_t c) // VABAL.S16 q0,d0,d0
6032*80a68eefSBob Badour {
6033*80a68eefSBob Badour     __m128i b32, c32, res;
6034*80a68eefSBob Badour     b32 = _MM_CVTEPI16_EPI32(_pM128i(b)); //SSE4.1
6035*80a68eefSBob Badour     c32 = _MM_CVTEPI16_EPI32(_pM128i(c)); //SSE4.1
6036*80a68eefSBob Badour     res = _mm_abs_epi32 (_mm_sub_epi32 (b32, c32) );
6037*80a68eefSBob Badour     return _mm_add_epi32 (a, res);
6038*80a68eefSBob Badour }
6039*80a68eefSBob Badour 
6040*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vabal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VABAL.S32 q0,d0,d0
6041*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING (int64x2_t vabal_s32(int64x2_t a, int32x2_t b, int32x2_t c), _NEON2SSE_REASON_SLOW_SERIAL)
6042*80a68eefSBob Badour {
6043*80a68eefSBob Badour     __m128i res;
6044*80a68eefSBob Badour     res = vabdl_s32(b,c);
6045*80a68eefSBob Badour     return _mm_add_epi64(a, res);
6046*80a68eefSBob Badour }
6047*80a68eefSBob Badour 
6048*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vabal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VABAL.U8 q0,d0,d0
6049*80a68eefSBob Badour _NEON2SSE_INLINE uint16x8_t vabal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c)
6050*80a68eefSBob Badour {
6051*80a68eefSBob Badour     __m128i b16, c16, res;
6052*80a68eefSBob Badour     b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1,
6053*80a68eefSBob Badour     c16 = _MM_CVTEPU8_EPI16 (_pM128i(c)); //SSE4.1,
6054*80a68eefSBob Badour     res = _mm_abs_epi16 (_mm_sub_epi16 (b16, c16) );
6055*80a68eefSBob Badour     return _mm_add_epi16 (a, res);
6056*80a68eefSBob Badour }
6057*80a68eefSBob Badour 
6058*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vabal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VABAL.s16 q0,d0,d0
6059*80a68eefSBob Badour _NEON2SSE_INLINE uint32x4_t vabal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c)
6060*80a68eefSBob Badour {
6061*80a68eefSBob Badour     __m128i b32, c32, res;
6062*80a68eefSBob Badour     b32 = _MM_CVTEPU16_EPI32(_pM128i(b)); //SSE4.1
6063*80a68eefSBob Badour     c32 = _MM_CVTEPU16_EPI32(_pM128i(c)); //SSE4.1
6064*80a68eefSBob Badour     res = _mm_abs_epi32 (_mm_sub_epi32 (b32, c32) );
6065*80a68eefSBob Badour     return _mm_add_epi32 (a, res);
6066*80a68eefSBob Badour }
6067*80a68eefSBob Badour 
6068*80a68eefSBob Badour _NEON2SSESTORAGE uint64x2_t vabal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VABAL.U32 q0,d0,d0
6069*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING (uint64x2_t vabal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c), _NEON2SSE_REASON_SLOW_SERIAL)
6070*80a68eefSBob Badour {
6071*80a68eefSBob Badour     __m128i res;
6072*80a68eefSBob Badour     res = vabdl_u32(b,c);
6073*80a68eefSBob Badour     return _mm_add_epi64(a, res);
6074*80a68eefSBob Badour }
6075*80a68eefSBob Badour 
6076*80a68eefSBob Badour //***********************************************************************************
6077*80a68eefSBob Badour //****************  Maximum and minimum operations **********************************
6078*80a68eefSBob Badour //***********************************************************************************
6079*80a68eefSBob Badour //************* Maximum:  vmax -> Vr[i] := (Va[i] >= Vb[i]) ? Va[i] : Vb[i]    *******
6080*80a68eefSBob Badour //***********************************************************************************
6081*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t   vmax_s8(int8x8_t a, int8x8_t b); // VMAX.S8 d0,d0,d0
6082*80a68eefSBob Badour _NEON2SSE_INLINE int8x8_t   vmax_s8(int8x8_t a, int8x8_t b)
6083*80a68eefSBob Badour {
6084*80a68eefSBob Badour     int8x8_t res64;
6085*80a68eefSBob Badour     __m128i res;
6086*80a68eefSBob Badour     res = _MM_MAX_EPI8(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits
6087*80a68eefSBob Badour     return64(res);
6088*80a68eefSBob Badour }
6089*80a68eefSBob Badour 
6090*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vmax_s16(int16x4_t a, int16x4_t b); // VMAX.S16 d0,d0,d0
6091*80a68eefSBob Badour _NEON2SSE_INLINE int16x4_t vmax_s16(int16x4_t a, int16x4_t b)
6092*80a68eefSBob Badour {
6093*80a68eefSBob Badour     int16x4_t res64;
6094*80a68eefSBob Badour     return64(_mm_max_epi16(_pM128i(a),_pM128i(b)));
6095*80a68eefSBob Badour }
6096*80a68eefSBob Badour 
6097*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t   vmax_s32(int32x2_t a, int32x2_t b); // VMAX.S32 d0,d0,d0
6098*80a68eefSBob Badour _NEON2SSE_INLINE int32x2_t   vmax_s32(int32x2_t a, int32x2_t b)
6099*80a68eefSBob Badour {
6100*80a68eefSBob Badour     int32x2_t res64;
6101*80a68eefSBob Badour     __m128i res;
6102*80a68eefSBob Badour     res =  _MM_MAX_EPI32(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits
6103*80a68eefSBob Badour     return64(res);
6104*80a68eefSBob Badour }
6105*80a68eefSBob Badour 
6106*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vmax_u8(uint8x8_t a, uint8x8_t b); // VMAX.U8 d0,d0,d0
6107*80a68eefSBob Badour _NEON2SSE_INLINE uint8x8_t vmax_u8(uint8x8_t a, uint8x8_t b)
6108*80a68eefSBob Badour {
6109*80a68eefSBob Badour     uint8x8_t res64;
6110*80a68eefSBob Badour     return64(_mm_max_epu8(_pM128i(a),_pM128i(b)));
6111*80a68eefSBob Badour }
6112*80a68eefSBob Badour 
6113*80a68eefSBob Badour 
6114*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vmax_u16(uint16x4_t a, uint16x4_t b); // VMAX.s16 d0,d0,d0
6115*80a68eefSBob Badour _NEON2SSE_INLINE uint16x4_t vmax_u16(uint16x4_t a, uint16x4_t b)
6116*80a68eefSBob Badour {
6117*80a68eefSBob Badour     uint16x4_t res64;
6118*80a68eefSBob Badour     return64(_MM_MAX_EPU16(_pM128i(a),_pM128i(b)));
6119*80a68eefSBob Badour }
6120*80a68eefSBob Badour 
6121*80a68eefSBob Badour 
6122*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t   vmax_u32(uint32x2_t a, uint32x2_t b); // VMAX.U32 d0,d0,d0
6123*80a68eefSBob Badour _NEON2SSE_INLINE uint32x2_t   vmax_u32(uint32x2_t a, uint32x2_t b)
6124*80a68eefSBob Badour {
6125*80a68eefSBob Badour     uint32x2_t res64;
6126*80a68eefSBob Badour     __m128i res;
6127*80a68eefSBob Badour     res = _MM_MAX_EPU32(_pM128i(a),_pM128i(b)); //SSE4.1, only the lower 64 bits are used; may be less efficient than a serial version
6128*80a68eefSBob Badour     return64(res);
6129*80a68eefSBob Badour }
6130*80a68eefSBob Badour 
6131*80a68eefSBob Badour _NEON2SSESTORAGE float32x2_t vmax_f32(float32x2_t a, float32x2_t b); // VMAX.F32 d0,d0,d0
6132*80a68eefSBob Badour _NEON2SSE_INLINE float32x2_t vmax_f32(float32x2_t a, float32x2_t b)
6133*80a68eefSBob Badour {
6134*80a68eefSBob Badour     //the serial solution looks faster than the SIMD one
6135*80a68eefSBob Badour     float32x2_t res;
6136*80a68eefSBob Badour     res.m64_f32[0] = (a.m64_f32[0] > b.m64_f32[0]) ? a.m64_f32[0] : b.m64_f32[0];
6137*80a68eefSBob Badour     res.m64_f32[1] = (a.m64_f32[1] > b.m64_f32[1]) ? a.m64_f32[1] : b.m64_f32[1];
6138*80a68eefSBob Badour     return res;
6139*80a68eefSBob Badour }
6140*80a68eefSBob Badour 
6141*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t   vmaxq_s8(int8x16_t a, int8x16_t b); // VMAX.S8 q0,q0,q0
6142*80a68eefSBob Badour #define vmaxq_s8 _MM_MAX_EPI8 //SSE4.1
6143*80a68eefSBob Badour 
6144*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t   vmaxq_s16(int16x8_t a, int16x8_t b); // VMAX.S16 q0,q0,q0
6145*80a68eefSBob Badour #define vmaxq_s16 _mm_max_epi16
6146*80a68eefSBob Badour 
6147*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t   vmaxq_s32(int32x4_t a, int32x4_t b); // VMAX.S32 q0,q0,q0
6148*80a68eefSBob Badour #define vmaxq_s32 _MM_MAX_EPI32 //SSE4.1
6149*80a68eefSBob Badour 
6150*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t   vmaxq_u8(uint8x16_t a, uint8x16_t b); // VMAX.U8 q0,q0,q0
6151*80a68eefSBob Badour #define vmaxq_u8 _mm_max_epu8
6152*80a68eefSBob Badour 
6153*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t   vmaxq_u16(uint16x8_t a, uint16x8_t b); // VMAX.s16 q0,q0,q0
6154*80a68eefSBob Badour #define vmaxq_u16 _MM_MAX_EPU16 //SSE4.1
6155*80a68eefSBob Badour 
6156*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t   vmaxq_u32(uint32x4_t a, uint32x4_t b); // VMAX.U32 q0,q0,q0
6157*80a68eefSBob Badour #define vmaxq_u32 _MM_MAX_EPU32 //SSE4.1
6158*80a68eefSBob Badour 
6159*80a68eefSBob Badour 
6160*80a68eefSBob Badour _NEON2SSESTORAGE float32x4_t vmaxq_f32(float32x4_t a, float32x4_t b); // VMAX.F32 q0,q0,q0
6161*80a68eefSBob Badour #define vmaxq_f32 _mm_max_ps
6162*80a68eefSBob Badour 
6163*80a68eefSBob Badour 
6164*80a68eefSBob Badour _NEON2SSESTORAGE float64x2_t vmaxq_f64(float64x2_t a, float64x2_t b); // VMAX.F64 q0,q0,q0
6165*80a68eefSBob Badour #define vmaxq_f64 _mm_max_pd
6166*80a68eefSBob Badour 
6167*80a68eefSBob Badour 
6168*80a68eefSBob Badour //*************** Minimum: vmin -> Vr[i] := (Va[i] >= Vb[i]) ? Vb[i] : Va[i] ********************************
6169*80a68eefSBob Badour //***********************************************************************************************************
6170*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t   vmin_s8(int8x8_t a, int8x8_t b); // VMIN.S8 d0,d0,d0
6171*80a68eefSBob Badour _NEON2SSE_INLINE int8x8_t   vmin_s8(int8x8_t a, int8x8_t b)
6172*80a68eefSBob Badour {
6173*80a68eefSBob Badour     int8x8_t res64;
6174*80a68eefSBob Badour     __m128i res;
6175*80a68eefSBob Badour     res = _MM_MIN_EPI8(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits
6176*80a68eefSBob Badour     return64(res);
6177*80a68eefSBob Badour }
6178*80a68eefSBob Badour 
6179*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vmin_s16(int16x4_t a, int16x4_t b); // VMIN.S16 d0,d0,d0
6180*80a68eefSBob Badour _NEON2SSE_INLINE int16x4_t vmin_s16(int16x4_t a, int16x4_t b)
6181*80a68eefSBob Badour {
6182*80a68eefSBob Badour     int16x4_t res64;
6183*80a68eefSBob Badour     return64(_mm_min_epi16(_pM128i(a),_pM128i(b)));
6184*80a68eefSBob Badour }
6185*80a68eefSBob Badour 
6186*80a68eefSBob Badour 
6187*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t   vmin_s32(int32x2_t a, int32x2_t b); // VMIN.S32 d0,d0,d0
6188*80a68eefSBob Badour _NEON2SSE_INLINE int32x2_t   vmin_s32(int32x2_t a, int32x2_t b)
6189*80a68eefSBob Badour {
6190*80a68eefSBob Badour     int32x2_t res64;
6191*80a68eefSBob Badour     __m128i res;
6192*80a68eefSBob Badour     res = _MM_MIN_EPI32(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits
6193*80a68eefSBob Badour     return64(res);
6194*80a68eefSBob Badour }
6195*80a68eefSBob Badour 
6196*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vmin_u8(uint8x8_t a, uint8x8_t b); // VMIN.U8 d0,d0,d0
6197*80a68eefSBob Badour _NEON2SSE_INLINE uint8x8_t vmin_u8(uint8x8_t a, uint8x8_t b)
6198*80a68eefSBob Badour {
6199*80a68eefSBob Badour     uint8x8_t res64;
6200*80a68eefSBob Badour     return64(_mm_min_epu8(_pM128i(a),_pM128i(b)));
6201*80a68eefSBob Badour }
6202*80a68eefSBob Badour 
6203*80a68eefSBob Badour 
6204*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vmin_u16(uint16x4_t a, uint16x4_t b); // VMIN.s16 d0,d0,d0
6205*80a68eefSBob Badour _NEON2SSE_INLINE uint16x4_t vmin_u16(uint16x4_t a, uint16x4_t b)
6206*80a68eefSBob Badour {
6207*80a68eefSBob Badour     uint16x4_t res64;
6208*80a68eefSBob Badour     return64(_MM_MIN_EPU16(_pM128i(a),_pM128i(b)));
6209*80a68eefSBob Badour }
6210*80a68eefSBob Badour 
6211*80a68eefSBob Badour 
6212*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t   vmin_u32(uint32x2_t a, uint32x2_t b); // VMIN.U32 d0,d0,d0
6213*80a68eefSBob Badour _NEON2SSE_INLINE uint32x2_t   vmin_u32(uint32x2_t a, uint32x2_t b)
6214*80a68eefSBob Badour {
6215*80a68eefSBob Badour     uint32x2_t res64;
6216*80a68eefSBob Badour     __m128i res;
6217*80a68eefSBob Badour     res = _MM_MIN_EPU32(_pM128i(a),_pM128i(b)); //SSE4.1, only the lower 64 bits are used; may be less efficient than a serial version
6218*80a68eefSBob Badour     return64(res);
6219*80a68eefSBob Badour }
6220*80a68eefSBob Badour 
6221*80a68eefSBob Badour _NEON2SSESTORAGE float32x2_t vmin_f32(float32x2_t a, float32x2_t b); // VMIN.F32 d0,d0,d0
6222*80a68eefSBob Badour _NEON2SSE_INLINE float32x2_t vmin_f32(float32x2_t a, float32x2_t b)
6223*80a68eefSBob Badour {
6224*80a68eefSBob Badour     //the serial solution looks faster than the SIMD one
6225*80a68eefSBob Badour     float32x2_t res;
6226*80a68eefSBob Badour     res.m64_f32[0] = (a.m64_f32[0] < b.m64_f32[0]) ? a.m64_f32[0] : b.m64_f32[0];
6227*80a68eefSBob Badour     res.m64_f32[1] = (a.m64_f32[1] < b.m64_f32[1]) ? a.m64_f32[1] : b.m64_f32[1];
6228*80a68eefSBob Badour     return res;
6229*80a68eefSBob Badour }
6230*80a68eefSBob Badour 
6231*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t   vminq_s8(int8x16_t a, int8x16_t b); // VMIN.S8 q0,q0,q0
6232*80a68eefSBob Badour #define vminq_s8 _MM_MIN_EPI8 //SSE4.1
6233*80a68eefSBob Badour 
6234*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t   vminq_s16(int16x8_t a, int16x8_t b); // VMIN.S16 q0,q0,q0
6235*80a68eefSBob Badour #define vminq_s16 _mm_min_epi16
6236*80a68eefSBob Badour 
6237*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t   vminq_s32(int32x4_t a, int32x4_t b); // VMIN.S32 q0,q0,q0
6238*80a68eefSBob Badour #define vminq_s32 _MM_MIN_EPI32 //SSE4.1
6239*80a68eefSBob Badour 
6240*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t   vminq_u8(uint8x16_t a, uint8x16_t b); // VMIN.U8 q0,q0,q0
6241*80a68eefSBob Badour #define vminq_u8 _mm_min_epu8
6242*80a68eefSBob Badour 
6243*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t   vminq_u16(uint16x8_t a, uint16x8_t b); // VMIN.s16 q0,q0,q0
6244*80a68eefSBob Badour #define vminq_u16 _MM_MIN_EPU16 //SSE4.1
6245*80a68eefSBob Badour 
6246*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t   vminq_u32(uint32x4_t a, uint32x4_t b); // VMIN.U32 q0,q0,q0
6247*80a68eefSBob Badour #define vminq_u32 _MM_MIN_EPU32 //SSE4.1
6248*80a68eefSBob Badour 
6249*80a68eefSBob Badour _NEON2SSESTORAGE float32x4_t vminq_f32(float32x4_t a, float32x4_t b); // VMIN.F32 q0,q0,q0
6250*80a68eefSBob Badour #define vminq_f32 _mm_min_ps
6251*80a68eefSBob Badour 
6252*80a68eefSBob Badour 
6253*80a68eefSBob Badour _NEON2SSESTORAGE float64x2_t vminq_f64(float64x2_t a, float64x2_t b); // VMIN.F64 q0,q0,q0
6254*80a68eefSBob Badour #define vminq_f64 _mm_min_pd
6255*80a68eefSBob Badour 
6256*80a68eefSBob Badour 
6257*80a68eefSBob Badour //*************  Pairwise addition operations. **************************************
6258*80a68eefSBob Badour //************************************************************************************
6259*80a68eefSBob Badour //Pairwise add - adds adjacent pairs of elements of two vectors, and places the results in the destination vector
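//Semantics sketch: for int16x4_t a = {a0,a1,a2,a3} and b = {b0,b1,b2,b3}
//    vpadd_s16(a, b) = {a0+a1, a2+a3, b0+b1, b2+b3}
//e.g. vpadd_s16({1,2,3,4}, {5,6,7,8}) = {3, 7, 11, 15}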
6260*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vpadd_s8(int8x8_t a, int8x8_t b); // VPADD.I8 d0,d0,d0
6261*80a68eefSBob Badour _NEON2SSE_INLINE int8x8_t vpadd_s8(int8x8_t a, int8x8_t b) // VPADD.I8 d0,d0,d0
6262*80a68eefSBob Badour {
6263*80a68eefSBob Badour     //no 8 bit hadd in IA32, need to go to 16 bit and then pack
6264*80a68eefSBob Badour     int8x8_t res64;
6265*80a68eefSBob Badour     __m128i a16, b16, res;
6266*80a68eefSBob Badour     a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); // SSE 4.1
6267*80a68eefSBob Badour     b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); // SSE 4.1
6268*80a68eefSBob Badour     res = _mm_hadd_epi16 (a16, b16);
6269*80a68eefSBob Badour     res = _mm_shuffle_epi8 (res, *(__m128i*) mask8_16_even_odd); //return to 8 bit, use low 64 bits
6270*80a68eefSBob Badour     return64(res);
6271*80a68eefSBob Badour }
6272*80a68eefSBob Badour 
6273*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t   vpadd_s16(int16x4_t a, int16x4_t b); // VPADD.I16 d0,d0,d0
6274*80a68eefSBob Badour _NEON2SSE_INLINE int16x4_t   vpadd_s16(int16x4_t a, int16x4_t b)
6275*80a68eefSBob Badour {
6276*80a68eefSBob Badour     int16x4_t res64;
6277*80a68eefSBob Badour     __m128i hadd128;
6278*80a68eefSBob Badour     hadd128 = _mm_hadd_epi16 (_pM128i(a), _pM128i(b));
6279*80a68eefSBob Badour     hadd128 = _mm_shuffle_epi32 (hadd128, 0 | (2 << 2) | (1 << 4) | (3 << 6));
6280*80a68eefSBob Badour     return64(hadd128);
6281*80a68eefSBob Badour }
6282*80a68eefSBob Badour 
6283*80a68eefSBob Badour 
6284*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t   vpadd_s32(int32x2_t a, int32x2_t b); // VPADD.I32 d0,d0,d0
6285*80a68eefSBob Badour _NEON2SSE_INLINE int32x2_t   vpadd_s32(int32x2_t a, int32x2_t b)
6286*80a68eefSBob Badour {
6287*80a68eefSBob Badour     int32x2_t res64;
6288*80a68eefSBob Badour     __m128i hadd128;
6289*80a68eefSBob Badour     hadd128 = _mm_hadd_epi32 (_pM128i(a), _pM128i(b));
6290*80a68eefSBob Badour     hadd128 = _mm_shuffle_epi32 (hadd128, 0 | (2 << 2) | (1 << 4) | (3 << 6));
6291*80a68eefSBob Badour     return64(hadd128);
6292*80a68eefSBob Badour }
6293*80a68eefSBob Badour 
6294*80a68eefSBob Badour 
6295*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vpadd_u8(uint8x8_t a, uint8x8_t b); // VPADD.I8 d0,d0,d0
6296*80a68eefSBob Badour _NEON2SSE_INLINE uint8x8_t vpadd_u8(uint8x8_t a, uint8x8_t b) // VPADD.I8 d0,d0,d0
6297*80a68eefSBob Badour {
6298*80a68eefSBob Badour     //  no 8 bit hadd in IA32, need to go to 16 bit and then pack
6299*80a68eefSBob Badour     uint8x8_t res64;
6300*80a68eefSBob Badour //  no unsigned _mm_hadd_ functions in IA32, but unsigned 8-bit values fit into signed 16-bit lanes, so it works
6301*80a68eefSBob Badour     __m128i mask8, a16, b16, res;
6302*80a68eefSBob Badour     mask8 = _mm_set1_epi16(0xff);
6303*80a68eefSBob Badour     a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); // SSE 4.1
6304*80a68eefSBob Badour     b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); // SSE 4.1
6305*80a68eefSBob Badour     res = _mm_hadd_epi16 (a16, b16);
6306*80a68eefSBob Badour     res = _mm_and_si128(res, mask8); //to avoid saturation
6307*80a68eefSBob Badour     res = _mm_packus_epi16 (res,res); //use low 64 bits
6308*80a68eefSBob Badour     return64(res);
6309*80a68eefSBob Badour }
6310*80a68eefSBob Badour 
6311*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vpadd_u16(uint16x4_t a, uint16x4_t b); // VPADD.I16 d0,d0,d0
6312*80a68eefSBob Badour _NEON2SSE_INLINE uint16x4_t vpadd_u16(uint16x4_t a, uint16x4_t b) // VPADD.I16 d0,d0,d0
6313*80a68eefSBob Badour {
6314*80a68eefSBob Badour     // the solution may not be optimal; serial execution may be faster
6315*80a68eefSBob Badour     // no unsigned _mm_hadd_ functions in IA32, need to move from unsigned to signed
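    // bias arithmetic: (a - 32767) + (b - 32767) = a + b - 65534, and the later addition of 0xfffe (65534) wraps modulo 2^16, so the low 16 bits of a + b are recovered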
6316*80a68eefSBob Badour     uint16x4_t res64;
6317*80a68eefSBob Badour     __m128i c32767,  cfffe, as, bs, res;
6318*80a68eefSBob Badour     c32767 = _mm_set1_epi16 (32767);
6319*80a68eefSBob Badour     cfffe = _mm_set1_epi16 ((int16_t)0xfffe);
6320*80a68eefSBob Badour     as = _mm_sub_epi16 (_pM128i(a), c32767);
6321*80a68eefSBob Badour     bs = _mm_sub_epi16 (_pM128i(b), c32767);
6322*80a68eefSBob Badour     res = _mm_hadd_epi16 (as, bs);
6323*80a68eefSBob Badour     res = _mm_add_epi16 (res, cfffe);
6324*80a68eefSBob Badour     res = _mm_shuffle_epi32 (res, 0 | (2 << 2) | (1 << 4) | (3 << 6));
6325*80a68eefSBob Badour     return64(res);
6326*80a68eefSBob Badour }
6327*80a68eefSBob Badour 
6328*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vpadd_u32(uint32x2_t a, uint32x2_t b); // VPADD.I32 d0,d0,d0
6329*80a68eefSBob Badour _NEON2SSE_INLINE uint32x2_t vpadd_u32(uint32x2_t a, uint32x2_t b) //serial may be faster
6330*80a68eefSBob Badour {
6331*80a68eefSBob Badour     //hadd doesn't work for unsigned values
6332*80a68eefSBob Badour     uint32x2_t res64;
6333*80a68eefSBob Badour     __m128i ab, ab_sh, res;
6334*80a68eefSBob Badour     ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //a0 a1 b0 b1
6335*80a68eefSBob Badour     ab_sh = _mm_shuffle_epi32(ab, 1 | (0 << 2) | (3 << 4) | (2 << 6)); //a1, a0, b1, b0
6336*80a68eefSBob Badour     res = _mm_add_epi32(ab, ab_sh);
6337*80a68eefSBob Badour     res = _mm_shuffle_epi32(res, 0 | (2 << 2) | (1 << 4) | (3 << 6));
6338*80a68eefSBob Badour     return64(res);
6339*80a68eefSBob Badour }
6340*80a68eefSBob Badour 
6341*80a68eefSBob Badour _NEON2SSESTORAGE float32x2_t vpadd_f32(float32x2_t a, float32x2_t b); // VPADD.F32 d0,d0,d0
6342*80a68eefSBob Badour _NEON2SSE_INLINE float32x2_t vpadd_f32(float32x2_t a, float32x2_t b)
6343*80a68eefSBob Badour {
6344*80a68eefSBob Badour     __m128 hadd128;
6345*80a68eefSBob Badour     __m64_128 res64;
6346*80a68eefSBob Badour     hadd128 = _mm_hadd_ps (_pM128(a), _pM128(b));
6347*80a68eefSBob Badour     hadd128 = _mm_shuffle_ps (hadd128, hadd128, _MM_SHUFFLE(3,1, 2, 0)); //use low 64 bits
6348*80a68eefSBob Badour     _M64f(res64, hadd128);
6349*80a68eefSBob Badour     return res64;
6350*80a68eefSBob Badour }
6351*80a68eefSBob Badour 
6352*80a68eefSBob Badour 
6353*80a68eefSBob Badour //**************************  Long pairwise add  **********************************
6354*80a68eefSBob Badour //*********************************************************************************
6355*80a68eefSBob Badour //Adds adjacent pairs of elements of a vector,sign or zero extends the results to twice their original width,
6356*80a68eefSBob Badour // and places the final results in the destination vector.
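//Semantics sketch: vpaddl_s8 widens while it adds, so for int8x8_t a = {a0,...,a7}
//    vpaddl_s8(a) = (int16x4_t){a0+a1, a2+a3, a4+a5, a6+a7}
//e.g. vpaddl_s8({100,100, 1,2, 3,4, 5,6}) = {200, 3, 7, 11} - the 16-bit lanes keep sums that would overflow 8 bits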
6357*80a68eefSBob Badour 
6358*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vpaddl_s8(int8x8_t a); // VPADDL.S8 d0,d0
6359*80a68eefSBob Badour _NEON2SSE_INLINE int16x4_t vpaddl_s8(int8x8_t a) // VPADDL.S8 d0,d0
6360*80a68eefSBob Badour {
6361*80a68eefSBob Badour     //no 8 bit hadd in IA32, need to go to 16 bit anyway
6362*80a68eefSBob Badour     __m128i a16;
6363*80a68eefSBob Badour     int16x4_t res64;
6364*80a68eefSBob Badour     a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); // SSE 4.1
6365*80a68eefSBob Badour     a16 = _mm_hadd_epi16 (a16,  a16); //use low 64 bits
6366*80a68eefSBob Badour     return64(a16);
6367*80a68eefSBob Badour }
6368*80a68eefSBob Badour 
6369*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vpaddl_s16(int16x4_t a); // VPADDL.S16 d0,d0
6370*80a68eefSBob Badour _NEON2SSE_INLINE int32x2_t vpaddl_s16(int16x4_t a) // VPADDL.S16 d0,d0
6371*80a68eefSBob Badour {
6372*80a68eefSBob Badour     // the solution may not be optimal; serial execution may be faster
6373*80a68eefSBob Badour     int32x2_t res64;
6374*80a68eefSBob Badour     __m128i r32_1;
6375*80a68eefSBob Badour     r32_1 = _MM_CVTEPI16_EPI32 (_pM128i(a));
6376*80a68eefSBob Badour     r32_1 = _mm_hadd_epi32(r32_1, r32_1); //use low 64 bits
6377*80a68eefSBob Badour     return64(r32_1);
6378*80a68eefSBob Badour }
6379*80a68eefSBob Badour 
6380*80a68eefSBob Badour _NEON2SSESTORAGE int64x1_t vpaddl_s32(int32x2_t a); // VPADDL.S32 d0,d0
6381*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vpaddl_s32(int32x2_t a), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution looks faster
6382*80a68eefSBob Badour {
6383*80a68eefSBob Badour     int64x1_t res;
6384*80a68eefSBob Badour     res.m64_i64[0] = (int64_t)a.m64_i32[0] + (int64_t)a.m64_i32[1];
6385*80a68eefSBob Badour     return res;
6386*80a68eefSBob Badour }
6387*80a68eefSBob Badour 
6388*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vpaddl_u8(uint8x8_t a); // VPADDL.U8 d0,d0
6389*80a68eefSBob Badour _NEON2SSE_INLINE uint16x4_t vpaddl_u8(uint8x8_t a) // VPADDL.U8 d0,d0
6390*80a68eefSBob Badour {
6391*80a68eefSBob Badour     //  no 8 bit hadd in IA32, need to go to 16 bit
6392*80a68eefSBob Badour //  no unsigned _mm_hadd_ functions in IA32, but unsigned 8-bit values fit into signed 16-bit lanes, so it works
6393*80a68eefSBob Badour     uint16x4_t res64;
6394*80a68eefSBob Badour     __m128i a16;
6395*80a68eefSBob Badour     a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); // SSE 4.1 use low 64 bits
6396*80a68eefSBob Badour     a16 = _mm_hadd_epi16 (a16, a16); //use low 64 bits
6397*80a68eefSBob Badour     return64(a16);
6398*80a68eefSBob Badour }
6399*80a68eefSBob Badour 
6400*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vpaddl_u16(uint16x4_t a); // VPADDL.U16 d0,d0
6401*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vpaddl_u16(uint16x4_t a),  _NEON2SSE_REASON_SLOW_SERIAL)
6402*80a68eefSBob Badour {
6403*80a68eefSBob Badour     //serial solution looks faster than a SIMD one
6404*80a68eefSBob Badour     uint32x2_t res;
6405*80a68eefSBob Badour     res.m64_u32[0] = (uint32_t)a.m64_u16[0] + (uint32_t)a.m64_u16[1];
6406*80a68eefSBob Badour     res.m64_u32[1] = (uint32_t)a.m64_u16[2] + (uint32_t)a.m64_u16[3];
6407*80a68eefSBob Badour     return res;
6408*80a68eefSBob Badour }
6409*80a68eefSBob Badour 
6410*80a68eefSBob Badour _NEON2SSESTORAGE uint64x1_t vpaddl_u32(uint32x2_t a); // VPADDL.U32 d0,d0
6411*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vpaddl_u32(uint32x2_t a), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution looks faster
6412*80a68eefSBob Badour {
6413*80a68eefSBob Badour     uint64x1_t res;
6414*80a68eefSBob Badour     res.m64_u64[0] = (uint64_t)a.m64_u32[0] + (uint64_t)a.m64_u32[1];
6415*80a68eefSBob Badour     return res;
6416*80a68eefSBob Badour }
6417*80a68eefSBob Badour 
6418*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vpaddlq_s8(int8x16_t a); // VPADDL.S8 q0,q0
6419*80a68eefSBob Badour _NEON2SSE_INLINE int16x8_t vpaddlq_s8(int8x16_t a) // VPADDL.S8 q0,q0
6420*80a68eefSBob Badour {
6421*80a68eefSBob Badour     //no 8 bit hadd in IA32, need to go to 16 bit
6422*80a68eefSBob Badour     __m128i r16_1, r16_2;
6423*80a68eefSBob Badour     r16_1 = _MM_CVTEPI8_EPI16 (a); // SSE 4.1
6424*80a68eefSBob Badour     //swap hi and low part of r to process the remaining data
6425*80a68eefSBob Badour     r16_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
6426*80a68eefSBob Badour     r16_2 = _MM_CVTEPI8_EPI16 (r16_2);
6427*80a68eefSBob Badour     return _mm_hadd_epi16 (r16_1, r16_2);
6428*80a68eefSBob Badour }
6429*80a68eefSBob Badour 
6430*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vpaddlq_s16(int16x8_t a); // VPADDL.S16 q0,q0
6431*80a68eefSBob Badour _NEON2SSE_INLINE int32x4_t vpaddlq_s16(int16x8_t a) // VPADDL.S16 q0,q0
6432*80a68eefSBob Badour {
6433*80a68eefSBob Badour     //no widening hadd in IA32, need to go to 32 bit first
6434*80a68eefSBob Badour     __m128i r32_1, r32_2;
6435*80a68eefSBob Badour     r32_1 = _MM_CVTEPI16_EPI32(a);
6436*80a68eefSBob Badour     //swap hi and low part of r to process the remaining data
6437*80a68eefSBob Badour     r32_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
6438*80a68eefSBob Badour     r32_2 = _MM_CVTEPI16_EPI32 (r32_2);
6439*80a68eefSBob Badour     return _mm_hadd_epi32 (r32_1, r32_2);
6440*80a68eefSBob Badour }
6441*80a68eefSBob Badour 
6442*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vpaddlq_s32(int32x4_t a); // VPADDL.S32 q0,q0
6443*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vpaddlq_s32(int32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL) // VPADDL.S32 q0,q0
6444*80a68eefSBob Badour {
6445*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 int32_t atmp[4];
6446*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 int64_t res[2];
6447*80a68eefSBob Badour     _mm_store_si128((__m128i*)atmp, a);
6448*80a68eefSBob Badour     res[0] = (int64_t)atmp[0] + (int64_t)atmp[1];
6449*80a68eefSBob Badour     res[1] = (int64_t)atmp[2] + (int64_t)atmp[3];
6450*80a68eefSBob Badour     return _mm_load_si128((__m128i*)res);
6451*80a68eefSBob Badour }
6452*80a68eefSBob Badour 
6453*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vpaddlq_u8(uint8x16_t a); // VPADDL.U8 q0,q0
6454*80a68eefSBob Badour _NEON2SSE_INLINE uint16x8_t vpaddlq_u8(uint8x16_t a) // VPADDL.U8 q0,q0
6455*80a68eefSBob Badour {
6456*80a68eefSBob Badour     //no 8 bit hadd in IA32, need to go to 16 bit
6457*80a68eefSBob Badour     __m128i r16_1, r16_2;
6458*80a68eefSBob Badour     r16_1 = _MM_CVTEPU8_EPI16(a);
6459*80a68eefSBob Badour     //swap hi and low part of r to process the remaining data
6460*80a68eefSBob Badour     r16_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
6461*80a68eefSBob Badour     r16_2 = _MM_CVTEPU8_EPI16 (r16_2);
6462*80a68eefSBob Badour     return _mm_hadd_epi16 (r16_1, r16_2);
6463*80a68eefSBob Badour }
6464*80a68eefSBob Badour 
6465*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vpaddlq_u16(uint16x8_t a); // VPADDL.U16 q0,q0
6466*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vpaddlq_u16(uint16x8_t a),  _NEON2SSE_REASON_SLOW_SERIAL)
6467*80a68eefSBob Badour {
6468*80a68eefSBob Badour     //serial solution looks faster than a SIMD one
6469*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 uint16_t atmp[8];
6470*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 uint32_t res[4];
6471*80a68eefSBob Badour     _mm_store_si128((__m128i*)atmp, a);
6472*80a68eefSBob Badour     res[0] = (uint32_t)atmp[0] + (uint32_t)atmp[1];
6473*80a68eefSBob Badour     res[1] = (uint32_t)atmp[2] + (uint32_t)atmp[3];
6474*80a68eefSBob Badour     res[2] = (uint32_t)atmp[4] + (uint32_t)atmp[5];
6475*80a68eefSBob Badour     res[3] = (uint32_t)atmp[6] + (uint32_t)atmp[7];
6476*80a68eefSBob Badour     return _mm_load_si128((__m128i*)res);
6477*80a68eefSBob Badour }
6478*80a68eefSBob Badour 
6479*80a68eefSBob Badour _NEON2SSESTORAGE uint64x2_t vpaddlq_u32(uint32x4_t a); // VPADDL.U32 q0,q0
6480*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vpaddlq_u32(uint32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL)
6481*80a68eefSBob Badour {
6482*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 uint32_t atmp[4];
6483*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 uint64_t res[2];
6484*80a68eefSBob Badour     _mm_store_si128((__m128i*)atmp, a);
6485*80a68eefSBob Badour     res[0] = (uint64_t)atmp[0] + (uint64_t)atmp[1];
6486*80a68eefSBob Badour     res[1] = (uint64_t)atmp[2] + (uint64_t)atmp[3];
6487*80a68eefSBob Badour     return _mm_load_si128((__m128i*)res);
6488*80a68eefSBob Badour }
6489*80a68eefSBob Badour 
6490*80a68eefSBob Badour //************************  Long pairwise add and accumulate **************************
6491*80a68eefSBob Badour //****************************************************************************************
6492*80a68eefSBob Badour //VPADAL (Vector Pairwise Add and Accumulate Long) adds adjacent pairs of elements of a vector,
6493*80a68eefSBob Badour // and accumulates the  values of the results into the elements of the destination (wide) vector
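//Semantics sketch: vpadal_s16(acc, v) is vpaddl_s16(v) plus an accumulate, so for int32x2_t acc and int16x4_t v
//    vpadal_s16(acc, v) = {acc0 + (v0+v1), acc1 + (v2+v3)}
//which is the usual building block for widening sums over long arrays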
6494*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vpadal_s8(int16x4_t a,  int8x8_t b); // VPADAL.S8 d0,d0
6495*80a68eefSBob Badour _NEON2SSE_INLINE int16x4_t vpadal_s8(int16x4_t a,  int8x8_t b)
6496*80a68eefSBob Badour {
6497*80a68eefSBob Badour     int16x4_t res64;
6498*80a68eefSBob Badour     return64(vpadalq_s8(_pM128i(a), _pM128i(b)));
6499*80a68eefSBob Badour }
6500*80a68eefSBob Badour 
6501*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vpadal_s16(int32x2_t a,  int16x4_t b); // VPADAL.S16 d0,d0
6502*80a68eefSBob Badour _NEON2SSE_INLINE int32x2_t vpadal_s16(int32x2_t a,  int16x4_t b)
6503*80a68eefSBob Badour {
6504*80a68eefSBob Badour     int32x2_t res64;
6505*80a68eefSBob Badour     return64(vpadalq_s16(_pM128i(a), _pM128i(b)));
6506*80a68eefSBob Badour }
6507*80a68eefSBob Badour 
6508*80a68eefSBob Badour 
6509*80a68eefSBob Badour _NEON2SSESTORAGE int64x1_t vpadal_s32(int64x1_t a, int32x2_t b); // VPADAL.S32 d0,d0
6510*80a68eefSBob Badour _NEON2SSE_INLINE int64x1_t vpadal_s32(int64x1_t a, int32x2_t b)
6511*80a68eefSBob Badour {
6512*80a68eefSBob Badour     int64x1_t res;
6513*80a68eefSBob Badour     res.m64_i64[0] = (int64_t)b.m64_i32[0] + (int64_t)b.m64_i32[1] + a.m64_i64[0];
6514*80a68eefSBob Badour     return res;
6515*80a68eefSBob Badour }
6516*80a68eefSBob Badour 
6517*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vpadal_u8(uint16x4_t a,  uint8x8_t b); // VPADAL.U8 d0,d0
6518*80a68eefSBob Badour _NEON2SSE_INLINE uint16x4_t vpadal_u8(uint16x4_t a,  uint8x8_t b)
6519*80a68eefSBob Badour {
6520*80a68eefSBob Badour     uint16x4_t res64;
6521*80a68eefSBob Badour     return64(vpadalq_u8(_pM128i(a), _pM128i(b)));
6522*80a68eefSBob Badour }
6523*80a68eefSBob Badour 
6524*80a68eefSBob Badour 
6525*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vpadal_u16(uint32x2_t a,  uint16x4_t b); // VPADAL.U16 d0,d0
6526*80a68eefSBob Badour _NEON2SSE_INLINE uint32x2_t vpadal_u16(uint32x2_t a,  uint16x4_t b)
6527*80a68eefSBob Badour {
6528*80a68eefSBob Badour     uint32x2_t res64;
6529*80a68eefSBob Badour     return64(vpadalq_u16(_pM128i(a), _pM128i(b)));
6530*80a68eefSBob Badour }
6531*80a68eefSBob Badour 
6532*80a68eefSBob Badour _NEON2SSESTORAGE uint64x1_t vpadal_u32(uint64x1_t a, uint32x2_t b); // VPADAL.U32 d0,d0
6533*80a68eefSBob Badour _NEON2SSE_INLINE uint64x1_t vpadal_u32(uint64x1_t a, uint32x2_t b)
6534*80a68eefSBob Badour {
6535*80a68eefSBob Badour     uint64x1_t res;
6536*80a68eefSBob Badour     res.m64_u64[0] = (uint64_t)b.m64_u32[0] + (uint64_t)b.m64_u32[1] + a.m64_u64[0];
6537*80a68eefSBob Badour     return res;
6538*80a68eefSBob Badour }
6539*80a68eefSBob Badour 
6540*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vpadalq_s8(int16x8_t a, int8x16_t b); // VPADAL.S8 q0,q0
6541*80a68eefSBob Badour _NEON2SSE_INLINE int16x8_t vpadalq_s8(int16x8_t a, int8x16_t b) // VPADAL.S8 q0,q0
6542*80a68eefSBob Badour {
6543*80a68eefSBob Badour     int16x8_t pad;
6544*80a68eefSBob Badour     pad = vpaddlq_s8(b);
6545*80a68eefSBob Badour     return _mm_add_epi16 (a, pad);
6546*80a68eefSBob Badour }
6547*80a68eefSBob Badour 
6548*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vpadalq_s16(int32x4_t a, int16x8_t b); // VPADAL.S16 q0,q0
6549*80a68eefSBob Badour _NEON2SSE_INLINE int32x4_t vpadalq_s16(int32x4_t a, int16x8_t b) // VPADAL.S16 q0,q0
6550*80a68eefSBob Badour {
6551*80a68eefSBob Badour     int32x4_t pad;
6552*80a68eefSBob Badour     pad = vpaddlq_s16(b);
6553*80a68eefSBob Badour     return _mm_add_epi32(a, pad);
6554*80a68eefSBob Badour }
6555*80a68eefSBob Badour 
6556*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vpadalq_s32(int64x2_t a, int32x4_t b); // VPADAL.S32 q0,q0
6557*80a68eefSBob Badour _NEON2SSE_INLINE int64x2_t vpadalq_s32(int64x2_t a, int32x4_t b)
6558*80a68eefSBob Badour {
6559*80a68eefSBob Badour     int64x2_t pad;
6560*80a68eefSBob Badour     pad = vpaddlq_s32(b);
6561*80a68eefSBob Badour     return _mm_add_epi64 (a, pad);
6562*80a68eefSBob Badour }
6563*80a68eefSBob Badour 
6564*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vpadalq_u8(uint16x8_t a, uint8x16_t b); // VPADAL.U8 q0,q0
6565*80a68eefSBob Badour _NEON2SSE_INLINE uint16x8_t vpadalq_u8(uint16x8_t a, uint8x16_t b) // VPADAL.U8 q0,q0
6566*80a68eefSBob Badour {
6567*80a68eefSBob Badour     uint16x8_t pad;
6568*80a68eefSBob Badour     pad = vpaddlq_u8(b);
6569*80a68eefSBob Badour     return _mm_add_epi16 (a, pad);
6570*80a68eefSBob Badour }
6571*80a68eefSBob Badour 
6572*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vpadalq_u16(uint32x4_t a, uint16x8_t b); // VPADAL.U16 q0,q0
6573*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vpadalq_u16(uint32x4_t a, uint16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
6574*80a68eefSBob Badour {
6575*80a68eefSBob Badour     uint32x4_t pad;
6576*80a68eefSBob Badour     pad = vpaddlq_u16(b);
6577*80a68eefSBob Badour     return _mm_add_epi32(a, pad);
6578*80a68eefSBob Badour } //no optimal SIMD solution, serial is faster
6579*80a68eefSBob Badour 
6580*80a68eefSBob Badour _NEON2SSESTORAGE uint64x2_t vpadalq_u32(uint64x2_t a, uint32x4_t b); // VPADAL.U32 q0,q0
6581*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vpadalq_u32(uint64x2_t a, uint32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
6582*80a68eefSBob Badour {
6583*80a68eefSBob Badour     //no optimal SIMD solution, serial is faster
6584*80a68eefSBob Badour     uint64x2_t pad;
6585*80a68eefSBob Badour     pad = vpaddlq_u32(b);
6586*80a68eefSBob Badour     return _mm_add_epi64(a, pad);
6587*80a68eefSBob Badour } //no optimal SIMD solution, serial is faster
6588*80a68eefSBob Badour 
6589*80a68eefSBob Badour //**********  Folding maximum   *************************************
6590*80a68eefSBob Badour //*******************************************************************
6591*80a68eefSBob Badour //VPMAX (Vector Pairwise Maximum) compares adjacent pairs of elements in two vectors,
6592*80a68eefSBob Badour //and copies the larger of each pair into the corresponding element in the destination
6593*80a68eefSBob Badour //    no corresponding functionality in IA32 SIMD, so we need to do the vertical comparison
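//Semantics sketch: for int16x4_t a = {a0,a1,a2,a3} and b = {b0,b1,b2,b3}
//    vpmax_s16(a, b) = {max(a0,a1), max(a2,a3), max(b0,b1), max(b2,b3)}
//the implementations below place both operands side by side, swap each pair, take a vertical max and then drop the duplicated lanes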
6594*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vpmax_s8(int8x8_t a, int8x8_t b); // VPMAX.S8 d0,d0,d0
6595*80a68eefSBob Badour _NEON2SSE_INLINE int8x8_t vpmax_s8(int8x8_t a, int8x8_t b) // VPMAX.S8 d0,d0,d0
6596*80a68eefSBob Badour {
6597*80a68eefSBob Badour     int8x8_t res64;
6598*80a68eefSBob Badour     __m128i ab, ab1, max;
6599*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 static const uint8_t mask8_sab[16] = { 1, 0, 3, 2, 5,  4,  7,  6,    9,    8,   11,   10,   13,   12,   15,   14};
6600*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 static const uint8_t mask8_odd[16] = { 1, 3, 5, 7, 9, 11, 13, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
6601*80a68eefSBob Badour     ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab
6602*80a68eefSBob Badour     ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask8_sab); //horizontal pairs swap for vertical max finding
6603*80a68eefSBob Badour     max = _MM_MAX_EPI8 (ab, ab1); // SSE4.1
6604*80a68eefSBob Badour     max = _mm_shuffle_epi8 (max, *(__m128i*) mask8_odd); //remove repetitive data
6605*80a68eefSBob Badour     return64(max); //we need 64 bits only
6606*80a68eefSBob Badour }
6607*80a68eefSBob Badour 
6608*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vpmax_s16(int16x4_t a, int16x4_t b); // VPMAX.S16 d0,d0,d0
6609*80a68eefSBob Badour _NEON2SSE_INLINE int16x4_t vpmax_s16(int16x4_t a, int16x4_t b) // VPMAX.S16 d0,d0,d0
6610*80a68eefSBob Badour {
6611*80a68eefSBob Badour     //the solution may not be optimal compared with the serial one
6612*80a68eefSBob Badour     int16x4_t res64;
6613*80a68eefSBob Badour     __m128i ab, ab1, max;
6614*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 static const int8_t mask16_sab[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; //each pair of chars is considered to be a 16-bit number
6615*80a68eefSBob Badour     ab = _mm_unpacklo_epi64 ( _pM128i(a),  _pM128i(b)); //ab
6616*80a68eefSBob Badour     ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask16_sab); //horizontal pairs swap for vertical max finding, use the 8-bit fn and the corresponding mask
6617*80a68eefSBob Badour     max = _mm_max_epi16 (ab, ab1);
6618*80a68eefSBob Badour     max =  _mm_shuffle_epi8 (max, *(__m128i*)  mask8_32_even_odd); //remove repetitive data, only the low part of mask is used
6619*80a68eefSBob Badour     return64(max);
6620*80a68eefSBob Badour }
6621*80a68eefSBob Badour 
6622*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vpmax_s32(int32x2_t a, int32x2_t b); // VPMAX.S32 d0,d0,d0
6623*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vpmax_s32(int32x2_t a, int32x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
6624*80a68eefSBob Badour {
6625*80a68eefSBob Badour     //serial solution looks faster than SIMD one
6626*80a68eefSBob Badour     int32x2_t res;
6627*80a68eefSBob Badour     res.m64_i32[0] = (a.m64_i32[0] < a.m64_i32[1]) ? a.m64_i32[1] : a.m64_i32[0];
6628*80a68eefSBob Badour     res.m64_i32[1] = (b.m64_i32[0] < b.m64_i32[1]) ? b.m64_i32[1] : b.m64_i32[0];
6629*80a68eefSBob Badour     return res;
6630*80a68eefSBob Badour }
6631*80a68eefSBob Badour 
6632*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vpmax_u8(uint8x8_t a, uint8x8_t b); // VPMAX.U8 d0,d0,d0
6633*80a68eefSBob Badour _NEON2SSE_INLINE uint8x8_t vpmax_u8(uint8x8_t a, uint8x8_t b) // VPMAX.U8 d0,d0,d0
6634*80a68eefSBob Badour {
6635*80a68eefSBob Badour     uint8x8_t res64;
6636*80a68eefSBob Badour     __m128i ab, ab1, max;
6637*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 static const int8_t mask8_sab[16] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
6638*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 static const uint8_t mask8_odd[16] = { 1, 3,  5,  7, 9, 11, 13, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
6639*80a68eefSBob Badour     ab = _mm_unpacklo_epi64 (_pM128i(a), _pM128i(b)); //ab
6640*80a68eefSBob Badour     ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask8_sab); //horizontal pairs swap for vertical max finding
6641*80a68eefSBob Badour     max = _mm_max_epu8 (ab, ab1); // SSE4.1
6642*80a68eefSBob Badour     max = _mm_shuffle_epi8 (max, *(__m128i*) mask8_odd); //remove repetitive data
6643*80a68eefSBob Badour     return64(max);
6644*80a68eefSBob Badour }
6645*80a68eefSBob Badour 
6646*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vpmax_u16(uint16x4_t a, uint16x4_t b); // VPMAX.U16 d0,d0,d0
6647*80a68eefSBob Badour _NEON2SSE_INLINE uint16x4_t vpmax_u16(uint16x4_t a, uint16x4_t b) // VPMAX.U16 d0,d0,d0
6648*80a68eefSBob Badour {
6649*80a68eefSBob Badour     //the solution may not be optimal compared with the serial one
6650*80a68eefSBob Badour     uint16x4_t res64;
6651*80a68eefSBob Badour     __m128i ab, ab1, max;
6652*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 static const uint8_t mask16_sab[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; //each pair of chars is considered to be a 16-bit number
6653*80a68eefSBob Badour     ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab
6654*80a68eefSBob Badour     ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask16_sab); //horizontal pairs swap for vertical max finding, use the 8-bit fn and the corresponding mask
6655*80a68eefSBob Badour     max = _MM_MAX_EPU16 (ab, ab1);
6656*80a68eefSBob Badour     max = _mm_shuffle_epi8 (max, *(__m128i*) mask8_32_even_odd); //remove repetitive data, only the low part of mask is used
6657*80a68eefSBob Badour     return64(max);
6658*80a68eefSBob Badour }
6659*80a68eefSBob Badour 
6660*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vpmax_u32(uint32x2_t a, uint32x2_t b); // VPMAX.U32 d0,d0,d0
6661*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vpmax_u32(uint32x2_t a, uint32x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
6662*80a68eefSBob Badour {
6663*80a68eefSBob Badour     //serial solution looks faster than SIMD one
6664*80a68eefSBob Badour     uint32x2_t res;
6665*80a68eefSBob Badour     res.m64_u32[0] = (a.m64_u32[0] < a.m64_u32[1]) ? a.m64_u32[1] : a.m64_u32[0];
6666*80a68eefSBob Badour     res.m64_u32[1] = (b.m64_u32[0] < b.m64_u32[1]) ? b.m64_u32[1] : b.m64_u32[0];
6667*80a68eefSBob Badour     return res;
6668*80a68eefSBob Badour }
6669*80a68eefSBob Badour 
6670*80a68eefSBob Badour _NEON2SSESTORAGE float32x2_t vpmax_f32(float32x2_t a, float32x2_t b); // VPMAX.F32 d0,d0,d0
6671*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(float32x2_t vpmax_f32(float32x2_t a, float32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
6672*80a68eefSBob Badour {
6673*80a68eefSBob Badour     //serial solution looks faster than  SIMD one
6674*80a68eefSBob Badour     float32x2_t res;
6675*80a68eefSBob Badour     res.m64_f32[0] = (a.m64_f32[0] < a.m64_f32[1]) ? a.m64_f32[1] : a.m64_f32[0];
6676*80a68eefSBob Badour     res.m64_f32[1] = (b.m64_f32[0] < b.m64_f32[1]) ? b.m64_f32[1] : b.m64_f32[0];
6677*80a68eefSBob Badour     return res;
6678*80a68eefSBob Badour }
6679*80a68eefSBob Badour 
6680*80a68eefSBob Badour // ***************** Folding minimum  ****************************
6681*80a68eefSBob Badour // **************************************************************
6682*80a68eefSBob Badour //vpmin -> takes minimum of adjacent pairs
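//Semantics sketch: vpmin_u8(a, b) = {min(a0,a1), ..., min(a6,a7), min(b0,b1), ..., min(b6,b7)}
//e.g. vpmin_u8({9,3, 5,5, 7,2, 8,8}, {1,4, 6,0, 2,2, 9,1}) = {3, 5, 2, 8, 1, 0, 2, 1}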
6683*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vpmin_s8(int8x8_t a, int8x8_t b); // VPMIN.S8 d0,d0,d0
6684*80a68eefSBob Badour _NEON2SSE_INLINE int8x8_t vpmin_s8(int8x8_t a, int8x8_t b) // VPMIN.S8 d0,d0,d0
6685*80a68eefSBob Badour {
6686*80a68eefSBob Badour     int8x8_t res64;
6687*80a68eefSBob Badour     __m128i ab, ab1, min;
6688*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 static const uint8_t mask8_sab[16] = { 1, 0, 3, 2, 5,  4,  7,  6,    9,    8,   11,   10,   13,   12,   15,   14};
6689*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 static const uint8_t mask8_odd[16] = { 1, 3, 5, 7, 9, 11, 13, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
6690*80a68eefSBob Badour     ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab
6691*80a68eefSBob Badour     ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask8_sab); //horizontal pairs swap for vertical min finding
6692*80a68eefSBob Badour     min =  _MM_MIN_EPI8 (ab, ab1); // SSE4.1
6693*80a68eefSBob Badour     min =  _mm_shuffle_epi8 (min, *(__m128i*) mask8_odd); //remove repetitive data
6694*80a68eefSBob Badour     return64(min);
6695*80a68eefSBob Badour }
6696*80a68eefSBob Badour 
6697*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vpmin_s16(int16x4_t a, int16x4_t b); // VPMIN.S16 d0,d0,d0
6698*80a68eefSBob Badour _NEON2SSE_INLINE int16x4_t vpmin_s16(int16x4_t a, int16x4_t b) // VPMIN.S16 d0,d0,d0
6699*80a68eefSBob Badour {
6700*80a68eefSBob Badour     //the solution may not be optimal compared with the serial one
6701*80a68eefSBob Badour     int16x4_t res64;
6702*80a68eefSBob Badour     __m128i ab, ab1, min;
6703*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 static const int8_t mask16_sab[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; //each pair of chars is considered to be a 16-bit number
6704*80a68eefSBob Badour     ab = _mm_unpacklo_epi64 (  _pM128i(a),  _pM128i(b)); //ab
6705*80a68eefSBob Badour     ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask16_sab); //horizontal pairs swap for vertical min finding, use the 8-bit fn and the corresponding mask
6706*80a68eefSBob Badour     min = _mm_min_epi16 (ab, ab1);
6707*80a68eefSBob Badour     min = _mm_shuffle_epi8 (min, *(__m128i*) mask8_32_even_odd); //remove repetitive data, only the low part of mask is used
6708*80a68eefSBob Badour     return64(min);
6709*80a68eefSBob Badour }
6710*80a68eefSBob Badour 
6711*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vpmin_s32(int32x2_t a, int32x2_t b); // VPMIN.S32 d0,d0,d0
6712*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vpmin_s32(int32x2_t a, int32x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
6713*80a68eefSBob Badour {
6714*80a68eefSBob Badour     //serial solution looks faster than SIMD one
6715*80a68eefSBob Badour     int32x2_t res;
6716*80a68eefSBob Badour     res.m64_i32[0] = (a.m64_i32[0] > a.m64_i32[1]) ? a.m64_i32[1] : a.m64_i32[0];
6717*80a68eefSBob Badour     res.m64_i32[1] = (b.m64_i32[0] > b.m64_i32[1]) ? b.m64_i32[1] : b.m64_i32[0];
6718*80a68eefSBob Badour     return res;
6719*80a68eefSBob Badour }
6720*80a68eefSBob Badour 
6721*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vpmin_u8(uint8x8_t a, uint8x8_t b); // VPMIN.U8 d0,d0,d0
6722*80a68eefSBob Badour _NEON2SSE_INLINE uint8x8_t vpmin_u8(uint8x8_t a, uint8x8_t b) // VPMIN.U8 d0,d0,d0
6723*80a68eefSBob Badour {
6724*80a68eefSBob Badour     uint8x8_t res64;
6725*80a68eefSBob Badour     __m128i ab, ab1, min;
6726*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 static const uint8_t mask8_sab[16] = { 1, 0, 3, 2, 5,  4,  7,  6,    9,    8,   11,   10,   13,   12,   15,   14};
6727*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 static const uint8_t mask8_odd[16] = { 1, 3, 5, 7, 9, 11, 13, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
6728*80a68eefSBob Badour     ab = _mm_unpacklo_epi64 (  _pM128i(a),  _pM128i(b)); //ab
6729*80a68eefSBob Badour     ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask8_sab); //horizontal pairs swap for vertical min finding
6730*80a68eefSBob Badour     min = _mm_min_epu8 (ab, ab1); // SSE4.1
6731*80a68eefSBob Badour     min = _mm_shuffle_epi8 (min, *(__m128i*) mask8_odd); //remove repetitive data
6732*80a68eefSBob Badour     return64(min);
6733*80a68eefSBob Badour }
6734*80a68eefSBob Badour 
6735*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vpmin_u16(uint16x4_t a, uint16x4_t b); // VPMIN.U16 d0,d0,d0
6736*80a68eefSBob Badour _NEON2SSE_INLINE uint16x4_t vpmin_u16(uint16x4_t a, uint16x4_t b) // VPMIN.U16 d0,d0,d0
6737*80a68eefSBob Badour {
6738*80a68eefSBob Badour     //the solution may not be optimal compared with the serial one
6739*80a68eefSBob Badour     uint16x4_t res64;
6740*80a68eefSBob Badour     __m128i ab, ab1, min;
6741*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 static const uint8_t mask16_sab[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; //each pair of chars is considered to be a 16-bit number
6742*80a68eefSBob Badour     ab = _mm_unpacklo_epi64 ( _pM128i(a),  _pM128i(b)); //ab
6743*80a68eefSBob Badour     ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask16_sab); //horizontal pairs swap for vertical min finding, use the 8-bit fn and the corresponding mask
6744*80a68eefSBob Badour     min = _MM_MIN_EPU16 (ab, ab1);
6745*80a68eefSBob Badour     min =    _mm_shuffle_epi8 (min, *(__m128i*) mask8_32_even_odd); //remove repetitive data, only the low part of mask is used
6746*80a68eefSBob Badour     return64(min);
6747*80a68eefSBob Badour }
6748*80a68eefSBob Badour 
6749*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vpmin_u32(uint32x2_t a, uint32x2_t b); // VPMIN.U32 d0,d0,d0
6750*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vpmin_u32(uint32x2_t a, uint32x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
6751*80a68eefSBob Badour {
6752*80a68eefSBob Badour     //serial solution looks faster than SIMD one
6753*80a68eefSBob Badour     uint32x2_t res;
6754*80a68eefSBob Badour     res.m64_u32[0] = (a.m64_u32[0] > a.m64_u32[1]) ? a.m64_u32[1] : a.m64_u32[0];
6755*80a68eefSBob Badour     res.m64_u32[1] = (b.m64_u32[0] > b.m64_u32[1]) ? b.m64_u32[1] : b.m64_u32[0];
6756*80a68eefSBob Badour     return res;
6757*80a68eefSBob Badour }
6758*80a68eefSBob Badour 
6759*80a68eefSBob Badour _NEON2SSESTORAGE float32x2_t vpmin_f32(float32x2_t a, float32x2_t b); // VPMIN.F32 d0,d0,d0
6760*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(float32x2_t vpmin_f32(float32x2_t a, float32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
6761*80a68eefSBob Badour {
6762*80a68eefSBob Badour     //serial solution looks faster than SIMD one
6763*80a68eefSBob Badour     float32x2_t res;
6764*80a68eefSBob Badour     res.m64_f32[0] = (a.m64_f32[0] > a.m64_f32[1]) ? a.m64_f32[1] : a.m64_f32[0];
6765*80a68eefSBob Badour     res.m64_f32[1] = (b.m64_f32[0] > b.m64_f32[1]) ? b.m64_f32[1] : b.m64_f32[0];
6766*80a68eefSBob Badour     return res;
6767*80a68eefSBob Badour }
6768*80a68eefSBob Badour 
6769*80a68eefSBob Badour //***************************************************************
6770*80a68eefSBob Badour //***********  Reciprocal/Sqrt ************************************
6771*80a68eefSBob Badour //***************************************************************
6772*80a68eefSBob Badour //****************** Reciprocal estimate *******************************
6773*80a68eefSBob Badour //the ARM NEON and x86 SIMD results may be slightly different
6774*80a68eefSBob Badour _NEON2SSESTORAGE float32x2_t vrecpe_f32(float32x2_t a); // VRECPE.F32 d0,d0
6775*80a68eefSBob Badour _NEON2SSE_INLINE float32x2_t vrecpe_f32(float32x2_t a) //use low 64 bits
6776*80a68eefSBob Badour {
6777*80a68eefSBob Badour     float32x4_t res;
6778*80a68eefSBob Badour     __m64_128 res64;
6779*80a68eefSBob Badour     res = _mm_rcp_ps(_pM128(a));
6780*80a68eefSBob Badour     _M64f(res64, res);
6781*80a68eefSBob Badour     return res64;
6782*80a68eefSBob Badour }
6783*80a68eefSBob Badour 
6784*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vrecpe_u32(uint32x2_t a); // VRECPE.U32 d0,d0
6785*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vrecpe_u32(uint32x2_t a), _NEON2SSE_REASON_SLOW_SERIAL)
6786*80a68eefSBob Badour {
6787*80a68eefSBob Badour     //Input is  fixed point number!!! No reciprocal for ints in IA32 available
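    //the operand is treated as an unsigned Q0.32 fraction: inputs below 0.5 (top bit clear) are outside the estimate range and return 0xffffffff, as in the ARM recip_estimate description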
6788*80a68eefSBob Badour     uint32x2_t res;
6789*80a68eefSBob Badour     float resf, r;
6790*80a68eefSBob Badour     int i, q, s;
6791*80a68eefSBob Badour     for (i =0; i<2; i++){
6792*80a68eefSBob Badour         if((a.m64_u32[i] & 0x80000000) == 0) {
6793*80a68eefSBob Badour             res.m64_u32[i] = 0xffffffff;
6794*80a68eefSBob Badour         }else{
6795*80a68eefSBob Badour             resf =  (float) (a.m64_u32[i] * (0.5f / (uint32_t)(1 << 31)));
6796*80a68eefSBob Badour             q = (int)(resf * 512.0); /* a in units of 1/512 rounded down */
6797*80a68eefSBob Badour             r = (float)(1.0 / (((float)q + 0.5) / 512.0)); /* reciprocal r */
6798*80a68eefSBob Badour             s = (int)(256.0 * r + 0.5); /* r in units of 1/256 rounded to nearest */
6799*80a68eefSBob Badour             r =  (float)s / 256.0;
6800*80a68eefSBob Badour             res.m64_u32[i] = r * (uint32_t)(1 << 31);
6801*80a68eefSBob Badour         }
6802*80a68eefSBob Badour     }
6803*80a68eefSBob Badour     return res;
6804*80a68eefSBob Badour }
6805*80a68eefSBob Badour 
6806*80a68eefSBob Badour _NEON2SSESTORAGE float32x4_t vrecpeq_f32(float32x4_t a); // VRECPE.F32 q0,q0
6807*80a68eefSBob Badour #define vrecpeq_f32 _mm_rcp_ps
6808*80a68eefSBob Badour 
6809*80a68eefSBob Badour 
6810*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vrecpeq_u32(uint32x4_t a); // VRECPE.U32 q0,q0
6811*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vrecpeq_u32(uint32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL)
6812*80a68eefSBob Badour {
6813*80a68eefSBob Badour     //Input is  fixed point number!!!
6814*80a68eefSBob Badour     //We implement the recip_estimate function as described in ARMv7 reference manual (VRECPE instruction) but use float instead of double
6815*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 uint32_t atmp[4];
6816*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 uint32_t res[4];
6817*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 static const uint32_t c80000000[4] = {0x80000000,0x80000000, 0x80000000,0x80000000};
6818*80a68eefSBob Badour     float resf, r;
6819*80a68eefSBob Badour     int i, q, s;
6820*80a68eefSBob Badour     __m128i res128, mask, zero;
6821*80a68eefSBob Badour     _mm_store_si128((__m128i*)atmp, a);
6822*80a68eefSBob Badour     zero = _mm_setzero_si128();
6823*80a68eefSBob Badour     for (i =0; i<4; i++){
6824*80a68eefSBob Badour         resf = (atmp[i] * (0.5f / (uint32_t) (1 << 31)));  //  2.3283064365386963E-10 ~(0.5f / (uint32_t) (1 << 31))
6825*80a68eefSBob Badour         q = (int)(resf * 512.0); /* a in units of 1/512 rounded down */
6826*80a68eefSBob Badour         r = 1.0 / (((float)q + 0.5) / 512.0); /* reciprocal r */
6827*80a68eefSBob Badour         s = (int)(256.0 * r + 0.5); /* r in units of 1/256 rounded to nearest */
6828*80a68eefSBob Badour         r =  (float)s / 256.0;
6829*80a68eefSBob Badour         res[i] = (uint32_t) (r * (((uint32_t)1) << 31) );
6830*80a68eefSBob Badour     }
6831*80a68eefSBob Badour     res128 = _mm_load_si128((__m128i*)res);
6832*80a68eefSBob Badour     mask = _mm_and_si128(a, *(__m128i*)c80000000);
6833*80a68eefSBob Badour     mask = _mm_cmpeq_epi32(zero, mask);  //0xffffffff if atmp[i] <= 0x7fffffff
6834*80a68eefSBob Badour     return _mm_or_si128(res128, mask);
6835*80a68eefSBob Badour }
6836*80a68eefSBob Badour 
6837*80a68eefSBob Badour //**********Reciprocal square root estimate ****************
6838*80a68eefSBob Badour //**********************************************************
6839*80a68eefSBob Badour //no reciprocal square root for ints in IA32, nor an unsigned int to float4 lanes conversion, so a serial solution looks faster
6840*80a68eefSBob Badour //but the particular implementation of vrsqrte_u32 may vary for various ARM compilers
6841*80a68eefSBob Badour //the ARM NEON and x86 SIMD results may be slightly different
6842*80a68eefSBob Badour _NEON2SSESTORAGE float32x2_t vrsqrte_f32(float32x2_t a); // VRSQRTE.F32 d0,d0
6843*80a68eefSBob Badour _NEON2SSE_INLINE float32x2_t vrsqrte_f32(float32x2_t a) //use low 64 bits
6844*80a68eefSBob Badour {
6845*80a68eefSBob Badour     float32x4_t res;
6846*80a68eefSBob Badour     __m64_128 res64;
6847*80a68eefSBob Badour     res = _mm_rsqrt_ps(_pM128(a));
6848*80a68eefSBob Badour     _M64f(res64, res);
6849*80a68eefSBob Badour     return res64;
6850*80a68eefSBob Badour }
6851*80a68eefSBob Badour 
6852*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vrsqrte_u32(uint32x2_t a); // VRSQRTE.U32 d0,d0
6853*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vrsqrte_u32(uint32x2_t a), _NEON2SSE_REASON_SLOW_SERIAL)
6854*80a68eefSBob Badour {
6855*80a68eefSBob Badour     //Input is  fixed point number!!!
6856*80a68eefSBob Badour     //We implement the recip_sqrt_estimate function as described in ARMv7 reference manual (VRSQRTE instruction) but use float instead of double
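    //the operand is again an unsigned Q0.32 fraction: inputs below 0.25 (two top bits clear) are outside the estimate range and return 0xffffffff; in-range lanes go through the scalar _mm_rsqrt_ss based estimate below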
6857*80a68eefSBob Badour    uint32x2_t res;
6858*80a68eefSBob Badour    __m128 tmp;
6859*80a68eefSBob Badour     float r, resf, coeff;
6860*80a68eefSBob Badour     int i,q0, s;
6861*80a68eefSBob Badour     for (i =0; i<2; i++){
6862*80a68eefSBob Badour         if((a.m64_u32[i] & 0xc0000000) == 0) { //a <=0x3fffffff
6863*80a68eefSBob Badour             res.m64_u32[i] = 0xffffffff;
6864*80a68eefSBob Badour         }else{
6865*80a68eefSBob Badour             resf =  (float) (a.m64_u32[i] * (0.5f / (uint32_t)(1 << 31)));
6866*80a68eefSBob Badour             coeff = (resf < 0.5)? 512.0 : 256.0 ; /* range 0.25 <= resf < 0.5  or range 0.5 <= resf < 1.0*/
6867*80a68eefSBob Badour             q0 = (int)(resf * coeff); /* a in units of 1/512 rounded down */
6868*80a68eefSBob Badour             r = ((float)q0 + 0.5) / coeff;
6869*80a68eefSBob Badour             tmp = _mm_rsqrt_ss(_mm_load_ss( &r));/* reciprocal root r */
6870*80a68eefSBob Badour             _mm_store_ss(&r, tmp);
6871*80a68eefSBob Badour             s = (int)(256.0 * r + 0.5); /* r in units of 1/256 rounded to nearest */
6872*80a68eefSBob Badour             r = (float)(s / 256.0);
6873*80a68eefSBob Badour             res.m64_u32[i] = r * (((uint32_t)1) << 31);
6874*80a68eefSBob Badour         }
6875*80a68eefSBob Badour     }
6876*80a68eefSBob Badour     return res;
6877*80a68eefSBob Badour }
6878*80a68eefSBob Badour 
6879*80a68eefSBob Badour _NEON2SSESTORAGE float32x4_t vrsqrteq_f32(float32x4_t a); // VRSQRTE.F32 q0,q0
6880*80a68eefSBob Badour #define vrsqrteq_f32 _mm_rsqrt_ps
6881*80a68eefSBob Badour 
6882*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vrsqrteq_u32(uint32x4_t a); // VRSQRTE.U32 q0,q0
6883*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vrsqrteq_u32(uint32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL)
6884*80a68eefSBob Badour {
6885*80a68eefSBob Badour     //Input is  fixed point number!!!
6886*80a68eefSBob Badour     //We implement the recip_sqrt_estimate function as described in ARMv7 reference manual (VRSQRTE instruction) but use float instead of double
6887*80a68eefSBob Badour    _NEON2SSE_ALIGN_16 uint32_t  atmp[4], res[4];
6888*80a68eefSBob Badour    _NEON2SSE_ALIGN_16 static const uint32_t c_c0000000[4] = {0xc0000000,0xc0000000, 0xc0000000,0xc0000000};
6889*80a68eefSBob Badour    __m128 tmp;
6890*80a68eefSBob Badour    __m128i res128, mask, zero;
6891*80a68eefSBob Badour     float r, resf, coeff;
6892*80a68eefSBob Badour     int i,q0, s;
6893*80a68eefSBob Badour     _mm_store_si128((__m128i*)atmp, a);
6894*80a68eefSBob Badour     zero = _mm_setzero_si128();
6895*80a68eefSBob Badour     for (i =0; i<4; i++){
6896*80a68eefSBob Badour         resf =  (float) (atmp[i] * (0.5f / (uint32_t)(1 << 31)));
6897*80a68eefSBob Badour         coeff = (float)((resf < 0.5)? 512.0 : 256.0); /* range 0.25 <= resf < 0.5  or range 0.5 <= resf < 1.0*/
6898*80a68eefSBob Badour         q0 = (int)(resf * coeff); /* a in units of 1/512 rounded down */
6899*80a68eefSBob Badour         r = ((float)q0 + 0.5) / coeff;
6900*80a68eefSBob Badour         tmp = _mm_rsqrt_ss(_mm_load_ss( &r));/* reciprocal root r */
6901*80a68eefSBob Badour         _mm_store_ss(&r, tmp);
6902*80a68eefSBob Badour         s = (int)(256.0 * r + 0.5); /* r in units of 1/256 rounded to nearest */
6903*80a68eefSBob Badour         r = (float)s / 256.0;
6904*80a68eefSBob Badour         res[i] = (uint32_t) (r * (((uint32_t)1) << 31) );
6905*80a68eefSBob Badour     }
6906*80a68eefSBob Badour     res128 = _mm_load_si128((__m128i*)res);
6907*80a68eefSBob Badour     mask = _mm_and_si128(a, *(__m128i*)c_c0000000);
6908*80a68eefSBob Badour     mask = _mm_cmpeq_epi32(zero, mask);  //0xffffffff if atmp[i] <= 0x3fffffff
6909*80a68eefSBob Badour     return _mm_or_si128(res128, mask);
6910*80a68eefSBob Badour }
6911*80a68eefSBob Badour //************ Reciprocal estimate/step and 1/sqrt estimate/step ***************************
6912*80a68eefSBob Badour //******************************************************************************************
6913*80a68eefSBob Badour //******VRECPS (Vector Reciprocal Step) ***************************************************
6914*80a68eefSBob Badour //multiplies the elements of one vector by the corresponding elements of another vector,
6915*80a68eefSBob Badour //subtracts each of the results from 2, and places the final results into the elements of the destination vector.
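//Typical usage is a Newton-Raphson refinement of the reciprocal estimate; a minimal sketch, assuming the vmulq_f32 mapping defined elsewhere in this header:
//    float32x4_t x = vrecpeq_f32(a);           //rough initial estimate of 1/a
//    x = vmulq_f32(vrecpsq_f32(a, x), x);      //x = x*(2 - a*x), one refinement step
//    x = vmulq_f32(vrecpsq_f32(a, x), x);      //optional second step for extra precision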
6916*80a68eefSBob Badour 
6917*80a68eefSBob Badour _NEON2SSESTORAGE float32x2_t vrecps_f32(float32x2_t a, float32x2_t b); // VRECPS.F32 d0, d0, d0
6918*80a68eefSBob Badour _NEON2SSE_INLINE float32x2_t vrecps_f32(float32x2_t a, float32x2_t b)
6919*80a68eefSBob Badour {
6920*80a68eefSBob Badour     float32x4_t res;
6921*80a68eefSBob Badour     __m64_128 res64;
6922*80a68eefSBob Badour     res = vrecpsq_f32(_pM128(a), _pM128(b));
6923*80a68eefSBob Badour     _M64f(res64, res);
6924*80a68eefSBob Badour     return res64;
6925*80a68eefSBob Badour }
6926*80a68eefSBob Badour 
6927*80a68eefSBob Badour _NEON2SSESTORAGE float32x4_t vrecpsq_f32(float32x4_t a, float32x4_t b); // VRECPS.F32 q0, q0, q0
6928*80a68eefSBob Badour _NEON2SSE_INLINE float32x4_t vrecpsq_f32(float32x4_t a, float32x4_t b) // VRECPS.F32 q0, q0, q0
6929*80a68eefSBob Badour {
6930*80a68eefSBob Badour     __m128 f2, mul;
6931*80a68eefSBob Badour     f2 =  _mm_set1_ps(2.);
6932*80a68eefSBob Badour     mul = _mm_mul_ps(a,b);
6933*80a68eefSBob Badour     return _mm_sub_ps(f2,mul);
6934*80a68eefSBob Badour }
6935*80a68eefSBob Badour 
6936*80a68eefSBob Badour //*****************VRSQRTS (Vector Reciprocal Square Root Step) *****************************
6937*80a68eefSBob Badour //multiplies the elements of one vector by the corresponding elements of another vector,
6938*80a68eefSBob Badour //subtracts each of the results from 3, divides these results by two, and places the final results into the elements of the destination vector.
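//Typical usage is a Newton-Raphson refinement of the 1/sqrt estimate; a minimal sketch, assuming the vmulq_f32 mapping defined elsewhere in this header:
//    float32x4_t x = vrsqrteq_f32(a);                        //rough initial estimate of 1/sqrt(a)
//    x = vmulq_f32(x, vrsqrtsq_f32(vmulq_f32(a, x), x));     //x = x*(3 - (a*x)*x)/2, one refinement step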
6939*80a68eefSBob Badour 
6940*80a68eefSBob Badour _NEON2SSESTORAGE float32x2_t vrsqrts_f32(float32x2_t a, float32x2_t b); // VRSQRTS.F32 d0, d0, d0
6941*80a68eefSBob Badour _NEON2SSE_INLINE float32x2_t vrsqrts_f32(float32x2_t a, float32x2_t b)
6942*80a68eefSBob Badour {
6943*80a68eefSBob Badour     float32x2_t res;
6944*80a68eefSBob Badour     res.m64_f32[0] = (3 - a.m64_f32[0] * b.m64_f32[0]) / 2;
6945*80a68eefSBob Badour     res.m64_f32[1] = (3 - a.m64_f32[1] * b.m64_f32[1]) / 2;
6946*80a68eefSBob Badour     return res;
6947*80a68eefSBob Badour }
6948*80a68eefSBob Badour 
6949*80a68eefSBob Badour _NEON2SSESTORAGE float32x4_t vrsqrtsq_f32(float32x4_t a, float32x4_t b); // VRSQRTS.F32 q0, q0, q0
6950*80a68eefSBob Badour _NEON2SSE_INLINE float32x4_t vrsqrtsq_f32(float32x4_t a, float32x4_t b) // VRSQRTS.F32 q0, q0, q0
6951*80a68eefSBob Badour {
6952*80a68eefSBob Badour     __m128 f3, f05, mul;
6953*80a68eefSBob Badour     f3 =  _mm_set1_ps(3.);
6954*80a68eefSBob Badour     f05 =  _mm_set1_ps(0.5);
6955*80a68eefSBob Badour     mul = _mm_mul_ps(a,b);
6956*80a68eefSBob Badour     f3 = _mm_sub_ps(f3,mul);
6957*80a68eefSBob Badour     return _mm_mul_ps (f3, f05);
6958*80a68eefSBob Badour }
6959*80a68eefSBob Badour //********************************************************************************************
6960*80a68eefSBob Badour //***************************** Shifts by signed variable ***********************************
6961*80a68eefSBob Badour //********************************************************************************************
6962*80a68eefSBob Badour //***** Vector shift left: Vr[i] := Va[i] << Vb[i] (negative values shift right) ***********************
6963*80a68eefSBob Badour //********************************************************************************************
6964*80a68eefSBob Badour //No such operations in IA32 SIMD unfortunately, constant shift only available, so need to do the serial solution
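//Semantics sketch: positive lanes of b shift left, negative lanes shift right, e.g. for int16x4_t
//    vshl_s16({8, -8, 3, 1}, {1, -2, 0, 12}) = {16, -2, 3, 4096}
//and shift amounts >= 16 or <= -16 give 0, which is what the helper macros below implement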
6965*80a68eefSBob Badour //helper macro. It matches ARM implementation for big shifts
6966*80a68eefSBob Badour #define SERIAL_SHIFT(TYPE, INTERNAL_TYPE, LENMAX, LEN) \
6967*80a68eefSBob Badour         _NEON2SSE_ALIGN_16 TYPE atmp[LENMAX], res[LENMAX]; _NEON2SSE_ALIGN_16 INTERNAL_TYPE btmp[LENMAX]; int i, lanesize = sizeof(INTERNAL_TYPE) << 3; \
6968*80a68eefSBob Badour         _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
6969*80a68eefSBob Badour         for (i = 0; i<LEN; i++) { \
6970*80a68eefSBob Badour         if( (btmp[i] >= lanesize)||(btmp[i] <= -lanesize) ) res[i] = 0; \
6971*80a68eefSBob Badour         else res[i] = (btmp[i] >=0) ? atmp[i] << btmp[i] : atmp[i] >> (-btmp[i]); } \
6972*80a68eefSBob Badour         return _mm_load_si128((__m128i*)res);
6973*80a68eefSBob Badour 
6974*80a68eefSBob Badour #define SERIAL_SHIFT_64(TYPE, SIGN, LEN) \
6975*80a68eefSBob Badour         int ## TYPE ## x ## LEN ## _t res;  int i, lanesize = sizeof(int ## TYPE ## _t) << 3; \
6976*80a68eefSBob Badour         for (i = 0; i<LEN; i++) { \
6977*80a68eefSBob Badour         if( (b.m64_i ## TYPE[i] >= lanesize)||(b.m64_i ## TYPE[i] <= -lanesize) ) res.m64_ ## SIGN ## TYPE[i] = 0; \
6978*80a68eefSBob Badour         else res.m64_ ## SIGN ## TYPE[i] = (b.m64_i ## TYPE[i] >=0) ? a.m64_ ## SIGN ## TYPE[i] << b.m64_i ## TYPE[i] : a.m64_ ## SIGN ## TYPE[i] >> (-b.m64_i ## TYPE[i]); } \
6979*80a68eefSBob Badour         return res;
6980*80a68eefSBob Badour 
6981*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vshl_s8(int8x8_t a, int8x8_t b); // VSHL.S8 d0,d0,d0
6982*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vshl_s8(int8x8_t a, int8x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
6983*80a68eefSBob Badour {
6984*80a68eefSBob Badour     SERIAL_SHIFT_64(8, i, 8)
6985*80a68eefSBob Badour }
6986*80a68eefSBob Badour 
6987*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vshl_s16(int16x4_t a, int16x4_t b); // VSHL.S16 d0,d0,d0
6988*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vshl_s16(int16x4_t a, int16x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
6989*80a68eefSBob Badour {
6990*80a68eefSBob Badour     SERIAL_SHIFT_64(16, i, 4)
6991*80a68eefSBob Badour }
6992*80a68eefSBob Badour 
6993*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vshl_s32(int32x2_t a, int32x2_t b); // VSHL.S32 d0,d0,d0
6994*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vshl_s32(int32x2_t a, int32x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
6995*80a68eefSBob Badour {
6996*80a68eefSBob Badour     SERIAL_SHIFT_64(32, i, 2)
6997*80a68eefSBob Badour }
6998*80a68eefSBob Badour 
6999*80a68eefSBob Badour _NEON2SSESTORAGE int64x1_t vshl_s64(int64x1_t a, int64x1_t b); // VSHL.S64 d0,d0,d0
7000*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vshl_s64(int64x1_t a, int64x1_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7001*80a68eefSBob Badour {
7002*80a68eefSBob Badour     SERIAL_SHIFT_64(64, i, 1)
7003*80a68eefSBob Badour }
7004*80a68eefSBob Badour 
7005*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vshl_u8(uint8x8_t a, int8x8_t b); // VSHL.U8 d0,d0,d0
7006*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vshl_u8(uint8x8_t a, int8x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7007*80a68eefSBob Badour {
7008*80a68eefSBob Badour     SERIAL_SHIFT_64(8, u, 8)
7009*80a68eefSBob Badour }
7010*80a68eefSBob Badour 
7011*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vshl_u16(uint16x4_t a, int16x4_t b); // VSHL.U16 d0,d0,d0
7012*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vshl_u16(uint16x4_t a, int16x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7013*80a68eefSBob Badour {
7014*80a68eefSBob Badour     SERIAL_SHIFT_64(16, u, 4)
7015*80a68eefSBob Badour }
7016*80a68eefSBob Badour 
7017*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vshl_u32(uint32x2_t a, int32x2_t b); // VSHL.U32 d0,d0,d0
7018*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vshl_u32(uint32x2_t a, int32x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7019*80a68eefSBob Badour {
7020*80a68eefSBob Badour     SERIAL_SHIFT_64(32, u, 2)
7021*80a68eefSBob Badour }
7022*80a68eefSBob Badour 
7023*80a68eefSBob Badour _NEON2SSESTORAGE uint64x1_t vshl_u64(uint64x1_t a, int64x1_t b); // VSHL.U64 d0,d0,d0
7024*80a68eefSBob Badour _NEON2SSE_INLINE uint64x1_t vshl_u64(uint64x1_t a, int64x1_t b) //if the SERIAL_SHIFT macro were used here, large shift values would need special processing
7025*80a68eefSBob Badour {
7026*80a68eefSBob Badour     SERIAL_SHIFT_64(64, u, 1)
7027*80a68eefSBob Badour }
7028*80a68eefSBob Badour 
7029*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t vshlq_s8(int8x16_t a, int8x16_t b); // VSHL.S8 q0,q0,q0
7030*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x16_t vshlq_s8(int8x16_t a, int8x16_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7031*80a68eefSBob Badour {
7032*80a68eefSBob Badour     SERIAL_SHIFT(int8_t, int8_t, 16, 16)
7033*80a68eefSBob Badour }
7034*80a68eefSBob Badour 
7035*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vshlq_s16(int16x8_t a, int16x8_t b); // VSHL.S16 q0,q0,q0
7036*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x8_t vshlq_s16(int16x8_t a, int16x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7037*80a68eefSBob Badour {
7038*80a68eefSBob Badour     SERIAL_SHIFT(int16_t, int16_t, 8, 8)
7039*80a68eefSBob Badour }
7040*80a68eefSBob Badour 
7041*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vshlq_s32(int32x4_t a, int32x4_t b); // VSHL.S32 q0,q0,q0
7042*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vshlq_s32(int32x4_t a, int32x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7043*80a68eefSBob Badour {
7044*80a68eefSBob Badour     SERIAL_SHIFT(int32_t, int32_t, 4, 4)
7045*80a68eefSBob Badour }
7046*80a68eefSBob Badour 
7047*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vshlq_s64(int64x2_t a, int64x2_t b); // VSHL.S64 q0,q0,q0
7048*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vshlq_s64(int64x2_t a, int64x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7049*80a68eefSBob Badour {
7050*80a68eefSBob Badour     SERIAL_SHIFT(int64_t, int64_t, 2, 2)
7051*80a68eefSBob Badour }
7052*80a68eefSBob Badour 
7053*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vshlq_u8(uint8x16_t a, int8x16_t b); // VSHL.U8 q0,q0,q0
7054*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x16_t vshlq_u8(uint8x16_t a, int8x16_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7055*80a68eefSBob Badour {
7056*80a68eefSBob Badour     SERIAL_SHIFT(uint8_t, int8_t, 16, 16)
7057*80a68eefSBob Badour }
7058*80a68eefSBob Badour 
7059*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vshlq_u16(uint16x8_t a, int16x8_t b); // VSHL.U16 q0,q0,q0
7060*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x8_t vshlq_u16(uint16x8_t a, int16x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7061*80a68eefSBob Badour {
7062*80a68eefSBob Badour     SERIAL_SHIFT(uint16_t, int16_t, 8, 8)
7063*80a68eefSBob Badour }
7064*80a68eefSBob Badour 
7065*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vshlq_u32(uint32x4_t a, int32x4_t b); // VSHL.U32 q0,q0,q0
7066*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vshlq_u32(uint32x4_t a, int32x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7067*80a68eefSBob Badour {
7068*80a68eefSBob Badour     SERIAL_SHIFT(uint32_t, int32_t, 4, 4)
7069*80a68eefSBob Badour }
7070*80a68eefSBob Badour 
7071*80a68eefSBob Badour _NEON2SSESTORAGE uint64x2_t vshlq_u64(uint64x2_t a, int64x2_t b); // VSHL.U64 q0,q0,q0
7072*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING( uint64x2_t vshlq_u64(uint64x2_t a, int64x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7073*80a68eefSBob Badour {
7074*80a68eefSBob Badour     SERIAL_SHIFT(uint64_t, int64_t, 2, 2)
7075*80a68eefSBob Badour }
7076*80a68eefSBob Badour 
7077*80a68eefSBob Badour 
7078*80a68eefSBob Badour //*********** Vector saturating shift left: (negative values shift right) **********************
7079*80a68eefSBob Badour //********************************************************************************************
7080*80a68eefSBob Badour //No such operations are available in IA32 SIMD yet; only shifts by a constant exist, so a serial solution is needed
7081*80a68eefSBob Badour #define SERIAL_SATURATING_SHIFT_SIGNED(TYPE, LENMAX, LEN) \
7082*80a68eefSBob Badour         _NEON2SSE_ALIGN_16 TYPE atmp[LENMAX], res[LENMAX], btmp[LENMAX]; TYPE limit; int i; \
7083*80a68eefSBob Badour         int lanesize_1 = (sizeof(TYPE) << 3) - 1; \
7084*80a68eefSBob Badour         _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
7085*80a68eefSBob Badour         for (i = 0; i<LEN; i++) { \
7086*80a68eefSBob Badour         if (atmp[i] ==0) res[i] = 0; \
7087*80a68eefSBob Badour         else{ \
7088*80a68eefSBob Badour             if(btmp[i] <0) res[i] = atmp[i] >> (-btmp[i]); \
7089*80a68eefSBob Badour             else{ \
7090*80a68eefSBob Badour                 if (btmp[i]>lanesize_1) { \
7091*80a68eefSBob Badour                     res[i] = ((_UNSIGNED_T(TYPE))atmp[i] >> lanesize_1 ) + ((TYPE)1 << lanesize_1) - 1; \
7092*80a68eefSBob Badour                 }else{ \
7093*80a68eefSBob Badour                     limit = (TYPE)1 << (lanesize_1 - btmp[i]); \
7094*80a68eefSBob Badour                     if((atmp[i] >= limit)||(atmp[i] <= -limit)) \
7095*80a68eefSBob Badour                         res[i] = ((_UNSIGNED_T(TYPE))atmp[i] >> lanesize_1 ) + ((TYPE)1 << lanesize_1) - 1; \
7096*80a68eefSBob Badour                     else res[i] = atmp[i] << btmp[i]; }}}} \
7097*80a68eefSBob Badour         return _mm_load_si128((__m128i*)res);
7098*80a68eefSBob Badour 
7099*80a68eefSBob Badour #define SERIAL_SATURATING_SHIFT_UNSIGNED(TYPE, LENMAX, LEN) \
7100*80a68eefSBob Badour         _NEON2SSE_ALIGN_16 _UNSIGNED_T(TYPE) atmp[LENMAX], res[LENMAX]; _NEON2SSE_ALIGN_16 TYPE btmp[LENMAX]; _UNSIGNED_T(TYPE) limit; int i; \
7101*80a68eefSBob Badour         TYPE lanesize = (sizeof(TYPE) << 3); \
7102*80a68eefSBob Badour         _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
7103*80a68eefSBob Badour         for (i = 0; i<LEN; i++) { \
7104*80a68eefSBob Badour         if (atmp[i] ==0) {res[i] = 0; \
7105*80a68eefSBob Badour         }else{ \
7106*80a68eefSBob Badour             if(btmp[i] < 0) res[i] = atmp[i] >> (-btmp[i]); \
7107*80a68eefSBob Badour             else{ \
7108*80a68eefSBob Badour                 if (btmp[i]>lanesize) res[i] = ~((TYPE)0); \
7109*80a68eefSBob Badour                 else{ \
7110*80a68eefSBob Badour                     limit = (TYPE) 1 << (lanesize - btmp[i]); \
7111*80a68eefSBob Badour                     res[i] = ( atmp[i] >= limit) ? ~((TYPE)0) : atmp[i] << btmp[i]; }}}} \
7112*80a68eefSBob Badour         return _mm_load_si128((__m128i*)res);
7113*80a68eefSBob Badour 
7114*80a68eefSBob Badour #define SERIAL_SATURATING_SHIFT_SIGNED_64(TYPE, LEN) \
7115*80a68eefSBob Badour         int ## TYPE ## x ## LEN ## _t res; int ## TYPE ## _t limit; int i; \
7116*80a68eefSBob Badour         int lanesize_1 = (sizeof( int ## TYPE ## _t) << 3) - 1; \
7117*80a68eefSBob Badour         for (i = 0; i<LEN; i++) { \
7118*80a68eefSBob Badour         if (a.m64_i ## TYPE[i] ==0) res.m64_i ## TYPE[i] = 0; \
7119*80a68eefSBob Badour         else{ \
7120*80a68eefSBob Badour             if(b.m64_i ## TYPE[i] <0) res.m64_i ## TYPE[i] = a.m64_i ## TYPE[i] >> (-(b.m64_i ## TYPE[i])); \
7121*80a68eefSBob Badour             else{ \
7122*80a68eefSBob Badour                 if (b.m64_i ## TYPE[i]>lanesize_1) { \
7123*80a68eefSBob Badour                     res.m64_i ## TYPE[i] = ((_UNSIGNED_T(int ## TYPE ## _t))a.m64_i ## TYPE[i] >> lanesize_1 ) + ((int ## TYPE ## _t) 1 << lanesize_1) - 1; \
7124*80a68eefSBob Badour                 }else{ \
7125*80a68eefSBob Badour                     limit = (int ## TYPE ## _t) 1 << (lanesize_1 - b.m64_i ## TYPE[i]); \
7126*80a68eefSBob Badour                     if((a.m64_i ## TYPE[i] >= limit)||(a.m64_i ## TYPE[i] <= -limit)) \
7127*80a68eefSBob Badour                         res.m64_i ## TYPE[i] = ((_UNSIGNED_T(int ## TYPE ## _t))a.m64_i ## TYPE[i] >> lanesize_1 ) + ((int ## TYPE ## _t) 1 << lanesize_1) - 1; \
7128*80a68eefSBob Badour                     else res.m64_i ## TYPE[i] = a.m64_i ## TYPE[i] << b.m64_i ## TYPE[i]; }}}} \
7129*80a68eefSBob Badour         return res;
7130*80a68eefSBob Badour 
7131*80a68eefSBob Badour #define SERIAL_SATURATING_SHIFT_UNSIGNED_64(TYPE, LEN) \
7132*80a68eefSBob Badour         int ## TYPE ## x ## LEN ## _t res;  _UNSIGNED_T(int ## TYPE ## _t) limit; int i; \
7133*80a68eefSBob Badour         int ## TYPE ## _t lanesize = (sizeof(int ## TYPE ## _t) << 3); \
7134*80a68eefSBob Badour         for (i = 0; i<LEN; i++) { \
7135*80a68eefSBob Badour         if (a.m64_u ## TYPE[i] ==0) {res.m64_u ## TYPE[i] = 0; \
7136*80a68eefSBob Badour         }else{ \
7137*80a68eefSBob Badour             if(b.m64_i ## TYPE[i] < 0) res.m64_u ## TYPE[i] = a.m64_u ## TYPE[i] >> (-(b.m64_i ## TYPE[i])); \
7138*80a68eefSBob Badour             else{ \
7139*80a68eefSBob Badour                 if (b.m64_i ## TYPE[i]>lanesize) res.m64_u ## TYPE[i] = ~((int ## TYPE ## _t) 0); \
7140*80a68eefSBob Badour                 else{ \
7141*80a68eefSBob Badour                     limit = (int ## TYPE ## _t) 1 << (lanesize - b.m64_i ## TYPE[i]); \
7142*80a68eefSBob Badour                     res.m64_u ## TYPE[i] = ( a.m64_u ## TYPE[i] >= limit) ? ~((int ## TYPE ## _t) 0) : a.m64_u ## TYPE[i] << b.m64_u ## TYPE[i]; }}}} \
7143*80a68eefSBob Badour         return res;
7144*80a68eefSBob Badour 
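//A small sketch of the saturating behaviour implemented above (hypothetical values):
//    int8x8_t  r1 = vqshl_s8(vdup_n_s8(100), vdup_n_s8(2));  //100 << 2 = 400 exceeds int8_t, saturates to 127
//    uint8x8_t r2 = vqshl_u8(vdup_n_u8(200), vdup_n_s8(1));  //200 << 1 = 400 exceeds uint8_t, saturates to 255
//    int8x8_t  r3 = vqshl_s8(vdup_n_s8(5),   vdup_n_s8(-1)); //negative count: plain right shift, 5 >> 1 = 2
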
7145*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vqshl_s8(int8x8_t a, int8x8_t b); // VQSHL.S8 d0,d0,d0
7146*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vqshl_s8(int8x8_t a, int8x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7147*80a68eefSBob Badour {
7148*80a68eefSBob Badour     SERIAL_SATURATING_SHIFT_SIGNED_64(8,8)
7149*80a68eefSBob Badour }
7150*80a68eefSBob Badour 
7151*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vqshl_s16(int16x4_t a, int16x4_t b); // VQSHL.S16 d0,d0,d0
7152*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vqshl_s16(int16x4_t a, int16x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7153*80a68eefSBob Badour {
7154*80a68eefSBob Badour     SERIAL_SATURATING_SHIFT_SIGNED_64(16,4)
7155*80a68eefSBob Badour }
7156*80a68eefSBob Badour 
7157*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vqshl_s32(int32x2_t a, int32x2_t b); // VQSHL.S32 d0,d0,d0
7158*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqshl_s32(int32x2_t a, int32x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7159*80a68eefSBob Badour {
7160*80a68eefSBob Badour     SERIAL_SATURATING_SHIFT_SIGNED_64(32,2)
7161*80a68eefSBob Badour }
7162*80a68eefSBob Badour 
7163*80a68eefSBob Badour _NEON2SSESTORAGE int64x1_t vqshl_s64(int64x1_t a, int64x1_t b); // VQSHL.S64 d0,d0,d0
7164*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqshl_s64(int64x1_t a, int64x1_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7165*80a68eefSBob Badour {
7166*80a68eefSBob Badour     SERIAL_SATURATING_SHIFT_SIGNED_64(64,1)
7167*80a68eefSBob Badour }
7168*80a68eefSBob Badour 
7169*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vqshl_u8(uint8x8_t a, int8x8_t b); // VQSHL.U8 d0,d0,d0
7170*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vqshl_u8(uint8x8_t a, int8x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7171*80a68eefSBob Badour {
7172*80a68eefSBob Badour     SERIAL_SATURATING_SHIFT_UNSIGNED_64(8,8)
7173*80a68eefSBob Badour }
7174*80a68eefSBob Badour 
7175*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vqshl_u16(uint16x4_t a, int16x4_t b); // VQSHL.U16 d0,d0,d0
7176*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vqshl_u16(uint16x4_t a, int16x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7177*80a68eefSBob Badour {
7178*80a68eefSBob Badour     SERIAL_SATURATING_SHIFT_UNSIGNED_64(16,4)
7179*80a68eefSBob Badour }
7180*80a68eefSBob Badour 
7181*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vqshl_u32(uint32x2_t a, int32x2_t b); // VQSHL.U32 d0,d0,d0
7182*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vqshl_u32(uint32x2_t a, int32x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7183*80a68eefSBob Badour {
7184*80a68eefSBob Badour     SERIAL_SATURATING_SHIFT_UNSIGNED_64(32,2)
7185*80a68eefSBob Badour }
7186*80a68eefSBob Badour 
7187*80a68eefSBob Badour _NEON2SSESTORAGE uint64x1_t vqshl_u64(uint64x1_t a, int64x1_t b); // VQSHL.U64 d0,d0,d0
7188*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqshl_u64(uint64x1_t a, int64x1_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7189*80a68eefSBob Badour {
7190*80a68eefSBob Badour     SERIAL_SATURATING_SHIFT_UNSIGNED_64(64,1)
7191*80a68eefSBob Badour }
7192*80a68eefSBob Badour 
7193*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t vqshlq_s8(int8x16_t a, int8x16_t b); // VQSHL.S8 q0,q0,q0
7194*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x16_t vqshlq_s8(int8x16_t a, int8x16_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7195*80a68eefSBob Badour {
7196*80a68eefSBob Badour     SERIAL_SATURATING_SHIFT_SIGNED(int8_t, 16, 16)
7197*80a68eefSBob Badour }
7198*80a68eefSBob Badour 
7199*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vqshlq_s16(int16x8_t a, int16x8_t b); // VQSHL.S16 q0,q0,q0
7200*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x8_t vqshlq_s16(int16x8_t a, int16x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7201*80a68eefSBob Badour {
7202*80a68eefSBob Badour     SERIAL_SATURATING_SHIFT_SIGNED(int16_t, 8, 8)
7203*80a68eefSBob Badour }
7204*80a68eefSBob Badour 
7205*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vqshlq_s32(int32x4_t a, int32x4_t b); // VQSHL.S32 q0,q0,q0
7206*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqshlq_s32(int32x4_t a, int32x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7207*80a68eefSBob Badour {
7208*80a68eefSBob Badour     SERIAL_SATURATING_SHIFT_SIGNED(int32_t, 4, 4)
7209*80a68eefSBob Badour }
7210*80a68eefSBob Badour 
7211*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vqshlq_s64(int64x2_t a, int64x2_t b); // VQSHL.S64 q0,q0,q0
7212*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqshlq_s64(int64x2_t a, int64x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7213*80a68eefSBob Badour {
7214*80a68eefSBob Badour     SERIAL_SATURATING_SHIFT_SIGNED(int64_t, 2, 2)
7215*80a68eefSBob Badour }
7216*80a68eefSBob Badour 
7217*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vqshlq_u8(uint8x16_t a, int8x16_t b); // VQSHL.U8 q0,q0,q0
7218*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x16_t vqshlq_u8(uint8x16_t a, int8x16_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7219*80a68eefSBob Badour {
7220*80a68eefSBob Badour     SERIAL_SATURATING_SHIFT_UNSIGNED(int8_t, 16, 16)
7221*80a68eefSBob Badour }
7222*80a68eefSBob Badour 
7223*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vqshlq_u16(uint16x8_t a, int16x8_t b); // VQSHL.U16 q0,q0,q0
7224*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x8_t vqshlq_u16(uint16x8_t a, int16x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7225*80a68eefSBob Badour {
7226*80a68eefSBob Badour     SERIAL_SATURATING_SHIFT_UNSIGNED(int16_t, 8, 8)
7227*80a68eefSBob Badour }
7228*80a68eefSBob Badour 
7229*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vqshlq_u32(uint32x4_t a, int32x4_t b); // VQSHL.U32 q0,q0,q0
7230*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vqshlq_u32(uint32x4_t a, int32x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7231*80a68eefSBob Badour {
7232*80a68eefSBob Badour     SERIAL_SATURATING_SHIFT_UNSIGNED(int32_t, 4, 4)
7233*80a68eefSBob Badour }
7234*80a68eefSBob Badour 
7235*80a68eefSBob Badour _NEON2SSESTORAGE uint64x2_t vqshlq_u64(uint64x2_t a, int64x2_t b); // VQSHL.U64 q0,q0,q0
7236*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqshlq_u64(uint64x2_t a, int64x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7237*80a68eefSBob Badour {
7238*80a68eefSBob Badour     SERIAL_SATURATING_SHIFT_UNSIGNED(int64_t, 2, 2)
7239*80a68eefSBob Badour }
7240*80a68eefSBob Badour 
7241*80a68eefSBob Badour 
7242*80a68eefSBob Badour //******** Vector rounding shift left: (negative values shift right) **********
7243*80a68eefSBob Badour //****************************************************************************
7244*80a68eefSBob Badour //No such operations are available in IA32 SIMD yet; only shifts by a constant exist, so a serial solution is needed
7245*80a68eefSBob Badour //Rounding makes sense for right shifts only.
7246*80a68eefSBob Badour #define SERIAL_ROUNDING_SHIFT(TYPE, INTERNAL_TYPE, LENMAX, LEN) \
7247*80a68eefSBob Badour         _NEON2SSE_ALIGN_16 TYPE atmp[LENMAX], res[LENMAX]; _NEON2SSE_ALIGN_16 INTERNAL_TYPE btmp[LENMAX]; INTERNAL_TYPE i, lanesize = sizeof(INTERNAL_TYPE) << 3; \
7248*80a68eefSBob Badour         _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
7249*80a68eefSBob Badour         for (i = 0; i<LEN; i++) { \
7250*80a68eefSBob Badour         if( btmp[i] >= 0) { \
7251*80a68eefSBob Badour             if(btmp[i] >= lanesize) res[i] = 0; \
7252*80a68eefSBob Badour             else res[i] = (atmp[i] << btmp[i]); \
7253*80a68eefSBob Badour         }else{ \
7254*80a68eefSBob Badour             res[i] = (btmp[i] < -lanesize) ? 0 : \
7255*80a68eefSBob Badour                             (btmp[i] == -lanesize) ? (atmp[i] & ((INTERNAL_TYPE)1 << (-btmp[i] - 1))) >> (-btmp[i] - 1) : \
7256*80a68eefSBob Badour                             (atmp[i] >> (-btmp[i])) + ( (atmp[i] & ((INTERNAL_TYPE)1 << (-btmp[i] - 1))) >> (-btmp[i] - 1) );    }} \
7257*80a68eefSBob Badour         return _mm_load_si128((__m128i*)res);
7258*80a68eefSBob Badour 
7259*80a68eefSBob Badour 
7260*80a68eefSBob Badour #define SERIAL_ROUNDING_SHIFT_64(TYPE, SIGN, LEN) \
7261*80a68eefSBob Badour         int ## TYPE ## x ## LEN ## _t res;  int i;  int lanesize = sizeof(int ## TYPE ## _t) << 3; \
7262*80a68eefSBob Badour         for (i = 0; i<LEN; i++) { \
7263*80a68eefSBob Badour         if( b.m64_i ## TYPE[i] >= 0) { \
7264*80a68eefSBob Badour             if(b.m64_i ## TYPE[i] >= lanesize) res.m64_ ## SIGN ## TYPE[i] = 0; \
7265*80a68eefSBob Badour             else res.m64_ ## SIGN ## TYPE[i] = (a.m64_ ## SIGN ## TYPE[i] << b.m64_i ## TYPE[i]); \
7266*80a68eefSBob Badour         }else{ \
7267*80a68eefSBob Badour             res.m64_ ## SIGN ## TYPE[i] = (b.m64_i ## TYPE[i] < -lanesize) ? 0 : \
7268*80a68eefSBob Badour                             (b.m64_i ## TYPE[i] == -lanesize) ? (a.m64_ ## SIGN ## TYPE[i] & ((int ## TYPE ## _t) 1 << (-(b.m64_i ## TYPE[i]) - 1))) >> (-(b.m64_i ## TYPE[i]) - 1) : \
7269*80a68eefSBob Badour                             (a.m64_ ## SIGN ## TYPE[i] >> (-(b.m64_i ## TYPE[i]))) + ( (a.m64_ ## SIGN ## TYPE[i] & ((int ## TYPE ## _t) 1 << (-(b.m64_i ## TYPE[i]) - 1))) >> (-(b.m64_i ## TYPE[i]) - 1) );    }} \
7270*80a68eefSBob Badour         return res;
7271*80a68eefSBob Badour 
7272*80a68eefSBob Badour 
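//A small sketch of the rounding behaviour implemented above (hypothetical values):
//    int8x8_t r = vrshl_s8(vdup_n_s8(7), vdup_n_s8(-1)); //(7 >> 1) plus the rounding bit = 3 + 1 = 4,
//                                                        //i.e. the right shift rounds to nearest instead of truncating
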
7273*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vrshl_s8(int8x8_t a, int8x8_t b); // VRSHL.S8 d0,d0,d0
7274*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vrshl_s8(int8x8_t a, int8x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7275*80a68eefSBob Badour {
7276*80a68eefSBob Badour     SERIAL_ROUNDING_SHIFT_64(8,i,8)
7277*80a68eefSBob Badour }
7278*80a68eefSBob Badour 
7279*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vrshl_s16(int16x4_t a, int16x4_t b); // VRSHL.S16 d0,d0,d0
7280*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vrshl_s16(int16x4_t a, int16x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7281*80a68eefSBob Badour {
7282*80a68eefSBob Badour     SERIAL_ROUNDING_SHIFT_64(16,i,4)
7283*80a68eefSBob Badour }
7284*80a68eefSBob Badour 
7285*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vrshl_s32(int32x2_t a, int32x2_t b); // VRSHL.S32 d0,d0,d0
7286*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vrshl_s32(int32x2_t a, int32x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7287*80a68eefSBob Badour {
7288*80a68eefSBob Badour     SERIAL_ROUNDING_SHIFT_64(32,i,2)
7289*80a68eefSBob Badour }
7290*80a68eefSBob Badour 
7291*80a68eefSBob Badour _NEON2SSESTORAGE int64x1_t vrshl_s64(int64x1_t a, int64x1_t b); // VRSHL.S64 d0,d0,d0
7292*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vrshl_s64(int64x1_t a, int64x1_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7293*80a68eefSBob Badour {
7294*80a68eefSBob Badour     SERIAL_ROUNDING_SHIFT_64(64,i,1)
7295*80a68eefSBob Badour }
7296*80a68eefSBob Badour 
7297*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vrshl_u8(uint8x8_t a, int8x8_t b); // VRSHL.U8 d0,d0,d0
7298*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vrshl_u8(uint8x8_t a, int8x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7299*80a68eefSBob Badour {
7300*80a68eefSBob Badour     SERIAL_ROUNDING_SHIFT_64(8,u,8)
7301*80a68eefSBob Badour }
7302*80a68eefSBob Badour 
7303*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vrshl_u16(uint16x4_t a, int16x4_t b); // VRSHL.U16 d0,d0,d0
7304*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vrshl_u16(uint16x4_t a, int16x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7305*80a68eefSBob Badour {
7306*80a68eefSBob Badour     SERIAL_ROUNDING_SHIFT_64(16,u,4)
7307*80a68eefSBob Badour }
7308*80a68eefSBob Badour 
7309*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vrshl_u32(uint32x2_t a, int32x2_t b); // VRSHL.U32 d0,d0,d0
7310*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vrshl_u32(uint32x2_t a, int32x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7311*80a68eefSBob Badour {
7312*80a68eefSBob Badour     SERIAL_ROUNDING_SHIFT_64(32,u,2)
7313*80a68eefSBob Badour }
7314*80a68eefSBob Badour 
7315*80a68eefSBob Badour _NEON2SSESTORAGE uint64x1_t vrshl_u64(uint64x1_t a, int64x1_t b); // VRSHL.U64 d0,d0,d0
7316*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vrshl_u64(uint64x1_t a, int64x1_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7317*80a68eefSBob Badour {
7318*80a68eefSBob Badour     SERIAL_ROUNDING_SHIFT_64(64,u,1)
7319*80a68eefSBob Badour }
7320*80a68eefSBob Badour 
7321*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t vrshlq_s8(int8x16_t a, int8x16_t b); // VRSHL.S8 q0,q0,q0
7322*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x16_t vrshlq_s8(int8x16_t a, int8x16_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7323*80a68eefSBob Badour {
7324*80a68eefSBob Badour     SERIAL_ROUNDING_SHIFT(int8_t, int8_t, 16, 16)
7325*80a68eefSBob Badour }
7326*80a68eefSBob Badour 
7327*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vrshlq_s16(int16x8_t a, int16x8_t b); // VRSHL.S16 q0,q0,q0
7328*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x8_t vrshlq_s16(int16x8_t a, int16x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7329*80a68eefSBob Badour {
7330*80a68eefSBob Badour     SERIAL_ROUNDING_SHIFT(int16_t, int16_t, 8, 8)
7331*80a68eefSBob Badour }
7332*80a68eefSBob Badour 
7333*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vrshlq_s32(int32x4_t a, int32x4_t b); // VRSHL.S32 q0,q0,q0
7334*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vrshlq_s32(int32x4_t a, int32x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7335*80a68eefSBob Badour {
7336*80a68eefSBob Badour     SERIAL_ROUNDING_SHIFT(int32_t, int32_t, 4, 4)
7337*80a68eefSBob Badour }
7338*80a68eefSBob Badour 
7339*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vrshlq_s64(int64x2_t a, int64x2_t b); // VRSHL.S64 q0,q0,q0
7340*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vrshlq_s64(int64x2_t a, int64x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7341*80a68eefSBob Badour {
7342*80a68eefSBob Badour     SERIAL_ROUNDING_SHIFT(int64_t, int64_t, 2, 2)
7343*80a68eefSBob Badour }
7344*80a68eefSBob Badour 
7345*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vrshlq_u8(uint8x16_t a, int8x16_t b); // VRSHL.U8 q0,q0,q0
7346*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x16_t vrshlq_u8(uint8x16_t a, int8x16_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7347*80a68eefSBob Badour {
7348*80a68eefSBob Badour     SERIAL_ROUNDING_SHIFT(uint8_t, int8_t, 16, 16)
7349*80a68eefSBob Badour }
7350*80a68eefSBob Badour 
7351*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vrshlq_u16(uint16x8_t a, int16x8_t b); // VRSHL.U16 q0,q0,q0
7352*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x8_t vrshlq_u16(uint16x8_t a, int16x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7353*80a68eefSBob Badour {
7354*80a68eefSBob Badour     SERIAL_ROUNDING_SHIFT(uint16_t, int16_t, 8, 8)
7355*80a68eefSBob Badour }
7356*80a68eefSBob Badour 
7357*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vrshlq_u32(uint32x4_t a, int32x4_t b); // VRSHL.U32 q0,q0,q0
7358*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vrshlq_u32(uint32x4_t a, int32x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7359*80a68eefSBob Badour {
7360*80a68eefSBob Badour     SERIAL_ROUNDING_SHIFT(uint32_t, int32_t, 4, 4)
7361*80a68eefSBob Badour }
7362*80a68eefSBob Badour 
7363*80a68eefSBob Badour _NEON2SSESTORAGE uint64x2_t vrshlq_u64(uint64x2_t a, int64x2_t b); // VRSHL.U64 q0,q0,q0
7364*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vrshlq_u64(uint64x2_t a, int64x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7365*80a68eefSBob Badour {
7366*80a68eefSBob Badour     SERIAL_ROUNDING_SHIFT(uint64_t, int64_t, 2, 2)
7367*80a68eefSBob Badour }
7368*80a68eefSBob Badour 
7369*80a68eefSBob Badour 
7370*80a68eefSBob Badour //********** Vector saturating rounding shift left: (negative values shift right) ****************
7371*80a68eefSBob Badour //*************************************************************************************************
7372*80a68eefSBob Badour //No such operations exist in IA32 SIMD unfortunately; only shifts by a constant are available, so a serial solution is needed
7373*80a68eefSBob Badour //Saturation happens for left shifts only, while rounding makes sense for right shifts only.
7374*80a68eefSBob Badour #define SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(TYPE, LENMAX, LEN) \
7375*80a68eefSBob Badour         _NEON2SSE_ALIGN_16 TYPE atmp[LENMAX], res[LENMAX], btmp[LENMAX]; TYPE limit; int i; \
7376*80a68eefSBob Badour         int lanesize_1 = (sizeof(TYPE) << 3) - 1; \
7377*80a68eefSBob Badour         _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
7378*80a68eefSBob Badour         for (i = 0; i<LEN; i++) { \
7379*80a68eefSBob Badour         if (atmp[i] ==0) res[i] = 0; \
7380*80a68eefSBob Badour         else{ \
7381*80a68eefSBob Badour             if(btmp[i] <0) res[i] = (btmp[i] < (-lanesize_1)) ? 0 : (atmp[i] >> (-btmp[i])) + ( (atmp[i] & ((TYPE)1 << (-btmp[i] - 1))) >> (-btmp[i] - 1) ); \
7382*80a68eefSBob Badour             else{ \
7383*80a68eefSBob Badour                 if (btmp[i]>lanesize_1) { \
7384*80a68eefSBob Badour                     res[i] = ((_UNSIGNED_T(TYPE))atmp[i] >> lanesize_1 ) + ((TYPE)1 << lanesize_1) - 1; \
7385*80a68eefSBob Badour                 }else{ \
7386*80a68eefSBob Badour                     limit = (TYPE)1 << (lanesize_1 - btmp[i]); \
7387*80a68eefSBob Badour                     if((atmp[i] >= limit)||(atmp[i] <= -limit)) \
7388*80a68eefSBob Badour                         res[i] = ((_UNSIGNED_T(TYPE))atmp[i] >> lanesize_1 ) + ((TYPE)1 << lanesize_1) - 1; \
7389*80a68eefSBob Badour                     else res[i] = atmp[i] << btmp[i]; }}}} \
7390*80a68eefSBob Badour         return _mm_load_si128((__m128i*)res);
7391*80a68eefSBob Badour 
7392*80a68eefSBob Badour #define SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(TYPE, LENMAX, LEN) \
7393*80a68eefSBob Badour         _NEON2SSE_ALIGN_16 _UNSIGNED_T(TYPE) atmp[LENMAX], res[LENMAX]; _NEON2SSE_ALIGN_16 TYPE btmp[LENMAX]; _UNSIGNED_T(TYPE) limit; int i; \
7394*80a68eefSBob Badour         int lanesize = (sizeof(TYPE) << 3); \
7395*80a68eefSBob Badour         _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
7396*80a68eefSBob Badour         for (i = 0; i<LEN; i++) { \
7397*80a68eefSBob Badour         if (atmp[i] ==0) {res[i] = 0; \
7398*80a68eefSBob Badour         }else{ \
7399*80a68eefSBob Badour             if(btmp[i] < 0) res[i] = (btmp[i] < (-lanesize)) ? 0 : (atmp[i] >> (-btmp[i])) + ( (atmp[i] & ((TYPE)1 << (-btmp[i] - 1))) >> (-btmp[i] - 1) ); \
7400*80a68eefSBob Badour             else{ \
7401*80a68eefSBob Badour                 if (btmp[i]>lanesize) res[i] = ~((TYPE)0); \
7402*80a68eefSBob Badour                 else{ \
7403*80a68eefSBob Badour                     limit = (TYPE) 1 << (lanesize - btmp[i]); \
7404*80a68eefSBob Badour                     res[i] = ( atmp[i] >= limit) ? ~((TYPE)0) : atmp[i] << btmp[i]; }}}} \
7405*80a68eefSBob Badour         return _mm_load_si128((__m128i*)res);
7406*80a68eefSBob Badour 
7407*80a68eefSBob Badour #define SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(TYPE, LEN) \
7408*80a68eefSBob Badour         __m64_128 res; int ## TYPE ## _t limit; int i; \
7409*80a68eefSBob Badour         int lanesize_1 = (sizeof(int ## TYPE ## _t ) << 3) - 1; \
7410*80a68eefSBob Badour         for (i = 0; i<LEN; i++) { \
7411*80a68eefSBob Badour         if (a.m64_i ## TYPE[i] ==0) res.m64_i ## TYPE[i] = 0; \
7412*80a68eefSBob Badour         else{ \
7413*80a68eefSBob Badour             if(b.m64_i ## TYPE[i] <0) res.m64_i ## TYPE[i] = (b.m64_i ## TYPE[i] < (-lanesize_1)) ? 0 : (a.m64_i ## TYPE[i] >> (-(b.m64_i ## TYPE[i]))) + ( (a.m64_i ## TYPE[i] & ((int ## TYPE ## _t ) 1 << (-(b.m64_i ## TYPE[i]) - 1))) >> (-(b.m64_i ## TYPE[i]) - 1) ); \
7414*80a68eefSBob Badour             else{ \
7415*80a68eefSBob Badour                 if (b.m64_i ## TYPE[i]>lanesize_1) { \
7416*80a68eefSBob Badour                     res.m64_i ## TYPE[i] = ((_UNSIGNED_T(int ## TYPE ## _t ))a.m64_i ## TYPE[i] >> lanesize_1 ) + ((int ## TYPE ## _t ) 1 << lanesize_1) - 1; \
7417*80a68eefSBob Badour                 }else{ \
7418*80a68eefSBob Badour                     limit = (int ## TYPE ## _t ) 1 << (lanesize_1 - b.m64_i ## TYPE[i]); \
7419*80a68eefSBob Badour                     if((a.m64_i ## TYPE[i] >= limit)||(a.m64_i ## TYPE[i] <= -limit)) \
7420*80a68eefSBob Badour                         res.m64_i ## TYPE[i] = ((_UNSIGNED_T(int ## TYPE ## _t ))a.m64_i ## TYPE[i] >> lanesize_1 ) + ((int ## TYPE ## _t ) 1 << lanesize_1) - 1; \
7421*80a68eefSBob Badour                     else res.m64_i ## TYPE[i] = a.m64_i ## TYPE[i] << b.m64_i ## TYPE[i]; }}}} \
7422*80a68eefSBob Badour         return res;
7423*80a68eefSBob Badour 
7424*80a68eefSBob Badour #define SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(TYPE, LEN) \
7425*80a68eefSBob Badour         __m64_128 res; _UNSIGNED_T(int ## TYPE ## _t) limit; int i; \
7426*80a68eefSBob Badour         int lanesize = (sizeof(int ## TYPE ## _t) << 3); \
7427*80a68eefSBob Badour         for (i = 0; i<LEN; i++) { \
7428*80a68eefSBob Badour         if (a.m64_u ## TYPE[i] ==0) {res.m64_u ## TYPE[i] = 0; \
7429*80a68eefSBob Badour         }else{ \
7430*80a68eefSBob Badour             if(b.m64_i ## TYPE[i] < 0) res.m64_u ## TYPE[i] = (b.m64_i ## TYPE[i] < (-lanesize)) ? 0 : (a.m64_u ## TYPE[i] >> (-(b.m64_i ## TYPE[i]))) + ( (a.m64_u ## TYPE[i] & ((int ## TYPE ## _t) 1 << (-(b.m64_i ## TYPE[i]) - 1))) >> (-(b.m64_i ## TYPE[i]) - 1) ); \
7431*80a68eefSBob Badour             else{ \
7432*80a68eefSBob Badour                 if (b.m64_i ## TYPE[i]>lanesize) res.m64_u ## TYPE[i] = ~((int ## TYPE ## _t) 0); \
7433*80a68eefSBob Badour                 else{ \
7434*80a68eefSBob Badour                     limit = (int ## TYPE ## _t) 1 << (lanesize - b.m64_i ## TYPE[i]); \
7435*80a68eefSBob Badour                     res.m64_u ## TYPE[i] = ( a.m64_u ## TYPE[i] >= limit) ? ~((int ## TYPE ## _t) 0) : a.m64_u ## TYPE[i] << b.m64_i ## TYPE[i]; }}}} \
7436*80a68eefSBob Badour         return res;
7437*80a68eefSBob Badour 
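//A small sketch combining saturation and rounding (hypothetical values):
//    int16x4_t r1 = vqrshl_s16(vdup_n_s16(1000), vdup_n_s16(7));  //1000 << 7 overflows int16_t, saturates to 32767
//    int16x4_t r2 = vqrshl_s16(vdup_n_s16(5),    vdup_n_s16(-1)); //rounding right shift: (5 >> 1) + 1 = 3
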
7438*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vqrshl_s8(int8x8_t a, int8x8_t b); // VQRSHL.S8 d0,d0,d0
7439*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vqrshl_s8(int8x8_t a, int8x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7440*80a68eefSBob Badour {
7441*80a68eefSBob Badour     SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(8,8)
7442*80a68eefSBob Badour }
7443*80a68eefSBob Badour 
7444*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vqrshl_s16(int16x4_t a, int16x4_t b); // VQRSHL.S16 d0,d0,d0
7445*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vqrshl_s16(int16x4_t a, int16x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7446*80a68eefSBob Badour {
7447*80a68eefSBob Badour     SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(16,4)
7448*80a68eefSBob Badour }
7449*80a68eefSBob Badour 
7450*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vqrshl_s32(int32x2_t a, int32x2_t b); // VQRSHL.S32 d0,d0,d0
7451*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrshl_s32(int32x2_t a, int32x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7452*80a68eefSBob Badour {
7453*80a68eefSBob Badour     SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(32,2)
7454*80a68eefSBob Badour }
7455*80a68eefSBob Badour 
7456*80a68eefSBob Badour _NEON2SSESTORAGE int64x1_t vqrshl_s64(int64x1_t a, int64x1_t b); // VQRSHL.S64 d0,d0,d0
7457*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqrshl_s64(int64x1_t a, int64x1_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7458*80a68eefSBob Badour {
7459*80a68eefSBob Badour     SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(64,1)
7460*80a68eefSBob Badour }
7461*80a68eefSBob Badour 
7462*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vqrshl_u8(uint8x8_t a, int8x8_t b); // VQRSHL.U8 d0,d0,d0
7463*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vqrshl_u8(uint8x8_t a, int8x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7464*80a68eefSBob Badour {
7465*80a68eefSBob Badour     SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(8,8)
7466*80a68eefSBob Badour }
7467*80a68eefSBob Badour 
7468*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vqrshl_u16(uint16x4_t a, int16x4_t b); // VQRSHL.U16 d0,d0,d0
7469*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vqrshl_u16(uint16x4_t a, int16x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7470*80a68eefSBob Badour {
7471*80a68eefSBob Badour     SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(16,4)
7472*80a68eefSBob Badour }
7473*80a68eefSBob Badour 
7474*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vqrshl_u32(uint32x2_t a, int32x2_t b); // VQRSHL.U32 d0,d0,d0
7475*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vqrshl_u32(uint32x2_t a, int32x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7476*80a68eefSBob Badour {
7477*80a68eefSBob Badour     SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(32,2)
7478*80a68eefSBob Badour }
7479*80a68eefSBob Badour 
7480*80a68eefSBob Badour _NEON2SSESTORAGE uint64x1_t vqrshl_u64(uint64x1_t a, int64x1_t b); // VQRSHL.U64 d0,d0,d0
7481*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqrshl_u64(uint64x1_t a, int64x1_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7482*80a68eefSBob Badour {
7483*80a68eefSBob Badour     SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(64,1)
7484*80a68eefSBob Badour }
7485*80a68eefSBob Badour 
7486*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t vqrshlq_s8(int8x16_t a, int8x16_t b); // VQRSHL.S8 q0,q0,q0
7487*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x16_t vqrshlq_s8(int8x16_t a, int8x16_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7488*80a68eefSBob Badour {
7489*80a68eefSBob Badour     SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(int8_t, 16, 16)
7490*80a68eefSBob Badour }
7491*80a68eefSBob Badour 
7492*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vqrshlq_s16(int16x8_t a, int16x8_t b); // VQRSHL.S16 q0,q0,q0
7493*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x8_t vqrshlq_s16(int16x8_t a, int16x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7494*80a68eefSBob Badour {
7495*80a68eefSBob Badour     SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(int16_t, 8, 8)
7496*80a68eefSBob Badour }
7497*80a68eefSBob Badour 
7498*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vqrshlq_s32(int32x4_t a, int32x4_t b); // VQRSHL.S32 q0,q0,q0
7499*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqrshlq_s32(int32x4_t a, int32x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7500*80a68eefSBob Badour {
7501*80a68eefSBob Badour     SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(int32_t, 4, 4)
7502*80a68eefSBob Badour }
7503*80a68eefSBob Badour 
7504*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vqrshlq_s64(int64x2_t a, int64x2_t b); // VQRSHL.S64 q0,q0,q0
7505*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqrshlq_s64(int64x2_t a, int64x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7506*80a68eefSBob Badour {
7507*80a68eefSBob Badour     SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(int64_t, 2, 2)
7508*80a68eefSBob Badour }
7509*80a68eefSBob Badour 
7510*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vqrshlq_u8(uint8x16_t a, int8x16_t b); // VQRSHL.U8 q0,q0,q0
7511*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x16_t vqrshlq_u8(uint8x16_t a, int8x16_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7512*80a68eefSBob Badour {
7513*80a68eefSBob Badour     SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(int8_t, 16, 16)
7514*80a68eefSBob Badour }
7515*80a68eefSBob Badour 
7516*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vqrshlq_u16(uint16x8_t a, int16x8_t b); // VQRSHL.U16 q0,q0,q0
7517*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x8_t vqrshlq_u16(uint16x8_t a, int16x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7518*80a68eefSBob Badour {
7519*80a68eefSBob Badour     SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(int16_t, 8, 8)
7520*80a68eefSBob Badour }
7521*80a68eefSBob Badour 
7522*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vqrshlq_u32(uint32x4_t a, int32x4_t b); // VQRSHL.U32 q0,q0,q0
7523*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vqrshlq_u32(uint32x4_t a, int32x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7524*80a68eefSBob Badour {
7525*80a68eefSBob Badour     SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(int32_t, 4, 4)
7526*80a68eefSBob Badour }
7527*80a68eefSBob Badour 
7528*80a68eefSBob Badour _NEON2SSESTORAGE uint64x2_t vqrshlq_u64(uint64x2_t a, int64x2_t b); // VQRSHL.U64 q0,q0,q0
7529*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqrshlq_u64(uint64x2_t a, int64x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
7530*80a68eefSBob Badour {
7531*80a68eefSBob Badour     SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(int64_t, 2, 2)
7532*80a68eefSBob Badour }
7533*80a68eefSBob Badour 
7534*80a68eefSBob Badour // *********************************************************************************
7535*80a68eefSBob Badour // *****************************  Shifts by a constant *****************************
7536*80a68eefSBob Badour // *********************************************************************************
7537*80a68eefSBob Badour //**************** Vector shift right by constant*************************************
7538*80a68eefSBob Badour //************************************************************************************
7539*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vshr_n_s8(int8x8_t a, __constrange(1,8) int b); // VSHR.S8 d0,d0,#8
7540*80a68eefSBob Badour _NEON2SSE_INLINE int8x8_t vshr_n_s8(int8x8_t a, __constrange(1,8) int b) // VSHR.S8 d0,d0,#8
7541*80a68eefSBob Badour {
7542*80a68eefSBob Badour     //no 8 bit shift available, go to 16 bit
7543*80a68eefSBob Badour     int8x8_t res64;
7544*80a68eefSBob Badour     __m128i r;
7545*80a68eefSBob Badour     r = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
7546*80a68eefSBob Badour     r = _mm_srai_epi16 (r, b); //SSE2
7547*80a68eefSBob Badour     r = _mm_packs_epi16 (r,r); //we need 64 bits only
7548*80a68eefSBob Badour     return64(r);
7549*80a68eefSBob Badour }
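//For example (hypothetical value), vshr_n_s8(vdup_n_s8(-9), 2) gives -3 in every lane:
//the arithmetic shift propagates the sign bit, so the result rounds toward minus infinity.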
7550*80a68eefSBob Badour 
7551*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vshr_n_s16(int16x4_t a,  __constrange(1,16) int b); // VSHR.S16 d0,d0,#16
7552*80a68eefSBob Badour _NEON2SSE_INLINE int16x4_t vshr_n_s16(int16x4_t a,  __constrange(1,16) int b)
7553*80a68eefSBob Badour {
7554*80a68eefSBob Badour     int16x4_t res64;
7555*80a68eefSBob Badour     return64(_mm_srai_epi16(_pM128i(a), b));
7556*80a68eefSBob Badour }
7557*80a68eefSBob Badour 
7558*80a68eefSBob Badour 
7559*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vshr_n_s32(int32x2_t a,  __constrange(1,32) int b); // VSHR.S32 d0,d0,#32
7560*80a68eefSBob Badour _NEON2SSE_INLINE int32x2_t vshr_n_s32(int32x2_t a,  __constrange(1,32) int b)
7561*80a68eefSBob Badour {
7562*80a68eefSBob Badour     int32x2_t res64;
7563*80a68eefSBob Badour     return64(_mm_srai_epi32(_pM128i(a), b));
7564*80a68eefSBob Badour }
7565*80a68eefSBob Badour 
7566*80a68eefSBob Badour _NEON2SSESTORAGE int64x1_t vshr_n_s64(int64x1_t a, __constrange(1,64) int b); // VSHR.S64 d0,d0,#64
7567*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vshr_n_s64(int64x1_t a, __constrange(1,64) int b), _NEON2SSE_REASON_SLOW_SERIAL)
7568*80a68eefSBob Badour {
7569*80a68eefSBob Badour     //no arithmetic shift for 64bit values, serial solution used
7570*80a68eefSBob Badour     int64x1_t res;
7571*80a68eefSBob Badour     if(b>=64) res.m64_i64[0] = 0;
7572*80a68eefSBob Badour     else res.m64_i64[0] = (*(int64_t*)&a) >> b;
7573*80a68eefSBob Badour     return res;
7574*80a68eefSBob Badour }
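//For example (hypothetical value), vshr_n_s64(vdup_n_s64(-8), 2) yields -2 in the single lane.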
7575*80a68eefSBob Badour 
7576*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vshr_n_u8(uint8x8_t a, __constrange(1,8) int b); // VSHR.U8 d0,d0,#8
7577*80a68eefSBob Badour _NEON2SSE_INLINE uint8x8_t vshr_n_u8(uint8x8_t a, __constrange(1,8) int b) // VSHR.U8 d0,d0,#8
7578*80a68eefSBob Badour {
7579*80a68eefSBob Badour     //no 8 bit shift available, go to 16 bit
7580*80a68eefSBob Badour     uint8x8_t res64;
7581*80a68eefSBob Badour     __m128i r;
7582*80a68eefSBob Badour     r = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE 4.1
7583*80a68eefSBob Badour     r = _mm_srli_epi16 (r, b); //for unsigned variables we use the logical shift, not the arithmetic one
7584*80a68eefSBob Badour     r = _mm_packus_epi16 (r,r); //we need 64 bits only
7585*80a68eefSBob Badour     return64(r);
7586*80a68eefSBob Badour }
7587*80a68eefSBob Badour 
7588*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vshr_n_u16(uint16x4_t a,  __constrange(1,16) int b); // VSHR.U16 d0,d0,#16
7589*80a68eefSBob Badour _NEON2SSE_INLINE uint16x4_t vshr_n_u16(uint16x4_t a,  __constrange(1,16) int b)
7590*80a68eefSBob Badour {
7591*80a68eefSBob Badour     uint16x4_t res64;
7592*80a68eefSBob Badour     return64(_mm_srli_epi16(_pM128i(a), b));
7593*80a68eefSBob Badour }
7594*80a68eefSBob Badour 
7595*80a68eefSBob Badour 
7596*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vshr_n_u32(uint32x2_t a,  __constrange(1,32) int b); // VSHR.U32 d0,d0,#32
7597*80a68eefSBob Badour _NEON2SSE_INLINE uint32x2_t vshr_n_u32(uint32x2_t a,  __constrange(1,32) int b)
7598*80a68eefSBob Badour {
7599*80a68eefSBob Badour     uint32x2_t res64;
7600*80a68eefSBob Badour     return64(_mm_srli_epi32(_pM128i(a), b));
7601*80a68eefSBob Badour }
7602*80a68eefSBob Badour 
7603*80a68eefSBob Badour 
7604*80a68eefSBob Badour _NEON2SSESTORAGE uint64x1_t vshr_n_u64(uint64x1_t a,  __constrange(1,64) int b); // VSHR.U64 d0,d0,#64
7605*80a68eefSBob Badour _NEON2SSE_INLINE uint64x1_t vshr_n_u64(uint64x1_t a,  __constrange(1,64) int b)
7606*80a68eefSBob Badour {
7607*80a68eefSBob Badour     uint64x1_t res64;
7608*80a68eefSBob Badour     return64(_mm_srli_epi64(_pM128i(a), b));
7609*80a68eefSBob Badour }
7610*80a68eefSBob Badour 
7611*80a68eefSBob Badour 
7612*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t vshrq_n_s8(int8x16_t a, __constrange(1,8) int b); // VSHR.S8 q0,q0,#8
7613*80a68eefSBob Badour _NEON2SSE_INLINE int8x16_t vshrq_n_s8(int8x16_t a, __constrange(1,8) int b) // VSHR.S8 q0,q0,#8
7614*80a68eefSBob Badour {
7615*80a68eefSBob Badour     //no 8 bit shift available, go to 16 bit trick
7616*80a68eefSBob Badour     __m128i zero, mask0, a_sign, r, a_sign_mask;
7617*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 static const int16_t mask0_16[9] = {0x0000, 0x0080, 0x00c0, 0x00e0, 0x00f0,  0x00f8, 0x00fc, 0x00fe, 0x00ff};
7618*80a68eefSBob Badour     zero = _mm_setzero_si128();
7619*80a68eefSBob Badour     mask0 = _mm_set1_epi16(mask0_16[b]); //to mask the bits to be "spoiled"  by 16 bit shift
7620*80a68eefSBob Badour     a_sign =  _mm_cmpgt_epi8 (zero, a); //ff if a<0 or zero if a>0
7621*80a68eefSBob Badour     r = _mm_srai_epi16 (a, b);
7622*80a68eefSBob Badour     a_sign_mask =  _mm_and_si128 (mask0, a_sign);
7623*80a68eefSBob Badour     r =  _mm_andnot_si128 (mask0, r);
7624*80a68eefSBob Badour     return _mm_or_si128 (r, a_sign_mask);
7625*80a68eefSBob Badour }
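//Worked example of the masking trick above (hypothetical shift count): for b = 3 the 16 bit arithmetic shift
//drags 3 bits of each high byte into the top of its neighbouring low byte; mask0_16[3] = 0x00e0 marks exactly
//those 3 bits, which are cleared and then refilled from a_sign, so every byte ends up with its own sign bits
//rather than its neighbour's data.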
7626*80a68eefSBob Badour 
7627*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vshrq_n_s16(int16x8_t a, __constrange(1,16) int b); // VSHR.S16 q0,q0,#16
7628*80a68eefSBob Badour #define vshrq_n_s16 _mm_srai_epi16
7629*80a68eefSBob Badour 
7630*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vshrq_n_s32(int32x4_t a, __constrange(1,32) int b); // VSHR.S32 q0,q0,#32
7631*80a68eefSBob Badour #define vshrq_n_s32 _mm_srai_epi32
7632*80a68eefSBob Badour 
7633*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vshrq_n_s64(int64x2_t a, __constrange(1,64) int b); // VSHR.S64 q0,q0,#64
7634*80a68eefSBob Badour _NEON2SSE_INLINE int64x2_t vshrq_n_s64(int64x2_t a, __constrange(1,64) int b)
7635*80a68eefSBob Badour {
7636*80a68eefSBob Badour     //The SIMD implementation may not be optimal due to the absence of a 64 bit arithmetic shift in x86 SIMD
7637*80a68eefSBob Badour     __m128i c1, signmask,a0,  res64;
7638*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 static const uint64_t mask[] = {0x8000000000000000, 0x8000000000000000};
7639*80a68eefSBob Badour     c1 =  _mm_cmpeq_epi32(a,a); //0xffffffffffffffff
7640*80a68eefSBob Badour     signmask  =  _mm_slli_epi64 (c1, (64 - b));
7641*80a68eefSBob Badour     a0 = _mm_or_si128(a, *(__m128i*)mask); //force the sign bit on, so the next compare flags lanes that already had it set (negative lanes)
7642*80a68eefSBob Badour     a0 = _MM_CMPEQ_EPI64 (a, a0);
7643*80a68eefSBob Badour     signmask = _mm_and_si128(a0, signmask);
7644*80a68eefSBob Badour     res64 = _mm_srli_epi64 (a, b);
7645*80a68eefSBob Badour     return _mm_or_si128(res64, signmask);
7646*80a68eefSBob Badour }
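//Sketch of the idea above (hypothetical value): for a lane holding -8 with b = 2 the logical shift produces
//0x3ffffffffffffffe; signmask contributes the two missing top bits (0xc000000000000000) only for lanes whose
//sign bit was set, so the final OR restores the arithmetic-shift result -2.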
7647*80a68eefSBob Badour 
7648*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vshrq_n_u8(uint8x16_t a, __constrange(1,8) int b); // VSHR.U8 q0,q0,#8
7649*80a68eefSBob Badour _NEON2SSE_INLINE uint8x16_t vshrq_n_u8(uint8x16_t a, __constrange(1,8) int b) // VSHR.U8 q0,q0,#8
7650*80a68eefSBob Badour {
7651*80a68eefSBob Badour     //no 8 bit shift available, need the special trick
7652*80a68eefSBob Badour     __m128i mask0, r;
7653*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 static const uint16_t mask10_16[9] = {0xffff, 0xff7f, 0xff3f, 0xff1f, 0xff0f,  0xff07, 0xff03, 0xff01, 0xff00};
7654*80a68eefSBob Badour     mask0 = _mm_set1_epi16(mask10_16[b]); //to mask the bits to be "spoiled"  by 16 bit shift
7655*80a68eefSBob Badour     r = _mm_srli_epi16 ( a, b);
7656*80a68eefSBob Badour     return _mm_and_si128 (r,  mask0);
7657*80a68eefSBob Badour }
7658*80a68eefSBob Badour 
7659*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vshrq_n_u16(uint16x8_t a, __constrange(1,16) int b); // VSHR.U16 q0,q0,#16
7660*80a68eefSBob Badour #define vshrq_n_u16 _mm_srli_epi16
7661*80a68eefSBob Badour 
7662*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vshrq_n_u32(uint32x4_t a, __constrange(1,32) int b); // VSHR.U32 q0,q0,#32
7663*80a68eefSBob Badour #define vshrq_n_u32 _mm_srli_epi32
7664*80a68eefSBob Badour 
7665*80a68eefSBob Badour _NEON2SSESTORAGE uint64x2_t vshrq_n_u64(uint64x2_t a, __constrange(1,64) int b); // VSHR.U64 q0,q0,#64
7666*80a68eefSBob Badour #define vshrq_n_u64 _mm_srli_epi64
7667*80a68eefSBob Badour 
7668*80a68eefSBob Badour //*************************** Vector shift left by constant *************************
7669*80a68eefSBob Badour //*********************************************************************************
7670*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vshl_n_s8(int8x8_t a, __constrange(0,7) int b); // VSHL.I8 d0,d0,#0
7671*80a68eefSBob Badour _NEON2SSE_INLINE int8x8_t vshl_n_s8(int8x8_t a, __constrange(0,7) int b) // VSHL.I8 d0,d0,#0
7672*80a68eefSBob Badour {
7673*80a68eefSBob Badour     //no 8 bit shift available, go to 16 bit
7674*80a68eefSBob Badour     int8x8_t res64;
7675*80a68eefSBob Badour     __m128i r;
7676*80a68eefSBob Badour     r = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
7677*80a68eefSBob Badour     r = _mm_slli_epi16 (r, b); //SSE2
7678*80a68eefSBob Badour     r = _mm_shuffle_epi8 (r, *(__m128i*) mask8_16_even_odd); //return to 8 bit, we need 64 bits only
7679*80a68eefSBob Badour     return64(r);
7680*80a68eefSBob Badour }
7681*80a68eefSBob Badour 
7682*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vshl_n_s16(int16x4_t a,  __constrange(0,15) int b); // VSHL.I16 d0,d0,#0
7683*80a68eefSBob Badour _NEON2SSE_INLINE int16x4_t vshl_n_s16(int16x4_t a,  __constrange(0,15) int b)
7684*80a68eefSBob Badour {
7685*80a68eefSBob Badour     int16x4_t res64;
7686*80a68eefSBob Badour     return64(_mm_slli_epi16(_pM128i(a), b));
7687*80a68eefSBob Badour }
7688*80a68eefSBob Badour 
7689*80a68eefSBob Badour 
7690*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vshl_n_s32(int32x2_t a,  __constrange(0,31) int b); // VSHL.I32 d0,d0,#0
7691*80a68eefSBob Badour _NEON2SSE_INLINE int32x2_t vshl_n_s32(int32x2_t a,  __constrange(0,31) int b)
7692*80a68eefSBob Badour {
7693*80a68eefSBob Badour     int32x2_t res64;
7694*80a68eefSBob Badour     return64(_mm_slli_epi32(_pM128i(a), b));
7695*80a68eefSBob Badour }
7696*80a68eefSBob Badour 
7697*80a68eefSBob Badour 
7698*80a68eefSBob Badour _NEON2SSESTORAGE int64x1_t vshl_n_s64(int64x1_t a,  __constrange(0,63) int b); // VSHL.I64 d0,d0,#0
7699*80a68eefSBob Badour _NEON2SSE_INLINE int64x1_t vshl_n_s64(int64x1_t a,  __constrange(0,63) int b)
7700*80a68eefSBob Badour {
7701*80a68eefSBob Badour     int64x1_t res64;
7702*80a68eefSBob Badour     return64(_mm_slli_epi64(_pM128i(a), b));
7703*80a68eefSBob Badour }
7704*80a68eefSBob Badour 
7705*80a68eefSBob Badour 
7706*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vshl_n_u8(uint8x8_t a, __constrange(0,7) int b); // VSHL.I8 d0,d0,#0
7707*80a68eefSBob Badour _NEON2SSE_INLINE uint8x8_t vshl_n_u8(uint8x8_t a, __constrange(0,7) int b)
7708*80a68eefSBob Badour {
7709*80a68eefSBob Badour     //no 8 bit shift available, go to 16 bit
7710*80a68eefSBob Badour     uint8x8_t res64;
7711*80a68eefSBob Badour     __m128i mask8;
7712*80a68eefSBob Badour     __m128i r;
7713*80a68eefSBob Badour     mask8 = _mm_set1_epi16(0xff);
7714*80a68eefSBob Badour     r = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE 4.1
7715*80a68eefSBob Badour     r = _mm_slli_epi16 (r, b); //SSE2
7716*80a68eefSBob Badour     r = _mm_and_si128(r, mask8); //to avoid saturation
7717*80a68eefSBob Badour     r = _mm_packus_epi16 (r,r); //we need 64 bits only
7718*80a68eefSBob Badour     return64(r);
7719*80a68eefSBob Badour }
7720*80a68eefSBob Badour 
7721*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vshl_n_u16(uint16x4_t a,  __constrange(0,15) int b); // VSHL.I16 d0,d0,#0
7722*80a68eefSBob Badour #define vshl_n_u16 vshl_n_s16
7723*80a68eefSBob Badour 
7724*80a68eefSBob Badour 
7725*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vshl_n_u32(uint32x2_t a,  __constrange(0,31) int b); // VSHL.I32 d0,d0,#0
7726*80a68eefSBob Badour #define vshl_n_u32 vshl_n_s32
7727*80a68eefSBob Badour 
7728*80a68eefSBob Badour _NEON2SSESTORAGE uint64x1_t vshl_n_u64(uint64x1_t a, __constrange(0,63) int b); // VSHL.I64 d0,d0,#0
7729*80a68eefSBob Badour #define vshl_n_u64 vshl_n_s64
7730*80a68eefSBob Badour 
7731*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t vshlq_n_s8(int8x16_t a, __constrange(0,7) int b); // VSHL.I8 q0,q0,#0
7732*80a68eefSBob Badour #define vshlq_n_s8 vshlq_n_u8
7733*80a68eefSBob Badour 
7734*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vshlq_n_s16(int16x8_t a, __constrange(0,15) int b); // VSHL.I16 q0,q0,#0
7735*80a68eefSBob Badour #define vshlq_n_s16 _mm_slli_epi16
7736*80a68eefSBob Badour 
7737*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vshlq_n_s32(int32x4_t a, __constrange(0,31) int b); // VSHL.I32 q0,q0,#0
7738*80a68eefSBob Badour #define vshlq_n_s32 _mm_slli_epi32
7739*80a68eefSBob Badour 
7740*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vshlq_n_s64(int64x2_t a, __constrange(0,63) int b); // VSHL.I64 q0,q0,#0
7741*80a68eefSBob Badour #define vshlq_n_s64 _mm_slli_epi64
7742*80a68eefSBob Badour 
7743*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vshlq_n_u8(uint8x16_t a, __constrange(0,7) int b); // VSHL.I8 q0,q0,#0
7744*80a68eefSBob Badour _NEON2SSE_INLINE uint8x16_t vshlq_n_u8(uint8x16_t a, __constrange(0,7) int b)
7745*80a68eefSBob Badour {
7746*80a68eefSBob Badour     //no 8 bit shift available, need the special trick
7747*80a68eefSBob Badour     __m128i mask0, r;
7748*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 static const uint16_t mask10_16[9] = {0xffff, 0xfeff, 0xfcff, 0xf8ff, 0xf0ff,  0xe0ff, 0xc0ff, 0x80ff, 0xff};
7749*80a68eefSBob Badour     mask0 = _mm_set1_epi16(mask10_16[b]); //mask off the bits leaked from the neighbouring byte by the 16 bit shift
7750*80a68eefSBob Badour     r = _mm_slli_epi16 ( a, b);
7751*80a68eefSBob Badour     return _mm_and_si128 (r,  mask0);
7752*80a68eefSBob Badour }
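
//Counterpart note for the left shift above (illustrative only; the helper name is hypothetical):
//here the low byte's top b bits leak into the high byte, so the mask clears the low b bits of the
//high byte instead, e.g. for b = 2: 0xABCD << 2 = 0xAF34, and 0xAF34 & 0xFCFF = 0xAC34 = {(0xAB << 2) & 0xFF, (0xCD << 2) & 0xFF}.
_NEON2SSE_INLINE uint16_t _neon2sse_example_u8_pair_shl(uint16_t lane, int b) //assumes 0 <= b <= 7
{
    static const uint16_t mask01_16[8] = {0xffff, 0xfeff, 0xfcff, 0xf8ff, 0xf0ff, 0xe0ff, 0xc0ff, 0x80ff};
    return (uint16_t)((uint16_t)(lane << b) & mask01_16[b]); //scalar equivalent of the _mm_slli_epi16 + _mm_and_si128 pair above
}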
7753*80a68eefSBob Badour 
7754*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vshlq_n_u16(uint16x8_t a, __constrange(0,15) int b); // VSHL.I16 q0,q0,#0
7755*80a68eefSBob Badour #define vshlq_n_u16 vshlq_n_s16
7756*80a68eefSBob Badour 
7757*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vshlq_n_u32(uint32x4_t a, __constrange(0,31) int b); // VSHL.I32 q0,q0,#0
7758*80a68eefSBob Badour #define vshlq_n_u32 vshlq_n_s32
7759*80a68eefSBob Badour 
7760*80a68eefSBob Badour _NEON2SSESTORAGE uint64x2_t vshlq_n_u64(uint64x2_t a, __constrange(0,63) int b); // VSHL.I64 q0,q0,#0
7761*80a68eefSBob Badour #define vshlq_n_u64 vshlq_n_s64
7762*80a68eefSBob Badour 
7763*80a68eefSBob Badour //************* Vector rounding shift right by constant ******************
7764*80a68eefSBob Badour //*************************************************************************
7765*80a68eefSBob Badour //No corresponding x86 intrinsics exist, need to do some tricks
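//The rounding used throughout this section follows the NEON rule res = (a + (1 << (b-1))) >> b,
//i.e. bit (b-1) of the source is added back after the shift. A minimal scalar sketch of that rule
//(illustrative only; the helper name is hypothetical and assumes the common arithmetic >> for
//negative values, exactly as the serial paths in this file already do):
_NEON2SSE_INLINE int16_t _neon2sse_example_rshr_s16(int16_t a, int b) //assumes 1 <= b <= 15
{
    int32_t wide = (int32_t)a; //widen so that adding the rounding constant cannot overflow
    return (int16_t)((wide + (1 << (b - 1))) >> b); //identical to (a >> b) + ((a >> (b-1)) & 1)
}
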
7766*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vrshr_n_s8(int8x8_t a, __constrange(1,8) int b); // VRSHR.S8 d0,d0,#8
7767*80a68eefSBob Badour _NEON2SSE_INLINE int8x8_t vrshr_n_s8(int8x8_t a, __constrange(1,8) int b) // VRSHR.S8 d0,d0,#8
7768*80a68eefSBob Badour {
7769*80a68eefSBob Badour     //no 8 bit shift available, go to 16 bit
7770*80a68eefSBob Badour     int8x8_t res64;
7771*80a68eefSBob Badour     __m128i r, maskb;
7772*80a68eefSBob Badour     r = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
7773*80a68eefSBob Badour     maskb =  _mm_slli_epi16 (r, (16 - b)); //to get rounding (b-1)th bit
7774*80a68eefSBob Badour     maskb = _mm_srli_epi16 (maskb, 15); //1 or 0
7775*80a68eefSBob Badour     r = _mm_srai_epi16 (r, b);
7776*80a68eefSBob Badour     r = _mm_add_epi16 (r, maskb); //actual rounding
7777*80a68eefSBob Badour     r = _mm_packs_epi16 (r,r); //we need 64 bits only
7778*80a68eefSBob Badour     return64(r);
7779*80a68eefSBob Badour }
7780*80a68eefSBob Badour 
7781*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vrshr_n_s16(int16x4_t a,  __constrange(1,16) int b); // VRSHR.S16 d0,d0,#16
7782*80a68eefSBob Badour _NEON2SSE_INLINE int16x4_t vrshr_n_s16(int16x4_t a,  __constrange(1,16) int b)
7783*80a68eefSBob Badour {
7784*80a68eefSBob Badour     int16x4_t res64;
7785*80a68eefSBob Badour     return64(vrshrq_n_s16(_pM128i(a), b));
7786*80a68eefSBob Badour }
7787*80a68eefSBob Badour 
7788*80a68eefSBob Badour 
7789*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vrshr_n_s32(int32x2_t a,  __constrange(1,32) int b); // VRSHR.S32 d0,d0,#32
7790*80a68eefSBob Badour _NEON2SSE_INLINE int32x2_t vrshr_n_s32(int32x2_t a,  __constrange(1,32) int b)
7791*80a68eefSBob Badour {
7792*80a68eefSBob Badour     int32x2_t res64;
7793*80a68eefSBob Badour     return64(vrshrq_n_s32(_pM128i(a), b));
7794*80a68eefSBob Badour }
7795*80a68eefSBob Badour 
7796*80a68eefSBob Badour 
7797*80a68eefSBob Badour _NEON2SSESTORAGE int64x1_t vrshr_n_s64(int64x1_t a, __constrange(1,64) int b); // VRSHR.S64 d0,d0,#64
7798*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vrshr_n_s64(int64x1_t a, __constrange(1,64) int b), _NEON2SSE_REASON_SLOW_SERIAL)
7799*80a68eefSBob Badour {
7800*80a68eefSBob Badour     //serial solution is faster
7801*80a68eefSBob Badour     int64x1_t res;
7802*80a68eefSBob Badour     int64_t a_i64 = *( int64_t*)&a;
7803*80a68eefSBob Badour     if(b==64) {
7804*80a68eefSBob Badour         res.m64_i64[0] = 0; //b==64 special case; with some compilers rounding happens here and (a_i64 & _SIGNBIT64)>>63 would be needed instead
7805*80a68eefSBob Badour     } else {
7806*80a68eefSBob Badour         int64_t maskb = a_i64 & (( int64_t)1 << (b - 1));
7807*80a68eefSBob Badour         res.m64_i64[0] = (a_i64 >> b) + (maskb >> (b - 1));
7808*80a68eefSBob Badour     }
7809*80a68eefSBob Badour     return res;
7810*80a68eefSBob Badour }
7811*80a68eefSBob Badour 
7812*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vrshr_n_u8(uint8x8_t a, __constrange(1,8) int b); // VRSHR.U8 d0,d0,#8
7813*80a68eefSBob Badour _NEON2SSE_INLINE uint8x8_t vrshr_n_u8(uint8x8_t a, __constrange(1,8) int b) // VRSHR.U8 d0,d0,#8
7814*80a68eefSBob Badour {
7815*80a68eefSBob Badour     //no 8 bit shift available, go to 16 bit, solution may not be optimal compared with the serial one
7816*80a68eefSBob Badour     uint8x8_t res64;
7817*80a68eefSBob Badour     __m128i r, maskb;
7818*80a68eefSBob Badour     r = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE 4.1
7819*80a68eefSBob Badour     maskb =  _mm_slli_epi16 (r, (16 - b)); //to get rounding (b-1)th bit
7820*80a68eefSBob Badour     maskb = _mm_srli_epi16 (maskb, 15); //1 or 0
7821*80a68eefSBob Badour     r = _mm_srli_epi16 (r, b);
7822*80a68eefSBob Badour     r = _mm_add_epi16 (r, maskb); //actual rounding
7823*80a68eefSBob Badour     r =  _mm_packus_epi16 (r,r); //we need 64 bits only
7824*80a68eefSBob Badour     return64(r);
7825*80a68eefSBob Badour }
7826*80a68eefSBob Badour 
7827*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vrshr_n_u16(uint16x4_t a,  __constrange(1,16) int b); // VRSHR.U16 d0,d0,#16
7828*80a68eefSBob Badour _NEON2SSE_INLINE uint16x4_t vrshr_n_u16(uint16x4_t a,  __constrange(1,16) int b)
7829*80a68eefSBob Badour {
7830*80a68eefSBob Badour     uint16x4_t res64;
7831*80a68eefSBob Badour     return64(vrshrq_n_u16(_pM128i(a), b));
7832*80a68eefSBob Badour }
7833*80a68eefSBob Badour 
7834*80a68eefSBob Badour 
7835*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vrshr_n_u32(uint32x2_t a,  __constrange(1,32) int b); // VRSHR.U32 d0,d0,#32
7836*80a68eefSBob Badour _NEON2SSE_INLINE uint32x2_t vrshr_n_u32(uint32x2_t a,  __constrange(1,32) int b)
7837*80a68eefSBob Badour {
7838*80a68eefSBob Badour     uint32x2_t res64;
7839*80a68eefSBob Badour     return64(vrshrq_n_u32(_pM128i(a), b));
7840*80a68eefSBob Badour }
7841*80a68eefSBob Badour 
7842*80a68eefSBob Badour 
7843*80a68eefSBob Badour _NEON2SSESTORAGE uint64x1_t vrshr_n_u64(uint64x1_t a, __constrange(1,64) int b); // VRSHR.U64 d0,d0,#64
7844*80a68eefSBob Badour _NEON2SSE_INLINE uint64x1_t vrshr_n_u64(uint64x1_t a, __constrange(1,64) int b)
7845*80a68eefSBob Badour {
7846*80a68eefSBob Badour     uint64x1_t res64;
7847*80a68eefSBob Badour     return64(vrshrq_n_u64(_pM128i(a), b));
7848*80a68eefSBob Badour }
7849*80a68eefSBob Badour 
7850*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t vrshrq_n_s8(int8x16_t a, __constrange(1,8) int b); // VRSHR.S8 q0,q0,#8
7851*80a68eefSBob Badour _NEON2SSE_INLINE int8x16_t vrshrq_n_s8(int8x16_t a, __constrange(1,8) int b) // VRSHR.S8 q0,q0,#8
7852*80a68eefSBob Badour {
7853*80a68eefSBob Badour     //no 8 bit shift available, go to 16 bit trick
7854*80a68eefSBob Badour     __m128i r, mask1, maskb;
7855*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 static const uint16_t mask2b[9] = {0x0000, 0x0101, 0x0202, 0x0404, 0x0808, 0x1010, 0x2020, 0x4040, 0x8080}; // bit (b-1), i.e. 2^(b-1), set in each byte
7856*80a68eefSBob Badour     r = vshrq_n_s8 (a, b);
7857*80a68eefSBob Badour     mask1 = _mm_set1_epi16(mask2b[b]); // bit (b-1) set in each byte of the 16 bit lane, needed for rounding
7858*80a68eefSBob Badour     maskb = _mm_and_si128(a, mask1); //isolate the rounding bit (b-1) of each byte, or 0
7859*80a68eefSBob Badour     maskb =  _mm_srli_epi16 (maskb, b - 1); // move it to bit 0 of its byte so it can be added
7860*80a68eefSBob Badour     return _mm_add_epi8(r, maskb); //actual rounding
7861*80a68eefSBob Badour }
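
//Per-byte rounding sketch for the s8 shift above (illustrative only; hypothetical helper name):
//mask2b[b] has bit (b-1) set in both bytes of every 16-bit lane, so one 16-bit AND and one 16-bit
//shift yield the 0/1 rounding addend of each byte without it crossing into the neighbouring byte.
_NEON2SSE_INLINE int8_t _neon2sse_example_rshr_s8(int8_t a, int b) //assumes 1 <= b <= 8
{
    int rounding_bit = (a >> (b - 1)) & 1; //what the mask2b[b] AND plus _mm_srli_epi16 extract per byte
    return (int8_t)((a >> b) + rounding_bit); //what vshrq_n_s8 + _mm_add_epi8 compute lane-wise
}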
7862*80a68eefSBob Badour 
7863*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vrshrq_n_s16(int16x8_t a, __constrange(1,16) int b); // VRSHR.S16 q0,q0,#16
7864*80a68eefSBob Badour _NEON2SSE_INLINE int16x8_t vrshrq_n_s16(int16x8_t a, __constrange(1,16) int b) // VRSHR.S16 q0,q0,#16
7865*80a68eefSBob Badour {
7866*80a68eefSBob Badour     __m128i maskb, r;
7867*80a68eefSBob Badour     maskb =  _mm_slli_epi16(a, (16 - b)); //to get rounding (b-1)th bit
7868*80a68eefSBob Badour     maskb = _mm_srli_epi16(maskb, 15); //1 or 0
7869*80a68eefSBob Badour     r = _mm_srai_epi16 (a, b);
7870*80a68eefSBob Badour     return _mm_add_epi16 (r, maskb); //actual rounding
7871*80a68eefSBob Badour }
7872*80a68eefSBob Badour 
7873*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vrshrq_n_s32(int32x4_t a, __constrange(1,32) int b); // VRSHR.S32 q0,q0,#32
7874*80a68eefSBob Badour _NEON2SSE_INLINE int32x4_t vrshrq_n_s32(int32x4_t a, __constrange(1,32) int b) // VRSHR.S32 q0,q0,#32
7875*80a68eefSBob Badour {
7876*80a68eefSBob Badour     __m128i maskb,  r;
7877*80a68eefSBob Badour     maskb = _mm_slli_epi32 (a, (32 - b)); //to get rounding (b-1)th bit
7878*80a68eefSBob Badour     maskb = _mm_srli_epi32 (maskb,31); //1 or 0
7879*80a68eefSBob Badour     r = _mm_srai_epi32(a, b);
7880*80a68eefSBob Badour     return _mm_add_epi32 (r, maskb); //actual rounding
7881*80a68eefSBob Badour }
7882*80a68eefSBob Badour 
7883*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vrshrq_n_s64(int64x2_t a, __constrange(1,64) int b); // VRSHR.S64 q0,q0,#64
7884*80a68eefSBob Badour _NEON2SSE_INLINE int64x2_t vrshrq_n_s64(int64x2_t a, __constrange(1,64) int b)
7885*80a68eefSBob Badour {
7886*80a68eefSBob Badour     //solution may not be optimal compared with a serial one
7887*80a68eefSBob Badour     __m128i maskb;
7888*80a68eefSBob Badour     int64x2_t r;
7889*80a68eefSBob Badour     maskb = _mm_slli_epi64 (a, (64 - b)); //to get rounding (b-1)th bit
7890*80a68eefSBob Badour     maskb = _mm_srli_epi64 (maskb,63); //1 or 0
7891*80a68eefSBob Badour     r = vshrq_n_s64(a, b);
7892*80a68eefSBob Badour     return _mm_add_epi64 (r, maskb); //actual rounding
7893*80a68eefSBob Badour }
7894*80a68eefSBob Badour 
7895*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vrshrq_n_u8(uint8x16_t a, __constrange(1,8) int b); // VRSHR.U8 q0,q0,#8
7896*80a68eefSBob Badour _NEON2SSE_INLINE uint8x16_t vrshrq_n_u8(uint8x16_t a, __constrange(1,8) int b) // VRSHR.U8 q0,q0,#8
7897*80a68eefSBob Badour {
7898*80a68eefSBob Badour     //no 8 bit shift available, go to 16 bit trick
7899*80a68eefSBob Badour     __m128i r, mask1, maskb;
7900*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 static const uint16_t mask2b[9] = {0x0000, 0x0101, 0x0202, 0x0404, 0x0808, 0x1010, 0x2020, 0x4040, 0x8080}; // bit (b-1), i.e. 2^(b-1), set in each byte
7901*80a68eefSBob Badour     r = vshrq_n_u8 (a, b);
7902*80a68eefSBob Badour     mask1 = _mm_set1_epi16(mask2b[b]); // bit (b-1) set in each byte of the 16 bit lane, needed for rounding
7903*80a68eefSBob Badour     maskb = _mm_and_si128(a, mask1); //isolate the rounding bit (b-1) of each byte, or 0
7904*80a68eefSBob Badour     maskb =  _mm_srli_epi16 (maskb, b - 1); // move it to bit 0 of its byte so it can be added
7905*80a68eefSBob Badour     return _mm_add_epi8(r, maskb); //actual rounding
7906*80a68eefSBob Badour }
7907*80a68eefSBob Badour 
7908*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vrshrq_n_u16(uint16x8_t a, __constrange(1,16) int b); // VRSHR.U16 q0,q0,#16
7909*80a68eefSBob Badour _NEON2SSE_INLINE uint16x8_t vrshrq_n_u16(uint16x8_t a, __constrange(1,16) int b) // VRSHR.U16 q0,q0,#16
7910*80a68eefSBob Badour {
7911*80a68eefSBob Badour     __m128i maskb, r;
7912*80a68eefSBob Badour     maskb =  _mm_slli_epi16(a, (16 - b)); //to get rounding (b-1)th bit
7913*80a68eefSBob Badour     maskb = _mm_srli_epi16(maskb, 15); //1 or 0
7914*80a68eefSBob Badour     r = _mm_srli_epi16 (a, b);
7915*80a68eefSBob Badour     return _mm_add_epi16 (r, maskb); //actual rounding
7916*80a68eefSBob Badour }
7917*80a68eefSBob Badour 
7918*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vrshrq_n_u32(uint32x4_t a, __constrange(1,32) int b); // VRSHR.U32 q0,q0,#32
7919*80a68eefSBob Badour _NEON2SSE_INLINE uint32x4_t vrshrq_n_u32(uint32x4_t a, __constrange(1,32) int b) // VRSHR.U32 q0,q0,#32
7920*80a68eefSBob Badour {
7921*80a68eefSBob Badour     __m128i maskb,  r;
7922*80a68eefSBob Badour     maskb = _mm_slli_epi32 (a, (32 - b)); //to get rounding (b-1)th bit
7923*80a68eefSBob Badour     maskb = _mm_srli_epi32 (maskb,31); //1 or 0
7924*80a68eefSBob Badour     r = _mm_srli_epi32(a, b);
7925*80a68eefSBob Badour     return _mm_add_epi32 (r, maskb); //actual rounding
7926*80a68eefSBob Badour }
7927*80a68eefSBob Badour 
7928*80a68eefSBob Badour _NEON2SSESTORAGE uint64x2_t vrshrq_n_u64(uint64x2_t a, __constrange(1,64) int b); // VRSHR.U64 q0,q0,#64
7929*80a68eefSBob Badour _NEON2SSE_INLINE uint64x2_t vrshrq_n_u64(uint64x2_t a, __constrange(1,64) int b)
7930*80a68eefSBob Badour {
7931*80a68eefSBob Badour     //solution may not be optimal compared with a serial one
7932*80a68eefSBob Badour     __m128i maskb,  r;
7933*80a68eefSBob Badour     maskb = _mm_slli_epi64 (a, (64 - b)); //to get rounding (b-1)th bit
7934*80a68eefSBob Badour     maskb = _mm_srli_epi64 (maskb,63); //1 or 0
7935*80a68eefSBob Badour     r = _mm_srli_epi64(a, b);
7936*80a68eefSBob Badour     return _mm_add_epi64 (r, maskb); //actual rounding
7937*80a68eefSBob Badour }
7938*80a68eefSBob Badour 
7939*80a68eefSBob Badour //************* Vector shift right by constant and accumulate *********
7940*80a68eefSBob Badour //*********************************************************************
7941*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VSRA.S8 d0,d0,#8
7942*80a68eefSBob Badour _NEON2SSE_INLINE int8x8_t vsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c) // VSRA.S8 d0,d0,#8
7943*80a68eefSBob Badour {
7944*80a68eefSBob Badour     int8x8_t shift;
7945*80a68eefSBob Badour     shift = vshr_n_s8(b, c);
7946*80a68eefSBob Badour     return vadd_s8( a, shift);
7947*80a68eefSBob Badour }
7948*80a68eefSBob Badour 
7949*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VSRA.S16 d0,d0,#16
7950*80a68eefSBob Badour _NEON2SSE_INLINE int16x4_t vsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c) // VSRA.S16 d0,d0,#16
7951*80a68eefSBob Badour {
7952*80a68eefSBob Badour     int16x4_t shift;
7953*80a68eefSBob Badour     shift = vshr_n_s16( b, c);
7954*80a68eefSBob Badour     return vadd_s16(a, shift);
7955*80a68eefSBob Badour }
7956*80a68eefSBob Badour 
7957*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VSRA.S32 d0,d0,#32
7958*80a68eefSBob Badour _NEON2SSE_INLINE int32x2_t vsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c) // VSRA.S32 d0,d0,#32
7959*80a68eefSBob Badour {
7960*80a68eefSBob Badour     //may not be optimal compared with the serial execution
7961*80a68eefSBob Badour     int32x2_t shift;
7962*80a68eefSBob Badour     shift = vshr_n_s32(b, c);
7963*80a68eefSBob Badour     return vadd_s32( a, shift);
7964*80a68eefSBob Badour }
7965*80a68eefSBob Badour 
7966*80a68eefSBob Badour _NEON2SSESTORAGE int64x1_t vsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VSRA.S64 d0,d0,#64
7967*80a68eefSBob Badour _NEON2SSE_INLINE int64x1_t vsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c)
7968*80a68eefSBob Badour {
7969*80a68eefSBob Badour     //may not be optimal compared with a serial solution
7970*80a68eefSBob Badour     int64x1_t shift;
7971*80a68eefSBob Badour     shift = vshr_n_s64(b, c);
7972*80a68eefSBob Badour     return vadd_s64( a, shift);
7973*80a68eefSBob Badour }
7974*80a68eefSBob Badour 
7975*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VSRA.U8 d0,d0,#8
7976*80a68eefSBob Badour _NEON2SSE_INLINE uint8x8_t vsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c) // VSRA.U8 d0,d0,#8
7977*80a68eefSBob Badour {
7978*80a68eefSBob Badour     uint8x8_t shift;
7979*80a68eefSBob Badour     shift = vshr_n_u8(b, c);
7980*80a68eefSBob Badour     return vadd_u8(a, shift);
7981*80a68eefSBob Badour }
7982*80a68eefSBob Badour 
7983*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VSRA.U16 d0,d0,#16
7984*80a68eefSBob Badour _NEON2SSE_INLINE uint16x4_t vsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c) // VSRA.U16 d0,d0,#16
7985*80a68eefSBob Badour {
7986*80a68eefSBob Badour     uint16x4_t shift;
7987*80a68eefSBob Badour     shift = vshr_n_u16(b, c);
7988*80a68eefSBob Badour     return vadd_u16(a,shift);
7989*80a68eefSBob Badour }
7990*80a68eefSBob Badour 
7991*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VSRA.U32 d0,d0,#32
7992*80a68eefSBob Badour _NEON2SSE_INLINE uint32x2_t vsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c) // VSRA.U32 d0,d0,#32
7993*80a68eefSBob Badour {
7994*80a68eefSBob Badour     //may not be optimal compared with the serial execution
7995*80a68eefSBob Badour     uint32x2_t shift;
7996*80a68eefSBob Badour     shift = vshr_n_u32(b, c);
7997*80a68eefSBob Badour     return vadd_u32( a, shift);
7998*80a68eefSBob Badour }
7999*80a68eefSBob Badour 
8000*80a68eefSBob Badour _NEON2SSESTORAGE uint64x1_t vsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VSRA.U64 d0,d0,#64
8001*80a68eefSBob Badour _NEON2SSE_INLINE uint64x1_t vsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c) // VSRA.U64 d0,d0,#64
8002*80a68eefSBob Badour {
8003*80a68eefSBob Badour     //may not be optimal compared with the serial execution
8004*80a68eefSBob Badour     uint64x1_t shift;
8005*80a68eefSBob Badour     shift = vshr_n_u64(b, c);
8006*80a68eefSBob Badour     return vadd_u64(a, shift);
8007*80a68eefSBob Badour }
8008*80a68eefSBob Badour 
8009*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t vsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VSRA.S8 q0,q0,#8
8010*80a68eefSBob Badour _NEON2SSE_INLINE int8x16_t vsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c) // VSRA.S8 q0,q0,#8
8011*80a68eefSBob Badour {
8012*80a68eefSBob Badour     int8x16_t shift;
8013*80a68eefSBob Badour     shift = vshrq_n_s8(b, c);
8014*80a68eefSBob Badour     return vaddq_s8(a, shift);
8015*80a68eefSBob Badour }
8016*80a68eefSBob Badour 
8017*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VSRA.S16 q0,q0,#16
8018*80a68eefSBob Badour _NEON2SSE_INLINE int16x8_t vsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c) // VSRA.S16 q0,q0,#16
8019*80a68eefSBob Badour {
8020*80a68eefSBob Badour     int16x8_t shift;
8021*80a68eefSBob Badour     shift = vshrq_n_s16(b, c);
8022*80a68eefSBob Badour     return vaddq_s16(a, shift);
8023*80a68eefSBob Badour }
8024*80a68eefSBob Badour 
8025*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VSRA.S32 q0,q0,#32
8026*80a68eefSBob Badour _NEON2SSE_INLINE int32x4_t vsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c) // VSRA.S32 q0,q0,#32
8027*80a68eefSBob Badour {
8028*80a68eefSBob Badour     int32x4_t shift;
8029*80a68eefSBob Badour     shift = vshrq_n_s32(b, c);
8030*80a68eefSBob Badour     return vaddq_s32(a, shift);
8031*80a68eefSBob Badour }
8032*80a68eefSBob Badour 
8033*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VSRA.S64 q0,q0,#64
8034*80a68eefSBob Badour _NEON2SSE_INLINE int64x2_t vsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c) // VSRA.S64 q0,q0,#64
8035*80a68eefSBob Badour {
8036*80a68eefSBob Badour     int64x2_t shift;
8037*80a68eefSBob Badour     shift = vshrq_n_s64(b, c);
8038*80a68eefSBob Badour     return vaddq_s64( a, shift);
8039*80a68eefSBob Badour }
8040*80a68eefSBob Badour 
8041*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VSRA.U8 q0,q0,#8
8042*80a68eefSBob Badour _NEON2SSE_INLINE uint8x16_t vsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c) // VSRA.U8 q0,q0,#8
8043*80a68eefSBob Badour {
8044*80a68eefSBob Badour     uint8x16_t shift;
8045*80a68eefSBob Badour     shift = vshrq_n_u8(b, c);
8046*80a68eefSBob Badour     return vaddq_u8(a, shift);
8047*80a68eefSBob Badour }
8048*80a68eefSBob Badour 
8049*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VSRA.U16 q0,q0,#16
8050*80a68eefSBob Badour _NEON2SSE_INLINE uint16x8_t vsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c) // VSRA.U16 q0,q0,#16
8051*80a68eefSBob Badour {
8052*80a68eefSBob Badour     uint16x8_t shift;
8053*80a68eefSBob Badour     shift = vshrq_n_u16(b, c);
8054*80a68eefSBob Badour     return vaddq_u16(a,  shift);
8055*80a68eefSBob Badour }
8056*80a68eefSBob Badour 
8057*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VSRA.U32 q0,q0,#32
8058*80a68eefSBob Badour _NEON2SSE_INLINE uint32x4_t vsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c) // VSRA.U32 q0,q0,#32
8059*80a68eefSBob Badour {
8060*80a68eefSBob Badour     uint32x4_t shift;
8061*80a68eefSBob Badour     shift = vshrq_n_u32(b, c);
8062*80a68eefSBob Badour     return vaddq_u32(a, shift);
8063*80a68eefSBob Badour }
8064*80a68eefSBob Badour 
8065*80a68eefSBob Badour _NEON2SSESTORAGE uint64x2_t vsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VSRA.U64 q0,q0,#64
8066*80a68eefSBob Badour _NEON2SSE_INLINE uint64x2_t vsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c) // VSRA.U64 q0,q0,#64
8067*80a68eefSBob Badour {
8068*80a68eefSBob Badour     uint64x2_t shift;
8069*80a68eefSBob Badour     shift = vshrq_n_u64(b, c);
8070*80a68eefSBob Badour     return vaddq_u64(a, shift);
8071*80a68eefSBob Badour }
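
//Usage sketch (illustrative only; the function name and the shift amount 4 are arbitrary examples):
//accumulate a scaled-down copy of src into acc lane-wise, acc[i] += src[i] >> 4, which is exactly
//the vshrq_n_s16 + vaddq_s16 pair the wrappers above expand to.
_NEON2SSE_INLINE int16x8_t _neon2sse_example_vsra_usage(int16x8_t acc, int16x8_t src)
{
    return vsraq_n_s16(acc, src, 4);
}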
8072*80a68eefSBob Badour 
8073*80a68eefSBob Badour //************* Vector rounding shift right by constant and accumulate ****************************
8074*80a68eefSBob Badour //************************************************************************************************
8075*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vrsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VRSRA.S8 d0,d0,#8
8076*80a68eefSBob Badour _NEON2SSE_INLINE int8x8_t vrsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c) // VRSRA.S8 d0,d0,#8
8077*80a68eefSBob Badour {
8078*80a68eefSBob Badour     int8x8_t shift;
8079*80a68eefSBob Badour     shift = vrshr_n_s8(b, c);
8080*80a68eefSBob Badour     return vadd_s8( a, shift);
8081*80a68eefSBob Badour }
8082*80a68eefSBob Badour 
8083*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vrsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VRSRA.S16 d0,d0,#16
8084*80a68eefSBob Badour _NEON2SSE_INLINE int16x4_t vrsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c) // VRSRA.S16 d0,d0,#16
8085*80a68eefSBob Badour {
8086*80a68eefSBob Badour     int16x4_t shift;
8087*80a68eefSBob Badour     shift = vrshr_n_s16( b, c);
8088*80a68eefSBob Badour     return vadd_s16(a, shift);
8089*80a68eefSBob Badour }
8090*80a68eefSBob Badour 
8091*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vrsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VRSRA.S32 d0,d0,#32
8092*80a68eefSBob Badour _NEON2SSE_INLINE int32x2_t vrsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c) // VRSRA.S32 d0,d0,#32
8093*80a68eefSBob Badour {
8094*80a68eefSBob Badour     //may not be optimal compared with the serial execution
8095*80a68eefSBob Badour     int32x2_t shift;
8096*80a68eefSBob Badour     shift = vrshr_n_s32(b, c);
8097*80a68eefSBob Badour     return vadd_s32( a, shift);
8098*80a68eefSBob Badour }
8099*80a68eefSBob Badour 
8100*80a68eefSBob Badour _NEON2SSESTORAGE int64x1_t vrsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VRSRA.S64 d0,d0,#64
8101*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vrsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution
8102*80a68eefSBob Badour {
8103*80a68eefSBob Badour     int64x1_t shift;
8104*80a68eefSBob Badour     shift = vrshr_n_s64(b, c);
8105*80a68eefSBob Badour     return vadd_s64( a, shift);
8106*80a68eefSBob Badour }
8107*80a68eefSBob Badour 
8108*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vrsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VRSRA.U8 d0,d0,#8
8109*80a68eefSBob Badour _NEON2SSE_INLINE uint8x8_t vrsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c) // VRSRA.U8 d0,d0,#8
8110*80a68eefSBob Badour {
8111*80a68eefSBob Badour     uint8x8_t shift;
8112*80a68eefSBob Badour     shift = vrshr_n_u8(b, c);
8113*80a68eefSBob Badour     return vadd_u8(a, shift);
8114*80a68eefSBob Badour }
8115*80a68eefSBob Badour 
8116*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vrsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VRSRA.U16 d0,d0,#16
8117*80a68eefSBob Badour _NEON2SSE_INLINE uint16x4_t vrsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c) // VRSRA.U16 d0,d0,#16
8118*80a68eefSBob Badour {
8119*80a68eefSBob Badour     uint16x4_t shift;
8120*80a68eefSBob Badour     shift = vrshr_n_u16(b, c);
8121*80a68eefSBob Badour     return vadd_u16(a,shift);
8122*80a68eefSBob Badour }
8123*80a68eefSBob Badour 
8124*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vrsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VRSRA.U32 d0,d0,#32
8125*80a68eefSBob Badour _NEON2SSE_INLINE uint32x2_t vrsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c) // VRSRA.U32 d0,d0,#32
8126*80a68eefSBob Badour {
8127*80a68eefSBob Badour     //may not be optimal compared with the serial execution
8128*80a68eefSBob Badour     uint32x2_t shift;
8129*80a68eefSBob Badour     shift = vrshr_n_u32(b, c);
8130*80a68eefSBob Badour     return vadd_u32( a, shift);
8131*80a68eefSBob Badour }
8132*80a68eefSBob Badour 
8133*80a68eefSBob Badour _NEON2SSESTORAGE uint64x1_t vrsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VRSRA.U64 d0,d0,#64
8134*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vrsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution
8135*80a68eefSBob Badour {
8136*80a68eefSBob Badour     //may not be optimal compared with the serial execution
8137*80a68eefSBob Badour     uint64x1_t shift;
8138*80a68eefSBob Badour     shift = vrshr_n_u64(b, c);
8139*80a68eefSBob Badour     return vadd_u64( a, shift);
8140*80a68eefSBob Badour }
8141*80a68eefSBob Badour 
8142*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t vrsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VRSRA.S8 q0,q0,#8
8143*80a68eefSBob Badour _NEON2SSE_INLINE int8x16_t vrsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c) // VRSRA.S8 q0,q0,#8
8144*80a68eefSBob Badour {
8145*80a68eefSBob Badour     int8x16_t shift;
8146*80a68eefSBob Badour     shift = vrshrq_n_s8(b, c);
8147*80a68eefSBob Badour     return vaddq_s8(a, shift);
8148*80a68eefSBob Badour }
8149*80a68eefSBob Badour 
8150*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vrsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VRSRA.S16 q0,q0,#16
8151*80a68eefSBob Badour _NEON2SSE_INLINE int16x8_t vrsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c) // VRSRA.S16 q0,q0,#16
8152*80a68eefSBob Badour {
8153*80a68eefSBob Badour     int16x8_t shift;
8154*80a68eefSBob Badour     shift = vrshrq_n_s16(b, c);
8155*80a68eefSBob Badour     return vaddq_s16(a, shift);
8156*80a68eefSBob Badour }
8157*80a68eefSBob Badour 
8158*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vrsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VRSRA.S32 q0,q0,#32
8159*80a68eefSBob Badour _NEON2SSE_INLINE int32x4_t vrsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c) // VRSRA.S32 q0,q0,#32
8160*80a68eefSBob Badour {
8161*80a68eefSBob Badour     int32x4_t shift;
8162*80a68eefSBob Badour     shift = vrshrq_n_s32(b, c);
8163*80a68eefSBob Badour     return vaddq_s32(a, shift);
8164*80a68eefSBob Badour }
8165*80a68eefSBob Badour 
8166*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vrsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VRSRA.S64 q0,q0,#64
8167*80a68eefSBob Badour _NEON2SSE_INLINE int64x2_t vrsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c)
8168*80a68eefSBob Badour {
8169*80a68eefSBob Badour     int64x2_t shift;
8170*80a68eefSBob Badour     shift = vrshrq_n_s64(b, c);
8171*80a68eefSBob Badour     return vaddq_s64(a, shift);
8172*80a68eefSBob Badour }
8173*80a68eefSBob Badour 
8174*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vrsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VRSRA.U8 q0,q0,#8
8175*80a68eefSBob Badour _NEON2SSE_INLINE uint8x16_t vrsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c) // VRSRA.U8 q0,q0,#8
8176*80a68eefSBob Badour {
8177*80a68eefSBob Badour     uint8x16_t shift;
8178*80a68eefSBob Badour     shift = vrshrq_n_u8(b, c);
8179*80a68eefSBob Badour     return vaddq_u8(a, shift);
8180*80a68eefSBob Badour }
8181*80a68eefSBob Badour 
8182*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vrsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VRSRA.U16 q0,q0,#16
8183*80a68eefSBob Badour _NEON2SSE_INLINE uint16x8_t vrsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c) // VRSRA.U16 q0,q0,#16
8184*80a68eefSBob Badour {
8185*80a68eefSBob Badour     uint16x8_t shift;
8186*80a68eefSBob Badour     shift = vrshrq_n_u16(b, c);
8187*80a68eefSBob Badour     return vaddq_u16(a,  shift);
8188*80a68eefSBob Badour }
8189*80a68eefSBob Badour 
8190*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vrsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VRSRA.U32 q0,q0,#32
8191*80a68eefSBob Badour _NEON2SSE_INLINE uint32x4_t vrsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c) // VRSRA.U32 q0,q0,#32
8192*80a68eefSBob Badour {
8193*80a68eefSBob Badour     uint32x4_t shift;
8194*80a68eefSBob Badour     shift = vrshrq_n_u32(b, c);
8195*80a68eefSBob Badour     return vaddq_u32(a, shift);
8196*80a68eefSBob Badour }
8197*80a68eefSBob Badour 
8198*80a68eefSBob Badour _NEON2SSESTORAGE uint64x2_t vrsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VRSRA.U64 q0,q0,#64
8199*80a68eefSBob Badour _NEON2SSE_INLINE uint64x2_t vrsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c)
8200*80a68eefSBob Badour {
8201*80a68eefSBob Badour     uint64x2_t shift;
8202*80a68eefSBob Badour     shift = vrshrq_n_u64(b, c);
8203*80a68eefSBob Badour     return vaddq_u64(a, shift);
8204*80a68eefSBob Badour }
8205*80a68eefSBob Badour 
8206*80a68eefSBob Badour //**********************Vector saturating shift left by constant *****************************
8207*80a68eefSBob Badour //********************************************************************************************
8208*80a68eefSBob Badour //we don't check const ranges, assuming they are met
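//Scalar sketch of the saturation rule used in this section (illustrative only; hypothetical helper
//name): the value is shifted in a wider type and then clamped to the destination range, which is
//what the widen + _mm_packs/_mm_packus sequences below achieve for free.
_NEON2SSE_INLINE int16_t _neon2sse_example_qshl_s16(int16_t a, int b) //assumes 0 <= b <= 15
{
    int32_t wide = (int32_t)a << b; //widen first so the shift itself cannot overflow
    if (wide > 32767) return (int16_t)32767;
    if (wide < -32768) return (int16_t)(-32768);
    return (int16_t)wide;
}
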
8209*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vqshl_n_s8(int8x8_t a, __constrange(0,7) int b); // VQSHL.S8 d0,d0,#0
8210*80a68eefSBob Badour _NEON2SSE_INLINE int8x8_t vqshl_n_s8(int8x8_t a, __constrange(0,7) int b) // VQSHL.S8 d0,d0,#0
8211*80a68eefSBob Badour {
8212*80a68eefSBob Badour     //no 8 bit shift available in IA32 SIMD, go to 16 bit. It also provides the auto saturation (in packs function)
8213*80a68eefSBob Badour     int8x8_t res64;
8214*80a68eefSBob Badour     __m128i a128, r128;
8215*80a68eefSBob Badour     a128 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
8216*80a68eefSBob Badour     r128 = _mm_slli_epi16 (a128, b);
8217*80a68eefSBob Badour     r128 = _mm_packs_epi16 (r128,r128); //saturated s8, use 64 low bits only
8218*80a68eefSBob Badour     return64(r128);
8219*80a68eefSBob Badour }
8220*80a68eefSBob Badour 
8221*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vqshl_n_s16(int16x4_t a, __constrange(0,15) int b); // VQSHL.S16 d0,d0,#0
8222*80a68eefSBob Badour _NEON2SSE_INLINE int16x4_t vqshl_n_s16(int16x4_t a, __constrange(0,15) int b) // VQSHL.S16 d0,d0,#0
8223*80a68eefSBob Badour {
8224*80a68eefSBob Badour     // go to 32 bit to get the auto saturation (in packs function)
8225*80a68eefSBob Badour     int16x4_t res64;
8226*80a68eefSBob Badour     __m128i a128, r128;
8227*80a68eefSBob Badour     a128 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE 4.1
8228*80a68eefSBob Badour     r128 = _mm_slli_epi32 (a128, b); //shift_res
8229*80a68eefSBob Badour     r128 = _mm_packs_epi32 (r128,r128); //saturated s16, use 64 low bits only
8230*80a68eefSBob Badour     return64(r128);
8231*80a68eefSBob Badour }
8232*80a68eefSBob Badour 
8233*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vqshl_n_s32(int32x2_t a,  __constrange(0,31) int b); // VQSHL.S32 d0,d0,#0
8234*80a68eefSBob Badour _NEON2SSE_INLINE int32x2_t vqshl_n_s32(int32x2_t a,  __constrange(0,31) int b)
8235*80a68eefSBob Badour {
8236*80a68eefSBob Badour     //serial execution may be faster
8237*80a68eefSBob Badour     int32x2_t res64;
8238*80a68eefSBob Badour     return64(vqshlq_n_s32 (_pM128i(a), b));
8239*80a68eefSBob Badour }
8240*80a68eefSBob Badour 
8241*80a68eefSBob Badour 
8242*80a68eefSBob Badour _NEON2SSESTORAGE int64x1_t vqshl_n_s64(int64x1_t a, __constrange(0,63) int b); // VQSHL.S64 d0,d0,#0
8243*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqshl_n_s64(int64x1_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL)
8244*80a68eefSBob Badour {
8245*80a68eefSBob Badour     // no effective SIMD solution here
8246*80a68eefSBob Badour     int64x1_t res;
8247*80a68eefSBob Badour     int64_t bmask;
8248*80a68eefSBob Badour     int64_t a_i64 = *( int64_t*)&a;
8249*80a68eefSBob Badour     bmask = ( int64_t)1 << (63 - b); //positive
8250*80a68eefSBob Badour     if (a_i64 >= bmask) {
8251*80a68eefSBob Badour         res.m64_i64[0] = ~(_SIGNBIT64);
8252*80a68eefSBob Badour     } else {
8253*80a68eefSBob Badour         res.m64_i64[0]  = (a_i64 <= -bmask) ? _SIGNBIT64 : a_i64 << b;
8254*80a68eefSBob Badour     }
8255*80a68eefSBob Badour     return res;
8256*80a68eefSBob Badour }
8257*80a68eefSBob Badour 
8258*80a68eefSBob Badour 
8259*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vqshl_n_u8(uint8x8_t a, __constrange(0,7) int b); // VQSHL.U8 d0,d0,#0
8260*80a68eefSBob Badour _NEON2SSE_INLINE uint8x8_t vqshl_n_u8(uint8x8_t a, __constrange(0,7) int b) // VQSHL.U8 d0,d0,#0
8261*80a68eefSBob Badour {
8262*80a68eefSBob Badour     //no 8 bit shift available in IA32 SIMD, go to 16 bit
8263*80a68eefSBob Badour     uint8x8_t res64;
8264*80a68eefSBob Badour     __m128i a128, r128;
8265*80a68eefSBob Badour     a128 = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE 4.1
8266*80a68eefSBob Badour     r128 = _mm_slli_epi16 (a128, b); //shift_res
8267*80a68eefSBob Badour     r128 = _mm_packus_epi16 (r128,r128); //saturated u8, use 64 low bits only
8268*80a68eefSBob Badour     return64(r128);
8269*80a68eefSBob Badour }
8270*80a68eefSBob Badour 
8271*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vqshl_n_u16(uint16x4_t a, __constrange(0,15) int b); // VQSHL.U16 d0,d0,#0
8272*80a68eefSBob Badour _NEON2SSE_INLINE uint16x4_t vqshl_n_u16(uint16x4_t a, __constrange(0,15) int b) // VQSHL.U16 d0,d0,#0
8273*80a68eefSBob Badour {
8274*80a68eefSBob Badour     // go to 32 bit to get the auto saturation (in packus function)
8275*80a68eefSBob Badour     uint16x4_t res64;
8276*80a68eefSBob Badour     __m128i a128, r128;
8277*80a68eefSBob Badour     a128 = _MM_CVTEPU16_EPI32 (_pM128i(a)); //SSE 4.1
8278*80a68eefSBob Badour     r128 = _mm_slli_epi32 (a128, b); //shift_res
8279*80a68eefSBob Badour     r128 = _MM_PACKUS1_EPI32 (r128); //saturated u16
8280*80a68eefSBob Badour     return64(r128);
8281*80a68eefSBob Badour }
8282*80a68eefSBob Badour 
8283*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vqshl_n_u32(uint32x2_t a,  __constrange(0,31) int b); // VQSHL.U32 d0,d0,#0
8284*80a68eefSBob Badour _NEON2SSE_INLINE uint32x2_t vqshl_n_u32(uint32x2_t a,  __constrange(0,31) int b)
8285*80a68eefSBob Badour {
8286*80a68eefSBob Badour     uint32x2_t res64;
8287*80a68eefSBob Badour     return64(vqshlq_n_u32(_pM128i(a), b));
8288*80a68eefSBob Badour }
8289*80a68eefSBob Badour 
8290*80a68eefSBob Badour _NEON2SSESTORAGE uint64x1_t vqshl_n_u64(uint64x1_t a, __constrange(0,63) int b); // VQSHL.U64 d0,d0,#0
8291*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqshl_n_u64(uint64x1_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL)
8292*80a68eefSBob Badour {
8293*80a68eefSBob Badour     // no effective SIMD solution here
8294*80a68eefSBob Badour     uint64x1_t res;
8295*80a68eefSBob Badour     uint64_t bmask;
8296*80a68eefSBob Badour     uint64_t a_i64 = *(uint64_t*)&a;
8297*80a68eefSBob Badour     bmask = (b == 0) ? 0 : (( uint64_t)1 << (64 - b)); //b==0 is handled below; computing the shift by 64 here would be undefined
8298*80a68eefSBob Badour     res.m64_u64[0] = (a_i64 >= bmask)&&(b>0) ? 0xffffffffffffffff : a_i64 << b; //if b=0 we are fine with any a
8299*80a68eefSBob Badour     return res;
8300*80a68eefSBob Badour }
8301*80a68eefSBob Badour 
8302*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t vqshlq_n_s8(int8x16_t a, __constrange(0,7) int b); // VQSHL.S8 q0,q0,#0
8303*80a68eefSBob Badour _NEON2SSE_INLINE int8x16_t vqshlq_n_s8(int8x16_t a, __constrange(0,7) int b) // VQSHL.S8 q0,q0,#0
8304*80a68eefSBob Badour {
8305*80a68eefSBob Badour     // go to 16 bit to get the auto saturation (in packs function)
8306*80a68eefSBob Badour     __m128i a128, r128_1, r128_2;
8307*80a68eefSBob Badour     a128 = _MM_CVTEPI8_EPI16 (a); //SSE 4.1
8308*80a68eefSBob Badour     r128_1 = _mm_slli_epi16 (a128, b);
8309*80a68eefSBob Badour     //swap hi and low part of a128 to process the remaining data
8310*80a68eefSBob Badour     a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
8311*80a68eefSBob Badour     a128 = _MM_CVTEPI8_EPI16 (a128);
8312*80a68eefSBob Badour     r128_2 = _mm_slli_epi16 (a128, b);
8313*80a68eefSBob Badour     return _mm_packs_epi16 (r128_1, r128_2); //saturated s8
8314*80a68eefSBob Badour }
8315*80a68eefSBob Badour 
8316*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vqshlq_n_s16(int16x8_t a, __constrange(0,15) int b); // VQSHL.S16 q0,q0,#0
8317*80a68eefSBob Badour _NEON2SSE_INLINE int16x8_t vqshlq_n_s16(int16x8_t a, __constrange(0,15) int b) // VQSHL.S16 q0,q0,#0
8318*80a68eefSBob Badour {
8319*80a68eefSBob Badour     // manual saturation solution looks LESS optimal than 32 bits conversion one
8320*80a68eefSBob Badour     // go to 32 bit to get the auto saturation (in packs function)
8321*80a68eefSBob Badour     __m128i a128, r128_1, r128_2;
8322*80a68eefSBob Badour     a128 = _MM_CVTEPI16_EPI32 (a); //SSE 4.1
8323*80a68eefSBob Badour     r128_1 = _mm_slli_epi32 (a128, b); //shift_res
8324*80a68eefSBob Badour     //swap hi and low part of a128 to process the remaining data
8325*80a68eefSBob Badour     a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
8326*80a68eefSBob Badour     a128 = _MM_CVTEPI16_EPI32 (a128);
8327*80a68eefSBob Badour     r128_2 = _mm_slli_epi32 (a128, b);
8328*80a68eefSBob Badour     return _mm_packs_epi32 (r128_1, r128_2); //saturated s16
8329*80a68eefSBob Badour }
8330*80a68eefSBob Badour 
8331*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vqshlq_n_s32(int32x4_t a, __constrange(0,31) int b); // VQSHL.S32 q0,q0,#0
8332*80a68eefSBob Badour _NEON2SSE_INLINE int32x4_t vqshlq_n_s32(int32x4_t a, __constrange(0,31) int b) // VQSHL.S32 q0,q0,#0
8333*80a68eefSBob Badour {
8334*80a68eefSBob Badour     // no 64 bit saturation option available, special tricks necessary
8335*80a68eefSBob Badour     __m128i c1, maskA, saturation_mask, c7ffffff_mask, shift_res, shift_res_mask;
8336*80a68eefSBob Badour     c1 = _mm_cmpeq_epi32(a,a); //0xff..ff
8337*80a68eefSBob Badour     maskA = _mm_srli_epi32(c1, b + 1); //positive saturation threshold: (b+1) leading zeros then (31-b) ones, i.e. 2^(31-b) - 1
8338*80a68eefSBob Badour     saturation_mask = _mm_cmpgt_epi32 (a, maskA); //0xff...ff if we need saturation, 0  otherwise
8339*80a68eefSBob Badour     c7ffffff_mask  = _mm_srli_epi32(saturation_mask, 1); //saturated to 0x7f..ff when needed and zeros if not
8340*80a68eefSBob Badour     shift_res = _mm_slli_epi32 (a, b);
8341*80a68eefSBob Badour     shift_res_mask = _mm_andnot_si128(saturation_mask, shift_res);
8342*80a68eefSBob Badour     //result with positive numbers saturated
8343*80a68eefSBob Badour     shift_res = _mm_or_si128 (c7ffffff_mask, shift_res_mask);
8344*80a68eefSBob Badour     //treat negative numbers
8345*80a68eefSBob Badour     maskA = _mm_slli_epi32(c1, 31 - b); //negative saturation threshold: (b+1) leading ones then (31-b) zeros, i.e. -2^(31-b)
8346*80a68eefSBob Badour     saturation_mask = _mm_cmpgt_epi32 (maskA,a); //0xff...ff if we need saturation, 0  otherwise
8347*80a68eefSBob Badour     c7ffffff_mask  = _mm_slli_epi32(saturation_mask, 31); //saturated to 0x80..00 when needed and zeros if not
8348*80a68eefSBob Badour     shift_res_mask = _mm_andnot_si128(saturation_mask, shift_res);
8349*80a68eefSBob Badour     return _mm_or_si128 (c7ffffff_mask, shift_res_mask);
8350*80a68eefSBob Badour }
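
//Scalar restatement of the thresholds used above (illustrative only; hypothetical helper name):
//a signed 32-bit value survives << b iff -2^(31-b) <= a <= 2^(31-b) - 1; anything above saturates
//to 0x7fffffff and anything below to 0x80000000, which is what the two mask passes implement.
_NEON2SSE_INLINE int32_t _neon2sse_example_qshl_s32(int32_t a, int b) //assumes 0 <= b <= 31
{
    int32_t hi_limit = (int32_t)((uint32_t)0x7fffffff >> b); //2^(31-b) - 1, the srli-generated mask above
    int32_t lo_limit = (int32_t)((uint32_t)0xffffffff << (31 - b)); //-2^(31-b), the slli-generated mask above
    if (a > hi_limit) return (int32_t)0x7fffffff;
    if (a < lo_limit) return (int32_t)0x80000000;
    return (int32_t)((uint32_t)a << b); //shift an unsigned copy to avoid signed-overflow pitfalls
}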
8351*80a68eefSBob Badour 
8352*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vqshlq_n_s64(int64x2_t a, __constrange(0,63) int b); // VQSHL.S64 q0,q0,#0
8353*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqshlq_n_s64(int64x2_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL)
8354*80a68eefSBob Badour {
8355*80a68eefSBob Badour     // no effective SIMD solution here
8356*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 int64_t atmp[2], res[2];
8357*80a68eefSBob Badour     int64_t bmask;
8358*80a68eefSBob Badour     int i;
8359*80a68eefSBob Badour     bmask = ( int64_t)1 << (63 - b); //positive
8360*80a68eefSBob Badour     _mm_store_si128((__m128i*)atmp, a);
8361*80a68eefSBob Badour     for (i = 0; i<2; i++) {
8362*80a68eefSBob Badour         if (atmp[i] >= bmask) {
8363*80a68eefSBob Badour             res[i] = ~(_SIGNBIT64);
8364*80a68eefSBob Badour         } else {
8365*80a68eefSBob Badour             res[i] = (atmp[i] <= -bmask) ? _SIGNBIT64 : atmp[i] << b;
8366*80a68eefSBob Badour         }
8367*80a68eefSBob Badour     }
8368*80a68eefSBob Badour     return _mm_load_si128((__m128i*)res);
8369*80a68eefSBob Badour }
8370*80a68eefSBob Badour 
8371*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vqshlq_n_u8(uint8x16_t a, __constrange(0,7) int b); // VQSHL.U8 q0,q0,#0
8372*80a68eefSBob Badour _NEON2SSE_INLINE uint8x16_t vqshlq_n_u8(uint8x16_t a, __constrange(0,7) int b) // VQSHL.U8 q0,q0,#0
8373*80a68eefSBob Badour {
8374*80a68eefSBob Badour     // go to 16 bit to get the auto saturation (in packs function)
8375*80a68eefSBob Badour     __m128i a128, r128_1, r128_2;
8376*80a68eefSBob Badour     a128 = _MM_CVTEPU8_EPI16 (a); //SSE 4.1
8377*80a68eefSBob Badour     r128_1 = _mm_slli_epi16 (a128, b);
8378*80a68eefSBob Badour     //swap hi and low part of a128 to process the remaining data
8379*80a68eefSBob Badour     a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
8380*80a68eefSBob Badour     a128 = _MM_CVTEPU8_EPI16 (a128);
8381*80a68eefSBob Badour     r128_2 = _mm_slli_epi16 (a128, b);
8382*80a68eefSBob Badour     return _mm_packus_epi16 (r128_1, r128_2); //saturated u8
8383*80a68eefSBob Badour }
8384*80a68eefSBob Badour 
8385*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vqshlq_n_u16(uint16x8_t a, __constrange(0,15) int b); // VQSHL.U16 q0,q0,#0
8386*80a68eefSBob Badour _NEON2SSE_INLINE uint16x8_t vqshlq_n_u16(uint16x8_t a, __constrange(0,15) int b) // VQSHL.U16 q0,q0,#0
8387*80a68eefSBob Badour {
8388*80a68eefSBob Badour     // manual saturation solution looks more optimal than 32 bits conversion one
8389*80a68eefSBob Badour     __m128i cb, c8000, a_signed, saturation_mask,  shift_res;
8390*80a68eefSBob Badour     cb = _mm_set1_epi16((1 << (16 - b)) - 1 - 0x8000 );
8391*80a68eefSBob Badour     c8000 = _mm_set1_epi16 ((int16_t)0x8000);
8392*80a68eefSBob Badour     //no unsigned shorts comparison in SSE, only signed available, so need the trick
8393*80a68eefSBob Badour     a_signed = _mm_sub_epi16(a, c8000); //go to signed
8394*80a68eefSBob Badour     saturation_mask = _mm_cmpgt_epi16 (a_signed, cb);
8395*80a68eefSBob Badour     shift_res = _mm_slli_epi16 (a, b);
8396*80a68eefSBob Badour     return _mm_or_si128 (shift_res, saturation_mask);
8397*80a68eefSBob Badour }
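
//The biasing trick above in scalar form (illustrative only; hypothetical helper name): SSE2 has no
//unsigned 16-bit compare, but subtracting 0x8000 from both operands maps unsigned order onto signed order.
_NEON2SSE_INLINE int _neon2sse_example_u16_gt(uint16_t x, uint16_t y)
{
    return (int16_t)(x - 0x8000) > (int16_t)(y - 0x8000); //same ordering as x > y on the unsigned values
}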
8398*80a68eefSBob Badour 
8399*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vqshlq_n_u32(uint32x4_t a, __constrange(0,31) int b); // VQSHL.U32 q0,q0,#0
8400*80a68eefSBob Badour _NEON2SSE_INLINE uint32x4_t vqshlq_n_u32(uint32x4_t a, __constrange(0,31) int b) // VQSHL.U32 q0,q0,#0
8401*80a68eefSBob Badour {
8402*80a68eefSBob Badour     // manual saturation solution, no 64 bit saturation option, the serial version may be faster
8403*80a68eefSBob Badour     __m128i cb, c80000000, a_signed, saturation_mask,  shift_res;
8404*80a68eefSBob Badour     cb = _mm_set1_epi32((~(uint32_t)0 >> b) - 0x80000000 ); //2^(32-b) - 1 biased to the signed domain; written via >> so it stays defined for b < 2
8405*80a68eefSBob Badour     c80000000 = _mm_set1_epi32 (0x80000000);
8406*80a68eefSBob Badour     //no unsigned ints comparison in SSE, only signed available, so need the trick
8407*80a68eefSBob Badour     a_signed = _mm_sub_epi32(a, c80000000); //go to signed
8408*80a68eefSBob Badour     saturation_mask = _mm_cmpgt_epi32 (a_signed, cb);
8409*80a68eefSBob Badour     shift_res = _mm_slli_epi32 (a, b);
8410*80a68eefSBob Badour     return _mm_or_si128 (shift_res, saturation_mask);
8411*80a68eefSBob Badour }
8412*80a68eefSBob Badour 
8413*80a68eefSBob Badour _NEON2SSESTORAGE uint64x2_t vqshlq_n_u64(uint64x2_t a, __constrange(0,63) int b); // VQSHL.U64 q0,q0,#0
8414*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqshlq_n_u64(uint64x2_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL)
8415*80a68eefSBob Badour {
8416*80a68eefSBob Badour     // no effective SIMD solution here
8417*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 uint64_t atmp[2], res[2];
8418*80a68eefSBob Badour     uint64_t bmask;
8419*80a68eefSBob Badour     int i;
8420*80a68eefSBob Badour     bmask = (b == 0) ? 0 : (( uint64_t)1 << (64 - b)); //b==0 is handled below; computing the shift by 64 here would be undefined
8421*80a68eefSBob Badour     _mm_store_si128((__m128i*)atmp, a);
8422*80a68eefSBob Badour     for (i = 0; i<2; i++) {
8423*80a68eefSBob Badour         res[i] = (atmp[i] >= bmask)&&(b>0) ? 0xffffffffffffffff : atmp[i] << b; //if b=0 we are fine with any a
8424*80a68eefSBob Badour     }
8425*80a68eefSBob Badour     return _mm_load_si128((__m128i*)res);
8426*80a68eefSBob Badour }
8427*80a68eefSBob Badour 
8428*80a68eefSBob Badour //**************Vector signed->unsigned saturating shift left by constant *************
8429*80a68eefSBob Badour //*************************************************************************************
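//Semantics sketch for this section (illustrative only; hypothetical helper name): negative inputs
//saturate to 0, and positive inputs that no longer fit after the shift saturate to the unsigned maximum.
_NEON2SSE_INLINE uint8_t _neon2sse_example_qshlu_s8(int8_t a, int b) //assumes 0 <= b <= 7
{
    int32_t wide;
    if (a < 0) return 0; //negatives always saturate to zero
    wide = (int32_t)a << b;
    return (wide > 255) ? (uint8_t)255 : (uint8_t)wide;
}
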
8430*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vqshlu_n_s8(int8x8_t a, __constrange(0,7) int b); // VQSHLU.S8 d0,d0,#0
8431*80a68eefSBob Badour _NEON2SSE_INLINE uint8x8_t vqshlu_n_s8(int8x8_t a, __constrange(0,7) int b) // VQSHLU.S8 d0,d0,#0
8432*80a68eefSBob Badour {
8433*80a68eefSBob Badour     //no 8 bit shift available in IA32 SIMD, go to 16 bit. It also provides the auto saturation (in packs function)
8434*80a68eefSBob Badour     uint8x8_t res64;
8435*80a68eefSBob Badour     __m128i a128, r128;
8436*80a68eefSBob Badour     a128 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
8437*80a68eefSBob Badour     r128 = _mm_slli_epi16 (a128, b);
8438*80a68eefSBob Badour     r128 = _mm_packus_epi16 (r128,r128); //saturated u8, use 64 low bits only
8439*80a68eefSBob Badour     return64(r128);
8440*80a68eefSBob Badour }
8441*80a68eefSBob Badour 
8442*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vqshlu_n_s16(int16x4_t a, __constrange(0,15) int b); // VQSHLU.S16 d0,d0,#0
8443*80a68eefSBob Badour _NEON2SSE_INLINE uint16x4_t vqshlu_n_s16(int16x4_t a, __constrange(0,15) int b) // VQSHLU.S16 d0,d0,#0
8444*80a68eefSBob Badour {
8445*80a68eefSBob Badour     uint16x4_t res64;
8446*80a68eefSBob Badour     __m128i a128, r128;
8447*80a68eefSBob Badour     a128 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE 4.1
8448*80a68eefSBob Badour     r128 = _mm_slli_epi32 (a128, b); //shift_res
8449*80a68eefSBob Badour     r128 = _MM_PACKUS1_EPI32 (r128); //saturated u16, use 64 low bits only
8450*80a68eefSBob Badour     return64(r128);
8451*80a68eefSBob Badour }
8452*80a68eefSBob Badour 
8453*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vqshlu_n_s32(int32x2_t a,  __constrange(0,31) int b); // VQSHLU.S32 d0,d0,#0
8454*80a68eefSBob Badour _NEON2SSE_INLINE uint32x2_t vqshlu_n_s32(int32x2_t a,  __constrange(0,31) int b) //return type matches the declaration above
8455*80a68eefSBob Badour {
8456*80a68eefSBob Badour     uint32x2_t res64;
8457*80a68eefSBob Badour     return64( vqshluq_n_s32(_pM128i(a), b));
8458*80a68eefSBob Badour }
8459*80a68eefSBob Badour 
8460*80a68eefSBob Badour _NEON2SSESTORAGE uint64x1_t vqshlu_n_s64(int64x1_t a, __constrange(0,63) int b); // VQSHLU.S64 d0,d0,#0
8461*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqshlu_n_s64(int64x1_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL) // no effective SIMD solution here, serial execution looks faster
8462*80a68eefSBob Badour {
8463*80a68eefSBob Badour     uint64x1_t res;
8464*80a68eefSBob Badour     uint64_t limit;
8465*80a68eefSBob Badour     if (a.m64_i64[0]<=0) {
8466*80a68eefSBob Badour         res.m64_u64[0] = 0;
8467*80a68eefSBob Badour     } else {
8468*80a68eefSBob Badour         limit = (b == 0) ? ~((uint64_t)0) : ((uint64_t) 1 << (64 - b)); //b==0 cannot overflow; also avoids the undefined shift by 64
8469*80a68eefSBob Badour         res.m64_u64[0] = ( ((uint64_t)a.m64_i64[0]) >= limit) ? ~((uint64_t)0) : (uint64_t)a.m64_i64[0] << b;
8470*80a68eefSBob Badour     }
8471*80a68eefSBob Badour     return res;
8472*80a68eefSBob Badour }
8473*80a68eefSBob Badour 
8474*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vqshluq_n_s8(int8x16_t a, __constrange(0,7) int b); // VQSHLU.S8 q0,q0,#0
8475*80a68eefSBob Badour _NEON2SSE_INLINE uint8x16_t vqshluq_n_s8(int8x16_t a, __constrange(0,7) int b) // VQSHLU.S8 q0,q0,#0
8476*80a68eefSBob Badour {
8477*80a68eefSBob Badour     __m128i a128, r128_1, r128_2;
8478*80a68eefSBob Badour     a128 = _MM_CVTEPI8_EPI16 (a); //SSE 4.1
8479*80a68eefSBob Badour     r128_1 = _mm_slli_epi16 (a128, b);
8480*80a68eefSBob Badour     //swap hi and low part of a128 to process the remaining data
8481*80a68eefSBob Badour     a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
8482*80a68eefSBob Badour     a128 = _MM_CVTEPI8_EPI16 (a128);
8483*80a68eefSBob Badour     r128_2 = _mm_slli_epi16 (a128, b);
8484*80a68eefSBob Badour     return _mm_packus_epi16 (r128_1, r128_2); //saturated u8
8485*80a68eefSBob Badour }
8486*80a68eefSBob Badour 
8487*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vqshluq_n_s16(int16x8_t a, __constrange(0,15) int b); // VQSHLU.S16 q0,q0,#0
8488*80a68eefSBob Badour _NEON2SSE_INLINE uint16x8_t vqshluq_n_s16(int16x8_t a, __constrange(0,15) int b) // VQSHLU.S16 q0,q0,#0
8489*80a68eefSBob Badour {
8490*80a68eefSBob Badour     // manual saturation solution looks LESS optimal than 32 bits conversion one
8491*80a68eefSBob Badour     __m128i a128, r128_1, r128_2;
8492*80a68eefSBob Badour     a128 = _MM_CVTEPI16_EPI32 (a); //SSE 4.1
8493*80a68eefSBob Badour     r128_1 = _mm_slli_epi32 (a128, b); //shift_res
8494*80a68eefSBob Badour     //swap hi and low part of a128 to process the remaining data
8495*80a68eefSBob Badour     a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
8496*80a68eefSBob Badour     a128 = _MM_CVTEPI16_EPI32 (a128);
8497*80a68eefSBob Badour     r128_2 = _mm_slli_epi32 (a128, b);
8498*80a68eefSBob Badour     return _MM_PACKUS_EPI32 (r128_1, r128_2); //saturated u16
8499*80a68eefSBob Badour }
8500*80a68eefSBob Badour 
8501*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vqshluq_n_s32(int32x4_t a, __constrange(0,31) int b); // VQSHLU.S32 q0,q0,#0
8502*80a68eefSBob Badour _NEON2SSE_INLINE uint32x4_t vqshluq_n_s32(int32x4_t a, __constrange(0,31) int b) // VQSHLU.S32 q0,q0,#0
8503*80a68eefSBob Badour {
8504*80a68eefSBob Badour     //solution may not be optimal compared with the serial one
8505*80a68eefSBob Badour     __m128i zero, maskA, maskGT0, a0,  a_masked, a_shift;
8506*80a68eefSBob Badour     zero = _mm_setzero_si128();
8507*80a68eefSBob Badour     maskA = _mm_cmpeq_epi32(a, a);
8508*80a68eefSBob Badour     maskA = _mm_slli_epi32(maskA,(32 - b)); // b ones and (32-b)zeros
8509*80a68eefSBob Badour     //saturate negative numbers to zero
8510*80a68eefSBob Badour     maskGT0   = _mm_cmpgt_epi32 (a, zero); //0xffffffff for positive numbers and zero otherwise (negative numbers and zero)
8511*80a68eefSBob Badour     a0 = _mm_and_si128 (a,  maskGT0); //negative are zeros now
8512*80a68eefSBob Badour     //saturate positive to 0xffffffff
8513*80a68eefSBob Badour     a_masked = _mm_and_si128 (a0, maskA);
8514*80a68eefSBob Badour     a_masked = _mm_cmpgt_epi32 (a_masked, zero); //0xffffffff if saturation necessary 0 otherwise
8515*80a68eefSBob Badour     a_shift = _mm_slli_epi32 (a0, b);
8516*80a68eefSBob Badour     return _mm_or_si128 (a_shift, a_masked); //actual saturation
8517*80a68eefSBob Badour }
8518*80a68eefSBob Badour 
8519*80a68eefSBob Badour _NEON2SSESTORAGE uint64x2_t vqshluq_n_s64(int64x2_t a, __constrange(0,63) int b); // VQSHLU.S64 q0,q0,#0
8520*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqshluq_n_s64(int64x2_t a, __constrange(0,63) int b),  _NEON2SSE_REASON_SLOW_SERIAL)
8521*80a68eefSBob Badour {
8522*80a68eefSBob Badour     // no effective SIMD solution here, serial execution looks faster
8523*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 int64_t atmp[2];
8524*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 uint64_t res[2];
8525*80a68eefSBob Badour     uint64_t limit;
8526*80a68eefSBob Badour     int i;
8527*80a68eefSBob Badour     _mm_store_si128((__m128i*)atmp, a);
8528*80a68eefSBob Badour     for (i = 0; i<2; i++) {
8529*80a68eefSBob Badour         if (atmp[i]<=0) {
8530*80a68eefSBob Badour             res[i] = 0;
8531*80a68eefSBob Badour         } else {
8532*80a68eefSBob Badour             limit = (b == 0) ? 0 : ((uint64_t) 1 << (64 - b)); //b == 0 cannot saturate a positive value (and 1 << 64 would be undefined)
8533*80a68eefSBob Badour             res[i] = ((b != 0) && (((uint64_t)atmp[i]) >= limit)) ? ~((uint64_t)0) : ((uint64_t)atmp[i]) << b;
8534*80a68eefSBob Badour         }
8535*80a68eefSBob Badour     }
8536*80a68eefSBob Badour     return _mm_load_si128((__m128i*)res);
8537*80a68eefSBob Badour }
8538*80a68eefSBob Badour 
8539*80a68eefSBob Badour //************** Vector narrowing  shift right by constant **************
8540*80a68eefSBob Badour //**********************************************************************
8541*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VSHRN.I16 d0,q0,#8
8542*80a68eefSBob Badour _NEON2SSE_INLINE int8x8_t vshrn_n_s16(int16x8_t a, __constrange(1,8) int b) // VSHRN.I16 d0,q0,#8
8543*80a68eefSBob Badour {
8544*80a68eefSBob Badour     int8x8_t res64;
8545*80a68eefSBob Badour     __m128i r16;
8546*80a68eefSBob Badour     r16  = vshrq_n_s16(a,b);
8547*80a68eefSBob Badour     r16  = _mm_shuffle_epi8 (r16, *(__m128i*) mask8_16_even_odd); //narrow, use low 64 bits only. Impossible to use _mm_packs because of negative saturation problems
8548*80a68eefSBob Badour     return64(r16);
8549*80a68eefSBob Badour }
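//Worked example (illustration only) of why _mm_packs_epi16 cannot replace the byte shuffle above:
//for a lane value 0x1234 and b = 1 the arithmetic shift gives 0x091A; VSHRN must return the low byte 0x1A,
//whereas _mm_packs_epi16 would saturate 0x091A to 0x7F.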
8550*80a68eefSBob Badour 
8551*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VSHRN.I32 d0,q0,#16
8552*80a68eefSBob Badour _NEON2SSE_INLINE int16x4_t vshrn_n_s32(int32x4_t a, __constrange(1,16) int b) // VSHRN.I32 d0,q0,#16
8553*80a68eefSBob Badour {
8554*80a68eefSBob Badour     int16x4_t res64;
8555*80a68eefSBob Badour     __m128i r32;
8556*80a68eefSBob Badour     r32  = vshrq_n_s32(a,b);
8557*80a68eefSBob Badour     r32  =  _mm_shuffle_epi8 (r32, *(__m128i*) mask8_32_even_odd); //narrow, use low 64 bits only. Impossible to use _mm_packs because of negative saturation problems
8558*80a68eefSBob Badour     return64(r32);
8559*80a68eefSBob Badour }
8560*80a68eefSBob Badour 
8561*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VSHRN.I64 d0,q0,#32
8562*80a68eefSBob Badour _NEON2SSE_INLINE int32x2_t vshrn_n_s64(int64x2_t a, __constrange(1,32) int b)
8563*80a68eefSBob Badour {
8564*80a68eefSBob Badour     int32x2_t res64;
8565*80a68eefSBob Badour     __m128i r64;
8566*80a68eefSBob Badour     r64  = vshrq_n_s64(a,b);
8567*80a68eefSBob Badour     r64  = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
8568*80a68eefSBob Badour     return64(r64);
8569*80a68eefSBob Badour }
8570*80a68eefSBob Badour 
8571*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VSHRN.I16 d0,q0,#8
8572*80a68eefSBob Badour _NEON2SSE_INLINE uint8x8_t vshrn_n_u16(uint16x8_t a, __constrange(1,8) int b) // VSHRN.I16 d0,q0,#8
8573*80a68eefSBob Badour {
8574*80a68eefSBob Badour     uint8x8_t res64;
8575*80a68eefSBob Badour     __m128i mask, r16;
8576*80a68eefSBob Badour     mask = _mm_set1_epi16(0xff);
8577*80a68eefSBob Badour     r16  = vshrq_n_s16(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _mm_packus_epi16 (signed 16 to unsigned 8)
8578*80a68eefSBob Badour     r16 = _mm_and_si128(r16, mask); //to avoid saturation
8579*80a68eefSBob Badour     r16 = _mm_packus_epi16 (r16,r16); //narrow, use low 64 bits only
8580*80a68eefSBob Badour     return64(r16);
8581*80a68eefSBob Badour }
8582*80a68eefSBob Badour 
8583*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VSHRN.I32 d0,q0,#16
8584*80a68eefSBob Badour _NEON2SSE_INLINE uint16x4_t vshrn_n_u32(uint32x4_t a, __constrange(1,16) int b) // VSHRN.I32 d0,q0,#16
8585*80a68eefSBob Badour {
8586*80a68eefSBob Badour     uint16x4_t res64;
8587*80a68eefSBob Badour     __m128i mask, r32;
8588*80a68eefSBob Badour     mask = _mm_set1_epi32(0xffff);
8589*80a68eefSBob Badour     r32  = vshrq_n_u32(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _MM_PACKUS_EPI32 (signed 32 to unsigned 16)
8590*80a68eefSBob Badour     r32 = _mm_and_si128(r32, mask); //to avoid saturation
8591*80a68eefSBob Badour     r32 =  _MM_PACKUS1_EPI32 (r32); //saturate and  narrow, use low 64 bits only
8592*80a68eefSBob Badour     return64(r32);
8593*80a68eefSBob Badour }
8594*80a68eefSBob Badour 
8595*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VSHRN.I64 d0,q0,#32
8596*80a68eefSBob Badour _NEON2SSE_INLINE uint32x2_t vshrn_n_u64(uint64x2_t a, __constrange(1,32) int b)
8597*80a68eefSBob Badour {
8598*80a68eefSBob Badour     uint32x2_t res64;
8599*80a68eefSBob Badour     __m128i r64;
8600*80a68eefSBob Badour     r64  = vshrq_n_u64(a,b);
8601*80a68eefSBob Badour     r64  = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
8602*80a68eefSBob Badour     return64(r64);
8603*80a68eefSBob Badour }
8604*80a68eefSBob Badour 
8605*80a68eefSBob Badour //************** Vector signed->unsigned narrowing saturating shift right by constant ********
8606*80a68eefSBob Badour //*********************************************************************************************
8607*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vqshrun_n_s16(int16x8_t a, __constrange(1,8) int b); // VQSHRUN.S16 d0,q0,#8
8608*80a68eefSBob Badour _NEON2SSE_INLINE uint8x8_t vqshrun_n_s16(int16x8_t a, __constrange(1,8) int b) // VQSHRUN.S16 d0,q0,#8
8609*80a68eefSBob Badour {
8610*80a68eefSBob Badour     uint8x8_t res64;
8611*80a68eefSBob Badour     __m128i r16;
8612*80a68eefSBob Badour     r16  = vshrq_n_s16(a,b);
8613*80a68eefSBob Badour     r16 = _mm_packus_epi16 (r16,r16); //saturate and  narrow (signed to unsigned), use low 64 bits only
8614*80a68eefSBob Badour     return64(r16);
8615*80a68eefSBob Badour }
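//Illustrative usage sketch, not part of the API (kept under #if 0); it relies only on intrinsics defined above
//and the standard _mm_set_epi16. A typical use is packing 16-bit intermediate results back into unsigned bytes.
#if 0
static void neon2sse_example_vqshrun(void)
{
    int16x8_t a = _mm_set_epi16(-7, 300, 4096, 32767, 0, 1, 255, -32768);
    uint8x8_t r = vqshrun_n_s16(a, 4); //negative lanes -> 0; 4096 >> 4 = 256 and 32767 >> 4 = 2047 saturate to 255; 255 >> 4 = 15
    (void) r;
}
#endif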
8616*80a68eefSBob Badour 
8617*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vqshrun_n_s32(int32x4_t a, __constrange(1,16) int b); // VQSHRUN.S32 d0,q0,#16
8618*80a68eefSBob Badour _NEON2SSE_INLINE uint16x4_t vqshrun_n_s32(int32x4_t a, __constrange(1,16) int b) // VQSHRUN.S32 d0,q0,#16
8619*80a68eefSBob Badour {
8620*80a68eefSBob Badour     uint16x4_t res64;
8621*80a68eefSBob Badour     __m128i r32;
8622*80a68eefSBob Badour     r32  = vshrq_n_s32(a,b);
8623*80a68eefSBob Badour     r32  = _MM_PACKUS1_EPI32 (r32); //saturate and  narrow(signed to unsigned), use low 64 bits only
8624*80a68eefSBob Badour     return64(r32);
8625*80a68eefSBob Badour }
8626*80a68eefSBob Badour 
8627*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vqshrun_n_s64(int64x2_t a, __constrange(1,32) int b); // VQSHRUN.S64 d0,q0,#32
8628*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vqshrun_n_s64(int64x2_t a, __constrange(1,32) int b), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution is faster
8629*80a68eefSBob Badour {
8630*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 int64_t atmp[2];
8631*80a68eefSBob Badour     uint32x2_t res;
8632*80a68eefSBob Badour     int64_t res64;
8633*80a68eefSBob Badour     _mm_store_si128((__m128i*)atmp, a);
8634*80a68eefSBob Badour     if (atmp[0] < 0) {
8635*80a68eefSBob Badour         res.m64_u32[0] = 0;
8636*80a68eefSBob Badour     } else {
8637*80a68eefSBob Badour         res64 = (atmp[0] >> b);
8638*80a68eefSBob Badour         res.m64_u32[0] = (res64 > (int64_t)0xffffffff) ? 0xffffffff : (uint32_t) res64;
8639*80a68eefSBob Badour     }
8640*80a68eefSBob Badour     if (atmp[1] < 0) {
8641*80a68eefSBob Badour         res.m64_u32[1] = 0;
8642*80a68eefSBob Badour     } else {
8643*80a68eefSBob Badour         res64 = (atmp[1] >> b);
8644*80a68eefSBob Badour         res.m64_u32[1] = (res64 > (int64_t)0xffffffff) ? 0xffffffff : (uint32_t)res64;
8645*80a68eefSBob Badour     }
8646*80a68eefSBob Badour     return res;
8647*80a68eefSBob Badour }
8648*80a68eefSBob Badour 
8649*80a68eefSBob Badour //**** Vector signed->unsigned rounding narrowing saturating shift right by constant *****
8650*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vqrshrun_n_s16(int16x8_t a, __constrange(1,8) int b); // VQRSHRUN.S16 d0,q0,#8
8651*80a68eefSBob Badour _NEON2SSE_INLINE uint8x8_t vqrshrun_n_s16(int16x8_t a, __constrange(1,8) int b) // VQRSHRUN.S16 d0,q0,#8
8652*80a68eefSBob Badour {
8653*80a68eefSBob Badour     //this solution may not be optimal compared with the serial one
8654*80a68eefSBob Badour     __m128i r16;
8655*80a68eefSBob Badour     uint8x8_t res64;
8656*80a68eefSBob Badour     r16 = vrshrq_n_s16(a,b);
8657*80a68eefSBob Badour     r16 =  _mm_packus_epi16 (r16,r16); //saturate and  narrow (signed to unsigned), use low 64 bits only
8658*80a68eefSBob Badour     return64(r16);
8659*80a68eefSBob Badour }
8660*80a68eefSBob Badour 
8661*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vqrshrun_n_s32(int32x4_t a, __constrange(1,16) int b); // VQRSHRUN.S32 d0,q0,#16
8662*80a68eefSBob Badour _NEON2SSE_INLINE uint16x4_t vqrshrun_n_s32(int32x4_t a, __constrange(1,16) int b) // VQRSHRUN.S32 d0,q0,#16
8663*80a68eefSBob Badour {
8664*80a68eefSBob Badour     //this solution may not be optimal compared with the serial one
8665*80a68eefSBob Badour     __m128i r32;
8666*80a68eefSBob Badour     uint16x4_t res64;
8667*80a68eefSBob Badour     r32 = vrshrq_n_s32(a,b);
8668*80a68eefSBob Badour     r32 =  _MM_PACKUS1_EPI32 (r32); //saturate and  narrow (signed to unsigned), use low 64 bits only
8669*80a68eefSBob Badour     return64(r32);
8670*80a68eefSBob Badour }
8671*80a68eefSBob Badour 
8672*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vqrshrun_n_s64(int64x2_t a, __constrange(1,32) int b); // VQRSHRUN.S64 d0,q0,#32
8673*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vqrshrun_n_s64(int64x2_t a, __constrange(1,32) int b), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution is faster
8674*80a68eefSBob Badour {
8675*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 int64_t atmp[2];
8676*80a68eefSBob Badour     uint32x2_t res;
8677*80a68eefSBob Badour     int64_t res64;
8678*80a68eefSBob Badour     _mm_store_si128((__m128i*)atmp, a);
8679*80a68eefSBob Badour     if (atmp[0] < 0) {
8680*80a68eefSBob Badour         res.m64_u32[0] = 0;
8681*80a68eefSBob Badour     } else {
8682*80a68eefSBob Badour         res64 = (atmp[0] >> b) + ( (atmp[0] & ((int64_t)1 << (b - 1))) >> (b - 1)  );
8683*80a68eefSBob Badour         res.m64_u32[0] = (uint32_t) ((res64 > (int64_t)0xffffffff ) ? 0xffffffff : res64);
8684*80a68eefSBob Badour     }
8685*80a68eefSBob Badour     if (atmp[1] < 0) {
8686*80a68eefSBob Badour         res.m64_u32[1] = 0;
8687*80a68eefSBob Badour     } else {
8688*80a68eefSBob Badour         res64 = (atmp[1] >> b) + ( (atmp[1] & ((int64_t)1 << (b - 1))) >> (b - 1)  ); //rounding term must come from atmp[1] here
8689*80a68eefSBob Badour         res.m64_u32[1] = (uint32_t)((res64 > (int64_t)0xffffffff ) ? 0xffffffff : res64);
8690*80a68eefSBob Badour     }
8691*80a68eefSBob Badour     return res;
8692*80a68eefSBob Badour }
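//Worked example of the rounding term above (illustration only): for a source value of 7 and b = 2,
//(7 >> 2) = 1 and the rounding addend is bit (b - 1) of the source, (7 & 2) >> 1 = 1, giving 2,
//i.e. 7/4 = 1.75 rounded to nearest with halves rounded up.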
8693*80a68eefSBob Badour 
8694*80a68eefSBob Badour //***** Vector narrowing saturating shift right by constant ******
8695*80a68eefSBob Badour //*****************************************************************
8696*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vqshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VQSHRN.S16 d0,q0,#8
8697*80a68eefSBob Badour _NEON2SSE_INLINE int8x8_t vqshrn_n_s16(int16x8_t a, __constrange(1,8) int b) // VQSHRN.S16 d0,q0,#8
8698*80a68eefSBob Badour {
8699*80a68eefSBob Badour     int8x8_t res64;
8700*80a68eefSBob Badour     __m128i r16;
8701*80a68eefSBob Badour     r16  = vshrq_n_s16(a,b);
8702*80a68eefSBob Badour     r16  = _mm_packs_epi16 (r16,r16); //saturate and  narrow, use low 64 bits only
8703*80a68eefSBob Badour     return64(r16);
8704*80a68eefSBob Badour }
8705*80a68eefSBob Badour 
8706*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vqshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VQSHRN.S32 d0,q0,#16
8707*80a68eefSBob Badour _NEON2SSE_INLINE int16x4_t vqshrn_n_s32(int32x4_t a, __constrange(1,16) int b) // VQSHRN.S32 d0,q0,#16
8708*80a68eefSBob Badour {
8709*80a68eefSBob Badour     int16x4_t res64;
8710*80a68eefSBob Badour     __m128i r32;
8711*80a68eefSBob Badour     r32  = vshrq_n_s32(a,b);
8712*80a68eefSBob Badour     r32  = _mm_packs_epi32 (r32,r32); //saturate and  narrow, use low 64 bits only
8713*80a68eefSBob Badour     return64(r32);
8714*80a68eefSBob Badour }
8715*80a68eefSBob Badour 
8716*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vqshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VQSHRN.S64 d0,q0,#32
8717*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqshrn_n_s64(int64x2_t a, __constrange(1,32) int b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
8718*80a68eefSBob Badour {
8719*80a68eefSBob Badour     //no optimal SIMD solution found
8720*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 int64_t res64[2], atmp[2];
8721*80a68eefSBob Badour     int32x2_t res;
8722*80a68eefSBob Badour     _mm_store_si128((__m128i*)atmp, a);
8723*80a68eefSBob Badour     res64[0] = (atmp[0] >> b);
8724*80a68eefSBob Badour     res64[1] = (atmp[1] >> b);
8725*80a68eefSBob Badour     if(res64[0]>SINT_MAX) res64[0] = SINT_MAX;
8726*80a68eefSBob Badour     if(res64[0]<SINT_MIN) res64[0] = SINT_MIN;
8727*80a68eefSBob Badour     if(res64[1]>SINT_MAX) res64[1] = SINT_MAX;
8728*80a68eefSBob Badour     if(res64[1]<SINT_MIN) res64[1] = SINT_MIN;
8729*80a68eefSBob Badour     res.m64_i32[0] = (int32_t)res64[0];
8730*80a68eefSBob Badour     res.m64_i32[1] = (int32_t)res64[1];
8731*80a68eefSBob Badour     return res;
8732*80a68eefSBob Badour }
8733*80a68eefSBob Badour 
8734*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vqshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VQSHRN.s16 d0,q0,#8
8735*80a68eefSBob Badour _NEON2SSE_INLINE uint8x8_t vqshrn_n_u16(uint16x8_t a, __constrange(1,8) int b) // VQSHRN.s16 d0,q0,#8
8736*80a68eefSBob Badour {
8737*80a68eefSBob Badour     uint8x8_t res64;
8738*80a68eefSBob Badour     __m128i r16;
8739*80a68eefSBob Badour     r16  = vshrq_n_u16(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _mm_packus_epi16 (signed 16 to unsigned 8)
8740*80a68eefSBob Badour     r16  = _mm_packus_epi16 (r16,r16); //saturate and  narrow, use low 64 bits only
8741*80a68eefSBob Badour     return64(r16);
8742*80a68eefSBob Badour }
8743*80a68eefSBob Badour 
8744*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vqshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VQSHRN.U32 d0,q0,#16
8745*80a68eefSBob Badour _NEON2SSE_INLINE uint16x4_t vqshrn_n_u32(uint32x4_t a, __constrange(1,16) int b) // VQSHRN.U32 d0,q0,#16
8746*80a68eefSBob Badour {
8747*80a68eefSBob Badour     uint16x4_t res64;
8748*80a68eefSBob Badour     __m128i r32;
8749*80a68eefSBob Badour     r32  = vshrq_n_u32(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _MM_PACKUS_EPI32 (signed 32 to unsigned 16)
8750*80a68eefSBob Badour     r32  = _MM_PACKUS1_EPI32 (r32); //saturate and  narrow, use low 64 bits only
8751*80a68eefSBob Badour     return64(r32);
8752*80a68eefSBob Badour }
8753*80a68eefSBob Badour 
8754*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vqshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VQSHRN.U64 d0,q0,#32
8755*80a68eefSBob Badour _NEON2SSE_INLINE uint32x2_t vqshrn_n_u64(uint64x2_t a, __constrange(1,32) int b)
8756*80a68eefSBob Badour {
8757*80a68eefSBob Badour     //serial solution may be faster
8758*80a68eefSBob Badour     uint32x2_t res64;
8759*80a68eefSBob Badour     __m128i r64, res_hi, zero;
8760*80a68eefSBob Badour     zero = _mm_setzero_si128();
8761*80a68eefSBob Badour     r64  = vshrq_n_u64(a,b);
8762*80a68eefSBob Badour     res_hi = _mm_srli_epi64(r64,  32);
8763*80a68eefSBob Badour     res_hi = _mm_cmpgt_epi32(res_hi, zero);
8764*80a68eefSBob Badour     r64 = _mm_or_si128(r64, res_hi);
8765*80a68eefSBob Badour     r64 = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
8766*80a68eefSBob Badour     return64(r64);
8767*80a68eefSBob Badour }
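//Worked example of the saturation trick above (illustration only): for a lane 0x1000000000 and b = 1 the shift
//gives 0x800000000; its high dword (8) is nonzero, so the compare yields 0xFFFFFFFF in the low dword and the OR
//saturates the narrowed result to 0xFFFFFFFF. A shifted value that fits into 32 bits has a zero high dword and stays unchanged.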
8768*80a68eefSBob Badour 
8769*80a68eefSBob Badour 
8770*80a68eefSBob Badour //********* Vector rounding narrowing shift right by constant *************************
8771*80a68eefSBob Badour //****************************************************************************************
8772*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vrshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VRSHRN.I16 d0,q0,#8
8773*80a68eefSBob Badour _NEON2SSE_INLINE int8x8_t vrshrn_n_s16(int16x8_t a, __constrange(1,8) int b) // VRSHRN.I16 d0,q0,#8
8774*80a68eefSBob Badour {
8775*80a68eefSBob Badour     int8x8_t res64;
8776*80a68eefSBob Badour     __m128i r16;
8777*80a68eefSBob Badour      r16  = vrshrq_n_s16(a,b);
8778*80a68eefSBob Badour     r16  = _mm_shuffle_epi8 (r16, *(__m128i*) mask8_16_even_odd); //narrow, use low 64 bits only. Impossible to use _mm_packs because of negative saturation problems
8779*80a68eefSBob Badour     return64(r16);
8780*80a68eefSBob Badour }
8781*80a68eefSBob Badour 
8782*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vrshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VRSHRN.I32 d0,q0,#16
8783*80a68eefSBob Badour _NEON2SSE_INLINE int16x4_t vrshrn_n_s32(int32x4_t a, __constrange(1,16) int b) // VRSHRN.I32 d0,q0,#16
8784*80a68eefSBob Badour {
8785*80a68eefSBob Badour     int16x4_t res64;
8786*80a68eefSBob Badour     __m128i r32;
8787*80a68eefSBob Badour     r32  = vrshrq_n_s32(a,b);
8788*80a68eefSBob Badour     r32  =  _mm_shuffle_epi8 (r32, *(__m128i*) mask8_32_even_odd); //narrow, use low 64 bits only. Impossible to use _mm_packs because of negative saturation problems
8789*80a68eefSBob Badour     return64(r32);
8790*80a68eefSBob Badour }
8791*80a68eefSBob Badour 
8792*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vrshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VRSHRN.I64 d0,q0,#32
8793*80a68eefSBob Badour _NEON2SSE_INLINE int32x2_t vrshrn_n_s64(int64x2_t a, __constrange(1,32) int b)
8794*80a68eefSBob Badour {
8795*80a68eefSBob Badour     int32x2_t res64;
8796*80a68eefSBob Badour     __m128i r64;
8797*80a68eefSBob Badour     r64  = vrshrq_n_s64(a,b);
8798*80a68eefSBob Badour     r64  = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
8799*80a68eefSBob Badour     return64(r64);
8800*80a68eefSBob Badour }
8801*80a68eefSBob Badour 
8802*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VRSHRN.I16 d0,q0,#8
8803*80a68eefSBob Badour _NEON2SSE_INLINE uint8x8_t vrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b) // VRSHRN.I16 d0,q0,#8
8804*80a68eefSBob Badour {
8805*80a68eefSBob Badour     uint8x8_t res64;
8806*80a68eefSBob Badour     __m128i mask, r16;
8807*80a68eefSBob Badour     mask = _mm_set1_epi16(0xff);
8808*80a68eefSBob Badour     r16  = vrshrq_n_s16(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _mm_packus_epi16 (signed 16 to unsigned 8)
8809*80a68eefSBob Badour     r16 = _mm_and_si128(r16, mask); //to avoid saturation
8810*80a68eefSBob Badour     r16 = _mm_packus_epi16 (r16,r16); //saturate and  narrow, use low 64 bits only
8811*80a68eefSBob Badour     return64(r16);
8812*80a68eefSBob Badour }
8813*80a68eefSBob Badour 
8814*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VRSHRN.I32 d0,q0,#16
8815*80a68eefSBob Badour _NEON2SSE_INLINE uint16x4_t vrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b) // VRSHRN.I32 d0,q0,#16
8816*80a68eefSBob Badour {
8817*80a68eefSBob Badour     uint16x4_t res64;
8818*80a68eefSBob Badour     __m128i mask, r32;
8819*80a68eefSBob Badour     mask = _mm_set1_epi32(0xffff);
8820*80a68eefSBob Badour     r32  = vrshrq_n_u32(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _MM_PACKUS_EPI32 (signed 32 to unsigned 16)
8821*80a68eefSBob Badour     r32 = _mm_and_si128(r32, mask); //to avoid saturation
8822*80a68eefSBob Badour     r32 = _MM_PACKUS1_EPI32 (r32); //saturate and  narrow, use low 64 bits only
8823*80a68eefSBob Badour     return64(r32);
8824*80a68eefSBob Badour }
8825*80a68eefSBob Badour 
8826*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VRSHRN.I64 d0,q0,#32
8827*80a68eefSBob Badour _NEON2SSE_INLINE uint32x2_t vrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b) //serial solution may be faster
8828*80a68eefSBob Badour {
8829*80a68eefSBob Badour     uint32x2_t res64;
8830*80a68eefSBob Badour     __m128i r64;
8831*80a68eefSBob Badour     r64  = vrshrq_n_u64(a,b);
8832*80a68eefSBob Badour     r64  =  _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
8833*80a68eefSBob Badour     return64(r64);
8834*80a68eefSBob Badour }
8835*80a68eefSBob Badour 
8836*80a68eefSBob Badour //************* Vector rounding narrowing saturating shift right by constant ************
8837*80a68eefSBob Badour //****************************************************************************************
8838*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vqrshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VQRSHRN.S16 d0,q0,#8
8839*80a68eefSBob Badour _NEON2SSE_INLINE int8x8_t vqrshrn_n_s16(int16x8_t a, __constrange(1,8) int b) // VQRSHRN.S16 d0,q0,#8
8840*80a68eefSBob Badour {
8841*80a68eefSBob Badour     int8x8_t res64;
8842*80a68eefSBob Badour     __m128i r16;
8843*80a68eefSBob Badour     r16  = vrshrq_n_s16(a,b);
8844*80a68eefSBob Badour     r16  =  _mm_packs_epi16 (r16,r16); //saturate and  narrow, use low 64 bits only
8845*80a68eefSBob Badour     return64(r16);
8846*80a68eefSBob Badour }
8847*80a68eefSBob Badour 
8848*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vqrshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VQRSHRN.S32 d0,q0,#16
8849*80a68eefSBob Badour _NEON2SSE_INLINE int16x4_t vqrshrn_n_s32(int32x4_t a, __constrange(1,16) int b) // VQRSHRN.S32 d0,q0,#16
8850*80a68eefSBob Badour {
8851*80a68eefSBob Badour     int16x4_t res64;
8852*80a68eefSBob Badour     __m128i r32;
8853*80a68eefSBob Badour     r32  = vrshrq_n_s32(a,b);
8854*80a68eefSBob Badour     r32  = _mm_packs_epi32 (r32,r32); //saturate and  narrow, use low 64 bits only
8855*80a68eefSBob Badour     return64(r32);
8856*80a68eefSBob Badour }
8857*80a68eefSBob Badour 
8858*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vqrshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VQRSHRN.S64 d0,q0,#32
8859*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrshrn_n_s64(int64x2_t a, __constrange(1,32) int b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
8860*80a68eefSBob Badour {
8861*80a68eefSBob Badour     //no optimal SIMD solution found
8862*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 int64_t res64[2], atmp[2], maskb[2];
8863*80a68eefSBob Badour     int32x2_t res;
8864*80a68eefSBob Badour     _mm_store_si128((__m128i*)atmp, a);
8865*80a68eefSBob Badour     maskb[0] = atmp[0] & (( int64_t)1 << (b - 1));
8866*80a68eefSBob Badour     res64[0] = (atmp[0] >> b) + (maskb[0] >> (b - 1)); //rounded result
8867*80a68eefSBob Badour     maskb[1] = atmp[1] & (( int64_t)1 << (b - 1));
8868*80a68eefSBob Badour     res64[1] = (atmp[1] >> b) + (maskb[1] >> (b - 1)); //rounded result
8869*80a68eefSBob Badour     if(res64[0]>SINT_MAX) res64[0] = SINT_MAX;
8870*80a68eefSBob Badour     if(res64[0]<SINT_MIN) res64[0] = SINT_MIN;
8871*80a68eefSBob Badour     if(res64[1]>SINT_MAX) res64[1] = SINT_MAX;
8872*80a68eefSBob Badour     if(res64[1]<SINT_MIN) res64[1] = SINT_MIN;
8873*80a68eefSBob Badour     res.m64_i32[0] = (int32_t)res64[0];
8874*80a68eefSBob Badour     res.m64_i32[1] = (int32_t)res64[1];
8875*80a68eefSBob Badour     return res;
8876*80a68eefSBob Badour }
8877*80a68eefSBob Badour 
8878*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vqrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VQRSHRN.s16 d0,q0,#8
8879*80a68eefSBob Badour _NEON2SSE_INLINE uint8x8_t vqrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b) // VQRSHRN.s16 d0,q0,#8
8880*80a68eefSBob Badour {
8881*80a68eefSBob Badour     uint8x8_t res64;
8882*80a68eefSBob Badour     __m128i r16;
8883*80a68eefSBob Badour     r16  = vrshrq_n_u16(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _mm_packus_epi16 (signed 16 to unsigned 8)
8884*80a68eefSBob Badour     r16  = _mm_packus_epi16 (r16,r16); //saturate and  narrow, use low 64 bits only
8885*80a68eefSBob Badour     return64(r16);
8886*80a68eefSBob Badour }
8887*80a68eefSBob Badour 
8888*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vqrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VQRSHRN.U32 d0,q0,#16
8889*80a68eefSBob Badour _NEON2SSE_INLINE uint16x4_t vqrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b) // VQRSHRN.U32 d0,q0,#16
8890*80a68eefSBob Badour {
8891*80a68eefSBob Badour     uint16x4_t res64;
8892*80a68eefSBob Badour     __m128i r32;
8893*80a68eefSBob Badour     r32  = vrshrq_n_u32(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _MM_PACKUS_EPI32 (signed 32 to unsigned 16)
8894*80a68eefSBob Badour     r32  = _MM_PACKUS1_EPI32 (r32); //saturate and  narrow, use low 64 bits only
8895*80a68eefSBob Badour     return64(r32);
8896*80a68eefSBob Badour }
8897*80a68eefSBob Badour 
8898*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vqrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VQRSHRN.U64 d0,q0,#32
8899*80a68eefSBob Badour _NEON2SSE_INLINE uint32x2_t vqrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b)
8900*80a68eefSBob Badour {
8901*80a68eefSBob Badour     //serial solution may be faster
8902*80a68eefSBob Badour     uint32x2_t res64;
8903*80a68eefSBob Badour     __m128i r64, res_hi, zero;
8904*80a68eefSBob Badour     zero = _mm_setzero_si128();
8905*80a68eefSBob Badour     r64  = vrshrq_n_u64(a,b);
8906*80a68eefSBob Badour     res_hi = _mm_srli_epi64(r64,  32);
8907*80a68eefSBob Badour     res_hi = _mm_cmpgt_epi32(res_hi, zero);
8908*80a68eefSBob Badour     r64 = _mm_or_si128(r64, res_hi);
8909*80a68eefSBob Badour     r64 = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
8910*80a68eefSBob Badour     return64(r64);
8911*80a68eefSBob Badour }
8912*80a68eefSBob Badour 
8913*80a68eefSBob Badour //************** Vector widening shift left by constant ****************
8914*80a68eefSBob Badour //************************************************************************
8915*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vshll_n_s8(int8x8_t a, __constrange(0,8) int b); // VSHLL.S8 q0,d0,#0
8916*80a68eefSBob Badour _NEON2SSE_INLINE int16x8_t vshll_n_s8(int8x8_t a, __constrange(0,8) int b) // VSHLL.S8 q0,d0,#0
8917*80a68eefSBob Badour {
8918*80a68eefSBob Badour     __m128i r;
8919*80a68eefSBob Badour     r = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
8920*80a68eefSBob Badour     return _mm_slli_epi16 (r, b);
8921*80a68eefSBob Badour }
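//Worked example (illustration only): vshll_n_s8 of a lane holding -3 with b = 4 first sign-extends to the 16-bit
//value 0xFFFD and then shifts, giving 0xFFD0 = -48; the widened result keeps all bits, so no information is lost.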
8922*80a68eefSBob Badour 
8923*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vshll_n_s16(int16x4_t a, __constrange(0,16) int b); // VSHLL.S16 q0,d0,#0
8924*80a68eefSBob Badour _NEON2SSE_INLINE int32x4_t vshll_n_s16(int16x4_t a, __constrange(0,16) int b) // VSHLL.S16 q0,d0,#0
8925*80a68eefSBob Badour {
8926*80a68eefSBob Badour     __m128i r;
8927*80a68eefSBob Badour     r =  _MM_CVTEPI16_EPI32(_pM128i(a)); //SSE4.1,
8928*80a68eefSBob Badour     return _mm_slli_epi32 (r, b);
8929*80a68eefSBob Badour }
8930*80a68eefSBob Badour 
8931*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vshll_n_s32(int32x2_t a, __constrange(0,32) int b); // VSHLL.S32 q0,d0,#0
8932*80a68eefSBob Badour _NEON2SSE_INLINE int64x2_t vshll_n_s32(int32x2_t a, __constrange(0,32) int b) // VSHLL.S32 q0,d0,#0
8933*80a68eefSBob Badour {
8934*80a68eefSBob Badour     __m128i r;
8935*80a68eefSBob Badour     r =  _MM_CVTEPI32_EPI64(_pM128i(a)); //SSE4.1,
8936*80a68eefSBob Badour     return _mm_slli_epi64 (r, b);
8937*80a68eefSBob Badour }
8938*80a68eefSBob Badour 
8939*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vshll_n_u8(uint8x8_t a, __constrange(0,8) int b); // VSHLL.U8 q0,d0,#0
8940*80a68eefSBob Badour _NEON2SSE_INLINE uint16x8_t vshll_n_u8(uint8x8_t a, __constrange(0,8) int b) // VSHLL.U8 q0,d0,#0
8941*80a68eefSBob Badour {
8942*80a68eefSBob Badour     //no uint8 to uint16 conversion available, manual conversion used
8943*80a68eefSBob Badour     __m128i zero,  r;
8944*80a68eefSBob Badour     zero = _mm_setzero_si128 ();
8945*80a68eefSBob Badour     r = _mm_unpacklo_epi8(_pM128i(a), zero);
8946*80a68eefSBob Badour     return _mm_slli_epi16 (r, b);
8947*80a68eefSBob Badour }
8948*80a68eefSBob Badour 
8949*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vshll_n_u16(uint16x4_t a, __constrange(0,16) int b); // VSHLL.s16 q0,d0,#0
8950*80a68eefSBob Badour _NEON2SSE_INLINE uint32x4_t vshll_n_u16(uint16x4_t a, __constrange(0,16) int b) // VSHLL.s16 q0,d0,#0
8951*80a68eefSBob Badour {
8952*80a68eefSBob Badour     //no uint16 to uint32 conversion available, manual conversion used
8953*80a68eefSBob Badour     __m128i zero,  r;
8954*80a68eefSBob Badour     zero = _mm_setzero_si128 ();
8955*80a68eefSBob Badour     r = _mm_unpacklo_epi16(_pM128i(a), zero);
8956*80a68eefSBob Badour     return _mm_slli_epi32 (r, b);
8957*80a68eefSBob Badour }
8958*80a68eefSBob Badour 
8959*80a68eefSBob Badour _NEON2SSESTORAGE uint64x2_t vshll_n_u32(uint32x2_t a, __constrange(0,32) int b); // VSHLL.U32 q0,d0,#0
8960*80a68eefSBob Badour _NEON2SSE_INLINE uint64x2_t vshll_n_u32(uint32x2_t a, __constrange(0,32) int b) // VSHLL.U32 q0,d0,#0
8961*80a68eefSBob Badour {
8962*80a68eefSBob Badour     //no uint32 to uint64 conversion available, manual conversion used
8963*80a68eefSBob Badour     __m128i zero,  r;
8964*80a68eefSBob Badour     zero = _mm_setzero_si128 ();
8965*80a68eefSBob Badour     r = _mm_unpacklo_epi32(_pM128i(a), zero);
8966*80a68eefSBob Badour     return _mm_slli_epi64 (r, b);
8967*80a68eefSBob Badour }
8968*80a68eefSBob Badour 
8969*80a68eefSBob Badour //************************************************************************************
8970*80a68eefSBob Badour //**************************** Shifts with insert ************************************
8971*80a68eefSBob Badour //************************************************************************************
8972*80a68eefSBob Badour //takes each element in a vector, shifts it by an immediate value,
8973*80a68eefSBob Badour //and inserts the result into the destination vector. Bits shifted out of each element are lost.
8974*80a68eefSBob Badour 
8975*80a68eefSBob Badour //**************** Vector shift right and insert ************************************
8976*80a68eefSBob Badour //Only the "c" leftmost (most significant) bits of "a" remain after the operation;
8977*80a68eefSBob Badour //all other bits are taken from "b" shifted right by "c".
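//Worked 8-bit example (illustration only): for a = 0xB6 (10110110b), b = 0xFF and c = 3 the result keeps the
//top 3 bits of a (101xxxxxb) and takes the rest from b >> 3 = 00011111b, giving 10111111b = 0xBF.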
8978*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vsri_n_s8(int8x8_t a,  int8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
8979*80a68eefSBob Badour _NEON2SSE_INLINE int8x8_t vsri_n_s8(int8x8_t a,  int8x8_t b, __constrange(1,8) int c)
8980*80a68eefSBob Badour {
8981*80a68eefSBob Badour     int8x8_t res64;
8982*80a68eefSBob Badour     return64(vsriq_n_s8(_pM128i(a),_pM128i(b), c));
8983*80a68eefSBob Badour }
8984*80a68eefSBob Badour 
8985*80a68eefSBob Badour 
8986*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vsri_n_s16(int16x4_t a,  int16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
8987*80a68eefSBob Badour _NEON2SSE_INLINE int16x4_t vsri_n_s16(int16x4_t a,  int16x4_t b, __constrange(1,16) int c)
8988*80a68eefSBob Badour {
8989*80a68eefSBob Badour     int16x4_t res64;
8990*80a68eefSBob Badour     return64(vsriq_n_s16(_pM128i(a),_pM128i(b), c));
8991*80a68eefSBob Badour }
8992*80a68eefSBob Badour 
8993*80a68eefSBob Badour 
8994*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vsri_n_s32(int32x2_t a,  int32x2_t b, __constrange(1,32) int c); // VSRI.32 d0,d0,#32
8995*80a68eefSBob Badour _NEON2SSE_INLINE int32x2_t vsri_n_s32(int32x2_t a,  int32x2_t b, __constrange(1,32) int c)
8996*80a68eefSBob Badour {
8997*80a68eefSBob Badour     int32x2_t res64;
8998*80a68eefSBob Badour     return64(vsriq_n_s32(_pM128i(a),_pM128i(b), c));
8999*80a68eefSBob Badour }
9000*80a68eefSBob Badour 
9001*80a68eefSBob Badour 
9002*80a68eefSBob Badour _NEON2SSESTORAGE int64x1_t vsri_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VSRI.64 d0,d0,#64
9003*80a68eefSBob Badour _NEON2SSE_INLINE int64x1_t vsri_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c)
9004*80a68eefSBob Badour {
9005*80a68eefSBob Badour     int64x1_t res;
9006*80a68eefSBob Badour     if (c ==64)
9007*80a68eefSBob Badour         res = a;
9008*80a68eefSBob Badour     else{
9009*80a68eefSBob Badour         res.m64_u64[0] = (b.m64_u64[0] >> c) | ((a.m64_u64[0] >> (64 - c)) << (64 - c)); //treat a and b as unsigned so the shifts bring in leading zeros
9010*80a68eefSBob Badour     }
9011*80a68eefSBob Badour     return res;
9012*80a68eefSBob Badour }
9013*80a68eefSBob Badour 
9014*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vsri_n_u8(uint8x8_t a,  uint8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
9015*80a68eefSBob Badour #define vsri_n_u8 vsri_n_s8
9016*80a68eefSBob Badour 
9017*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vsri_n_u16(uint16x4_t a,  uint16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
9018*80a68eefSBob Badour #define vsri_n_u16 vsri_n_s16
9019*80a68eefSBob Badour 
9020*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vsri_n_u32(uint32x2_t a,  uint32x2_t b, __constrange(1,32) int c); // VSRI.32 d0,d0,#32
9021*80a68eefSBob Badour #define vsri_n_u32 vsri_n_s32
9022*80a68eefSBob Badour 
9023*80a68eefSBob Badour 
9024*80a68eefSBob Badour _NEON2SSESTORAGE uint64x1_t vsri_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VSRI.64 d0,d0,#64
9025*80a68eefSBob Badour #define vsri_n_u64 vsri_n_s64
9026*80a68eefSBob Badour 
9027*80a68eefSBob Badour _NEON2SSESTORAGE poly8x8_t vsri_n_p8(poly8x8_t a, poly8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
9028*80a68eefSBob Badour #define vsri_n_p8 vsri_n_u8
9029*80a68eefSBob Badour 
9030*80a68eefSBob Badour _NEON2SSESTORAGE poly16x4_t vsri_n_p16(poly16x4_t a, poly16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
9031*80a68eefSBob Badour #define vsri_n_p16 vsri_n_u16
9032*80a68eefSBob Badour 
9033*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t vsriq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
9034*80a68eefSBob Badour _NEON2SSE_INLINE int8x16_t vsriq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c) // VSRI.8 q0,q0,#8
9035*80a68eefSBob Badour {
9036*80a68eefSBob Badour     __m128i maskA, a_masked;
9037*80a68eefSBob Badour     uint8x16_t b_shift;
9038*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 uint8_t maskLeft[9] = {0x0, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff}; //"a" bits mask, 0 bit not used
9039*80a68eefSBob Badour     maskA = _mm_set1_epi8(maskLeft[c]); // c ones and (8-c)zeros
9040*80a68eefSBob Badour     a_masked = _mm_and_si128 (a, maskA);
9041*80a68eefSBob Badour     b_shift = vshrq_n_u8( b, c); // c zeros on the left in b due to logical shift
9042*80a68eefSBob Badour     return _mm_or_si128 (a_masked, b_shift); //combine (insert b into a)
9043*80a68eefSBob Badour }
9044*80a68eefSBob Badour 
9045*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vsriq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
9046*80a68eefSBob Badour _NEON2SSE_INLINE int16x8_t vsriq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c) // VSRI.16 q0,q0,#16
9047*80a68eefSBob Badour {
9048*80a68eefSBob Badour     //to keep only the "c" leftmost bits of a we shift right and then shift back left, zeroing the low (16-c) bits of a
9049*80a68eefSBob Badour     uint16x8_t b_shift;
9050*80a68eefSBob Badour     uint16x8_t a_c;
9051*80a68eefSBob Badour     b_shift = vshrq_n_u16( b, c); // c zeros on the left in b due to logical shift
9052*80a68eefSBob Badour     a_c = vshrq_n_u16( a, (16 - c));
9053*80a68eefSBob Badour     a_c  = _mm_slli_epi16(a_c, (16 - c)); //the shift pair zeroes the low (16-c) bits of a
9054*80a68eefSBob Badour     return _mm_or_si128 (a_c, b_shift); //combine (insert b into a)
9055*80a68eefSBob Badour }
9056*80a68eefSBob Badour 
9057*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vsriq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VSRI.32 q0,q0,#32
9058*80a68eefSBob Badour _NEON2SSE_INLINE int32x4_t vsriq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c) // VSRI.32 q0,q0,#32
9059*80a68eefSBob Badour {
9060*80a68eefSBob Badour     //to keep only the "c" leftmost bits of a we shift right and then shift back left, zeroing the low (32-c) bits of a
9061*80a68eefSBob Badour     uint32x4_t b_shift;
9062*80a68eefSBob Badour     uint32x4_t a_c;
9063*80a68eefSBob Badour     b_shift = vshrq_n_u32( b, c); // c zeros on the left in b due to logical shift
9064*80a68eefSBob Badour     a_c = vshrq_n_u32( a, (32 - c));
9065*80a68eefSBob Badour     a_c  = _mm_slli_epi32(a_c, (32 - c)); //the shift pair zeroes the low (32-c) bits of a
9066*80a68eefSBob Badour     return _mm_or_si128 (a_c, b_shift); //combine (insert b into a)
9067*80a68eefSBob Badour }
9068*80a68eefSBob Badour 
9069*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vsriq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VSRI.64 q0,q0,#64
9070*80a68eefSBob Badour _NEON2SSE_INLINE int64x2_t vsriq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c)
9071*80a68eefSBob Badour {
9072*80a68eefSBob Badour     //serial solution may be faster
9073*80a68eefSBob Badour     uint64x2_t b_shift;
9074*80a68eefSBob Badour     uint64x2_t a_c;
9075*80a68eefSBob Badour     b_shift = _mm_srli_epi64(b, c); // c zeros on the left in b due to logical shift
9076*80a68eefSBob Badour     a_c = _mm_srli_epi64(a, (64 - c));
9077*80a68eefSBob Badour     a_c  = _mm_slli_epi64(a_c, (64 - c)); //the shift pair zeroes the low (64-c) bits of a
9078*80a68eefSBob Badour     return _mm_or_si128 (a_c, b_shift); //combine (insert b into a)
9079*80a68eefSBob Badour }
9080*80a68eefSBob Badour 
9081*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vsriq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
9082*80a68eefSBob Badour #define vsriq_n_u8 vsriq_n_s8
9083*80a68eefSBob Badour 
9084*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vsriq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
9085*80a68eefSBob Badour #define vsriq_n_u16 vsriq_n_s16
9086*80a68eefSBob Badour 
9087*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vsriq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VSRI.32 q0,q0,#32
9088*80a68eefSBob Badour #define vsriq_n_u32 vsriq_n_s32
9089*80a68eefSBob Badour 
9090*80a68eefSBob Badour _NEON2SSESTORAGE uint64x2_t vsriq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VSRI.64 q0,q0,#64
9091*80a68eefSBob Badour #define vsriq_n_u64 vsriq_n_s64
9092*80a68eefSBob Badour 
9093*80a68eefSBob Badour _NEON2SSESTORAGE poly8x16_t vsriq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
9094*80a68eefSBob Badour #define vsriq_n_p8 vsriq_n_u8
9095*80a68eefSBob Badour 
9096*80a68eefSBob Badour _NEON2SSESTORAGE poly16x8_t vsriq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
9097*80a68eefSBob Badour #define vsriq_n_p16 vsriq_n_u16
9098*80a68eefSBob Badour 
9099*80a68eefSBob Badour //***** Vector shift left and insert *********************************************
9100*80a68eefSBob Badour //*********************************************************************************
9101*80a68eefSBob Badour //Only the "c" rightmost (least significant) bits of "a" remain after the operation; all other bits are taken
9102*80a68eefSBob Badour //from "b" shifted left by "c". The left shift fills the low bits of "b" with zeros, so "a" and the shifted "b" just need to be combined (ORed).
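//Worked 8-bit example (illustration only): for a = 0xB6 (10110110b), b = 0x07 and c = 3 the result keeps the
//low 3 bits of a (xxxxx110b) and takes the rest from b << 3 = 00111000b, giving 00111110b = 0x3E.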
9103*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vsli_n_s8(int8x8_t a,  int8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
9104*80a68eefSBob Badour _NEON2SSE_INLINE int8x8_t vsli_n_s8(int8x8_t a,  int8x8_t b, __constrange(0,7) int c)
9105*80a68eefSBob Badour {
9106*80a68eefSBob Badour     int8x8_t res64;
9107*80a68eefSBob Badour     return64(vsliq_n_s8(_pM128i(a),_pM128i(b), c));
9108*80a68eefSBob Badour }
9109*80a68eefSBob Badour 
9110*80a68eefSBob Badour 
9111*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vsli_n_s16(int16x4_t a,  int16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
9112*80a68eefSBob Badour _NEON2SSE_INLINE int16x4_t vsli_n_s16(int16x4_t a,  int16x4_t b, __constrange(0,15) int c)
9113*80a68eefSBob Badour {
9114*80a68eefSBob Badour     int16x4_t res64;
9115*80a68eefSBob Badour     return64(vsliq_n_s16(_pM128i(a),_pM128i(b), c));
9116*80a68eefSBob Badour }
9117*80a68eefSBob Badour 
9118*80a68eefSBob Badour 
9119*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vsli_n_s32(int32x2_t a,  int32x2_t b, __constrange(0,31) int c); // VSLI.32 d0,d0,#0
9120*80a68eefSBob Badour _NEON2SSE_INLINE int32x2_t vsli_n_s32(int32x2_t a,  int32x2_t b, __constrange(0,31) int c)
9121*80a68eefSBob Badour {
9122*80a68eefSBob Badour     int32x2_t res64;
9123*80a68eefSBob Badour     return64(vsliq_n_s32(_pM128i(a),_pM128i(b), c));
9124*80a68eefSBob Badour }
9125*80a68eefSBob Badour 
9126*80a68eefSBob Badour _NEON2SSESTORAGE int64x1_t vsli_n_s64(int64x1_t a, int64x1_t b, __constrange(0,63) int c); // VSLI.64 d0,d0,#0
9127*80a68eefSBob Badour _NEON2SSE_INLINE int64x1_t vsli_n_s64(int64x1_t a, int64x1_t b, __constrange(0,63) int c)
9128*80a68eefSBob Badour {
9129*80a68eefSBob Badour     int64x1_t res;
9130*80a68eefSBob Badour     res.m64_u64[0] = (c == 0) ? b.m64_u64[0] : ((b.m64_u64[0] << c) | ((a.m64_u64[0] << (64 - c)) >> (64 - c))); //treat a and b as unsigned to get zero fill; c == 0 keeps no bits of a (and avoids an undefined shift by 64)
9131*80a68eefSBob Badour     return res;
9132*80a68eefSBob Badour }
9133*80a68eefSBob Badour 
9134*80a68eefSBob Badour 
9135*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vsli_n_u8(uint8x8_t a,  uint8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
9136*80a68eefSBob Badour #define vsli_n_u8 vsli_n_s8
9137*80a68eefSBob Badour 
9138*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vsli_n_u16(uint16x4_t a,  uint16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
9139*80a68eefSBob Badour #define vsli_n_u16 vsli_n_s16
9140*80a68eefSBob Badour 
9141*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vsli_n_u32(uint32x2_t a,  uint32x2_t b, __constrange(0,31) int c); // VSLI.32 d0,d0,#0
9142*80a68eefSBob Badour #define vsli_n_u32 vsli_n_s32
9143*80a68eefSBob Badour 
9144*80a68eefSBob Badour _NEON2SSESTORAGE uint64x1_t vsli_n_u64(uint64x1_t a, uint64x1_t b, __constrange(0,63) int c); // VSLI.64 d0,d0,#0
9145*80a68eefSBob Badour #define vsli_n_u64 vsli_n_s64
9146*80a68eefSBob Badour 
9147*80a68eefSBob Badour _NEON2SSESTORAGE poly8x8_t vsli_n_p8(poly8x8_t a, poly8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
9148*80a68eefSBob Badour #define vsli_n_p8 vsli_n_u8
9149*80a68eefSBob Badour 
9150*80a68eefSBob Badour _NEON2SSESTORAGE poly16x4_t vsli_n_p16(poly16x4_t a, poly16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
9151*80a68eefSBob Badour #define vsli_n_p16 vsli_n_u16
9152*80a68eefSBob Badour 
9153*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t vsliq_n_s8(int8x16_t a, int8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
9154*80a68eefSBob Badour _NEON2SSE_INLINE int8x16_t vsliq_n_s8(int8x16_t a, int8x16_t b, __constrange(0,7) int c) // VSLI.8 q0,q0,#0
9155*80a68eefSBob Badour {
9156*80a68eefSBob Badour     __m128i maskA, a_masked;
9157*80a68eefSBob Badour     int8x16_t b_shift;
9158*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 uint8_t maskRight[8] = {0x0, 0x1, 0x3, 0x7, 0x0f, 0x1f, 0x3f, 0x7f}; //"a" bits mask
9159*80a68eefSBob Badour     maskA = _mm_set1_epi8(maskRight[c]); // (8-c)zeros and c ones
9160*80a68eefSBob Badour     b_shift = vshlq_n_s8( b, c);
9161*80a68eefSBob Badour     a_masked = _mm_and_si128 (a, maskA);
9162*80a68eefSBob Badour     return _mm_or_si128 (b_shift, a_masked); //combine (insert b into a)
9163*80a68eefSBob Badour }
9164*80a68eefSBob Badour 
9165*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vsliq_n_s16(int16x8_t a, int16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
9166*80a68eefSBob Badour _NEON2SSE_INLINE int16x8_t vsliq_n_s16(int16x8_t a, int16x8_t b, __constrange(0,15) int c) // VSLI.16 q0,q0,#0
9167*80a68eefSBob Badour {
9168*80a68eefSBob Badour     //to keep only the "c" rightmost bits of a we shift left and then logical shift back right, zeroing the high (16-c) bits of a
9169*80a68eefSBob Badour     int16x8_t b_shift;
9170*80a68eefSBob Badour     int16x8_t a_c;
9171*80a68eefSBob Badour     b_shift = vshlq_n_s16( b, c);
9172*80a68eefSBob Badour     a_c = vshlq_n_s16( a, (16 - c));
9173*80a68eefSBob Badour     a_c  = _mm_srli_epi16(a_c, (16 - c));
9174*80a68eefSBob Badour     return _mm_or_si128 (b_shift, a_c); //combine (insert b into a)
9175*80a68eefSBob Badour }
9176*80a68eefSBob Badour 
9177*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vsliq_n_s32(int32x4_t a, int32x4_t b, __constrange(0,31) int c); // VSLI.32 q0,q0,#0
9178*80a68eefSBob Badour _NEON2SSE_INLINE int32x4_t vsliq_n_s32(int32x4_t a, int32x4_t b, __constrange(0,31) int c) // VSLI.32 q0,q0,#0
9179*80a68eefSBob Badour {
9180*80a68eefSBob Badour     //this solution may not be optimal compared with the serial one
9181*80a68eefSBob Badour     //to keep only the "c" rightmost bits of a we shift left and then logical shift back right, zeroing the high (32-c) bits of a
9182*80a68eefSBob Badour     int32x4_t b_shift;
9183*80a68eefSBob Badour     int32x4_t a_c;
9184*80a68eefSBob Badour     b_shift = vshlq_n_s32( b, c);
9185*80a68eefSBob Badour     a_c = vshlq_n_s32( a, (32 - c));
9186*80a68eefSBob Badour     a_c  = _mm_srli_epi32(a_c, (32 - c));
9187*80a68eefSBob Badour     return _mm_or_si128 (b_shift, a_c); //combine (insert b into a)
9188*80a68eefSBob Badour }
9189*80a68eefSBob Badour 
9190*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vsliq_n_s64(int64x2_t a, int64x2_t b, __constrange(0,63) int c); // VSLI.64 q0,q0,#0
9191*80a68eefSBob Badour _NEON2SSE_INLINE int64x2_t vsliq_n_s64(int64x2_t a, int64x2_t b, __constrange(0,63) int c) // VSLI.64 q0,q0,#0
9192*80a68eefSBob Badour {
9193*80a68eefSBob Badour     //this solution may not be optimal compared with the serial one
9194*80a68eefSBob Badour     //to keep only the "c" rightmost bits of a we shift left and then logical shift back right, zeroing the high (64-c) bits of a
9195*80a68eefSBob Badour     int64x2_t b_shift;
9196*80a68eefSBob Badour     int64x2_t a_c;
9197*80a68eefSBob Badour     b_shift = vshlq_n_s64( b, c);
9198*80a68eefSBob Badour     a_c = vshlq_n_s64( a, (64 - c));
9199*80a68eefSBob Badour     a_c  = _mm_srli_epi64(a_c, (64 - c));
9200*80a68eefSBob Badour     return _mm_or_si128 (b_shift, a_c); //combine (insert b into a)
9201*80a68eefSBob Badour }
9202*80a68eefSBob Badour 
9203*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vsliq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
9204*80a68eefSBob Badour #define vsliq_n_u8 vsliq_n_s8
9205*80a68eefSBob Badour 
9206*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vsliq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
9207*80a68eefSBob Badour #define vsliq_n_u16 vsliq_n_s16
9208*80a68eefSBob Badour 
9209*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vsliq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(0,31) int c); // VSLI.32 q0,q0,#0
9210*80a68eefSBob Badour #define vsliq_n_u32 vsliq_n_s32
9211*80a68eefSBob Badour 
9212*80a68eefSBob Badour _NEON2SSESTORAGE uint64x2_t vsliq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(0,63) int c); // VSLI.64 q0,q0,#0
9213*80a68eefSBob Badour #define vsliq_n_u64 vsliq_n_s64
9214*80a68eefSBob Badour 
9215*80a68eefSBob Badour _NEON2SSESTORAGE poly8x16_t vsliq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
9216*80a68eefSBob Badour #define vsliq_n_p8 vsliq_n_u8
9217*80a68eefSBob Badour 
9218*80a68eefSBob Badour _NEON2SSESTORAGE poly16x8_t vsliq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
9219*80a68eefSBob Badour #define vsliq_n_p16 vsliq_n_u16
9220*80a68eefSBob Badour 
9221*80a68eefSBob Badour // ***********************************************************************************************
9222*80a68eefSBob Badour // ****************** Loads and stores of a single vector ***************************************
9223*80a68eefSBob Badour // ***********************************************************************************************
9224*80a68eefSBob Badour //Performs loads and stores of a single vector of some type.
9225*80a68eefSBob Badour //*******************************  Loads ********************************************************
9226*80a68eefSBob Badour // ***********************************************************************************************
9227*80a68eefSBob Badour //In the general case we assume ptr is NOT aligned and use __m128i _mm_loadu_si128 ((__m128i*) ptr).
9228*80a68eefSBob Badour //On SSE3-capable systems the __m128i _mm_lddqu_si128 (__m128i const* p) load may be advantageous for unaligned access:
9229*80a68eefSBob Badour //it loads a 32-byte block aligned on a 16-byte boundary and extracts the 16 bytes corresponding to the unaligned address.
9230*80a68eefSBob Badour //If ptr is known to be aligned, __m128i _mm_load_si128 ((__m128i*) ptr) could be used instead.
9231*80a68eefSBob Badour #define LOAD_SI128(ptr) \
9232*80a68eefSBob Badour         ( ((uintptr_t)(ptr) & 15) == 0 ) ? _mm_load_si128((__m128i*)(ptr)) : _mm_loadu_si128((__m128i*)(ptr))
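//Illustrative usage sketch, not part of the API (kept under #if 0): the vld1q_* load macros defined just below
//expand to LOAD_SI128, so the same source works for aligned and unaligned pointers, only the chosen instruction differs.
#if 0
static void neon2sse_example_vld1q(void)
{
    _NEON2SSE_ALIGN_16 uint8_t aligned_buf[16] = {0};
    uint8_t plain_buf[17] = {0};
    uint8x16_t v0 = vld1q_u8(aligned_buf);     //16-byte aligned pointer -> _mm_load_si128
    uint8x16_t v1 = vld1q_u8(plain_buf + 1);   //unaligned pointer       -> _mm_loadu_si128
    (void) v0; (void) v1;
}
#endif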
9233*80a68eefSBob Badour 
9234*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vld1q_u8(__transfersize(16) uint8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
9235*80a68eefSBob Badour #define vld1q_u8 LOAD_SI128
9236*80a68eefSBob Badour 
9237*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vld1q_u16(__transfersize(8) uint16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
9238*80a68eefSBob Badour #define vld1q_u16 LOAD_SI128
9239*80a68eefSBob Badour 
9240*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vld1q_u32(__transfersize(4) uint32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
9241*80a68eefSBob Badour #define vld1q_u32 LOAD_SI128
9242*80a68eefSBob Badour 
9243*80a68eefSBob Badour _NEON2SSESTORAGE uint64x2_t vld1q_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
9244*80a68eefSBob Badour #define vld1q_u64 LOAD_SI128
9245*80a68eefSBob Badour 
9246*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t vld1q_s8(__transfersize(16) int8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
9247*80a68eefSBob Badour #define vld1q_s8 LOAD_SI128
9248*80a68eefSBob Badour 
9249*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vld1q_s16(__transfersize(8) int16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
9250*80a68eefSBob Badour #define vld1q_s16 LOAD_SI128
9251*80a68eefSBob Badour 
9252*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vld1q_s32(__transfersize(4) int32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
9253*80a68eefSBob Badour #define vld1q_s32 LOAD_SI128
9254*80a68eefSBob Badour 
9255*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vld1q_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
9256*80a68eefSBob Badour #define vld1q_s64 LOAD_SI128
9257*80a68eefSBob Badour 
9258*80a68eefSBob Badour _NEON2SSESTORAGE float16x8_t vld1q_f16(__transfersize(8) __fp16 const * ptr); // VLD1.16 {d0, d1}, [r0]
9259*80a68eefSBob Badour // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers
9260*80a68eefSBob Badour /* _NEON2SSE_INLINE float16x8_t vld1q_f16(__transfersize(8) __fp16 const * ptr)// VLD1.16 {d0, d1}, [r0]
9261*80a68eefSBob Badour {__m128 f1 = _mm_set_ps (ptr[3], ptr[2], ptr[1], ptr[0]);
9262*80a68eefSBob Badour __m128 f2;
9263*80a68eefSBob Badour f2 = _mm_set_ps (ptr[7], ptr[6], ptr[5], ptr[4]);
9264*80a68eefSBob Badour }*/
9265*80a68eefSBob Badour 
9266*80a68eefSBob Badour _NEON2SSESTORAGE float32x4_t vld1q_f32(__transfersize(4) float32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
9267*80a68eefSBob Badour _NEON2SSE_INLINE float32x4_t vld1q_f32(__transfersize(4) float32_t const * ptr)
9268*80a68eefSBob Badour {
9269*80a68eefSBob Badour     if( (((uintptr_t)(ptr)) & 15 ) == 0 ) //16-byte aligned
9270*80a68eefSBob Badour         return _mm_load_ps(ptr);
9271*80a68eefSBob Badour     else
9272*80a68eefSBob Badour         return _mm_loadu_ps(ptr);
9273*80a68eefSBob Badour }
9274*80a68eefSBob Badour 
9275*80a68eefSBob Badour _NEON2SSESTORAGE poly8x16_t vld1q_p8(__transfersize(16) poly8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
9276*80a68eefSBob Badour #define vld1q_p8  LOAD_SI128
9277*80a68eefSBob Badour 
9278*80a68eefSBob Badour _NEON2SSESTORAGE poly16x8_t vld1q_p16(__transfersize(8) poly16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
9279*80a68eefSBob Badour #define vld1q_p16 LOAD_SI128
9280*80a68eefSBob Badour 
9281*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vld1_u8(__transfersize(8) uint8_t const * ptr); // VLD1.8 {d0}, [r0]
9282*80a68eefSBob Badour #define vld1_u8(ptr)  *((__m64_128*)(ptr)) //was _mm_loadl_epi64((__m128i*)(ptr))
9283*80a68eefSBob Badour 
9284*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vld1_u16(__transfersize(4) uint16_t const * ptr); // VLD1.16 {d0}, [r0]
9285*80a68eefSBob Badour #define vld1_u16 vld1_u8
9286*80a68eefSBob Badour 
9287*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vld1_u32(__transfersize(2) uint32_t const * ptr); // VLD1.32 {d0}, [r0]
9288*80a68eefSBob Badour #define vld1_u32 vld1_u8
9289*80a68eefSBob Badour 
9290*80a68eefSBob Badour 
9291*80a68eefSBob Badour _NEON2SSESTORAGE uint64x1_t vld1_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
9292*80a68eefSBob Badour #define vld1_u64 vld1_u8
9293*80a68eefSBob Badour 
9294*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vld1_s8(__transfersize(8) int8_t const * ptr); // VLD1.8 {d0}, [r0]
9295*80a68eefSBob Badour #define vld1_s8 vld1_u8
9296*80a68eefSBob Badour 
9297*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vld1_s16(__transfersize(4) int16_t const * ptr); // VLD1.16 {d0}, [r0]
9298*80a68eefSBob Badour #define vld1_s16 vld1_u16
9299*80a68eefSBob Badour 
9300*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vld1_s32(__transfersize(2) int32_t const * ptr); // VLD1.32 {d0}, [r0]
9301*80a68eefSBob Badour #define vld1_s32 vld1_u32
9302*80a68eefSBob Badour 
9303*80a68eefSBob Badour _NEON2SSESTORAGE int64x1_t vld1_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
9304*80a68eefSBob Badour #define vld1_s64 vld1_u64
9305*80a68eefSBob Badour 
9306*80a68eefSBob Badour _NEON2SSESTORAGE float16x4_t vld1_f16(__transfersize(4) __fp16 const * ptr); // VLD1.16 {d0}, [r0]
9307*80a68eefSBob Badour // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit like _mm_set_ps (ptr[3], ptr[2], ptr[1], ptr[0]);
9308*80a68eefSBob Badour 
9309*80a68eefSBob Badour _NEON2SSESTORAGE float32x2_t vld1_f32(__transfersize(2) float32_t const * ptr); // VLD1.32 {d0}, [r0]
9310*80a68eefSBob Badour _NEON2SSE_INLINE float32x2_t vld1_f32(__transfersize(2) float32_t const * ptr)
9311*80a68eefSBob Badour {
9312*80a68eefSBob Badour     float32x2_t res;
9313*80a68eefSBob Badour     res.m64_f32[0] = *(ptr);
9314*80a68eefSBob Badour     res.m64_f32[1] = *(ptr + 1);
9315*80a68eefSBob Badour     return res;
9316*80a68eefSBob Badour }
9317*80a68eefSBob Badour 
9318*80a68eefSBob Badour _NEON2SSESTORAGE poly8x8_t vld1_p8(__transfersize(8) poly8_t const * ptr); // VLD1.8 {d0}, [r0]
9319*80a68eefSBob Badour #define vld1_p8 vld1_u8
9320*80a68eefSBob Badour 
9321*80a68eefSBob Badour _NEON2SSESTORAGE poly16x4_t vld1_p16(__transfersize(4) poly16_t const * ptr); // VLD1.16 {d0}, [r0]
9322*80a68eefSBob Badour #define vld1_p16 vld1_u16
9323*80a68eefSBob Badour 
9324*80a68eefSBob Badour 
9325*80a68eefSBob Badour _NEON2SSESTORAGE float64x2_t vld1q_f64(__transfersize(2) float64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
9326*80a68eefSBob Badour _NEON2SSE_INLINE float64x2_t vld1q_f64(__transfersize(2) float64_t const * ptr)
9327*80a68eefSBob Badour {
9328*80a68eefSBob Badour     if ((((uintptr_t)(ptr)) & 15) == 0) //16 bytes aligned
9329*80a68eefSBob Badour         return _mm_load_pd(ptr);
9330*80a68eefSBob Badour     else
9331*80a68eefSBob Badour         return _mm_loadu_pd(ptr);
9332*80a68eefSBob Badour }
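
//Usage sketch (illustrative, helper name hypothetical): thanks to the run-time alignment check above,
//the same call serves both aligned and unaligned pointers.
_NEON2SSE_INLINE float64x2_t _neon2sse_example_load2_f64(float64_t const * p)
{
    return vld1q_f64(p); //dispatches to _mm_load_pd or _mm_loadu_pd depending on the address
}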
9333*80a68eefSBob Badour 
9334*80a68eefSBob Badour 
9335*80a68eefSBob Badour //***********************************************************************************************************
9336*80a68eefSBob Badour //******* Lane load functions - insert the data at  vector's given position (lane) *************************
9337*80a68eefSBob Badour //***********************************************************************************************************
9338*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vld1q_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x16_t vec, __constrange(0,15) int lane); // VLD1.8 {d0[0]}, [r0]
9339*80a68eefSBob Badour #define vld1q_lane_u8(ptr, vec, lane) _MM_INSERT_EPI8(vec, *(ptr), lane)
9340*80a68eefSBob Badour 
9341*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vld1q_lane_u16(__transfersize(1)    uint16_t const * ptr, uint16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
9342*80a68eefSBob Badour #define vld1q_lane_u16(ptr, vec, lane) _MM_INSERT_EPI16(vec, *(ptr), lane)
9343*80a68eefSBob Badour 
9344*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vld1q_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0]
9345*80a68eefSBob Badour #define vld1q_lane_u32(ptr, vec, lane) _MM_INSERT_EPI32(vec, *(ptr), lane)
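
//Usage sketch (illustrative, helper name hypothetical): as with the NEON intrinsic, the lane index must be
//a compile-time constant; here lane 1 of vec is replaced by the value loaded from memory.
_NEON2SSE_INLINE uint32x4_t _neon2sse_example_load_lane1_u32(uint32_t const * p, uint32x4_t vec)
{
    return vld1q_lane_u32(p, vec, 1); //lanes 0, 2 and 3 keep their previous values
}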
9346*80a68eefSBob Badour 
9347*80a68eefSBob Badour _NEON2SSESTORAGE uint64x2_t vld1q_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x2_t vec, __constrange(0,1) int lane); // VLD1.64 {d0}, [r0]
9348*80a68eefSBob Badour #define vld1q_lane_u64(ptr, vec, lane) _MM_INSERT_EPI64(vec, *(ptr), lane)
9349*80a68eefSBob Badour 
9350*80a68eefSBob Badour 
9351*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t vld1q_lane_s8(__transfersize(1) int8_t const * ptr, int8x16_t vec, __constrange(0,15) int lane); // VLD1.8 {d0[0]}, [r0]
9352*80a68eefSBob Badour #define vld1q_lane_s8(ptr, vec, lane) _MM_INSERT_EPI8(vec, *(ptr), lane)
9353*80a68eefSBob Badour 
9354*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vld1q_lane_s16(__transfersize(1) int16_t const * ptr, int16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
9355*80a68eefSBob Badour #define vld1q_lane_s16(ptr, vec, lane) _MM_INSERT_EPI16(vec, *(ptr), lane)
9356*80a68eefSBob Badour 
9357*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vld1q_lane_s32(__transfersize(1) int32_t const * ptr, int32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0]
9358*80a68eefSBob Badour #define vld1q_lane_s32(ptr, vec, lane) _MM_INSERT_EPI32(vec, *(ptr), lane)
9359*80a68eefSBob Badour 
9360*80a68eefSBob Badour _NEON2SSESTORAGE float16x8_t vld1q_lane_f16(__transfersize(1) __fp16 const * ptr, float16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
9361*80a68eefSBob Badour //current IA SIMD doesn't support float16
9362*80a68eefSBob Badour 
9363*80a68eefSBob Badour _NEON2SSESTORAGE float32x4_t vld1q_lane_f32(__transfersize(1) float32_t const * ptr, float32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0]
9364*80a68eefSBob Badour _NEON2SSE_INLINE float32x4_t vld1q_lane_f32(__transfersize(1) float32_t const * ptr, float32x4_t vec, __constrange(0,3) int lane)
9365*80a68eefSBob Badour {
9366*80a68eefSBob Badour     //ptr may be unaligned, so load the scalar separately and then insert it into the requested lane
9367*80a68eefSBob Badour     __m128 p;
9368*80a68eefSBob Badour     p = _mm_set1_ps(*(ptr));
9369*80a68eefSBob Badour     return _MM_INSERT_PS(vec,  p, _INSERTPS_NDX(0, lane));
9370*80a68eefSBob Badour }
9371*80a68eefSBob Badour 
9372*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vld1q_lane_s64(__transfersize(1) int64_t const * ptr, int64x2_t vec, __constrange(0,1) int lane); // VLD1.64 {d0}, [r0]
9373*80a68eefSBob Badour #define vld1q_lane_s64(ptr, vec, lane) _MM_INSERT_EPI64(vec, *(ptr), lane)
9374*80a68eefSBob Badour 
9375*80a68eefSBob Badour _NEON2SSESTORAGE poly8x16_t vld1q_lane_p8(__transfersize(1) poly8_t const * ptr, poly8x16_t vec, __constrange(0,15) int lane); // VLD1.8 {d0[0]}, [r0]
9376*80a68eefSBob Badour #define vld1q_lane_p8(ptr, vec, lane) _MM_INSERT_EPI8(vec, *(ptr), lane)
9377*80a68eefSBob Badour 
9378*80a68eefSBob Badour _NEON2SSESTORAGE poly16x8_t vld1q_lane_p16(__transfersize(1) poly16_t const * ptr, poly16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
9379*80a68eefSBob Badour #define vld1q_lane_p16(ptr, vec, lane) _MM_INSERT_EPI16(vec, *(ptr), lane)
9380*80a68eefSBob Badour 
9381*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vld1_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x8_t vec, __constrange(0,7) int lane); // VLD1.8 {d0[0]}, [r0]
9382*80a68eefSBob Badour _NEON2SSE_INLINE uint8x8_t vld1_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x8_t vec, __constrange(0,7) int lane)
9383*80a68eefSBob Badour {
9384*80a68eefSBob Badour     uint8x8_t res;
9385*80a68eefSBob Badour     res = vec;
9386*80a68eefSBob Badour     res.m64_u8[lane] = *(ptr);
9387*80a68eefSBob Badour     return res;
9388*80a68eefSBob Badour }
9389*80a68eefSBob Badour 
9390*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vld1_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x4_t vec, __constrange(0,3) int lane); // VLD1.16 {d0[0]}, [r0]
9391*80a68eefSBob Badour _NEON2SSE_INLINE uint16x4_t vld1_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x4_t vec, __constrange(0,3) int lane)
9392*80a68eefSBob Badour {
9393*80a68eefSBob Badour     uint16x4_t res;
9394*80a68eefSBob Badour     res = vec;
9395*80a68eefSBob Badour     res.m64_u16[lane] = *(ptr);
9396*80a68eefSBob Badour     return res;
9397*80a68eefSBob Badour }
9398*80a68eefSBob Badour 
9399*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vld1_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x2_t vec, __constrange(0,1) int lane); // VLD1.32 {d0[0]}, [r0]
9400*80a68eefSBob Badour _NEON2SSE_INLINE uint32x2_t vld1_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x2_t vec, __constrange(0,1) int lane)
9401*80a68eefSBob Badour {
9402*80a68eefSBob Badour     uint32x2_t res;
9403*80a68eefSBob Badour     res = vec;
9404*80a68eefSBob Badour     res.m64_u32[lane] = *(ptr);
9405*80a68eefSBob Badour     return res;
9406*80a68eefSBob Badour }
9407*80a68eefSBob Badour 
9408*80a68eefSBob Badour _NEON2SSESTORAGE uint64x1_t vld1_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x1_t vec, __constrange(0,0) int lane); // VLD1.64 {d0}, [r0]
9409*80a68eefSBob Badour _NEON2SSE_INLINE uint64x1_t vld1_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x1_t vec, __constrange(0,0) int lane)
9410*80a68eefSBob Badour {
9411*80a68eefSBob Badour     uint64x1_t res;
9412*80a68eefSBob Badour     res.m64_u64[0] = *(ptr);
9413*80a68eefSBob Badour     return res;
9414*80a68eefSBob Badour }
9415*80a68eefSBob Badour 
9416*80a68eefSBob Badour 
9417*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vld1_lane_s8(__transfersize(1) int8_t const * ptr, int8x8_t vec, __constrange(0,7) int lane); // VLD1.8 {d0[0]}, [r0]
9418*80a68eefSBob Badour #define vld1_lane_s8(ptr, vec, lane) vld1_lane_u8((uint8_t*)ptr, vec, lane)
9419*80a68eefSBob Badour 
9420*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vld1_lane_s16(__transfersize(1) int16_t const * ptr, int16x4_t vec, __constrange(0,3) int lane); // VLD1.16 {d0[0]}, [r0]
9421*80a68eefSBob Badour #define vld1_lane_s16(ptr, vec, lane) vld1_lane_u16((uint16_t*)ptr, vec, lane)
9422*80a68eefSBob Badour 
9423*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vld1_lane_s32(__transfersize(1) int32_t const * ptr, int32x2_t vec, __constrange(0,1) int lane); // VLD1.32 {d0[0]}, [r0]
9424*80a68eefSBob Badour #define vld1_lane_s32(ptr, vec, lane) vld1_lane_u32((uint32_t*)ptr, vec, lane)
9425*80a68eefSBob Badour 
9426*80a68eefSBob Badour _NEON2SSESTORAGE float16x4_t vld1_lane_f16(__transfersize(1) __fp16 const * ptr, float16x4_t vec, __constrange(0,3) int lane); // VLD1.16 {d0[0]}, [r0]
9427*80a68eefSBob Badour //current IA SIMD doesn't support float16
9428*80a68eefSBob Badour 
9429*80a68eefSBob Badour _NEON2SSESTORAGE float32x2_t vld1_lane_f32(__transfersize(1) float32_t const * ptr, float32x2_t vec, __constrange(0,1) int lane); // VLD1.32 {d0[0]}, [r0]
9430*80a68eefSBob Badour _NEON2SSE_INLINE float32x2_t vld1_lane_f32(__transfersize(1) float32_t const * ptr, float32x2_t vec, __constrange(0,1) int lane)
9431*80a68eefSBob Badour {
9432*80a68eefSBob Badour     float32x2_t res;
9433*80a68eefSBob Badour     res = vec;
9434*80a68eefSBob Badour     res.m64_f32[lane] = *(ptr);
9435*80a68eefSBob Badour     return res;
9436*80a68eefSBob Badour }
9437*80a68eefSBob Badour 
9438*80a68eefSBob Badour _NEON2SSESTORAGE int64x1_t vld1_lane_s64(__transfersize(1) int64_t const * ptr, int64x1_t vec, __constrange(0,0) int lane); // VLD1.64 {d0}, [r0]
9439*80a68eefSBob Badour #define vld1_lane_s64(ptr, vec, lane) vld1_lane_u64((uint64_t*)ptr, vec, lane)
9440*80a68eefSBob Badour 
9441*80a68eefSBob Badour _NEON2SSESTORAGE poly8x8_t vld1_lane_p8(__transfersize(1) poly8_t const * ptr, poly8x8_t vec, __constrange(0,7) int lane); // VLD1.8 {d0[0]}, [r0]
9442*80a68eefSBob Badour #define vld1_lane_p8 vld1_lane_u8
9443*80a68eefSBob Badour 
9444*80a68eefSBob Badour _NEON2SSESTORAGE poly16x4_t vld1_lane_p16(__transfersize(1) poly16_t const * ptr, poly16x4_t vec, __constrange(0,3) int lane); // VLD1.16 {d0[0]}, [r0]
9445*80a68eefSBob Badour #define vld1_lane_p16 vld1_lane_s16
9446*80a68eefSBob Badour 
9447*80a68eefSBob Badour // ****************** Load single value (set all lanes of the vector to the same value from memory) **********************
9448*80a68eefSBob Badour // ******************************************************************************************************************
9449*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vld1q_dup_u8(__transfersize(1) uint8_t const * ptr); // VLD1.8 {d0[]}, [r0]
9450*80a68eefSBob Badour #define vld1q_dup_u8(ptr) _mm_set1_epi8(*(ptr))
9451*80a68eefSBob Badour 
9452*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vld1q_dup_u16(__transfersize(1) uint16_t const * ptr); // VLD1.16 {d0[]}, [r0]
9453*80a68eefSBob Badour #define vld1q_dup_u16(ptr) _mm_set1_epi16(*(ptr))
9454*80a68eefSBob Badour 
9455*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vld1q_dup_u32(__transfersize(1) uint32_t const * ptr); // VLD1.32 {d0[]}, [r0]
9456*80a68eefSBob Badour #define vld1q_dup_u32(ptr) _mm_set1_epi32(*(ptr))
9457*80a68eefSBob Badour 
9458*80a68eefSBob Badour _NEON2SSESTORAGE uint64x2_t vld1q_dup_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
9459*80a68eefSBob Badour _NEON2SSE_INLINE uint64x2_t   vld1q_dup_u64(__transfersize(1) uint64_t const * ptr)
9460*80a68eefSBob Badour {
9461*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 uint64_t val[2] = {*(ptr), *(ptr)};
9462*80a68eefSBob Badour     return LOAD_SI128(val);
9463*80a68eefSBob Badour }
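
//Usage sketch (illustrative, helper name hypothetical): broadcasting one 64-bit value to both lanes.
_NEON2SSE_INLINE uint64x2_t _neon2sse_example_dup_u64(uint64_t const * p)
{
    return vld1q_dup_u64(p); //both lanes of the result contain *p
}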
9464*80a68eefSBob Badour 
9465*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t vld1q_dup_s8(__transfersize(1) int8_t const * ptr); // VLD1.8 {d0[]}, [r0]
9466*80a68eefSBob Badour #define vld1q_dup_s8(ptr) _mm_set1_epi8(*(ptr))
9467*80a68eefSBob Badour 
9468*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vld1q_dup_s16(__transfersize(1) int16_t const * ptr); // VLD1.16 {d0[]}, [r0]
9469*80a68eefSBob Badour #define vld1q_dup_s16(ptr) _mm_set1_epi16 (*(ptr))
9470*80a68eefSBob Badour 
9471*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vld1q_dup_s32(__transfersize(1) int32_t const * ptr); // VLD1.32 {d0[]}, [r0]
9472*80a68eefSBob Badour #define vld1q_dup_s32(ptr) _mm_set1_epi32 (*(ptr))
9473*80a68eefSBob Badour 
9474*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vld1q_dup_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
9475*80a68eefSBob Badour #define vld1q_dup_s64(ptr) vld1q_dup_u64((uint64_t*)ptr)
9476*80a68eefSBob Badour 
9477*80a68eefSBob Badour _NEON2SSESTORAGE float16x8_t vld1q_dup_f16(__transfersize(1) __fp16 const * ptr); // VLD1.16 {d0[]}, [r0]
9478*80a68eefSBob Badour //current IA SIMD doesn't support float16, need to go to 32 bits
9479*80a68eefSBob Badour 
9480*80a68eefSBob Badour _NEON2SSESTORAGE float32x4_t vld1q_dup_f32(__transfersize(1) float32_t const * ptr); // VLD1.32 {d0[]}, [r0]
9481*80a68eefSBob Badour #define vld1q_dup_f32(ptr) _mm_set1_ps (*(ptr))
9482*80a68eefSBob Badour 
9483*80a68eefSBob Badour _NEON2SSESTORAGE poly8x16_t vld1q_dup_p8(__transfersize(1) poly8_t const * ptr); // VLD1.8 {d0[]}, [r0]
9484*80a68eefSBob Badour #define vld1q_dup_p8(ptr) _mm_set1_epi8(*(ptr))
9485*80a68eefSBob Badour 
9486*80a68eefSBob Badour _NEON2SSESTORAGE poly16x8_t vld1q_dup_p16(__transfersize(1) poly16_t const * ptr); // VLD1.16 {d0[]}, [r0]
9487*80a68eefSBob Badour #define vld1q_dup_p16(ptr) _mm_set1_epi16 (*(ptr))
9488*80a68eefSBob Badour 
9489*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vld1_dup_u8(__transfersize(1) uint8_t const * ptr); // VLD1.8 {d0[]}, [r0]
9490*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vld1_dup_u8(__transfersize(1) uint8_t const * ptr), _NEON2SSE_REASON_SLOW_SERIAL)
9491*80a68eefSBob Badour {
9492*80a68eefSBob Badour     uint8x8_t res;
9493*80a68eefSBob Badour     int i;
9494*80a68eefSBob Badour     for(i = 0; i<8; i++) {
9495*80a68eefSBob Badour         res.m64_u8[i] =  *(ptr);
9496*80a68eefSBob Badour     }
9497*80a68eefSBob Badour     return res;
9498*80a68eefSBob Badour }
9499*80a68eefSBob Badour 
9500*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vld1_dup_u16(__transfersize(1) uint16_t const * ptr); // VLD1.16 {d0[]}, [r0]
9501*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vld1_dup_u16(__transfersize(1) uint16_t const * ptr), _NEON2SSE_REASON_SLOW_SERIAL)
9502*80a68eefSBob Badour {
9503*80a68eefSBob Badour     uint16x4_t res;
9504*80a68eefSBob Badour     int i;
9505*80a68eefSBob Badour     for(i = 0; i<4; i++) {
9506*80a68eefSBob Badour         res.m64_u16[i] =  *(ptr);
9507*80a68eefSBob Badour     }
9508*80a68eefSBob Badour     return res;
9509*80a68eefSBob Badour }
9510*80a68eefSBob Badour 
9511*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vld1_dup_u32(__transfersize(1) uint32_t const * ptr); // VLD1.32 {d0[]}, [r0]
9512*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vld1_dup_u32(__transfersize(1) uint32_t const * ptr), _NEON2SSE_REASON_SLOW_SERIAL)
9513*80a68eefSBob Badour {
9514*80a68eefSBob Badour     uint32x2_t res;
9515*80a68eefSBob Badour     res.m64_u32[0] = *(ptr);
9516*80a68eefSBob Badour     res.m64_u32[1] = *(ptr);
9517*80a68eefSBob Badour     return res;
9518*80a68eefSBob Badour }
9519*80a68eefSBob Badour 
9520*80a68eefSBob Badour _NEON2SSESTORAGE uint64x1_t vld1_dup_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
9521*80a68eefSBob Badour _NEON2SSE_INLINE uint64x1_t vld1_dup_u64(__transfersize(1) uint64_t const * ptr)
9522*80a68eefSBob Badour {
9523*80a68eefSBob Badour     uint64x1_t res;
9524*80a68eefSBob Badour     res.m64_u64[0] = *(ptr);
9525*80a68eefSBob Badour     return res;
9526*80a68eefSBob Badour }
9527*80a68eefSBob Badour 
9528*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vld1_dup_s8(__transfersize(1) int8_t const * ptr); // VLD1.8 {d0[]}, [r0]
9529*80a68eefSBob Badour #define vld1_dup_s8(ptr) vld1_dup_u8((uint8_t*)ptr)
9530*80a68eefSBob Badour 
9531*80a68eefSBob Badour 
9532*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vld1_dup_s16(__transfersize(1) int16_t const * ptr); // VLD1.16 {d0[]}, [r0]
9533*80a68eefSBob Badour #define vld1_dup_s16(ptr) vld1_dup_u16((uint16_t*)ptr)
9534*80a68eefSBob Badour 
9535*80a68eefSBob Badour 
9536*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vld1_dup_s32(__transfersize(1) int32_t const * ptr); // VLD1.32 {d0[]}, [r0]
9537*80a68eefSBob Badour #define vld1_dup_s32(ptr) vld1_dup_u32((uint32_t*)ptr)
9538*80a68eefSBob Badour 
9539*80a68eefSBob Badour 
9540*80a68eefSBob Badour _NEON2SSESTORAGE int64x1_t vld1_dup_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
9541*80a68eefSBob Badour #define vld1_dup_s64(ptr) vld1_dup_u64((uint64_t*)ptr)
9542*80a68eefSBob Badour 
9543*80a68eefSBob Badour _NEON2SSESTORAGE float16x4_t vld1_dup_f16(__transfersize(1) __fp16 const * ptr); // VLD1.16 {d0[]}, [r0]
9544*80a68eefSBob Badour //current IA SIMD doesn't support float16
9545*80a68eefSBob Badour 
9546*80a68eefSBob Badour _NEON2SSESTORAGE float32x2_t vld1_dup_f32(__transfersize(1) float32_t const * ptr); // VLD1.32 {d0[]}, [r0]
9547*80a68eefSBob Badour _NEON2SSE_INLINE float32x2_t vld1_dup_f32(__transfersize(1) float32_t const * ptr)
9548*80a68eefSBob Badour {
9549*80a68eefSBob Badour     float32x2_t res;
9550*80a68eefSBob Badour     res.m64_f32[0] = *(ptr);
9551*80a68eefSBob Badour     res.m64_f32[1] = res.m64_f32[0];
9552*80a68eefSBob Badour     return res; // use last 64bits only
9553*80a68eefSBob Badour }
9554*80a68eefSBob Badour 
9555*80a68eefSBob Badour _NEON2SSESTORAGE poly8x8_t vld1_dup_p8(__transfersize(1) poly8_t const * ptr); // VLD1.8 {d0[]}, [r0]
9556*80a68eefSBob Badour #define vld1_dup_p8 vld1_dup_u8
9557*80a68eefSBob Badour 
9558*80a68eefSBob Badour 
9559*80a68eefSBob Badour _NEON2SSESTORAGE poly16x4_t vld1_dup_p16(__transfersize(1) poly16_t const * ptr); // VLD1.16 {d0[]}, [r0]
9560*80a68eefSBob Badour #define vld1_dup_p16 vld1_dup_u16
9561*80a68eefSBob Badour 
9562*80a68eefSBob Badour 
9563*80a68eefSBob Badour //*************************************************************************************
9564*80a68eefSBob Badour //********************************* Store **********************************************
9565*80a68eefSBob Badour //*************************************************************************************
9566*80a68eefSBob Badour // If ptr is 16-byte aligned and you need to store data without cache pollution then use _mm_stream_si128 ((__m128i*)ptr, val);
9567*80a68eefSBob Badour //here we assume that a NOT 16-byte aligned ptr is possible. If it is aligned we can use _mm_store_si128, as shown in the following macro
9568*80a68eefSBob Badour #define STORE_SI128(ptr, val) \
9569*80a68eefSBob Badour         (((uintptr_t)(ptr) & 15) == 0 ) ? _mm_store_si128 ((__m128i*)(ptr), val) : _mm_storeu_si128 ((__m128i*)(ptr), val);
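
//Usage sketch (illustrative, helper name hypothetical): the vst1q_* definitions below all map to this macro,
//so the store dispatches on the pointer alignment at run time.
_NEON2SSE_INLINE void _neon2sse_example_store_si128(uint32_t * dst, __m128i v)
{
    STORE_SI128(dst, v); //_mm_store_si128 if dst is 16-byte aligned, _mm_storeu_si128 otherwise
}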
9570*80a68eefSBob Badour 
9571*80a68eefSBob Badour _NEON2SSESTORAGE void vst1q_u8(__transfersize(16) uint8_t * ptr, uint8x16_t val); // VST1.8 {d0, d1}, [r0]
9572*80a68eefSBob Badour #define vst1q_u8 STORE_SI128
9573*80a68eefSBob Badour 
9574*80a68eefSBob Badour _NEON2SSESTORAGE void vst1q_u16(__transfersize(8) uint16_t * ptr, uint16x8_t val); // VST1.16 {d0, d1}, [r0]
9575*80a68eefSBob Badour #define vst1q_u16 STORE_SI128
9576*80a68eefSBob Badour 
9577*80a68eefSBob Badour _NEON2SSESTORAGE void vst1q_u32(__transfersize(4) uint32_t * ptr, uint32x4_t val); // VST1.32 {d0, d1}, [r0]
9578*80a68eefSBob Badour #define vst1q_u32 STORE_SI128
9579*80a68eefSBob Badour 
9580*80a68eefSBob Badour _NEON2SSESTORAGE void vst1q_u64(__transfersize(2) uint64_t * ptr, uint64x2_t val); // VST1.64 {d0, d1}, [r0]
9581*80a68eefSBob Badour #define vst1q_u64 STORE_SI128
9582*80a68eefSBob Badour 
9583*80a68eefSBob Badour _NEON2SSESTORAGE void vst1q_s8(__transfersize(16) int8_t * ptr, int8x16_t val); // VST1.8 {d0, d1}, [r0]
9584*80a68eefSBob Badour #define vst1q_s8 STORE_SI128
9585*80a68eefSBob Badour 
9586*80a68eefSBob Badour _NEON2SSESTORAGE void vst1q_s16(__transfersize(8) int16_t * ptr, int16x8_t val); // VST1.16 {d0, d1}, [r0]
9587*80a68eefSBob Badour #define vst1q_s16 STORE_SI128
9588*80a68eefSBob Badour 
9589*80a68eefSBob Badour _NEON2SSESTORAGE void vst1q_s32(__transfersize(4) int32_t * ptr, int32x4_t val); // VST1.32 {d0, d1}, [r0]
9590*80a68eefSBob Badour #define vst1q_s32 STORE_SI128
9591*80a68eefSBob Badour 
9592*80a68eefSBob Badour _NEON2SSESTORAGE void vst1q_s64(__transfersize(2) int64_t * ptr, int64x2_t val); // VST1.64 {d0, d1}, [r0]
9593*80a68eefSBob Badour #define vst1q_s64 STORE_SI128
9594*80a68eefSBob Badour 
9595*80a68eefSBob Badour _NEON2SSESTORAGE void vst1q_f16(__transfersize(8) __fp16 * ptr, float16x8_t val); // VST1.16 {d0, d1}, [r0]
9596*80a68eefSBob Badour // IA32 SIMD doesn't work with 16bit floats currently
9597*80a68eefSBob Badour 
9598*80a68eefSBob Badour _NEON2SSESTORAGE void vst1q_f32(__transfersize(4) float32_t * ptr, float32x4_t val); // VST1.32 {d0, d1}, [r0]
9599*80a68eefSBob Badour _NEON2SSE_INLINE void vst1q_f32(__transfersize(4) float32_t * ptr, float32x4_t val)
9600*80a68eefSBob Badour {
9601*80a68eefSBob Badour     if( ((uintptr_t)(ptr) & 15)  == 0 ) //16 bytes aligned
9602*80a68eefSBob Badour         _mm_store_ps (ptr, val);
9603*80a68eefSBob Badour     else
9604*80a68eefSBob Badour         _mm_storeu_ps (ptr, val);
9605*80a68eefSBob Badour }
9606*80a68eefSBob Badour 
9607*80a68eefSBob Badour _NEON2SSESTORAGE void vst1q_p8(__transfersize(16) poly8_t * ptr, poly8x16_t val); // VST1.8 {d0, d1}, [r0]
9608*80a68eefSBob Badour #define vst1q_p8  vst1q_u8
9609*80a68eefSBob Badour 
9610*80a68eefSBob Badour _NEON2SSESTORAGE void vst1q_p16(__transfersize(8) poly16_t * ptr, poly16x8_t val); // VST1.16 {d0, d1}, [r0]
9611*80a68eefSBob Badour #define vst1q_p16 vst1q_u16
9612*80a68eefSBob Badour 
9613*80a68eefSBob Badour _NEON2SSESTORAGE void vst1_u8(__transfersize(8) uint8_t * ptr, uint8x8_t val); // VST1.8 {d0}, [r0]
9614*80a68eefSBob Badour _NEON2SSE_INLINE void vst1_u8(__transfersize(8) uint8_t * ptr, uint8x8_t val)
9615*80a68eefSBob Badour {
9616*80a68eefSBob Badour     int i;
9617*80a68eefSBob Badour     for (i = 0; i<8; i++) {
9618*80a68eefSBob Badour         *(ptr + i) = ((uint8_t*)&val)[i];
9619*80a68eefSBob Badour     }
9620*80a68eefSBob Badour     //_mm_storel_epi64((__m128i*)ptr, val);
9621*80a68eefSBob Badour     return;
9622*80a68eefSBob Badour }
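
//Usage sketch (illustrative, helper name hypothetical): the serial loop above keeps the 64-bit store safe
//for any alignment; a caller simply writes all eight bytes at once.
_NEON2SSE_INLINE void _neon2sse_example_store8_u8(uint8_t * dst, uint8x8_t v)
{
    vst1_u8(dst, v); //writes v.m64_u8[0..7] to dst[0..7]
}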
9623*80a68eefSBob Badour 
9624*80a68eefSBob Badour _NEON2SSESTORAGE void vst1_u16(__transfersize(4) uint16_t * ptr, uint16x4_t val); // VST1.16 {d0}, [r0]
9625*80a68eefSBob Badour _NEON2SSE_INLINE void vst1_u16(__transfersize(4) uint16_t * ptr, uint16x4_t val)
9626*80a68eefSBob Badour {
9627*80a68eefSBob Badour     int i;
9628*80a68eefSBob Badour     for (i = 0; i<4; i++) {
9629*80a68eefSBob Badour         *(ptr + i) = ((uint16_t*)&val)[i];
9630*80a68eefSBob Badour     }
9631*80a68eefSBob Badour     //_mm_storel_epi64((__m128i*)ptr, val);
9632*80a68eefSBob Badour     return;
9633*80a68eefSBob Badour }
9634*80a68eefSBob Badour 
9635*80a68eefSBob Badour _NEON2SSESTORAGE void vst1_u32(__transfersize(2) uint32_t * ptr, uint32x2_t val); // VST1.32 {d0}, [r0]
9636*80a68eefSBob Badour _NEON2SSE_INLINE void vst1_u32(__transfersize(2) uint32_t * ptr, uint32x2_t val)
9637*80a68eefSBob Badour {
9638*80a68eefSBob Badour     int i;
9639*80a68eefSBob Badour     for (i = 0; i<2; i++) {
9640*80a68eefSBob Badour         *(ptr + i) = ((uint32_t*)&val)[i];
9641*80a68eefSBob Badour     }
9642*80a68eefSBob Badour     //_mm_storel_epi64((__m128i*)ptr, val);
9643*80a68eefSBob Badour     return;
9644*80a68eefSBob Badour }
9645*80a68eefSBob Badour 
9646*80a68eefSBob Badour _NEON2SSESTORAGE void vst1_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val); // VST1.64 {d0}, [r0]
9647*80a68eefSBob Badour _NEON2SSE_INLINE void vst1_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val)
9648*80a68eefSBob Badour {
9649*80a68eefSBob Badour     *(ptr) = *((uint64_t*)&val);
9650*80a68eefSBob Badour     //_mm_storel_epi64((__m128i*)ptr, val);
9651*80a68eefSBob Badour     return;
9652*80a68eefSBob Badour }
9653*80a68eefSBob Badour 
9654*80a68eefSBob Badour _NEON2SSESTORAGE void vst1_s8(__transfersize(8) int8_t * ptr, int8x8_t val); // VST1.8 {d0}, [r0]
9655*80a68eefSBob Badour #define vst1_s8(ptr,val) vst1_u8((uint8_t*)ptr,val)
9656*80a68eefSBob Badour 
9657*80a68eefSBob Badour _NEON2SSESTORAGE void vst1_s16(__transfersize(4) int16_t * ptr, int16x4_t val); // VST1.16 {d0}, [r0]
9658*80a68eefSBob Badour #define vst1_s16(ptr,val) vst1_u16((uint16_t*)ptr,val)
9659*80a68eefSBob Badour 
9660*80a68eefSBob Badour _NEON2SSESTORAGE void vst1_s32(__transfersize(2) int32_t * ptr, int32x2_t val); // VST1.32 {d0}, [r0]
9661*80a68eefSBob Badour #define vst1_s32(ptr,val) vst1_u32((uint32_t*)ptr,val)
9662*80a68eefSBob Badour 
9663*80a68eefSBob Badour _NEON2SSESTORAGE void vst1_s64(__transfersize(1) int64_t * ptr, int64x1_t val); // VST1.64 {d0}, [r0]
9664*80a68eefSBob Badour #define vst1_s64(ptr,val) vst1_u64((uint64_t*)ptr,val)
9665*80a68eefSBob Badour 
9666*80a68eefSBob Badour _NEON2SSESTORAGE void vst1_f16(__transfersize(4) __fp16 * ptr, float16x4_t val); // VST1.16 {d0}, [r0]
9667*80a68eefSBob Badour //current IA SIMD doesn't support float16
9668*80a68eefSBob Badour 
9669*80a68eefSBob Badour _NEON2SSESTORAGE void vst1_f32(__transfersize(2) float32_t * ptr, float32x2_t val); // VST1.32 {d0}, [r0]
9670*80a68eefSBob Badour _NEON2SSE_INLINE void vst1_f32(__transfersize(2) float32_t * ptr, float32x2_t val)
9671*80a68eefSBob Badour {
9672*80a68eefSBob Badour     *(ptr) =   val.m64_f32[0];
9673*80a68eefSBob Badour     *(ptr + 1) = val.m64_f32[1];
9674*80a68eefSBob Badour     return;
9675*80a68eefSBob Badour }
9676*80a68eefSBob Badour 
9677*80a68eefSBob Badour _NEON2SSESTORAGE void vst1_p8(__transfersize(8) poly8_t * ptr, poly8x8_t val); // VST1.8 {d0}, [r0]
9678*80a68eefSBob Badour #define vst1_p8 vst1_u8
9679*80a68eefSBob Badour 
9680*80a68eefSBob Badour _NEON2SSESTORAGE void vst1_p16(__transfersize(4) poly16_t * ptr, poly16x4_t val); // VST1.16 {d0}, [r0]
9681*80a68eefSBob Badour #define vst1_p16 vst1_u16
9682*80a68eefSBob Badour 
9683*80a68eefSBob Badour //***********Store a lane of a vector into memory (extract given lane) *********************
9684*80a68eefSBob Badour //******************************************************************************************
9685*80a68eefSBob Badour _NEON2SSESTORAGE void vst1q_lane_u8(__transfersize(1) uint8_t * ptr, uint8x16_t val, __constrange(0,15) int lane); // VST1.8 {d0[0]}, [r0]
9686*80a68eefSBob Badour #define vst1q_lane_u8(ptr, val, lane) *(ptr) = (uint8_t) _MM_EXTRACT_EPI8 (val, lane)
9687*80a68eefSBob Badour 
9688*80a68eefSBob Badour _NEON2SSESTORAGE void vst1q_lane_u16(__transfersize(1) uint16_t * ptr, uint16x8_t val, __constrange(0,7) int lane); // VST1.16 {d0[0]}, [r0]
9689*80a68eefSBob Badour #define vst1q_lane_u16(ptr, val, lane) *(ptr) = (uint16_t) _MM_EXTRACT_EPI16 (val, lane)
9690*80a68eefSBob Badour 
9691*80a68eefSBob Badour _NEON2SSESTORAGE void vst1q_lane_u32(__transfersize(1) uint32_t * ptr, uint32x4_t val, __constrange(0,3) int lane); // VST1.32 {d0[0]}, [r0]
9692*80a68eefSBob Badour #define vst1q_lane_u32(ptr, val, lane) *(ptr) = (uint32_t) _MM_EXTRACT_EPI32 (val, lane)
9693*80a68eefSBob Badour 
9694*80a68eefSBob Badour _NEON2SSESTORAGE void vst1q_lane_u64(__transfersize(1) uint64_t * ptr, uint64x2_t val, __constrange(0,1) int lane); // VST1.64 {d0}, [r0]
9695*80a68eefSBob Badour #define vst1q_lane_u64(ptr, val, lane) *(ptr) = (uint64_t) _MM_EXTRACT_EPI64 (val, lane)
9696*80a68eefSBob Badour 
9697*80a68eefSBob Badour _NEON2SSESTORAGE void vst1q_lane_s8(__transfersize(1) int8_t * ptr, int8x16_t val, __constrange(0,15) int lane); // VST1.8 {d0[0]}, [r0]
9698*80a68eefSBob Badour #define vst1q_lane_s8(ptr, val, lane) *(ptr) = (int8_t) _MM_EXTRACT_EPI8 (val, lane)
9699*80a68eefSBob Badour 
9700*80a68eefSBob Badour _NEON2SSESTORAGE void vst1q_lane_s16(__transfersize(1) int16_t * ptr, int16x8_t val, __constrange(0,7) int lane); // VST1.16 {d0[0]}, [r0]
9701*80a68eefSBob Badour #define vst1q_lane_s16(ptr, val, lane) *(ptr) = (int16_t) _MM_EXTRACT_EPI16 (val, lane)
9702*80a68eefSBob Badour 
9703*80a68eefSBob Badour _NEON2SSESTORAGE void vst1q_lane_s32(__transfersize(1) int32_t * ptr, int32x4_t val, __constrange(0,3) int lane); // VST1.32 {d0[0]}, [r0]
9704*80a68eefSBob Badour #define vst1q_lane_s32(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI32 (val, lane)
9705*80a68eefSBob Badour 
9706*80a68eefSBob Badour _NEON2SSESTORAGE void vst1q_lane_s64(__transfersize(1) int64_t * ptr, int64x2_t val, __constrange(0,1) int lane); // VST1.64 {d0}, [r0]
9707*80a68eefSBob Badour #define vst1q_lane_s64(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI64 (val, lane)
9708*80a68eefSBob Badour 
9709*80a68eefSBob Badour _NEON2SSESTORAGE void vst1q_lane_f16(__transfersize(1) __fp16 * ptr, float16x8_t val, __constrange(0,7) int lane); // VST1.16 {d0[0]}, [r0]
9710*80a68eefSBob Badour //current IA SIMD doesn't support float16
9711*80a68eefSBob Badour 
9712*80a68eefSBob Badour _NEON2SSESTORAGE void vst1q_lane_f32(__transfersize(1) float32_t * ptr, float32x4_t val, __constrange(0,3) int lane); // VST1.32 {d0[0]}, [r0]
9713*80a68eefSBob Badour _NEON2SSE_INLINE void vst1q_lane_f32(__transfersize(1) float32_t * ptr, float32x4_t val, __constrange(0,3) int lane)
9714*80a68eefSBob Badour {
9715*80a68eefSBob Badour     int32_t ilane;
9716*80a68eefSBob Badour     ilane = _MM_EXTRACT_PS(val,lane);
9717*80a68eefSBob Badour     *(ptr) =  *((float*)&ilane);
9718*80a68eefSBob Badour }
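
//Usage sketch (illustrative, helper name hypothetical): writing a single lane of a vector to memory.
_NEON2SSE_INLINE void _neon2sse_example_store_lane3_f32(float32_t * dst, float32x4_t v)
{
    vst1q_lane_f32(dst, v, 3); //stores only lane 3 of v to *dst
}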
9719*80a68eefSBob Badour 
9720*80a68eefSBob Badour _NEON2SSESTORAGE void vst1q_lane_p8(__transfersize(1) poly8_t * ptr, poly8x16_t val, __constrange(0,15) int lane); // VST1.8 {d0[0]}, [r0]
9721*80a68eefSBob Badour #define vst1q_lane_p8   vst1q_lane_u8
9722*80a68eefSBob Badour 
9723*80a68eefSBob Badour _NEON2SSESTORAGE void vst1q_lane_p16(__transfersize(1) poly16_t * ptr, poly16x8_t val, __constrange(0,7) int lane); // VST1.16 {d0[0]}, [r0]
9724*80a68eefSBob Badour #define vst1q_lane_p16   vst1q_lane_s16
9725*80a68eefSBob Badour 
9726*80a68eefSBob Badour _NEON2SSESTORAGE void vst1_lane_u8(__transfersize(1) uint8_t * ptr, uint8x8_t val, __constrange(0,7) int lane); // VST1.8 {d0[0]}, [r0]
9727*80a68eefSBob Badour _NEON2SSE_INLINE void vst1_lane_u8(__transfersize(1) uint8_t * ptr, uint8x8_t val, __constrange(0,7) int lane)
9728*80a68eefSBob Badour {
9729*80a68eefSBob Badour     *(ptr) = val.m64_u8[lane];
9730*80a68eefSBob Badour }
9731*80a68eefSBob Badour 
9732*80a68eefSBob Badour _NEON2SSESTORAGE void vst1_lane_u16(__transfersize(1) uint16_t * ptr, uint16x4_t val, __constrange(0,3) int lane); // VST1.16 {d0[0]}, [r0]
9733*80a68eefSBob Badour _NEON2SSE_INLINE void vst1_lane_u16(__transfersize(1) uint16_t * ptr, uint16x4_t val, __constrange(0,3) int lane)
9734*80a68eefSBob Badour {
9735*80a68eefSBob Badour     *(ptr) = val.m64_u16[lane];
9736*80a68eefSBob Badour }
9737*80a68eefSBob Badour 
9738*80a68eefSBob Badour _NEON2SSESTORAGE void vst1_lane_u32(__transfersize(1) uint32_t * ptr, uint32x2_t val, __constrange(0,1) int lane); // VST1.32 {d0[0]}, [r0]
9739*80a68eefSBob Badour _NEON2SSE_INLINE void vst1_lane_u32(__transfersize(1) uint32_t * ptr, uint32x2_t val, __constrange(0,1) int lane)
9740*80a68eefSBob Badour {
9741*80a68eefSBob Badour     *(ptr) = val.m64_u32[lane];
9742*80a68eefSBob Badour }
9743*80a68eefSBob Badour 
9744*80a68eefSBob Badour _NEON2SSESTORAGE void vst1_lane_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val, __constrange(0,0) int lane); // VST1.64 {d0}, [r0]
9745*80a68eefSBob Badour _NEON2SSE_INLINE void vst1_lane_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val, __constrange(0,0) int lane)
9746*80a68eefSBob Badour {
9747*80a68eefSBob Badour     *(ptr) = val.m64_u64[0];
9748*80a68eefSBob Badour }
9749*80a68eefSBob Badour 
9750*80a68eefSBob Badour _NEON2SSESTORAGE void vst1_lane_s8(__transfersize(1) int8_t * ptr, int8x8_t val, __constrange(0,7) int lane); // VST1.8 {d0[0]}, [r0]
9751*80a68eefSBob Badour #define  vst1_lane_s8(ptr, val, lane) vst1_lane_u8((uint8_t*)ptr, val, lane)
9752*80a68eefSBob Badour 
9753*80a68eefSBob Badour _NEON2SSESTORAGE void vst1_lane_s16(__transfersize(1) int16_t * ptr, int16x4_t val, __constrange(0,3) int lane); // VST1.16 {d0[0]}, [r0]
9754*80a68eefSBob Badour #define vst1_lane_s16(ptr, val, lane) vst1_lane_u16((uint16_t*)ptr, val, lane)
9755*80a68eefSBob Badour 
9756*80a68eefSBob Badour _NEON2SSESTORAGE void vst1_lane_s32(__transfersize(1) int32_t * ptr, int32x2_t val, __constrange(0,1) int lane); // VST1.32 {d0[0]}, [r0]
9757*80a68eefSBob Badour #define vst1_lane_s32(ptr, val, lane)  vst1_lane_u32((uint32_t*)ptr, val, lane)
9758*80a68eefSBob Badour 
9759*80a68eefSBob Badour 
9760*80a68eefSBob Badour _NEON2SSESTORAGE void vst1_lane_s64(__transfersize(1) int64_t * ptr, int64x1_t val, __constrange(0,0) int lane); // VST1.64 {d0}, [r0]
9761*80a68eefSBob Badour #define vst1_lane_s64(ptr, val, lane) vst1_lane_u64((uint64_t*)ptr, val, lane)
9762*80a68eefSBob Badour 
9763*80a68eefSBob Badour 
9764*80a68eefSBob Badour _NEON2SSESTORAGE void vst1_lane_f16(__transfersize(1) __fp16 * ptr, float16x4_t val, __constrange(0,3) int lane); // VST1.16 {d0[0]}, [r0]
9765*80a68eefSBob Badour //current IA SIMD doesn't support float16
9766*80a68eefSBob Badour 
9767*80a68eefSBob Badour _NEON2SSESTORAGE void vst1_lane_f32(__transfersize(1) float32_t * ptr, float32x2_t val, __constrange(0,1) int lane); // VST1.32 {d0[0]}, [r0]
9768*80a68eefSBob Badour _NEON2SSE_INLINE void vst1_lane_f32(__transfersize(1) float32_t * ptr, float32x2_t val, __constrange(0,1) int lane)
9769*80a68eefSBob Badour {
9770*80a68eefSBob Badour     *(ptr) = val.m64_f32[lane];
9771*80a68eefSBob Badour }
9772*80a68eefSBob Badour 
9773*80a68eefSBob Badour _NEON2SSESTORAGE void vst1_lane_p8(__transfersize(1) poly8_t * ptr, poly8x8_t val, __constrange(0,7) int lane); // VST1.8 {d0[0]}, [r0]
9774*80a68eefSBob Badour #define vst1_lane_p8 vst1_lane_u8
9775*80a68eefSBob Badour 
9776*80a68eefSBob Badour _NEON2SSESTORAGE void vst1_lane_p16(__transfersize(1) poly16_t * ptr, poly16x4_t val, __constrange(0,3) int lane); // VST1.16 {d0[0]}, [r0]
9777*80a68eefSBob Badour #define vst1_lane_p16 vst1_lane_s16
9778*80a68eefSBob Badour 
9779*80a68eefSBob Badour //***********************************************************************************************
9780*80a68eefSBob Badour //**************** Loads and stores of an N-element structure **********************************
9781*80a68eefSBob Badour //***********************************************************************************************
9782*80a68eefSBob Badour //These intrinsics load or store an n-element structure. The array structures are defined at the beginning of this file.
9783*80a68eefSBob Badour //We assume ptr is NOT aligned in the general case; for more details see the "Loads and stores of a single vector" functions above
9784*80a68eefSBob Badour //****************** 2 elements load  *********************************************
9785*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16x2_t vld2q_u8(__transfersize(32) uint8_t const * ptr); // VLD2.8 {d0, d2}, [r0]
9786*80a68eefSBob Badour _NEON2SSE_INLINE uint8x16x2_t vld2q_u8(__transfersize(32) uint8_t const * ptr) // VLD2.8 {d0, d2}, [r0]
9787*80a68eefSBob Badour {
9788*80a68eefSBob Badour     uint8x16x2_t v;
9789*80a68eefSBob Badour     v.val[0] = vld1q_u8(ptr);
9790*80a68eefSBob Badour     v.val[1] = vld1q_u8((ptr + 16));
9791*80a68eefSBob Badour     v = vuzpq_s8(v.val[0], v.val[1]);
9792*80a68eefSBob Badour     return v;
9793*80a68eefSBob Badour }
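
//Usage sketch (illustrative, helper name hypothetical): de-interleaving 32 bytes of interleaved pairs
//(e.g. real/imaginary samples) into two separate vectors.
_NEON2SSE_INLINE uint8x16x2_t _neon2sse_example_deinterleave_pairs_u8(uint8_t const * src)
{
    return vld2q_u8(src); //val[0] holds the even-indexed bytes, val[1] the odd-indexed ones
}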
9794*80a68eefSBob Badour 
9795*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8x2_t vld2q_u16(__transfersize(16) uint16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
9796*80a68eefSBob Badour _NEON2SSE_INLINE uint16x8x2_t vld2q_u16(__transfersize(16) uint16_t const * ptr) // VLD2.16 {d0, d2}, [r0]
9797*80a68eefSBob Badour {
9798*80a68eefSBob Badour     uint16x8x2_t v;
9799*80a68eefSBob Badour     v.val[0] = vld1q_u16( ptr);
9800*80a68eefSBob Badour     v.val[1] = vld1q_u16( (ptr + 8));
9801*80a68eefSBob Badour     v = vuzpq_s16(v.val[0], v.val[1]);
9802*80a68eefSBob Badour     return v;
9803*80a68eefSBob Badour }
9804*80a68eefSBob Badour 
9805*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4x2_t vld2q_u32(__transfersize(8) uint32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
9806*80a68eefSBob Badour _NEON2SSE_INLINE uint32x4x2_t vld2q_u32(__transfersize(8) uint32_t const * ptr) // VLD2.32 {d0, d2}, [r0]
9807*80a68eefSBob Badour {
9808*80a68eefSBob Badour     uint32x4x2_t v;
9809*80a68eefSBob Badour     v.val[0] = vld1q_u32 ( ptr);
9810*80a68eefSBob Badour     v.val[1] = vld1q_u32 ( (ptr + 4));
9811*80a68eefSBob Badour     v = vuzpq_s32(v.val[0], v.val[1]);
9812*80a68eefSBob Badour     return v;
9813*80a68eefSBob Badour }
9814*80a68eefSBob Badour 
9815*80a68eefSBob Badour _NEON2SSESTORAGE int8x16x2_t vld2q_s8(__transfersize(32) int8_t const * ptr);
9816*80a68eefSBob Badour #define  vld2q_s8(ptr) vld2q_u8((uint8_t*) ptr)
9817*80a68eefSBob Badour 
9818*80a68eefSBob Badour _NEON2SSESTORAGE int16x8x2_t vld2q_s16(__transfersize(16) int16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
9819*80a68eefSBob Badour #define vld2q_s16(ptr) vld2q_u16((uint16_t*) ptr)
9820*80a68eefSBob Badour 
9821*80a68eefSBob Badour _NEON2SSESTORAGE int32x4x2_t vld2q_s32(__transfersize(8) int32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
9822*80a68eefSBob Badour #define vld2q_s32(ptr) vld2q_u32((uint32_t*) ptr)
9823*80a68eefSBob Badour 
9824*80a68eefSBob Badour 
9825*80a68eefSBob Badour _NEON2SSESTORAGE float16x8x2_t vld2q_f16(__transfersize(16) __fp16 const * ptr); // VLD2.16 {d0, d2}, [r0]
9826*80a68eefSBob Badour // IA32 SIMD doesn't work with 16-bit floats currently, so the data needs to be widened to 32 bits and then handled in two 128-bit registers. See vld1q_f16 for an example
9827*80a68eefSBob Badour 
9828*80a68eefSBob Badour _NEON2SSESTORAGE float32x4x2_t vld2q_f32(__transfersize(8) float32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
9829*80a68eefSBob Badour _NEON2SSE_INLINE float32x4x2_t vld2q_f32(__transfersize(8) float32_t const * ptr) // VLD2.32 {d0, d2}, [r0]
9830*80a68eefSBob Badour {
9831*80a68eefSBob Badour     float32x4x2_t v;
9832*80a68eefSBob Badour     v.val[0] =  vld1q_f32 (ptr);
9833*80a68eefSBob Badour     v.val[1] =  vld1q_f32 ((ptr + 4));
9834*80a68eefSBob Badour     v = vuzpq_f32(v.val[0], v.val[1]);
9835*80a68eefSBob Badour     return v;
9836*80a68eefSBob Badour }
9837*80a68eefSBob Badour 
9838*80a68eefSBob Badour _NEON2SSESTORAGE poly8x16x2_t vld2q_p8(__transfersize(32) poly8_t const * ptr); // VLD2.8 {d0, d2}, [r0]
9839*80a68eefSBob Badour #define  vld2q_p8 vld2q_u8
9840*80a68eefSBob Badour 
9841*80a68eefSBob Badour _NEON2SSESTORAGE poly16x8x2_t vld2q_p16(__transfersize(16) poly16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
9842*80a68eefSBob Badour #define vld2q_p16 vld2q_u16
9843*80a68eefSBob Badour 
9844*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8x2_t vld2_u8(__transfersize(16) uint8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
9845*80a68eefSBob Badour _NEON2SSE_INLINE uint8x8x2_t vld2_u8(__transfersize(16) uint8_t const * ptr)
9846*80a68eefSBob Badour {
9847*80a68eefSBob Badour     uint8x8x2_t v;
9848*80a68eefSBob Badour     __m128i ld128;
9849*80a68eefSBob Badour     ld128 = vld1q_u8(ptr); //load the two 64-bit halves as one 128-bit value
9850*80a68eefSBob Badour     ld128 =  _mm_shuffle_epi8(ld128, *(__m128i*)mask8_16_even_odd);
9851*80a68eefSBob Badour     vst1q_u8((v.val), ld128); //  v.val[1] = _mm_shuffle_epi32(v.val[0], _SWAP_HI_LOW32);
9852*80a68eefSBob Badour     return v;
9853*80a68eefSBob Badour }
9854*80a68eefSBob Badour 
9855*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4x2_t vld2_u16(__transfersize(8) uint16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
9856*80a68eefSBob Badour _NEON2SSE_INLINE uint16x4x2_t vld2_u16(__transfersize(8) uint16_t const * ptr)
9857*80a68eefSBob Badour {
9858*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 uint16x4x2_t v;
9859*80a68eefSBob Badour     __m128i ld128;
9860*80a68eefSBob Badour     ld128 = vld1q_u16(ptr); //load the two 64-bit halves as one 128-bit value
9861*80a68eefSBob Badour     ld128 = _mm_shuffle_epi8(ld128, *(__m128i*) mask8_32_even_odd);
9862*80a68eefSBob Badour     vst1q_u16((v.val), ld128);
9863*80a68eefSBob Badour     return v;
9864*80a68eefSBob Badour }
9865*80a68eefSBob Badour 
9866*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2x2_t vld2_u32(__transfersize(4) uint32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
9867*80a68eefSBob Badour _NEON2SSE_INLINE uint32x2x2_t vld2_u32(__transfersize(4) uint32_t const * ptr)
9868*80a68eefSBob Badour {
9869*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 uint32x2x2_t v;
9870*80a68eefSBob Badour     __m128i ld128;
9871*80a68eefSBob Badour     ld128 = vld1q_u32(ptr); //load the two 64-bit halves as one 128-bit value
9872*80a68eefSBob Badour     ld128 = _mm_shuffle_epi32(ld128,  0 | (2 << 2) | (1 << 4) | (3 << 6));
9873*80a68eefSBob Badour     vst1q_u32((v.val), ld128);
9874*80a68eefSBob Badour     return v;
9875*80a68eefSBob Badour }
9876*80a68eefSBob Badour 
9877*80a68eefSBob Badour _NEON2SSESTORAGE uint64x1x2_t vld2_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
9878*80a68eefSBob Badour _NEON2SSE_INLINE uint64x1x2_t vld2_u64(__transfersize(2) uint64_t const * ptr)
9879*80a68eefSBob Badour {
9880*80a68eefSBob Badour     uint64x1x2_t v;
9881*80a68eefSBob Badour     v.val[0].m64_u64[0] = *(ptr);
9882*80a68eefSBob Badour     v.val[1].m64_u64[0] = *(ptr + 1);
9883*80a68eefSBob Badour     return v;
9884*80a68eefSBob Badour }
9885*80a68eefSBob Badour 
9886*80a68eefSBob Badour _NEON2SSESTORAGE int8x8x2_t vld2_s8(__transfersize(16) int8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
9887*80a68eefSBob Badour #define vld2_s8(ptr) vld2_u8((uint8_t*)ptr)
9888*80a68eefSBob Badour 
9889*80a68eefSBob Badour _NEON2SSESTORAGE int16x4x2_t vld2_s16(__transfersize(8) int16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
9890*80a68eefSBob Badour #define vld2_s16(ptr) vld2_u16((uint16_t*)ptr)
9891*80a68eefSBob Badour 
9892*80a68eefSBob Badour _NEON2SSESTORAGE int32x2x2_t vld2_s32(__transfersize(4) int32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
9893*80a68eefSBob Badour #define vld2_s32(ptr) vld2_u32((uint32_t*)ptr)
9894*80a68eefSBob Badour 
9895*80a68eefSBob Badour _NEON2SSESTORAGE int64x1x2_t vld2_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
9896*80a68eefSBob Badour #define vld2_s64(ptr) vld2_u64((uint64_t*)ptr)
9897*80a68eefSBob Badour 
9898*80a68eefSBob Badour _NEON2SSESTORAGE float16x4x2_t vld2_f16(__transfersize(8) __fp16 const * ptr); // VLD2.16 {d0, d1}, [r0]
9899*80a68eefSBob Badour // IA32 SIMD doesn't work with 16-bit floats currently, so the data needs to be widened to 32 bits and then handled in two 128-bit registers. See vld1_f16 for an example
9900*80a68eefSBob Badour 
9901*80a68eefSBob Badour _NEON2SSESTORAGE float32x2x2_t vld2_f32(__transfersize(4) float32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
9902*80a68eefSBob Badour _NEON2SSE_INLINE float32x2x2_t vld2_f32(__transfersize(4) float32_t const * ptr)
9903*80a68eefSBob Badour {
9904*80a68eefSBob Badour     float32x2x2_t v;
9905*80a68eefSBob Badour     v.val[0].m64_f32[0] = *(ptr);
9906*80a68eefSBob Badour     v.val[0].m64_f32[1] = *(ptr + 2);
9907*80a68eefSBob Badour     v.val[1].m64_f32[0] = *(ptr + 1);
9908*80a68eefSBob Badour     v.val[1].m64_f32[1] = *(ptr + 3);
9909*80a68eefSBob Badour     return v;
9910*80a68eefSBob Badour }
9911*80a68eefSBob Badour 
9912*80a68eefSBob Badour _NEON2SSESTORAGE poly8x8x2_t vld2_p8(__transfersize(16) poly8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
9913*80a68eefSBob Badour #define vld2_p8 vld2_u8
9914*80a68eefSBob Badour 
9915*80a68eefSBob Badour _NEON2SSESTORAGE poly16x4x2_t vld2_p16(__transfersize(8) poly16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
9916*80a68eefSBob Badour #define vld2_p16 vld2_u16
9917*80a68eefSBob Badour 
9918*80a68eefSBob Badour //******************** Triplets ***************************************
9919*80a68eefSBob Badour //*********************************************************************
9920*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16x3_t vld3q_u8(__transfersize(48) uint8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
9921*80a68eefSBob Badour _NEON2SSE_INLINE uint8x16x3_t vld3q_u8(__transfersize(48) uint8_t const * ptr) // VLD3.8 {d0, d2, d4}, [r0]
9922*80a68eefSBob Badour {
9923*80a68eefSBob Badour     //a0,a1,a2,a3,...a7,a8,...a15,  b0,b1,b2,...b7,b8,...b15, c0,c1,c2,...c7,c8,...c15 ->
9924*80a68eefSBob Badour     //a:0,3,6,9,12,15,b:2,5,8,11,14,  c:1,4,7,10,13
9925*80a68eefSBob Badour     //a:1,4,7,10,13,  b:0,3,6,9,12,15,c:2,5,8,11,14,
9926*80a68eefSBob Badour     //a:2,5,8,11,14,  b:1,4,7,10,13,  c:0,3,6,9,12,15
9927*80a68eefSBob Badour     uint8x16x3_t v;
9928*80a68eefSBob Badour     __m128i tmp0, tmp1,tmp2, tmp3;
9929*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 static const int8_t mask8_0[16] = {0,3,6,9,12,15,1,4,7,10,13,2,5,8,11,14};
9930*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 static const int8_t mask8_1[16] = {2,5,8,11,14,0,3,6,9,12,15,1,4,7,10,13};
9931*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 static const int8_t mask8_2[16] = {1,4,7,10,13,2,5,8,11,14,0,3,6,9,12,15};
9932*80a68eefSBob Badour 
9933*80a68eefSBob Badour     v.val[0] =  vld1q_u8 (ptr); //a0,a1,a2,a3,...a7, ...a15
9934*80a68eefSBob Badour     v.val[1] =  vld1q_u8 ((ptr + 16)); //b0,b1,b2,b3...b7, ...b15
9935*80a68eefSBob Badour     v.val[2] =  vld1q_u8 ((ptr + 32)); //c0,c1,c2,c3,...c7,...c15
9936*80a68eefSBob Badour 
9937*80a68eefSBob Badour     tmp0 = _mm_shuffle_epi8(v.val[0], *(__m128i*)mask8_0); //a:0,3,6,9,12,15,1,4,7,10,13,2,5,8,11,14
9938*80a68eefSBob Badour     tmp1 = _mm_shuffle_epi8(v.val[1], *(__m128i*)mask8_1); //b:2,5,8,11,14,0,3,6,9,12,15,1,4,7,10,13
9939*80a68eefSBob Badour     tmp2 = _mm_shuffle_epi8(v.val[2], *(__m128i*)mask8_2); //c:1,4,7,10,13,2,5,8,11,14,0,3,6,9,12,15
9940*80a68eefSBob Badour 
9941*80a68eefSBob Badour     tmp3 = _mm_slli_si128(tmp0,10); //0,0,0,0,0,0,0,0,0,0,a0,a3,a6,a9,a12,a15
9942*80a68eefSBob Badour     tmp3 = _mm_alignr_epi8(tmp1,tmp3, 10); //a:0,3,6,9,12,15,b:2,5,8,11,14,x,x,x,x,x
9943*80a68eefSBob Badour     tmp3 = _mm_slli_si128(tmp3, 5); //0,0,0,0,0,a:0,3,6,9,12,15,b:2,5,8,11,14,
9944*80a68eefSBob Badour     tmp3 = _mm_srli_si128(tmp3, 5); //a:0,3,6,9,12,15,b:2,5,8,11,14,:0,0,0,0,0
9945*80a68eefSBob Badour     v.val[0] = _mm_slli_si128(tmp2, 11); //0,0,0,0,0,0,0,0,0,0,0,0, 1,4,7,10,13,
9946*80a68eefSBob Badour     v.val[0] = _mm_or_si128(v.val[0],tmp3); //a:0,3,6,9,12,15,b:2,5,8,11,14,c:1,4,7,10,13,
9947*80a68eefSBob Badour 
9948*80a68eefSBob Badour     tmp3 = _mm_slli_si128(tmp0, 5); //0,0,0,0,0,a:0,3,6,9,12,15,1,4,7,10,13,
9949*80a68eefSBob Badour     tmp3 = _mm_srli_si128(tmp3, 11); //a:1,4,7,10,13, 0,0,0,0,0,0,0,0,0,0,0
9950*80a68eefSBob Badour     v.val[1] = _mm_srli_si128(tmp1,5); //b:0,3,6,9,12,15,C:1,4,7,10,13, 0,0,0,0,0
9951*80a68eefSBob Badour     v.val[1] = _mm_slli_si128(v.val[1], 5); //0,0,0,0,0,b:0,3,6,9,12,15,C:1,4,7,10,13,
9952*80a68eefSBob Badour     v.val[1] = _mm_or_si128(v.val[1],tmp3); //a:1,4,7,10,13,b:0,3,6,9,12,15,C:1,4,7,10,13,
9953*80a68eefSBob Badour     v.val[1] =  _mm_slli_si128(v.val[1],5); //0,0,0,0,0,a:1,4,7,10,13,b:0,3,6,9,12,15,
9954*80a68eefSBob Badour     v.val[1] = _mm_srli_si128(v.val[1], 5); //a:1,4,7,10,13,b:0,3,6,9,12,15,0,0,0,0,0
9955*80a68eefSBob Badour     tmp3 = _mm_srli_si128(tmp2,5); //c:2,5,8,11,14,0,3,6,9,12,15,0,0,0,0,0
9956*80a68eefSBob Badour     tmp3 = _mm_slli_si128(tmp3,11); //0,0,0,0,0,0,0,0,0,0,0,c:2,5,8,11,14,
9957*80a68eefSBob Badour     v.val[1] = _mm_or_si128(v.val[1],tmp3); //a:1,4,7,10,13,b:0,3,6,9,12,15,c:2,5,8,11,14,
9958*80a68eefSBob Badour 
9959*80a68eefSBob Badour     tmp3 = _mm_srli_si128(tmp2,10); //c:0,3,6,9,12,15, 0,0,0,0,0,0,0,0,0,0,
9960*80a68eefSBob Badour     tmp3 = _mm_slli_si128(tmp3,10); //0,0,0,0,0,0,0,0,0,0, c:0,3,6,9,12,15,
9961*80a68eefSBob Badour     v.val[2] = _mm_srli_si128(tmp1,11); //b:1,4,7,10,13,0,0,0,0,0,0,0,0,0,0,0
9962*80a68eefSBob Badour     v.val[2] = _mm_slli_si128(v.val[2],5); //0,0,0,0,0,b:1,4,7,10,13, 0,0,0,0,0,0
9963*80a68eefSBob Badour     v.val[2] = _mm_or_si128(v.val[2],tmp3); //0,0,0,0,0,b:1,4,7,10,13,c:0,3,6,9,12,15,
9964*80a68eefSBob Badour     tmp0 = _mm_srli_si128(tmp0, 11); //a:2,5,8,11,14, 0,0,0,0,0,0,0,0,0,0,0,
9965*80a68eefSBob Badour     v.val[2] = _mm_or_si128(v.val[2],tmp0); //a:2,5,8,11,14,b:1,4,7,10,13,c:0,3,6,9,12,15,
9966*80a68eefSBob Badour     return v;
9967*80a68eefSBob Badour }
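
//Usage sketch (illustrative, helper name hypothetical): splitting 48 bytes of interleaved RGB data into
//separate R, G and B planes.
_NEON2SSE_INLINE uint8x16x3_t _neon2sse_example_split_rgb_u8(uint8_t const * rgb)
{
    return vld3q_u8(rgb); //val[0] = R0..R15, val[1] = G0..G15, val[2] = B0..B15
}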
9968*80a68eefSBob Badour 
9969*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8x3_t vld3q_u16(__transfersize(24) uint16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
9970*80a68eefSBob Badour _NEON2SSE_INLINE uint16x8x3_t vld3q_u16(__transfersize(24) uint16_t const * ptr) // VLD3.16 {d0, d2, d4}, [r0]
9971*80a68eefSBob Badour {
9972*80a68eefSBob Badour     //a0, a1,a2,a3,...a7,  b0,b1,b2,b3,...b7, c0,c1,c2,c3...c7 -> a0,a3,a6,b1,b4,b7,c2,c5, a1,a4,a7,b2,b5,c0,c3,c6, a2,a5,b0,b3,b6,c1,c4,c7
9973*80a68eefSBob Badour     uint16x8x3_t v;
9974*80a68eefSBob Badour     __m128i tmp0, tmp1,tmp2, tmp3;
9975*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 static const int8_t mask16_0[16] = {0,1, 6,7, 12,13, 2,3, 8,9, 14,15, 4,5, 10,11};
9976*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 static const int8_t mask16_1[16] = {2,3, 8,9, 14,15, 4,5, 10,11, 0,1, 6,7, 12,13};
9977*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 static const int8_t mask16_2[16] = {4,5, 10,11, 0,1, 6,7, 12,13, 2,3, 8,9, 14,15};
9978*80a68eefSBob Badour 
9979*80a68eefSBob Badour     v.val[0] =  vld1q_u16 (ptr); //a0,a1,a2,a3,...a7,
9980*80a68eefSBob Badour     v.val[1] =  vld1q_u16 ((ptr + 8)); //b0,b1,b2,b3...b7
9981*80a68eefSBob Badour     v.val[2] =  vld1q_u16 ((ptr + 16)); //c0,c1,c2,c3,...c7
9982*80a68eefSBob Badour 
9983*80a68eefSBob Badour     tmp0 = _mm_shuffle_epi8(v.val[0], *(__m128i*)mask16_0); //a0,a3,a6,a1,a4,a7,a2,a5,
9984*80a68eefSBob Badour     tmp1 = _mm_shuffle_epi8(v.val[1], *(__m128i*)mask16_1); //b1,b4,b7,b2,b5,b0,b3,b6
9985*80a68eefSBob Badour     tmp2 = _mm_shuffle_epi8(v.val[2], *(__m128i*)mask16_2); //c2,c5, c0,c3,c6, c1,c4,c7
9986*80a68eefSBob Badour 
9987*80a68eefSBob Badour     tmp3 = _mm_slli_si128(tmp0,10); //0,0,0,0,0,a0,a3,a6,
9988*80a68eefSBob Badour     tmp3 = _mm_alignr_epi8(tmp1,tmp3, 10); //a0,a3,a6,b1,b4,b7,x,x
9989*80a68eefSBob Badour     tmp3 = _mm_slli_si128(tmp3, 4); //0,0, a0,a3,a6,b1,b4,b7
9990*80a68eefSBob Badour     tmp3 = _mm_srli_si128(tmp3, 4); //a0,a3,a6,b1,b4,b7,0,0
9991*80a68eefSBob Badour     v.val[0] = _mm_slli_si128(tmp2, 12); //0,0,0,0,0,0, c2,c5,
9992*80a68eefSBob Badour     v.val[0] = _mm_or_si128(v.val[0],tmp3); //a0,a3,a6,b1,b4,b7,c2,c5
9993*80a68eefSBob Badour 
9994*80a68eefSBob Badour     tmp3 = _mm_slli_si128(tmp0, 4); //0,0,a0,a3,a6,a1,a4,a7
9995*80a68eefSBob Badour     tmp3 = _mm_srli_si128(tmp3,10); //a1,a4,a7, 0,0,0,0,0
9996*80a68eefSBob Badour     v.val[1] = _mm_srli_si128(tmp1,6); //b2,b5,b0,b3,b6,0,0
9997*80a68eefSBob Badour     v.val[1] = _mm_slli_si128(v.val[1], 6); //0,0,0,b2,b5,b0,b3,b6,
9998*80a68eefSBob Badour     v.val[1] = _mm_or_si128(v.val[1],tmp3); //a1,a4,a7,b2,b5,b0,b3,b6,
9999*80a68eefSBob Badour     v.val[1] =  _mm_slli_si128(v.val[1],6); //0,0,0,a1,a4,a7,b2,b5,
10000*80a68eefSBob Badour     v.val[1] = _mm_srli_si128(v.val[1], 6); //a1,a4,a7,b2,b5,0,0,0,
10001*80a68eefSBob Badour     tmp3 = _mm_srli_si128(tmp2,4); //c0,c3,c6, c1,c4,c7,0,0
10002*80a68eefSBob Badour     tmp3 = _mm_slli_si128(tmp3,10); //0,0,0,0,0,c0,c3,c6,
10003*80a68eefSBob Badour     v.val[1] = _mm_or_si128(v.val[1],tmp3); //a1,a4,a7,b2,b5,c0,c3,c6,
10004*80a68eefSBob Badour 
10005*80a68eefSBob Badour     tmp3 = _mm_srli_si128(tmp2,10); //c1,c4,c7, 0,0,0,0,0
10006*80a68eefSBob Badour     tmp3 = _mm_slli_si128(tmp3,10); //0,0,0,0,0, c1,c4,c7,
10007*80a68eefSBob Badour     v.val[2] = _mm_srli_si128(tmp1,10); //b0,b3,b6,0,0, 0,0,0
10008*80a68eefSBob Badour     v.val[2] = _mm_slli_si128(v.val[2],4); //0,0, b0,b3,b6,0,0,0
10009*80a68eefSBob Badour     v.val[2] = _mm_or_si128(v.val[2],tmp3); //0,0, b0,b3,b6,c1,c4,c7,
10010*80a68eefSBob Badour     tmp0 = _mm_srli_si128(tmp0, 12); //a2,a5,0,0,0,0,0,0
10011*80a68eefSBob Badour     v.val[2] = _mm_or_si128(v.val[2],tmp0); //a2,a5,b0,b3,b6,c1,c4,c7,
10012*80a68eefSBob Badour     return v;
10013*80a68eefSBob Badour }
10014*80a68eefSBob Badour 
10015*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4x3_t vld3q_u32(__transfersize(12) uint32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
10016*80a68eefSBob Badour _NEON2SSE_INLINE uint32x4x3_t vld3q_u32(__transfersize(12) uint32_t const * ptr) // VLD3.32 {d0, d2, d4}, [r0]
10017*80a68eefSBob Badour {
10018*80a68eefSBob Badour     //a0,a1,a2,a3,  b0,b1,b2,b3, c0,c1,c2,c3 -> a0,a3,b2,c1,  a1,b0,b3,c2, a2,b1,c0,c3,
10019*80a68eefSBob Badour     uint32x4x3_t v;
10020*80a68eefSBob Badour     __m128i tmp0, tmp1,tmp2, tmp3;
10021*80a68eefSBob Badour     v.val[0] =  vld1q_u32 (ptr); //a0,a1,a2,a3,
10022*80a68eefSBob Badour     v.val[1] =  vld1q_u32 ((ptr + 4)); //b0,b1,b2,b3
10023*80a68eefSBob Badour     v.val[2] =  vld1q_u32 ((ptr + 8)); //c0,c1,c2,c3,
10024*80a68eefSBob Badour 
10025*80a68eefSBob Badour     tmp0 = _mm_shuffle_epi32(v.val[0], 0 | (3 << 2) | (1 << 4) | (2 << 6)); //a0,a3,a1,a2
10026*80a68eefSBob Badour     tmp1 = _mm_shuffle_epi32(v.val[1], _SWAP_HI_LOW32); //b2,b3,b0,b1
10027*80a68eefSBob Badour     tmp2 = _mm_shuffle_epi32(v.val[2], 1 | (2 << 2) | (0 << 4) | (3 << 6)); //c1,c2, c0,c3
10028*80a68eefSBob Badour 
10029*80a68eefSBob Badour     tmp3 = _mm_unpacklo_epi32(tmp1, tmp2); //b2,c1, b3,c2
10030*80a68eefSBob Badour     v.val[0] = _mm_unpacklo_epi64(tmp0,tmp3); //a0,a3,b2,c1
10031*80a68eefSBob Badour     tmp0 = _mm_unpackhi_epi32(tmp0, tmp1); //a1,b0, a2,b1
10032*80a68eefSBob Badour     v.val[1] = _mm_shuffle_epi32(tmp0, _SWAP_HI_LOW32 ); //a2,b1, a1,b0,
10033*80a68eefSBob Badour     v.val[1] = _mm_unpackhi_epi64(v.val[1], tmp3); //a1,b0, b3,c2
10034*80a68eefSBob Badour     v.val[2] = _mm_unpackhi_epi64(tmp0, tmp2); //a2,b1, c0,c3
10035*80a68eefSBob Badour     return v;
10036*80a68eefSBob Badour }
10037*80a68eefSBob Badour 
10038*80a68eefSBob Badour _NEON2SSESTORAGE int8x16x3_t vld3q_s8(__transfersize(48) int8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
10039*80a68eefSBob Badour #define  vld3q_s8(ptr) vld3q_u8((uint8_t*) (ptr))
10040*80a68eefSBob Badour 
10041*80a68eefSBob Badour _NEON2SSESTORAGE int16x8x3_t vld3q_s16(__transfersize(24) int16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
10042*80a68eefSBob Badour #define  vld3q_s16(ptr) vld3q_u16((uint16_t*) (ptr))
10043*80a68eefSBob Badour 
10044*80a68eefSBob Badour _NEON2SSESTORAGE int32x4x3_t vld3q_s32(__transfersize(12) int32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
10045*80a68eefSBob Badour #define  vld3q_s32(ptr) vld3q_u32((uint32_t*) (ptr))
10046*80a68eefSBob Badour 
10047*80a68eefSBob Badour _NEON2SSESTORAGE float16x8x3_t vld3q_f16(__transfersize(24) __fp16 const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
10048*80a68eefSBob Badour // IA32 SIMD doesn't work with 16-bit floats currently, so the data needs to be widened to 32 bits and then handled in two 128-bit registers. See vld1q_f16 for an example
10049*80a68eefSBob Badour 
10050*80a68eefSBob Badour _NEON2SSESTORAGE float32x4x3_t vld3q_f32(__transfersize(12) float32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
10051*80a68eefSBob Badour _NEON2SSE_INLINE float32x4x3_t vld3q_f32(__transfersize(12) float32_t const * ptr) // VLD3.32 {d0, d2, d4}, [r0]
10052*80a68eefSBob Badour {
10053*80a68eefSBob Badour     //a0,a1,a2,a3,  b0,b1,b2,b3, c0,c1,c2,c3 -> a0,a3,b2,c1,  a1,b0,b3,c2, a2,b1,c0,c3,
10054*80a68eefSBob Badour     float32x4x3_t v;
10055*80a68eefSBob Badour     __m128 tmp0, tmp1,tmp2, tmp3;
10056*80a68eefSBob Badour     v.val[0] =  vld1q_f32 (ptr); //a0,a1,a2,a3,
10057*80a68eefSBob Badour     v.val[1] =  vld1q_f32 ((ptr + 4)); //b0,b1,b2,b3
10058*80a68eefSBob Badour     v.val[2] =  vld1q_f32 ((ptr + 8)); //c0,c1,c2,c3,
10059*80a68eefSBob Badour 
10060*80a68eefSBob Badour     tmp0 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v.val[0]), 0 | (3 << 2) | (1 << 4) | (2 << 6))); //a0,a3,a1,a2
10061*80a68eefSBob Badour     tmp1 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v.val[1]), _SWAP_HI_LOW32)); //b2,b3,b0,b1
10062*80a68eefSBob Badour     tmp2 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v.val[2]), 1 | (2 << 2) | (0 << 4) | (3 << 6))); //c1,c2, c0,c3
10063*80a68eefSBob Badour     tmp3 = _mm_unpacklo_ps(tmp1, tmp2); //b2,c1, b3,c2
10064*80a68eefSBob Badour 
10065*80a68eefSBob Badour     v.val[0] = _mm_movelh_ps(tmp0,tmp3); //a0,a3,b2,c1
10066*80a68eefSBob Badour     tmp0 = _mm_unpackhi_ps(tmp0, tmp1); //a1,b0, a2,b1
10067*80a68eefSBob Badour     v.val[1] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(tmp0), _SWAP_HI_LOW32 )); //a2,b1, a1,b0,
10068*80a68eefSBob Badour     v.val[1] = _mm_movehl_ps(tmp3,v.val[1]); //a1,b0, b3,c2
10069*80a68eefSBob Badour     v.val[2] = _mm_movehl_ps(tmp2,tmp0); //a2,b1, c0,c3
10070*80a68eefSBob Badour     return v;
10071*80a68eefSBob Badour }
10072*80a68eefSBob Badour 
10073*80a68eefSBob Badour _NEON2SSESTORAGE poly8x16x3_t vld3q_p8(__transfersize(48) poly8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
10074*80a68eefSBob Badour #define vld3q_p8 vld3q_u8
10075*80a68eefSBob Badour 
10076*80a68eefSBob Badour _NEON2SSESTORAGE poly16x8x3_t vld3q_p16(__transfersize(24) poly16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
10077*80a68eefSBob Badour #define vld3q_p16 vld3q_u16
10078*80a68eefSBob Badour 
10079*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8x3_t vld3_u8(__transfersize(24) uint8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
10080*80a68eefSBob Badour _NEON2SSE_INLINE uint8x8x3_t vld3_u8(__transfersize(24) uint8_t const * ptr) // VLD3.8 {d0, d1, d2}, [r0]
10081*80a68eefSBob Badour {
10082*80a68eefSBob Badour     //a0, a1,a2,a3,...a7,  b0,b1,b2,b3,...b7, c0,c1,c2,c3...c7 -> a0,a3,a6,b1,b4,b7,c2,c5, a1,a4,a7,b2,b5,c0,c3,c6, a2,a5,b0,b3,b6,c1,c4,c7
10083*80a68eefSBob Badour     uint8x8x3_t v;
10084*80a68eefSBob Badour     __m128i val0, val1, val2, tmp0, tmp1;
10085*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 static const int8_t mask8_0[16] = {0,3,6,9,12,15, 1,4,7,10,13, 2,5,8,11,14};
10086*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 static const int8_t mask8_1[16] = {2,5, 0,3,6, 1,4,7, 0,0,0,0,0,0,0,0};
10087*80a68eefSBob Badour     val0 =  vld1q_u8 (ptr); //a0,a1,a2,a3,...a7, b0,b1,b2,b3...b7
10088*80a68eefSBob Badour     val2 =  _mm_loadl_epi64((__m128i*)(ptr + 16)); //c0,c1,c2,c3,...c7
10089*80a68eefSBob Badour 
10090*80a68eefSBob Badour     tmp0 = _mm_shuffle_epi8(val0, *(__m128i*)mask8_0); //a0,a3,a6,b1,b4,b7, a1,a4,a7,b2,b5, a2,a5,b0,b3,b6,
10091*80a68eefSBob Badour     tmp1 = _mm_shuffle_epi8(val2, *(__m128i*)mask8_1); //c2,c5, c0,c3,c6, c1,c4,c7,x,x,x,x,x,x,x,x
10092*80a68eefSBob Badour     val0 = _mm_slli_si128(tmp0,10);
10093*80a68eefSBob Badour     val0 = _mm_srli_si128(val0,10); //a0,a3,a6,b1,b4,b7, 0,0,0,0,0,0,0,0,0,0
10094*80a68eefSBob Badour     val2 = _mm_slli_si128(tmp1,6); //0,0,0,0,0,0,c2,c5,x,x,x,x,x,x,x,x
10095*80a68eefSBob Badour     val0 = _mm_or_si128(val0,val2); //a0,a3,a6,b1,b4,b7,c2,c5 x,x,x,x,x,x,x,x
10096*80a68eefSBob Badour     _M64(v.val[0], val0);
10097*80a68eefSBob Badour     val1 = _mm_slli_si128(tmp0,5); //0,0,0,0,0, a0,a3,a6,b1,b4,b7, a1,a4,a7,b2,b5
10098*80a68eefSBob Badour     val1 = _mm_srli_si128(val1,11); //a1,a4,a7,b2,b5,0,0,0,0,0,0,0,0,0,0,0,
10099*80a68eefSBob Badour     val2 = _mm_srli_si128(tmp1,2); //c0,c3,c6,c1,c4,c7,x,x,x,x,x,x,x,x,0,0
10100*80a68eefSBob Badour     val2 = _mm_slli_si128(val2,5); //0,0,0,0,0,c0,c3,c6,0,0,0,0,0,0,0,0
10101*80a68eefSBob Badour     val1 = _mm_or_si128(val1,val2); //a1,a4,a7,b2,b5,c0,c3,c6,x,x,x,x,x,x,x,x
10102*80a68eefSBob Badour     _M64(v.val[1], val1);
10103*80a68eefSBob Badour 
10104*80a68eefSBob Badour     tmp0 = _mm_srli_si128(tmp0,11); //a2,a5,b0,b3,b6,0,0,0,0,0,0,0,0,0,0,0,
10105*80a68eefSBob Badour     val2 = _mm_srli_si128(tmp1,5); //c1,c4,c7,0,0,0,0,0,0,0,0,0,0,0,0,0
10106*80a68eefSBob Badour     val2 = _mm_slli_si128(val2,5); //0,0,0,0,0,c1,c4,c7,
10107*80a68eefSBob Badour     val2 = _mm_or_si128(tmp0, val2); //a2,a5,b0,b3,b6,c1,c4,c7,x,x,x,x,x,x,x,x
10108*80a68eefSBob Badour     _M64(v.val[2], val2);
10109*80a68eefSBob Badour     return v;
10110*80a68eefSBob Badour }
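//A minimal usage sketch (not part of the library): vld3_u8 splits 24 interleaved bytes, e.g. eight
//RGB pixels, into planar channels; the names "rgb"/"split_rgb8" are hypothetical.
//    void split_rgb8(const uint8_t rgb[24], uint8_t r[8], uint8_t g[8], uint8_t b[8])
//    {
//        uint8x8x3_t v = vld3_u8(rgb); //val[0]=R0..R7, val[1]=G0..G7, val[2]=B0..B7
//        vst1_u8(r, v.val[0]);
//        vst1_u8(g, v.val[1]);
//        vst1_u8(b, v.val[2]);
//    }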
10111*80a68eefSBob Badour 
10112*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4x3_t vld3_u16(__transfersize(12) uint16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
10113*80a68eefSBob Badour _NEON2SSE_INLINE uint16x4x3_t vld3_u16(__transfersize(12) uint16_t const * ptr) // VLD3.16 {d0, d1, d2}, [r0]
10114*80a68eefSBob Badour {
10115*80a68eefSBob Badour     //a0,a1,a2,a3,  b0,b1,b2,b3, c0,c1,c2,c3 -> a0,a3,b2,c1,  a1,b0,b3,c2, a2,b1,c0,c3,
10116*80a68eefSBob Badour     uint16x4x3_t v;
10117*80a68eefSBob Badour     __m128i val0, val1, val2, tmp0, tmp1;
10118*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 static const int8_t mask16[16] = {0,1, 6,7, 12,13, 2,3, 8,9, 14,15, 4,5, 10,11};
10119*80a68eefSBob Badour     val0 =  vld1q_u16 (ptr); //a0,a1,a2,a3,  b0,b1,b2,b3
10120*80a68eefSBob Badour     val2 =  _mm_loadl_epi64((__m128i*)(ptr + 8)); //c0,c1,c2,c3, x,x,x,x
10121*80a68eefSBob Badour 
10122*80a68eefSBob Badour     tmp0 = _mm_shuffle_epi8(val0, *(__m128i*)mask16); //a0, a3, b2,a1, b0, b3, a2, b1
10123*80a68eefSBob Badour     tmp1 = _mm_shufflelo_epi16(val2, 201); //11 00 10 01     : c1, c2, c0, c3,
10124*80a68eefSBob Badour     val0 = _mm_slli_si128(tmp0,10);
10125*80a68eefSBob Badour     val0 = _mm_srli_si128(val0,10); //a0, a3, b2, 0,0, 0,0,
10126*80a68eefSBob Badour     val2 = _mm_slli_si128(tmp1,14); //0,0,0,0,0,0,0,c1
10127*80a68eefSBob Badour     val2 = _mm_srli_si128(val2,8); //0,0,0,c1,0,0,0,0
10128*80a68eefSBob Badour     val0 = _mm_or_si128(val0,val2); //a0, a3, b2, c1, x,x,x,x
10129*80a68eefSBob Badour     _M64(v.val[0], val0);
10130*80a68eefSBob Badour 
10131*80a68eefSBob Badour     val1 = _mm_slli_si128(tmp0,4); //0,0, a0,a3,b2, a1,b0,b3
10132*80a68eefSBob Badour     val1 = _mm_srli_si128(val1,10); //a1, b0, b3, 0,0, 0,0,
10133*80a68eefSBob Badour     val2 = _mm_srli_si128(tmp1,2); //c2, 0,0,0,0,0,0,0,
10134*80a68eefSBob Badour     val2 = _mm_slli_si128(val2,6); //0,0,0,c2,0,0,0,0
10135*80a68eefSBob Badour     val1 = _mm_or_si128(val1,val2); //a1, b0, b3, c2, x,x,x,x
10136*80a68eefSBob Badour     _M64(v.val[1], val1);
10137*80a68eefSBob Badour 
10138*80a68eefSBob Badour     tmp0 = _mm_srli_si128(tmp0,12); //a2, b1,0,0,0,0,0,0
10139*80a68eefSBob Badour     tmp1 = _mm_srli_si128(tmp1,4);
10140*80a68eefSBob Badour     tmp1 = _mm_slli_si128(tmp1,4); //0,0,c0, c3,
10141*80a68eefSBob Badour     val2 = _mm_or_si128(tmp0, tmp1); //a2, b1, c0, c3,
10142*80a68eefSBob Badour     _M64(v.val[2], val2);
10143*80a68eefSBob Badour     return v;
10144*80a68eefSBob Badour }
10145*80a68eefSBob Badour 
10146*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2x3_t vld3_u32(__transfersize(6) uint32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
10147*80a68eefSBob Badour _NEON2SSE_INLINE uint32x2x3_t vld3_u32(__transfersize(6) uint32_t const * ptr) // VLD3.32 {d0, d1, d2}, [r0]
10148*80a68eefSBob Badour {
10149*80a68eefSBob Badour     //a0,a1,  b0,b1, c0,c1,  -> a0,b1, a1,c0, b0,c1
10150*80a68eefSBob Badour     uint32x2x3_t v;
10151*80a68eefSBob Badour     __m128i val0, val1, val2;
10152*80a68eefSBob Badour     val0 =  vld1q_u32 (ptr); //a0,a1,  b0,b1,
10153*80a68eefSBob Badour     val2 =   _mm_loadl_epi64((__m128i*) (ptr + 4)); //c0,c1, x,x
10154*80a68eefSBob Badour 
10155*80a68eefSBob Badour     val0 = _mm_shuffle_epi32(val0, 0 | (3 << 2) | (1 << 4) | (2 << 6)); //a0,b1, a1, b0
10156*80a68eefSBob Badour     _M64(v.val[0], val0);
10157*80a68eefSBob Badour     val2 =  _mm_slli_si128(val2, 8); //x, x,c0,c1,
10158*80a68eefSBob Badour     val1 =  _mm_unpackhi_epi32(val0,val2); //a1,c0, b0, c1
10159*80a68eefSBob Badour     _M64(v.val[1], val1);
10160*80a68eefSBob Badour     val2 =  _mm_srli_si128(val1, 8); //b0, c1, x, x,
10161*80a68eefSBob Badour     _M64(v.val[2], val2);
10162*80a68eefSBob Badour     return v;
10163*80a68eefSBob Badour }
10164*80a68eefSBob Badour _NEON2SSESTORAGE uint64x1x3_t vld3_u64(__transfersize(3) uint64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
10165*80a68eefSBob Badour _NEON2SSE_INLINE uint64x1x3_t vld3_u64(__transfersize(3) uint64_t const * ptr) // VLD1.64 {d0, d1, d2}, [r0]
10166*80a68eefSBob Badour {
10167*80a68eefSBob Badour     uint64x1x3_t v;
10168*80a68eefSBob Badour     v.val[0].m64_u64[0] = *(ptr);
10169*80a68eefSBob Badour     v.val[1].m64_u64[0] = *(ptr + 1);
10170*80a68eefSBob Badour     v.val[2].m64_u64[0] = *(ptr + 2);
10171*80a68eefSBob Badour     return v;
10172*80a68eefSBob Badour }
10173*80a68eefSBob Badour 
10174*80a68eefSBob Badour _NEON2SSESTORAGE int8x8x3_t vld3_s8(__transfersize(24) int8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
10175*80a68eefSBob Badour #define vld3_s8(ptr) vld3_u8((uint8_t*)ptr)
10176*80a68eefSBob Badour 
10177*80a68eefSBob Badour _NEON2SSESTORAGE int16x4x3_t vld3_s16(__transfersize(12) int16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
10178*80a68eefSBob Badour #define vld3_s16(ptr) vld3_u16((uint16_t*)ptr)
10179*80a68eefSBob Badour 
10180*80a68eefSBob Badour _NEON2SSESTORAGE int32x2x3_t vld3_s32(__transfersize(6) int32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
10181*80a68eefSBob Badour #define vld3_s32(ptr) vld3_u32((uint32_t*)ptr)
10182*80a68eefSBob Badour 
10183*80a68eefSBob Badour _NEON2SSESTORAGE int64x1x3_t vld3_s64(__transfersize(3) int64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
10184*80a68eefSBob Badour #define vld3_s64(ptr) vld3_u64((uint64_t*)ptr)
10185*80a68eefSBob Badour 
10186*80a68eefSBob Badour _NEON2SSESTORAGE float16x4x3_t vld3_f16(__transfersize(12) __fp16 const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
10187*80a68eefSBob Badour // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
10188*80a68eefSBob Badour 
10189*80a68eefSBob Badour _NEON2SSESTORAGE float32x2x3_t vld3_f32(__transfersize(6) float32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
10190*80a68eefSBob Badour _NEON2SSE_INLINE float32x2x3_t vld3_f32(__transfersize(6) float32_t const * ptr)
10191*80a68eefSBob Badour {
10192*80a68eefSBob Badour     //a0,a1,  b0,b1, c0,c1,  -> a0,b1, a1,c0, b0,c1
10193*80a68eefSBob Badour     float32x2x3_t v;
10194*80a68eefSBob Badour     v.val[0].m64_f32[0] = *(ptr);
10195*80a68eefSBob Badour     v.val[0].m64_f32[1] = *(ptr + 3);
10196*80a68eefSBob Badour 
10197*80a68eefSBob Badour     v.val[1].m64_f32[0] = *(ptr + 1);
10198*80a68eefSBob Badour     v.val[1].m64_f32[1] = *(ptr + 4);
10199*80a68eefSBob Badour 
10200*80a68eefSBob Badour     v.val[2].m64_f32[0] = *(ptr + 2);
10201*80a68eefSBob Badour     v.val[2].m64_f32[1] = *(ptr + 5);
10202*80a68eefSBob Badour     return v;
10203*80a68eefSBob Badour }
10204*80a68eefSBob Badour 
10205*80a68eefSBob Badour _NEON2SSESTORAGE poly8x8x3_t vld3_p8(__transfersize(24) poly8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
10206*80a68eefSBob Badour #define vld3_p8 vld3_u8
10207*80a68eefSBob Badour 
10208*80a68eefSBob Badour _NEON2SSESTORAGE poly16x4x3_t vld3_p16(__transfersize(12) poly16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
10209*80a68eefSBob Badour #define vld3_p16 vld3_u16
10210*80a68eefSBob Badour 
10211*80a68eefSBob Badour //***************  Quadruples load ********************************
10212*80a68eefSBob Badour //*****************************************************************
10213*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16x4_t vld4q_u8(__transfersize(64) uint8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
10214*80a68eefSBob Badour _NEON2SSE_INLINE uint8x16x4_t vld4q_u8(__transfersize(64) uint8_t const * ptr) // VLD4.8 {d0, d2, d4, d6}, [r0]
10215*80a68eefSBob Badour {
10216*80a68eefSBob Badour     uint8x16x4_t v;
10217*80a68eefSBob Badour     __m128i tmp3, tmp2, tmp1, tmp0;
10218*80a68eefSBob Badour 
10219*80a68eefSBob Badour     v.val[0] = vld1q_u8 ( ptr); //a0,a1,a2,...a7, ...a15
10220*80a68eefSBob Badour     v.val[1] = vld1q_u8 ( (ptr + 16)); //b0, b1,b2,...b7.... b15
10221*80a68eefSBob Badour     v.val[2] = vld1q_u8 ( (ptr + 32)); //c0, c1,c2,...c7....c15
10222*80a68eefSBob Badour     v.val[3] = vld1q_u8 ( (ptr + 48)); //d0,d1,d2,...d7....d15
10223*80a68eefSBob Badour 
10224*80a68eefSBob Badour     tmp0 = _mm_unpacklo_epi8(v.val[0],v.val[1]); //a0,b0, a1,b1, a2,b2, a3,b3,....a7,b7
10225*80a68eefSBob Badour     tmp1 = _mm_unpacklo_epi8(v.val[2],v.val[3]); //c0,d0, c1,d1, c2,d2, c3,d3,... c7,d7
10226*80a68eefSBob Badour     tmp2 = _mm_unpackhi_epi8(v.val[0],v.val[1]); //a8,b8, a9,b9, a10,b10, a11,b11,...a15,b15
10227*80a68eefSBob Badour     tmp3 = _mm_unpackhi_epi8(v.val[2],v.val[3]); //c8,d8, c9,d9, c10,d10, c11,d11,...c15,d15
10228*80a68eefSBob Badour 
10229*80a68eefSBob Badour     v.val[0] = _mm_unpacklo_epi8(tmp0, tmp2); //a0,a8, b0,b8,  a1,a9, b1,b9, ....a3,a11, b3,b11
10230*80a68eefSBob Badour     v.val[1] = _mm_unpackhi_epi8(tmp0, tmp2); //a4,a12, b4,b12, a5,a13, b5,b13,....a7,a15,b7,b15
10231*80a68eefSBob Badour     v.val[2] = _mm_unpacklo_epi8(tmp1, tmp3); //c0,c8, d0,d8, c1,c9, d1,d9.....d3,d11
10232*80a68eefSBob Badour     v.val[3] = _mm_unpackhi_epi8(tmp1, tmp3); //c4,c12,d4,d12, c5,c13, d5,d13,....d7,d15
10233*80a68eefSBob Badour 
10234*80a68eefSBob Badour     tmp0 =  _mm_unpacklo_epi32(v.val[0], v.val[2] ); ///a0,a8, b0,b8, c0,c8,  d0,d8, a1,a9, b1,b9, c1,c9, d1,d9
10235*80a68eefSBob Badour     tmp1 =  _mm_unpackhi_epi32(v.val[0], v.val[2] ); //a2,a10, b2,b10, c2,c10, d2,d10, a3,a11, b3,b11, c3,c11, d3,d11
10236*80a68eefSBob Badour     tmp2 =  _mm_unpacklo_epi32(v.val[1], v.val[3] ); //a4,a12, b4,b12, c4,c12, d4,d12, a5,a13, b5,b13, c5,c13, d5,d13,
10237*80a68eefSBob Badour     tmp3 =  _mm_unpackhi_epi32(v.val[1], v.val[3] ); //a6,a14, b6,b14, c6,c14, d6,d14, a7,a15,b7,b15,c7,c15,d7,d15
10238*80a68eefSBob Badour 
10239*80a68eefSBob Badour     v.val[0] = _mm_unpacklo_epi8(tmp0, tmp2); //a0,a4,a8,a12,b0,b4,b8,b12,c0,c4,c8,c12,d0,d4,d8,d12
10240*80a68eefSBob Badour     v.val[1] = _mm_unpackhi_epi8(tmp0, tmp2); //a1,a5, a9, a13, b1,b5, b9,b13, c1,c5, c9, c13, d1,d5, d9,d13
10241*80a68eefSBob Badour     v.val[2] = _mm_unpacklo_epi8(tmp1, tmp3); //a2,a6, a10,a14, b2,b6, b10,b14,c2,c6, c10,c14, d2,d6, d10,d14
10242*80a68eefSBob Badour     v.val[3] = _mm_unpackhi_epi8(tmp1, tmp3); //a3,a7, a11,a15, b3,b7, b11,b15,c3,c7, c11, c15,d3,d7, d11,d15
10243*80a68eefSBob Badour     return v;
10244*80a68eefSBob Badour }
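//A minimal usage sketch (not part of the library): vld4q_u8 turns 64 interleaved bytes, e.g. sixteen
//RGBA pixels, into four planar channel vectors; the names below are hypothetical.
//    void split_rgba8(const uint8_t rgba[64], uint8_t r[16], uint8_t g[16], uint8_t b[16], uint8_t a[16])
//    {
//        uint8x16x4_t v = vld4q_u8(rgba);
//        vst1q_u8(r, v.val[0]); //R0..R15
//        vst1q_u8(g, v.val[1]); //G0..G15
//        vst1q_u8(b, v.val[2]); //B0..B15
//        vst1q_u8(a, v.val[3]); //A0..A15
//    }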
10245*80a68eefSBob Badour 
10246*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8x4_t vld4q_u16(__transfersize(32) uint16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
10247*80a68eefSBob Badour _NEON2SSE_INLINE uint16x8x4_t vld4q_u16(__transfersize(32) uint16_t const * ptr) // VLD4.16 {d0, d2, d4, d6}, [r0]
10248*80a68eefSBob Badour {
10249*80a68eefSBob Badour     uint16x8x4_t v;
10250*80a68eefSBob Badour     __m128i tmp3, tmp2, tmp1, tmp0;
10251*80a68eefSBob Badour     tmp0  =  vld1q_u16 (ptr); //a0,a1,a2,...a7
10252*80a68eefSBob Badour     tmp1  =  vld1q_u16 ((ptr + 8)); //b0, b1,b2,...b7
10253*80a68eefSBob Badour     tmp2  =  vld1q_u16 ((ptr + 16)); //c0, c1,c2,...c7
10254*80a68eefSBob Badour     tmp3  =  vld1q_u16 ((ptr + 24)); //d0,d1,d2,...d7
10255*80a68eefSBob Badour     v.val[0] = _mm_unpacklo_epi16(tmp0,tmp1); //a0,b0, a1,b1, a2,b2, a3,b3,
10256*80a68eefSBob Badour     v.val[1] = _mm_unpacklo_epi16(tmp2,tmp3); //c0,d0, c1,d1, c2,d2, c3,d3,
10257*80a68eefSBob Badour     v.val[2] = _mm_unpackhi_epi16(tmp0,tmp1); //a4,b4, a5,b5, a6,b6, a7,b7
10258*80a68eefSBob Badour     v.val[3] = _mm_unpackhi_epi16(tmp2,tmp3); //c4,d4, c5,d5, c6,d6, c7,d7
10259*80a68eefSBob Badour     tmp0 = _mm_unpacklo_epi16(v.val[0], v.val[2]); //a0,a4, b0,b4, a1,a5, b1,b5
10260*80a68eefSBob Badour     tmp1 = _mm_unpackhi_epi16(v.val[0], v.val[2]); //a2,a6, b2,b6, a3,a7, b3,b7
10261*80a68eefSBob Badour     tmp2 = _mm_unpacklo_epi16(v.val[1], v.val[3]); //c0,c4, d0,d4, c1,c5, d1,d5
10262*80a68eefSBob Badour     tmp3 = _mm_unpackhi_epi16(v.val[1], v.val[3]); //c2,c6, d2,d6, c3,c7, d3,d7
10263*80a68eefSBob Badour     v.val[0] =  _mm_unpacklo_epi64(tmp0, tmp2); //a0,a4, b0,b4, c0,c4, d0,d4,
10264*80a68eefSBob Badour     v.val[1] =  _mm_unpackhi_epi64(tmp0, tmp2); //a1,a5, b1,b5, c1,c5, d1,d5
10265*80a68eefSBob Badour     v.val[2] =  _mm_unpacklo_epi64(tmp1, tmp3); //a2,a6, b2,b6, c2,c6, d2,d6,
10266*80a68eefSBob Badour     v.val[3] =  _mm_unpackhi_epi64(tmp1, tmp3); //a3,a7, b3,b7, c3,c7, d3,d7
10267*80a68eefSBob Badour     return v;
10268*80a68eefSBob Badour }
10269*80a68eefSBob Badour 
10270*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4x4_t vld4q_u32(__transfersize(16) uint32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
10271*80a68eefSBob Badour _NEON2SSE_INLINE uint32x4x4_t vld4q_u32(__transfersize(16) uint32_t const * ptr) // VLD4.32 {d0, d2, d4, d6}, [r0]
10272*80a68eefSBob Badour {
10273*80a68eefSBob Badour     uint32x4x4_t v;
10274*80a68eefSBob Badour     __m128i tmp3, tmp2, tmp1, tmp0;
10275*80a68eefSBob Badour     v.val[0] =  vld1q_u32 (ptr);
10276*80a68eefSBob Badour     v.val[1] =  vld1q_u32 ((ptr + 4));
10277*80a68eefSBob Badour     v.val[2] =  vld1q_u32 ((ptr + 8));
10278*80a68eefSBob Badour     v.val[3] =  vld1q_u32 ((ptr + 12));
10279*80a68eefSBob Badour     tmp0 = _mm_unpacklo_epi32(v.val[0],v.val[1]);
10280*80a68eefSBob Badour     tmp1 = _mm_unpacklo_epi32(v.val[2],v.val[3]);
10281*80a68eefSBob Badour     tmp2 = _mm_unpackhi_epi32(v.val[0],v.val[1]);
10282*80a68eefSBob Badour     tmp3 = _mm_unpackhi_epi32(v.val[2],v.val[3]);
10283*80a68eefSBob Badour     v.val[0] = _mm_unpacklo_epi64(tmp0, tmp1);
10284*80a68eefSBob Badour     v.val[1] = _mm_unpackhi_epi64(tmp0, tmp1);
10285*80a68eefSBob Badour     v.val[2] = _mm_unpacklo_epi64(tmp2, tmp3);
10286*80a68eefSBob Badour     v.val[3] = _mm_unpackhi_epi64(tmp2, tmp3);
10287*80a68eefSBob Badour     return v;
10288*80a68eefSBob Badour }
10289*80a68eefSBob Badour 
10290*80a68eefSBob Badour _NEON2SSESTORAGE int8x16x4_t vld4q_s8(__transfersize(64) int8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
10291*80a68eefSBob Badour #define vld4q_s8(ptr) vld4q_u8((uint8_t*)ptr)
10292*80a68eefSBob Badour 
10293*80a68eefSBob Badour _NEON2SSESTORAGE int16x8x4_t vld4q_s16(__transfersize(32) int16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
10294*80a68eefSBob Badour #define  vld4q_s16(ptr) vld4q_u16((uint16_t*)ptr)
10295*80a68eefSBob Badour 
10296*80a68eefSBob Badour _NEON2SSESTORAGE int32x4x4_t vld4q_s32(__transfersize(16) int32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
10297*80a68eefSBob Badour #define  vld4q_s32(ptr) vld4q_u32((uint32_t*)ptr)
10298*80a68eefSBob Badour 
10299*80a68eefSBob Badour _NEON2SSESTORAGE float16x8x4_t vld4q_f16(__transfersize(32) __fp16 const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
10300*80a68eefSBob Badour // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
10301*80a68eefSBob Badour 
10302*80a68eefSBob Badour _NEON2SSESTORAGE float32x4x4_t vld4q_f32(__transfersize(16) float32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
10303*80a68eefSBob Badour _NEON2SSE_INLINE float32x4x4_t vld4q_f32(__transfersize(16) float32_t const * ptr) // VLD4.32 {d0, d2, d4, d6}, [r0]
10304*80a68eefSBob Badour {
10305*80a68eefSBob Badour     float32x4x4_t v;
10306*80a68eefSBob Badour     __m128 tmp3, tmp2, tmp1, tmp0;
10307*80a68eefSBob Badour 
10308*80a68eefSBob Badour     v.val[0] =  vld1q_f32 ((float*) ptr);
10309*80a68eefSBob Badour     v.val[1] =  vld1q_f32 ((float*) (ptr + 4));
10310*80a68eefSBob Badour     v.val[2] =  vld1q_f32 ((float*) (ptr + 8));
10311*80a68eefSBob Badour     v.val[3] =  vld1q_f32 ((float*) (ptr + 12));
10312*80a68eefSBob Badour     tmp0 = _mm_unpacklo_ps(v.val[0], v.val[1]);
10313*80a68eefSBob Badour     tmp2 = _mm_unpacklo_ps(v.val[2], v.val[3]);
10314*80a68eefSBob Badour     tmp1 = _mm_unpackhi_ps(v.val[0], v.val[1]);
10315*80a68eefSBob Badour     tmp3 = _mm_unpackhi_ps(v.val[2], v.val[3]);
10316*80a68eefSBob Badour     v.val[0] = _mm_movelh_ps(tmp0, tmp2);
10317*80a68eefSBob Badour     v.val[1] = _mm_movehl_ps(tmp2, tmp0);
10318*80a68eefSBob Badour     v.val[2] = _mm_movelh_ps(tmp1, tmp3);
10319*80a68eefSBob Badour     v.val[3] = _mm_movehl_ps(tmp3, tmp1);
10320*80a68eefSBob Badour     return v;
10321*80a68eefSBob Badour }
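//Note: the unpack + movelh/movehl sequence above is the classic SSE 4x4 transpose pattern
//(it produces the same result as _MM_TRANSPOSE4_PS). A minimal usage sketch (not part of the
//library), converting four packed xyzw vertices to planar form; the names are hypothetical.
//    void xyzw_to_planes(const float32_t vtx[16], float32_t x[4], float32_t y[4], float32_t z[4], float32_t w[4])
//    {
//        float32x4x4_t v = vld4q_f32(vtx);
//        vst1q_f32(x, v.val[0]);
//        vst1q_f32(y, v.val[1]);
//        vst1q_f32(z, v.val[2]);
//        vst1q_f32(w, v.val[3]);
//    }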
10322*80a68eefSBob Badour 
10323*80a68eefSBob Badour _NEON2SSESTORAGE poly8x16x4_t vld4q_p8(__transfersize(64) poly8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
10324*80a68eefSBob Badour #define vld4q_p8 vld4q_u8
10325*80a68eefSBob Badour 
10326*80a68eefSBob Badour _NEON2SSESTORAGE poly16x8x4_t vld4q_p16(__transfersize(32) poly16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
10327*80a68eefSBob Badour #define vld4q_p16 vld4q_s16
10328*80a68eefSBob Badour 
10329*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8x4_t vld4_u8(__transfersize(32) uint8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
10330*80a68eefSBob Badour _NEON2SSE_INLINE uint8x8x4_t vld4_u8(__transfersize(32) uint8_t const * ptr) // VLD4.8 {d0, d1, d2, d3}, [r0]
10331*80a68eefSBob Badour {
10332*80a68eefSBob Badour     uint8x8x4_t v;
10333*80a68eefSBob Badour     __m128i sh0, sh1;
10334*80a68eefSBob Badour     __m128i val0,  val2;
10335*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 static const int8_t mask4_8[16] = {0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15};
10336*80a68eefSBob Badour 
10337*80a68eefSBob Badour     val0 = vld1q_u8(( ptr)); //load the first and second 64-bit halves: future val[0] and val[1]
10338*80a68eefSBob Badour     val2 = vld1q_u8(( ptr + 16)); //load the third and fourth 64-bit halves: future val[2] and val[3]
10339*80a68eefSBob Badour 
10340*80a68eefSBob Badour     sh0 = _mm_shuffle_epi8(val0, *(__m128i*)mask4_8);
10341*80a68eefSBob Badour     sh1 = _mm_shuffle_epi8(val2, *(__m128i*)mask4_8);
10342*80a68eefSBob Badour     val0 = _mm_unpacklo_epi32(sh0,sh1); //0,4,8,12,16,20,24,28, 1,5,9,13,17,21,25,29
10343*80a68eefSBob Badour     vst1q_u8(&v.val[0], val0 );
10344*80a68eefSBob Badour     val2 = _mm_unpackhi_epi32(sh0,sh1); //2,6,10,14,18,22,26,30, 3,7,11,15,19,23,27,31
10345*80a68eefSBob Badour     vst1q_u8(&v.val[2], val2 );
10346*80a68eefSBob Badour     return v;
10347*80a68eefSBob Badour }
10348*80a68eefSBob Badour 
10349*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4x4_t vld4_u16(__transfersize(16) uint16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
10350*80a68eefSBob Badour _NEON2SSE_INLINE uint16x4x4_t vld4_u16(__transfersize(16) uint16_t const * ptr) // VLD4.16 {d0, d1, d2, d3}, [r0]
10351*80a68eefSBob Badour {
10352*80a68eefSBob Badour     uint16x4x4_t v;
10353*80a68eefSBob Badour     __m128i sh0, sh1;
10354*80a68eefSBob Badour     __m128i val0, val2;
10355*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 static const int8_t mask4_16[16] = {0,1, 8,9, 2,3, 10,11, 4,5, 12,13, 6,7, 14,15}; //0, 4, 1, 5, 2, 6, 3, 7
10356*80a68eefSBob Badour     val0 = vld1q_u16 ( (ptr)); //load the first and second 64-bit halves: future val[0] and val[1]
10357*80a68eefSBob Badour     val2 = vld1q_u16 ( (ptr + 8)); //load the third and fourth 64-bit halves: future val[2] and val[3]
10358*80a68eefSBob Badour     sh0 = _mm_shuffle_epi8(val0, *(__m128i*)mask4_16);
10359*80a68eefSBob Badour     sh1 = _mm_shuffle_epi8(val2, *(__m128i*)mask4_16);
10360*80a68eefSBob Badour     val0 = _mm_unpacklo_epi32(sh0,sh1); //0,4,8,12, 1,5,9,13
10361*80a68eefSBob Badour     vst1q_u16(&v.val[0], val0 );
10362*80a68eefSBob Badour     val2 = _mm_unpackhi_epi32(sh0,sh1); //2,6,10,14, 3,7,11,15
10363*80a68eefSBob Badour     vst1q_u16(&v.val[2], val2 );
10364*80a68eefSBob Badour     return v;
10365*80a68eefSBob Badour }
10366*80a68eefSBob Badour 
10367*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2x4_t vld4_u32(__transfersize(8) uint32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
10368*80a68eefSBob Badour _NEON2SSE_INLINE uint32x2x4_t vld4_u32(__transfersize(8) uint32_t const * ptr)
10369*80a68eefSBob Badour {
10370*80a68eefSBob Badour     //a0,a1,  b0,b1, c0,c1, d0,d1 -> a0,c0, a1,c1, b0,d0, b1,d1
10371*80a68eefSBob Badour     uint32x2x4_t v;
10372*80a68eefSBob Badour     __m128i val0, val01, val2;
10373*80a68eefSBob Badour     val0 =  vld1q_u32 (ptr); //a0,a1,  b0,b1,
10374*80a68eefSBob Badour     val2 =  vld1q_u32 ((ptr + 4)); //c0,c1, d0,d1
10375*80a68eefSBob Badour     val01 = _mm_unpacklo_epi32(val0,val2); //a0, c0, a1,c1,
10376*80a68eefSBob Badour     val2 = _mm_unpackhi_epi32(val0,val2); //b0,d0, b1, d1
10377*80a68eefSBob Badour     vst1q_u32(&v.val[0], val01);
10378*80a68eefSBob Badour     vst1q_u32(&v.val[2], val2 );
10379*80a68eefSBob Badour     return v;
10380*80a68eefSBob Badour }
10381*80a68eefSBob Badour 
10382*80a68eefSBob Badour _NEON2SSESTORAGE uint64x1x4_t vld4_u64(__transfersize(4) uint64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
10383*80a68eefSBob Badour _NEON2SSE_INLINE uint64x1x4_t vld4_u64(__transfersize(4) uint64_t const * ptr) // VLD1.64 {d0, d1, d2, d3}, [r0]
10384*80a68eefSBob Badour {
10385*80a68eefSBob Badour     uint64x1x4_t v;
10386*80a68eefSBob Badour     v.val[0].m64_u64[0] = *(ptr); //load the first 64-bit element into val[0]
10387*80a68eefSBob Badour     v.val[1].m64_u64[0] = *(ptr + 1); //load the second 64-bit element into val[1]
10388*80a68eefSBob Badour     v.val[2].m64_u64[0] = *(ptr + 2); //load the third 64-bit element into val[2]
10389*80a68eefSBob Badour     v.val[3].m64_u64[0] = *(ptr + 3); //load the fourth 64-bit element into val[3]
10390*80a68eefSBob Badour     return v;
10391*80a68eefSBob Badour }
10392*80a68eefSBob Badour 
10393*80a68eefSBob Badour _NEON2SSESTORAGE int8x8x4_t vld4_s8(__transfersize(32) int8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
10394*80a68eefSBob Badour #define  vld4_s8(ptr) vld4_u8((uint8_t*)ptr)
10395*80a68eefSBob Badour 
10396*80a68eefSBob Badour _NEON2SSESTORAGE int16x4x4_t vld4_s16(__transfersize(16) int16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
10397*80a68eefSBob Badour #define vld4_s16(ptr) vld4_u16((uint16_t*)ptr)
10398*80a68eefSBob Badour 
10399*80a68eefSBob Badour _NEON2SSESTORAGE int32x2x4_t vld4_s32(__transfersize(8) int32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
10400*80a68eefSBob Badour #define vld4_s32(ptr) vld4_u32((uint32_t*)ptr)
10401*80a68eefSBob Badour 
10402*80a68eefSBob Badour _NEON2SSESTORAGE int64x1x4_t vld4_s64(__transfersize(4) int64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
10403*80a68eefSBob Badour #define vld4_s64(ptr) vld4_u64((uint64_t*)ptr)
10404*80a68eefSBob Badour 
10405*80a68eefSBob Badour _NEON2SSESTORAGE float16x4x4_t vld4_f16(__transfersize(16) __fp16 const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
10406*80a68eefSBob Badour // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
10407*80a68eefSBob Badour 
10408*80a68eefSBob Badour _NEON2SSESTORAGE float32x2x4_t vld4_f32(__transfersize(8) float32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
10409*80a68eefSBob Badour _NEON2SSE_INLINE float32x2x4_t vld4_f32(__transfersize(8) float32_t const * ptr) // VLD4.32 {d0, d1, d2, d3}, [r0]
10410*80a68eefSBob Badour {
10411*80a68eefSBob Badour     //a0,a1,  b0,b1, c0,c1, d0,d1 -> a0,c0, a1,c1, b0,d0, b1,d1
10412*80a68eefSBob Badour     float32x2x4_t res;
10413*80a68eefSBob Badour     res.val[0].m64_f32[0] = *(ptr);
10414*80a68eefSBob Badour     res.val[0].m64_f32[1] = *(ptr + 4);
10415*80a68eefSBob Badour     res.val[1].m64_f32[0] = *(ptr + 1);
10416*80a68eefSBob Badour     res.val[1].m64_f32[1] = *(ptr + 5);
10417*80a68eefSBob Badour     res.val[2].m64_f32[0] = *(ptr + 2);
10418*80a68eefSBob Badour     res.val[2].m64_f32[1] = *(ptr + 6);
10419*80a68eefSBob Badour     res.val[3].m64_f32[0] = *(ptr + 3);
10420*80a68eefSBob Badour     res.val[3].m64_f32[1] = *(ptr + 7);
10421*80a68eefSBob Badour     return res;
10422*80a68eefSBob Badour }
10423*80a68eefSBob Badour 
10424*80a68eefSBob Badour _NEON2SSESTORAGE poly8x8x4_t vld4_p8(__transfersize(32) poly8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
10425*80a68eefSBob Badour #define vld4_p8 vld4_u8
10426*80a68eefSBob Badour 
10427*80a68eefSBob Badour _NEON2SSESTORAGE poly16x4x4_t vld4_p16(__transfersize(16) poly16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
10428*80a68eefSBob Badour #define vld4_p16 vld4_u16
10429*80a68eefSBob Badour 
10430*80a68eefSBob Badour //************* Duplicate (or propagate) ptr[0] to all val[0] lanes and ptr[1] to all val[1] lanes *******************
10431*80a68eefSBob Badour //*******************************************************************************************************************
10432*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8x2_t vld2_dup_u8(__transfersize(2) uint8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
10433*80a68eefSBob Badour _NEON2SSE_INLINE uint8x8x2_t vld2_dup_u8(__transfersize(2) uint8_t const * ptr) // VLD2.8 {d0[], d1[]}, [r0]
10434*80a68eefSBob Badour {
10435*80a68eefSBob Badour     uint8x8x2_t v;
10436*80a68eefSBob Badour     __m128i val0, val1;
10437*80a68eefSBob Badour     val0 = LOAD_SI128(ptr); //0,1,x,x, x,x,x,x,x,x,x,x, x,x,x,x
10438*80a68eefSBob Badour     val1 = _mm_unpacklo_epi8(val0,val0); //0,0,1,1,x,x,x,x, x,x,x,x,x,x,x,x,
10439*80a68eefSBob Badour     val1 = _mm_unpacklo_epi16(val1,val1); //0,0,0,0, 1,1,1,1,x,x,x,x, x,x,x,x
10440*80a68eefSBob Badour     val0 = _mm_unpacklo_epi32(val1,val1); //0,0,0,0, 0,0,0,0,1,1,1,1,1,1,1,1,
10441*80a68eefSBob Badour     vst1q_u8(v.val, val0);
10442*80a68eefSBob Badour     return v;
10443*80a68eefSBob Badour }
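//A minimal usage sketch (not part of the library): vld2_dup_u8 broadcasts two adjacent bytes, each
//into its own vector; note the implementation above issues a full 16-byte load, so ptr should point
//into memory that is safely readable beyond the two bytes. The name "weights" is hypothetical.
//    uint8x8x2_t w = vld2_dup_u8(weights); //w.val[0] = 8 copies of weights[0], w.val[1] = 8 copies of weights[1]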
10444*80a68eefSBob Badour 
10445*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4x2_t vld2_dup_u16(__transfersize(2) uint16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
10446*80a68eefSBob Badour _NEON2SSE_INLINE uint16x4x2_t vld2_dup_u16(__transfersize(2) uint16_t const * ptr) // VLD2.16 {d0[], d1[]}, [r0]
10447*80a68eefSBob Badour {
10448*80a68eefSBob Badour     uint16x4x2_t v;
10449*80a68eefSBob Badour     __m128i val0, val1;
10450*80a68eefSBob Badour     val1 = LOAD_SI128(ptr); //0,1,x,x, x,x,x,x
10451*80a68eefSBob Badour     val0 = _mm_shufflelo_epi16(val1, 0); //00 00 00 00 (all 0)
10452*80a68eefSBob Badour     _M64(v.val[0], val0);
10453*80a68eefSBob Badour     val1 = _mm_shufflelo_epi16(val1, 85); //01 01 01 01 (all 1)
10454*80a68eefSBob Badour     _M64(v.val[1], val1);
10455*80a68eefSBob Badour     return v;
10456*80a68eefSBob Badour }
10457*80a68eefSBob Badour 
10458*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2x2_t vld2_dup_u32(__transfersize(2) uint32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
10459*80a68eefSBob Badour _NEON2SSE_INLINE uint32x2x2_t vld2_dup_u32(__transfersize(2) uint32_t const * ptr) // VLD2.32 {d0[], d1[]}, [r0]
10460*80a68eefSBob Badour {
10461*80a68eefSBob Badour     uint32x2x2_t v;
10462*80a68eefSBob Badour     __m128i val0;
10463*80a68eefSBob Badour     val0 = LOAD_SI128(ptr); //0,1,x,x
10464*80a68eefSBob Badour     val0 = _mm_shuffle_epi32(val0,   0 | (0 << 2) | (1 << 4) | (1 << 6)); //0,0,1,1
10465*80a68eefSBob Badour     vst1q_u32(v.val, val0);
10466*80a68eefSBob Badour     return v;
10467*80a68eefSBob Badour }
10468*80a68eefSBob Badour 
10469*80a68eefSBob Badour _NEON2SSESTORAGE uint64x1x2_t vld2_dup_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
10470*80a68eefSBob Badour #define vld2_dup_u64 vld2_u64
10471*80a68eefSBob Badour 
10472*80a68eefSBob Badour _NEON2SSESTORAGE int8x8x2_t vld2_dup_s8(__transfersize(2) int8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
10473*80a68eefSBob Badour #define vld2_dup_s8(ptr) vld2_dup_u8((uint8_t*)ptr)
10474*80a68eefSBob Badour 
10475*80a68eefSBob Badour _NEON2SSESTORAGE int16x4x2_t vld2_dup_s16(__transfersize(2) int16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
10476*80a68eefSBob Badour #define vld2_dup_s16(ptr) vld2_dup_u16((uint16_t*)ptr)
10477*80a68eefSBob Badour 
10478*80a68eefSBob Badour _NEON2SSESTORAGE int32x2x2_t vld2_dup_s32(__transfersize(2) int32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
10479*80a68eefSBob Badour #define vld2_dup_s32(ptr) vld2_dup_u32((uint32_t*)ptr)
10480*80a68eefSBob Badour 
10481*80a68eefSBob Badour _NEON2SSESTORAGE int64x1x2_t vld2_dup_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
10482*80a68eefSBob Badour #define vld2_dup_s64(ptr) vld2_dup_u64((uint64_t*)ptr)
10483*80a68eefSBob Badour 
10484*80a68eefSBob Badour _NEON2SSESTORAGE float16x4x2_t vld2_dup_f16(__transfersize(2) __fp16 const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
10485*80a68eefSBob Badour // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
10486*80a68eefSBob Badour 
10487*80a68eefSBob Badour _NEON2SSESTORAGE float32x2x2_t vld2_dup_f32(__transfersize(2) float32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
10488*80a68eefSBob Badour _NEON2SSE_INLINE float32x2x2_t vld2_dup_f32(__transfersize(2) float32_t const * ptr) // VLD2.32 {d0[], d1[]}, [r0]
10489*80a68eefSBob Badour {
10490*80a68eefSBob Badour     float32x2x2_t v;
10491*80a68eefSBob Badour     v.val[0].m64_f32[0] = *(ptr); //0,0
10492*80a68eefSBob Badour     v.val[0].m64_f32[1] = *(ptr); //0,0
10493*80a68eefSBob Badour     v.val[1].m64_f32[0] = *(ptr + 1); //1,1
10494*80a68eefSBob Badour     v.val[1].m64_f32[1] = *(ptr + 1); //1,1
10495*80a68eefSBob Badour     return v;
10496*80a68eefSBob Badour }
10497*80a68eefSBob Badour 
10498*80a68eefSBob Badour _NEON2SSESTORAGE poly8x8x2_t vld2_dup_p8(__transfersize(2) poly8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
10499*80a68eefSBob Badour #define vld2_dup_p8 vld2_dup_u8
10500*80a68eefSBob Badour 
10501*80a68eefSBob Badour _NEON2SSESTORAGE poly16x4x2_t vld2_dup_p16(__transfersize(2) poly16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
10502*80a68eefSBob Badour #define vld2_dup_p16 vld2_dup_s16
10503*80a68eefSBob Badour 
10504*80a68eefSBob Badour //************* Duplicate (or propagate) triplets: *******************
10505*80a68eefSBob Badour //********************************************************************
10506*80a68eefSBob Badour //ptr[0] to all val[0] lanes, ptr[1] to all val[1] lanes and ptr[2] to all val[2] lanes
10507*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8x3_t vld3_dup_u8(__transfersize(3) uint8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
10508*80a68eefSBob Badour _NEON2SSE_INLINE uint8x8x3_t vld3_dup_u8(__transfersize(3) uint8_t const * ptr) // VLD3.8 {d0[], d1[], d2[]}, [r0]
10509*80a68eefSBob Badour {
10510*80a68eefSBob Badour     uint8x8x3_t v;
10511*80a68eefSBob Badour     __m128i val0, val1, val2;
10512*80a68eefSBob Badour     val0 = LOAD_SI128(ptr); //0,1,2,x, x,x,x,x,x,x,x,x, x,x,x,x
10513*80a68eefSBob Badour     val1 = _mm_unpacklo_epi8(val0,val0); //0,0,1,1,2,2,x,x, x,x,x,x,x,x,x,x,
10514*80a68eefSBob Badour     val1 = _mm_unpacklo_epi16(val1,val1); //0,0,0,0, 1,1,1,1,2,2,2,2,x,x,x,x,
10515*80a68eefSBob Badour     val0 = _mm_unpacklo_epi32(val1,val1); //0,0,0,0, 0,0,0,0,1,1,1,1,1,1,1,1,
10516*80a68eefSBob Badour     val2 = _mm_unpackhi_epi32(val1,val1); // 2,2,2,2,2,2,2,2, x,x,x,x,x,x,x,x,
10517*80a68eefSBob Badour     vst1q_u8(v.val, val0);
10518*80a68eefSBob Badour     _M64(v.val[2], val2);
10519*80a68eefSBob Badour     return v;
10520*80a68eefSBob Badour }
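//A minimal usage sketch (not part of the library): broadcasting a packed RGB constant so that each
//channel fills its own vector; as above, the 128-bit load reads past the three meaningful bytes,
//so "color" (a hypothetical name) should sit in safely readable memory.
//    uint8x8x3_t c = vld3_dup_u8(color); //c.val[0]=R x8, c.val[1]=G x8, c.val[2]=B x8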
10521*80a68eefSBob Badour 
10522*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4x3_t vld3_dup_u16(__transfersize(3) uint16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
10523*80a68eefSBob Badour _NEON2SSE_INLINE uint16x4x3_t vld3_dup_u16(__transfersize(3) uint16_t const * ptr) // VLD3.16 {d0[], d1[], d2[]}, [r0]
10524*80a68eefSBob Badour {
10525*80a68eefSBob Badour     uint16x4x3_t v;
10526*80a68eefSBob Badour     __m128i val0, val1, val2;
10527*80a68eefSBob Badour     val2 = LOAD_SI128(ptr); //0,1,2,x, x,x,x,x
10528*80a68eefSBob Badour     val0 = _mm_shufflelo_epi16(val2, 0); //00 00 00 00 (all 0)
10529*80a68eefSBob Badour     val1 = _mm_shufflelo_epi16(val2, 85); //01 01 01 01 (all 1)
10530*80a68eefSBob Badour     val2 = _mm_shufflelo_epi16(val2, 170); //10 10 10 10 (all 2)
10531*80a68eefSBob Badour     _M64(v.val[0], val0);
10532*80a68eefSBob Badour     _M64(v.val[1], val1);
10533*80a68eefSBob Badour     _M64(v.val[2], val2);
10534*80a68eefSBob Badour     return v;
10535*80a68eefSBob Badour }
10536*80a68eefSBob Badour 
10537*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2x3_t vld3_dup_u32(__transfersize(3) uint32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
10538*80a68eefSBob Badour _NEON2SSE_INLINE uint32x2x3_t vld3_dup_u32(__transfersize(3) uint32_t const * ptr) // VLD3.32 {d0[], d1[], d2[]}, [r0]
10539*80a68eefSBob Badour {
10540*80a68eefSBob Badour     uint32x2x3_t v;
10541*80a68eefSBob Badour     __m128i val0, val1, val2;
10542*80a68eefSBob Badour     val2 = LOAD_SI128(ptr); //0,1,2,x
10543*80a68eefSBob Badour     val0 = _mm_shuffle_epi32(val2,   0 | (0 << 2) | (2 << 4) | (2 << 6)); //0,0,2,2
10544*80a68eefSBob Badour     val1 = _mm_shuffle_epi32(val2,   1 | (1 << 2) | (2 << 4) | (2 << 6)); //1,1,2,2
10545*80a68eefSBob Badour     val2 = _mm_srli_si128(val0, 8); //2,2,0x0,0x0
10546*80a68eefSBob Badour     _M64(v.val[0], val0);
10547*80a68eefSBob Badour     _M64(v.val[1], val1);
10548*80a68eefSBob Badour     _M64(v.val[2], val2);
10549*80a68eefSBob Badour     return v;
10550*80a68eefSBob Badour }
10551*80a68eefSBob Badour 
10552*80a68eefSBob Badour _NEON2SSESTORAGE uint64x1x3_t vld3_dup_u64(__transfersize(3) uint64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
10553*80a68eefSBob Badour _NEON2SSE_INLINE uint64x1x3_t vld3_dup_u64(__transfersize(3) uint64_t const * ptr) // VLD1.64 {d0, d1, d2}, [r0]
10554*80a68eefSBob Badour {
10555*80a68eefSBob Badour     uint64x1x3_t v;
10556*80a68eefSBob Badour     v.val[0].m64_u64[0] = *(ptr);
10557*80a68eefSBob Badour     v.val[1].m64_u64[0] = *(ptr + 1);
10558*80a68eefSBob Badour     v.val[2].m64_u64[0] = *(ptr + 2);
10559*80a68eefSBob Badour     return v;
10560*80a68eefSBob Badour }
10561*80a68eefSBob Badour 
10562*80a68eefSBob Badour _NEON2SSESTORAGE int8x8x3_t vld3_dup_s8(__transfersize(3) int8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
10563*80a68eefSBob Badour #define vld3_dup_s8(ptr) vld3_dup_u8((uint8_t*)ptr)
10564*80a68eefSBob Badour 
10565*80a68eefSBob Badour _NEON2SSESTORAGE int16x4x3_t vld3_dup_s16(__transfersize(3) int16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
10566*80a68eefSBob Badour #define vld3_dup_s16(ptr) vld3_dup_u16((uint16_t*)ptr)
10567*80a68eefSBob Badour 
10568*80a68eefSBob Badour _NEON2SSESTORAGE int32x2x3_t vld3_dup_s32(__transfersize(3) int32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
10569*80a68eefSBob Badour #define vld3_dup_s32(ptr) vld3_dup_u32((uint32_t*)ptr)
10570*80a68eefSBob Badour 
10571*80a68eefSBob Badour _NEON2SSESTORAGE int64x1x3_t vld3_dup_s64(__transfersize(3) int64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
10572*80a68eefSBob Badour #define vld3_dup_s64(ptr) vld3_dup_u64((uint64_t*)ptr)
10573*80a68eefSBob Badour 
10574*80a68eefSBob Badour 
10575*80a68eefSBob Badour _NEON2SSESTORAGE float16x4x3_t vld3_dup_f16(__transfersize(3) __fp16 const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
10576*80a68eefSBob Badour // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
10577*80a68eefSBob Badour 
10578*80a68eefSBob Badour _NEON2SSESTORAGE float32x2x3_t vld3_dup_f32(__transfersize(3) float32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
10579*80a68eefSBob Badour _NEON2SSE_INLINE float32x2x3_t vld3_dup_f32(__transfersize(3) float32_t const * ptr) // VLD3.32 {d0[], d1[], d2[]}, [r0]
10580*80a68eefSBob Badour {
10581*80a68eefSBob Badour     float32x2x3_t v;
10582*80a68eefSBob Badour     int i;
10583*80a68eefSBob Badour     for (i = 0; i<3; i++) {
10584*80a68eefSBob Badour         v.val[i].m64_f32[0] = *(ptr + i);
10585*80a68eefSBob Badour         v.val[i].m64_f32[1] = *(ptr + i);
10586*80a68eefSBob Badour     }
10587*80a68eefSBob Badour     return v;
10588*80a68eefSBob Badour }
10589*80a68eefSBob Badour 
10590*80a68eefSBob Badour _NEON2SSESTORAGE poly8x8x3_t vld3_dup_p8(__transfersize(3) poly8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
10591*80a68eefSBob Badour #define vld3_dup_p8 vld3_dup_u8
10592*80a68eefSBob Badour 
10593*80a68eefSBob Badour _NEON2SSESTORAGE poly16x4x3_t vld3_dup_p16(__transfersize(3) poly16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
10594*80a68eefSBob Badour #define vld3_dup_p16 vld3_dup_s16
10595*80a68eefSBob Badour 
10596*80a68eefSBob Badour 
10597*80a68eefSBob Badour //************* Duplicate (or propagate) quadruples: *******************
10598*80a68eefSBob Badour //***********************************************************************
10599*80a68eefSBob Badour //ptr[0] to all val[0] lanes, ptr[1] to all val[1] lanes, ptr[2] to all val[2] lanes  and  ptr[3] to all val[3] lanes
10600*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8x4_t vld4_dup_u8(__transfersize(4) uint8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
10601*80a68eefSBob Badour _NEON2SSE_INLINE uint8x8x4_t vld4_dup_u8(__transfersize(4) uint8_t const * ptr) // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
10602*80a68eefSBob Badour {
10603*80a68eefSBob Badour     uint8x8x4_t v;
10604*80a68eefSBob Badour     __m128i val0, val1, val2;
10605*80a68eefSBob Badour     val0 = LOAD_SI128(ptr); //0,1,2,3, x,x,x,x,x,x,x,x, x,x,x,x
10606*80a68eefSBob Badour     val1 = _mm_unpacklo_epi8(val0,val0); //0,0,1,1,2,2,3,3, x,x,x,x,x,x,x,x,
10607*80a68eefSBob Badour     val1 = _mm_unpacklo_epi16(val1,val1); //0,0,0,0, 1,1,1,1,2,2,2,2,3,3,3,3
10608*80a68eefSBob Badour     val0 = _mm_unpacklo_epi32(val1,val1); //0,0,0,0, 0,0,0,0,1,1,1,1,1,1,1,1,
10609*80a68eefSBob Badour     val2 = _mm_unpackhi_epi32(val1,val1); // 2,2,2,2,2,2,2,2, 3,3,3,3, 3,3,3,3
10610*80a68eefSBob Badour     vst1q_u8(&v.val[0], val0);
10611*80a68eefSBob Badour     vst1q_u8(&v.val[2], val2);
10612*80a68eefSBob Badour     return v;
10613*80a68eefSBob Badour }
10614*80a68eefSBob Badour 
10615*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4x4_t vld4_dup_u16(__transfersize(4) uint16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
10616*80a68eefSBob Badour _NEON2SSE_INLINE uint16x4x4_t vld4_dup_u16(__transfersize(4) uint16_t const * ptr) // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
10617*80a68eefSBob Badour {
10618*80a68eefSBob Badour     uint16x4x4_t v;
10619*80a68eefSBob Badour     __m128i val0, val1, val2, val3;
10620*80a68eefSBob Badour     val3 = LOAD_SI128(ptr); //0,1,2,3, x,x,x,x
10621*80a68eefSBob Badour     val0 = _mm_shufflelo_epi16(val3, 0); //00 00 00 00 (all 0)
10622*80a68eefSBob Badour     val1 = _mm_shufflelo_epi16(val3, 85); //01 01 01 01 (all 1)
10623*80a68eefSBob Badour     val2 = _mm_shufflelo_epi16(val3, 170); //10 10 10 10 (all 2)
10624*80a68eefSBob Badour     val3 = _mm_shufflelo_epi16(val3, 255); //11 11 11 11 (all 3)
10625*80a68eefSBob Badour     _M64(v.val[0], val0);
10626*80a68eefSBob Badour     _M64(v.val[1], val1);
10627*80a68eefSBob Badour     _M64(v.val[2], val2);
10628*80a68eefSBob Badour     _M64(v.val[3], val3);
10629*80a68eefSBob Badour     return v;
10630*80a68eefSBob Badour }
10631*80a68eefSBob Badour 
10632*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2x4_t vld4_dup_u32(__transfersize(4) uint32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
10633*80a68eefSBob Badour _NEON2SSE_INLINE uint32x2x4_t vld4_dup_u32(__transfersize(4) uint32_t const * ptr) // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
10634*80a68eefSBob Badour {
10635*80a68eefSBob Badour     uint32x2x4_t v;
10636*80a68eefSBob Badour     __m128i val0, val1, val2, val3;
10637*80a68eefSBob Badour     val3 = LOAD_SI128(ptr); //0,1,2,3
10638*80a68eefSBob Badour     val0 = _mm_shuffle_epi32(val3,   0 | (0 << 2) | (2 << 4) | (3 << 6)); //0,0,2,3
10639*80a68eefSBob Badour     val1 = _mm_shuffle_epi32(val3,   1 | (1 << 2) | (2 << 4) | (3 << 6)); //1,1,2,3
10640*80a68eefSBob Badour     val2 = _mm_shuffle_epi32(val3,   2 | (2 << 2) | (3 << 4) | (3 << 6)); //2,2,3,3
10641*80a68eefSBob Badour     val3 = _mm_shuffle_epi32(val3,   3 | (3 << 2) | (3 << 4) | (3 << 6)); //3,3,3,3
10642*80a68eefSBob Badour     _M64(v.val[0], val0);
10643*80a68eefSBob Badour     _M64(v.val[1], val1);
10644*80a68eefSBob Badour     _M64(v.val[2], val2);
10645*80a68eefSBob Badour     _M64(v.val[3], val3);
10646*80a68eefSBob Badour     return v;
10647*80a68eefSBob Badour }
10648*80a68eefSBob Badour 
10649*80a68eefSBob Badour _NEON2SSESTORAGE uint64x1x4_t vld4_dup_u64(__transfersize(4) uint64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
10650*80a68eefSBob Badour _NEON2SSE_INLINE uint64x1x4_t vld4_dup_u64(__transfersize(4) uint64_t const * ptr) // VLD1.64 {d0, d1, d2, d3}, [r0]
10651*80a68eefSBob Badour {
10652*80a68eefSBob Badour     uint64x1x4_t v;
10653*80a68eefSBob Badour     v.val[0].m64_u64[0] = *(ptr);
10654*80a68eefSBob Badour     v.val[1].m64_u64[0] = *(ptr + 1);
10655*80a68eefSBob Badour     v.val[2].m64_u64[0] = *(ptr + 2);
10656*80a68eefSBob Badour     v.val[3].m64_u64[0] = *(ptr + 3);
10657*80a68eefSBob Badour     return v;
10658*80a68eefSBob Badour }
10659*80a68eefSBob Badour 
10660*80a68eefSBob Badour _NEON2SSESTORAGE int8x8x4_t vld4_dup_s8(__transfersize(4) int8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
10661*80a68eefSBob Badour #define vld4_dup_s8(ptr) vld4_dup_u8((uint8_t*)ptr)
10662*80a68eefSBob Badour 
10663*80a68eefSBob Badour _NEON2SSESTORAGE int16x4x4_t vld4_dup_s16(__transfersize(4) int16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
10664*80a68eefSBob Badour #define vld4_dup_s16(ptr) vld4_dup_u16((uint16_t*)ptr)
10665*80a68eefSBob Badour 
10666*80a68eefSBob Badour _NEON2SSESTORAGE int32x2x4_t vld4_dup_s32(__transfersize(4) int32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
10667*80a68eefSBob Badour #define vld4_dup_s32(ptr) vld4_dup_u32((uint32_t*)ptr)
10668*80a68eefSBob Badour 
10669*80a68eefSBob Badour _NEON2SSESTORAGE int64x1x4_t vld4_dup_s64(__transfersize(4) int64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
10670*80a68eefSBob Badour #define vld4_dup_s64(ptr) vld4_dup_u64((uint64_t*)ptr)
10671*80a68eefSBob Badour 
10672*80a68eefSBob Badour _NEON2SSESTORAGE float16x4x4_t vld4_dup_f16(__transfersize(4) __fp16 const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
10673*80a68eefSBob Badour // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
10674*80a68eefSBob Badour 
10675*80a68eefSBob Badour _NEON2SSESTORAGE float32x2x4_t vld4_dup_f32(__transfersize(4) float32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
10676*80a68eefSBob Badour _NEON2SSE_INLINE float32x2x4_t vld4_dup_f32(__transfersize(4) float32_t const * ptr) // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
10677*80a68eefSBob Badour {
10678*80a68eefSBob Badour     float32x2x4_t v;
10679*80a68eefSBob Badour     int i;
10680*80a68eefSBob Badour     for (i = 0; i<4; i++) {
10681*80a68eefSBob Badour         v.val[i].m64_f32[0] = *(ptr + i);
10682*80a68eefSBob Badour         v.val[i].m64_f32[1] = *(ptr + i);
10683*80a68eefSBob Badour     }
10684*80a68eefSBob Badour     return v;
10685*80a68eefSBob Badour }
10686*80a68eefSBob Badour 
10687*80a68eefSBob Badour _NEON2SSESTORAGE poly8x8x4_t vld4_dup_p8(__transfersize(4) poly8_t const  * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
10688*80a68eefSBob Badour #define vld4_dup_p8 vld4_dup_u8
10689*80a68eefSBob Badour 
10690*80a68eefSBob Badour _NEON2SSESTORAGE poly16x4x4_t vld4_dup_p16(__transfersize(4) poly16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
10691*80a68eefSBob Badour #define vld4_dup_p16 vld4_dup_u16
10692*80a68eefSBob Badour 
10693*80a68eefSBob Badour 
10694*80a68eefSBob Badour //**********************************************************************************
10695*80a68eefSBob Badour //*******************Lane loads for  an N-element structures ***********************
10696*80a68eefSBob Badour //**********************************************************************************
10697*80a68eefSBob Badour //********************** Lane pairs  ************************************************
10698*80a68eefSBob Badour //does vld1_lane_xx: loads ptr[0] into src->val[0] and ptr[1] into src->val[1] at the given lane position
10699*80a68eefSBob Badour //we assume src is 16-byte aligned
10700*80a68eefSBob Badour 
10701*80a68eefSBob Badour //!!!!!! The Microsoft compiler does not allow xxxxxx_2t function arguments, producing the "formal parameter with __declspec(align('16')) won't be aligned" error.
10702*80a68eefSBob Badour //To fix it, all the functions below take xxxxxx_2t pointers, and the corresponding original functions are redefined as macros that pass the address.
10703*80a68eefSBob Badour 
10704*80a68eefSBob Badour //uint16x8x2_t vld2q_lane_u16(__transfersize(2) uint16_t const * ptr, uint16x8x2_t src,__constrange(0,7) int lane);// VLD2.16 {d0[0], d2[0]}, [r0]
10705*80a68eefSBob Badour _NEON2SSE_INLINE uint16x8x2_t vld2q_lane_u16_ptr(__transfersize(2) uint16_t const * ptr, uint16x8x2_t* src,__constrange(0,7) int lane) // VLD2.16 {d0[0], d2[0]}, [r0]
10706*80a68eefSBob Badour {
10707*80a68eefSBob Badour     uint16x8x2_t v;
10708*80a68eefSBob Badour     v.val[0] = vld1q_lane_s16 (ptr, src->val[0],  lane);
10709*80a68eefSBob Badour     v.val[1] = vld1q_lane_s16 ((ptr + 1), src->val[1],  lane);
10710*80a68eefSBob Badour     return v;
10711*80a68eefSBob Badour }
10712*80a68eefSBob Badour #define vld2q_lane_u16(ptr, src, lane) vld2q_lane_u16_ptr(ptr, &src, lane)
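//A minimal usage sketch (not part of the library): the macro keeps the NEON by-value calling
//convention while the helper above works through a pointer; "buf" and "extra" are hypothetical.
//    uint16x8x2_t acc = vld2q_u16(buf);    //buf: at least 16 uint16_t elements
//    acc = vld2q_lane_u16(extra, acc, 7);  //expands to vld2q_lane_u16_ptr(extra, &acc, 7):
//                                          //extra[0] -> acc.val[0] lane 7, extra[1] -> acc.val[1] lane 7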
10713*80a68eefSBob Badour 
10714*80a68eefSBob Badour //uint32x4x2_t vld2q_lane_u32(__transfersize(2) uint32_t const * ptr, uint32x4x2_t src,__constrange(0,3) int lane);// VLD2.32 {d0[0], d2[0]}, [r0]
10715*80a68eefSBob Badour _NEON2SSE_INLINE uint32x4x2_t vld2q_lane_u32_ptr(__transfersize(2) uint32_t const * ptr, uint32x4x2_t* src,__constrange(0,3) int lane) // VLD2.32 {d0[0], d2[0]}, [r0]
10716*80a68eefSBob Badour {
10717*80a68eefSBob Badour     uint32x4x2_t v;
10718*80a68eefSBob Badour     v.val[0] = _MM_INSERT_EPI32 (src->val[0],  ptr[0], lane);
10719*80a68eefSBob Badour     v.val[1] = _MM_INSERT_EPI32 (src->val[1],  ptr[1], lane);
10720*80a68eefSBob Badour     return v;
10721*80a68eefSBob Badour }
10722*80a68eefSBob Badour #define vld2q_lane_u32(ptr, src, lane) vld2q_lane_u32_ptr(ptr, &src, lane)
10723*80a68eefSBob Badour 
10724*80a68eefSBob Badour //int16x8x2_t vld2q_lane_s16(__transfersize(2) int16_t const * ptr, int16x8x2_t src, __constrange(0,7)int lane);// VLD2.16 {d0[0], d2[0]}, [r0]
10725*80a68eefSBob Badour _NEON2SSE_INLINE int16x8x2_t vld2q_lane_s16_ptr(__transfersize(2) int16_t const * ptr, int16x8x2_t* src, __constrange(0,7) int lane)
10726*80a68eefSBob Badour {
10727*80a68eefSBob Badour     int16x8x2_t v;
10728*80a68eefSBob Badour     v.val[0] = vld1q_lane_s16 (ptr, src->val[0],  lane);
10729*80a68eefSBob Badour     v.val[1] = vld1q_lane_s16 ((ptr + 1), src->val[1],  lane);
10730*80a68eefSBob Badour     return v;
10731*80a68eefSBob Badour }
10732*80a68eefSBob Badour #define vld2q_lane_s16(ptr, src, lane) vld2q_lane_s16_ptr(ptr, &src, lane)
10733*80a68eefSBob Badour 
10734*80a68eefSBob Badour //int32x4x2_t vld2q_lane_s32(__transfersize(2) int32_t const * ptr, int32x4x2_t src, __constrange(0,3)int lane);// VLD2.32 {d0[0], d2[0]}, [r0]
10735*80a68eefSBob Badour _NEON2SSE_INLINE int32x4x2_t vld2q_lane_s32_ptr(__transfersize(2) int32_t const * ptr, int32x4x2_t* src, __constrange(0,3) int lane)
10736*80a68eefSBob Badour {
10737*80a68eefSBob Badour     int32x4x2_t v;
10738*80a68eefSBob Badour     v.val[0] = _MM_INSERT_EPI32 (src->val[0],  ptr[0], lane);
10739*80a68eefSBob Badour     v.val[1] = _MM_INSERT_EPI32 (src->val[1],  ptr[1], lane);
10740*80a68eefSBob Badour     return v;
10741*80a68eefSBob Badour }
10742*80a68eefSBob Badour #define vld2q_lane_s32(ptr, src, lane) vld2q_lane_s32_ptr(ptr, &src, lane)
10743*80a68eefSBob Badour 
10744*80a68eefSBob Badour //float16x8x2_t vld2q_lane_f16(__transfersize(2) __fp16 const * ptr, float16x8x2_t src, __constrange(0,7)int lane);// VLD2.16 {d0[0], d2[0]}, [r0]
10745*80a68eefSBob Badour //current IA SIMD doesn't support float16
10746*80a68eefSBob Badour 
10747*80a68eefSBob Badour //float32x4x2_t vld2q_lane_f32(__transfersize(2) float32_t const * ptr, float32x4x2_t src,__constrange(0,3) int lane);// VLD2.32 {d0[0], d2[0]}, [r0]
10748*80a68eefSBob Badour _NEON2SSE_INLINE float32x4x2_t vld2q_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x4x2_t* src,__constrange(0,3) int lane) // VLD2.32 {d0[0], d2[0]}, [r0]
10749*80a68eefSBob Badour {
10750*80a68eefSBob Badour     float32x4x2_t v;
10751*80a68eefSBob Badour     v.val[0] = vld1q_lane_f32(ptr, src->val[0], lane);
10752*80a68eefSBob Badour     v.val[1] = vld1q_lane_f32((ptr + 1), src->val[1], lane);
10753*80a68eefSBob Badour     return v;
10754*80a68eefSBob Badour }
10755*80a68eefSBob Badour #define vld2q_lane_f32(ptr,src,lane) vld2q_lane_f32_ptr(ptr,&src,lane)
10756*80a68eefSBob Badour 
10757*80a68eefSBob Badour //poly16x8x2_t vld2q_lane_p16(__transfersize(2) poly16_t const * ptr, poly16x8x2_t src,__constrange(0,7) int lane);// VLD2.16 {d0[0], d2[0]}, [r0]
10758*80a68eefSBob Badour #define vld2q_lane_p16 vld2q_lane_u16
10759*80a68eefSBob Badour 
10760*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8x2_t vld2_lane_u8(__transfersize(2) uint8_t const * ptr, uint8x8x2_t src, __constrange(0,7) int lane);// VLD2.8 {d0[0], d1[0]}, [r0]
10761*80a68eefSBob Badour _NEON2SSE_INLINE uint8x8x2_t vld2_lane_u8(__transfersize(2) uint8_t const * ptr, uint8x8x2_t src, __constrange(0,7) int lane) // VLD2.8 {d0[0], d1[0]}, [r0]
10762*80a68eefSBob Badour {
10763*80a68eefSBob Badour     uint8x8x2_t v;
10764*80a68eefSBob Badour     v.val[0] = vld1_lane_u8(ptr, src.val[0], lane);
10765*80a68eefSBob Badour     v.val[1] = vld1_lane_u8((ptr + 1), src.val[1], lane);
10766*80a68eefSBob Badour     return v;
10767*80a68eefSBob Badour }
10768*80a68eefSBob Badour 
10769*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4x2_t vld2_lane_u16(__transfersize(2) uint16_t const * ptr, uint16x4x2_t src, __constrange(0,3)int lane);// VLD2.16 {d0[0], d1[0]}, [r0]
10770*80a68eefSBob Badour _NEON2SSE_INLINE uint16x4x2_t vld2_lane_u16(__transfersize(2) uint16_t const * ptr, uint16x4x2_t src, __constrange(0,3) int lane)
10771*80a68eefSBob Badour {
10772*80a68eefSBob Badour     uint16x4x2_t v;
10773*80a68eefSBob Badour     v.val[0]  =  vld1_lane_u16(ptr, src.val[0], lane);
10774*80a68eefSBob Badour     v.val[1]  = vld1_lane_u16((ptr + 1), src.val[1], lane);
10775*80a68eefSBob Badour     return v;
10776*80a68eefSBob Badour }
10777*80a68eefSBob Badour 
10778*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2x2_t vld2_lane_u32(__transfersize(2) uint32_t const * ptr, uint32x2x2_t src, __constrange(0,1)int lane);// VLD2.32 {d0[0], d1[0]}, [r0]
10779*80a68eefSBob Badour _NEON2SSE_INLINE uint32x2x2_t vld2_lane_u32(__transfersize(2) uint32_t const * ptr, uint32x2x2_t src, __constrange(0,1) int lane)
10780*80a68eefSBob Badour {
10781*80a68eefSBob Badour     uint32x2x2_t v;
10782*80a68eefSBob Badour     v.val[0]  =  vld1_lane_u32(ptr, src.val[0], lane);
10783*80a68eefSBob Badour     v.val[1]  = vld1_lane_u32((ptr + 1), src.val[1], lane);
10784*80a68eefSBob Badour     return v;
10785*80a68eefSBob Badour }
10786*80a68eefSBob Badour 
10787*80a68eefSBob Badour _NEON2SSESTORAGE int8x8x2_t vld2_lane_s8(__transfersize(2) int8_t const * ptr, int8x8x2_t src, __constrange(0,7) int lane);// VLD2.8 {d0[0], d1[0]}, [r0]
10788*80a68eefSBob Badour #define vld2_lane_s8(ptr, src, lane)  vld2_lane_u8(( uint8_t*) ptr, src, lane)
10789*80a68eefSBob Badour 
10790*80a68eefSBob Badour _NEON2SSESTORAGE int16x4x2_t vld2_lane_s16(__transfersize(2) int16_t const * ptr, int16x4x2_t src, __constrange(0,3) int lane);// VLD2.16 {d0[0], d1[0]}, [r0]
10791*80a68eefSBob Badour #define vld2_lane_s16(ptr, src, lane) vld2_lane_u16(( uint16_t*) ptr, src, lane)
10792*80a68eefSBob Badour 
10793*80a68eefSBob Badour _NEON2SSESTORAGE int32x2x2_t vld2_lane_s32(__transfersize(2) int32_t const * ptr, int32x2x2_t src, __constrange(0,1) int lane);// VLD2.32 {d0[0], d1[0]}, [r0]
10794*80a68eefSBob Badour #define vld2_lane_s32(ptr, src, lane) vld2_lane_u32(( uint32_t*) ptr, src, lane)
10795*80a68eefSBob Badour 
10796*80a68eefSBob Badour //float16x4x2_t vld2_lane_f16(__transfersize(2) __fp16 const * ptr, float16x4x2_t src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
10797*80a68eefSBob Badour //current IA SIMD doesn't support float16
10798*80a68eefSBob Badour 
10799*80a68eefSBob Badour _NEON2SSESTORAGE float32x2x2_t vld2_lane_f32(__transfersize(2) float32_t const * ptr, float32x2x2_t src,__constrange(0,1) int lane); // VLD2.32 {d0[0], d1[0]}, [r0]
10800*80a68eefSBob Badour _NEON2SSE_INLINE float32x2x2_t vld2_lane_f32(__transfersize(2) float32_t const * ptr, float32x2x2_t  src,__constrange(0,1) int lane)
10801*80a68eefSBob Badour {
10802*80a68eefSBob Badour     float32x2x2_t v;
10803*80a68eefSBob Badour     v.val[0] = vld1_lane_f32(ptr, src.val[0], lane);
10804*80a68eefSBob Badour     v.val[1] = vld1_lane_f32((ptr + 1), src.val[1], lane);
10805*80a68eefSBob Badour     return v;
10806*80a68eefSBob Badour }
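//A minimal usage sketch (not part of the library): refreshing one (re,im) pair in already split
//real/imaginary halves; the name "samples" is hypothetical.
//    float32x2x2_t ri = vld2_f32(samples);   //ri.val[0]={re0,re1}, ri.val[1]={im0,im1}
//    ri = vld2_lane_f32(samples + 4, ri, 1); //overwrite lane 1 with the third (re,im) pair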
10807*80a68eefSBob Badour 
10808*80a68eefSBob Badour //poly8x8x2_t vld2_lane_p8(__transfersize(2) poly8_t const * ptr, poly8x8x2_t src, __constrange(0,7) int lane);// VLD2.8 {d0[0], d1[0]}, [r0]
10809*80a68eefSBob Badour _NEON2SSESTORAGE poly8x8x2_t vld2_lane_p8_ptr(__transfersize(2) poly8_t const * ptr, poly8x8x2_t * src, __constrange(0,7) int lane); // VLD2.8 {d0[0], d1[0]}, [r0]
10810*80a68eefSBob Badour #define vld2_lane_p8 vld2_lane_u8
10811*80a68eefSBob Badour 
10812*80a68eefSBob Badour //poly16x4x2_t vld2_lane_p16(__transfersize(2) poly16_t const * ptr, poly16x4x2_t src, __constrange(0,3)int lane);// VLD2.16 {d0[0], d1[0]}, [r0]
10813*80a68eefSBob Badour _NEON2SSESTORAGE poly16x4x2_t vld2_lane_p16_ptr(__transfersize(2) poly16_t const * ptr, poly16x4x2_t * src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
10814*80a68eefSBob Badour #define vld2_lane_p16 vld2_lane_u16
10815*80a68eefSBob Badour 
10816*80a68eefSBob Badour //*********** Lane triplets **********************
10817*80a68eefSBob Badour //*************************************************
10818*80a68eefSBob Badour //performs vld1_lane_xx: loads ptr[0] into src->val[0], ptr[1] into src->val[1] and ptr[2] into src->val[2] at the given lane position
10819*80a68eefSBob Badour //we assume src is 16-byte aligned; see the illustrative usage sketch after vld3q_lane_u32 below
10820*80a68eefSBob Badour 
10821*80a68eefSBob Badour //uint16x8x3_t vld3q_lane_u16(__transfersize(3) uint16_t const * ptr, uint16x8x3_t src,__constrange(0,7) int lane);// VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
10822*80a68eefSBob Badour _NEON2SSE_INLINE uint16x8x3_t vld3q_lane_u16_ptr(__transfersize(3) uint16_t const * ptr, uint16x8x3_t* src,__constrange(0,7) int lane) // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
10823*80a68eefSBob Badour {
10824*80a68eefSBob Badour     uint16x8x3_t v;
10825*80a68eefSBob Badour     v.val[0] = _MM_INSERT_EPI16 ( src->val[0],  ptr[0], lane);
10826*80a68eefSBob Badour     v.val[1] = _MM_INSERT_EPI16 ( src->val[1],  ptr[1], lane);
10827*80a68eefSBob Badour     v.val[2] = _MM_INSERT_EPI16 ( src->val[2],  ptr[2], lane);
10828*80a68eefSBob Badour     return v;
10829*80a68eefSBob Badour }
10830*80a68eefSBob Badour #define vld3q_lane_u16(ptr, src, lane) vld3q_lane_u16_ptr(ptr, &src, lane)
10831*80a68eefSBob Badour 
10832*80a68eefSBob Badour //uint32x4x3_t vld3q_lane_u32(__transfersize(3) uint32_t const * ptr, uint32x4x3_t src,__constrange(0,3) int lane);// VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
10833*80a68eefSBob Badour _NEON2SSE_INLINE uint32x4x3_t vld3q_lane_u32_ptr(__transfersize(3) uint32_t const * ptr, uint32x4x3_t* src,__constrange(0,3) int lane) // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
10834*80a68eefSBob Badour {
10835*80a68eefSBob Badour     uint32x4x3_t v;
10836*80a68eefSBob Badour     v.val[0] = _MM_INSERT_EPI32 ( src->val[0],  ptr[0], lane);
10837*80a68eefSBob Badour     v.val[1] = _MM_INSERT_EPI32 ( src->val[1],  ptr[1], lane);
10838*80a68eefSBob Badour     v.val[2] = _MM_INSERT_EPI32 ( src->val[2],  ptr[2], lane);
10839*80a68eefSBob Badour     return v;
10840*80a68eefSBob Badour }
10841*80a68eefSBob Badour #define vld3q_lane_u32(ptr, src, lane) vld3q_lane_u32_ptr(ptr, &src, lane)
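
//Illustrative usage sketch for the lane-triplet loads described above; not part of the original
//port, and the helper name example_vld3q_lane_u32_usage is hypothetical. Three consecutive
//values from ptr go into the same lane of the three destination registers.
_NEON2SSE_INLINE uint32x4x3_t example_vld3q_lane_u32_usage(__transfersize(3) uint32_t const * ptr)
{
    _NEON2SSE_ALIGN_16 uint32_t zero[4] = {0, 0, 0, 0};
    uint32x4x3_t src;
    src.val[0] = vld1q_u32(zero);
    src.val[1] = vld1q_u32(zero);
    src.val[2] = vld1q_u32(zero);
    //ptr[0], ptr[1], ptr[2] are inserted into lane 2 of val[0], val[1], val[2]; other lanes stay 0
    return vld3q_lane_u32(ptr, src, 2);
}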
10842*80a68eefSBob Badour 
10843*80a68eefSBob Badour //int16x8x3_t vld3q_lane_s16(__transfersize(3) int16_t const * ptr, int16x8x3_t src, __constrange(0,7)int lane);// VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
10844*80a68eefSBob Badour _NEON2SSE_INLINE int16x8x3_t vld3q_lane_s16_ptr(__transfersize(3) int16_t const * ptr, int16x8x3_t* src, __constrange(0,7) int lane) // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
10845*80a68eefSBob Badour {
10846*80a68eefSBob Badour     int16x8x3_t v;
10847*80a68eefSBob Badour     v.val[0] = _MM_INSERT_EPI16 ( src->val[0],  ptr[0], lane);
10848*80a68eefSBob Badour     v.val[1] = _MM_INSERT_EPI16 ( src->val[1],  ptr[1], lane);
10849*80a68eefSBob Badour     v.val[2] = _MM_INSERT_EPI16 ( src->val[2],  ptr[2], lane);
10850*80a68eefSBob Badour     return v;
10851*80a68eefSBob Badour }
10852*80a68eefSBob Badour #define vld3q_lane_s16(ptr, src, lane) vld3q_lane_s16_ptr(ptr, &src, lane)
10853*80a68eefSBob Badour 
10854*80a68eefSBob Badour //int32x4x3_t vld3q_lane_s32(__transfersize(3) int32_t const * ptr, int32x4x3_t src, __constrange(0,3)int lane);// VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
10855*80a68eefSBob Badour _NEON2SSE_INLINE int32x4x3_t vld3q_lane_s32_ptr(__transfersize(3) int32_t const * ptr, int32x4x3_t* src, __constrange(0,3) int lane) // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
10856*80a68eefSBob Badour {
10857*80a68eefSBob Badour     int32x4x3_t v;
10858*80a68eefSBob Badour     v.val[0] = _MM_INSERT_EPI32 ( src->val[0],  ptr[0], lane);
10859*80a68eefSBob Badour     v.val[1] = _MM_INSERT_EPI32 ( src->val[1],  ptr[1], lane);
10860*80a68eefSBob Badour     v.val[2] = _MM_INSERT_EPI32 ( src->val[2],  ptr[2], lane);
10861*80a68eefSBob Badour     return v;
10862*80a68eefSBob Badour }
10863*80a68eefSBob Badour #define vld3q_lane_s32(ptr, src, lane) vld3q_lane_s32_ptr(ptr, &src, lane)
10864*80a68eefSBob Badour 
10865*80a68eefSBob Badour _NEON2SSESTORAGE float16x8x3_t vld3q_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
10866*80a68eefSBob Badour //current IA SIMD doesn't support float16
10867*80a68eefSBob Badour #define vld3q_lane_f16(ptr, src, lane) vld3q_lane_f16_ptr(ptr, &src, lane)
10868*80a68eefSBob Badour 
10869*80a68eefSBob Badour 
10870*80a68eefSBob Badour //float32x4x3_t vld3q_lane_f32(__transfersize(3) float32_t const * ptr, float32x4x3_t src,__constrange(0,3) int lane);// VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
10871*80a68eefSBob Badour _NEON2SSE_INLINE float32x4x3_t vld3q_lane_f32_ptr(__transfersize(3) float32_t const * ptr, float32x4x3_t* src,__constrange(0,3) int lane) // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
10872*80a68eefSBob Badour {
10873*80a68eefSBob Badour     float32x4x3_t v;
10874*80a68eefSBob Badour     v.val[0] = vld1q_lane_f32(&ptr[0], src->val[0], lane);
10875*80a68eefSBob Badour     v.val[1] = vld1q_lane_f32(&ptr[1], src->val[1], lane);
10876*80a68eefSBob Badour     v.val[2] = vld1q_lane_f32(&ptr[2], src->val[2], lane);
10877*80a68eefSBob Badour     return v;
10878*80a68eefSBob Badour }
10879*80a68eefSBob Badour #define vld3q_lane_f32(ptr,src,lane) vld3q_lane_f32_ptr(ptr,&src,lane)
10880*80a68eefSBob Badour 
10881*80a68eefSBob Badour _NEON2SSESTORAGE poly16x8x3_t vld3q_lane_p16_ptr(__transfersize(3) poly16_t const * ptr, poly16x8x3_t * src,__constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
10882*80a68eefSBob Badour #define vld3q_lane_p16 vld3q_lane_u16
10883*80a68eefSBob Badour 
10884*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8x3_t vld3_lane_u8(__transfersize(3) uint8_t const * ptr, uint8x8x3_t src, __constrange(0,7) int lane);// VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
10885*80a68eefSBob Badour _NEON2SSE_INLINE uint8x8x3_t vld3_lane_u8(__transfersize(3) uint8_t const * ptr, uint8x8x3_t src, __constrange(0,7) int lane) // VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
10886*80a68eefSBob Badour {
10887*80a68eefSBob Badour     uint8x8x3_t v;
10888*80a68eefSBob Badour     v.val[0] = vld1_lane_u8(ptr, src.val[0], lane);
10889*80a68eefSBob Badour     v.val[1] = vld1_lane_u8((ptr + 1), src.val[1], lane);
10890*80a68eefSBob Badour     v.val[2] = vld1_lane_u8((ptr + 2), src.val[2], lane);
10891*80a68eefSBob Badour     return v;
10892*80a68eefSBob Badour }
10893*80a68eefSBob Badour 
10894*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4x3_t vld3_lane_u16(__transfersize(3) uint16_t   const * ptr, uint16x4x3_t src, __constrange(0,3)int lane);// VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
10895*80a68eefSBob Badour _NEON2SSE_INLINE uint16x4x3_t vld3_lane_u16(__transfersize(3) uint16_t const * ptr, uint16x4x3_t src, __constrange(0,3) int lane) // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
10896*80a68eefSBob Badour {
10897*80a68eefSBob Badour     uint16x4x3_t v;
10898*80a68eefSBob Badour     v.val[0] = vld1_lane_u16(ptr, src.val[0], lane);
10899*80a68eefSBob Badour     v.val[1] = vld1_lane_u16((ptr + 1), src.val[1], lane);
10900*80a68eefSBob Badour     v.val[2] = vld1_lane_u16((ptr + 2), src.val[2], lane);
10901*80a68eefSBob Badour     return v;
10902*80a68eefSBob Badour }
10903*80a68eefSBob Badour 
10904*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2x3_t vld3_lane_u32(__transfersize(3) uint32_t const * ptr, uint32x2x3_t src, __constrange(0,1)int lane);// VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
10905*80a68eefSBob Badour _NEON2SSE_INLINE uint32x2x3_t vld3_lane_u32(__transfersize(3) uint32_t const * ptr, uint32x2x3_t src, __constrange(0,1) int lane) // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
10906*80a68eefSBob Badour {
10907*80a68eefSBob Badour     //need to merge into 128 bit anyway
10908*80a68eefSBob Badour     uint32x2x3_t v;
10909*80a68eefSBob Badour     v.val[0] = vld1_lane_u32(ptr, src.val[0], lane);
10910*80a68eefSBob Badour     v.val[1] = vld1_lane_u32((ptr + 1), src.val[1], lane);
10911*80a68eefSBob Badour     v.val[2] = vld1_lane_u32((ptr + 2), src.val[2], lane);
10912*80a68eefSBob Badour     return v;
10913*80a68eefSBob Badour }
10914*80a68eefSBob Badour 
10915*80a68eefSBob Badour _NEON2SSESTORAGE int8x8x3_t vld3_lane_s8(__transfersize(3) int8_t const * ptr, int8x8x3_t  src, __constrange(0,7) int lane); // VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
10916*80a68eefSBob Badour #define vld3_lane_s8(ptr, src, lane)  vld3_lane_u8(( uint8_t*) ptr, src, lane)
10917*80a68eefSBob Badour 
10918*80a68eefSBob Badour _NEON2SSESTORAGE int16x4x3_t vld3_lane_s16(__transfersize(3) int16_t const * ptr, int16x4x3_t  src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
10919*80a68eefSBob Badour #define vld3_lane_s16(ptr, src, lane)  vld3_lane_u16(( uint16_t*) ptr, src, lane)
10920*80a68eefSBob Badour 
10921*80a68eefSBob Badour _NEON2SSESTORAGE int32x2x3_t vld3_lane_s32(__transfersize(3) int32_t const * ptr, int32x2x3_t  src, __constrange(0,1) int lane); // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
10922*80a68eefSBob Badour #define vld3_lane_s32(ptr, src, lane)  vld3_lane_u32(( uint32_t*) ptr, src, lane)
10923*80a68eefSBob Badour 
10924*80a68eefSBob Badour _NEON2SSESTORAGE float16x4x3_t vld3_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x4x3_t * src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
10925*80a68eefSBob Badour //current IA SIMD doesn't support float16
10926*80a68eefSBob Badour 
10927*80a68eefSBob Badour _NEON2SSESTORAGE float32x2x3_t vld3_lane_f32(__transfersize(3) float32_t const * ptr, float32x2x3_t src,__constrange(0,1) int lane);// VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
10928*80a68eefSBob Badour _NEON2SSE_INLINE float32x2x3_t vld3_lane_f32(__transfersize(3) float32_t const * ptr, float32x2x3_t src,__constrange(0,1) int lane) // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
10929*80a68eefSBob Badour {
10930*80a68eefSBob Badour     float32x2x3_t v;
10931*80a68eefSBob Badour     v.val[0] = vld1_lane_f32(ptr, src.val[0], lane);
10932*80a68eefSBob Badour     v.val[1] = vld1_lane_f32((ptr + 1), src.val[1], lane);
10933*80a68eefSBob Badour     v.val[2] = vld1_lane_f32((ptr + 2), src.val[2], lane);
10934*80a68eefSBob Badour     return v;
10935*80a68eefSBob Badour }
10936*80a68eefSBob Badour 
10937*80a68eefSBob Badour _NEON2SSESTORAGE poly8x8x3_t vld3_lane_p8(__transfersize(3) poly8_t const * ptr, poly8x8x3_t src, __constrange(0,7) int lane); // VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
10938*80a68eefSBob Badour #define vld3_lane_p8 vld3_lane_u8
10939*80a68eefSBob Badour 
10940*80a68eefSBob Badour _NEON2SSESTORAGE poly16x4x3_t vld3_lane_p16(__transfersize(3) poly16_t const * ptr, poly16x4x3_t src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
10941*80a68eefSBob Badour #define vld3_lane_p16 vld3_lane_u16
10942*80a68eefSBob Badour 
10943*80a68eefSBob Badour //******************* Lane Quadruples  load ***************************
10944*80a68eefSBob Badour //*********************************************************************
10945*80a68eefSBob Badour //performs vld1_lane_xx: loads ptr[0] into src->val[0], ptr[1] into src->val[1], ptr[2] into src->val[2] and ptr[3] into src->val[3] at the given lane position
10946*80a68eefSBob Badour //we assume src is 16-byte aligned; see the illustrative usage sketch after vld4q_lane_u32 below
10947*80a68eefSBob Badour 
10948*80a68eefSBob Badour //uint16x8x4_t vld4q_lane_u16(__transfersize(4) uint16_t const * ptr, uint16x8x4_t src,__constrange(0,7) int lane)// VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
10949*80a68eefSBob Badour _NEON2SSE_INLINE uint16x8x4_t vld4q_lane_u16_ptr(__transfersize(4) uint16_t const * ptr, uint16x8x4_t* src,__constrange(0,7) int lane)
10950*80a68eefSBob Badour {
10951*80a68eefSBob Badour     uint16x8x4_t v;
10952*80a68eefSBob Badour     v.val[0] = _MM_INSERT_EPI16 ( src->val[0],  ptr[0], lane);
10953*80a68eefSBob Badour     v.val[1] = _MM_INSERT_EPI16 ( src->val[1],  ptr[1], lane);
10954*80a68eefSBob Badour     v.val[2] = _MM_INSERT_EPI16 ( src->val[2],  ptr[2], lane);
10955*80a68eefSBob Badour     v.val[3] = _MM_INSERT_EPI16 ( src->val[3],  ptr[3], lane);
10956*80a68eefSBob Badour     return v;
10957*80a68eefSBob Badour }
10958*80a68eefSBob Badour #define vld4q_lane_u16(ptr, src, lane) vld4q_lane_u16_ptr(ptr, &src, lane)
10959*80a68eefSBob Badour 
10960*80a68eefSBob Badour //uint32x4x4_t vld4q_lane_u32(__transfersize(4) uint32_t const * ptr, uint32x4x4_t src,__constrange(0,3) int lane)// VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
10961*80a68eefSBob Badour _NEON2SSE_INLINE uint32x4x4_t vld4q_lane_u32_ptr(__transfersize(4) uint32_t const * ptr, uint32x4x4_t* src,__constrange(0,3) int lane)
10962*80a68eefSBob Badour {
10963*80a68eefSBob Badour     uint32x4x4_t v;
10964*80a68eefSBob Badour     v.val[0] = _MM_INSERT_EPI32 ( src->val[0],  ptr[0], lane);
10965*80a68eefSBob Badour     v.val[1] = _MM_INSERT_EPI32 ( src->val[1],  ptr[1], lane);
10966*80a68eefSBob Badour     v.val[2] = _MM_INSERT_EPI32 ( src->val[2],  ptr[2], lane);
10967*80a68eefSBob Badour     v.val[3] = _MM_INSERT_EPI32 ( src->val[3],  ptr[3], lane);
10968*80a68eefSBob Badour     return v;
10969*80a68eefSBob Badour }
10970*80a68eefSBob Badour #define vld4q_lane_u32(ptr, src, lane) vld4q_lane_u32_ptr(ptr, &src, lane)
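
//Illustrative usage sketch for the lane-quadruplet loads described above (hypothetical helper
//name, not part of the original port): four consecutive uint16 values from ptr land in the same
//lane of four q-registers.
_NEON2SSE_INLINE uint16x8x4_t example_vld4q_lane_u16_usage(__transfersize(4) uint16_t const * ptr)
{
    _NEON2SSE_ALIGN_16 uint16_t zero[8] = {0, 0, 0, 0, 0, 0, 0, 0};
    uint16x8x4_t src;
    src.val[0] = vld1q_u16(zero);
    src.val[1] = vld1q_u16(zero);
    src.val[2] = vld1q_u16(zero);
    src.val[3] = vld1q_u16(zero);
    //ptr[0]..ptr[3] go to lane 5 of val[0]..val[3]; the remaining lanes keep their src contents
    return vld4q_lane_u16(ptr, src, 5);
}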
10971*80a68eefSBob Badour 
10972*80a68eefSBob Badour //int16x8x4_t vld4q_lane_s16(__transfersize(4) int16_t const * ptr, int16x8x4_t src, __constrange(0,7)int lane);// VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
10973*80a68eefSBob Badour _NEON2SSESTORAGE int16x8x4_t vld4q_lane_s16_ptr(__transfersize(4) int16_t const * ptr, int16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
10974*80a68eefSBob Badour #define vld4q_lane_s16(ptr, src, lane) vld4q_lane_u16(( uint16_t*) ptr, src, lane)
10975*80a68eefSBob Badour 
10976*80a68eefSBob Badour //int32x4x4_t vld4q_lane_s32(__transfersize(4) int32_t const * ptr, int32x4x4_t src, __constrange(0,3)int lane);// VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
10977*80a68eefSBob Badour _NEON2SSESTORAGE int32x4x4_t vld4q_lane_s32_ptr(__transfersize(4) int32_t const * ptr, int32x4x4_t * src, __constrange(0,3) int lane); // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
10978*80a68eefSBob Badour #define vld4q_lane_s32(ptr, src, lane)  vld4q_lane_u32(( uint32_t*) ptr, src, lane)
10979*80a68eefSBob Badour 
10980*80a68eefSBob Badour //float16x8x4_t vld4q_lane_f16(__transfersize(4) __fp16 const * ptr, float16x8x4_t src, __constrange(0,7)int lane);// VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
10981*80a68eefSBob Badour _NEON2SSESTORAGE float16x8x4_t vld4q_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
10982*80a68eefSBob Badour //current IA SIMD doesn't support float16
10983*80a68eefSBob Badour 
10984*80a68eefSBob Badour //float32x4x4_t vld4q_lane_f32(__transfersize(4) float32_t const * ptr, float32x4x4_t src,__constrange(0,3) int lane)// VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
10985*80a68eefSBob Badour _NEON2SSE_INLINE float32x4x4_t vld4q_lane_f32_ptr(__transfersize(4) float32_t const * ptr, float32x4x4_t* src,__constrange(0,3) int lane)
10986*80a68eefSBob Badour {
10987*80a68eefSBob Badour     float32x4x4_t v;
10988*80a68eefSBob Badour     v.val[0] = vld1q_lane_f32(&ptr[0], src->val[0], lane);
10989*80a68eefSBob Badour     v.val[1] = vld1q_lane_f32(&ptr[1], src->val[1], lane);
10990*80a68eefSBob Badour     v.val[2] = vld1q_lane_f32(&ptr[2], src->val[2], lane);
10991*80a68eefSBob Badour     v.val[3] = vld1q_lane_f32(&ptr[3], src->val[3], lane);
10992*80a68eefSBob Badour     return v;
10993*80a68eefSBob Badour }
10994*80a68eefSBob Badour #define vld4q_lane_f32(ptr,val,lane) vld4q_lane_f32_ptr(ptr,&val,lane)
10995*80a68eefSBob Badour 
10996*80a68eefSBob Badour //poly16x8x4_t vld4q_lane_p16(__transfersize(4) poly16_t const * ptr, poly16x8x4_t src,__constrange(0,7) int lane);// VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
10997*80a68eefSBob Badour _NEON2SSESTORAGE poly16x8x4_t vld4q_lane_p16_ptr(__transfersize(4) poly16_t const * ptr, poly16x8x4_t * src,__constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
10998*80a68eefSBob Badour #define vld4q_lane_p16 vld4q_lane_u16
10999*80a68eefSBob Badour 
11000*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8x4_t vld4_lane_u8(__transfersize(4) uint8_t const * ptr, uint8x8x4_t src, __constrange(0,7) int lane);// VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11001*80a68eefSBob Badour _NEON2SSE_INLINE uint8x8x4_t vld4_lane_u8(__transfersize(4) uint8_t const * ptr, uint8x8x4_t src, __constrange(0,7) int lane)
11002*80a68eefSBob Badour {
11003*80a68eefSBob Badour     uint8x8x4_t v;
11004*80a68eefSBob Badour     v.val[0] = vld1_lane_u8(ptr, src.val[0], lane);
11005*80a68eefSBob Badour     v.val[1] = vld1_lane_u8((ptr + 1), src.val[1], lane);
11006*80a68eefSBob Badour     v.val[2] = vld1_lane_u8((ptr + 2), src.val[2], lane);
11007*80a68eefSBob Badour     v.val[3] = vld1_lane_u8((ptr + 3), src.val[3], lane);
11008*80a68eefSBob Badour     return v;
11009*80a68eefSBob Badour }
11010*80a68eefSBob Badour 
11011*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4x4_t vld4_lane_u16(__transfersize(4) uint16_t const * ptr, uint16x4x4_t src, __constrange(0,3)int lane);// VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11012*80a68eefSBob Badour _NEON2SSE_INLINE uint16x4x4_t vld4_lane_u16(__transfersize(4) uint16_t const * ptr, uint16x4x4_t src, __constrange(0,3) int lane)
11013*80a68eefSBob Badour {
11014*80a68eefSBob Badour     uint16x4x4_t v;
11015*80a68eefSBob Badour     v.val[0] = vld1_lane_u16(ptr, src.val[0], lane);
11016*80a68eefSBob Badour     v.val[1] = vld1_lane_u16((ptr + 1), src.val[1], lane);
11017*80a68eefSBob Badour     v.val[2] = vld1_lane_u16((ptr + 2), src.val[2], lane);
11018*80a68eefSBob Badour     v.val[3] = vld1_lane_u16((ptr + 3), src.val[3], lane);
11019*80a68eefSBob Badour     return v;
11020*80a68eefSBob Badour }
11021*80a68eefSBob Badour 
11022*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2x4_t vld4_lane_u32(__transfersize(4) uint32_t const * ptr, uint32x2x4_t src, __constrange(0,1)int lane);// VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11023*80a68eefSBob Badour _NEON2SSE_INLINE uint32x2x4_t vld4_lane_u32(__transfersize(4) uint32_t const * ptr, uint32x2x4_t src, __constrange(0,1) int lane)
11024*80a68eefSBob Badour {
11025*80a68eefSBob Badour     uint32x2x4_t v;
11026*80a68eefSBob Badour     v.val[0] = vld1_lane_u32(ptr, src.val[0], lane);
11027*80a68eefSBob Badour     v.val[1] = vld1_lane_u32((ptr + 1), src.val[1], lane);
11028*80a68eefSBob Badour     v.val[2] = vld1_lane_u32((ptr + 2), src.val[2], lane);
11029*80a68eefSBob Badour     v.val[3] = vld1_lane_u32((ptr + 3), src.val[3], lane);
11030*80a68eefSBob Badour     return v;
11031*80a68eefSBob Badour }
11032*80a68eefSBob Badour 
11033*80a68eefSBob Badour _NEON2SSESTORAGE int8x8x4_t vld4_lane_s8(__transfersize(4) int8_t const * ptr, int8x8x4_t src, __constrange(0,7) int lane);// VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11034*80a68eefSBob Badour #define vld4_lane_s8(ptr,src,lane) vld4_lane_u8((uint8_t*)ptr,src,lane)
11035*80a68eefSBob Badour 
11036*80a68eefSBob Badour _NEON2SSESTORAGE int16x4x4_t vld4_lane_s16(__transfersize(4) int16_t const * ptr, int16x4x4_t src, __constrange(0,3) int lane);// VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11037*80a68eefSBob Badour #define vld4_lane_s16(ptr,src,lane) vld4_lane_u16((uint16_t*)ptr,src,lane)
11038*80a68eefSBob Badour 
11039*80a68eefSBob Badour _NEON2SSESTORAGE int32x2x4_t vld4_lane_s32(__transfersize(4) int32_t const * ptr, int32x2x4_t src, __constrange(0,1) int lane);// VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11040*80a68eefSBob Badour #define vld4_lane_s32(ptr,src,lane) vld4_lane_u32((uint32_t*)ptr,src,lane)
11041*80a68eefSBob Badour 
11042*80a68eefSBob Badour //float16x4x4_t vld4_lane_f16(__transfersize(4) __fp16 const * ptr, float16x4x4_t src, __constrange(0,3)int lane);// VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11043*80a68eefSBob Badour _NEON2SSESTORAGE float16x4x4_t vld4_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x4x4_t * src, __constrange(0,3) int lane);
11044*80a68eefSBob Badour //current IA SIMD doesn't support float16
11045*80a68eefSBob Badour 
11046*80a68eefSBob Badour _NEON2SSESTORAGE float32x2x4_t vld4_lane_f32(__transfersize(4) float32_t const * ptr, float32x2x4_t src,__constrange(0,1) int lane);// VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11047*80a68eefSBob Badour _NEON2SSE_INLINE float32x2x4_t vld4_lane_f32(__transfersize(4) float32_t const * ptr, float32x2x4_t src,__constrange(0,1) int lane)
11048*80a68eefSBob Badour {
11049*80a68eefSBob Badour     //serial solution may be faster
11050*80a68eefSBob Badour     float32x2x4_t v;
11051*80a68eefSBob Badour     v.val[0] = vld1_lane_f32(ptr, src.val[0], lane);
11052*80a68eefSBob Badour     v.val[1] = vld1_lane_f32((ptr + 1), src.val[1], lane);
11053*80a68eefSBob Badour     v.val[2] = vld1_lane_f32((ptr + 2), src.val[2], lane);
11054*80a68eefSBob Badour     v.val[3] = vld1_lane_f32((ptr + 3), src.val[3], lane);
11055*80a68eefSBob Badour     return v;
11056*80a68eefSBob Badour }
11057*80a68eefSBob Badour 
11058*80a68eefSBob Badour _NEON2SSESTORAGE poly8x8x4_t vld4_lane_p8(__transfersize(4) poly8_t const * ptr, poly8x8x4_t src, __constrange(0,7) int lane);// VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11059*80a68eefSBob Badour #define vld4_lane_p8 vld4_lane_u8
11060*80a68eefSBob Badour 
11061*80a68eefSBob Badour _NEON2SSESTORAGE poly16x4x4_t vld4_lane_p16(__transfersize(4) poly16_t const * ptr, poly16x4x4_t src, __constrange(0,3)int lane);// VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11062*80a68eefSBob Badour #define vld4_lane_p16 vld4_lane_u16
11063*80a68eefSBob Badour 
11064*80a68eefSBob Badour //******************* Store duplets *********************************************
11065*80a68eefSBob Badour //********************************************************************************
11066*80a68eefSBob Badour //void vst2q_u8(__transfersize(32) uint8_t * ptr, uint8x16x2_t val)// VST2.8 {d0, d2}, [r0]
11067*80a68eefSBob Badour _NEON2SSE_INLINE void vst2q_u8_ptr(__transfersize(32) uint8_t * ptr, uint8x16x2_t* val)
11068*80a68eefSBob Badour {
11069*80a68eefSBob Badour     uint8x16x2_t v;
11070*80a68eefSBob Badour     v.val[0] = _mm_unpacklo_epi8(val->val[0], val->val[1]);
11071*80a68eefSBob Badour     v.val[1] = _mm_unpackhi_epi8(val->val[0], val->val[1]);
11072*80a68eefSBob Badour     vst1q_u8 (ptr, v.val[0]);
11073*80a68eefSBob Badour     vst1q_u8 ((ptr + 16),  v.val[1]);
11074*80a68eefSBob Badour }
11075*80a68eefSBob Badour #define vst2q_u8(ptr, val) vst2q_u8_ptr(ptr, &val)
11076*80a68eefSBob Badour 
11077*80a68eefSBob Badour //void vst2q_u16(__transfersize(16) uint16_t * ptr, uint16x8x2_t val)// VST2.16 {d0, d2}, [r0]
11078*80a68eefSBob Badour _NEON2SSE_INLINE void vst2q_u16_ptr(__transfersize(16) uint16_t * ptr, uint16x8x2_t* val)
11079*80a68eefSBob Badour {
11080*80a68eefSBob Badour     uint16x8x2_t v;
11081*80a68eefSBob Badour     v.val[0] = _mm_unpacklo_epi16(val->val[0], val->val[1]);
11082*80a68eefSBob Badour     v.val[1] = _mm_unpackhi_epi16(val->val[0], val->val[1]);
11083*80a68eefSBob Badour     vst1q_u16 (ptr, v.val[0]);
11084*80a68eefSBob Badour     vst1q_u16 ((ptr + 8),  v.val[1]);
11085*80a68eefSBob Badour }
11086*80a68eefSBob Badour #define vst2q_u16(ptr, val) vst2q_u16_ptr(ptr, &val)
11087*80a68eefSBob Badour 
11088*80a68eefSBob Badour //void vst2q_u32(__transfersize(8) uint32_t * ptr, uint32x4x2_t val)// VST2.32 {d0, d2}, [r0]
11089*80a68eefSBob Badour _NEON2SSE_INLINE void vst2q_u32_ptr(__transfersize(8) uint32_t* ptr, uint32x4x2_t* val)
11090*80a68eefSBob Badour {
11091*80a68eefSBob Badour     uint32x4x2_t v;
11092*80a68eefSBob Badour     v.val[0] = _mm_unpacklo_epi32(val->val[0], val->val[1]);
11093*80a68eefSBob Badour     v.val[1] = _mm_unpackhi_epi32(val->val[0], val->val[1]);
11094*80a68eefSBob Badour     vst1q_u32 (ptr, v.val[0]);
11095*80a68eefSBob Badour     vst1q_u32 ((ptr + 4),  v.val[1]);
11096*80a68eefSBob Badour }
11097*80a68eefSBob Badour #define vst2q_u32(ptr, val) vst2q_u32_ptr(ptr, &val)
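
//Illustrative usage sketch, not part of the original port: vst2q_u32 interleaves the two source
//registers on store. The helper name example_vst2q_u32_usage is hypothetical.
_NEON2SSE_INLINE void example_vst2q_u32_usage(__transfersize(8) uint32_t * ptr)
{
    _NEON2SSE_ALIGN_16 uint32_t even[4] = {0, 2, 4, 6};
    _NEON2SSE_ALIGN_16 uint32_t odd[4]  = {1, 3, 5, 7};
    uint32x4x2_t v;
    v.val[0] = vld1q_u32(even);
    v.val[1] = vld1q_u32(odd);
    vst2q_u32(ptr, v); //ptr receives the interleaved sequence 0,1,2,3,4,5,6,7
}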
11098*80a68eefSBob Badour 
11099*80a68eefSBob Badour //void vst2q_s8(__transfersize(32) int8_t * ptr, int8x16x2_t val); // VST2.8 {d0, d2}, [r0]
11100*80a68eefSBob Badour _NEON2SSESTORAGE void vst2q_s8_ptr(__transfersize(32) int8_t * ptr, int8x16x2_t * val);
11101*80a68eefSBob Badour #define vst2q_s8(ptr, val) vst2q_u8((uint8_t*)(ptr), val)
11102*80a68eefSBob Badour 
11103*80a68eefSBob Badour //void vst2q_s16(__transfersize(16) int16_t * ptr, int16x8x2_t val);// VST2.16 {d0, d2}, [r0]
11104*80a68eefSBob Badour _NEON2SSESTORAGE void vst2q_s16_ptr(__transfersize(16) int16_t * ptr, int16x8x2_t * val);
11105*80a68eefSBob Badour #define vst2q_s16(ptr, val) vst2q_u16((uint16_t*)(ptr), val)
11106*80a68eefSBob Badour 
11107*80a68eefSBob Badour //void vst2q_s32(__transfersize(8) int32_t * ptr, int32x4x2_t val);// VST2.32 {d0, d2}, [r0]
11108*80a68eefSBob Badour _NEON2SSESTORAGE void vst2q_s32_ptr(__transfersize(8) int32_t * ptr, int32x4x2_t * val);
11109*80a68eefSBob Badour #define vst2q_s32(ptr, val)  vst2q_u32((uint32_t*)(ptr), val)
11110*80a68eefSBob Badour 
11111*80a68eefSBob Badour //void vst2q_f16(__transfersize(16) __fp16 * ptr, float16x8x2_t val);// VST2.16 {d0, d2}, [r0]
11112*80a68eefSBob Badour _NEON2SSESTORAGE void vst2q_f16_ptr(__transfersize(16) __fp16 * ptr, float16x8x2_t * val);
11113*80a68eefSBob Badour // IA32 SIMD doesn't work with 16bit floats currently
11114*80a68eefSBob Badour 
11115*80a68eefSBob Badour //void vst2q_f32(__transfersize(8) float32_t * ptr, float32x4x2_t val)// VST2.32 {d0, d2}, [r0]
11116*80a68eefSBob Badour _NEON2SSE_INLINE void vst2q_f32_ptr(__transfersize(8) float32_t* ptr, float32x4x2_t* val)
11117*80a68eefSBob Badour {
11118*80a68eefSBob Badour     float32x4x2_t v;
11119*80a68eefSBob Badour     v.val[0] = _mm_unpacklo_ps(val->val[0], val->val[1]);
11120*80a68eefSBob Badour     v.val[1] = _mm_unpackhi_ps(val->val[0], val->val[1]);
11121*80a68eefSBob Badour     vst1q_f32 (ptr, v.val[0]);
11122*80a68eefSBob Badour     vst1q_f32 ((ptr + 4),  v.val[1]);
11123*80a68eefSBob Badour }
11124*80a68eefSBob Badour #define vst2q_f32(ptr, val) vst2q_f32_ptr(ptr, &val)
11125*80a68eefSBob Badour 
11126*80a68eefSBob Badour //void vst2q_p8(__transfersize(32) poly8_t * ptr, poly8x16x2_t val);// VST2.8 {d0, d2}, [r0]
11127*80a68eefSBob Badour _NEON2SSESTORAGE void vst2q_p8_ptr(__transfersize(32) poly8_t * ptr, poly8x16x2_t * val);
11128*80a68eefSBob Badour #define vst2q_p8 vst2q_u8
11129*80a68eefSBob Badour 
11130*80a68eefSBob Badour //void vst2q_p16(__transfersize(16) poly16_t * ptr, poly16x8x2_t val);// VST2.16 {d0, d2}, [r0]
11131*80a68eefSBob Badour _NEON2SSESTORAGE void vst2q_p16_ptr(__transfersize(16) poly16_t * ptr, poly16x8x2_t * val);
11132*80a68eefSBob Badour #define vst2q_p16 vst2q_u16
11133*80a68eefSBob Badour 
11134*80a68eefSBob Badour _NEON2SSESTORAGE void vst2_u8(__transfersize(16) uint8_t * ptr, uint8x8x2_t val);// VST2.8 {d0, d1}, [r0]
11135*80a68eefSBob Badour _NEON2SSE_INLINE void vst2_u8(__transfersize(16) uint8_t * ptr, uint8x8x2_t val)
11136*80a68eefSBob Badour {
11137*80a68eefSBob Badour     __m128i v0;
11138*80a68eefSBob Badour     v0 = _mm_unpacklo_epi8(_pM128i(val.val[0]), _pM128i(val.val[1]));
11139*80a68eefSBob Badour     vst1q_u8 (ptr, v0);
11140*80a68eefSBob Badour }
11141*80a68eefSBob Badour 
11142*80a68eefSBob Badour _NEON2SSESTORAGE void vst2_u16(__transfersize(8) uint16_t * ptr, uint16x4x2_t val);// VST2.16 {d0, d1}, [r0]
11143*80a68eefSBob Badour _NEON2SSE_INLINE void vst2_u16(__transfersize(8) uint16_t * ptr, uint16x4x2_t val)
11144*80a68eefSBob Badour {
11145*80a68eefSBob Badour     __m128i v0;
11146*80a68eefSBob Badour     v0 = _mm_unpacklo_epi16(_pM128i(val.val[0]), _pM128i(val.val[1]));
11147*80a68eefSBob Badour     vst1q_u16 (ptr, v0);
11148*80a68eefSBob Badour }
11149*80a68eefSBob Badour 
11150*80a68eefSBob Badour _NEON2SSESTORAGE void vst2_u32(__transfersize(4) uint32_t * ptr, uint32x2x2_t val);// VST2.32 {d0, d1}, [r0]
11151*80a68eefSBob Badour _NEON2SSE_INLINE void vst2_u32(__transfersize(4) uint32_t * ptr, uint32x2x2_t val)
11152*80a68eefSBob Badour {
11153*80a68eefSBob Badour     __m128i v0;
11154*80a68eefSBob Badour     v0 = _mm_unpacklo_epi32(_pM128i(val.val[0]), _pM128i(val.val[1]));
11155*80a68eefSBob Badour     vst1q_u32 (ptr, v0);
11156*80a68eefSBob Badour }
11157*80a68eefSBob Badour 
11158*80a68eefSBob Badour _NEON2SSESTORAGE void vst2_u64(__transfersize(2) uint64_t * ptr, uint64x1x2_t val);// VST1.64 {d0, d1}, [r0]
11159*80a68eefSBob Badour _NEON2SSE_INLINE void vst2_u64(__transfersize(2) uint64_t * ptr, uint64x1x2_t val)
11160*80a68eefSBob Badour {
11161*80a68eefSBob Badour     *(ptr) = val.val[0].m64_u64[0];
11162*80a68eefSBob Badour     *(ptr + 1) = val.val[1].m64_u64[0];
11163*80a68eefSBob Badour }
11164*80a68eefSBob Badour 
11165*80a68eefSBob Badour _NEON2SSESTORAGE void vst2_s8(__transfersize(16) int8_t * ptr, int8x8x2_t val);// VST2.8 {d0, d1}, [r0]
11166*80a68eefSBob Badour #define vst2_s8(ptr, val) vst2_u8((uint8_t*) ptr, val)
11167*80a68eefSBob Badour 
11168*80a68eefSBob Badour _NEON2SSESTORAGE void vst2_s16(__transfersize(8) int16_t * ptr, int16x4x2_t val); // VST2.16 {d0, d1}, [r0]
11169*80a68eefSBob Badour #define vst2_s16(ptr,val) vst2_u16((uint16_t*) ptr, val)
11170*80a68eefSBob Badour 
11171*80a68eefSBob Badour _NEON2SSESTORAGE void vst2_s32(__transfersize(4) int32_t * ptr, int32x2x2_t val); // VST2.32 {d0, d1}, [r0]
11172*80a68eefSBob Badour #define vst2_s32(ptr,val) vst2_u32((uint32_t*) ptr, val)
11173*80a68eefSBob Badour 
11174*80a68eefSBob Badour _NEON2SSESTORAGE void vst2_s64(__transfersize(2) int64_t * ptr, int64x1x2_t val);
11175*80a68eefSBob Badour #define vst2_s64(ptr,val) vst2_u64((uint64_t*) ptr,val)
11176*80a68eefSBob Badour 
11177*80a68eefSBob Badour //void vst2_f16(__transfersize(8) __fp16 * ptr, float16x4x2_t val); // VST2.16 {d0, d1}, [r0]
11178*80a68eefSBob Badour //current IA SIMD doesn't support float16
11179*80a68eefSBob Badour 
11180*80a68eefSBob Badour _NEON2SSESTORAGE void vst2_f32(__transfersize(4) float32_t * ptr, float32x2x2_t val); // VST2.32 {d0, d1}, [r0]
11181*80a68eefSBob Badour _NEON2SSE_INLINE void vst2_f32(__transfersize(4) float32_t* ptr, float32x2x2_t val)
11182*80a68eefSBob Badour {
11183*80a68eefSBob Badour     *(ptr) =   val.val[0].m64_f32[0];
11184*80a68eefSBob Badour     *(ptr + 1) = val.val[1].m64_f32[0];
11185*80a68eefSBob Badour     *(ptr + 2) = val.val[0].m64_f32[1];
11186*80a68eefSBob Badour     *(ptr + 3) = val.val[1].m64_f32[1];
11187*80a68eefSBob Badour }
11188*80a68eefSBob Badour 
11189*80a68eefSBob Badour _NEON2SSESTORAGE void vst2_p8(__transfersize(16) poly8_t * ptr, poly8x8x2_t  val); // VST2.8 {d0, d1}, [r0]
11190*80a68eefSBob Badour #define vst2_p8 vst2_u8
11191*80a68eefSBob Badour 
11192*80a68eefSBob Badour _NEON2SSESTORAGE void vst2_p16(__transfersize(8) poly16_t * ptr, poly16x4x2_t  val); // VST2.16 {d0, d1}, [r0]
11193*80a68eefSBob Badour #define vst2_p16 vst2_u16
11194*80a68eefSBob Badour 
11195*80a68eefSBob Badour //******************** Triplets store  *****************************************
11196*80a68eefSBob Badour //******************************************************************************
11197*80a68eefSBob Badour //void vst3q_u8(__transfersize(48) uint8_t * ptr, uint8x16x3_t val)// VST3.8 {d0, d2, d4}, [r0]
11198*80a68eefSBob Badour _NEON2SSE_INLINE void vst3q_u8_ptr(__transfersize(48) uint8_t * ptr, uint8x16x3_t* val)
11199*80a68eefSBob Badour {
11200*80a68eefSBob Badour     uint8x16x3_t v;
11201*80a68eefSBob Badour     __m128i v0,v1,v2, cff, bldmask;
11202*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 static const uint8_t mask0[16]   = {0, 1, 0xff, 2, 3,0xff, 4, 5,0xff, 6,7,0xff, 8,9,0xff, 10};
11203*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 static const uint8_t mask1[16]   = {0, 0xff, 1, 2, 0xff, 3, 4, 0xff, 5, 6, 0xff, 7,8,0xff, 9,10};
11204*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 static const uint8_t mask2[16] =    {0xff, 6, 7, 0xff, 8, 9,0xff, 10, 11,0xff, 12,13,0xff, 14,15,0xff};
11205*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 static const uint8_t mask2lo[16] = {0xff,0xff, 0, 0xff,0xff, 1, 0xff,0xff, 2, 0xff,0xff, 3, 0xff,0xff, 4, 0xff};
11206*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 static const uint8_t mask2med[16] = {0xff, 5, 0xff, 0xff, 6, 0xff,0xff, 7, 0xff,0xff, 8, 0xff,0xff, 9, 0xff, 0xff};
11207*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 static const uint8_t mask2hi[16] = {10, 0xff,0xff, 11, 0xff,0xff, 12, 0xff,0xff, 13, 0xff,0xff, 14, 0xff, 0xff, 15};
11208*80a68eefSBob Badour 
11209*80a68eefSBob Badour     v0 =  _mm_unpacklo_epi8(val->val[0], val->val[1]); //0,1, 3,4, 6,7, 9,10, 12,13, 15,16, 18,19, 21,22
11210*80a68eefSBob Badour     v2 =  _mm_unpackhi_epi8(val->val[0], val->val[1]); //24,25,  27,28, 30,31, 33,34, 36,37, 39,40, 42,43, 45,46
11211*80a68eefSBob Badour     v1 =  _mm_alignr_epi8(v2, v0, 11); //16, 18,19, 21,22, 24,25, 27,28, 30,31, 33,34, 36,37, 39
11212*80a68eefSBob Badour     v.val[0] =  _mm_shuffle_epi8(v0, *(__m128i*)mask0); //make holes for the v.val[2] data embedding
11213*80a68eefSBob Badour     v.val[2] =  _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2lo); //make plugs for the v.val[2] data embedding
11214*80a68eefSBob Badour     cff = _mm_cmpeq_epi8(v0, v0); //all ff
11215*80a68eefSBob Badour     bldmask = _mm_cmpeq_epi8(*(__m128i*)mask0, cff);
11216*80a68eefSBob Badour     v.val[0] = _MM_BLENDV_EPI8(v.val[0], v.val[2], bldmask);
11217*80a68eefSBob Badour     vst1q_u8(ptr,   v.val[0]);
11218*80a68eefSBob Badour     v.val[0] =  _mm_shuffle_epi8(v1, *(__m128i*)mask1); //make holes for the v.val[2] data embedding
11219*80a68eefSBob Badour     v.val[2] =  _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2med); //make plugs for the v.val[2] data embedding
11220*80a68eefSBob Badour     bldmask = _mm_cmpeq_epi8(*(__m128i*)mask1, cff);
11221*80a68eefSBob Badour     v.val[1] = _MM_BLENDV_EPI8(v.val[0],v.val[2], bldmask);
11222*80a68eefSBob Badour     vst1q_u8((ptr + 16),  v.val[1]);
11223*80a68eefSBob Badour     v.val[0] =  _mm_shuffle_epi8(v2, *(__m128i*)mask2); //make holes for the v.val[2] data embedding
11224*80a68eefSBob Badour     v.val[2] =  _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2hi); //make plugs for the v.val[2] data embedding
11225*80a68eefSBob Badour     bldmask = _mm_cmpeq_epi8(*(__m128i*)mask2, cff);
11226*80a68eefSBob Badour     v.val[2] = _MM_BLENDV_EPI8(v.val[0],v.val[2], bldmask );
11227*80a68eefSBob Badour     vst1q_u8((ptr + 32),  v.val[2]);
11228*80a68eefSBob Badour }
11229*80a68eefSBob Badour #define vst3q_u8(ptr, val) vst3q_u8_ptr(ptr, &val)
11230*80a68eefSBob Badour 
11231*80a68eefSBob Badour //void vst3q_u16(__transfersize(24) uint16_t * ptr, uint16x8x3_t val)// VST3.16 {d0, d2, d4}, [r0]
11232*80a68eefSBob Badour _NEON2SSE_INLINE void vst3q_u16_ptr(__transfersize(24) uint16_t * ptr, uint16x8x3_t* val)
11233*80a68eefSBob Badour {
11234*80a68eefSBob Badour     uint16x8x3_t v;
11235*80a68eefSBob Badour     __m128i v0,v1,v2, cff, bldmask;
11236*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 static const uint8_t mask0[16]   = {0,1, 2,3, 0xff,0xff, 4,5, 6,7,0xff,0xff, 8,9,10,11};
11237*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 static const uint8_t mask1[16]   = {0xff, 0xff, 0,1, 2,3, 0xff,0xff, 4,5, 6,7, 0xff,0xff, 8,9};
11238*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 static const uint8_t mask2[16] =    {6,7,0xff,0xff, 8,9,10,11, 0xff, 0xff, 12,13,14,15, 0xff, 0xff};
11239*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 static const uint8_t mask2lo[16] = {0xff,0xff, 0xff,0xff, 0,1, 0xff,0xff, 0xff,0xff, 2,3, 0xff,0xff, 0xff,0xff};
11240*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 static const uint8_t mask2med[16] = {4,5, 0xff,0xff,0xff,0xff, 6,7, 0xff, 0xff,0xff,0xff, 8,9, 0xff, 0xff};
11241*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 static const uint8_t mask2hi[16] = {0xff, 0xff, 10,11, 0xff, 0xff, 0xff, 0xff, 12,13, 0xff, 0xff, 0xff, 0xff,14,15};
11242*80a68eefSBob Badour 
11243*80a68eefSBob Badour     v0 =  _mm_unpacklo_epi16(val->val[0], val->val[1]); //0,1, 3,4, 6,7, 9,10
11244*80a68eefSBob Badour     v2 =  _mm_unpackhi_epi16(val->val[0], val->val[1]); //12,13, 15,16, 18,19, 21,22,
11245*80a68eefSBob Badour     v1 =  _mm_alignr_epi8(v2, v0, 12); //9,10, 12,13, 15,16, 18,19
11246*80a68eefSBob Badour     v.val[0] =  _mm_shuffle_epi8(v0, *(__m128i*)mask0); //make holes for the v.val[2] data embedding
11247*80a68eefSBob Badour     v.val[2] =  _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2lo); //make plugs for the v.val[2] data embedding
11248*80a68eefSBob Badour     cff = _mm_cmpeq_epi16(v0, v0); //all ff
11249*80a68eefSBob Badour     bldmask = _mm_cmpeq_epi16(*(__m128i*)mask0, cff);
11250*80a68eefSBob Badour     v.val[0] = _MM_BLENDV_EPI8(v.val[0], v.val[2], bldmask);
11251*80a68eefSBob Badour     vst1q_u16(ptr,      v.val[0]);
11252*80a68eefSBob Badour     v.val[0] =  _mm_shuffle_epi8(v1, *(__m128i*)mask1); //make holes for the v.val[2] data embedding
11253*80a68eefSBob Badour     v.val[2] =  _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2med); //make plugs for the v.val[2] data embedding
11254*80a68eefSBob Badour     bldmask = _mm_cmpeq_epi16(*(__m128i*)mask1, cff);
11255*80a68eefSBob Badour     v.val[1] = _MM_BLENDV_EPI8(v.val[0],v.val[2], bldmask);
11256*80a68eefSBob Badour     vst1q_u16((ptr + 8),  v.val[1]);
11257*80a68eefSBob Badour     v.val[0] =  _mm_shuffle_epi8(v2, *(__m128i*)mask2); //make holes for the v.val[2] data embedding
11258*80a68eefSBob Badour     v.val[2] =  _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2hi); //make plugs for the v.val[2] data embedding
11259*80a68eefSBob Badour     bldmask = _mm_cmpeq_epi16(*(__m128i*)mask2, cff);
11260*80a68eefSBob Badour     v.val[2] = _MM_BLENDV_EPI8(v.val[0],v.val[2], bldmask );
11261*80a68eefSBob Badour     vst1q_u16((ptr + 16), v.val[2]);
11262*80a68eefSBob Badour }
11263*80a68eefSBob Badour #define vst3q_u16(ptr, val) vst3q_u16_ptr(ptr, &val)
11264*80a68eefSBob Badour 
11265*80a68eefSBob Badour //void vst3q_u32(__transfersize(12) uint32_t * ptr, uint32x4x3_t val)// VST3.32 {d0, d2, d4}, [r0]
11266*80a68eefSBob Badour _NEON2SSE_INLINE void vst3q_u32_ptr(__transfersize(12) uint32_t * ptr, uint32x4x3_t* val)
11267*80a68eefSBob Badour {
11268*80a68eefSBob Badour     //a0,a1,a2,a3,  b0,b1,b2,b3, c0,c1,c2,c3 -> a0,b0,c0,a1, b1,c1,a2,b2, c2,a3,b3,c3
11269*80a68eefSBob Badour     uint32x4x3_t v;
11270*80a68eefSBob Badour     __m128i tmp0, tmp1,tmp2;
11271*80a68eefSBob Badour     tmp0 = _mm_unpacklo_epi32(val->val[0], val->val[1]); //a0,b0,a1,b1
11272*80a68eefSBob Badour     tmp1 = _mm_unpackhi_epi32(val->val[0], val->val[1]); //a2,b2,a3,b3
11273*80a68eefSBob Badour     tmp2 = _mm_unpacklo_epi32(val->val[1], val->val[2]); //b0,c0,b1,c1
11274*80a68eefSBob Badour     v.val[1] = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp2),_mm_castsi128_ps(tmp1), _MM_SHUFFLE(1,0,3,2))); //b1,c1,a2,b2,
11275*80a68eefSBob Badour     v.val[2] = _mm_unpackhi_epi64(tmp1, val->val[2]); //a3,b3, c2,c3
11276*80a68eefSBob Badour     v.val[2] = _mm_shuffle_epi32(v.val[2], 2 | (0 << 2) | (1 << 4) | (3 << 6)); //c2,a3,b3,c3
11277*80a68eefSBob Badour     tmp1 = _mm_unpacklo_epi32(tmp2,val->val[0]); //b0,a0,c0,a1
11278*80a68eefSBob Badour     v.val[0] = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp0),_mm_castsi128_ps(tmp1), _MM_SHUFFLE(3,2,1,0))); //a0,b0,c0,a1,
11279*80a68eefSBob Badour 
11280*80a68eefSBob Badour     vst1q_u32(ptr,      v.val[0]);
11281*80a68eefSBob Badour     vst1q_u32((ptr + 4),  v.val[1]);
11282*80a68eefSBob Badour     vst1q_u32((ptr + 8),  v.val[2]);
11283*80a68eefSBob Badour }
11284*80a68eefSBob Badour #define vst3q_u32(ptr, val) vst3q_u32_ptr(ptr, &val)
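
//Illustrative usage sketch (hypothetical helper name, not part of the original port): vst3q_u32
//interleaves three registers, e.g. three separate planes, into one array of 3-element structures.
_NEON2SSE_INLINE void example_vst3q_u32_usage(__transfersize(12) uint32_t * ptr)
{
    _NEON2SSE_ALIGN_16 uint32_t a[4] = {0, 3, 6, 9};
    _NEON2SSE_ALIGN_16 uint32_t b[4] = {1, 4, 7, 10};
    _NEON2SSE_ALIGN_16 uint32_t c[4] = {2, 5, 8, 11};
    uint32x4x3_t v;
    v.val[0] = vld1q_u32(a);
    v.val[1] = vld1q_u32(b);
    v.val[2] = vld1q_u32(c);
    vst3q_u32(ptr, v); //ptr receives 0,1,2,...,11 - a,b,c elements interleaved three-way
}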
11285*80a68eefSBob Badour 
11286*80a68eefSBob Badour //void vst3q_s8(__transfersize(48) int8_t * ptr, int8x16x3_t val);
11287*80a68eefSBob Badour _NEON2SSESTORAGE void vst3q_s8_ptr(__transfersize(48) int8_t * ptr, int8x16x3_t * val);
11288*80a68eefSBob Badour #define vst3q_s8(ptr, val) vst3q_u8((uint8_t*)(ptr), val)
11289*80a68eefSBob Badour 
11290*80a68eefSBob Badour //void vst3q_s16(__transfersize(24) int16_t * ptr, int16x8x3_t val);
11291*80a68eefSBob Badour _NEON2SSESTORAGE void vst3q_s16_ptr(__transfersize(24) int16_t * ptr, int16x8x3_t * val);
11292*80a68eefSBob Badour #define vst3q_s16(ptr, val) vst3q_u16((uint16_t*)(ptr), val)
11293*80a68eefSBob Badour 
11294*80a68eefSBob Badour //void vst3q_s32(__transfersize(12) int32_t * ptr, int32x4x3_t val);
11295*80a68eefSBob Badour _NEON2SSESTORAGE void vst3q_s32_ptr(__transfersize(12) int32_t * ptr, int32x4x3_t * val);
11296*80a68eefSBob Badour #define vst3q_s32(ptr, val)  vst3q_u32((uint32_t*)(ptr), val)
11297*80a68eefSBob Badour 
11298*80a68eefSBob Badour //void vst3q_f16(__transfersize(24) __fp16 * ptr, float16x8x3_t val);// VST3.16 {d0, d2, d4}, [r0]
11299*80a68eefSBob Badour _NEON2SSESTORAGE void vst3q_f16_ptr(__transfersize(24) __fp16 * ptr, float16x8x3_t * val);
11300*80a68eefSBob Badour // IA32 SIMD doesn't work with 16bit floats currently
11301*80a68eefSBob Badour 
11302*80a68eefSBob Badour //void vst3q_f32(__transfersize(12) float32_t * ptr, float32x4x3_t val)// VST3.32 {d0, d2, d4}, [r0]
11303*80a68eefSBob Badour _NEON2SSE_INLINE void vst3q_f32_ptr(__transfersize(12) float32_t * ptr, float32x4x3_t* val)
11304*80a68eefSBob Badour {
11305*80a68eefSBob Badour     float32x4x3_t v;
11306*80a68eefSBob Badour     __m128 tmp0, tmp1,tmp2;
11307*80a68eefSBob Badour     tmp0 = _mm_unpacklo_ps(val->val[0], val->val[1]); //a0,b0,a1,b1
11308*80a68eefSBob Badour     tmp1 = _mm_unpackhi_ps(val->val[0], val->val[1]); //a2,b2,a3,b3
11309*80a68eefSBob Badour     tmp2 = _mm_unpacklo_ps(val->val[1], val->val[2]); //b0,c0,b1,c1
11310*80a68eefSBob Badour     v.val[1] = _mm_shuffle_ps(tmp2,tmp1, _MM_SHUFFLE(1,0,3,2)); //b1,c1,a2,b2,
11311*80a68eefSBob Badour     v.val[2] = _mm_movehl_ps(val->val[2],tmp1); //a3,b3, c2,c3
11312*80a68eefSBob Badour     v.val[2] = _mm_shuffle_ps(v.val[2],v.val[2], _MM_SHUFFLE(3,1,0,2)); //c2,a3,b3,c3
11313*80a68eefSBob Badour     tmp1 = _mm_unpacklo_ps(tmp2,val->val[0]); //b0,a0,c0,a1
11314*80a68eefSBob Badour     v.val[0] = _mm_shuffle_ps(tmp0,tmp1, _MM_SHUFFLE(3,2,1,0)); //a0,b0,c0,a1,
11315*80a68eefSBob Badour 
11316*80a68eefSBob Badour     vst1q_f32( ptr,    v.val[0]);
11317*80a68eefSBob Badour     vst1q_f32( (ptr + 4),  v.val[1]);
11318*80a68eefSBob Badour     vst1q_f32( (ptr + 8),  v.val[2]);
11319*80a68eefSBob Badour }
11320*80a68eefSBob Badour #define vst3q_f32(ptr, val) vst3q_f32_ptr(ptr, &val)
11321*80a68eefSBob Badour 
11322*80a68eefSBob Badour //void vst3q_p8(__transfersize(48) poly8_t * ptr, poly8x16x3_t val);// VST3.8 {d0, d2, d4}, [r0]
11323*80a68eefSBob Badour _NEON2SSESTORAGE void vst3q_p8_ptr(__transfersize(48) poly8_t * ptr, poly8x16x3_t * val);
11324*80a68eefSBob Badour #define vst3q_p8 vst3q_u8
11325*80a68eefSBob Badour 
11326*80a68eefSBob Badour //void vst3q_p16(__transfersize(24) poly16_t * ptr, poly16x8x3_t val);// VST3.16 {d0, d2, d4}, [r0]
11327*80a68eefSBob Badour _NEON2SSESTORAGE void vst3q_p16_ptr(__transfersize(24) poly16_t * ptr, poly16x8x3_t * val);
11328*80a68eefSBob Badour #define vst3q_p16 vst3q_u16
11329*80a68eefSBob Badour 
11330*80a68eefSBob Badour _NEON2SSESTORAGE void vst3_u8(__transfersize(24) uint8_t * ptr, uint8x8x3_t val);// VST3.8 {d0, d1, d2}, [r0]
11331*80a68eefSBob Badour _NEON2SSE_INLINE void vst3_u8(__transfersize(24) uint8_t * ptr, uint8x8x3_t val)
11332*80a68eefSBob Badour {
11333*80a68eefSBob Badour     __m128i tmp, sh0, sh1, val0, val2;
11334*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 static const int8_t mask0[16] = { 0, 8, 16, 1, 9, 17, 2, 10, 18, 3, 11, 19, 4, 12, 20, 5};
11335*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 static const int8_t mask1[16] = {13, 21, 6, 14, 22, 7, 15, 23, 0,0,0,0,0,0,0,0};
11336*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 static const uint8_t mask0_sel[16] = {0, 0, 0xff, 0, 0, 0xff, 0, 0, 0xff, 0, 0, 0xff, 0, 0, 0xff, 0};
11337*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 static const uint8_t mask1_sel[16] = {0, 0xff, 0, 0, 0xff, 0, 0, 0xff, 0,0,0,0,0,0,0,0};
11338*80a68eefSBob Badour     tmp = _mm_unpacklo_epi64(_pM128i(val.val[0]), _pM128i(val.val[1]) );
11339*80a68eefSBob Badour     sh0 =  _mm_shuffle_epi8(tmp, *(__m128i*)mask0); //for bi>15 bi is wrapped (bi-=16)
11340*80a68eefSBob Badour     val2 = _pM128i(val.val[2]);
11341*80a68eefSBob Badour     sh1 =  _mm_shuffle_epi8(val2, *(__m128i*)mask0);
11342*80a68eefSBob Badour     val0 = _MM_BLENDV_EPI8(sh0, sh1, *(__m128i*)mask0_sel);
11343*80a68eefSBob Badour     vst1q_u8(ptr,   val0); //store as 128 bit structure
11344*80a68eefSBob Badour     sh0 =  _mm_shuffle_epi8(tmp, *(__m128i*)mask1); //for bi>15 bi is wrapped (bi-=16)
11345*80a68eefSBob Badour     sh1 =  _mm_shuffle_epi8(val2, *(__m128i*)mask1);
11346*80a68eefSBob Badour     val2 = _MM_BLENDV_EPI8(sh0, sh1, *(__m128i*)mask1_sel);
11347*80a68eefSBob Badour     _M64((*(__m64_128*)(ptr + 16)),  val2); //store only the remaining 8 bytes to avoid writing past the 24-byte *ptr destination
11348*80a68eefSBob Badour }
11349*80a68eefSBob Badour 
11350*80a68eefSBob Badour _NEON2SSESTORAGE void vst3_u16(__transfersize(12) uint16_t * ptr, uint16x4x3_t val);// VST3.16 {d0, d1, d2}, [r0]
11351*80a68eefSBob Badour _NEON2SSE_INLINE void vst3_u16(__transfersize(12) uint16_t * ptr, uint16x4x3_t val)
11352*80a68eefSBob Badour {
11353*80a68eefSBob Badour     __m128i tmp, val0, val1, val2;
11354*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 static const int8_t mask0[16] = {0,1, 8,9, 16,17, 2,3, 10,11, 18,19, 4,5, 12,13};
11355*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 static const int8_t mask1[16] = {20,21, 6,7, 14,15, 22,23,   0,0,0,0,0,0,0,0};
11356*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 static const uint16_t mask0f[8] = {0xffff, 0xffff, 0, 0xffff, 0xffff, 0, 0xffff, 0xffff}; //where all ones we take the result from val0 (a/b data), otherwise from val1 (c data)
11357*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 static const uint16_t mask1f[8] = {0xffff, 0, 0, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}; //where all ones we take the result from val1 (c data), otherwise from val0 (a/b data)
11358*80a68eefSBob Badour     tmp = _mm_unpacklo_epi64(_pM128i(val.val[0]), _pM128i(val.val[1]));
11359*80a68eefSBob Badour     val0 = _mm_shuffle_epi8(tmp, *(__m128i*)mask0);
11360*80a68eefSBob Badour     val2 = _pM128i(val.val[2]);
11361*80a68eefSBob Badour     val1 = _mm_shuffle_epi8(val2, *(__m128i*)mask0);
11362*80a68eefSBob Badour     val0 = _MM_BLENDV_EPI8(val1, val0, *(__m128i*)mask0f);
11363*80a68eefSBob Badour     vst1q_u16(ptr,    val0); //store as 128 bit structure
11364*80a68eefSBob Badour     val0 = _mm_shuffle_epi8(tmp, *(__m128i*)mask1);
11365*80a68eefSBob Badour     val1 = _mm_shuffle_epi8(val2, *(__m128i*)mask1);
11366*80a68eefSBob Badour     val1 = _MM_BLENDV_EPI8(val0, val1,  *(__m128i*)mask1f); //change the operands order
11367*80a68eefSBob Badour     _M64((*(__m64_128*)(ptr + 8)),  val1); //store only the remaining 8 bytes to avoid writing past the 24-byte *ptr destination
11368*80a68eefSBob Badour }
11369*80a68eefSBob Badour 
11370*80a68eefSBob Badour _NEON2SSESTORAGE void vst3_u32(__transfersize(6) uint32_t * ptr, uint32x2x3_t val);// VST3.32 {d0, d1, d2}, [r0]
11371*80a68eefSBob Badour _NEON2SSE_INLINE void vst3_u32(__transfersize(6) uint32_t * ptr, uint32x2x3_t val)
11372*80a68eefSBob Badour {
11373*80a68eefSBob Badour     //val.val[0] holds result elements 0,3; val.val[1] holds 1,4; val.val[2] holds 2,5
11374*80a68eefSBob Badour     __m128i val0, val1;
11375*80a68eefSBob Badour     val0 = _mm_unpacklo_epi64(_pM128i(val.val[1]), _pM128i(val.val[2])); //val[0]: 1,4,2,5
11376*80a68eefSBob Badour     val0 = _mm_shuffle_epi32(val0, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //1,2,4,5
11377*80a68eefSBob Badour     val1 = _mm_srli_si128(val0, 8); //4,5, x,x
11378*80a68eefSBob Badour     _M64((*(__m64_128*)(ptr + 4)),  val1);
11379*80a68eefSBob Badour     val0 = _mm_unpacklo_epi32(_pM128i(val.val[0]), val0); //0,1,3,2
11380*80a68eefSBob Badour     val0 = _mm_shuffle_epi32(val0, 0 | (1 << 2) | (3 << 4) | (2 << 6)); //0,1,2, 3
11381*80a68eefSBob Badour     vst1q_u32(ptr, val0); //store as 128 bit structure
11382*80a68eefSBob Badour }
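
//Illustrative usage sketch (hypothetical helper name) for the 64-bit vst3 above: six values are
//written as one 16-byte store followed by one 8-byte store, 24 bytes in total.
_NEON2SSE_INLINE void example_vst3_u32_usage(__transfersize(6) uint32_t * ptr)
{
    _NEON2SSE_ALIGN_16 uint32_t a[2] = {0, 3};
    _NEON2SSE_ALIGN_16 uint32_t b[2] = {1, 4};
    _NEON2SSE_ALIGN_16 uint32_t c[2] = {2, 5};
    uint32x2x3_t v;
    v.val[0] = vld1_u32(a);
    v.val[1] = vld1_u32(b);
    v.val[2] = vld1_u32(c);
    vst3_u32(ptr, v); //ptr receives 0,1,2,3,4,5
}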
11383*80a68eefSBob Badour 
11384*80a68eefSBob Badour _NEON2SSESTORAGE void vst3_u64(__transfersize(3) uint64_t * ptr, uint64x1x3_t val);// VST1.64 {d0, d1, d2}, [r0]
11385*80a68eefSBob Badour _NEON2SSE_INLINE void vst3_u64(__transfersize(3) uint64_t * ptr, uint64x1x3_t val)
11386*80a68eefSBob Badour {
11387*80a68eefSBob Badour     *(ptr) = val.val[0].m64_u64[0];
11388*80a68eefSBob Badour     *(ptr + 1) = val.val[1].m64_u64[0];
11389*80a68eefSBob Badour     *(ptr + 2) = val.val[2].m64_u64[0];
11390*80a68eefSBob Badour }
11391*80a68eefSBob Badour 
11392*80a68eefSBob Badour _NEON2SSESTORAGE void vst3_s8(__transfersize(24) int8_t * ptr, int8x8x3_t val);  // VST3.8 {d0, d1, d2}, [r0]
11393*80a68eefSBob Badour #define vst3_s8(ptr, val) vst3_u8((uint8_t*)ptr, val)
11394*80a68eefSBob Badour 
11395*80a68eefSBob Badour _NEON2SSESTORAGE void vst3_s16(__transfersize(12) int16_t * ptr, int16x4x3_t val);  // VST3.16 {d0, d1, d2}, [r0]
11396*80a68eefSBob Badour #define vst3_s16(ptr, val) vst3_u16((uint16_t*)ptr, val)
11397*80a68eefSBob Badour 
11398*80a68eefSBob Badour _NEON2SSESTORAGE void vst3_s32(__transfersize(6) int32_t * ptr, int32x2x3_t val); // VST3.32 {d0, d1, d2}, [r0]
11399*80a68eefSBob Badour #define vst3_s32(ptr, val) vst3_u32((uint32_t*)ptr, val)
11400*80a68eefSBob Badour 
11401*80a68eefSBob Badour _NEON2SSESTORAGE void vst3_s64(__transfersize(3) int64_t * ptr, int64x1x3_t val); // VST1.64 {d0, d1, d2}, [r0]
11402*80a68eefSBob Badour #define vst3_s64(ptr, val) vst3_u64((uint64_t*)ptr, val)
11403*80a68eefSBob Badour 
11404*80a68eefSBob Badour //void vst3_f16(__transfersize(12) __fp16 * ptr, float16x4x3_t val);// VST3.16 {d0, d1, d2}, [r0]
11405*80a68eefSBob Badour _NEON2SSESTORAGE void vst3_f16_ptr(__transfersize(12) __fp16 * ptr, float16x4x3_t * val); // VST3.16 {d0, d1, d2}, [r0]
11406*80a68eefSBob Badour // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
11407*80a68eefSBob Badour 
11408*80a68eefSBob Badour _NEON2SSESTORAGE void vst3_f32(__transfersize(6) float32_t * ptr, float32x2x3_t val);// VST3.32 {d0, d1, d2}, [r0]
11409*80a68eefSBob Badour _NEON2SSE_INLINE void vst3_f32(__transfersize(6) float32_t * ptr, float32x2x3_t val)
11410*80a68eefSBob Badour {
11411*80a68eefSBob Badour     //val.val[0] holds result elements 0,3; val.val[1] holds 1,4; val.val[2] holds 2,5 -> stored as 0,1,2,3,4,5
11412*80a68eefSBob Badour     *(ptr) =   val.val[0].m64_f32[0];
11413*80a68eefSBob Badour     *(ptr + 1) = val.val[1].m64_f32[0];
11414*80a68eefSBob Badour     *(ptr + 2) = val.val[2].m64_f32[0];
11415*80a68eefSBob Badour     *(ptr + 3) = val.val[0].m64_f32[1];
11416*80a68eefSBob Badour     *(ptr + 4) = val.val[1].m64_f32[1];
11417*80a68eefSBob Badour     *(ptr + 5) = val.val[2].m64_f32[1];
11418*80a68eefSBob Badour }
11419*80a68eefSBob Badour 
11420*80a68eefSBob Badour _NEON2SSESTORAGE void vst3_p8(__transfersize(24) poly8_t * ptr, poly8x8x3_t val);// VST3.8 {d0, d1, d2}, [r0]
11421*80a68eefSBob Badour #define vst3_p8 vst3_u8
11422*80a68eefSBob Badour 
11423*80a68eefSBob Badour _NEON2SSESTORAGE void vst3_p16(__transfersize(12) poly16_t * ptr, poly16x4x3_t val);// VST3.16 {d0, d1, d2}, [r0]
11424*80a68eefSBob Badour #define vst3_p16 vst3_u16
11425*80a68eefSBob Badour 
11426*80a68eefSBob Badour //***************  Quadruples store ********************************
11427*80a68eefSBob Badour //*********************************************************************
11428*80a68eefSBob Badour //void vst4q_u8(__transfersize(64) uint8_t * ptr, uint8x16x4_t val)// VST4.8 {d0, d2, d4, d6}, [r0]
11429*80a68eefSBob Badour _NEON2SSE_INLINE void vst4q_u8_ptr(__transfersize(64) uint8_t * ptr, uint8x16x4_t* val)
11430*80a68eefSBob Badour {
11431*80a68eefSBob Badour     __m128i tmp1, tmp2, res;
11432*80a68eefSBob Badour     tmp1 = _mm_unpacklo_epi8(val->val[0], val->val[1]); //  0,1, 4,5, 8,9, 12,13, 16,17, 20,21, 24,25, 28,29
11433*80a68eefSBob Badour     tmp2 = _mm_unpacklo_epi8(val->val[2], val->val[3]); //  2,3, 6,7, 10,11, 14,15, 18,19, 22,23, 26,27, 30,31
11434*80a68eefSBob Badour     res = _mm_unpacklo_epi16(tmp1, tmp2); //0,1, 2,3, 4,5, 6,7, 8,9, 10,11, 12,13, 14,15
11435*80a68eefSBob Badour     vst1q_u8(ptr,  res);
11436*80a68eefSBob Badour     res = _mm_unpackhi_epi16(tmp1, tmp2); //16,17, 18,19, 20,21, 22,23, 24,25, 26,27, 28,29, 30,31
11437*80a68eefSBob Badour     vst1q_u8((ptr + 16), res);
11438*80a68eefSBob Badour     tmp1 = _mm_unpackhi_epi8(val->val[0], val->val[1]); //
11439*80a68eefSBob Badour     tmp2 = _mm_unpackhi_epi8(val->val[2], val->val[3]); //
11440*80a68eefSBob Badour     res = _mm_unpacklo_epi16(tmp1, tmp2); //
11441*80a68eefSBob Badour     vst1q_u8((ptr + 32), res);
11442*80a68eefSBob Badour     res = _mm_unpackhi_epi16(tmp1, tmp2); //
11443*80a68eefSBob Badour     vst1q_u8((ptr + 48), res);
11444*80a68eefSBob Badour }
11445*80a68eefSBob Badour #define vst4q_u8(ptr, val) vst4q_u8_ptr(ptr, &val)
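
//Illustrative usage sketch (hypothetical helper name, not part of the original port): vst4q_u8
//packs four separate byte planes, e.g. R,G,B,A, into interleaved 4-byte structures.
_NEON2SSE_INLINE void example_vst4q_u8_usage(__transfersize(64) uint8_t * ptr)
{
    _NEON2SSE_ALIGN_16 uint8_t r[16];
    _NEON2SSE_ALIGN_16 uint8_t g[16];
    _NEON2SSE_ALIGN_16 uint8_t b[16];
    _NEON2SSE_ALIGN_16 uint8_t a[16];
    uint8x16x4_t v;
    int i;
    for (i = 0; i < 16; i++) {
        r[i] = (uint8_t)(4 * i);
        g[i] = (uint8_t)(4 * i + 1);
        b[i] = (uint8_t)(4 * i + 2);
        a[i] = (uint8_t)(4 * i + 3);
    }
    v.val[0] = vld1q_u8(r);
    v.val[1] = vld1q_u8(g);
    v.val[2] = vld1q_u8(b);
    v.val[3] = vld1q_u8(a);
    vst4q_u8(ptr, v); //ptr receives r0,g0,b0,a0, r1,g1,b1,a1, ... , r15,g15,b15,a15
}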
11446*80a68eefSBob Badour 
11447*80a68eefSBob Badour //void vst4q_u16(__transfersize(32) uint16_t * ptr, uint16x8x4_t val)// VST4.16 {d0, d2, d4, d6}, [r0]
11448*80a68eefSBob Badour _NEON2SSE_INLINE void vst4q_u16_ptr(__transfersize(32) uint16_t * ptr, uint16x8x4_t* val)
11449*80a68eefSBob Badour {
11450*80a68eefSBob Badour     uint16x8x4_t v;
11451*80a68eefSBob Badour     __m128i tmp1, tmp2;
11452*80a68eefSBob Badour     tmp1 = _mm_unpacklo_epi16(val->val[0], val->val[1]); //0,1, 4,5, 8,9, 12,13
11453*80a68eefSBob Badour     tmp2 = _mm_unpacklo_epi16(val->val[2], val->val[3]); //2,3, 6,7 , 10,11, 14,15
11454*80a68eefSBob Badour     v.val[0] = _mm_unpacklo_epi32(tmp1, tmp2);
11455*80a68eefSBob Badour     v.val[1] = _mm_unpackhi_epi32(tmp1, tmp2);
11456*80a68eefSBob Badour     tmp1 = _mm_unpackhi_epi16(val->val[0], val->val[1]); //16,17, 20,21, 24,25, 28,29
11457*80a68eefSBob Badour     tmp2 = _mm_unpackhi_epi16(val->val[2], val->val[3]); //18,19, 22,23, 26,27, 30,31
11458*80a68eefSBob Badour     v.val[2] = _mm_unpacklo_epi32(tmp1, tmp2);
11459*80a68eefSBob Badour     v.val[3] = _mm_unpackhi_epi32(tmp1, tmp2);
11460*80a68eefSBob Badour     vst1q_u16(ptr,     v.val[0]);
11461*80a68eefSBob Badour     vst1q_u16((ptr + 8), v.val[1]);
11462*80a68eefSBob Badour     vst1q_u16((ptr + 16),v.val[2]);
11463*80a68eefSBob Badour     vst1q_u16((ptr + 24), v.val[3]);
11464*80a68eefSBob Badour }
11465*80a68eefSBob Badour #define vst4q_u16(ptr, val) vst4q_u16_ptr(ptr, &val)
11466*80a68eefSBob Badour 
11467*80a68eefSBob Badour //void vst4q_u32(__transfersize(16) uint32_t * ptr, uint32x4x4_t val)// VST4.32 {d0, d2, d4, d6}, [r0]
11468*80a68eefSBob Badour _NEON2SSE_INLINE void vst4q_u32_ptr(__transfersize(16) uint32_t * ptr, uint32x4x4_t* val)
11469*80a68eefSBob Badour {
11470*80a68eefSBob Badour     uint32x4x4_t v;
11471*80a68eefSBob Badour     __m128i tmp1, tmp2;
11472*80a68eefSBob Badour     tmp1 = _mm_unpacklo_epi32(val->val[0], val->val[1]); //0,1, 4,5
11473*80a68eefSBob Badour     tmp2 = _mm_unpacklo_epi32(val->val[2], val->val[3]); //2,3, 6,7
11474*80a68eefSBob Badour     v.val[0] = _mm_unpacklo_epi64(tmp1, tmp2);
11475*80a68eefSBob Badour     v.val[1] = _mm_unpackhi_epi64(tmp1, tmp2);
11476*80a68eefSBob Badour     tmp1 = _mm_unpackhi_epi32(val->val[0], val->val[1]); //8,9, 12,13
11477*80a68eefSBob Badour     tmp2 = _mm_unpackhi_epi32(val->val[2], val->val[3]); //10,11, 14,15
11478*80a68eefSBob Badour     v.val[2] = _mm_unpacklo_epi64(tmp1, tmp2);
11479*80a68eefSBob Badour     v.val[3] = _mm_unpackhi_epi64(tmp1, tmp2);
11480*80a68eefSBob Badour     vst1q_u32(ptr,      v.val[0]);
11481*80a68eefSBob Badour     vst1q_u32((ptr + 4),  v.val[1]);
11482*80a68eefSBob Badour     vst1q_u32((ptr + 8),  v.val[2]);
11483*80a68eefSBob Badour     vst1q_u32((ptr + 12), v.val[3]);
11484*80a68eefSBob Badour }
11485*80a68eefSBob Badour #define vst4q_u32(ptr, val) vst4q_u32_ptr(ptr, &val)
11486*80a68eefSBob Badour 
11487*80a68eefSBob Badour //void vst4q_s8(__transfersize(64) int8_t * ptr, int8x16x4_t val);
11488*80a68eefSBob Badour _NEON2SSESTORAGE void vst4q_s8_ptr(__transfersize(64) int8_t * ptr, int8x16x4_t * val);
11489*80a68eefSBob Badour #define vst4q_s8(ptr, val) vst4q_u8((uint8_t*)(ptr), val)
11490*80a68eefSBob Badour 
11491*80a68eefSBob Badour //void vst4q_s16(__transfersize(32) int16_t * ptr, int16x8x4_t val);
11492*80a68eefSBob Badour _NEON2SSESTORAGE void vst4q_s16_ptr(__transfersize(32) int16_t * ptr, int16x8x4_t * val);
11493*80a68eefSBob Badour #define vst4q_s16(ptr, val) vst4q_u16((uint16_t*)(ptr), val)
11494*80a68eefSBob Badour 
11495*80a68eefSBob Badour //void vst4q_s32(__transfersize(16) int32_t * ptr, int32x4x4_t val);
11496*80a68eefSBob Badour _NEON2SSESTORAGE void vst4q_s32_ptr(__transfersize(16) int32_t * ptr, int32x4x4_t * val);
11497*80a68eefSBob Badour #define vst4q_s32(ptr, val) vst4q_u32((uint32_t*)(ptr), val)
11498*80a68eefSBob Badour 
11499*80a68eefSBob Badour //void vst4q_f16(__transfersize(32) __fp16 * ptr, float16x8x4_t val);// VST4.16 {d0, d2, d4, d6}, [r0]
11500*80a68eefSBob Badour _NEON2SSESTORAGE void vst4q_f16_ptr(__transfersize(32) __fp16 * ptr, float16x8x4_t * val);
11501*80a68eefSBob Badour // IA32 SIMD doesn't work with 16bit floats currently
11502*80a68eefSBob Badour 
11503*80a68eefSBob Badour //void vst4q_f32(__transfersize(16) float32_t * ptr, float32x4x4_t val)// VST4.32 {d0, d2, d4, d6}, [r0]
11504*80a68eefSBob Badour _NEON2SSE_INLINE void vst4q_f32_ptr(__transfersize(16) float32_t * ptr, float32x4x4_t* val)
11505*80a68eefSBob Badour {
11506*80a68eefSBob Badour     __m128 tmp3, tmp2, tmp1, tmp0;
11507*80a68eefSBob Badour     float32x4x4_t v;
11508*80a68eefSBob Badour     tmp0 = _mm_unpacklo_ps(val->val[0], val->val[1]);
11509*80a68eefSBob Badour     tmp2 = _mm_unpacklo_ps(val->val[2], val->val[3]);
11510*80a68eefSBob Badour     tmp1 = _mm_unpackhi_ps(val->val[0], val->val[1]);
11511*80a68eefSBob Badour     tmp3 = _mm_unpackhi_ps(val->val[2], val->val[3]);
11512*80a68eefSBob Badour     v.val[0] = _mm_movelh_ps(tmp0, tmp2);
11513*80a68eefSBob Badour     v.val[1] = _mm_movehl_ps(tmp2, tmp0);
11514*80a68eefSBob Badour     v.val[2] = _mm_movelh_ps(tmp1, tmp3);
11515*80a68eefSBob Badour     v.val[3] = _mm_movehl_ps(tmp3, tmp1);
11516*80a68eefSBob Badour     vst1q_f32(ptr,   v.val[0]);
11517*80a68eefSBob Badour     vst1q_f32((ptr + 4), v.val[1]);
11518*80a68eefSBob Badour     vst1q_f32((ptr + 8), v.val[2]);
11519*80a68eefSBob Badour     vst1q_f32((ptr + 12), v.val[3]);
11520*80a68eefSBob Badour }
11521*80a68eefSBob Badour #define vst4q_f32(ptr, val) vst4q_f32_ptr(ptr, &val)
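//Usage sketch (illustrative only): the unpack/movelh/movehl sequence above is the classic SSE 4x4
//float transpose (cf. _MM_TRANSPOSE4_PS), so a structure-of-arrays float32x4x4_t is stored as
//array-of-structures, element i of val.val[k] ending up at ptr[4*i + k].
//    float32_t out[16];
//    float32x4x4_t m;
//    m.val[0] = vdupq_n_f32(1.0f);
//    m.val[1] = vdupq_n_f32(2.0f);
//    m.val[2] = vdupq_n_f32(3.0f);
//    m.val[3] = vdupq_n_f32(4.0f);
//    vst4q_f32(out, m); //out = {1,2,3,4, 1,2,3,4, 1,2,3,4, 1,2,3,4}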
11522*80a68eefSBob Badour 
11523*80a68eefSBob Badour //void vst4q_p8(__transfersize(64) poly8_t * ptr, poly8x16x4_t val);// VST4.8 {d0, d2, d4, d6}, [r0]
11524*80a68eefSBob Badour _NEON2SSESTORAGE void vst4q_p8_ptr(__transfersize(64) poly8_t * ptr, poly8x16x4_t * val);
11525*80a68eefSBob Badour #define vst4q_p8 vst4q_u8
11526*80a68eefSBob Badour 
11527*80a68eefSBob Badour //void vst4q_p16(__transfersize(32) poly16_t * ptr, poly16x8x4_t val);// VST4.16 {d0, d2, d4, d6}, [r0]
11528*80a68eefSBob Badour _NEON2SSESTORAGE void vst4q_p16_ptr(__transfersize(32) poly16_t * ptr, poly16x8x4_t * val);
11529*80a68eefSBob Badour #define vst4q_p16 vst4q_s16
11530*80a68eefSBob Badour 
11531*80a68eefSBob Badour _NEON2SSESTORAGE void vst4_u8(__transfersize(32) uint8_t * ptr, uint8x8x4_t val);// VST4.8 {d0, d1, d2, d3}, [r0]
11532*80a68eefSBob Badour _NEON2SSE_INLINE void vst4_u8(__transfersize(32) uint8_t * ptr, uint8x8x4_t val)
11533*80a68eefSBob Badour {
11534*80a68eefSBob Badour     __m128i sh0, sh1, val0, val2;
11535*80a68eefSBob Badour     sh0 = _mm_unpacklo_epi8(_pM128i(val.val[0]),_pM128i(val.val[1])); // a0,b0,a1,b1,a2,b2,a3,b3,a4,b4,a5,b5, a6,b6,a7,b7,
11536*80a68eefSBob Badour     sh1 = _mm_unpacklo_epi8(_pM128i(val.val[2]),_pM128i(val.val[3])); // c0,d0,c1,d1,c2,d2,c3,d3, c4,d4,c5,d5,c6,d6,c7,d7
11537*80a68eefSBob Badour     val0 = _mm_unpacklo_epi16(sh0,sh1); // a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,
11538*80a68eefSBob Badour     val2 = _mm_unpackhi_epi16(sh0,sh1); //a4,b4,c4,d4,a5,b5,c5,d5, a6,b6,c6,d6,a7,b7,c7,d7
11539*80a68eefSBob Badour     vst1q_u8(ptr,    val0);
11540*80a68eefSBob Badour     vst1q_u8((ptr + 16),  val2);
11541*80a68eefSBob Badour }
11542*80a68eefSBob Badour 
11543*80a68eefSBob Badour _NEON2SSESTORAGE void vst4_u16(__transfersize(16) uint16_t * ptr, uint16x4x4_t val);// VST4.16 {d0, d1, d2, d3}, [r0]
11544*80a68eefSBob Badour _NEON2SSE_INLINE void vst4_u16(__transfersize(16) uint16_t * ptr, uint16x4x4_t val)
11545*80a68eefSBob Badour {
11546*80a68eefSBob Badour     __m128i sh0, sh1, val0, val2;
11547*80a68eefSBob Badour     sh0 = _mm_unpacklo_epi16(_pM128i(val.val[0]),_pM128i(val.val[1])); //a0,b0,a1,b1,a2,b2,a3,b3
11548*80a68eefSBob Badour     sh1 = _mm_unpacklo_epi16(_pM128i(val.val[2]),_pM128i(val.val[3])); //c0,d0,c1,d1,c2,d2,c3,d3
11549*80a68eefSBob Badour     val0 = _mm_unpacklo_epi32(sh0,sh1); // a0,b0,c0,d0,a1,b1,c1,d1
11550*80a68eefSBob Badour     val2 = _mm_unpackhi_epi32(sh0,sh1); // a2,b2,c2,d2,a3,b3,c3,d3
11551*80a68eefSBob Badour     vst1q_u16(ptr,      val0); //store as 128 bit structure
11552*80a68eefSBob Badour     vst1q_u16((ptr + 8),  val2);
11553*80a68eefSBob Badour }
11554*80a68eefSBob Badour 
11555*80a68eefSBob Badour _NEON2SSESTORAGE void vst4_u32(__transfersize(8) uint32_t * ptr, uint32x2x4_t val);// VST4.32 {d0, d1, d2, d3}, [r0]
11556*80a68eefSBob Badour _NEON2SSE_INLINE void vst4_u32(__transfersize(8) uint32_t * ptr, uint32x2x4_t val)
11557*80a68eefSBob Badour {
11558*80a68eefSBob Badour     //0,4,   1,5,  2,6,  3,7
11559*80a68eefSBob Badour     __m128i sh0, sh1, val0, val1;
11560*80a68eefSBob Badour     sh0 = _mm_unpacklo_epi32(_pM128i(val.val[0]), _pM128i(val.val[1])); //0,1,4,5
11561*80a68eefSBob Badour     sh1 = _mm_unpacklo_epi32(_pM128i(val.val[2]), _pM128i(val.val[3])); //2,3,6,7
11562*80a68eefSBob Badour     val0 = _mm_unpacklo_epi64(sh0,sh1); //
11563*80a68eefSBob Badour     val1 = _mm_unpackhi_epi64(sh0,sh1); //
11564*80a68eefSBob Badour     vst1q_u32(ptr,     val0); //store as 128 bit structure
11565*80a68eefSBob Badour     vst1q_u32((ptr + 4),  val1);
11566*80a68eefSBob Badour }
11567*80a68eefSBob Badour 
11568*80a68eefSBob Badour _NEON2SSESTORAGE void vst4_u64(__transfersize(4) uint64_t * ptr, uint64x1x4_t val);// VST1.64 {d0, d1, d2, d3}, [r0]
11569*80a68eefSBob Badour _NEON2SSE_INLINE void vst4_u64(__transfersize(4) uint64_t * ptr, uint64x1x4_t val)
11570*80a68eefSBob Badour {
11571*80a68eefSBob Badour     *(ptr) =  val.val[0].m64_u64[0];
11572*80a68eefSBob Badour     *(ptr + 1) =  val.val[1].m64_u64[0];
11573*80a68eefSBob Badour     *(ptr + 2) =  val.val[2].m64_u64[0];
11574*80a68eefSBob Badour     *(ptr + 3) =  val.val[3].m64_u64[0];
11575*80a68eefSBob Badour }
11576*80a68eefSBob Badour 
11577*80a68eefSBob Badour //void vst4_s8(__transfersize(32) int8_t * ptr, int8x8x4_t val)  //VST4.8 {d0, d1, d2, d3}, [r0]
11578*80a68eefSBob Badour #define vst4_s8(ptr, val) vst4_u8((uint8_t*)ptr, val)
11579*80a68eefSBob Badour 
11580*80a68eefSBob Badour //void vst4_s16(__transfersize(16) int16_t * ptr, int16x4x4_t val)  // VST4.16 {d0, d1, d2, d3}, [r0]
11581*80a68eefSBob Badour #define vst4_s16(ptr, val) vst4_u16((uint16_t*)ptr, val)
11582*80a68eefSBob Badour 
11583*80a68eefSBob Badour //void vst4_s32(__transfersize(8) int32_t * ptr, int32x2x4_t val) // VST4.32 {d0, d1, d2, d3}, [r0]
11584*80a68eefSBob Badour #define vst4_s32(ptr, val) vst4_u32((uint32_t*)ptr, val)
11585*80a68eefSBob Badour 
11586*80a68eefSBob Badour //void vst4_s64(__transfersize(4) int64_t * ptr, int64x1x4_t val); // VST1.64 {d0, d1, d2, d3}, [r0]
11587*80a68eefSBob Badour _NEON2SSESTORAGE void vst4_s64_ptr(__transfersize(4) int64_t * ptr, int64x1x4_t * val);
11588*80a68eefSBob Badour #define vst4_s64(ptr, val) vst4_u64((uint64_t*)ptr, val)
11589*80a68eefSBob Badour 
11590*80a68eefSBob Badour //void vst4_f16(__transfersize(16) __fp16 * ptr, float16x4x4_t val);// VST4.16 {d0, d1, d2, d3}, [r0]
11591*80a68eefSBob Badour _NEON2SSESTORAGE void vst4_f16_ptr(__transfersize(16) __fp16 * ptr, float16x4x4_t * val);
11592*80a68eefSBob Badour // IA32 SIMD doesn't work with 16-bit floats currently, so we need to convert to 32-bit floats and then work with two 128-bit registers. See vld1q_f16 for an example
11593*80a68eefSBob Badour 
11594*80a68eefSBob Badour _NEON2SSESTORAGE void vst4_f32(__transfersize(8) float32_t * ptr, float32x2x4_t val);// VST4.32 {d0, d1, d2, d3}, [r0]
11595*80a68eefSBob Badour _NEON2SSE_INLINE void vst4_f32(__transfersize(8) float32_t * ptr, float32x2x4_t val)
11596*80a68eefSBob Badour {
11597*80a68eefSBob Badour     //0,4,   1,5,  2,6,  3,7 -> 0,1, 2,3, 4,5, 6,7
11598*80a68eefSBob Badour     *(ptr) =   val.val[0].m64_f32[0];
11599*80a68eefSBob Badour     *(ptr + 1) = val.val[1].m64_f32[0];
11600*80a68eefSBob Badour     *(ptr + 2) = val.val[2].m64_f32[0];
11601*80a68eefSBob Badour     *(ptr + 3) = val.val[3].m64_f32[0];
11602*80a68eefSBob Badour     *(ptr + 4) = val.val[0].m64_f32[1];
11603*80a68eefSBob Badour     *(ptr + 5) = val.val[1].m64_f32[1];
11604*80a68eefSBob Badour     *(ptr + 6) = val.val[2].m64_f32[1];
11605*80a68eefSBob Badour     *(ptr + 7) = val.val[3].m64_f32[1];
11606*80a68eefSBob Badour }
11607*80a68eefSBob Badour 
11608*80a68eefSBob Badour _NEON2SSESTORAGE void vst4_p8(__transfersize(32) poly8_t * ptr, poly8x8x4_t val);// VST4.8 {d0, d1, d2, d3}, [r0]
11609*80a68eefSBob Badour #define vst4_p8 vst4_u8
11610*80a68eefSBob Badour 
11611*80a68eefSBob Badour _NEON2SSESTORAGE void vst4_p16(__transfersize(16) poly16_t * ptr, poly16x4x4_t val);// VST4.16 {d0, d1, d2, d3}, [r0]
11612*80a68eefSBob Badour #define vst4_p16 vst4_u16
11613*80a68eefSBob Badour 
11614*80a68eefSBob Badour //*********** Store a lane of a vector into memory (extract given lane) for a couple of vectors  *********************
11615*80a68eefSBob Badour //********************************************************************************************************************
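//Usage sketch (illustrative only, names and values are hypothetical): a single lane is taken from
//each of the two vectors and the two scalars are stored to consecutive memory locations.
//    uint16_t dst[2];
//    uint16x8x2_t pair;
//    pair.val[0] = vdupq_n_u16(10);
//    pair.val[1] = vdupq_n_u16(20);
//    vst2q_lane_u16(dst, pair, 5); //dst[0] == 10 (lane 5 of val[0]), dst[1] == 20 (lane 5 of val[1])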
11616*80a68eefSBob Badour //void vst2q_lane_u16(__transfersize(2) uint16_t * ptr, uint16x8x2_t val, __constrange(0,7) int lane)// VST2.16 {d0[0], d2[0]}, [r0]
11617*80a68eefSBob Badour _NEON2SSE_INLINE void vst2q_lane_u16_ptr(__transfersize(2) uint16_t * ptr, uint16x8x2_t* val, __constrange(0,7) int lane)
11618*80a68eefSBob Badour {
11619*80a68eefSBob Badour     vst1q_lane_s16(ptr, val->val[0], lane);
11620*80a68eefSBob Badour     vst1q_lane_s16((ptr + 1), val->val[1], lane);
11621*80a68eefSBob Badour }
11622*80a68eefSBob Badour #define vst2q_lane_u16(ptr, val, lane) vst2q_lane_u16_ptr(ptr, &val, lane)
11623*80a68eefSBob Badour 
11624*80a68eefSBob Badour //void vst2q_lane_u32(__transfersize(2) uint32_t * ptr, uint32x4x2_t val, __constrange(0,3) int lane)// VST2.32 {d0[0], d2[0]}, [r0]
11625*80a68eefSBob Badour _NEON2SSE_INLINE void vst2q_lane_u32_ptr(__transfersize(2) uint32_t* ptr, uint32x4x2_t* val, __constrange(0,3) int lane)
11626*80a68eefSBob Badour {
11627*80a68eefSBob Badour     vst1q_lane_u32(ptr, val->val[0], lane);
11628*80a68eefSBob Badour     vst1q_lane_u32((ptr + 1), val->val[1], lane);
11629*80a68eefSBob Badour }
11630*80a68eefSBob Badour #define vst2q_lane_u32(ptr, val, lane) vst2q_lane_u32_ptr(ptr, &val, lane)
11631*80a68eefSBob Badour 
11632*80a68eefSBob Badour //void vst2q_lane_s16(__transfersize(2) int16_t * ptr, int16x8x2_t val, __constrange(0,7) int lane);// VST2.16 {d0[0], d2[0]}, [r0]
11633*80a68eefSBob Badour _NEON2SSESTORAGE void vst2q_lane_s16_ptr(__transfersize(2) int16_t * ptr, int16x8x2_t * val, __constrange(0,7) int lane);
11634*80a68eefSBob Badour #define vst2q_lane_s16(ptr, val, lane) vst2q_lane_u16((uint16_t*)ptr, val, lane)
11635*80a68eefSBob Badour 
11636*80a68eefSBob Badour //void vst2q_lane_s32(__transfersize(2) int32_t * ptr, int32x4x2_t val, __constrange(0,3) int lane);// VST2.32 {d0[0], d2[0]}, [r0]
11637*80a68eefSBob Badour _NEON2SSESTORAGE void vst2q_lane_s32_ptr(__transfersize(2) int32_t * ptr, int32x4x2_t * val, __constrange(0,3) int lane);
11638*80a68eefSBob Badour #define vst2q_lane_s32(ptr, val, lane)  vst2q_lane_u32((uint32_t*)ptr, val, lane)
11639*80a68eefSBob Badour 
11640*80a68eefSBob Badour //void vst2q_lane_f16(__transfersize(2) __fp16 * ptr, float16x8x2_t val, __constrange(0,7) int lane);// VST2.16 {d0[0], d2[0]}, [r0]
11641*80a68eefSBob Badour _NEON2SSESTORAGE void vst2q_lane_f16_ptr(__transfersize(2) __fp16 * ptr, float16x8x2_t * val, __constrange(0,7) int lane);
11642*80a68eefSBob Badour //current IA SIMD doesn't support float16
11643*80a68eefSBob Badour 
11644*80a68eefSBob Badour //void vst2q_lane_f32(__transfersize(2) float32_t * ptr, float32x4x2_t val, __constrange(0,3) int lane)// VST2.32 {d0[0], d2[0]}, [r0]
11645*80a68eefSBob Badour _NEON2SSE_INLINE void vst2q_lane_f32_ptr(__transfersize(2) float32_t* ptr, float32x4x2_t* val, __constrange(0,3) int lane)
11646*80a68eefSBob Badour {
11647*80a68eefSBob Badour     vst1q_lane_f32(ptr, val->val[0], lane);
11648*80a68eefSBob Badour     vst1q_lane_f32((ptr + 1), val->val[1], lane);
11649*80a68eefSBob Badour }
11650*80a68eefSBob Badour #define vst2q_lane_f32(ptr,src,lane) vst2q_lane_f32_ptr(ptr,&src,lane)
11651*80a68eefSBob Badour 
11652*80a68eefSBob Badour //void vst2q_lane_p16(__transfersize(2) poly16_t * ptr, poly16x8x2_t val, __constrange(0,7) int lane);// VST2.16 {d0[0], d2[0]}, [r0]
11653*80a68eefSBob Badour _NEON2SSESTORAGE void vst2q_lane_p16_ptr(__transfersize(2) poly16_t * ptr, poly16x8x2_t * val, __constrange(0,7) int lane);
11654*80a68eefSBob Badour #define vst2q_lane_p16 vst2q_lane_s16
11655*80a68eefSBob Badour 
11656*80a68eefSBob Badour _NEON2SSESTORAGE void vst2_lane_u8(__transfersize(2) uint8_t * ptr, uint8x8x2_t val, __constrange(0,7) int lane);// VST2.8 {d0[0], d1[0]}, [r0]
11657*80a68eefSBob Badour _NEON2SSE_INLINE void vst2_lane_u8(__transfersize(2) uint8_t * ptr, uint8x8x2_t val, __constrange(0,7) int lane) // VST2.8 {d0[0], d1[0]}, [r0]
11658*80a68eefSBob Badour {
11659*80a68eefSBob Badour     *(ptr) = val.val[0].m64_u8[lane];
11660*80a68eefSBob Badour     *(ptr + 1) = val.val[1].m64_u8[lane];
11661*80a68eefSBob Badour }
11662*80a68eefSBob Badour 
11663*80a68eefSBob Badour _NEON2SSESTORAGE void vst2_lane_u16(__transfersize(2) uint16_t * ptr, uint16x4x2_t val, __constrange(0,3) int lane);// VST2.16 {d0[0], d1[0]}, [r0]
11664*80a68eefSBob Badour _NEON2SSE_INLINE void vst2_lane_u16(__transfersize(2) uint16_t * ptr, uint16x4x2_t val, __constrange(0,3) int lane)
11665*80a68eefSBob Badour {
11666*80a68eefSBob Badour     *(ptr) = val.val[0].m64_u16[lane];
11667*80a68eefSBob Badour     *(ptr + 1) = val.val[1].m64_u16[lane];
11668*80a68eefSBob Badour }
11669*80a68eefSBob Badour 
11670*80a68eefSBob Badour _NEON2SSESTORAGE void vst2_lane_u32(__transfersize(2) uint32_t * ptr, uint32x2x2_t val, __constrange(0,1) int lane);// VST2.32 {d0[0], d1[0]}, [r0]
11671*80a68eefSBob Badour _NEON2SSE_INLINE void vst2_lane_u32(__transfersize(2) uint32_t * ptr, uint32x2x2_t val, __constrange(0,1) int lane)
11672*80a68eefSBob Badour {
11673*80a68eefSBob Badour     *(ptr) = val.val[0].m64_u32[lane];
11674*80a68eefSBob Badour     *(ptr + 1) = val.val[1].m64_u32[lane];
11675*80a68eefSBob Badour }
11676*80a68eefSBob Badour 
11677*80a68eefSBob Badour _NEON2SSESTORAGE void vst2_lane_s8(__transfersize(2) int8_t * ptr, int8x8x2_t val, __constrange(0,7) int lane);// VST2.8 {d0[0], d1[0]}, [r0]
11678*80a68eefSBob Badour #define vst2_lane_s8(ptr, val, lane)  vst2_lane_u8((uint8_t*)ptr, val, lane)
11679*80a68eefSBob Badour 
11680*80a68eefSBob Badour _NEON2SSESTORAGE void vst2_lane_s16(__transfersize(2) int16_t * ptr, int16x4x2_t val, __constrange(0,3) int lane);// VST2.16 {d0[0], d1[0]}, [r0]
11681*80a68eefSBob Badour #define vst2_lane_s16(ptr, val, lane)  vst2_lane_u16((uint16_t*)ptr, val, lane)
11682*80a68eefSBob Badour 
11683*80a68eefSBob Badour _NEON2SSESTORAGE void vst2_lane_s32(__transfersize(2) int32_t * ptr, int32x2x2_t val, __constrange(0,1) int lane);// VST2.32 {d0[0], d1[0]}, [r0]
11684*80a68eefSBob Badour #define vst2_lane_s32(ptr, val, lane)  vst2_lane_u32((uint32_t*)ptr, val, lane)
11685*80a68eefSBob Badour 
11686*80a68eefSBob Badour //void vst2_lane_f16(__transfersize(2) __fp16 * ptr, float16x4x2_t val, __constrange(0,3) int lane); // VST2.16 {d0[0], d1[0]}, [r0]
11687*80a68eefSBob Badour //current IA SIMD doesn't support float16
11688*80a68eefSBob Badour 
11689*80a68eefSBob Badour _NEON2SSESTORAGE void vst2_lane_f32(__transfersize(2) float32_t * ptr, float32x2x2_t val, __constrange(0,1) int lane); // VST2.32 {d0[0], d1[0]}, [r0]
11690*80a68eefSBob Badour _NEON2SSE_INLINE void vst2_lane_f32(__transfersize(2) float32_t * ptr, float32x2x2_t val, __constrange(0,1) int lane)
11691*80a68eefSBob Badour {
11692*80a68eefSBob Badour     *(ptr) = val.val[0].m64_f32[lane];
11693*80a68eefSBob Badour     *(ptr + 1) = val.val[1].m64_f32[lane];
11694*80a68eefSBob Badour }
11695*80a68eefSBob Badour 
11696*80a68eefSBob Badour _NEON2SSESTORAGE void vst2_lane_p8(__transfersize(2) poly8_t * ptr, poly8x8x2_t val, __constrange(0,7) int lane);// VST2.8 {d0[0], d1[0]}, [r0]
11697*80a68eefSBob Badour #define vst2_lane_p8 vst2_lane_u8
11698*80a68eefSBob Badour 
11699*80a68eefSBob Badour _NEON2SSESTORAGE void vst2_lane_p16(__transfersize(2) poly16_t * ptr, poly16x4x2_t val, __constrange(0,3) int lane);// VST2.16 {d0[0], d1[0]}, [r0]
11700*80a68eefSBob Badour #define vst2_lane_p16 vst2_lane_u16
11701*80a68eefSBob Badour 
11702*80a68eefSBob Badour //************************* Triple lanes  stores *******************************************************
11703*80a68eefSBob Badour //*******************************************************************************************************
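//Usage sketch (illustrative only, e.g. writing one RGB pixel from three de-interleaved planes):
//    uint8_t rgb[3];
//    uint8x8x3_t px;
//    px.val[0] = vdup_n_u8(255); //R plane
//    px.val[1] = vdup_n_u8(128); //G plane
//    px.val[2] = vdup_n_u8(64);  //B plane
//    vst3_lane_u8(rgb, px, 0); //rgb = {255, 128, 64}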
11704*80a68eefSBob Badour //void vst3q_lane_u16(__transfersize(3) uint16_t * ptr, uint16x8x3_t val, __constrange(0,7) int lane)// VST3.16 {d0[0], d2[0], d4[0]}, [r0]
11705*80a68eefSBob Badour _NEON2SSE_INLINE void vst3q_lane_u16_ptr(__transfersize(3) uint16_t * ptr, uint16x8x3_t* val, __constrange(0,7) int lane)
11706*80a68eefSBob Badour {
11707*80a68eefSBob Badour     vst2q_lane_u16_ptr(ptr, (uint16x8x2_t*)val, lane);
11708*80a68eefSBob Badour     vst1q_lane_u16((ptr + 2), val->val[2], lane);
11709*80a68eefSBob Badour }
11710*80a68eefSBob Badour #define vst3q_lane_u16(ptr, val, lane) vst3q_lane_u16_ptr(ptr, &val, lane)
11711*80a68eefSBob Badour 
11712*80a68eefSBob Badour //void vst3q_lane_u32(__transfersize(3) uint32_t * ptr, uint32x4x3_t val, __constrange(0,3) int lane)// VST3.32 {d0[0], d2[0], d4[0]}, [r0]
11713*80a68eefSBob Badour _NEON2SSE_INLINE void vst3q_lane_u32_ptr(__transfersize(3) uint32_t * ptr, uint32x4x3_t* val, __constrange(0,3) int lane)
11714*80a68eefSBob Badour {
11715*80a68eefSBob Badour     vst2q_lane_u32_ptr(ptr, (uint32x4x2_t*)val, lane);
11716*80a68eefSBob Badour     vst1q_lane_u32((ptr + 2), val->val[2], lane);
11717*80a68eefSBob Badour }
11718*80a68eefSBob Badour #define vst3q_lane_u32(ptr, val, lane) vst3q_lane_u32_ptr(ptr, &val, lane)
11719*80a68eefSBob Badour 
11720*80a68eefSBob Badour //void vst3q_lane_s16(__transfersize(3) int16_t * ptr, int16x8x3_t val, __constrange(0,7) int lane);// VST3.16 {d0[0], d2[0], d4[0]}, [r0]
11721*80a68eefSBob Badour _NEON2SSESTORAGE void vst3q_lane_s16_ptr(__transfersize(3) int16_t * ptr, int16x8x3_t * val, __constrange(0,7) int lane);
11722*80a68eefSBob Badour #define vst3q_lane_s16(ptr, val, lane) vst3q_lane_u16((uint16_t *)ptr, val, lane)
11723*80a68eefSBob Badour 
11724*80a68eefSBob Badour //void vst3q_lane_s32(__transfersize(3) int32_t * ptr, int32x4x3_t val, __constrange(0,3) int lane);// VST3.32 {d0[0], d2[0], d4[0]}, [r0]
11725*80a68eefSBob Badour _NEON2SSESTORAGE void vst3q_lane_s32_ptr(__transfersize(3) int32_t * ptr, int32x4x3_t * val, __constrange(0,3) int lane);
11726*80a68eefSBob Badour #define vst3q_lane_s32(ptr, val, lane) vst3q_lane_u32((uint32_t *)ptr, val, lane)
11727*80a68eefSBob Badour 
11728*80a68eefSBob Badour //void vst3q_lane_f16(__transfersize(3) __fp16 * ptr, float16x8x3_t val, __constrange(0,7) int lane);// VST3.16 {d0[0], d2[0], d4[0]}, [r0]
11729*80a68eefSBob Badour _NEON2SSESTORAGE void vst3q_lane_f16_ptr(__transfersize(3) __fp16 * ptr, float16x8x3_t * val, __constrange(0,7) int lane);
11730*80a68eefSBob Badour //current IA SIMD doesn't support float16
11731*80a68eefSBob Badour 
11732*80a68eefSBob Badour //void vst3q_lane_f32(__transfersize(3) float32_t * ptr, float32x4x3_t val, __constrange(0,3) int lane)// VST3.32 {d0[0], d2[0], d4[0]}, [r0]
11733*80a68eefSBob Badour _NEON2SSE_INLINE void vst3q_lane_f32_ptr(__transfersize(3) float32_t * ptr, float32x4x3_t* val, __constrange(0,3) int lane)
11734*80a68eefSBob Badour {
11735*80a68eefSBob Badour     vst1q_lane_f32(ptr,   val->val[0], lane);
11736*80a68eefSBob Badour     vst1q_lane_f32((ptr + 1),   val->val[1], lane);
11737*80a68eefSBob Badour     vst1q_lane_f32((ptr + 2), val->val[2], lane);
11738*80a68eefSBob Badour }
11739*80a68eefSBob Badour #define vst3q_lane_f32(ptr,val,lane) vst3q_lane_f32_ptr(ptr,&val,lane)
11740*80a68eefSBob Badour 
11741*80a68eefSBob Badour //void vst3q_lane_p16(__transfersize(3) poly16_t * ptr, poly16x8x3_t val, __constrange(0,7) int lane);// VST3.16 {d0[0], d2[0], d4[0]}, [r0]
11742*80a68eefSBob Badour _NEON2SSESTORAGE void vst3q_lane_p16_ptr(__transfersize(3) poly16_t * ptr, poly16x8x3_t * val, __constrange(0,7) int lane);
11743*80a68eefSBob Badour #define vst3q_lane_p16 vst3q_lane_s16
11744*80a68eefSBob Badour 
11745*80a68eefSBob Badour _NEON2SSESTORAGE void vst3_lane_u8(__transfersize(3) uint8_t * ptr, uint8x8x3_t val, __constrange(0,7) int lane);// VST3.8 {d0[0], d1[0], d2[0]}, [r0]
11746*80a68eefSBob Badour _NEON2SSE_INLINE void vst3_lane_u8(__transfersize(3) uint8_t * ptr, uint8x8x3_t val, __constrange(0,7) int lane)
11747*80a68eefSBob Badour {
11748*80a68eefSBob Badour     *(ptr) =     val.val[0].m64_u8[lane];
11749*80a68eefSBob Badour     *(ptr + 1) = val.val[1].m64_u8[lane];
11750*80a68eefSBob Badour     *(ptr + 2) = val.val[2].m64_u8[lane];
11751*80a68eefSBob Badour }
11752*80a68eefSBob Badour 
11753*80a68eefSBob Badour _NEON2SSESTORAGE void vst3_lane_u16(__transfersize(3) uint16_t * ptr, uint16x4x3_t val, __constrange(0,3) int lane);// VST3.16 {d0[0], d1[0], d2[0]}, [r0]
11754*80a68eefSBob Badour _NEON2SSE_INLINE void vst3_lane_u16(__transfersize(3) uint16_t * ptr, uint16x4x3_t val, __constrange(0,3) int lane)
11755*80a68eefSBob Badour {
11756*80a68eefSBob Badour     *(ptr) =     val.val[0].m64_u16[lane];
11757*80a68eefSBob Badour     *(ptr + 1) = val.val[1].m64_u16[lane];
11758*80a68eefSBob Badour     *(ptr + 2) = val.val[2].m64_u16[lane];
11759*80a68eefSBob Badour }
11760*80a68eefSBob Badour 
11761*80a68eefSBob Badour _NEON2SSESTORAGE void vst3_lane_u32(__transfersize(3) uint32_t * ptr, uint32x2x3_t val, __constrange(0,1) int lane);// VST3.32 {d0[0], d1[0], d2[0]}, [r0]
11762*80a68eefSBob Badour _NEON2SSE_INLINE void vst3_lane_u32(__transfersize(3) uint32_t * ptr, uint32x2x3_t val, __constrange(0,1) int lane)
11763*80a68eefSBob Badour {
11764*80a68eefSBob Badour     *(ptr) =     val.val[0].m64_u32[lane];
11765*80a68eefSBob Badour     *(ptr + 1) = val.val[1].m64_u32[lane];
11766*80a68eefSBob Badour     *(ptr + 2) = val.val[2].m64_u32[lane];
11767*80a68eefSBob Badour }
11768*80a68eefSBob Badour 
11769*80a68eefSBob Badour _NEON2SSESTORAGE void vst3_lane_s8(__transfersize(3) int8_t * ptr, int8x8x3_t val, __constrange(0,7) int lane);// VST3.8 {d0[0], d1[0], d2[0]}, [r0]
11770*80a68eefSBob Badour #define  vst3_lane_s8(ptr, val, lane) vst3_lane_u8((uint8_t *)ptr, val, lane)
11771*80a68eefSBob Badour 
11772*80a68eefSBob Badour _NEON2SSESTORAGE void vst3_lane_s16(__transfersize(3) int16_t * ptr, int16x4x3_t val, __constrange(0,3) int lane);// VST3.16 {d0[0], d1[0], d2[0]}, [r0]
11773*80a68eefSBob Badour #define vst3_lane_s16(ptr, val, lane) vst3_lane_u16((uint16_t *)ptr, val, lane)
11774*80a68eefSBob Badour 
11775*80a68eefSBob Badour _NEON2SSESTORAGE void vst3_lane_s32(__transfersize(3) int32_t * ptr, int32x2x3_t val, __constrange(0,1) int lane);// VST3.32 {d0[0], d1[0], d2[0]}, [r0]
11776*80a68eefSBob Badour #define vst3_lane_s32(ptr, val, lane) vst3_lane_u32((uint32_t *)ptr, val, lane)
11777*80a68eefSBob Badour 
11778*80a68eefSBob Badour //void vst3_lane_f16(__transfersize(3) __fp16 * ptr, float16x4x3_t val, __constrange(0,3) int lane);// VST3.16 {d0[0], d1[0], d2[0]}, [r0]
11779*80a68eefSBob Badour _NEON2SSESTORAGE void vst3_lane_f16_ptr(__transfersize(3) __fp16 * ptr, float16x4x3_t * val, __constrange(0,3) int lane);
11780*80a68eefSBob Badour //current IA SIMD doesn't support float16
11781*80a68eefSBob Badour 
11782*80a68eefSBob Badour _NEON2SSESTORAGE void vst3_lane_f32(__transfersize(3) float32_t * ptr, float32x2x3_t val, __constrange(0,1) int lane);// VST3.32 {d0[0], d1[0], d2[0]}, [r0]
11783*80a68eefSBob Badour _NEON2SSE_INLINE void vst3_lane_f32(__transfersize(3) float32_t * ptr, float32x2x3_t val, __constrange(0,1) int lane)
11784*80a68eefSBob Badour {
11785*80a68eefSBob Badour     *(ptr) = val.val[0].m64_f32[lane];
11786*80a68eefSBob Badour     *(ptr + 1) = val.val[1].m64_f32[lane];
11787*80a68eefSBob Badour     *(ptr + 2) = val.val[2].m64_f32[lane];
11788*80a68eefSBob Badour }
11789*80a68eefSBob Badour 
11790*80a68eefSBob Badour _NEON2SSESTORAGE void vst3_lane_p8(__transfersize(3) poly8_t * ptr, poly8x8x3_t val, __constrange(0,7) int lane);// VST3.8 {d0[0], d1[0], d2[0]}, [r0]
11791*80a68eefSBob Badour #define vst3_lane_p8 vst3_lane_u8
11792*80a68eefSBob Badour 
11793*80a68eefSBob Badour _NEON2SSESTORAGE void vst3_lane_p16(__transfersize(3) poly16_t * ptr, poly16x4x3_t val, __constrange(0,3) int lane);// VST3.16 {d0[0], d1[0], d2[0]}, [r0]
11794*80a68eefSBob Badour #define vst3_lane_p16 vst3_lane_u16
11795*80a68eefSBob Badour 
11796*80a68eefSBob Badour //******************************** Quadruple lanes stores ***********************************************
11797*80a68eefSBob Badour //*******************************************************************************************************
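//Usage sketch (illustrative only, e.g. writing one RGBA pixel from four de-interleaved planes):
//    uint8_t rgba[4];
//    uint8x8x4_t px;
//    px.val[0] = vdup_n_u8(255);
//    px.val[1] = vdup_n_u8(0);
//    px.val[2] = vdup_n_u8(0);
//    px.val[3] = vdup_n_u8(255);
//    vst4_lane_u8(rgba, px, 7); //stores lane 7 of each plane: rgba = {255, 0, 0, 255}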
11798*80a68eefSBob Badour //void vst4q_lane_u16(__transfersize(4) uint16_t * ptr, uint16x8x4_t val, __constrange(0,7) int lane)// VST4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
11799*80a68eefSBob Badour _NEON2SSE_INLINE void vst4q_lane_u16_ptr(__transfersize(4) uint16_t * ptr, uint16x8x4_t* val4, __constrange(0,7) int lane)
11800*80a68eefSBob Badour {
11801*80a68eefSBob Badour     vst2q_lane_u16_ptr(ptr,    (uint16x8x2_t*)val4->val, lane);
11802*80a68eefSBob Badour     vst2q_lane_u16_ptr((ptr + 2),((uint16x8x2_t*)val4->val + 1), lane);
11803*80a68eefSBob Badour }
11804*80a68eefSBob Badour #define vst4q_lane_u16(ptr, val, lane) vst4q_lane_u16_ptr(ptr, &val, lane)
11805*80a68eefSBob Badour 
11806*80a68eefSBob Badour //void vst4q_lane_u32(__transfersize(4) uint32_t * ptr, uint32x4x4_t val, __constrange(0,3) int lane)// VST4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
11807*80a68eefSBob Badour _NEON2SSE_INLINE void vst4q_lane_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x4x4_t* val4, __constrange(0,3) int lane)
11808*80a68eefSBob Badour {
11809*80a68eefSBob Badour     vst2q_lane_u32_ptr(ptr,     (uint32x4x2_t*)val4->val, lane);
11810*80a68eefSBob Badour     vst2q_lane_u32_ptr((ptr + 2), ((uint32x4x2_t*)val4->val + 1), lane);
11811*80a68eefSBob Badour }
11812*80a68eefSBob Badour #define vst4q_lane_u32(ptr, val, lane) vst4q_lane_u32_ptr(ptr, &val, lane)
11813*80a68eefSBob Badour 
11814*80a68eefSBob Badour //void vst4q_lane_s16(__transfersize(4) int16_t * ptr, int16x8x4_t val, __constrange(0,7) int lane);// VST4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
11815*80a68eefSBob Badour _NEON2SSESTORAGE void vst4q_lane_s16_ptr(__transfersize(4) int16_t * ptr, int16x8x4_t * val, __constrange(0,7) int lane);
11816*80a68eefSBob Badour #define vst4q_lane_s16(ptr,val,lane) vst4q_lane_u16((uint16_t *)ptr,val,lane)
11817*80a68eefSBob Badour 
11818*80a68eefSBob Badour //void vst4q_lane_s32(__transfersize(4) int32_t * ptr, int32x4x4_t val, __constrange(0,3) int lane);// VST4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
11819*80a68eefSBob Badour _NEON2SSESTORAGE void vst4q_lane_s32_ptr(__transfersize(4) int32_t * ptr, int32x4x4_t * val, __constrange(0,3) int lane);
11820*80a68eefSBob Badour #define vst4q_lane_s32(ptr,val,lane) vst4q_lane_u32((uint32_t *)ptr,val,lane)
11821*80a68eefSBob Badour 
11822*80a68eefSBob Badour //void vst4q_lane_f16(__transfersize(4) __fp16 * ptr, float16x8x4_t val, __constrange(0,7) int lane);// VST4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
11823*80a68eefSBob Badour _NEON2SSESTORAGE void vst4q_lane_f16_ptr(__transfersize(4) __fp16 * ptr, float16x8x4_t * val, __constrange(0,7) int lane);
11824*80a68eefSBob Badour //current IA SIMD doesn't support float16
11825*80a68eefSBob Badour 
11826*80a68eefSBob Badour //void vst4q_lane_f32(__transfersize(4) float32_t * ptr, float32x4x4_t val, __constrange(0,3) int lane)// VST4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
11827*80a68eefSBob Badour _NEON2SSE_INLINE void vst4q_lane_f32_ptr(__transfersize(4) float32_t * ptr, float32x4x4_t* val, __constrange(0,3) int lane)
11828*80a68eefSBob Badour {
11829*80a68eefSBob Badour     vst1q_lane_f32(ptr,   val->val[0], lane);
11830*80a68eefSBob Badour     vst1q_lane_f32((ptr + 1), val->val[1], lane);
11831*80a68eefSBob Badour     vst1q_lane_f32((ptr + 2), val->val[2], lane);
11832*80a68eefSBob Badour     vst1q_lane_f32((ptr + 3), val->val[3], lane);
11833*80a68eefSBob Badour }
11834*80a68eefSBob Badour #define vst4q_lane_f32(ptr,val,lane) vst4q_lane_f32_ptr(ptr,&val,lane)
11835*80a68eefSBob Badour 
11836*80a68eefSBob Badour //void vst4q_lane_p16(__transfersize(4) poly16_t * ptr, poly16x8x4_t val, __constrange(0,7) int lane);// VST4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
11837*80a68eefSBob Badour _NEON2SSESTORAGE void vst4q_lane_p16_ptr(__transfersize(4) poly16_t * ptr, poly16x8x4_t * val, __constrange(0,7) int lane);
11838*80a68eefSBob Badour #define vst4q_lane_p16 vst4q_lane_u16
11839*80a68eefSBob Badour 
11840*80a68eefSBob Badour _NEON2SSESTORAGE void vst4_lane_u8(__transfersize(4) uint8_t * ptr, uint8x8x4_t val, __constrange(0,7) int lane);// VST4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11841*80a68eefSBob Badour _NEON2SSE_INLINE void vst4_lane_u8(__transfersize(4) uint8_t * ptr, uint8x8x4_t val, __constrange(0,7) int lane)
11842*80a68eefSBob Badour {
11843*80a68eefSBob Badour     *(ptr) =     val.val[0].m64_u8[lane];
11844*80a68eefSBob Badour     *(ptr + 1) = val.val[1].m64_u8[lane];
11845*80a68eefSBob Badour     *(ptr + 2) = val.val[2].m64_u8[lane];
11846*80a68eefSBob Badour     *(ptr + 3) = val.val[3].m64_u8[lane];
11847*80a68eefSBob Badour }
11848*80a68eefSBob Badour 
11849*80a68eefSBob Badour _NEON2SSESTORAGE void vst4_lane_u16(__transfersize(4) uint16_t * ptr, uint16x4x4_t val, __constrange(0,3) int lane);// VST4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11850*80a68eefSBob Badour _NEON2SSE_INLINE void vst4_lane_u16(__transfersize(4) uint16_t * ptr, uint16x4x4_t val, __constrange(0,3) int lane)
11851*80a68eefSBob Badour {
11852*80a68eefSBob Badour     *(ptr) =     val.val[0].m64_u16[lane];
11853*80a68eefSBob Badour     *(ptr + 1) = val.val[1].m64_u16[lane];
11854*80a68eefSBob Badour     *(ptr + 2) = val.val[2].m64_u16[lane];
11855*80a68eefSBob Badour     *(ptr + 3) = val.val[3].m64_u16[lane];
11856*80a68eefSBob Badour }
11857*80a68eefSBob Badour 
11858*80a68eefSBob Badour _NEON2SSESTORAGE void vst4_lane_u32(__transfersize(4) uint32_t * ptr, uint32x2x4_t val, __constrange(0,1) int lane);// VST4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11859*80a68eefSBob Badour _NEON2SSE_INLINE void vst4_lane_u32(__transfersize(4) uint32_t * ptr, uint32x2x4_t val, __constrange(0,1) int lane)
11860*80a68eefSBob Badour {
11861*80a68eefSBob Badour     *(ptr) =     val.val[0].m64_u32[lane];
11862*80a68eefSBob Badour     *(ptr + 1) = val.val[1].m64_u32[lane];
11863*80a68eefSBob Badour     *(ptr + 2) = val.val[2].m64_u32[lane];
11864*80a68eefSBob Badour     *(ptr + 3) = val.val[3].m64_u32[lane];
11865*80a68eefSBob Badour }
11866*80a68eefSBob Badour 
11867*80a68eefSBob Badour _NEON2SSESTORAGE void vst4_lane_s8(__transfersize(4) int8_t * ptr, int8x8x4_t val, __constrange(0,7) int lane);// VST4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11868*80a68eefSBob Badour #define vst4_lane_s8(ptr, val, lane) vst4_lane_u8((uint8_t*)ptr, val, lane)
11869*80a68eefSBob Badour 
11870*80a68eefSBob Badour _NEON2SSESTORAGE void vst4_lane_s16(__transfersize(4) int16_t * ptr, int16x4x4_t val, __constrange(0,3) int lane);// VST4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11871*80a68eefSBob Badour #define vst4_lane_s16(ptr, val, lane) vst4_lane_u16((uint16_t*)ptr, val, lane)
11872*80a68eefSBob Badour 
11873*80a68eefSBob Badour _NEON2SSESTORAGE void vst4_lane_s32(__transfersize(4) int32_t * ptr, int32x2x4_t val, __constrange(0,1) int lane);// VST4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11874*80a68eefSBob Badour #define vst4_lane_s32(ptr, val, lane) vst4_lane_u32((uint32_t*)ptr, val, lane)
11875*80a68eefSBob Badour 
11876*80a68eefSBob Badour //void vst4_lane_f16(__transfersize(4) __fp16 * ptr, float16x4x4_t val, __constrange(0,3) int lane);// VST4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11877*80a68eefSBob Badour _NEON2SSESTORAGE void vst4_lane_f16_ptr(__transfersize(4) __fp16 * ptr, float16x4x4_t * val, __constrange(0,3) int lane);
11878*80a68eefSBob Badour //current IA SIMD doesn't support float16
11879*80a68eefSBob Badour 
11880*80a68eefSBob Badour _NEON2SSESTORAGE void vst4_lane_f32(__transfersize(4) float32_t * ptr, float32x2x4_t  val, __constrange(0,1) int lane); // VST4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11881*80a68eefSBob Badour _NEON2SSE_INLINE void vst4_lane_f32(__transfersize(4) float32_t * ptr, float32x2x4_t val, __constrange(0,1) int lane)
11882*80a68eefSBob Badour {
11883*80a68eefSBob Badour     *(ptr) = val.val[0].m64_f32[lane];
11884*80a68eefSBob Badour     *(ptr + 1) = val.val[1].m64_f32[lane];
11885*80a68eefSBob Badour     *(ptr + 2) = val.val[2].m64_f32[lane];
11886*80a68eefSBob Badour     *(ptr + 3) = val.val[3].m64_f32[lane];
11887*80a68eefSBob Badour }
11888*80a68eefSBob Badour 
11889*80a68eefSBob Badour _NEON2SSESTORAGE void vst4_lane_p8(__transfersize(4) poly8_t * ptr, poly8x8x4_t val, __constrange(0,7) int lane);// VST4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11890*80a68eefSBob Badour #define vst4_lane_p8 vst4_lane_u8
11891*80a68eefSBob Badour 
11892*80a68eefSBob Badour _NEON2SSESTORAGE void vst4_lane_p16(__transfersize(4) poly16_t * ptr, poly16x4x4_t val, __constrange(0,3) int lane);// VST4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11893*80a68eefSBob Badour #define vst4_lane_p16 vst4_lane_u16
11894*80a68eefSBob Badour 
11895*80a68eefSBob Badour //**************************************************************************************************
11896*80a68eefSBob Badour //************************ Extract lanes from a vector ********************************************
11897*80a68eefSBob Badour //**************************************************************************************************
11898*80a68eefSBob Badour //These intrinsics extract a single lane (element) from a vector.
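//Usage sketch (illustrative only): the 64-bit (d) variants read directly from the emulated d-register
//fields, while the 128-bit (q) variants map to SSE extract instructions or their serial fallbacks.
//    uint8x8_t d = vdup_n_u8(7);
//    uint8_t a = vget_lane_u8(d, 3); //a == 7
//    int32x4_t q = vdupq_n_s32(-5);
//    int32_t b = vgetq_lane_s32(q, 2); //b == -5; the lane index must be a compile-time constant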
11899*80a68eefSBob Badour _NEON2SSESTORAGE uint8_t vget_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VMOV.U8 r0, d0[0]
11900*80a68eefSBob Badour #define vget_lane_u8(vec, lane) vec.m64_u8[lane]
11901*80a68eefSBob Badour 
11902*80a68eefSBob Badour _NEON2SSESTORAGE uint16_t vget_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VMOV.s16 r0, d0[0]
11903*80a68eefSBob Badour #define vget_lane_u16(vec, lane) vec.m64_u16[lane]
11904*80a68eefSBob Badour 
11905*80a68eefSBob Badour 
11906*80a68eefSBob Badour _NEON2SSESTORAGE uint32_t vget_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0]
11907*80a68eefSBob Badour #define vget_lane_u32(vec, lane) vec.m64_u32[lane]
11908*80a68eefSBob Badour 
11909*80a68eefSBob Badour _NEON2SSESTORAGE int8_t vget_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VMOV.S8 r0, d0[0]
11910*80a68eefSBob Badour #define vget_lane_s8(vec, lane) vec.m64_i8[lane]
11911*80a68eefSBob Badour 
11912*80a68eefSBob Badour _NEON2SSESTORAGE int16_t vget_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VMOV.S16 r0, d0[0]
11913*80a68eefSBob Badour #define vget_lane_s16(vec, lane) vec.m64_i16[lane]
11914*80a68eefSBob Badour 
11915*80a68eefSBob Badour _NEON2SSESTORAGE int32_t vget_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0]
11916*80a68eefSBob Badour #define vget_lane_s32(vec, lane) vec.m64_i32[lane]
11917*80a68eefSBob Badour 
11918*80a68eefSBob Badour _NEON2SSESTORAGE poly8_t vget_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VMOV.U8 r0, d0[0]
11919*80a68eefSBob Badour #define vget_lane_p8 vget_lane_u8
11920*80a68eefSBob Badour 
11921*80a68eefSBob Badour _NEON2SSESTORAGE poly16_t vget_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VMOV.s16 r0, d0[0]
11922*80a68eefSBob Badour #define vget_lane_p16 vget_lane_u16
11923*80a68eefSBob Badour 
11924*80a68eefSBob Badour _NEON2SSESTORAGE float32_t vget_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0]
11925*80a68eefSBob Badour #define vget_lane_f32(vec, lane) vec.m64_f32[lane]
11926*80a68eefSBob Badour 
11927*80a68eefSBob Badour _NEON2SSESTORAGE uint8_t vgetq_lane_u8(uint8x16_t vec, __constrange(0,15) int lane); // VMOV.U8 r0, d0[0]
11928*80a68eefSBob Badour #define vgetq_lane_u8 (uint8_t) _MM_EXTRACT_EPI8
11929*80a68eefSBob Badour 
11930*80a68eefSBob Badour _NEON2SSESTORAGE uint16_t vgetq_lane_u16(uint16x8_t vec, __constrange(0,7) int lane); // VMOV.s16 r0, d0[0]
11931*80a68eefSBob Badour #define  vgetq_lane_u16 (uint16_t) _MM_EXTRACT_EPI16
11932*80a68eefSBob Badour 
11933*80a68eefSBob Badour _NEON2SSESTORAGE uint32_t vgetq_lane_u32(uint32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0]
11934*80a68eefSBob Badour #define vgetq_lane_u32 (uint32_t) _MM_EXTRACT_EPI32
11935*80a68eefSBob Badour 
11936*80a68eefSBob Badour _NEON2SSESTORAGE int8_t vgetq_lane_s8(int8x16_t vec, __constrange(0,15) int lane); // VMOV.S8 r0, d0[0]
11937*80a68eefSBob Badour #define vgetq_lane_s8 _MM_EXTRACT_EPI8
11938*80a68eefSBob Badour 
11939*80a68eefSBob Badour _NEON2SSESTORAGE int16_t vgetq_lane_s16(int16x8_t vec, __constrange(0,7) int lane); // VMOV.S16 r0, d0[0]
11940*80a68eefSBob Badour #define vgetq_lane_s16 _MM_EXTRACT_EPI16
11941*80a68eefSBob Badour 
11942*80a68eefSBob Badour _NEON2SSESTORAGE int32_t vgetq_lane_s32(int32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0]
11943*80a68eefSBob Badour #define vgetq_lane_s32 _MM_EXTRACT_EPI32
11944*80a68eefSBob Badour 
11945*80a68eefSBob Badour _NEON2SSESTORAGE poly8_t vgetq_lane_p8(poly8x16_t vec, __constrange(0,15) int lane); // VMOV.U8 r0, d0[0]
11946*80a68eefSBob Badour #define vgetq_lane_p8 vgetq_lane_u8
11947*80a68eefSBob Badour 
11948*80a68eefSBob Badour _NEON2SSESTORAGE poly16_t vgetq_lane_p16(poly16x8_t vec, __constrange(0,7) int lane); // VMOV.s16 r0, d0[0]
11949*80a68eefSBob Badour #define vgetq_lane_p16 vgetq_lane_u16
11950*80a68eefSBob Badour 
11951*80a68eefSBob Badour _NEON2SSESTORAGE float32_t vgetq_lane_f32(float32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0]
11952*80a68eefSBob Badour _NEON2SSE_INLINE float32_t vgetq_lane_f32(float32x4_t vec, __constrange(0,3) int lane)
11953*80a68eefSBob Badour {
11954*80a68eefSBob Badour     int32_t ilane;
11955*80a68eefSBob Badour     ilane = _MM_EXTRACT_PS(vec,lane);
11956*80a68eefSBob Badour     return *(float*)&ilane;
11957*80a68eefSBob Badour }
11958*80a68eefSBob Badour 
11959*80a68eefSBob Badour _NEON2SSESTORAGE int64_t vget_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV r0,r0,d0
11960*80a68eefSBob Badour #define vget_lane_s64(vec, lane) vec.m64_i64[0]
11961*80a68eefSBob Badour 
11962*80a68eefSBob Badour _NEON2SSESTORAGE uint64_t vget_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV r0,r0,d0
11963*80a68eefSBob Badour #define vget_lane_u64(vec, lane) vec.m64_u64[0]
11964*80a68eefSBob Badour 
11965*80a68eefSBob Badour 
11966*80a68eefSBob Badour _NEON2SSESTORAGE int64_t vgetq_lane_s64(int64x2_t vec, __constrange(0,1) int lane); // VMOV r0,r0,d0
11967*80a68eefSBob Badour #define vgetq_lane_s64 _MM_EXTRACT_EPI64
11968*80a68eefSBob Badour 
11969*80a68eefSBob Badour _NEON2SSESTORAGE uint64_t vgetq_lane_u64(uint64x2_t vec, __constrange(0,1) int lane); // VMOV r0,r0,d0
11970*80a68eefSBob Badour #define vgetq_lane_u64 (uint64_t) _MM_EXTRACT_EPI64
11971*80a68eefSBob Badour 
11972*80a68eefSBob Badour // ***************** Set lanes within a vector ********************************************
11973*80a68eefSBob Badour // **************************************************************************************
11974*80a68eefSBob Badour //These intrinsics set a single lane (element) within a vector.
11975*80a68eefSBob Badour //These are the same as the vld1_lane_xx functions, but they take the value to be set directly.
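//Usage sketch (illustrative only): the original vector is returned with just one lane replaced.
//    int16x4_t v = vdup_n_s16(0);
//    v = vset_lane_s16(-5, v, 2); //v.m64_i16 == {0, 0, -5, 0}
//    float32x4_t q = vdupq_n_f32(1.0f);
//    q = vsetq_lane_f32(3.5f, q, 0); //lane 0 becomes 3.5f, lanes 1..3 stay 1.0f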
11976*80a68eefSBob Badour 
11977*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vset_lane_u8(uint8_t value, uint8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0
11978*80a68eefSBob Badour _NEON2SSE_INLINE uint8x8_t vset_lane_u8(uint8_t value, uint8x8_t vec, __constrange(0,7) int lane)
11979*80a68eefSBob Badour {
11980*80a68eefSBob Badour     uint8_t val;
11981*80a68eefSBob Badour     val = value;
11982*80a68eefSBob Badour     return vld1_lane_u8(&val, vec,  lane);
11983*80a68eefSBob Badour }
11984*80a68eefSBob Badour 
11985*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vset_lane_u16(uint16_t value, uint16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0
11986*80a68eefSBob Badour _NEON2SSE_INLINE uint16x4_t vset_lane_u16(uint16_t value, uint16x4_t vec, __constrange(0,3) int lane)
11987*80a68eefSBob Badour {
11988*80a68eefSBob Badour     uint16_t val;
11989*80a68eefSBob Badour     val = value;
11990*80a68eefSBob Badour     return vld1_lane_u16(&val, vec,  lane);
11991*80a68eefSBob Badour }
11992*80a68eefSBob Badour 
11993*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vset_lane_u32(uint32_t value, uint32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0
11994*80a68eefSBob Badour _NEON2SSE_INLINE uint32x2_t vset_lane_u32(uint32_t value, uint32x2_t vec, __constrange(0,1) int lane)
11995*80a68eefSBob Badour {
11996*80a68eefSBob Badour     uint32_t val;
11997*80a68eefSBob Badour     val = value;
11998*80a68eefSBob Badour     return vld1_lane_u32(&val, vec,  lane);
11999*80a68eefSBob Badour }
12000*80a68eefSBob Badour 
12001*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vset_lane_s8(int8_t value, int8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0
12002*80a68eefSBob Badour _NEON2SSE_INLINE int8x8_t vset_lane_s8(int8_t value, int8x8_t vec, __constrange(0,7) int lane)
12003*80a68eefSBob Badour {
12004*80a68eefSBob Badour     int8_t val;
12005*80a68eefSBob Badour     val = value;
12006*80a68eefSBob Badour     return vld1_lane_s8(&val, vec,  lane);
12007*80a68eefSBob Badour }
12008*80a68eefSBob Badour 
12009*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vset_lane_s16(int16_t value, int16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0
12010*80a68eefSBob Badour _NEON2SSE_INLINE int16x4_t vset_lane_s16(int16_t value, int16x4_t vec, __constrange(0,3) int lane)
12011*80a68eefSBob Badour {
12012*80a68eefSBob Badour     int16_t val;
12013*80a68eefSBob Badour     val = value;
12014*80a68eefSBob Badour     return vld1_lane_s16(&val, vec,  lane);
12015*80a68eefSBob Badour }
12016*80a68eefSBob Badour 
12017*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vset_lane_s32(int32_t value, int32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0
12018*80a68eefSBob Badour _NEON2SSE_INLINE int32x2_t vset_lane_s32(int32_t value, int32x2_t vec, __constrange(0,1) int lane)
12019*80a68eefSBob Badour {
12020*80a68eefSBob Badour     int32_t val;
12021*80a68eefSBob Badour     val = value;
12022*80a68eefSBob Badour     return vld1_lane_s32(&val, vec,  lane);
12023*80a68eefSBob Badour }
12024*80a68eefSBob Badour 
12025*80a68eefSBob Badour _NEON2SSESTORAGE poly8x8_t vset_lane_p8(poly8_t value, poly8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0
12026*80a68eefSBob Badour #define vset_lane_p8  vset_lane_u8
12027*80a68eefSBob Badour 
12028*80a68eefSBob Badour _NEON2SSESTORAGE poly16x4_t vset_lane_p16(poly16_t value, poly16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0
12029*80a68eefSBob Badour #define vset_lane_p16  vset_lane_u16
12030*80a68eefSBob Badour 
12031*80a68eefSBob Badour _NEON2SSESTORAGE float32x2_t vset_lane_f32(float32_t value, float32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0
12032*80a68eefSBob Badour _NEON2SSE_INLINE float32x2_t vset_lane_f32(float32_t value, float32x2_t vec, __constrange(0,1) int lane)
12033*80a68eefSBob Badour {
12034*80a68eefSBob Badour     float32_t val;
12035*80a68eefSBob Badour     val = value;
12036*80a68eefSBob Badour     return vld1_lane_f32(&val, vec,  lane);
12037*80a68eefSBob Badour }
12038*80a68eefSBob Badour 
12039*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vsetq_lane_u8(uint8_t value, uint8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0
12040*80a68eefSBob Badour _NEON2SSE_INLINE uint8x16_t vsetq_lane_u8(uint8_t value, uint8x16_t vec, __constrange(0,15) int lane)
12041*80a68eefSBob Badour {
12042*80a68eefSBob Badour     uint8_t val;
12043*80a68eefSBob Badour     val = value;
12044*80a68eefSBob Badour     return vld1q_lane_u8(&val, vec,  lane);
12045*80a68eefSBob Badour }
12046*80a68eefSBob Badour 
12047*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vsetq_lane_u16(uint16_t value, uint16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0
12048*80a68eefSBob Badour _NEON2SSE_INLINE uint16x8_t vsetq_lane_u16(uint16_t value, uint16x8_t vec, __constrange(0,7) int lane)
12049*80a68eefSBob Badour {
12050*80a68eefSBob Badour     uint16_t val;
12051*80a68eefSBob Badour     val = value;
12052*80a68eefSBob Badour     return vld1q_lane_u16(&val, vec,  lane);
12053*80a68eefSBob Badour }
12054*80a68eefSBob Badour 
12055*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vsetq_lane_u32(uint32_t value, uint32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0
12056*80a68eefSBob Badour _NEON2SSE_INLINE uint32x4_t vsetq_lane_u32(uint32_t value, uint32x4_t vec, __constrange(0,3) int lane)
12057*80a68eefSBob Badour {
12058*80a68eefSBob Badour     uint32_t val;
12059*80a68eefSBob Badour     val = value;
12060*80a68eefSBob Badour     return vld1q_lane_u32(&val, vec,  lane);
12061*80a68eefSBob Badour }
12062*80a68eefSBob Badour 
12063*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t vsetq_lane_s8(int8_t value, int8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0
12064*80a68eefSBob Badour _NEON2SSE_INLINE int8x16_t vsetq_lane_s8(int8_t value, int8x16_t vec, __constrange(0,15) int lane)
12065*80a68eefSBob Badour {
12066*80a68eefSBob Badour     int8_t val;
12067*80a68eefSBob Badour     val = value;
12068*80a68eefSBob Badour     return vld1q_lane_s8(&val, vec,  lane);
12069*80a68eefSBob Badour }
12070*80a68eefSBob Badour 
12071*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vsetq_lane_s16(int16_t value, int16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0
12072*80a68eefSBob Badour _NEON2SSE_INLINE int16x8_t vsetq_lane_s16(int16_t value, int16x8_t vec, __constrange(0,7) int lane)
12073*80a68eefSBob Badour {
12074*80a68eefSBob Badour     int16_t val;
12075*80a68eefSBob Badour     val = value;
12076*80a68eefSBob Badour     return vld1q_lane_s16(&val, vec,  lane);
12077*80a68eefSBob Badour }
12078*80a68eefSBob Badour 
12079*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vsetq_lane_s32(int32_t value, int32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0
12080*80a68eefSBob Badour _NEON2SSE_INLINE int32x4_t vsetq_lane_s32(int32_t value, int32x4_t vec, __constrange(0,3) int lane)
12081*80a68eefSBob Badour {
12082*80a68eefSBob Badour     int32_t val;
12083*80a68eefSBob Badour     val = value;
12084*80a68eefSBob Badour     return vld1q_lane_s32(&val, vec,  lane);
12085*80a68eefSBob Badour }
12086*80a68eefSBob Badour 
12087*80a68eefSBob Badour _NEON2SSESTORAGE poly8x16_t vsetq_lane_p8(poly8_t value, poly8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0
12088*80a68eefSBob Badour #define vsetq_lane_p8 vsetq_lane_u8
12089*80a68eefSBob Badour 
12090*80a68eefSBob Badour _NEON2SSESTORAGE poly16x8_t vsetq_lane_p16(poly16_t value, poly16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0
12091*80a68eefSBob Badour #define vsetq_lane_p16 vsetq_lane_u16
12092*80a68eefSBob Badour 
12093*80a68eefSBob Badour _NEON2SSESTORAGE float32x4_t vsetq_lane_f32(float32_t value, float32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0
12094*80a68eefSBob Badour _NEON2SSE_INLINE float32x4_t vsetq_lane_f32(float32_t value, float32x4_t vec, __constrange(0,3) int lane)
12095*80a68eefSBob Badour {
12096*80a68eefSBob Badour     float32_t val;
12097*80a68eefSBob Badour     val = value;
12098*80a68eefSBob Badour     return vld1q_lane_f32(&val, vec,  lane);
12099*80a68eefSBob Badour }
12100*80a68eefSBob Badour 
12101*80a68eefSBob Badour _NEON2SSESTORAGE int64x1_t vset_lane_s64(int64_t value, int64x1_t vec, __constrange(0,0) int lane); // VMOV d0,r0,r0
12102*80a68eefSBob Badour _NEON2SSE_INLINE int64x1_t vset_lane_s64(int64_t value, int64x1_t vec, __constrange(0,0) int lane)
12103*80a68eefSBob Badour {
12104*80a68eefSBob Badour     int64_t val;
12105*80a68eefSBob Badour     val = value;
12106*80a68eefSBob Badour     return vld1_lane_s64(&val, vec,  lane);
12107*80a68eefSBob Badour }
12108*80a68eefSBob Badour 
12109*80a68eefSBob Badour _NEON2SSESTORAGE uint64x1_t vset_lane_u64(uint64_t value, uint64x1_t vec, __constrange(0,0) int lane); // VMOV d0,r0,r0
12110*80a68eefSBob Badour _NEON2SSE_INLINE uint64x1_t vset_lane_u64(uint64_t value, uint64x1_t vec, __constrange(0,0) int lane)
12111*80a68eefSBob Badour {
12112*80a68eefSBob Badour     uint64_t val;
12113*80a68eefSBob Badour     val = value;
12114*80a68eefSBob Badour     return vld1_lane_u64(&val, vec,  lane);
12115*80a68eefSBob Badour }
12116*80a68eefSBob Badour 
12117*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vsetq_lane_s64(int64_t value, int64x2_t vec, __constrange(0,1) int lane); // VMOV d0,r0,r0
12118*80a68eefSBob Badour _NEON2SSE_INLINE int64x2_t vsetq_lane_s64(int64_t value, int64x2_t vec, __constrange(0,1) int lane)
12119*80a68eefSBob Badour {
12120*80a68eefSBob Badour     int64_t val;
12121*80a68eefSBob Badour     val = value;
12122*80a68eefSBob Badour     return vld1q_lane_s64(&val, vec,  lane);
12123*80a68eefSBob Badour }
12124*80a68eefSBob Badour 
12125*80a68eefSBob Badour _NEON2SSESTORAGE uint64x2_t vsetq_lane_u64(uint64_t value, uint64x2_t vec, __constrange(0,1) int lane); // VMOV d0,r0,r0
12126*80a68eefSBob Badour #define vsetq_lane_u64 vsetq_lane_s64
12127*80a68eefSBob Badour 
12128*80a68eefSBob Badour // *******************************************************************************
12129*80a68eefSBob Badour // **************** Initialize a vector from bit pattern ***************************
12130*80a68eefSBob Badour // *******************************************************************************
12131*80a68eefSBob Badour //These intrinsics create a vector from a literal bit pattern.
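//Usage sketch (illustrative only): the 64-bit pattern is reinterpreted, lowest byte first (little-endian).
//    uint8x8_t v = vcreate_u8(0x0706050403020100ULL);
//    //v.m64_u8 == {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07}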
12132*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vcreate_s8(uint64_t a); // VMOV d0,r0,r0
12133*80a68eefSBob Badour _NEON2SSE_INLINE int8x8_t vcreate_s8(uint64_t a)
12134*80a68eefSBob Badour {
12135*80a68eefSBob Badour     return (*(__m64_128*)&(a)); //a macro can't be used here because the argument may be an immediate value
12136*80a68eefSBob Badour }
12137*80a68eefSBob Badour 
12138*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vcreate_s16(uint64_t a); // VMOV d0,r0,r0
12139*80a68eefSBob Badour #define vcreate_s16  vcreate_s8
12140*80a68eefSBob Badour 
12141*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vcreate_s32(uint64_t a); // VMOV d0,r0,r0
12142*80a68eefSBob Badour #define vcreate_s32  vcreate_s8
12143*80a68eefSBob Badour 
12144*80a68eefSBob Badour _NEON2SSESTORAGE float16x4_t vcreate_f16(uint64_t a); // VMOV d0,r0,r0
12145*80a68eefSBob Badour //no IA32 SIMD available
12146*80a68eefSBob Badour 
12147*80a68eefSBob Badour _NEON2SSESTORAGE float32x2_t vcreate_f32(uint64_t a); // VMOV d0,r0,r0
12148*80a68eefSBob Badour _NEON2SSE_INLINE float32x2_t vcreate_f32(uint64_t a)
12149*80a68eefSBob Badour {
12150*80a68eefSBob Badour     return (*(__m64_128*)&(a)); //a macro can't be used here because the argument may be an immediate value
12151*80a68eefSBob Badour }
12152*80a68eefSBob Badour 
12153*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vcreate_u8(uint64_t a); // VMOV d0,r0,r0
12154*80a68eefSBob Badour #define vcreate_u8 vcreate_s8
12155*80a68eefSBob Badour 
12156*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vcreate_u16(uint64_t a); // VMOV d0,r0,r0
12157*80a68eefSBob Badour #define vcreate_u16 vcreate_s16
12158*80a68eefSBob Badour 
12159*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vcreate_u32(uint64_t a); // VMOV d0,r0,r0
12160*80a68eefSBob Badour #define vcreate_u32 vcreate_s32
12161*80a68eefSBob Badour 
12162*80a68eefSBob Badour _NEON2SSESTORAGE uint64x1_t vcreate_u64(uint64_t a); // VMOV d0,r0,r0
12163*80a68eefSBob Badour #define vcreate_u64  vcreate_s8
12164*80a68eefSBob Badour 
12165*80a68eefSBob Badour 
12166*80a68eefSBob Badour _NEON2SSESTORAGE poly8x8_t vcreate_p8(uint64_t a); // VMOV d0,r0,r0
12167*80a68eefSBob Badour #define vcreate_p8 vcreate_u8
12168*80a68eefSBob Badour 
12169*80a68eefSBob Badour _NEON2SSESTORAGE poly16x4_t vcreate_p16(uint64_t a); // VMOV d0,r0,r0
12170*80a68eefSBob Badour #define vcreate_p16 vcreate_u16
12171*80a68eefSBob Badour 
12172*80a68eefSBob Badour _NEON2SSESTORAGE int64x1_t vcreate_s64(uint64_t a); // VMOV d0,r0,r0
12173*80a68eefSBob Badour #define vcreate_s64 vcreate_u64
12174*80a68eefSBob Badour 
12175*80a68eefSBob Badour //********************* Set all lanes to same value ********************************
12176*80a68eefSBob Badour //*********************************************************************************
12177*80a68eefSBob Badour //These intrinsics set all lanes to the same value.
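//Usage sketch (illustrative only): the q-register forms map to _mm_set1_xx, while the d-register
//forms are filled serially here (hence the performance warnings below).
//    float32x4_t q = vdupq_n_f32(1.5f); //all 4 lanes = 1.5f (uses _mm_set1_ps)
//    int8x8_t d = vdup_n_s8(-1); //all 8 lanes = -1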
12178*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t   vdup_n_u8(uint8_t value); // VDUP.8 d0,r0
12179*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t  vdup_n_u8(uint8_t value),  _NEON2SSE_REASON_SLOW_SERIAL)
12180*80a68eefSBob Badour {
12181*80a68eefSBob Badour     uint8x8_t res;
12182*80a68eefSBob Badour     int i;
12183*80a68eefSBob Badour     for (i = 0; i<8; i++) {
12184*80a68eefSBob Badour         res.m64_u8[i] = value;
12185*80a68eefSBob Badour     }
12186*80a68eefSBob Badour     return res;
12187*80a68eefSBob Badour }
12188*80a68eefSBob Badour 
12189*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t   vdup_n_u16(uint16_t value); // VDUP.16 d0,r0
12190*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t  vdup_n_u16(uint16_t value),  _NEON2SSE_REASON_SLOW_SERIAL)
12191*80a68eefSBob Badour {
12192*80a68eefSBob Badour     uint16x4_t res;
12193*80a68eefSBob Badour     int i;
12194*80a68eefSBob Badour     for (i = 0; i<4; i++) {
12195*80a68eefSBob Badour         res.m64_u16[i] = value;
12196*80a68eefSBob Badour     }
12197*80a68eefSBob Badour     return res;
12198*80a68eefSBob Badour }
12199*80a68eefSBob Badour 
12200*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t   vdup_n_u32(uint32_t value); // VDUP.32 d0,r0
12201*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t  vdup_n_u32(uint32_t value),  _NEON2SSE_REASON_SLOW_SERIAL)
12202*80a68eefSBob Badour {
12203*80a68eefSBob Badour     uint32x2_t res;
12204*80a68eefSBob Badour     res.m64_u32[0] = value;
12205*80a68eefSBob Badour     res.m64_u32[1] = value;
12206*80a68eefSBob Badour     return res;
12207*80a68eefSBob Badour }
12208*80a68eefSBob Badour 
12209*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t   vdup_n_s8(int8_t value); // VDUP.8 d0,r0
12210*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t  vdup_n_s8(int8_t value),  _NEON2SSE_REASON_SLOW_SERIAL)
12211*80a68eefSBob Badour {
12212*80a68eefSBob Badour     int8x8_t res;
12213*80a68eefSBob Badour     int i;
12214*80a68eefSBob Badour     for (i = 0; i<8; i++) {
12215*80a68eefSBob Badour         res.m64_i8[i] = value;
12216*80a68eefSBob Badour     }
12217*80a68eefSBob Badour     return res;
12218*80a68eefSBob Badour }
12219*80a68eefSBob Badour 
12220*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t   vdup_n_s16(int16_t value); // VDUP.16 d0,r0
12221*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t  vdup_n_s16(int16_t value),  _NEON2SSE_REASON_SLOW_SERIAL)
12222*80a68eefSBob Badour {
12223*80a68eefSBob Badour     int16x4_t res;
12224*80a68eefSBob Badour     int i;
12225*80a68eefSBob Badour     for (i = 0; i<4; i++) {
12226*80a68eefSBob Badour         res.m64_i16[i] = value;
12227*80a68eefSBob Badour     }
12228*80a68eefSBob Badour     return res;
12229*80a68eefSBob Badour }
12230*80a68eefSBob Badour 
12231*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t   vdup_n_s32(int32_t value); // VDUP.32 d0,r0
12232*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t  vdup_n_s32(int32_t value),  _NEON2SSE_REASON_SLOW_SERIAL)
12233*80a68eefSBob Badour {
12234*80a68eefSBob Badour     int32x2_t res;
12235*80a68eefSBob Badour     res.m64_i32[0] = value;
12236*80a68eefSBob Badour     res.m64_i32[1] = value;
12237*80a68eefSBob Badour     return res;
12238*80a68eefSBob Badour }
12239*80a68eefSBob Badour 
12240*80a68eefSBob Badour _NEON2SSESTORAGE poly8x8_t vdup_n_p8(poly8_t value); // VDUP.8 d0,r0
12241*80a68eefSBob Badour #define vdup_n_p8 vdup_n_u8
12242*80a68eefSBob Badour 
12243*80a68eefSBob Badour _NEON2SSESTORAGE poly16x4_t vdup_n_p16(poly16_t value); // VDUP.16 d0,r0
12244*80a68eefSBob Badour #define vdup_n_p16 vdup_n_s16
12245*80a68eefSBob Badour 
12246*80a68eefSBob Badour _NEON2SSESTORAGE float32x2_t vdup_n_f32(float32_t value); // VDUP.32 d0,r0
12247*80a68eefSBob Badour _NEON2SSE_INLINE float32x2_t vdup_n_f32(float32_t value)
12248*80a68eefSBob Badour {
12249*80a68eefSBob Badour     float32x2_t res;
12250*80a68eefSBob Badour     res.m64_f32[0] = value;
12251*80a68eefSBob Badour     res.m64_f32[1] = value;
12252*80a68eefSBob Badour     return res;
12253*80a68eefSBob Badour }
12254*80a68eefSBob Badour 
12255*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t   vdupq_n_u8(uint8_t value); // VDUP.8 q0,r0
12256*80a68eefSBob Badour #define vdupq_n_u8(value) _mm_set1_epi8((uint8_t) (value))
12257*80a68eefSBob Badour 
12258*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t   vdupq_n_u16(uint16_t value); // VDUP.16 q0,r0
12259*80a68eefSBob Badour #define vdupq_n_u16(value) _mm_set1_epi16((uint16_t) (value))
12260*80a68eefSBob Badour 
12261*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t   vdupq_n_u32(uint32_t value); // VDUP.32 q0,r0
12262*80a68eefSBob Badour #define vdupq_n_u32(value) _mm_set1_epi32((uint32_t) (value))
12263*80a68eefSBob Badour 
12264*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t   vdupq_n_s8(int8_t value); // VDUP.8 q0,r0
12265*80a68eefSBob Badour #define vdupq_n_s8 _mm_set1_epi8
12266*80a68eefSBob Badour 
12267*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t   vdupq_n_s16(int16_t value); // VDUP.16 q0,r0
12268*80a68eefSBob Badour #define vdupq_n_s16 _mm_set1_epi16
12269*80a68eefSBob Badour 
12270*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t   vdupq_n_s32(int32_t value); // VDUP.32 q0,r0
12271*80a68eefSBob Badour #define vdupq_n_s32 _mm_set1_epi32
12272*80a68eefSBob Badour 
12273*80a68eefSBob Badour _NEON2SSESTORAGE poly8x16_t vdupq_n_p8(poly8_t value); // VDUP.8 q0,r0
12274*80a68eefSBob Badour #define  vdupq_n_p8 vdupq_n_u8
12275*80a68eefSBob Badour 
12276*80a68eefSBob Badour _NEON2SSESTORAGE poly16x8_t vdupq_n_p16(poly16_t value); // VDUP.16 q0,r0
12277*80a68eefSBob Badour #define  vdupq_n_p16 vdupq_n_u16
12278*80a68eefSBob Badour 
12279*80a68eefSBob Badour _NEON2SSESTORAGE float32x4_t vdupq_n_f32(float32_t value); // VDUP.32 q0,r0
12280*80a68eefSBob Badour #define vdupq_n_f32 _mm_set1_ps
12281*80a68eefSBob Badour 
12282*80a68eefSBob Badour _NEON2SSESTORAGE int64x1_t vdup_n_s64(int64_t value); // VMOV d0,r0,r0
12283*80a68eefSBob Badour _NEON2SSE_INLINE int64x1_t vdup_n_s64(int64_t value)
12284*80a68eefSBob Badour {
12285*80a68eefSBob Badour     int64x1_t res;
12286*80a68eefSBob Badour     res.m64_i64[0] = value;
12287*80a68eefSBob Badour     return res;
12288*80a68eefSBob Badour }
12289*80a68eefSBob Badour 
12290*80a68eefSBob Badour _NEON2SSESTORAGE uint64x1_t vdup_n_u64(uint64_t value); // VMOV d0,r0,r0
12291*80a68eefSBob Badour _NEON2SSE_INLINE uint64x1_t  vdup_n_u64(uint64_t value)
12292*80a68eefSBob Badour {
12293*80a68eefSBob Badour     uint64x1_t res;
12294*80a68eefSBob Badour     res.m64_u64[0] = value;
12295*80a68eefSBob Badour     return res;
12296*80a68eefSBob Badour }
12297*80a68eefSBob Badour 
12298*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t   vdupq_n_s64(int64_t value); // VMOV d0,r0,r0
12299*80a68eefSBob Badour _NEON2SSE_INLINE int64x2_t   vdupq_n_s64(int64_t value)
12300*80a68eefSBob Badour {
12301*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 int64_t value2[2] = {value, value}; //value may be an immediate
12302*80a68eefSBob Badour     return LOAD_SI128(value2);
12303*80a68eefSBob Badour }
12304*80a68eefSBob Badour 
12305*80a68eefSBob Badour _NEON2SSESTORAGE uint64x2_t   vdupq_n_u64(uint64_t value); // VMOV d0,r0,r0
12306*80a68eefSBob Badour _NEON2SSE_INLINE uint64x2_t   vdupq_n_u64(uint64_t value)
12307*80a68eefSBob Badour {
12308*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 uint64_t val[2] = {value, value}; //value may be an immediate
12309*80a68eefSBob Badour     return LOAD_SI128(val);
12310*80a68eefSBob Badour }
12311*80a68eefSBob Badour 
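//Illustrative usage sketch (not part of the original header; variable names are hypothetical):
//broadcasting a scalar into every lane of a 64-bit (d) or 128-bit (q) vector.
//    float32x2_t d_half = vdup_n_f32(1.5f);   // {1.5f, 1.5f}
//    uint8x16_t  q_ones = vdupq_n_u8(0xff);   // 16 copies of 0xff via _mm_set1_epi8
//    int64x2_t   q_pair = vdupq_n_s64(-1);    // two copies of -1
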
12312*80a68eefSBob Badour //****  Set all lanes to same value  ************************
12313*80a68eefSBob Badour //Same functions as above - just aliases.********************
12314*80a68eefSBob Badour //Probably they reflect the fact that the 128-bit function versions use the VMOV instruction **********
12315*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vmov_n_u8(uint8_t value); // VDUP.8 d0,r0
12316*80a68eefSBob Badour #define vmov_n_u8 vdup_n_s8
12317*80a68eefSBob Badour 
12318*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vmov_n_u16(uint16_t value); // VDUP.16 d0,r0
12319*80a68eefSBob Badour #define vmov_n_u16 vdup_n_s16
12320*80a68eefSBob Badour 
12321*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vmov_n_u32(uint32_t value); // VDUP.32 d0,r0
12322*80a68eefSBob Badour #define vmov_n_u32 vdup_n_u32
12323*80a68eefSBob Badour 
12324*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vmov_n_s8(int8_t value); // VDUP.8 d0,r0
12325*80a68eefSBob Badour #define vmov_n_s8 vdup_n_s8
12326*80a68eefSBob Badour 
12327*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vmov_n_s16(int16_t value); // VDUP.16 d0,r0
12328*80a68eefSBob Badour #define vmov_n_s16 vdup_n_s16
12329*80a68eefSBob Badour 
12330*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vmov_n_s32(int32_t value); // VDUP.32 d0,r0
12331*80a68eefSBob Badour #define vmov_n_s32 vdup_n_s32
12332*80a68eefSBob Badour 
12333*80a68eefSBob Badour _NEON2SSESTORAGE poly8x8_t vmov_n_p8(poly8_t value); // VDUP.8 d0,r0
12334*80a68eefSBob Badour #define vmov_n_p8 vdup_n_u8
12335*80a68eefSBob Badour 
12336*80a68eefSBob Badour _NEON2SSESTORAGE poly16x4_t vmov_n_p16(poly16_t value); // VDUP.16 d0,r0
12337*80a68eefSBob Badour #define vmov_n_p16 vdup_n_s16
12338*80a68eefSBob Badour 
12339*80a68eefSBob Badour _NEON2SSESTORAGE float32x2_t vmov_n_f32(float32_t value); // VDUP.32 d0,r0
12340*80a68eefSBob Badour #define vmov_n_f32 vdup_n_f32
12341*80a68eefSBob Badour 
12342*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vmovq_n_u8(uint8_t value); // VDUP.8 q0,r0
12343*80a68eefSBob Badour #define vmovq_n_u8 vdupq_n_u8
12344*80a68eefSBob Badour 
12345*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vmovq_n_u16(uint16_t value); // VDUP.16 q0,r0
12346*80a68eefSBob Badour #define vmovq_n_u16 vdupq_n_s16
12347*80a68eefSBob Badour 
12348*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vmovq_n_u32(uint32_t value); // VDUP.32 q0,r0
12349*80a68eefSBob Badour #define vmovq_n_u32 vdupq_n_u32
12350*80a68eefSBob Badour 
12351*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t vmovq_n_s8(int8_t value); // VDUP.8 q0,r0
12352*80a68eefSBob Badour #define vmovq_n_s8 vdupq_n_s8
12353*80a68eefSBob Badour 
12354*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vmovq_n_s16(int16_t value); // VDUP.16 q0,r0
12355*80a68eefSBob Badour #define vmovq_n_s16 vdupq_n_s16
12356*80a68eefSBob Badour 
12357*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vmovq_n_s32(int32_t value); // VDUP.32 q0,r0
12358*80a68eefSBob Badour #define vmovq_n_s32 vdupq_n_s32
12359*80a68eefSBob Badour 
12360*80a68eefSBob Badour _NEON2SSESTORAGE poly8x16_t vmovq_n_p8(poly8_t value); // VDUP.8 q0,r0
12361*80a68eefSBob Badour #define vmovq_n_p8 vdupq_n_u8
12362*80a68eefSBob Badour 
12363*80a68eefSBob Badour _NEON2SSESTORAGE poly16x8_t vmovq_n_p16(poly16_t value); // VDUP.16 q0,r0
12364*80a68eefSBob Badour #define vmovq_n_p16 vdupq_n_s16
12365*80a68eefSBob Badour 
12366*80a68eefSBob Badour _NEON2SSESTORAGE float32x4_t vmovq_n_f32(float32_t value); // VDUP.32 q0,r0
12367*80a68eefSBob Badour #define vmovq_n_f32 vdupq_n_f32
12368*80a68eefSBob Badour 
12369*80a68eefSBob Badour _NEON2SSESTORAGE int64x1_t vmov_n_s64(int64_t value); // VMOV d0,r0,r0
12370*80a68eefSBob Badour #define vmov_n_s64 vdup_n_s64
12371*80a68eefSBob Badour 
12372*80a68eefSBob Badour _NEON2SSESTORAGE uint64x1_t vmov_n_u64(uint64_t value); // VMOV d0,r0,r0
12373*80a68eefSBob Badour #define vmov_n_u64 vdup_n_u64
12374*80a68eefSBob Badour 
12375*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vmovq_n_s64(int64_t value); // VMOV d0,r0,r0
12376*80a68eefSBob Badour #define vmovq_n_s64 vdupq_n_s64
12377*80a68eefSBob Badour 
12378*80a68eefSBob Badour _NEON2SSESTORAGE uint64x2_t vmovq_n_u64(uint64_t value); // VMOV d0,r0,r0
12379*80a68eefSBob Badour #define vmovq_n_u64 vdupq_n_u64
12380*80a68eefSBob Badour 
12381*80a68eefSBob Badour //**************Set all lanes to the value of one lane of a vector *************
12382*80a68eefSBob Badour //****************************************************************************
12383*80a68eefSBob Badour //here a shuffle is a better solution than lane extraction followed by a set1 function
12384*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vdup_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0]
12385*80a68eefSBob Badour _NEON2SSE_INLINE uint8x8_t vdup_lane_u8(uint8x8_t vec, __constrange(0,7) int lane)
12386*80a68eefSBob Badour {
12387*80a68eefSBob Badour     uint8x8_t res;
12388*80a68eefSBob Badour     uint8_t valane;
12389*80a68eefSBob Badour     int i = 0;
12390*80a68eefSBob Badour     valane = vec.m64_u8[lane];
12391*80a68eefSBob Badour     for (i = 0; i<8; i++) {
12392*80a68eefSBob Badour         res.m64_u8[i] = valane;
12393*80a68eefSBob Badour     }
12394*80a68eefSBob Badour     return res;
12395*80a68eefSBob Badour }
12396*80a68eefSBob Badour 
12397*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vdup_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0]
12398*80a68eefSBob Badour _NEON2SSE_INLINE uint16x4_t vdup_lane_u16(uint16x4_t vec, __constrange(0,3) int lane)
12399*80a68eefSBob Badour {
12400*80a68eefSBob Badour     uint16x4_t res;
12401*80a68eefSBob Badour     uint16_t valane;
12402*80a68eefSBob Badour     valane = vec.m64_u16[lane];
12403*80a68eefSBob Badour     res.m64_u16[0] = valane;
12404*80a68eefSBob Badour     res.m64_u16[1] = valane;
12405*80a68eefSBob Badour     res.m64_u16[2] = valane;
12406*80a68eefSBob Badour     res.m64_u16[3] = valane;
12407*80a68eefSBob Badour     return res;
12408*80a68eefSBob Badour }
12409*80a68eefSBob Badour 
12410*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vdup_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0]
12411*80a68eefSBob Badour _NEON2SSE_INLINE uint32x2_t vdup_lane_u32(uint32x2_t vec, __constrange(0,1) int lane)
12412*80a68eefSBob Badour {
12413*80a68eefSBob Badour     uint32x2_t res;
12414*80a68eefSBob Badour     res.m64_u32[0] = vec.m64_u32[lane];
12415*80a68eefSBob Badour     res.m64_u32[1] = res.m64_u32[0];
12416*80a68eefSBob Badour     return res;
12417*80a68eefSBob Badour }
12418*80a68eefSBob Badour 
12419*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vdup_lane_s8(int8x8_t vec,  __constrange(0,7) int lane); // VDUP.8 d0,d0[0]
12420*80a68eefSBob Badour #define vdup_lane_s8 vdup_lane_u8
12421*80a68eefSBob Badour 
12422*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vdup_lane_s16(int16x4_t vec,  __constrange(0,3) int lane); // VDUP.16 d0,d0[0]
12423*80a68eefSBob Badour #define vdup_lane_s16 vdup_lane_u16
12424*80a68eefSBob Badour 
12425*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vdup_lane_s32(int32x2_t vec,  __constrange(0,1) int lane); // VDUP.32 d0,d0[0]
12426*80a68eefSBob Badour #define vdup_lane_s32 vdup_lane_u32
12427*80a68eefSBob Badour 
12428*80a68eefSBob Badour _NEON2SSESTORAGE poly8x8_t vdup_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0]
12429*80a68eefSBob Badour #define vdup_lane_p8 vdup_lane_u8
12430*80a68eefSBob Badour 
12431*80a68eefSBob Badour _NEON2SSESTORAGE poly16x4_t vdup_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0]
12432*80a68eefSBob Badour #define vdup_lane_p16 vdup_lane_s16
12433*80a68eefSBob Badour 
12434*80a68eefSBob Badour _NEON2SSESTORAGE float32x2_t vdup_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0]
12435*80a68eefSBob Badour _NEON2SSE_INLINE float32x2_t vdup_lane_f32(float32x2_t vec, __constrange(0,1) int lane)
12436*80a68eefSBob Badour {
12437*80a68eefSBob Badour     float32x2_t res;
12438*80a68eefSBob Badour     res.m64_f32[0] = vec.m64_f32[lane];
12439*80a68eefSBob Badour     res.m64_f32[1] = res.m64_f32[0];
12440*80a68eefSBob Badour     return res;
12441*80a68eefSBob Badour }
12442*80a68eefSBob Badour 
12443*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vdupq_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0]
12444*80a68eefSBob Badour _NEON2SSE_INLINE uint8x16_t vdupq_lane_u8(uint8x8_t vec, __constrange(0,7) int lane) // VDUP.8 q0,d0[0]
12445*80a68eefSBob Badour {
12446*80a68eefSBob Badour     const int8_t lane8 = (int8_t) lane;
12447*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 int8_t lanemask8[16] = {lane8, lane8, lane8, lane8, lane8, lane8, lane8, lane8, lane8, lane8, lane8, lane8, lane8, lane8, lane8, lane8};
12448*80a68eefSBob Badour     return _mm_shuffle_epi8 (_pM128i(vec), *(__m128i*) lanemask8);
12449*80a68eefSBob Badour }
12450*80a68eefSBob Badour 
12451*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vdupq_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0]
12452*80a68eefSBob Badour _NEON2SSE_INLINE uint16x8_t vdupq_lane_u16(uint16x4_t vec, __constrange(0,3) int lane) // VDUP.16 q0,d0[0]
12453*80a68eefSBob Badour {
12454*80a68eefSBob Badour     //we could use 8bit shuffle for 16 bit as well
12455*80a68eefSBob Badour     const int8_t lane16 = ((int8_t) lane) << 1;
12456*80a68eefSBob Badour     const int8_t lane16_1 = lane16 + 1;
12457*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 int8_t lanemask_e16[16] = {lane16, lane16_1, lane16, lane16_1, lane16, lane16_1, lane16, lane16_1,
12458*80a68eefSBob Badour                                                 lane16, lane16_1, lane16, lane16_1, lane16, lane16_1, lane16, lane16_1};
12459*80a68eefSBob Badour     return _mm_shuffle_epi8 (_pM128i(vec), *(__m128i*)lanemask_e16);
12460*80a68eefSBob Badour }
12461*80a68eefSBob Badour 
12462*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vdupq_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0]
12463*80a68eefSBob Badour _NEON2SSE_INLINE uint32x4_t vdupq_lane_u32(uint32x2_t vec, __constrange(0,1) int lane)
12464*80a68eefSBob Badour {
12465*80a68eefSBob Badour     //need to use function not macro to make it gcc friendly and meet the immediate const requirement for _mm_shuffle_epi32
12466*80a68eefSBob Badour     if (lane == 1)
12467*80a68eefSBob Badour         return _mm_shuffle_epi32 (_pM128i(vec), (1 | (1 << 2) | (1 << 4) | (1 << 6)) );
12468*80a68eefSBob Badour     else
12469*80a68eefSBob Badour         return _mm_shuffle_epi32 (_pM128i(vec), 0);
12470*80a68eefSBob Badour }
12471*80a68eefSBob Badour 
12472*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t vdupq_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0]
12473*80a68eefSBob Badour #define vdupq_lane_s8 vdupq_lane_u8
12474*80a68eefSBob Badour 
12475*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vdupq_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0]
12476*80a68eefSBob Badour #define vdupq_lane_s16 vdupq_lane_u16
12477*80a68eefSBob Badour 
12478*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vdupq_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0]
12479*80a68eefSBob Badour #define vdupq_lane_s32 vdupq_lane_u32
12480*80a68eefSBob Badour 
12481*80a68eefSBob Badour _NEON2SSESTORAGE poly8x16_t vdupq_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0]
12482*80a68eefSBob Badour #define vdupq_lane_p8 vdupq_lane_u8
12483*80a68eefSBob Badour 
12484*80a68eefSBob Badour _NEON2SSESTORAGE poly16x8_t vdupq_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0]
12485*80a68eefSBob Badour #define vdupq_lane_p16 vdupq_lane_s16
12486*80a68eefSBob Badour 
12487*80a68eefSBob Badour _NEON2SSESTORAGE float32x4_t vdupq_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0]
12488*80a68eefSBob Badour #define  vdupq_lane_f32(vec, lane)  _mm_load1_ps((vec.m64_f32 + lane))
12489*80a68eefSBob Badour 
12490*80a68eefSBob Badour _NEON2SSESTORAGE int64x1_t vdup_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV d0,d0
12491*80a68eefSBob Badour #define vdup_lane_s64(vec,lane) vec
12492*80a68eefSBob Badour 
12493*80a68eefSBob Badour _NEON2SSESTORAGE uint64x1_t vdup_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV d0,d0
12494*80a68eefSBob Badour #define vdup_lane_u64(vec,lane) vec
12495*80a68eefSBob Badour 
12496*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vdupq_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV q0,q0
12497*80a68eefSBob Badour _NEON2SSE_INLINE int64x2_t vdupq_lane_s64(int64x1_t vec, __constrange(0,0) int lane)
12498*80a68eefSBob Badour {
12499*80a68eefSBob Badour     __m128i vec128;
12500*80a68eefSBob Badour     vec128 = _pM128i(vec);
12501*80a68eefSBob Badour     return _mm_unpacklo_epi64(vec128,vec128);
12502*80a68eefSBob Badour }
12503*80a68eefSBob Badour 
12504*80a68eefSBob Badour _NEON2SSESTORAGE uint64x2_t vdupq_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV q0,q0
12505*80a68eefSBob Badour #define vdupq_lane_u64 vdupq_lane_s64
12506*80a68eefSBob Badour 
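//Illustrative usage sketch (not part of the original header; names are hypothetical):
//duplicating one compile-time-constant lane of a d register across a whole vector.
//    uint32x2_t  dv = vdup_n_u32(7);
//    uint32x4_t  qv = vdupq_lane_u32(dv, 1);  // all four lanes take dv lane 1, i.e. 7
//    float32x2_t fv = vdup_n_f32(3.0f);
//    float32x4_t qf = vdupq_lane_f32(fv, 0);  // _mm_load1_ps broadcast of lane 0
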
12507*80a68eefSBob Badour // ********************************************************************
12508*80a68eefSBob Badour // ********************  Combining vectors *****************************
12509*80a68eefSBob Badour // ********************************************************************
12510*80a68eefSBob Badour //These intrinsics join two 64 bit vectors into a single 128bit vector.
12511*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t   vcombine_s8(int8x8_t low, int8x8_t high); // VMOV d0,d0
12512*80a68eefSBob Badour _NEON2SSE_INLINE int8x16_t  vcombine_s8(int8x8_t low, int8x8_t high)
12513*80a68eefSBob Badour {
12514*80a68eefSBob Badour    return _mm_unpacklo_epi64 (_pM128i(low), _pM128i(high) );
12515*80a68eefSBob Badour }
12516*80a68eefSBob Badour 
12517*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t   vcombine_s16(int16x4_t low, int16x4_t high); // VMOV d0,d0
12518*80a68eefSBob Badour #define vcombine_s16 vcombine_s8
12519*80a68eefSBob Badour 
12520*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t   vcombine_s32(int32x2_t low, int32x2_t high); // VMOV d0,d0
12521*80a68eefSBob Badour #define vcombine_s32 vcombine_s8
12522*80a68eefSBob Badour 
12523*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t   vcombine_s64(int64x1_t low, int64x1_t high); // VMOV d0,d0
12524*80a68eefSBob Badour #define vcombine_s64 vcombine_s8
12525*80a68eefSBob Badour 
12526*80a68eefSBob Badour _NEON2SSESTORAGE float16x8_t vcombine_f16(float16x4_t low, float16x4_t high); // VMOV d0,d0
12527*80a68eefSBob Badour //current IA SIMD doesn't support float16
12528*80a68eefSBob Badour 
12529*80a68eefSBob Badour _NEON2SSESTORAGE float32x4_t vcombine_f32(float32x2_t low, float32x2_t high); // VMOV d0,d0
12530*80a68eefSBob Badour _NEON2SSE_INLINE float32x4_t vcombine_f32(float32x2_t low, float32x2_t high)
12531*80a68eefSBob Badour {
12532*80a68eefSBob Badour     __m128i res;
12533*80a68eefSBob Badour     res = _mm_unpacklo_epi64(_pM128i(low), _pM128i(high) );
12534*80a68eefSBob Badour     return _M128(res);
12535*80a68eefSBob Badour }
12536*80a68eefSBob Badour 
12537*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t   vcombine_u8(uint8x8_t low, uint8x8_t high); // VMOV d0,d0
12538*80a68eefSBob Badour #define vcombine_u8 vcombine_s8
12539*80a68eefSBob Badour 
12540*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t   vcombine_u16(uint16x4_t low, uint16x4_t high); // VMOV d0,d0
12541*80a68eefSBob Badour #define vcombine_u16 vcombine_s16
12542*80a68eefSBob Badour 
12543*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t   vcombine_u32(uint32x2_t low, uint32x2_t high); // VMOV d0,d0
12544*80a68eefSBob Badour #define vcombine_u32 vcombine_s32
12545*80a68eefSBob Badour 
12546*80a68eefSBob Badour _NEON2SSESTORAGE uint64x2_t   vcombine_u64(uint64x1_t low, uint64x1_t high); // VMOV d0,d0
12547*80a68eefSBob Badour #define vcombine_u64 vcombine_s64
12548*80a68eefSBob Badour 
12549*80a68eefSBob Badour _NEON2SSESTORAGE poly8x16_t   vcombine_p8(poly8x8_t low, poly8x8_t high); // VMOV d0,d0
12550*80a68eefSBob Badour #define vcombine_p8 vcombine_u8
12551*80a68eefSBob Badour 
12552*80a68eefSBob Badour _NEON2SSESTORAGE poly16x8_t   vcombine_p16(poly16x4_t low, poly16x4_t high); // VMOV d0,d0
12553*80a68eefSBob Badour #define vcombine_p16 vcombine_u16
12554*80a68eefSBob Badour 
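//Illustrative usage sketch (not part of the original header; names are hypothetical):
//the low argument goes to the low 64 bits of the resulting q register, high to the upper 64 bits.
//    float32x2_t lo = vdup_n_f32(1.0f);
//    float32x2_t hi = vdup_n_f32(2.0f);
//    float32x4_t q  = vcombine_f32(lo, hi);   // lanes: {1.0f, 1.0f, 2.0f, 2.0f}
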
12555*80a68eefSBob Badour //**********************************************************************
12556*80a68eefSBob Badour //************************* Splitting vectors **************************
12557*80a68eefSBob Badour //**********************************************************************
12558*80a68eefSBob Badour //**************** Get high part ******************************************
12559*80a68eefSBob Badour //These intrinsics split a 128 bit vector into 2 component 64 bit vectors
12560*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vget_high_s8(int8x16_t a); // VMOV d0,d0
12561*80a68eefSBob Badour _NEON2SSE_INLINE int8x8_t vget_high_s8(int8x16_t a)
12562*80a68eefSBob Badour {
12563*80a68eefSBob Badour     int8x8_t res64;
12564*80a68eefSBob Badour     __m128i res;
12565*80a68eefSBob Badour     res = _mm_unpackhi_epi64(a,a); //SSE2
12566*80a68eefSBob Badour     return64(res);
12567*80a68eefSBob Badour }
12568*80a68eefSBob Badour 
12569*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vget_high_s16(int16x8_t a); // VMOV d0,d0
12570*80a68eefSBob Badour _NEON2SSE_INLINE int16x4_t vget_high_s16(int16x8_t a)
12571*80a68eefSBob Badour {
12572*80a68eefSBob Badour     int16x4_t res64;
12573*80a68eefSBob Badour     __m128i res;
12574*80a68eefSBob Badour     res =  _mm_unpackhi_epi64(a,a); //SSE2
12575*80a68eefSBob Badour     return64(res);
12576*80a68eefSBob Badour }
12577*80a68eefSBob Badour 
12578*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vget_high_s32(int32x4_t a); // VMOV d0,d0
12579*80a68eefSBob Badour _NEON2SSE_INLINE int32x2_t vget_high_s32(int32x4_t a)
12580*80a68eefSBob Badour {
12581*80a68eefSBob Badour     int32x2_t res64;
12582*80a68eefSBob Badour     __m128i res;
12583*80a68eefSBob Badour     res =  _mm_unpackhi_epi64(a,a); //SSE2
12584*80a68eefSBob Badour     return64(res);
12585*80a68eefSBob Badour }
12586*80a68eefSBob Badour 
12587*80a68eefSBob Badour _NEON2SSESTORAGE int64x1_t vget_high_s64(int64x2_t a); // VMOV d0,d0
12588*80a68eefSBob Badour _NEON2SSE_INLINE int64x1_t vget_high_s64(int64x2_t a)
12589*80a68eefSBob Badour {
12590*80a68eefSBob Badour     int64x1_t res64;
12591*80a68eefSBob Badour     __m128i res;
12592*80a68eefSBob Badour     res =  _mm_unpackhi_epi64(a,a); //SSE2
12593*80a68eefSBob Badour     return64(res);
12594*80a68eefSBob Badour }
12595*80a68eefSBob Badour 
12596*80a68eefSBob Badour _NEON2SSESTORAGE float16x4_t vget_high_f16(float16x8_t a); // VMOV d0,d0
12597*80a68eefSBob Badour // IA32 SIMD doesn't work with 16bit floats currently
12598*80a68eefSBob Badour 
12599*80a68eefSBob Badour _NEON2SSESTORAGE float32x2_t vget_high_f32(float32x4_t a); // VMOV d0,d0
12600*80a68eefSBob Badour _NEON2SSE_INLINE float32x2_t vget_high_f32(float32x4_t a)
12601*80a68eefSBob Badour {
12602*80a68eefSBob Badour     __m128i res;
12603*80a68eefSBob Badour     __m64_128 res64;
12604*80a68eefSBob Badour     res = _mm_unpackhi_epi64(_M128i(a),_M128i(a));
12605*80a68eefSBob Badour     return64(res);
12606*80a68eefSBob Badour }
12607*80a68eefSBob Badour 
12608*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vget_high_u8(uint8x16_t a); // VMOV d0,d0
12609*80a68eefSBob Badour #define vget_high_u8 vget_high_s8
12610*80a68eefSBob Badour 
12611*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vget_high_u16(uint16x8_t a); // VMOV d0,d0
12612*80a68eefSBob Badour #define vget_high_u16 vget_high_s16
12613*80a68eefSBob Badour 
12614*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vget_high_u32(uint32x4_t a); // VMOV d0,d0
12615*80a68eefSBob Badour #define vget_high_u32 vget_high_s32
12616*80a68eefSBob Badour 
12617*80a68eefSBob Badour _NEON2SSESTORAGE uint64x1_t vget_high_u64(uint64x2_t a); // VMOV d0,d0
12618*80a68eefSBob Badour #define vget_high_u64 vget_high_s64
12619*80a68eefSBob Badour 
12620*80a68eefSBob Badour _NEON2SSESTORAGE poly8x8_t vget_high_p8(poly8x16_t a); // VMOV d0,d0
12621*80a68eefSBob Badour #define vget_high_p8 vget_high_u8
12622*80a68eefSBob Badour 
12623*80a68eefSBob Badour _NEON2SSESTORAGE poly16x4_t vget_high_p16(poly16x8_t a); // VMOV d0,d0
12624*80a68eefSBob Badour #define vget_high_p16 vget_high_u16
12625*80a68eefSBob Badour 
12626*80a68eefSBob Badour //********************** Get low part **********************
12627*80a68eefSBob Badour //**********************************************************
12628*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vget_low_s8(int8x16_t a); // VMOV d0,d0
12629*80a68eefSBob Badour _NEON2SSE_INLINE int8x8_t vget_low_s8(int8x16_t a) // VMOV d0,d0
12630*80a68eefSBob Badour {
12631*80a68eefSBob Badour     int16x4_t res64;
12632*80a68eefSBob Badour     return64(a);
12633*80a68eefSBob Badour }
12634*80a68eefSBob Badour 
12635*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vget_low_s16(int16x8_t a); // VMOV d0,d0
12636*80a68eefSBob Badour _NEON2SSE_INLINE int16x4_t vget_low_s16(int16x8_t a) // VMOV d0,d0
12637*80a68eefSBob Badour {
12638*80a68eefSBob Badour     int16x4_t res64;
12639*80a68eefSBob Badour     return64(a);
12640*80a68eefSBob Badour }
12641*80a68eefSBob Badour 
12642*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vget_low_s32(int32x4_t a); // VMOV d0,d0
12643*80a68eefSBob Badour _NEON2SSE_INLINE int32x2_t vget_low_s32(int32x4_t a) // VMOV d0,d0
12644*80a68eefSBob Badour {
12645*80a68eefSBob Badour     int32x2_t res64;
12646*80a68eefSBob Badour     return64(a);
12647*80a68eefSBob Badour }
12648*80a68eefSBob Badour 
12649*80a68eefSBob Badour _NEON2SSESTORAGE int64x1_t vget_low_s64(int64x2_t a); // VMOV d0,d0
12650*80a68eefSBob Badour _NEON2SSE_INLINE int64x1_t vget_low_s64(int64x2_t a) // VMOV d0,d0
12651*80a68eefSBob Badour {
12652*80a68eefSBob Badour     int64x1_t res64;
12653*80a68eefSBob Badour     return64 (a);
12654*80a68eefSBob Badour }
12655*80a68eefSBob Badour 
12656*80a68eefSBob Badour _NEON2SSESTORAGE float16x4_t vget_low_f16(float16x8_t a); // VMOV d0,d0
12657*80a68eefSBob Badour // IA32 SIMD doesn't work with 16bit floats currently
12658*80a68eefSBob Badour 
12659*80a68eefSBob Badour _NEON2SSESTORAGE float32x2_t vget_low_f32(float32x4_t a); // VMOV d0,d0
12660*80a68eefSBob Badour _NEON2SSE_INLINE float32x2_t vget_low_f32(float32x4_t a)
12661*80a68eefSBob Badour {
12662*80a68eefSBob Badour     float32x2_t res64;
12663*80a68eefSBob Badour     _M64f(res64, a);
12664*80a68eefSBob Badour     return res64;
12665*80a68eefSBob Badour }
12666*80a68eefSBob Badour 
12667*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vget_low_u8(uint8x16_t a); // VMOV d0,d0
12668*80a68eefSBob Badour #define vget_low_u8 vget_low_s8
12669*80a68eefSBob Badour 
12670*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vget_low_u16(uint16x8_t a); // VMOV d0,d0
12671*80a68eefSBob Badour #define vget_low_u16 vget_low_s16
12672*80a68eefSBob Badour 
12673*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vget_low_u32(uint32x4_t a); // VMOV d0,d0
12674*80a68eefSBob Badour #define vget_low_u32 vget_low_s32
12675*80a68eefSBob Badour 
12676*80a68eefSBob Badour _NEON2SSESTORAGE uint64x1_t vget_low_u64(uint64x2_t a); // VMOV d0,d0
12677*80a68eefSBob Badour #define vget_low_u64 vget_low_s64
12678*80a68eefSBob Badour 
12679*80a68eefSBob Badour _NEON2SSESTORAGE poly8x8_t vget_low_p8(poly8x16_t a); // VMOV d0,d0
12680*80a68eefSBob Badour #define vget_low_p8 vget_low_u8
12681*80a68eefSBob Badour 
12682*80a68eefSBob Badour _NEON2SSESTORAGE poly16x4_t vget_low_p16(poly16x8_t a); // VMOV d0,d0
12683*80a68eefSBob Badour #define vget_low_p16 vget_low_s16
12684*80a68eefSBob Badour 
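//Illustrative usage sketch (not part of the original header; names are hypothetical):
//splitting a q register into its two d halves and reassembling it with vcombine.
//    float32x4_t q  = vdupq_n_f32(4.0f);
//    float32x2_t lo = vget_low_f32(q);        // lanes 0..1
//    float32x2_t hi = vget_high_f32(q);       // lanes 2..3
//    float32x4_t q2 = vcombine_f32(lo, hi);   // equal to the original q
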
12685*80a68eefSBob Badour //**************************************************************************
12686*80a68eefSBob Badour //************************ Converting vectors **********************************
12687*80a68eefSBob Badour //**************************************************************************
12688*80a68eefSBob Badour //************* Convert from float ***************************************
12689*80a68eefSBob Badour // the rounding mode needs to be set accordingly via _MM_SET_ROUNDING_MODE(x)
12690*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t   vcvt_s32_f32(float32x2_t a); // VCVT.S32.F32 d0, d0
12691*80a68eefSBob Badour _NEON2SSE_INLINE int32x2_t   vcvt_s32_f32(float32x2_t a)
12692*80a68eefSBob Badour {
12693*80a68eefSBob Badour     int32x2_t res64;
12694*80a68eefSBob Badour     __m128i res;
12695*80a68eefSBob Badour     res =  _mm_cvtps_epi32(_pM128(a)); //use low 64 bits of result only
12696*80a68eefSBob Badour     return64(res);
12697*80a68eefSBob Badour }
12698*80a68eefSBob Badour 
12699*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vcvt_u32_f32(float32x2_t a); // VCVT.U32.F32 d0, d0
12700*80a68eefSBob Badour _NEON2SSE_INLINE uint32x2_t vcvt_u32_f32(float32x2_t a)
12701*80a68eefSBob Badour {
12702*80a68eefSBob Badour     uint32x2_t res64;
12703*80a68eefSBob Badour     __m128i res;
12704*80a68eefSBob Badour     res = vcvtq_u32_f32(_pM128(a));
12705*80a68eefSBob Badour     return64(res);
12706*80a68eefSBob Badour }
12707*80a68eefSBob Badour 
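//Illustrative sketch (not part of the original header) of the rounding-mode note above: the
//d-register converts map to _mm_cvtps_epi32 and follow the current MXCSR mode, so a caller who
//needs ARM's round-toward-zero behaviour could switch the mode around the call.
//    unsigned int saved = _MM_GET_ROUNDING_MODE();
//    _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
//    int32x2_t r = vcvt_s32_f32(some_d_register);   // some_d_register is a hypothetical float32x2_t
//    _MM_SET_ROUNDING_MODE(saved);
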
12708*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t  vcvtq_s32_f32(float32x4_t a); // VCVT.S32.F32 q0, q0
12709*80a68eefSBob Badour _NEON2SSE_INLINE int32x4_t  vcvtq_s32_f32(float32x4_t a)
12710*80a68eefSBob Badour {
12711*80a68eefSBob Badour     __m128 dif;
12712*80a68eefSBob Badour     __m128i res;
12713*80a68eefSBob Badour     //_mm_cvttps_epi32 handles the case a >= 2.14748364e+009 incorrectly, therefore special processing is necessary
12714*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 static const float32_t fmax[] = { 2.14748364e+009f, 2.14748364e+009f, 2.14748364e+009f, 2.14748364e+009f };
12715*80a68eefSBob Badour     dif = _mm_cmpge_ps(a, *(__m128*)fmax);
12716*80a68eefSBob Badour     res = _mm_cvttps_epi32(a);
12717*80a68eefSBob Badour     return _mm_xor_si128(res, _M128i(dif));
12718*80a68eefSBob Badour }
12719*80a68eefSBob Badour 
12720*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vcvtq_u32_f32(float32x4_t a); // VCVT.U32.F32 q0, q0
12721*80a68eefSBob Badour _NEON2SSE_INLINE uint32x4_t vcvtq_u32_f32(float32x4_t a) // VCVT.U32.F32 q0, q0
12722*80a68eefSBob Badour {
12723*80a68eefSBob Badour     //No single-instruction SSE solution, but we could implement it as follows:
12724*80a68eefSBob Badour     __m128i res1, res2, zero, mask;
12725*80a68eefSBob Badour     __m128  max, min, dif;
12726*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 static const float32_t fmax[] = { 2.14748364e+009f, 2.14748364e+009f, 2.14748364e+009f, 2.14748364e+009f };
12727*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 static const float32_t fmax_unsigned[] = { 4.29496729e+009f, 4.29496729e+009f, 4.29496729e+009f, 4.29496729e+009f };
12728*80a68eefSBob Badour     zero = _mm_setzero_si128();
12729*80a68eefSBob Badour     mask = _mm_cmpgt_epi32(_M128i(a), zero);
12730*80a68eefSBob Badour     min = _mm_and_ps(_M128(mask), a);
12731*80a68eefSBob Badour     max = _mm_min_ps(min, *(__m128*)fmax_unsigned); //clamped to 0 - 4.29496729e+009
12732*80a68eefSBob Badour 
12733*80a68eefSBob Badour     dif = _mm_sub_ps(max, *(__m128*)fmax);
12734*80a68eefSBob Badour     mask = _mm_cmpgt_epi32(_M128i(dif),zero);
12735*80a68eefSBob Badour     dif = _mm_and_ps(_M128(mask), dif);
12736*80a68eefSBob Badour 
12737*80a68eefSBob Badour     res1 = _mm_cvttps_epi32(dif);
12738*80a68eefSBob Badour     res2 = vcvtq_s32_f32(max);
12739*80a68eefSBob Badour     return _mm_add_epi32(res1, res2);
12740*80a68eefSBob Badour }
12741*80a68eefSBob Badour 
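//Illustrative usage sketch (not part of the original header; names are hypothetical), limited to
//inputs below 2^31 where the clamping path above is not triggered and the result is plain truncation:
//    float32x4_t f = _mm_setr_ps(0.0f, 1.9f, 100.7f, 2000000000.0f);
//    uint32x4_t  u = vcvtq_u32_f32(f);        // {0, 1, 100, 2000000000}
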
12742*80a68eefSBob Badour // ***** Convert to fixed point with the number of fraction bits specified by b ***********
12743*80a68eefSBob Badour //*************************************************************************************************
12744*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vcvt_n_s32_f32(float32x2_t a, __constrange(1,32) int b); // VCVT.S32.F32 d0, d0, #32
12745*80a68eefSBob Badour _NEON2SSE_INLINE int32x2_t vcvt_n_s32_f32(float32x2_t a, __constrange(1,32) int b)
12746*80a68eefSBob Badour {
12747*80a68eefSBob Badour     int32x2_t res64;
12748*80a68eefSBob Badour     return64(vcvtq_n_s32_f32(_pM128(a),b));
12749*80a68eefSBob Badour }
12750*80a68eefSBob Badour 
12751*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vcvt_n_u32_f32(float32x2_t a, __constrange(1,32) int b); // VCVT.U32.F32 d0, d0, #32
12752*80a68eefSBob Badour _NEON2SSE_INLINE uint32x2_t vcvt_n_u32_f32(float32x2_t a, __constrange(1,32) int b)
12753*80a68eefSBob Badour {
12754*80a68eefSBob Badour     uint32x2_t res;
12755*80a68eefSBob Badour     float convconst;
12756*80a68eefSBob Badour     convconst = (float)((uint32_t)1 << b);
12757*80a68eefSBob Badour     res.m64_u32[0] = (uint32_t) (a.m64_f32[0] * convconst);
12758*80a68eefSBob Badour     res.m64_u32[1] = (uint32_t) (a.m64_f32[1] * convconst);
12759*80a68eefSBob Badour     return res;
12760*80a68eefSBob Badour }
12761*80a68eefSBob Badour 
12762*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vcvtq_n_s32_f32(float32x4_t a, __constrange(1,32) int b); // VCVT.S32.F32 q0, q0, #32
12763*80a68eefSBob Badour _NEON2SSE_INLINE int32x4_t vcvtq_n_s32_f32(float32x4_t a, __constrange(1,32) int b)
12764*80a68eefSBob Badour {
12765*80a68eefSBob Badour     float convconst;
12766*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 static const uint32_t cmask[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
12767*80a68eefSBob Badour     __m128 cconst128;
12768*80a68eefSBob Badour     __m128i mask, res;
12769*80a68eefSBob Badour     convconst = (float)((uint32_t)1 << b);
12770*80a68eefSBob Badour     cconst128 = vdupq_n_f32(convconst);
12771*80a68eefSBob Badour     res =  _mm_cvttps_epi32(_mm_mul_ps(a,cconst128));
12772*80a68eefSBob Badour     mask = _mm_cmpeq_epi32 (res, *(__m128i*)cmask);
12773*80a68eefSBob Badour     return _mm_xor_si128 (res,  mask); //res saturated for 0x80000000
12774*80a68eefSBob Badour }
12775*80a68eefSBob Badour 
12776*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vcvtq_n_u32_f32(float32x4_t a, __constrange(1,32) int b); // VCVT.U32.F32 q0, q0, #32
12777*80a68eefSBob Badour _NEON2SSE_INLINE uint32x4_t vcvtq_n_u32_f32(float32x4_t a, __constrange(1,32) int b)
12778*80a68eefSBob Badour {
12779*80a68eefSBob Badour     float convconst;
12780*80a68eefSBob Badour     __m128 cconst128;
12781*80a68eefSBob Badour     convconst = (float)((uint32_t)1 << b);
12782*80a68eefSBob Badour     cconst128 = vdupq_n_f32(convconst);
12783*80a68eefSBob Badour     return vcvtq_u32_f32(_mm_mul_ps(a,cconst128));
12784*80a68eefSBob Badour }
12785*80a68eefSBob Badour 
12786*80a68eefSBob Badour 
12787*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vcvtnq_s32_f32(float32x4_t a); // VCVTN.S32.F32 q0, q0
12788*80a68eefSBob Badour _NEON2SSE_INLINE int32x4_t vcvtnq_s32_f32(float32x4_t a)
12789*80a68eefSBob Badour {
12790*80a68eefSBob Badour   return _mm_cvtps_epi32(a);
12791*80a68eefSBob Badour }
12792*80a68eefSBob Badour 
12793*80a68eefSBob Badour //***************** Convert to float *************************
12794*80a68eefSBob Badour //*************************************************************
12795*80a68eefSBob Badour _NEON2SSESTORAGE float32x2_t vcvt_f32_s32(int32x2_t a); // VCVT.F32.S32 d0, d0
12796*80a68eefSBob Badour _NEON2SSE_INLINE float32x2_t vcvt_f32_s32(int32x2_t a) //use low 64 bits
12797*80a68eefSBob Badour {
12798*80a68eefSBob Badour     float32x2_t res;
12799*80a68eefSBob Badour     res.m64_f32[0] = (float) a.m64_i32[0];
12800*80a68eefSBob Badour     res.m64_f32[1] = (float) a.m64_i32[1];
12801*80a68eefSBob Badour     return res;
12802*80a68eefSBob Badour }
12803*80a68eefSBob Badour 
12804*80a68eefSBob Badour _NEON2SSESTORAGE float32x2_t vcvt_f32_u32(uint32x2_t a); // VCVT.F32.U32 d0, d0
12805*80a68eefSBob Badour _NEON2SSE_INLINE float32x2_t vcvt_f32_u32(uint32x2_t a)
12806*80a68eefSBob Badour {
12807*80a68eefSBob Badour     float32x2_t res;
12808*80a68eefSBob Badour     res.m64_f32[0] = (float) a.m64_u32[0];
12809*80a68eefSBob Badour     res.m64_f32[1] = (float) a.m64_u32[1];
12810*80a68eefSBob Badour     return res;
12811*80a68eefSBob Badour }
12812*80a68eefSBob Badour 
12813*80a68eefSBob Badour _NEON2SSESTORAGE float32x4_t vcvtq_f32_s32(int32x4_t a); // VCVT.F32.S32 q0, q0
12814*80a68eefSBob Badour #define vcvtq_f32_s32(a) _mm_cvtepi32_ps(a)
12815*80a68eefSBob Badour 
12816*80a68eefSBob Badour _NEON2SSESTORAGE float32x4_t vcvtq_f32_u32(uint32x4_t a); // VCVT.F32.U32 q0, q0
12817*80a68eefSBob Badour _NEON2SSE_INLINE float32x4_t vcvtq_f32_u32(uint32x4_t a) // VCVT.F32.U32 q0, q0
12818*80a68eefSBob Badour {
12819*80a68eefSBob Badour     //the solution may not be optimal
12820*80a68eefSBob Badour     __m128 two16, fHi, fLo;
12821*80a68eefSBob Badour     __m128i hi, lo;
12822*80a68eefSBob Badour     two16 = _mm_set1_ps((float)0x10000); //2^16
12823*80a68eefSBob Badour     // Avoid double rounding by doing two exact conversions
12824*80a68eefSBob Badour     // of high and low 16-bit segments
12825*80a68eefSBob Badour     hi = _mm_srli_epi32(a, 16);
12826*80a68eefSBob Badour     lo = _mm_srli_epi32(_mm_slli_epi32(a, 16), 16);
12827*80a68eefSBob Badour     fHi = _mm_mul_ps(_mm_cvtepi32_ps(hi), two16);
12828*80a68eefSBob Badour     fLo = _mm_cvtepi32_ps(lo);
12829*80a68eefSBob Badour     // do single rounding according to current rounding mode
12830*80a68eefSBob Badour     return _mm_add_ps(fHi, fLo);
12831*80a68eefSBob Badour }
12832*80a68eefSBob Badour 
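//Worked example (added note, not part of the original header) of the split above: a lane holding
//0x80000001 (2147483649) gives hi = 0x8000 and lo = 0x0001; fHi = 32768.0f * 65536.0f = 2147483648.0f
//exactly and fLo = 1.0f, so the single final add rounds once to 2147483648.0f, the float nearest to
//the exact value - which is why the two-part conversion avoids double rounding.
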
12833*80a68eefSBob Badour // ***** Convert to float from fixed point with the number of fraction bits specified by b ***********
12834*80a68eefSBob Badour _NEON2SSESTORAGE float32x2_t vcvt_n_f32_s32(int32x2_t a, __constrange(1,32) int b); // VCVT.F32.S32 d0, d0, #32
12835*80a68eefSBob Badour _NEON2SSE_INLINE float32x2_t vcvt_n_f32_s32(int32x2_t a, __constrange(1,32) int b)
12836*80a68eefSBob Badour {
12837*80a68eefSBob Badour     float32x2_t res;
12838*80a68eefSBob Badour     float convconst;
12839*80a68eefSBob Badour     convconst = (float)(1. / ((uint32_t)1 << b));
12840*80a68eefSBob Badour     res.m64_f32[0] =  a.m64_i32[0] * convconst;
12841*80a68eefSBob Badour     res.m64_f32[1] = a.m64_i32[1] * convconst;
12842*80a68eefSBob Badour     return res;
12843*80a68eefSBob Badour }
12844*80a68eefSBob Badour 
12845*80a68eefSBob Badour _NEON2SSESTORAGE float32x2_t vcvt_n_f32_u32(uint32x2_t a, __constrange(1,32) int b); // VCVT.F32.U32 d0, d0, #32
12846*80a68eefSBob Badour _NEON2SSE_INLINE float32x2_t vcvt_n_f32_u32(uint32x2_t a, __constrange(1,32) int b) // VCVT.F32.U32 d0, d0, #32
12847*80a68eefSBob Badour {
12848*80a68eefSBob Badour     float32x2_t res;
12849*80a68eefSBob Badour     float convconst;
12850*80a68eefSBob Badour     convconst = (float)(1. / ((uint32_t)1 << b));
12851*80a68eefSBob Badour     res.m64_f32[0] =  a.m64_u32[0] * convconst;
12852*80a68eefSBob Badour     res.m64_f32[1] = a.m64_u32[1] * convconst;
12853*80a68eefSBob Badour     return res;
12854*80a68eefSBob Badour }
12855*80a68eefSBob Badour 
12856*80a68eefSBob Badour _NEON2SSESTORAGE float32x4_t vcvtq_n_f32_s32(int32x4_t a, __constrange(1,32) int b); // VCVT.F32.S32 q0, q0, #32
12857*80a68eefSBob Badour _NEON2SSE_INLINE float32x4_t vcvtq_n_f32_s32(int32x4_t a, __constrange(1,32) int b)
12858*80a68eefSBob Badour {
12859*80a68eefSBob Badour     float convconst;
12860*80a68eefSBob Badour     __m128 cconst128, af;
12861*80a68eefSBob Badour     convconst = (float)(1. / ((uint32_t)1 << b));
12862*80a68eefSBob Badour     af = _mm_cvtepi32_ps(a);
12863*80a68eefSBob Badour     cconst128 = vdupq_n_f32(convconst);
12864*80a68eefSBob Badour     return _mm_mul_ps(af,cconst128);
12865*80a68eefSBob Badour }
12866*80a68eefSBob Badour 
12867*80a68eefSBob Badour _NEON2SSESTORAGE float32x4_t vcvtq_n_f32_u32(uint32x4_t a, __constrange(1,32) int b); // VCVT.F32.U32 q0, q0, #32
12868*80a68eefSBob Badour _NEON2SSE_INLINE float32x4_t vcvtq_n_f32_u32(uint32x4_t a, __constrange(1,32) int b)
12869*80a68eefSBob Badour {
12870*80a68eefSBob Badour     float convconst;
12871*80a68eefSBob Badour     __m128 cconst128, af;
12872*80a68eefSBob Badour     convconst = (float)(1. / ((uint32_t)1 << b));
12873*80a68eefSBob Badour     af = vcvtq_f32_u32(a);
12874*80a68eefSBob Badour     cconst128 = vdupq_n_f32(convconst);
12875*80a68eefSBob Badour     return _mm_mul_ps(af,cconst128);
12876*80a68eefSBob Badour }
12877*80a68eefSBob Badour 
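//Illustrative usage sketch (not part of the original header; names are hypothetical):
//with b = 16 these intrinsics convert between float and Q16.16 fixed point.
//    float32x4_t f    = vdupq_n_f32(1.5f);
//    int32x4_t   q16  = vcvtq_n_s32_f32(f, 16);    // 1.5 * 65536 = 98304
//    float32x4_t back = vcvtq_n_f32_s32(q16, 16);  // 98304 / 65536 = 1.5f again
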
12878*80a68eefSBob Badour //**************Convert between floats ***********************
12879*80a68eefSBob Badour //************************************************************
12880*80a68eefSBob Badour _NEON2SSESTORAGE float16x4_t vcvt_f16_f32(float32x4_t a); // VCVT.F16.F32 d0, q0
12881*80a68eefSBob Badour //Intel SIMD doesn't support 16-bit floats currently
12882*80a68eefSBob Badour 
12883*80a68eefSBob Badour _NEON2SSESTORAGE float32x4_t vcvt_f32_f16(float16x4_t a); // VCVT.F32.F16 q0, d0
12884*80a68eefSBob Badour //Intel SIMD doesn't support 16-bit floats currently, the only solution is to store 16-bit floats and load them as 32 bits
12885*80a68eefSBob Badour 
12886*80a68eefSBob Badour //************Vector narrow integer conversion (truncation) ******************
12887*80a68eefSBob Badour //****************************************************************************
12888*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vmovn_s16(int16x8_t a); // VMOVN.I16 d0,q0
12889*80a68eefSBob Badour _NEON2SSE_INLINE int8x8_t vmovn_s16(int16x8_t a) // VMOVN.I16 d0,q0
12890*80a68eefSBob Badour {
12891*80a68eefSBob Badour     int8x8_t res64;
12892*80a68eefSBob Badour     __m128i res;
12893*80a68eefSBob Badour     res = _mm_shuffle_epi8 (a, *(__m128i*) mask8_16_even_odd); //use 64 low bits only
12894*80a68eefSBob Badour     return64(res);
12895*80a68eefSBob Badour }
12896*80a68eefSBob Badour 
12897*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vmovn_s32(int32x4_t a); // VMOVN.I32 d0,q0
12898*80a68eefSBob Badour _NEON2SSE_INLINE int16x4_t vmovn_s32(int32x4_t a) // VMOVN.I32 d0,q0
12899*80a68eefSBob Badour {
12900*80a68eefSBob Badour     int16x4_t res64;
12901*80a68eefSBob Badour     __m128i res;
12902*80a68eefSBob Badour     res = _mm_shuffle_epi8 (a, *(__m128i*) mask8_32_even_odd); //use 64 low bits only
12903*80a68eefSBob Badour     return64(res);
12904*80a68eefSBob Badour }
12905*80a68eefSBob Badour 
12906*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vmovn_s64(int64x2_t a); // VMOVN.I64 d0,q0
12907*80a68eefSBob Badour _NEON2SSE_INLINE int32x2_t vmovn_s64(int64x2_t a)
12908*80a68eefSBob Badour {
12909*80a68eefSBob Badour     //may not be efficient compared with a serial implementation
12910*80a68eefSBob Badour     int32x2_t res64;
12911*80a68eefSBob Badour     __m128i res;
12912*80a68eefSBob Badour     res = _mm_shuffle_epi32 (a, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //use 64 low bits only, _MM_SHUFFLE(3, 1, 2, 0)
12913*80a68eefSBob Badour     return64(res);
12914*80a68eefSBob Badour }
12915*80a68eefSBob Badour 
12916*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vmovn_u16(uint16x8_t a); // VMOVN.I16 d0,q0
12917*80a68eefSBob Badour #define vmovn_u16 vmovn_s16
12918*80a68eefSBob Badour 
12919*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vmovn_u32(uint32x4_t a); // VMOVN.I32 d0,q0
12920*80a68eefSBob Badour #define vmovn_u32 vmovn_s32
12921*80a68eefSBob Badour 
12922*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vmovn_u64(uint64x2_t a); // VMOVN.I64 d0,q0
12923*80a68eefSBob Badour #define vmovn_u64 vmovn_s64
12924*80a68eefSBob Badour 
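//Illustrative usage sketch (not part of the original header; names are hypothetical):
//narrowing simply keeps the low half of every lane, with no saturation.
//    uint16x8_t w = vdupq_n_u16(0x1234);
//    uint8x8_t  n = vmovn_u16(w);             // each result byte is 0x34
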
12925*80a68eefSBob Badour //**************** Vector long move   ***********************
12926*80a68eefSBob Badour //***********************************************************
12927*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vmovl_s8(int8x8_t a); // VMOVL.S8 q0,d0
12928*80a68eefSBob Badour _NEON2SSE_INLINE int16x8_t vmovl_s8(int8x8_t a)
12929*80a68eefSBob Badour {
12930*80a68eefSBob Badour     return _MM_CVTEPI8_EPI16(_pM128i(a)); //SSE4.1
12931*80a68eefSBob Badour }
12932*80a68eefSBob Badour 
12933*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vmovl_s16(int16x4_t a); // VMOVL.S16 q0,d0
12934*80a68eefSBob Badour _NEON2SSE_INLINE int32x4_t vmovl_s16(int16x4_t a)
12935*80a68eefSBob Badour {
12936*80a68eefSBob Badour     return _MM_CVTEPI16_EPI32(_pM128i(a)); //SSE4.1
12937*80a68eefSBob Badour }
12938*80a68eefSBob Badour 
12939*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vmovl_s32(int32x2_t a); // VMOVL.S32 q0,d0
12940*80a68eefSBob Badour _NEON2SSE_INLINE int64x2_t  vmovl_s32(int32x2_t a)
12941*80a68eefSBob Badour {
12942*80a68eefSBob Badour     return _MM_CVTEPI32_EPI64(_pM128i(a)); //SSE4.1
12943*80a68eefSBob Badour }
12944*80a68eefSBob Badour 
12945*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vmovl_u8(uint8x8_t a); // VMOVL.U8 q0,d0
12946*80a68eefSBob Badour _NEON2SSE_INLINE uint16x8_t vmovl_u8(uint8x8_t a)
12947*80a68eefSBob Badour {
12948*80a68eefSBob Badour     return _MM_CVTEPU8_EPI16(_pM128i(a)); //SSE4.1
12949*80a68eefSBob Badour }
12950*80a68eefSBob Badour 
12951*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vmovl_u16(uint16x4_t a); // VMOVL.s16 q0,d0
12952*80a68eefSBob Badour _NEON2SSE_INLINE uint32x4_t  vmovl_u16(uint16x4_t a)
12953*80a68eefSBob Badour {
12954*80a68eefSBob Badour     return _MM_CVTEPU16_EPI32(_pM128i(a)); //SSE4.1
12955*80a68eefSBob Badour }
12956*80a68eefSBob Badour 
12957*80a68eefSBob Badour _NEON2SSESTORAGE uint64x2_t vmovl_u32(uint32x2_t a); // VMOVL.U32 q0,d0
12958*80a68eefSBob Badour _NEON2SSE_INLINE uint64x2_t  vmovl_u32(uint32x2_t a)
12959*80a68eefSBob Badour {
12960*80a68eefSBob Badour     return _MM_CVTEPU32_EPI64(_pM128i(a)); //SSE4.1
12961*80a68eefSBob Badour }
12962*80a68eefSBob Badour 
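//Illustrative usage sketch (not part of the original header; names are hypothetical):
//the unsigned variants zero-extend, the signed variants sign-extend.
//    uint8x8_t  ub = vdup_n_u8(200);
//    uint16x8_t uw = vmovl_u8(ub);            // eight lanes of 200
//    int8x8_t   sb = vdup_n_s8(-5);
//    int16x8_t  sw = vmovl_s8(sb);            // eight lanes of -5
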
12963*80a68eefSBob Badour //*************Vector saturating narrow integer*****************
12964*80a68eefSBob Badour //**************************************************************
12965*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t   vqmovn_s16(int16x8_t a); // VQMOVN.S16 d0,q0
12966*80a68eefSBob Badour _NEON2SSE_INLINE int8x8_t   vqmovn_s16(int16x8_t a)
12967*80a68eefSBob Badour {
12968*80a68eefSBob Badour     int8x8_t res64;
12969*80a68eefSBob Badour     __m128i res;
12970*80a68eefSBob Badour     res = _mm_packs_epi16(a, a);
12971*80a68eefSBob Badour     return64(res);
12972*80a68eefSBob Badour }
12973*80a68eefSBob Badour 
12974*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t   vqmovn_s32(int32x4_t a); // VQMOVN.S32 d0,q0
12975*80a68eefSBob Badour _NEON2SSE_INLINE int16x4_t   vqmovn_s32(int32x4_t a)
12976*80a68eefSBob Badour {
12977*80a68eefSBob Badour     int16x4_t res64;
12978*80a68eefSBob Badour     __m128i res;
12979*80a68eefSBob Badour     res = _mm_packs_epi32(a, a);
12980*80a68eefSBob Badour     return64(res);
12981*80a68eefSBob Badour }
12982*80a68eefSBob Badour 
12983*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vqmovn_s64(int64x2_t a); // VQMOVN.S64 d0,q0
12984*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqmovn_s64(int64x2_t a),_NEON2SSE_REASON_SLOW_SERIAL) //no effective SIMD solution
12985*80a68eefSBob Badour {
12986*80a68eefSBob Badour     int32x2_t res;
12987*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 int64_t atmp[2];
12988*80a68eefSBob Badour     _mm_store_si128((__m128i*)atmp, a);
12989*80a68eefSBob Badour     if(atmp[0]>SINT_MAX) atmp[0] = SINT_MAX;
12990*80a68eefSBob Badour     if(atmp[0]<SINT_MIN) atmp[0] = SINT_MIN;
12991*80a68eefSBob Badour     if(atmp[1]>SINT_MAX) atmp[1] = SINT_MAX;
12992*80a68eefSBob Badour     if(atmp[1]<SINT_MIN) atmp[1] = SINT_MIN;
12993*80a68eefSBob Badour     res.m64_i32[0] = (int32_t)atmp[0];
12994*80a68eefSBob Badour     res.m64_i32[1] = (int32_t)atmp[1];
12995*80a68eefSBob Badour     return res;
12996*80a68eefSBob Badour }
12997*80a68eefSBob Badour 
12998*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vqmovn_u16(uint16x8_t a); // VQMOVN.s16 d0,q0
12999*80a68eefSBob Badour _NEON2SSE_INLINE uint8x8_t vqmovn_u16(uint16x8_t a) // VQMOVN.s16 d0,q0
13000*80a68eefSBob Badour {
13001*80a68eefSBob Badour     //no uint16 to uint8 conversion in SSE, need to truncate to max signed first. Also trying to avoid _mm_shuffle_epi8 because of its high latency on old Atom CPUs
13002*80a68eefSBob Badour     uint8x8_t res64;
13003*80a68eefSBob Badour     __m128i c7fff, a_trunc, mask_trunc;
13004*80a68eefSBob Badour     c7fff = _mm_set1_epi16 (0x7fff); // 15-th bit set to zero
13005*80a68eefSBob Badour     a_trunc =  _mm_and_si128(a,  c7fff); // a truncated to max signed
13006*80a68eefSBob Badour     mask_trunc =  _mm_cmpgt_epi16(a_trunc, a); //if the truncated value compares greater than the original (signed) then the 15th bit had been set initially
13007*80a68eefSBob Badour     mask_trunc =  _mm_and_si128(mask_trunc,  c7fff);  //zero, or c7fff if the 15th bit had been set initially
13008*80a68eefSBob Badour     a_trunc = _mm_or_si128(a_trunc,  mask_trunc);
13009*80a68eefSBob Badour     a_trunc =  _mm_packus_epi16 (a_trunc, a_trunc); //use low 64bits only
13010*80a68eefSBob Badour     return64(a_trunc);
13011*80a68eefSBob Badour }
13012*80a68eefSBob Badour 
13013*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vqmovn_u32(uint32x4_t a); // VQMOVN.U32 d0,q0
13014*80a68eefSBob Badour _NEON2SSE_INLINE uint16x4_t vqmovn_u32(uint32x4_t a) // VQMOVN.U32 d0,q0
13015*80a68eefSBob Badour {
13016*80a68eefSBob Badour      #ifdef USE_SSE4
13017*80a68eefSBob Badour         //no uint32 to uint16 conversion in SSE, need truncate to max signed first
13018*80a68eefSBob Badour         uint16x4_t res64;
13019*80a68eefSBob Badour         __m128i c7fffffff, a_trunc, mask_trunc;
13020*80a68eefSBob Badour         c7fffffff = _mm_set1_epi32((uint32_t)0x7fffffff); // 31st bit set to zero
13021*80a68eefSBob Badour         a_trunc =  _mm_and_si128(a,  c7fffffff); // a truncated to max signed
13022*80a68eefSBob Badour         mask_trunc =  _mm_cmpgt_epi16(a_trunc, a); //if the truncated value compares greater than the original then the 31st bit had been set initially
13023*80a68eefSBob Badour         mask_trunc =  _mm_and_si128(mask_trunc,  c7fffffff);  //zero, or a non-zero mask (forcing saturation) if the 31st bit had been set initially
13024*80a68eefSBob Badour         a_trunc = _mm_or_si128(a_trunc,  mask_trunc);
13025*80a68eefSBob Badour         a_trunc = _MM_PACKUS1_EPI32 (a_trunc); //use low 64bits only
13026*80a68eefSBob Badour         return64(a_trunc);
13027*80a68eefSBob Badour     #else
13028*80a68eefSBob Badour         uint16x4_t res64;
13029*80a68eefSBob Badour        __m128i res_hi, mask;
13030*80a68eefSBob Badour         mask = _mm_setzero_si128();
13031*80a68eefSBob Badour         res_hi = _mm_srli_epi32(a, 16);
13032*80a68eefSBob Badour         res_hi = _mm_cmpeq_epi16(res_hi, mask);
13033*80a68eefSBob Badour         mask = _mm_cmpeq_epi16(mask,mask); //all fff
13034*80a68eefSBob Badour         mask = _mm_andnot_si128(res_hi,mask); //invert res_hi to get the numbers that exceed 16 bits
13035*80a68eefSBob Badour         res_hi = _mm_or_si128(a, mask); //saturated res
13036*80a68eefSBob Badour         res_hi = _mm_shuffle_epi8 (res_hi, *(__m128i*) mask8_32_even_odd); //go to 16 bits
13037*80a68eefSBob Badour         return64(res_hi);
13038*80a68eefSBob Badour     #endif
13039*80a68eefSBob Badour }
13040*80a68eefSBob Badour 
13041*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vqmovn_u64(uint64x2_t a); // VQMOVN.U64 d0,q0
13042*80a68eefSBob Badour _NEON2SSE_INLINE uint32x2_t vqmovn_u64(uint64x2_t a)
13043*80a68eefSBob Badour {
13044*80a68eefSBob Badour     //serial solution may be faster
13045*80a68eefSBob Badour     uint32x2_t res64;
13046*80a68eefSBob Badour     __m128i res_hi, mask;
13047*80a68eefSBob Badour     mask = _mm_setzero_si128();
13048*80a68eefSBob Badour     res_hi = _mm_srli_epi64(a, 32);
13049*80a68eefSBob Badour     res_hi = _mm_cmpeq_epi32(res_hi, mask);
13050*80a68eefSBob Badour     mask = _mm_cmpeq_epi32(mask,mask); //all fff
13051*80a68eefSBob Badour     mask = _mm_andnot_si128(res_hi,mask); //invert res_hi to get the numbers that exceed 32 bits
13052*80a68eefSBob Badour     res_hi = _mm_or_si128(a, mask);
13053*80a68eefSBob Badour     res_hi = _mm_shuffle_epi32(res_hi, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
13054*80a68eefSBob Badour     return64(res_hi);
13055*80a68eefSBob Badour }
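
//Illustrative usage sketch (not part of the original header; names are hypothetical):
//values that do not fit the narrower type saturate instead of being truncated.
//    uint16x8_t w = vdupq_n_u16(300);
//    uint8x8_t  n = vqmovn_u16(w);            // 300 > 255, every byte saturates to 255
//    int16x8_t  s = vdupq_n_s16(-200);
//    int8x8_t   m = vqmovn_s16(s);            // -200 < -128, every byte saturates to -128
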
13056*80a68eefSBob Badour //************* Vector saturating narrow integer signed->unsigned **************
13057*80a68eefSBob Badour //*****************************************************************************
13058*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vqmovun_s16(int16x8_t a); // VQMOVUN.S16 d0,q0
13059*80a68eefSBob Badour _NEON2SSE_INLINE uint8x8_t vqmovun_s16(int16x8_t a)
13060*80a68eefSBob Badour {
13061*80a68eefSBob Badour     uint8x8_t res64;
13062*80a68eefSBob Badour     __m128i res;
13063*80a68eefSBob Badour     res = _mm_packus_epi16(a, a); //use low 64bits only
13064*80a68eefSBob Badour     return64(res);
13065*80a68eefSBob Badour }
13066*80a68eefSBob Badour 
13067*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vqmovun_s32(int32x4_t a); // VQMOVUN.S32 d0,q0
13068*80a68eefSBob Badour _NEON2SSE_INLINE uint16x4_t vqmovun_s32(int32x4_t a)
13069*80a68eefSBob Badour {
13070*80a68eefSBob Badour     uint16x4_t res64;
13071*80a68eefSBob Badour     __m128i res;
13072*80a68eefSBob Badour     res = _MM_PACKUS1_EPI32(a); //use low 64bits only
13073*80a68eefSBob Badour     return64(res);
13074*80a68eefSBob Badour }
13075*80a68eefSBob Badour 
13076*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vqmovun_s64(int64x2_t a); // VQMOVUN.S64 d0,q0
13077*80a68eefSBob Badour _NEON2SSE_INLINE uint32x2_t vqmovun_s64(int64x2_t a)
13078*80a68eefSBob Badour {
13079*80a68eefSBob Badour     uint32x2_t res64;
13080*80a68eefSBob Badour     __m128i res_hi,res_lo, zero, cmp;
13081*80a68eefSBob Badour     zero = _mm_setzero_si128();
13082*80a68eefSBob Badour     res_hi = _mm_srli_epi64(a,  32);
13083*80a68eefSBob Badour     cmp = _mm_cmpgt_epi32(zero, res_hi); //mask is set where the input is negative, so the result should be zero
13084*80a68eefSBob Badour     res_lo = _mm_andnot_si128(cmp,a); //if cmp is zero keep a unchanged, otherwise a is negative and the result is 0
13085*80a68eefSBob Badour     cmp = _mm_cmpgt_epi32(res_hi,zero); //mask is set where the high 32 bits are positive, i.e. the value does not fit in 32 bits
13086*80a68eefSBob Badour     res_lo =  _mm_or_si128(res_lo, cmp); //if cmp is set we are out of 32 bits and need to saturate to 0xffffffff
13087*80a68eefSBob Badour     res_lo = _mm_shuffle_epi32(res_lo, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
13088*80a68eefSBob Badour     return64(res_lo);
13089*80a68eefSBob Badour }
13090*80a68eefSBob Badour 
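//Illustrative usage sketch (not part of the original header; names are hypothetical):
//negative inputs clamp to 0, too-large positive inputs clamp to the unsigned maximum.
//    int16x8_t s = vdupq_n_s16(-42);
//    uint8x8_t u = vqmovun_s16(s);            // every byte is 0
//    int16x8_t t = vdupq_n_s16(1000);
//    uint8x8_t v = vqmovun_s16(t);            // every byte is 255
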
13091*80a68eefSBob Badour // ********************************************************
13092*80a68eefSBob Badour // **************** Table look up **************************
13093*80a68eefSBob Badour // ********************************************************
13094*80a68eefSBob Badour //VTBL (Vector Table Lookup) uses byte indexes in a control vector to look up byte values
13095*80a68eefSBob Badour //in a table and generate a new vector. Indexes out of range return 0.
13096*80a68eefSBob Badour //for Intel SIMD we need to set the index MSB to 1 to get a zero return from _mm_shuffle_epi8
13097*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vtbl1_u8(uint8x8_t a, uint8x8_t b); // VTBL.8 d0, {d0}, d0
13098*80a68eefSBob Badour _NEON2SSE_INLINE uint8x8_t vtbl1_u8(uint8x8_t a, uint8x8_t b)
13099*80a68eefSBob Badour {
13100*80a68eefSBob Badour     uint8x8_t res64;
13101*80a68eefSBob Badour     __m128i c7, maskgt, bmask, b128;
13102*80a68eefSBob Badour     c7 = _mm_set1_epi8 (7);
13103*80a68eefSBob Badour     b128 = _pM128i(b);
13104*80a68eefSBob Badour     maskgt = _mm_cmpgt_epi8(b128,c7);
13105*80a68eefSBob Badour     bmask = _mm_or_si128(b128,maskgt);
13106*80a68eefSBob Badour     bmask = _mm_shuffle_epi8(_pM128i(a),bmask);
13107*80a68eefSBob Badour     return64(bmask);
13108*80a68eefSBob Badour }
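//Usage sketch (illustration only, not part of the original mapping; assumes vld1_u8 from this header):
//indexes 0..7 select bytes from the table, any larger index yields 0, as described for VTBL above.
#if 0
static void vtbl1_u8_example(void) //hypothetical helper, for illustration
{
    uint8_t table_bytes[8] = {10, 11, 12, 13, 14, 15, 16, 17};
    uint8_t index_bytes[8] = {0, 7, 2, 100, 5, 8, 1, 99}; //100, 8 and 99 are out of range
    uint8x8_t table = vld1_u8(table_bytes);
    uint8x8_t idx = vld1_u8(index_bytes);
    uint8x8_t res = vtbl1_u8(table, idx); //expected bytes: 10, 17, 12, 0, 15, 0, 11, 0
    (void)res;
}
#endif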
13109*80a68eefSBob Badour 
13110*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vtbl1_s8(int8x8_t a,  int8x8_t b); // VTBL.8 d0, {d0}, d0
13111*80a68eefSBob Badour #define vtbl1_s8 vtbl1_u8
13112*80a68eefSBob Badour 
13113*80a68eefSBob Badour _NEON2SSESTORAGE poly8x8_t vtbl1_p8(poly8x8_t a, uint8x8_t b); // VTBL.8 d0, {d0}, d0
13114*80a68eefSBob Badour #define vtbl1_p8 vtbl1_u8
13115*80a68eefSBob Badour 
13116*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vtbl2_u8(uint8x8x2_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1}, d0
13117*80a68eefSBob Badour _NEON2SSE_INLINE uint8x8_t vtbl2_u8(uint8x8x2_t a, uint8x8_t b)
13118*80a68eefSBob Badour {
13119*80a68eefSBob Badour     uint8x8_t res64;
13120*80a68eefSBob Badour     __m128i c15, a01, maskgt15, bmask, b128;
13121*80a68eefSBob Badour     c15 = _mm_set1_epi8 (15);
13122*80a68eefSBob Badour     b128 = _pM128i(b);
13123*80a68eefSBob Badour     maskgt15 = _mm_cmpgt_epi8(b128,c15);
13124*80a68eefSBob Badour     bmask = _mm_or_si128(b128, maskgt15);
13125*80a68eefSBob Badour     a01 = _mm_unpacklo_epi64(_pM128i(a.val[0]), _pM128i(a.val[1]));
13126*80a68eefSBob Badour     a01 =  _mm_shuffle_epi8(a01, bmask);
13127*80a68eefSBob Badour     return64(a01);
13128*80a68eefSBob Badour }
13129*80a68eefSBob Badour 
13130*80a68eefSBob Badour //int8x8_t vtbl2_s8(int8x8x2_t a, int8x8_t b); // VTBL.8 d0, {d0, d1}, d0
13131*80a68eefSBob Badour #define vtbl2_s8 vtbl2_u8
13132*80a68eefSBob Badour 
13133*80a68eefSBob Badour //poly8x8_t vtbl2_p8(poly8x8x2_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1}, d0
13134*80a68eefSBob Badour #define vtbl2_p8 vtbl2_u8
13135*80a68eefSBob Badour 
13136*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vtbl3_u8(uint8x8x3_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0
13137*80a68eefSBob Badour _NEON2SSE_INLINE uint8x8_t vtbl3_u8(uint8x8x3_t a, uint8x8_t b)
13138*80a68eefSBob Badour {
13139*80a68eefSBob Badour     //solution may not be optimal
13140*80a68eefSBob Badour     uint8x8_t res64;
13141*80a68eefSBob Badour     __m128i c15, c23, maskgt23, bmask, maskgt15, sh0, sh1, a01, b128;
13142*80a68eefSBob Badour     c15 = _mm_set1_epi8 (15);
13143*80a68eefSBob Badour     c23 = _mm_set1_epi8 (23);
13144*80a68eefSBob Badour     b128 = _pM128i(b);
13145*80a68eefSBob Badour     maskgt23 = _mm_cmpgt_epi8(b128,c23);
13146*80a68eefSBob Badour     bmask = _mm_or_si128(b128, maskgt23);
13147*80a68eefSBob Badour     maskgt15 = _mm_cmpgt_epi8(b128,c15);
13148*80a68eefSBob Badour     a01 = _mm_unpacklo_epi64(_pM128i(a.val[0]),_pM128i(a.val[1]));
13149*80a68eefSBob Badour     sh0 =  _mm_shuffle_epi8(a01, bmask);
13150*80a68eefSBob Badour     sh1 =  _mm_shuffle_epi8(_pM128i(a.val[2]), bmask); //for bi>15 bi is wrapped (bi-=16)
13151*80a68eefSBob Badour     sh0 = _MM_BLENDV_EPI8(sh0, sh1, maskgt15); //SSE4.1
13152*80a68eefSBob Badour     return64(sh0);
13153*80a68eefSBob Badour }
13154*80a68eefSBob Badour 
13155*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vtbl3_s8(int8x8x3_t a, int8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0
13156*80a68eefSBob Badour #define vtbl3_s8 vtbl3_u8
13157*80a68eefSBob Badour 
13158*80a68eefSBob Badour _NEON2SSESTORAGE poly8x8_t vtbl3_p8(poly8x8x3_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0
13159*80a68eefSBob Badour #define vtbl3_p8 vtbl3_u8
13160*80a68eefSBob Badour 
13161*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vtbl4_u8(uint8x8x4_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0
13162*80a68eefSBob Badour _NEON2SSE_INLINE uint8x8_t vtbl4_u8(uint8x8x4_t a, uint8x8_t b)
13163*80a68eefSBob Badour {
13164*80a68eefSBob Badour     //solution may not be optimal
13165*80a68eefSBob Badour     uint8x8_t res64;
13166*80a68eefSBob Badour     __m128i c15, c31, maskgt31, bmask, maskgt15, sh0, sh1, a01, a23, b128;
13167*80a68eefSBob Badour     c15 = _mm_set1_epi8 (15);
13168*80a68eefSBob Badour     c31 = _mm_set1_epi8 (31);
13169*80a68eefSBob Badour     b128 = _pM128i(b);
13170*80a68eefSBob Badour     maskgt31 = _mm_cmpgt_epi8(b128,c31);
13171*80a68eefSBob Badour     bmask = _mm_or_si128(b128, maskgt31);
13172*80a68eefSBob Badour     maskgt15 = _mm_cmpgt_epi8(b128,c15);
13173*80a68eefSBob Badour     a01 = _mm_unpacklo_epi64(_pM128i(a.val[0]),_pM128i(a.val[1]));
13174*80a68eefSBob Badour     a23 = _mm_unpacklo_epi64(_pM128i(a.val[2]),_pM128i(a.val[3]));
13175*80a68eefSBob Badour     sh0 =  _mm_shuffle_epi8(a01, bmask);
13176*80a68eefSBob Badour     sh1 =  _mm_shuffle_epi8(a23, bmask); //for bi>15 bi is wrapped (bi-=16)
13177*80a68eefSBob Badour     sh0 = _MM_BLENDV_EPI8 (sh0, sh1, maskgt15); //SSE4.1
13178*80a68eefSBob Badour     return64(sh0);
13179*80a68eefSBob Badour }
13180*80a68eefSBob Badour 
13181*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vtbl4_s8(int8x8x4_t a, int8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0
13182*80a68eefSBob Badour #define vtbl4_s8 vtbl4_u8
13183*80a68eefSBob Badour 
13184*80a68eefSBob Badour _NEON2SSESTORAGE poly8x8_t vtbl4_p8(poly8x8x4_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0
13185*80a68eefSBob Badour #define vtbl4_p8 vtbl4_u8
13186*80a68eefSBob Badour 
13187*80a68eefSBob Badour //****************** Extended table look up intrinsics ***************************
13188*80a68eefSBob Badour //**********************************************************************************
13189*80a68eefSBob Badour //VTBX (Vector Table Extension) works in the same way as VTBL does,
13190*80a68eefSBob Badour // except that indexes out of range leave the destination element unchanged.
13191*80a68eefSBob Badour 
13192*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vtbx1_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VTBX.8 d0, {d0}, d0
13193*80a68eefSBob Badour _NEON2SSE_INLINE uint8x8_t vtbx1_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c)
13194*80a68eefSBob Badour {
13195*80a68eefSBob Badour     uint8x8_t res64;
13196*80a68eefSBob Badour     __m128i c7, maskgt, sh, c128;
13197*80a68eefSBob Badour     c7 = _mm_set1_epi8 (7);
13198*80a68eefSBob Badour     c128 = _pM128i(c);
13199*80a68eefSBob Badour     maskgt = _mm_cmpgt_epi8(c128,c7);
13200*80a68eefSBob Badour     c7 = _mm_and_si128(maskgt,_pM128i(a));
13201*80a68eefSBob Badour     sh = _mm_shuffle_epi8(_pM128i(b),c128);
13202*80a68eefSBob Badour     sh = _mm_andnot_si128(maskgt,sh);
13203*80a68eefSBob Badour     sh =  _mm_or_si128(sh,c7);
13204*80a68eefSBob Badour     return64(sh);
13205*80a68eefSBob Badour }
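//Usage sketch (illustration only, not part of the original mapping): in contrast to vtbl1_u8,
//an out-of-range index keeps the corresponding byte of the first operand 'a' (the destination).
#if 0
static void vtbx1_u8_example(void) //hypothetical helper, for illustration
{
    uint8_t dest_bytes[8]  = {90, 91, 92, 93, 94, 95, 96, 97};
    uint8_t table_bytes[8] = {10, 11, 12, 13, 14, 15, 16, 17};
    uint8_t index_bytes[8] = {0, 7, 2, 100, 5, 8, 1, 99}; //100, 8 and 99 are out of range
    uint8x8_t dest  = vld1_u8(dest_bytes);
    uint8x8_t table = vld1_u8(table_bytes);
    uint8x8_t idx   = vld1_u8(index_bytes);
    uint8x8_t res   = vtbx1_u8(dest, table, idx); //expected bytes: 10, 17, 12, 93, 15, 95, 11, 97
    (void)res;
}
#endif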
13206*80a68eefSBob Badour 
13207*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vtbx1_s8(int8x8_t a,  int8x8_t b, int8x8_t c); // VTBX.8 d0, {d0}, d0
13208*80a68eefSBob Badour #define vtbx1_s8 vtbx1_u8
13209*80a68eefSBob Badour 
13210*80a68eefSBob Badour _NEON2SSESTORAGE poly8x8_t vtbx1_p8(poly8x8_t a, poly8x8_t b, uint8x8_t c); // VTBX.8 d0, {d0}, d0
13211*80a68eefSBob Badour #define vtbx1_p8 vtbx1_u8
13212*80a68eefSBob Badour 
13213*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vtbx2_u8(uint8x8_t a, uint8x8x2_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1}, d0
13214*80a68eefSBob Badour _NEON2SSE_INLINE uint8x8_t vtbx2_u8(uint8x8_t a, uint8x8x2_t b, uint8x8_t c)
13215*80a68eefSBob Badour {
13216*80a68eefSBob Badour     uint8x8_t res64;
13217*80a68eefSBob Badour     __m128i c15, b01, maskgt15, sh, c128;
13218*80a68eefSBob Badour     c15 = _mm_set1_epi8 (15);
13219*80a68eefSBob Badour     c128 = _pM128i(c);
13220*80a68eefSBob Badour     maskgt15 = _mm_cmpgt_epi8(c128, c15);
13221*80a68eefSBob Badour     c15 = _mm_and_si128(maskgt15, _pM128i(a));
13222*80a68eefSBob Badour     b01 = _mm_unpacklo_epi64(_pM128i(b.val[0]), _pM128i(b.val[1]));
13223*80a68eefSBob Badour     sh =  _mm_shuffle_epi8(b01, c128);
13224*80a68eefSBob Badour     sh = _mm_andnot_si128(maskgt15, sh);
13225*80a68eefSBob Badour     sh =  _mm_or_si128(sh,c15);
13226*80a68eefSBob Badour     return64(sh);
13227*80a68eefSBob Badour }
13228*80a68eefSBob Badour 
13229*80a68eefSBob Badour //int8x8_t vtbx2_s8(int8x8_t a,  int8x8x2_t b, int8x8_t c);  // VTBX.8 d0, {d0, d1}, d0
13230*80a68eefSBob Badour #define vtbx2_s8 vtbx2_u8
13231*80a68eefSBob Badour 
13232*80a68eefSBob Badour //poly8x8_t vtbx2_p8(poly8x8_t a, poly8x8x2_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1}, d0
13233*80a68eefSBob Badour #define vtbx2_p8 vtbx2_u8
13234*80a68eefSBob Badour 
13235*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vtbx3_u8(uint8x8_t a, uint8x8x3_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0
13236*80a68eefSBob Badour _NEON2SSE_INLINE uint8x8_t vtbx3_u8(uint8x8_t a, uint8x8x3_t b, uint8x8_t c)
13237*80a68eefSBob Badour {
13238*80a68eefSBob Badour     //solution may not be optimal
13239*80a68eefSBob Badour     uint8x8_t res64;
13240*80a68eefSBob Badour     __m128i c15, c23, maskgt15, maskgt23, sh0, sh1, b01, c128;
13241*80a68eefSBob Badour     c15 = _mm_set1_epi8 (15);
13242*80a68eefSBob Badour     c23 = _mm_set1_epi8 (23);
13243*80a68eefSBob Badour     c128 = _pM128i(c);
13244*80a68eefSBob Badour     maskgt15 = _mm_cmpgt_epi8(c128,c15);
13245*80a68eefSBob Badour     maskgt23 = _mm_cmpgt_epi8(c128,c23);
13246*80a68eefSBob Badour     c23 = _mm_and_si128(maskgt23, _pM128i(a));
13247*80a68eefSBob Badour     b01 = _mm_unpacklo_epi64(_pM128i(b.val[0]),_pM128i(b.val[1]));
13248*80a68eefSBob Badour     sh0 =  _mm_shuffle_epi8(b01, c128);
13249*80a68eefSBob Badour     sh1 =  _mm_shuffle_epi8(_pM128i(b.val[2]), c128); //for bi>15 bi is wrapped (bi-=16)
13250*80a68eefSBob Badour     sh0 = _MM_BLENDV_EPI8(sh0, sh1, maskgt15);
13251*80a68eefSBob Badour     sh0 = _mm_andnot_si128(maskgt23,sh0);
13252*80a68eefSBob Badour     sh0 = _mm_or_si128(sh0,c23);
13253*80a68eefSBob Badour     return64(sh0);
13254*80a68eefSBob Badour }
13255*80a68eefSBob Badour 
13256*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vtbx3_s8(int8x8_t a, int8x8x3_t b, int8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0
13257*80a68eefSBob Badour #define vtbx3_s8 vtbx3_u8
13258*80a68eefSBob Badour 
13259*80a68eefSBob Badour _NEON2SSESTORAGE poly8x8_t vtbx3_p8(poly8x8_t a, poly8x8x3_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0
13260*80a68eefSBob Badour #define vtbx3_p8 vtbx3_u8
13261*80a68eefSBob Badour 
13262*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vtbx4_u8(uint8x8_t a, uint8x8x4_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0
13263*80a68eefSBob Badour _NEON2SSE_INLINE uint8x8_t vtbx4_u8(uint8x8_t a, uint8x8x4_t b, uint8x8_t c)
13264*80a68eefSBob Badour {
13265*80a68eefSBob Badour     //solution may not be optimal
13266*80a68eefSBob Badour     uint8x8_t res64;
13267*80a68eefSBob Badour     __m128i c15, c31, maskgt15, maskgt31, sh0, sh1, b01, b23, c128;
13268*80a68eefSBob Badour     c15 = _mm_set1_epi8 (15);
13269*80a68eefSBob Badour     c31 = _mm_set1_epi8 (31);
13270*80a68eefSBob Badour     c128 = _pM128i(c);
13271*80a68eefSBob Badour     maskgt15 = _mm_cmpgt_epi8(c128,c15);
13272*80a68eefSBob Badour     maskgt31 = _mm_cmpgt_epi8(c128,c31);
13273*80a68eefSBob Badour     c31 = _mm_and_si128(maskgt31, _pM128i(a));
13274*80a68eefSBob Badour 
13275*80a68eefSBob Badour     b01 = _mm_unpacklo_epi64(_pM128i(b.val[0]),_pM128i(b.val[1]));
13276*80a68eefSBob Badour     b23 = _mm_unpacklo_epi64(_pM128i(b.val[2]),_pM128i(b.val[3]));
13277*80a68eefSBob Badour     sh0 =  _mm_shuffle_epi8(b01, c128);
13278*80a68eefSBob Badour     sh1 =  _mm_shuffle_epi8(b23, c128); //for bi>15 bi is wrapped (bi-=16)
13279*80a68eefSBob Badour     sh0 = _MM_BLENDV_EPI8(sh0, sh1, maskgt15);
13280*80a68eefSBob Badour     sh0 = _mm_andnot_si128(maskgt31,sh0);
13281*80a68eefSBob Badour     sh0 =  _mm_or_si128(sh0,c31);
13282*80a68eefSBob Badour     return64(sh0);
13283*80a68eefSBob Badour }
13284*80a68eefSBob Badour 
13285*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vtbx4_s8(int8x8_t a, int8x8x4_t b, int8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0
13286*80a68eefSBob Badour #define vtbx4_s8 vtbx4_u8
13287*80a68eefSBob Badour 
13288*80a68eefSBob Badour _NEON2SSESTORAGE poly8x8_t vtbx4_p8(poly8x8_t a, poly8x8x4_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0
13289*80a68eefSBob Badour #define vtbx4_p8 vtbx4_u8
13290*80a68eefSBob Badour 
13291*80a68eefSBob Badour //*************************************************************************************************
13292*80a68eefSBob Badour // *************************** Operations with a scalar value *********************************
13293*80a68eefSBob Badour //*************************************************************************************************
13294*80a68eefSBob Badour 
13295*80a68eefSBob Badour //******* Vector multiply accumulate by scalar *************************************************
13296*80a68eefSBob Badour //**********************************************************************************************
13297*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLA.I16 d0, d0, d0[0]
13298*80a68eefSBob Badour _NEON2SSE_INLINE int16x4_t vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l) // VMLA.I16 d0, d0, d0[0]
13299*80a68eefSBob Badour {
13300*80a68eefSBob Badour     int16_t c;
13301*80a68eefSBob Badour     int16x4_t scalar;
13302*80a68eefSBob Badour     c = vget_lane_s16(v, l);
13303*80a68eefSBob Badour     scalar = vdup_n_s16(c);
13304*80a68eefSBob Badour     return vmla_s16(a, b, scalar);
13305*80a68eefSBob Badour }
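//Note: every lane i of the result is a[i] + b[i] * v[l]; the selected lane is simply
//broadcast to all lanes with vdup_n_s16 and the regular vmla_s16 is reused.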
13306*80a68eefSBob Badour 
13307*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLA.I32 d0, d0, d0[0]
13308*80a68eefSBob Badour _NEON2SSE_INLINE int32x2_t vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l) // VMLA.I32 d0, d0, d0[0]
13309*80a68eefSBob Badour {
13310*80a68eefSBob Badour     int32_t c;
13311*80a68eefSBob Badour     int32x2_t scalar;
13312*80a68eefSBob Badour     c = vget_lane_s32(v, l);
13313*80a68eefSBob Badour     scalar = vdup_n_s32(c);
13314*80a68eefSBob Badour     return vmla_s32(a, b, scalar);
13315*80a68eefSBob Badour }
13316*80a68eefSBob Badour 
13317*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vmla_lane_u16(uint16x4_t a,  uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLA.I16 d0, d0, d0[0]
13318*80a68eefSBob Badour #define vmla_lane_u16 vmla_lane_s16
13319*80a68eefSBob Badour 
13320*80a68eefSBob Badour 
13321*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vmla_lane_u32(uint32x2_t a,  uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLA.I32 d0, d0, d0[0]
13322*80a68eefSBob Badour #define vmla_lane_u32 vmla_lane_s32
13323*80a68eefSBob Badour 
13324*80a68eefSBob Badour _NEON2SSESTORAGE float32x2_t vmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l); // VMLA.F32 d0, d0, d0[0]
13325*80a68eefSBob Badour _NEON2SSE_INLINE float32x2_t vmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l)
13326*80a68eefSBob Badour {
13327*80a68eefSBob Badour     float32_t vlane;
13328*80a68eefSBob Badour     float32x2_t c;
13329*80a68eefSBob Badour     vlane = vget_lane_f32(v, l);
13330*80a68eefSBob Badour     c = vdup_n_f32(vlane);
13331*80a68eefSBob Badour     return vmla_f32(a,b,c);
13332*80a68eefSBob Badour }
13333*80a68eefSBob Badour 
13334*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l); // VMLA.I16 q0, q0, d0[0]
13335*80a68eefSBob Badour _NEON2SSE_INLINE int16x8_t vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l) // VMLA.I16 q0, q0, d0[0]
13336*80a68eefSBob Badour {
13337*80a68eefSBob Badour     int16_t vlane;
13338*80a68eefSBob Badour     int16x8_t c;
13339*80a68eefSBob Badour     vlane = vget_lane_s16(v, l);
13340*80a68eefSBob Badour     c = vdupq_n_s16(vlane);
13341*80a68eefSBob Badour     return vmlaq_s16(a,b,c);
13342*80a68eefSBob Badour }
13343*80a68eefSBob Badour 
13344*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l); // VMLA.I32 q0, q0, d0[0]
13345*80a68eefSBob Badour _NEON2SSE_INLINE int32x4_t vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l) // VMLA.I32 q0, q0, d0[0]
13346*80a68eefSBob Badour {
13347*80a68eefSBob Badour     int32_t vlane;
13348*80a68eefSBob Badour     int32x4_t c;
13349*80a68eefSBob Badour     vlane = vget_lane_s32(v, l);
13350*80a68eefSBob Badour     c = vdupq_n_s32(vlane);
13351*80a68eefSBob Badour     return vmlaq_s32(a,b,c);
13352*80a68eefSBob Badour }
13353*80a68eefSBob Badour 
13354*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vmlaq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l); // VMLA.I16 q0, q0, d0[0]
13355*80a68eefSBob Badour #define vmlaq_lane_u16 vmlaq_lane_s16
13356*80a68eefSBob Badour 
13357*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vmlaq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l); // VMLA.I32 q0, q0, d0[0]
13358*80a68eefSBob Badour #define vmlaq_lane_u32 vmlaq_lane_s32
13359*80a68eefSBob Badour 
13360*80a68eefSBob Badour _NEON2SSESTORAGE float32x4_t vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l); // VMLA.F32 q0, q0, d0[0]
13361*80a68eefSBob Badour _NEON2SSE_INLINE float32x4_t vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l) // VMLA.F32 q0, q0, d0[0]
13362*80a68eefSBob Badour {
13363*80a68eefSBob Badour     float32_t vlane;
13364*80a68eefSBob Badour     float32x4_t c;
13365*80a68eefSBob Badour     vlane = vget_lane_f32(v, l);
13366*80a68eefSBob Badour     c = vdupq_n_f32(vlane);
13367*80a68eefSBob Badour     return vmlaq_f32(a,b,c);
13368*80a68eefSBob Badour }
13369*80a68eefSBob Badour 
13370*80a68eefSBob Badour //***************** Vector widening multiply accumulate by scalar **********************
13371*80a68eefSBob Badour //***************************************************************************************
13372*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLAL.S16 q0, d0, d0[0]
13373*80a68eefSBob Badour _NEON2SSE_INLINE int32x4_t vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l) // VMLAL.S16 q0, d0, d0[0]
13374*80a68eefSBob Badour {
13375*80a68eefSBob Badour     int16_t vlane;
13376*80a68eefSBob Badour     int16x4_t c;
13377*80a68eefSBob Badour     vlane = vget_lane_s16(v, l);
13378*80a68eefSBob Badour     c = vdup_n_s16(vlane);
13379*80a68eefSBob Badour     return vmlal_s16(a, b, c);
13380*80a68eefSBob Badour }
13381*80a68eefSBob Badour 
13382*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLAL.S32 q0, d0, d0[0]
13383*80a68eefSBob Badour _NEON2SSE_INLINE int64x2_t vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l) // VMLAL.S32 q0, d0, d0[0]
13384*80a68eefSBob Badour {
13385*80a68eefSBob Badour     int32_t vlane;
13386*80a68eefSBob Badour     int32x2_t c;
13387*80a68eefSBob Badour     vlane = vget_lane_s32(v, l);
13388*80a68eefSBob Badour     c = vdup_n_s32(vlane);
13389*80a68eefSBob Badour     return vmlal_s32(a, b, c);
13390*80a68eefSBob Badour }
13391*80a68eefSBob Badour 
13392*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vmlal_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLAL.s16 q0, d0, d0[0]
13393*80a68eefSBob Badour _NEON2SSE_INLINE uint32x4_t vmlal_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l) // VMLAL.s16 q0, d0, d0[0]
13394*80a68eefSBob Badour {
13395*80a68eefSBob Badour     uint16_t vlane;
13396*80a68eefSBob Badour     uint16x4_t c;
13397*80a68eefSBob Badour     vlane = vget_lane_u16(v, l);
13398*80a68eefSBob Badour     c = vdup_n_u16(vlane);
13399*80a68eefSBob Badour     return vmlal_u16(a, b, c);
13400*80a68eefSBob Badour }
13401*80a68eefSBob Badour 
13402*80a68eefSBob Badour _NEON2SSESTORAGE uint64x2_t vmlal_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLAL.U32 q0, d0, d0[0]
13403*80a68eefSBob Badour _NEON2SSE_INLINE uint64x2_t vmlal_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l) // VMLAL.U32 q0, d0, d0[0]
13404*80a68eefSBob Badour {
13405*80a68eefSBob Badour     uint32_t vlane;
13406*80a68eefSBob Badour     uint32x2_t c;
13407*80a68eefSBob Badour     vlane = vget_lane_u32(v, l);
13408*80a68eefSBob Badour     c = vdup_n_u32(vlane);
13409*80a68eefSBob Badour     return vmlal_u32(a, b, c);
13410*80a68eefSBob Badour }
13411*80a68eefSBob Badour 
13412*80a68eefSBob Badour // ******** Vector widening saturating doubling multiply accumulate by scalar *******************************
13413*80a68eefSBob Badour // ************************************************************************************************
13414*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VQDMLAL.S16 q0, d0, d0[0]
13415*80a68eefSBob Badour _NEON2SSE_INLINE int32x4_t vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l)
13416*80a68eefSBob Badour {
13417*80a68eefSBob Badour     int16_t vlane;
13418*80a68eefSBob Badour     int16x4_t c;
13419*80a68eefSBob Badour     vlane = vget_lane_s16(v, l);
13420*80a68eefSBob Badour     c = vdup_n_s16(vlane);
13421*80a68eefSBob Badour     return vqdmlal_s16(a, b, c);
13422*80a68eefSBob Badour }
13423*80a68eefSBob Badour 
13424*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VQDMLAL.S32 q0, d0, d0[0]
13425*80a68eefSBob Badour _NEON2SSE_INLINE int64x2_t vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l)
13426*80a68eefSBob Badour {
13427*80a68eefSBob Badour     int32_t vlane;
13428*80a68eefSBob Badour     int32x2_t c;
13429*80a68eefSBob Badour     vlane = vget_lane_s32(v, l);
13430*80a68eefSBob Badour     c = vdup_n_s32(vlane);
13431*80a68eefSBob Badour     return vqdmlal_s32(a, b, c);
13432*80a68eefSBob Badour }
13433*80a68eefSBob Badour 
13434*80a68eefSBob Badour // ****** Vector multiply subtract by scalar *****************
13435*80a68eefSBob Badour // *************************************************************
13436*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLS.I16 d0, d0, d0[0]
13437*80a68eefSBob Badour _NEON2SSE_INLINE int16x4_t vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l) // VMLS.I16 d0, d0, d0[0]
13438*80a68eefSBob Badour {
13439*80a68eefSBob Badour     int16_t vlane;
13440*80a68eefSBob Badour     int16x4_t c;
13441*80a68eefSBob Badour     vlane = vget_lane_s16(v, l);
13442*80a68eefSBob Badour     c = vdup_n_s16(vlane);
13443*80a68eefSBob Badour     return vmls_s16(a, b, c);
13444*80a68eefSBob Badour }
13445*80a68eefSBob Badour 
13446*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLS.I32 d0, d0, d0[0]
13447*80a68eefSBob Badour _NEON2SSE_INLINE int32x2_t vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l) // VMLS.I32 d0, d0, d0[0]
13448*80a68eefSBob Badour {
13449*80a68eefSBob Badour     int32_t vlane;
13450*80a68eefSBob Badour     int32x2_t c;
13451*80a68eefSBob Badour     vlane = vget_lane_s32(v, l);
13452*80a68eefSBob Badour     c = vdup_n_s32(vlane);
13453*80a68eefSBob Badour     return vmls_s32(a, b, c);
13454*80a68eefSBob Badour }
13455*80a68eefSBob Badour 
13456*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLS.I16 d0, d0, d0[0]
13457*80a68eefSBob Badour _NEON2SSE_INLINE uint16x4_t vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l) // VMLS.I16 d0, d0, d0[0]
13458*80a68eefSBob Badour {
13459*80a68eefSBob Badour     uint16_t vlane;
13460*80a68eefSBob Badour     uint16x4_t c;
13461*80a68eefSBob Badour     vlane = vget_lane_u16(v, l);
13462*80a68eefSBob Badour     c = vdup_n_u16(vlane);
13463*80a68eefSBob Badour     return vmls_u16(a, b, c);
13464*80a68eefSBob Badour }
13465*80a68eefSBob Badour 
13466*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLS.I32 d0, d0, d0[0]
13467*80a68eefSBob Badour _NEON2SSE_INLINE uint32x2_t vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l) // VMLS.I32 d0, d0, d0[0]
13468*80a68eefSBob Badour {
13469*80a68eefSBob Badour     uint32_t vlane;
13470*80a68eefSBob Badour     uint32x2_t c;
13471*80a68eefSBob Badour     vlane = vget_lane_u32(v, l);
13472*80a68eefSBob Badour     c = vdup_n_u32(vlane);
13473*80a68eefSBob Badour     return vmls_u32(a, b, c);
13474*80a68eefSBob Badour }
13475*80a68eefSBob Badour 
13476*80a68eefSBob Badour _NEON2SSESTORAGE float32x2_t vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l); // VMLS.F32 d0, d0, d0[0]
13477*80a68eefSBob Badour _NEON2SSE_INLINE float32x2_t vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l)
13478*80a68eefSBob Badour {
13479*80a68eefSBob Badour     float32_t vlane;
13480*80a68eefSBob Badour     float32x2_t c;
13481*80a68eefSBob Badour     vlane = (float) vget_lane_f32(v, l);
13482*80a68eefSBob Badour     c = vdup_n_f32(vlane);
13483*80a68eefSBob Badour     return vmls_f32(a,b,c);
13484*80a68eefSBob Badour }
13485*80a68eefSBob Badour 
13486*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l); // VMLS.I16 q0, q0, d0[0]
13487*80a68eefSBob Badour _NEON2SSE_INLINE int16x8_t vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l) // VMLS.I16 q0, q0, d0[0]
13488*80a68eefSBob Badour {
13489*80a68eefSBob Badour     int16_t vlane;
13490*80a68eefSBob Badour     int16x8_t c;
13491*80a68eefSBob Badour     vlane = vget_lane_s16(v, l);
13492*80a68eefSBob Badour     c = vdupq_n_s16(vlane);
13493*80a68eefSBob Badour     return vmlsq_s16(a, b,c);
13494*80a68eefSBob Badour }
13495*80a68eefSBob Badour 
13496*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l); // VMLS.I32 q0, q0, d0[0]
13497*80a68eefSBob Badour _NEON2SSE_INLINE int32x4_t vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l) // VMLS.I32 q0, q0, d0[0]
13498*80a68eefSBob Badour {
13499*80a68eefSBob Badour     int32_t vlane;
13500*80a68eefSBob Badour     int32x4_t c;
13501*80a68eefSBob Badour     vlane = vget_lane_s32(v, l);
13502*80a68eefSBob Badour     c = vdupq_n_s32(vlane);
13503*80a68eefSBob Badour     return vmlsq_s32(a,b,c);
13504*80a68eefSBob Badour }
13505*80a68eefSBob Badour 
13506*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l); // VMLS.I16 q0, q0, d0[0]
13507*80a68eefSBob Badour _NEON2SSE_INLINE uint16x8_t vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l) // VMLS.I16 q0, q0, d0[0]
13508*80a68eefSBob Badour {
13509*80a68eefSBob Badour     uint16_t vlane;
13510*80a68eefSBob Badour     uint16x8_t c;
13511*80a68eefSBob Badour     vlane = vget_lane_u16(v, l);
13512*80a68eefSBob Badour     c = vdupq_n_u16(vlane);
13513*80a68eefSBob Badour     return vmlsq_u16(a,b,c);
13514*80a68eefSBob Badour }
13515*80a68eefSBob Badour 
13516*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l); // VMLS.I32 q0, q0, d0[0]
13517*80a68eefSBob Badour _NEON2SSE_INLINE uint32x4_t vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l) // VMLS.I32 q0, q0, d0[0]
13518*80a68eefSBob Badour {
13519*80a68eefSBob Badour     uint32_t vlane;
13520*80a68eefSBob Badour     uint32x4_t c;
13521*80a68eefSBob Badour     vlane = vget_lane_u32(v, l);
13522*80a68eefSBob Badour     c = vdupq_n_u32(vlane);
13523*80a68eefSBob Badour     return vmlsq_u32(a,b,c);
13524*80a68eefSBob Badour }
13525*80a68eefSBob Badour 
13526*80a68eefSBob Badour _NEON2SSESTORAGE float32x4_t vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l); // VMLS.F32 q0, q0, d0[0]
13527*80a68eefSBob Badour _NEON2SSE_INLINE float32x4_t vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l) // VMLS.F32 q0, q0, d0[0]
13528*80a68eefSBob Badour {
13529*80a68eefSBob Badour     float32_t vlane;
13530*80a68eefSBob Badour     float32x4_t c;
13531*80a68eefSBob Badour     vlane = (float) vget_lane_f32(v, l);
13532*80a68eefSBob Badour     c = vdupq_n_f32(vlane);
13533*80a68eefSBob Badour     return vmlsq_f32(a,b,c);
13534*80a68eefSBob Badour }
13535*80a68eefSBob Badour 
13536*80a68eefSBob Badour // **** Vector widening multiply subtract by scalar ****
13537*80a68eefSBob Badour // ****************************************************
13538*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLSL.S16 q0, d0, d0[0]
13539*80a68eefSBob Badour _NEON2SSE_INLINE int32x4_t vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l) // VMLSL.S16 q0, d0, d0[0]
13540*80a68eefSBob Badour {
13541*80a68eefSBob Badour     int16_t vlane;
13542*80a68eefSBob Badour     int16x4_t c;
13543*80a68eefSBob Badour     vlane = vget_lane_s16(v, l);
13544*80a68eefSBob Badour     c = vdup_n_s16(vlane);
13545*80a68eefSBob Badour     return vmlsl_s16(a, b, c);
13546*80a68eefSBob Badour }
13547*80a68eefSBob Badour 
13548*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLSL.S32 q0, d0, d0[0]
13549*80a68eefSBob Badour _NEON2SSE_INLINE int64x2_t vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l) // VMLSL.S32 q0, d0, d0[0]
13550*80a68eefSBob Badour {
13551*80a68eefSBob Badour     int32_t vlane;
13552*80a68eefSBob Badour     int32x2_t c;
13553*80a68eefSBob Badour     vlane = vget_lane_s32(v, l);
13554*80a68eefSBob Badour     c = vdup_n_s32(vlane);
13555*80a68eefSBob Badour     return vmlsl_s32(a, b, c);
13556*80a68eefSBob Badour }
13557*80a68eefSBob Badour 
13558*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vmlsl_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLSL.U16 q0, d0, d0[0]
13559*80a68eefSBob Badour _NEON2SSE_INLINE uint32x4_t vmlsl_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l) // VMLSL.U16 q0, d0, d0[0]
13560*80a68eefSBob Badour {
13561*80a68eefSBob Badour     uint16_t vlane;
13562*80a68eefSBob Badour     uint16x4_t c;
13563*80a68eefSBob Badour     vlane = vget_lane_u16(v, l);
13564*80a68eefSBob Badour     c = vdup_n_u16(vlane);
13565*80a68eefSBob Badour     return vmlsl_u16(a, b, c);
13566*80a68eefSBob Badour }
13567*80a68eefSBob Badour 
13568*80a68eefSBob Badour _NEON2SSESTORAGE uint64x2_t vmlsl_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLSL.U32 q0, d0, d0[0]
13569*80a68eefSBob Badour _NEON2SSE_INLINE uint64x2_t vmlsl_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l) // VMLSL.U32 q0, d0, d0[0]
13570*80a68eefSBob Badour {
13571*80a68eefSBob Badour     uint32_t vlane;
13572*80a68eefSBob Badour     uint32x2_t c;
13573*80a68eefSBob Badour     vlane = vget_lane_u32(v, l);
13574*80a68eefSBob Badour     c = vdup_n_u32(vlane);
13575*80a68eefSBob Badour     return vmlsl_u32(a, b, c);
13576*80a68eefSBob Badour }
13577*80a68eefSBob Badour 
13578*80a68eefSBob Badour //********* Vector widening saturating doubling multiply subtract by scalar **************************
13579*80a68eefSBob Badour //******************************************************************************************************
13580*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VQDMLSL.S16 q0, d0, d0[0]
13581*80a68eefSBob Badour _NEON2SSE_INLINE int32x4_t vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l)
13582*80a68eefSBob Badour {
13583*80a68eefSBob Badour     int16_t vlane;
13584*80a68eefSBob Badour     int16x4_t c;
13585*80a68eefSBob Badour     vlane = vget_lane_s16(v, l);
13586*80a68eefSBob Badour     c = vdup_n_s16(vlane);
13587*80a68eefSBob Badour     return vqdmlsl_s16(a, b, c);
13588*80a68eefSBob Badour }
13589*80a68eefSBob Badour 
13590*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VQDMLSL.S32 q0, d0, d0[0]
13591*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l), _NEON2SSE_REASON_SLOW_SERIAL)
13592*80a68eefSBob Badour {
13593*80a68eefSBob Badour     int32_t vlane;
13594*80a68eefSBob Badour     int32x2_t c;
13595*80a68eefSBob Badour     vlane = vget_lane_s32(v, l);
13596*80a68eefSBob Badour     c = vdup_n_s32(vlane);
13597*80a68eefSBob Badour     return vqdmlsl_s32(a, b, c);
13598*80a68eefSBob Badour }
13599*80a68eefSBob Badour //********** Vector multiply with scalar *****************************
13600*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vmul_n_s16(int16x4_t a, int16_t b); // VMUL.I16 d0,d0,d0[0]
13601*80a68eefSBob Badour _NEON2SSE_INLINE int16x4_t vmul_n_s16(int16x4_t a, int16_t b) // VMUL.I16 d0,d0,d0[0]
13602*80a68eefSBob Badour {
13603*80a68eefSBob Badour     int16x4_t b16x4;
13604*80a68eefSBob Badour     b16x4 = vdup_n_s16(b);
13605*80a68eefSBob Badour     return vmul_s16(a, b16x4);
13606*80a68eefSBob Badour }
13607*80a68eefSBob Badour 
13608*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vmul_n_s32(int32x2_t a, int32_t b); // VMUL.I32 d0,d0,d0[0]
13609*80a68eefSBob Badour _NEON2SSE_INLINE int32x2_t vmul_n_s32(int32x2_t a, int32_t b) // VMUL.I32 d0,d0,d0[0]
13610*80a68eefSBob Badour {
13611*80a68eefSBob Badour     //serial solution looks faster
13612*80a68eefSBob Badour     int32x2_t b32x2;
13613*80a68eefSBob Badour     b32x2 = vdup_n_s32(b);
13614*80a68eefSBob Badour     return vmul_s32(a, b32x2);
13615*80a68eefSBob Badour }
13616*80a68eefSBob Badour 
13617*80a68eefSBob Badour _NEON2SSESTORAGE float32x2_t vmul_n_f32(float32x2_t a, float32_t b); // VMUL.F32 d0,d0,d0[0]
13618*80a68eefSBob Badour _NEON2SSE_INLINE float32x2_t vmul_n_f32(float32x2_t a, float32_t b) // VMUL.F32 d0,d0,d0[0]
13619*80a68eefSBob Badour {
13620*80a68eefSBob Badour     float32x2_t b32x2;
13621*80a68eefSBob Badour     b32x2 = vdup_n_f32(b);
13622*80a68eefSBob Badour     return vmul_f32(a, b32x2);
13623*80a68eefSBob Badour }
13624*80a68eefSBob Badour 
13625*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vmul_n_u16(uint16x4_t a, uint16_t b); // VMUL.I16 d0,d0,d0[0]
13626*80a68eefSBob Badour _NEON2SSE_INLINE uint16x4_t vmul_n_u16(uint16x4_t a, uint16_t b) // VMUL.I16 d0,d0,d0[0]
13627*80a68eefSBob Badour {
13628*80a68eefSBob Badour     uint16x4_t b16x4;
13629*80a68eefSBob Badour     b16x4 = vdup_n_u16(b);
13630*80a68eefSBob Badour     return vmul_u16(a, b16x4);
13631*80a68eefSBob Badour }
13632*80a68eefSBob Badour 
13633*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vmul_n_u32(uint32x2_t a, uint32_t b); // VMUL.I32 d0,d0,d0[0]
13634*80a68eefSBob Badour _NEON2SSE_INLINE uint32x2_t vmul_n_u32(uint32x2_t a, uint32_t b) // VMUL.I32 d0,d0,d0[0]
13635*80a68eefSBob Badour {
13636*80a68eefSBob Badour     //serial solution looks faster
13637*80a68eefSBob Badour     uint32x2_t b32x2;
13638*80a68eefSBob Badour     b32x2 = vdup_n_u32(b);
13639*80a68eefSBob Badour     return vmul_u32(a, b32x2);
13640*80a68eefSBob Badour }
13641*80a68eefSBob Badour 
13642*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vmulq_n_s16(int16x8_t a, int16_t b); // VMUL.I16 q0,q0,d0[0]
13643*80a68eefSBob Badour _NEON2SSE_INLINE int16x8_t vmulq_n_s16(int16x8_t a, int16_t b) // VMUL.I16 q0,q0,d0[0]
13644*80a68eefSBob Badour {
13645*80a68eefSBob Badour     int16x8_t b16x8;
13646*80a68eefSBob Badour     b16x8 = vdupq_n_s16(b);
13647*80a68eefSBob Badour     return vmulq_s16(a, b16x8);
13648*80a68eefSBob Badour }
13649*80a68eefSBob Badour 
13650*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vmulq_n_s32(int32x4_t a, int32_t b); // VMUL.I32 q0,q0,d0[0]
13651*80a68eefSBob Badour _NEON2SSE_INLINE int32x4_t vmulq_n_s32(int32x4_t a, int32_t b) // VMUL.I32 q0,q0,d0[0]
13652*80a68eefSBob Badour {
13653*80a68eefSBob Badour     int32x4_t b32x4;
13654*80a68eefSBob Badour     b32x4 = vdupq_n_s32(b);
13655*80a68eefSBob Badour     return vmulq_s32(a, b32x4);
13656*80a68eefSBob Badour }
13657*80a68eefSBob Badour 
13658*80a68eefSBob Badour _NEON2SSESTORAGE float32x4_t vmulq_n_f32(float32x4_t a, float32_t b); // VMUL.F32 q0,q0,d0[0]
13659*80a68eefSBob Badour _NEON2SSE_INLINE float32x4_t vmulq_n_f32(float32x4_t a, float32_t b) // VMUL.F32 q0,q0,d0[0]
13660*80a68eefSBob Badour {
13661*80a68eefSBob Badour     float32x4_t b32x4;
13662*80a68eefSBob Badour     b32x4 = vdupq_n_f32(b);
13663*80a68eefSBob Badour     return vmulq_f32(a, b32x4);
13664*80a68eefSBob Badour }
13665*80a68eefSBob Badour 
13666*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vmulq_n_u16(uint16x8_t a, uint16_t b); // VMUL.I16 q0,q0,d0[0]
13667*80a68eefSBob Badour _NEON2SSE_INLINE uint16x8_t vmulq_n_u16(uint16x8_t a, uint16_t b) // VMUL.I16 q0,q0,d0[0]
13668*80a68eefSBob Badour {
13669*80a68eefSBob Badour     uint16x8_t b16x8;
13670*80a68eefSBob Badour     b16x8 = vdupq_n_u16(b);
13671*80a68eefSBob Badour     return vmulq_u16(a, b16x8);
13672*80a68eefSBob Badour }
13673*80a68eefSBob Badour 
13674*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vmulq_n_u32(uint32x4_t a, uint32_t b); // VMUL.I32 q0,q0,d0[0]
13675*80a68eefSBob Badour _NEON2SSE_INLINE uint32x4_t vmulq_n_u32(uint32x4_t a, uint32_t b) // VMUL.I32 q0,q0,d0[0]
13676*80a68eefSBob Badour {
13677*80a68eefSBob Badour     uint32x4_t b32x4;
13678*80a68eefSBob Badour     b32x4 = vdupq_n_u32(b);
13679*80a68eefSBob Badour     return vmulq_u32(a, b32x4);
13680*80a68eefSBob Badour }
13681*80a68eefSBob Badour 
13682*80a68eefSBob Badour //********** Vector multiply lane *****************************
13683*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vmul_lane_s16 (int16x4_t a, int16x4_t b, __constrange(0,3) int c);
13684*80a68eefSBob Badour _NEON2SSE_INLINE int16x4_t vmul_lane_s16 (int16x4_t a, int16x4_t b, __constrange(0,3) int c)
13685*80a68eefSBob Badour {
13686*80a68eefSBob Badour     int16x4_t b16x4;
13687*80a68eefSBob Badour     int16_t vlane;
13688*80a68eefSBob Badour     vlane = vget_lane_s16(b, c);
13689*80a68eefSBob Badour     b16x4 = vdup_n_s16(vlane);
13690*80a68eefSBob Badour     return vmul_s16(a, b16x4);
13691*80a68eefSBob Badour }
13692*80a68eefSBob Badour 
13693*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vmul_lane_s32 (int32x2_t a, int32x2_t b, __constrange(0,1) int c);
13694*80a68eefSBob Badour _NEON2SSE_INLINE int32x2_t vmul_lane_s32 (int32x2_t a, int32x2_t b, __constrange(0,1) int c)
13695*80a68eefSBob Badour {
13696*80a68eefSBob Badour     int32x2_t b32x2;
13697*80a68eefSBob Badour     int32_t vlane;
13698*80a68eefSBob Badour     vlane = vget_lane_s32(b, c);
13699*80a68eefSBob Badour     b32x2 = vdup_n_s32(vlane);
13700*80a68eefSBob Badour     return vmul_s32(a, b32x2);
13701*80a68eefSBob Badour }
13702*80a68eefSBob Badour 
13703*80a68eefSBob Badour _NEON2SSESTORAGE float32x2_t vmul_lane_f32 (float32x2_t a, float32x2_t b, __constrange(0,1) int c);
13704*80a68eefSBob Badour _NEON2SSE_INLINE float32x2_t vmul_lane_f32 (float32x2_t a, float32x2_t b, __constrange(0,1) int c)
13705*80a68eefSBob Badour {
13706*80a68eefSBob Badour     float32x2_t b32x2;
13707*80a68eefSBob Badour     float32_t vlane;
13708*80a68eefSBob Badour     vlane = vget_lane_f32(b, c);
13709*80a68eefSBob Badour     b32x2 = vdup_n_f32(vlane);
13710*80a68eefSBob Badour     return vmul_f32(a, b32x2);
13711*80a68eefSBob Badour }
13712*80a68eefSBob Badour 
13713*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vmul_lane_u16 (uint16x4_t a, uint16x4_t b, __constrange(0,3) int c);
13714*80a68eefSBob Badour #define vmul_lane_u16 vmul_lane_s16
13715*80a68eefSBob Badour 
13716*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vmul_lane_u32 (uint32x2_t a, uint32x2_t b, __constrange(0,1) int c);
13717*80a68eefSBob Badour #define vmul_lane_u32 vmul_lane_s32
13718*80a68eefSBob Badour 
13719*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vmulq_lane_s16(int16x8_t a, int16x4_t b, __constrange(0,3) int c);
13720*80a68eefSBob Badour _NEON2SSE_INLINE int16x8_t vmulq_lane_s16 (int16x8_t a, int16x4_t b, __constrange(0,3) int c)
13721*80a68eefSBob Badour {
13722*80a68eefSBob Badour     int16x8_t b16x8;
13723*80a68eefSBob Badour     int16_t vlane;
13724*80a68eefSBob Badour     vlane = vget_lane_s16(b, c);
13725*80a68eefSBob Badour     b16x8 = vdupq_n_s16(vlane);
13726*80a68eefSBob Badour     return vmulq_s16(a, b16x8);
13727*80a68eefSBob Badour }
13728*80a68eefSBob Badour 
13729*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vmulq_lane_s32 (int32x4_t a, int32x2_t b, __constrange(0,1) int c);
13730*80a68eefSBob Badour _NEON2SSE_INLINE int32x4_t vmulq_lane_s32 (int32x4_t a, int32x2_t b, __constrange(0,1) int c)
13731*80a68eefSBob Badour {
13732*80a68eefSBob Badour     int32x4_t b32x4;
13733*80a68eefSBob Badour     int32_t vlane;
13734*80a68eefSBob Badour     vlane = vget_lane_s32(b, c);
13735*80a68eefSBob Badour     b32x4 = vdupq_n_s32(vlane);
13736*80a68eefSBob Badour     return vmulq_s32(a, b32x4);
13737*80a68eefSBob Badour }
13738*80a68eefSBob Badour 
13739*80a68eefSBob Badour _NEON2SSESTORAGE float32x4_t vmulq_lane_f32 (float32x4_t a, float32x2_t b, __constrange(0,1) int c);
13740*80a68eefSBob Badour _NEON2SSE_INLINE float32x4_t vmulq_lane_f32 (float32x4_t a, float32x2_t b, __constrange(0,1) int c)
13741*80a68eefSBob Badour {
13742*80a68eefSBob Badour     float32x4_t b32x4;
13743*80a68eefSBob Badour     float32_t vlane;
13744*80a68eefSBob Badour     vlane = vget_lane_f32(b, c);
13745*80a68eefSBob Badour     b32x4 = vdupq_n_f32(vlane);
13746*80a68eefSBob Badour     return vmulq_f32(a, b32x4);
13747*80a68eefSBob Badour }
13748*80a68eefSBob Badour 
13749*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vmulq_lane_u16 (uint16x8_t a, uint16x4_t b, __constrange(0,3) int c);
13750*80a68eefSBob Badour #define vmulq_lane_u16 vmulq_lane_s16
13751*80a68eefSBob Badour 
13752*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vmulq_lane_u32 (uint32x4_t a, uint32x2_t b, __constrange(0,1) int c);
13753*80a68eefSBob Badour #define vmulq_lane_u32 vmulq_lane_s32
13754*80a68eefSBob Badour 
13755*80a68eefSBob Badour //**** Vector long multiply with scalar ************
13756*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vmull_n_s16(int16x4_t vec1, int16_t val2); // VMULL.S16 q0,d0,d0[0]
13757*80a68eefSBob Badour _NEON2SSE_INLINE int32x4_t vmull_n_s16(int16x4_t vec1, int16_t val2) // VMULL.S16 q0,d0,d0[0]
13758*80a68eefSBob Badour {
13759*80a68eefSBob Badour     int16x4_t b16x4;
13760*80a68eefSBob Badour     b16x4 = vdup_n_s16(val2);
13761*80a68eefSBob Badour     return vmull_s16(vec1, b16x4);
13762*80a68eefSBob Badour }
13763*80a68eefSBob Badour 
13764*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vmull_n_s32(int32x2_t vec1, int32_t val2); // VMULL.S32 q0,d0,d0[0]
13765*80a68eefSBob Badour _NEON2SSE_INLINE int64x2_t vmull_n_s32(int32x2_t vec1, int32_t val2) // VMULL.S32 q0,d0,d0[0]
13766*80a68eefSBob Badour {
13767*80a68eefSBob Badour     int32x2_t b32x2;
13768*80a68eefSBob Badour     b32x2 = vdup_n_s32(val2);
13769*80a68eefSBob Badour     return vmull_s32(vec1, b32x2);
13770*80a68eefSBob Badour }
13771*80a68eefSBob Badour 
13772*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vmull_n_u16(uint16x4_t vec1, uint16_t val2); // VMULL.s16 q0,d0,d0[0]
13773*80a68eefSBob Badour _NEON2SSE_INLINE uint32x4_t vmull_n_u16(uint16x4_t vec1, uint16_t val2) // VMULL.s16 q0,d0,d0[0]
13774*80a68eefSBob Badour {
13775*80a68eefSBob Badour     uint16x4_t b16x4;
13776*80a68eefSBob Badour     b16x4 = vdup_n_u16(val2);
13777*80a68eefSBob Badour     return vmull_u16(vec1, b16x4);
13778*80a68eefSBob Badour }
13779*80a68eefSBob Badour 
13780*80a68eefSBob Badour _NEON2SSESTORAGE uint64x2_t vmull_n_u32(uint32x2_t vec1, uint32_t val2); // VMULL.U32 q0,d0,d0[0]
13781*80a68eefSBob Badour _NEON2SSE_INLINE uint64x2_t vmull_n_u32(uint32x2_t vec1, uint32_t val2) // VMULL.U32 q0,d0,d0[0]
13782*80a68eefSBob Badour {
13783*80a68eefSBob Badour     uint32x2_t b32x2;
13784*80a68eefSBob Badour     b32x2 = vdup_n_u32(val2);
13785*80a68eefSBob Badour     return vmull_u32(vec1, b32x2);
13786*80a68eefSBob Badour }
13787*80a68eefSBob Badour 
13788*80a68eefSBob Badour //**** Vector long multiply by scalar ****
13789*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VMULL.S16 q0,d0,d0[0]
13790*80a68eefSBob Badour _NEON2SSE_INLINE int32x4_t vmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3) // VMULL.S16 q0,d0,d0[0]
13791*80a68eefSBob Badour {
13792*80a68eefSBob Badour     int16_t vlane;
13793*80a68eefSBob Badour     int16x4_t b;
13794*80a68eefSBob Badour     vlane = vget_lane_s16(val2, val3);
13795*80a68eefSBob Badour     b = vdup_n_s16(vlane);
13796*80a68eefSBob Badour     return vmull_s16(vec1, b);
13797*80a68eefSBob Badour }
13798*80a68eefSBob Badour 
13799*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VMULL.S32 q0,d0,d0[0]
13800*80a68eefSBob Badour _NEON2SSE_INLINE int64x2_t vmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3) // VMULL.S32 q0,d0,d0[0]
13801*80a68eefSBob Badour {
13802*80a68eefSBob Badour     int32_t vlane;
13803*80a68eefSBob Badour     int32x2_t b;
13804*80a68eefSBob Badour     vlane = vget_lane_s32(val2, val3);
13805*80a68eefSBob Badour     b = vdup_n_s32(vlane);
13806*80a68eefSBob Badour     return vmull_s32(vec1, b);
13807*80a68eefSBob Badour }
13808*80a68eefSBob Badour 
13809*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vmull_lane_u16(uint16x4_t vec1, uint16x4_t val2, __constrange(0, 3) int val3); // VMULL.s16 q0,d0,d0[0]
13810*80a68eefSBob Badour _NEON2SSE_INLINE uint32x4_t vmull_lane_u16(uint16x4_t vec1, uint16x4_t val2, __constrange(0, 3) int val3) // VMULL.s16 q0,d0,d0[0]
13811*80a68eefSBob Badour {
13812*80a68eefSBob Badour     uint16_t vlane;
13813*80a68eefSBob Badour     uint16x4_t b;
13814*80a68eefSBob Badour     vlane = vget_lane_u16(val2, val3);
13815*80a68eefSBob Badour     b = vdup_n_u16(vlane);
13816*80a68eefSBob Badour     return vmull_u16(vec1, b);
13817*80a68eefSBob Badour }
13818*80a68eefSBob Badour 
13819*80a68eefSBob Badour _NEON2SSESTORAGE uint64x2_t vmull_lane_u32(uint32x2_t vec1, uint32x2_t val2, __constrange(0, 1) int val3); // VMULL.U32 q0,d0,d0[0]
13820*80a68eefSBob Badour _NEON2SSE_INLINE uint64x2_t vmull_lane_u32(uint32x2_t vec1, uint32x2_t val2, __constrange(0, 1) int val3) // VMULL.U32 q0,d0,d0[0]
13821*80a68eefSBob Badour {
13822*80a68eefSBob Badour     uint32_t vlane;
13823*80a68eefSBob Badour     uint32x2_t b;
13824*80a68eefSBob Badour     vlane = vget_lane_u32(val2, val3);
13825*80a68eefSBob Badour     b = vdup_n_u32(vlane);
13826*80a68eefSBob Badour     return vmull_u32(vec1, b);
13827*80a68eefSBob Badour }
13828*80a68eefSBob Badour 
13829*80a68eefSBob Badour //********* Vector saturating doubling long multiply with scalar  *******************
13830*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vqdmull_n_s16(int16x4_t vec1, int16_t val2); // VQDMULL.S16 q0,d0,d0[0]
13831*80a68eefSBob Badour _NEON2SSE_INLINE int32x4_t vqdmull_n_s16(int16x4_t vec1, int16_t val2)
13832*80a68eefSBob Badour {
13833*80a68eefSBob Badour     //the serial solution may be faster due to saturation
13834*80a68eefSBob Badour     int16x4_t b;
13835*80a68eefSBob Badour     b = vdup_n_s16(val2);
13836*80a68eefSBob Badour     return vqdmull_s16(vec1, b);
13837*80a68eefSBob Badour }
13838*80a68eefSBob Badour 
13839*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vqdmull_n_s32(int32x2_t vec1, int32_t val2); // VQDMULL.S32 q0,d0,d0[0]
13840*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmull_n_s32(int32x2_t vec1, int32_t val2), _NEON2SSE_REASON_SLOW_SERIAL)
13841*80a68eefSBob Badour {
13842*80a68eefSBob Badour     int32x2_t b;
13843*80a68eefSBob Badour     b = vdup_n_s32(val2);
13844*80a68eefSBob Badour     return vqdmull_s32(vec1,b); //slow serial function!!!!
13845*80a68eefSBob Badour }
13846*80a68eefSBob Badour 
13847*80a68eefSBob Badour //************* Vector saturating doubling long multiply by scalar ***********************************************
13848*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vqdmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULL.S16 q0,d0,d0[0]
13849*80a68eefSBob Badour _NEON2SSE_INLINE int32x4_t vqdmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3)
13850*80a68eefSBob Badour {
13851*80a68eefSBob Badour     int16_t c;
13852*80a68eefSBob Badour     int16x4_t scalar;
13853*80a68eefSBob Badour     c = vget_lane_s16(val2, val3);
13854*80a68eefSBob Badour     scalar = vdup_n_s16(c);
13855*80a68eefSBob Badour     return vqdmull_s16(vec1, scalar);
13856*80a68eefSBob Badour }
13857*80a68eefSBob Badour 
13858*80a68eefSBob Badour 
13859*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vqdmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); //  VQDMULL.S32 q0,d0,d0[0]
13860*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3), _NEON2SSE_REASON_SLOW_SERIAL)
13861*80a68eefSBob Badour {
13862*80a68eefSBob Badour     int32_t c;
13863*80a68eefSBob Badour     int32x2_t scalar;
13864*80a68eefSBob Badour     c = vget_lane_s32(val2, val3);
13865*80a68eefSBob Badour     scalar = vdup_n_s32(c);
13866*80a68eefSBob Badour     return vqdmull_s32(vec1,scalar); //slow serial function!!!!
13867*80a68eefSBob Badour }
13868*80a68eefSBob Badour 
13869*80a68eefSBob Badour // *****Vector saturating doubling multiply high with scalar *****
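//Note: VQDMULH returns the high half of the doubled product, i.e. each 16-bit lane is
//saturate((2 * vec1[i] * val2) >> 16); the only overflowing case, -32768 * -32768, saturates to 32767.
//The 32-bit forms do the same with a 32-bit shift.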
13870*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vqdmulh_n_s16(int16x4_t vec1,  int16_t val2); //  VQDMULH.S16 d0,d0,d0[0]
13871*80a68eefSBob Badour _NEON2SSE_INLINE int16x4_t vqdmulh_n_s16(int16x4_t vec1,  int16_t val2)
13872*80a68eefSBob Badour {
13873*80a68eefSBob Badour     int16x4_t res64;
13874*80a68eefSBob Badour     return64(vqdmulhq_n_s16(_pM128i(vec1), val2));
13875*80a68eefSBob Badour }
13876*80a68eefSBob Badour 
13877*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vqdmulh_n_s32(int32x2_t vec1,  int32_t val2); //  VQDMULH.S32 d0,d0,d0[0]
13878*80a68eefSBob Badour _NEON2SSE_INLINE int32x2_t vqdmulh_n_s32(int32x2_t vec1,  int32_t val2)
13879*80a68eefSBob Badour {
13880*80a68eefSBob Badour     int32x2_t res64;
13881*80a68eefSBob Badour     return64(vqdmulhq_n_s32(_pM128i(vec1), val2));
13882*80a68eefSBob Badour }
13883*80a68eefSBob Badour 
13884*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vqdmulhq_n_s16(int16x8_t vec1, int16_t val2); //  VQDMULH.S16 q0,q0,d0[0]
13885*80a68eefSBob Badour _NEON2SSE_INLINE int16x8_t vqdmulhq_n_s16(int16x8_t vec1, int16_t val2) //  VQDMULH.S16 q0,q0,d0[0]
13886*80a68eefSBob Badour {
13887*80a68eefSBob Badour     //solution may not be optimal
13888*80a68eefSBob Badour     int16x8_t scalar;
13889*80a68eefSBob Badour     scalar = vdupq_n_s16(val2);
13890*80a68eefSBob Badour     return vqdmulhq_s16(vec1, scalar);
13891*80a68eefSBob Badour }
13892*80a68eefSBob Badour 
13893*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vqdmulhq_n_s32(int32x4_t vec1, int32_t val2); //  VQDMULH.S32 q0,q0,d0[0]
13894*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqdmulhq_n_s32(int32x4_t vec1, int32_t val2), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
13895*80a68eefSBob Badour {
13896*80a68eefSBob Badour     int32x4_t scalar;
13897*80a68eefSBob Badour     scalar = vdupq_n_s32(val2);
13898*80a68eefSBob Badour     return vqdmulhq_s32(vec1, scalar);
13899*80a68eefSBob Badour }
13900*80a68eefSBob Badour 
13901*80a68eefSBob Badour //***** Vector saturating doubling multiply high by scalar ****************
13902*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vqdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); //  VQDMULH.S16 d0,d0,d0[0]
13903*80a68eefSBob Badour _NEON2SSE_INLINE int16x4_t vqdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3) //  VQDMULH.S16 d0,d0,d0[0]
13904*80a68eefSBob Badour {
13905*80a68eefSBob Badour     //solution may not be optimal
13906*80a68eefSBob Badour     int16_t vlane;
13907*80a68eefSBob Badour     int16x4_t scalar;
13908*80a68eefSBob Badour     vlane = vget_lane_s16(val2, val3);
13909*80a68eefSBob Badour     scalar = vdup_n_s16(vlane);
13910*80a68eefSBob Badour     return vqdmulh_s16(vec1, scalar);
13911*80a68eefSBob Badour }
13912*80a68eefSBob Badour 
13913*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vqdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); //  VQDMULH.S32 d0,d0,d0[0]
13914*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
13915*80a68eefSBob Badour {
13916*80a68eefSBob Badour     int32_t vlane;
13917*80a68eefSBob Badour     int32x2_t scalar;
13918*80a68eefSBob Badour     vlane = vget_lane_s32(val2, val3);
13919*80a68eefSBob Badour     scalar = vdup_n_s32(vlane);
13920*80a68eefSBob Badour     return vqdmulh_s32(vec1, scalar);
13921*80a68eefSBob Badour }
13922*80a68eefSBob Badour 
13923*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vqdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3); //  VQDMULH.S16 q0,q0,d0[0]
13924*80a68eefSBob Badour _NEON2SSE_INLINE int16x8_t vqdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3) //  VQDMULH.S16 q0,q0,d0[0]
13925*80a68eefSBob Badour {
13926*80a68eefSBob Badour     //solution may not be optimal
13927*80a68eefSBob Badour     int16_t vlane;
13928*80a68eefSBob Badour     int16x8_t scalar;
13929*80a68eefSBob Badour     vlane = vget_lane_s16(val2, val3);
13930*80a68eefSBob Badour     scalar = vdupq_n_s16(vlane );
13931*80a68eefSBob Badour     return vqdmulhq_s16(vec1, scalar);
13932*80a68eefSBob Badour }
13933*80a68eefSBob Badour 
13934*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vqdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3); //  VQDMULH.S32 q0,q0,d0[0]
13935*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
13936*80a68eefSBob Badour {
13937*80a68eefSBob Badour     //solution may not be optimal
13938*80a68eefSBob Badour     int32_t vlane;
13939*80a68eefSBob Badour     int32x4_t scalar;
13940*80a68eefSBob Badour     vlane = vgetq_lane_s32(_pM128i(val2), val3);
13941*80a68eefSBob Badour     scalar = vdupq_n_s32(vlane );
13942*80a68eefSBob Badour     return vqdmulhq_s32(vec1, scalar);
13943*80a68eefSBob Badour }
13944*80a68eefSBob Badour 
13945*80a68eefSBob Badour //******** Vector saturating rounding doubling multiply high with scalar ***
13946*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vqrdmulh_n_s16(int16x4_t vec1, int16_t val2); // VQRDMULH.S16 d0,d0,d0[0]
13947*80a68eefSBob Badour _NEON2SSE_INLINE int16x4_t vqrdmulh_n_s16(int16x4_t vec1, int16_t val2) // VQRDMULH.S16 d0,d0,d0[0]
13948*80a68eefSBob Badour {
13949*80a68eefSBob Badour     //solution may not be optimal
13950*80a68eefSBob Badour     int16x4_t scalar;
13951*80a68eefSBob Badour     scalar = vdup_n_s16(val2);
13952*80a68eefSBob Badour     return vqrdmulh_s16(vec1, scalar);
13953*80a68eefSBob Badour }
13954*80a68eefSBob Badour 
13955*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vqrdmulh_n_s32(int32x2_t vec1, int32_t val2); // VQRDMULH.S32 d0,d0,d0[0]
13956*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrdmulh_n_s32(int32x2_t vec1, int32_t val2), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
13957*80a68eefSBob Badour {
13958*80a68eefSBob Badour     int32x2_t scalar;
13959*80a68eefSBob Badour     scalar = vdup_n_s32(val2);
13960*80a68eefSBob Badour     return vqrdmulh_s32(vec1, scalar);
13961*80a68eefSBob Badour }
13962*80a68eefSBob Badour 
13963*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vqrdmulhq_n_s16(int16x8_t vec1, int16_t val2); // VQRDMULH.S16 q0,q0,d0[0]
13964*80a68eefSBob Badour _NEON2SSE_INLINE int16x8_t vqrdmulhq_n_s16(int16x8_t vec1, int16_t val2) // VQRDMULH.S16 q0,q0,d0[0]
13965*80a68eefSBob Badour {
13966*80a68eefSBob Badour     //solution may not be optimal
13967*80a68eefSBob Badour     int16x8_t scalar;
13968*80a68eefSBob Badour     scalar = vdupq_n_s16(val2);
13969*80a68eefSBob Badour     return vqrdmulhq_s16(vec1, scalar);
13970*80a68eefSBob Badour }
13971*80a68eefSBob Badour 
13972*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vqrdmulhq_n_s32(int32x4_t vec1, int32_t val2); // VQRDMULH.S32 q0,q0,d0[0]
13973*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqrdmulhq_n_s32(int32x4_t vec1, int32_t val2), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
13974*80a68eefSBob Badour {
13975*80a68eefSBob Badour     int32x4_t scalar;
13976*80a68eefSBob Badour     scalar = vdupq_n_s32(val2);
13977*80a68eefSBob Badour     return vqrdmulhq_s32(vec1, scalar);
13978*80a68eefSBob Badour }
13979*80a68eefSBob Badour 
13980*80a68eefSBob Badour //********* Vector rounding saturating doubling multiply high by scalar  ****
13981*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vqrdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQRDMULH.S16 d0,d0,d0[0]
13982*80a68eefSBob Badour _NEON2SSE_INLINE int16x4_t vqrdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3) // VQRDMULH.S16 d0,d0,d0[0]
13983*80a68eefSBob Badour {
13984*80a68eefSBob Badour     //solution may not be optimal
13985*80a68eefSBob Badour     int16_t vlane;
13986*80a68eefSBob Badour     int16x4_t scalar;
13987*80a68eefSBob Badour     vlane = vget_lane_s16(val2, val3);
13988*80a68eefSBob Badour     scalar = vdup_n_s16(vlane);
13989*80a68eefSBob Badour     return vqrdmulh_s16(vec1, scalar);
13990*80a68eefSBob Badour }
13991*80a68eefSBob Badour 
13992*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vqrdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQRDMULH.S32 d0,d0,d0[0]
13993*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
13994*80a68eefSBob Badour {
13995*80a68eefSBob Badour     int32_t vlane;
13996*80a68eefSBob Badour     int32x2_t scalar;
13997*80a68eefSBob Badour     vlane = vget_lane_s32(val2, val3);
13998*80a68eefSBob Badour     scalar = vdup_n_s32(vlane);
13999*80a68eefSBob Badour     return vqrdmulh_s32(vec1, scalar);
14000*80a68eefSBob Badour }
14001*80a68eefSBob Badour 
14002*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vqrdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQRDMULH.S16 q0,q0,d0[0]
14003*80a68eefSBob Badour _NEON2SSE_INLINE int16x8_t vqrdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3) // VQRDMULH.S16 q0,q0,d0[0]
14004*80a68eefSBob Badour {
14005*80a68eefSBob Badour     //solution may not be optimal
14006*80a68eefSBob Badour     int16_t vlane;
14007*80a68eefSBob Badour     int16x8_t scalar;
14008*80a68eefSBob Badour     vlane = vget_lane_s16(val2, val3);
14009*80a68eefSBob Badour     scalar = vdupq_n_s16(vlane);
14010*80a68eefSBob Badour     return vqrdmulhq_s16(vec1, scalar);
14011*80a68eefSBob Badour }
14012*80a68eefSBob Badour 
14013*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vqrdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQRDMULH.S32 q0,q0,d0[0]
14014*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqrdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
14015*80a68eefSBob Badour {
14016*80a68eefSBob Badour     //solution may not be optimal
14017*80a68eefSBob Badour     int32_t vlane;
14018*80a68eefSBob Badour     int32x4_t scalar;
14019*80a68eefSBob Badour     vlane = vgetq_lane_s32(_pM128i(val2), val3);
14020*80a68eefSBob Badour     scalar = vdupq_n_s32(vlane );
14021*80a68eefSBob Badour     return vqrdmulhq_s32(vec1, scalar);
14022*80a68eefSBob Badour }
14023*80a68eefSBob Badour 
14024*80a68eefSBob Badour //**************Vector multiply accumulate with scalar *******************
14025*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vmla_n_s16(int16x4_t a, int16x4_t b, int16_t c); // VMLA.I16 d0, d0, d0[0]
14026*80a68eefSBob Badour _NEON2SSE_INLINE int16x4_t vmla_n_s16(int16x4_t a, int16x4_t b, int16_t c) // VMLA.I16 d0, d0, d0[0]
14027*80a68eefSBob Badour {
14028*80a68eefSBob Badour     int16x4_t scalar;
14029*80a68eefSBob Badour     scalar = vdup_n_s16(c);
14030*80a68eefSBob Badour     return vmla_s16(a, b, scalar);
14031*80a68eefSBob Badour }
14032*80a68eefSBob Badour 
14033*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vmla_n_s32(int32x2_t a, int32x2_t b, int32_t c); // VMLA.I32 d0, d0, d0[0]
14034*80a68eefSBob Badour _NEON2SSE_INLINE int32x2_t vmla_n_s32(int32x2_t a, int32x2_t b, int32_t c) // VMLA.I32 d0, d0, d0[0]
14035*80a68eefSBob Badour {
14036*80a68eefSBob Badour     int32x2_t scalar;
14037*80a68eefSBob Badour     scalar = vdup_n_s32(c);
14038*80a68eefSBob Badour     return vmla_s32(a, b, scalar);
14039*80a68eefSBob Badour }
14040*80a68eefSBob Badour 
14041*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vmla_n_u16(uint16x4_t a,  uint16x4_t b, uint16_t c); // VMLA.I16 d0, d0, d0[0]
14042*80a68eefSBob Badour #define vmla_n_u16 vmla_n_s16
14043*80a68eefSBob Badour 
14044*80a68eefSBob Badour 
14045*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vmla_n_u32(uint32x2_t a,  uint32x2_t b, uint32_t c); // VMLA.I32 d0, d0, d0[0]
14046*80a68eefSBob Badour #define vmla_n_u32 vmla_n_s32
14047*80a68eefSBob Badour 
14048*80a68eefSBob Badour 
14049*80a68eefSBob Badour _NEON2SSESTORAGE float32x2_t vmla_n_f32(float32x2_t a, float32x2_t b, float32_t c); // VMLA.F32 d0, d0, d0[0]
14050*80a68eefSBob Badour _NEON2SSE_INLINE float32x2_t vmla_n_f32(float32x2_t a, float32x2_t b, float32_t c) // VMLA.F32 d0, d0, d0[0]
14051*80a68eefSBob Badour {
14052*80a68eefSBob Badour     float32x2_t scalar;
14053*80a68eefSBob Badour     scalar = vdup_n_f32(c);
14054*80a68eefSBob Badour     return vmla_f32(a, b, scalar);
14055*80a68eefSBob Badour }
14056*80a68eefSBob Badour 
14057*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c); // VMLA.I16 q0, q0, d0[0]
14058*80a68eefSBob Badour _NEON2SSE_INLINE int16x8_t vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c) // VMLA.I16 q0, q0, d0[0]
14059*80a68eefSBob Badour {
14060*80a68eefSBob Badour     int16x8_t scalar;
14061*80a68eefSBob Badour     scalar = vdupq_n_s16(c);
14062*80a68eefSBob Badour     return vmlaq_s16(a,b,scalar);
14063*80a68eefSBob Badour }
14064*80a68eefSBob Badour 
14065*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c); // VMLA.I32 q0, q0, d0[0]
14066*80a68eefSBob Badour _NEON2SSE_INLINE int32x4_t vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c) // VMLA.I32 q0, q0, d0[0]
14067*80a68eefSBob Badour {
14068*80a68eefSBob Badour     int32x4_t scalar;
14069*80a68eefSBob Badour     scalar = vdupq_n_s32(c);
14070*80a68eefSBob Badour     return vmlaq_s32(a,b,scalar);
14071*80a68eefSBob Badour }
14072*80a68eefSBob Badour 
14073*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vmlaq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c); // VMLA.I16 q0, q0, d0[0]
14074*80a68eefSBob Badour #define vmlaq_n_u16 vmlaq_n_s16
14075*80a68eefSBob Badour 
14076*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vmlaq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c); // VMLA.I32 q0, q0, d0[0]
14077*80a68eefSBob Badour #define vmlaq_n_u32 vmlaq_n_s32
14078*80a68eefSBob Badour 
14079*80a68eefSBob Badour _NEON2SSESTORAGE float32x4_t vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c); // VMLA.F32 q0, q0, d0[0]
14080*80a68eefSBob Badour _NEON2SSE_INLINE float32x4_t vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c) // VMLA.F32 q0, q0, d0[0]
14081*80a68eefSBob Badour {
14082*80a68eefSBob Badour     float32x4_t scalar;
14083*80a68eefSBob Badour     scalar = vdupq_n_f32(c);
14084*80a68eefSBob Badour     return vmlaq_f32(a,b,scalar);
14085*80a68eefSBob Badour }
14086*80a68eefSBob Badour 
14087*80a68eefSBob Badour //************Vector widening multiply accumulate with scalar****************************
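//Illustrative note (not part of the original header): vmlal_n_* widens before multiplying, e.g. for s16
//    res[i] = a[i] + (int32_t)b[i] * (int32_t)c;   //32-bit accumulation, the 16x16 product cannot overflow
//A usage sketch with hypothetical names:
//    int32x4_t acc = vmlal_n_s16(acc0, samples, gain);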
14088*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VMLAL.S16 q0, d0, d0[0]
14089*80a68eefSBob Badour _NEON2SSE_INLINE int32x4_t vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) // VMLAL.S16 q0, d0, d0[0]
14090*80a68eefSBob Badour {
14091*80a68eefSBob Badour     int16x4_t vc;
14092*80a68eefSBob Badour     vc = vdup_n_s16(c);
14093*80a68eefSBob Badour     return vmlal_s16(a, b, vc);
14094*80a68eefSBob Badour }
14095*80a68eefSBob Badour 
14096*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VMLAL.S32 q0, d0, d0[0]
14097*80a68eefSBob Badour _NEON2SSE_INLINE int64x2_t vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) // VMLAL.S32 q0, d0, d0[0]
14098*80a68eefSBob Badour {
14099*80a68eefSBob Badour     int32x2_t vc;
14100*80a68eefSBob Badour     vc = vdup_n_s32(c);
14101*80a68eefSBob Badour     return vmlal_s32(a, b, vc);
14102*80a68eefSBob Badour }
14103*80a68eefSBob Badour 
14104*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c); // VMLAL.s16 q0, d0, d0[0]
14105*80a68eefSBob Badour _NEON2SSE_INLINE uint32x4_t vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) // VMLAL.s16 q0, d0, d0[0]
14106*80a68eefSBob Badour {
14107*80a68eefSBob Badour     uint16x4_t vc;
14108*80a68eefSBob Badour     vc = vdup_n_u16(c);
14109*80a68eefSBob Badour     return vmlal_u16(a, b, vc);
14110*80a68eefSBob Badour }
14111*80a68eefSBob Badour 
14112*80a68eefSBob Badour _NEON2SSESTORAGE uint64x2_t vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c); // VMLAL.U32 q0, d0, d0[0]
14113*80a68eefSBob Badour _NEON2SSE_INLINE uint64x2_t vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) // VMLAL.U32 q0, d0, d0[0]
14114*80a68eefSBob Badour {
14115*80a68eefSBob Badour     uint32x2_t vc;
14116*80a68eefSBob Badour     vc = vdup_n_u32(c);
14117*80a68eefSBob Badour     return vmlal_u32(a, b, vc);
14118*80a68eefSBob Badour }
14119*80a68eefSBob Badour 
14120*80a68eefSBob Badour //************ Vector widening saturating doubling multiply accumulate with scalar **************
14121*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VQDMLAL.S16 q0, d0, d0[0]
14122*80a68eefSBob Badour _NEON2SSE_INLINE int32x4_t vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c)
14123*80a68eefSBob Badour {
14124*80a68eefSBob Badour     //not an optimal SIMD solution, a serial one may be faster
14125*80a68eefSBob Badour     int16x4_t vc;
14126*80a68eefSBob Badour     vc = vdup_n_s16(c);
14127*80a68eefSBob Badour     return vqdmlal_s16(a, b, vc);
14128*80a68eefSBob Badour }
14129*80a68eefSBob Badour 
14130*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VQDMLAL.S32 q0, d0, d0[0]
14131*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c), _NEON2SSE_REASON_SLOW_SERIAL)
14132*80a68eefSBob Badour {
14133*80a68eefSBob Badour     int32x2_t vc;
14134*80a68eefSBob Badour     vc = vdup_n_s32(c);
14135*80a68eefSBob Badour     return vqdmlal_s32(a, b, vc);
14136*80a68eefSBob Badour }
14137*80a68eefSBob Badour 
14138*80a68eefSBob Badour //******** Vector multiply subtract with scalar **************
14139*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vmls_n_s16(int16x4_t a, int16x4_t b, int16_t c); // VMLS.I16 d0, d0, d0[0]
14140*80a68eefSBob Badour _NEON2SSE_INLINE int16x4_t vmls_n_s16(int16x4_t a, int16x4_t b, int16_t c) // VMLS.I16 d0, d0, d0[0]
14141*80a68eefSBob Badour {
14142*80a68eefSBob Badour     int16x4_t vc;
14143*80a68eefSBob Badour     vc = vdup_n_s16(c);
14144*80a68eefSBob Badour     return vmls_s16(a, b, vc);
14145*80a68eefSBob Badour }
14146*80a68eefSBob Badour 
14147*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vmls_n_s32(int32x2_t a, int32x2_t b, int32_t c); // VMLS.I32 d0, d0, d0[0]
14148*80a68eefSBob Badour _NEON2SSE_INLINE int32x2_t vmls_n_s32(int32x2_t a, int32x2_t b, int32_t c) // VMLS.I32 d0, d0, d0[0]
14149*80a68eefSBob Badour {
14150*80a68eefSBob Badour     int32x2_t vc;
14151*80a68eefSBob Badour     vc = vdup_n_s32(c);
14152*80a68eefSBob Badour     return vmls_s32(a, b, vc);
14153*80a68eefSBob Badour }
14154*80a68eefSBob Badour 
14155*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c); // VMLS.I16 d0, d0, d0[0]
14156*80a68eefSBob Badour _NEON2SSE_INLINE uint16x4_t vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) // VMLS.I16 d0, d0, d0[0]
14157*80a68eefSBob Badour {
14158*80a68eefSBob Badour     uint16x4_t vc;
14159*80a68eefSBob Badour     vc = vdup_n_s16(c);
14160*80a68eefSBob Badour     return vmls_s16(a, b, vc);
14161*80a68eefSBob Badour }
14162*80a68eefSBob Badour 
14163*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c); // VMLS.I32 d0, d0, d0[0]
14164*80a68eefSBob Badour _NEON2SSE_INLINE uint32x2_t vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) // VMLS.I32 d0, d0, d0[0]
14165*80a68eefSBob Badour {
14166*80a68eefSBob Badour     uint32x2_t vc;
14167*80a68eefSBob Badour     vc = vdup_n_u32(c);
14168*80a68eefSBob Badour     return vmls_u32(a, b, vc);
14169*80a68eefSBob Badour }
14170*80a68eefSBob Badour 
14171*80a68eefSBob Badour _NEON2SSESTORAGE float32x2_t vmls_n_f32(float32x2_t a, float32x2_t b, float32_t c); // VMLS.F32 d0, d0, d0[0]
14172*80a68eefSBob Badour _NEON2SSE_INLINE float32x2_t vmls_n_f32(float32x2_t a, float32x2_t b, float32_t c)
14173*80a68eefSBob Badour {
14174*80a68eefSBob Badour     float32x2_t res;
14175*80a68eefSBob Badour     res.m64_f32[0] = a.m64_f32[0] - b.m64_f32[0] * c;
14176*80a68eefSBob Badour     res.m64_f32[1] = a.m64_f32[1] - b.m64_f32[1] * c;
14177*80a68eefSBob Badour     return res;
14178*80a68eefSBob Badour }
14179*80a68eefSBob Badour 
14180*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c); // VMLS.I16 q0, q0, d0[0]
14181*80a68eefSBob Badour _NEON2SSE_INLINE int16x8_t vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c) // VMLS.I16 q0, q0, d0[0]
14182*80a68eefSBob Badour {
14183*80a68eefSBob Badour     int16x8_t vc;
14184*80a68eefSBob Badour     vc = vdupq_n_s16(c);
14185*80a68eefSBob Badour     return vmlsq_s16(a, b,vc);
14186*80a68eefSBob Badour }
14187*80a68eefSBob Badour 
14188*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c); // VMLS.I32 q0, q0, d0[0]
14189*80a68eefSBob Badour _NEON2SSE_INLINE int32x4_t vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c) // VMLS.I32 q0, q0, d0[0]
14190*80a68eefSBob Badour {
14191*80a68eefSBob Badour     int32x4_t vc;
14192*80a68eefSBob Badour     vc = vdupq_n_s32(c);
14193*80a68eefSBob Badour     return vmlsq_s32(a,b,vc);
14194*80a68eefSBob Badour }
14195*80a68eefSBob Badour 
14196*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c); // VMLS.I16 q0, q0, d0[0]
14197*80a68eefSBob Badour _NEON2SSE_INLINE uint16x8_t vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) // VMLS.I16 q0, q0, d0[0]
14198*80a68eefSBob Badour {
14199*80a68eefSBob Badour     uint16x8_t vc;
14200*80a68eefSBob Badour     vc = vdupq_n_u16(c);
14201*80a68eefSBob Badour     return vmlsq_u16(a,b,vc);
14202*80a68eefSBob Badour }
14203*80a68eefSBob Badour 
14204*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c); // VMLS.I32 q0, q0, d0[0]
14205*80a68eefSBob Badour _NEON2SSE_INLINE uint32x4_t vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) // VMLS.I32 q0, q0, d0[0]
14206*80a68eefSBob Badour {
14207*80a68eefSBob Badour     uint32x4_t vc;
14208*80a68eefSBob Badour     vc = vdupq_n_u32(c);
14209*80a68eefSBob Badour     return vmlsq_u32(a,b,vc);
14210*80a68eefSBob Badour }
14211*80a68eefSBob Badour 
14212*80a68eefSBob Badour _NEON2SSESTORAGE float32x4_t vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c); // VMLS.F32 q0, q0, d0[0]
14213*80a68eefSBob Badour _NEON2SSE_INLINE float32x4_t vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c)
14214*80a68eefSBob Badour {
14215*80a68eefSBob Badour     float32x4_t vc;
14216*80a68eefSBob Badour     vc = vdupq_n_f32(c);
14217*80a68eefSBob Badour     return vmlsq_f32(a,b,vc);
14218*80a68eefSBob Badour }
14219*80a68eefSBob Badour 
14220*80a68eefSBob Badour //**** Vector widening multiply subtract with scalar ******
14221*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VMLSL.S16 q0, d0, d0[0]
14222*80a68eefSBob Badour _NEON2SSE_INLINE int32x4_t vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) // VMLSL.S16 q0, d0, d0[0]
14223*80a68eefSBob Badour {
14224*80a68eefSBob Badour     int16x4_t vc;
14225*80a68eefSBob Badour     vc = vdup_n_s16(c);
14226*80a68eefSBob Badour     return vmlsl_s16(a, b, vc);
14227*80a68eefSBob Badour }
14228*80a68eefSBob Badour 
14229*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VMLSL.S32 q0, d0, d0[0]
14230*80a68eefSBob Badour _NEON2SSE_INLINE int64x2_t vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) // VMLSL.S32 q0, d0, d0[0]
14231*80a68eefSBob Badour {
14232*80a68eefSBob Badour     int32x2_t vc;
14233*80a68eefSBob Badour     vc = vdup_n_s32(c);
14234*80a68eefSBob Badour     return vmlsl_s32(a, b, vc);
14235*80a68eefSBob Badour }
14236*80a68eefSBob Badour 
14237*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c); // VMLSL.s16 q0, d0, d0[0]
14238*80a68eefSBob Badour _NEON2SSE_INLINE uint32x4_t vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) // VMLSL.s16 q0, d0, d0[0]
14239*80a68eefSBob Badour {
14240*80a68eefSBob Badour     uint16x4_t vc;
14241*80a68eefSBob Badour     vc = vdup_n_u16(c);
14242*80a68eefSBob Badour     return vmlsl_u16(a, b, vc);
14243*80a68eefSBob Badour }
14244*80a68eefSBob Badour 
14245*80a68eefSBob Badour _NEON2SSESTORAGE uint64x2_t vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c); // VMLSL.U32 q0, d0, d0[0]
14246*80a68eefSBob Badour _NEON2SSE_INLINE uint64x2_t vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) // VMLSL.U32 q0, d0, d0[0]
14247*80a68eefSBob Badour {
14248*80a68eefSBob Badour     uint32x2_t vc;
14249*80a68eefSBob Badour     vc = vdup_n_u32(c);
14250*80a68eefSBob Badour     return vmlsl_u32(a, b, vc);
14251*80a68eefSBob Badour }
14252*80a68eefSBob Badour 
14253*80a68eefSBob Badour //***** Vector widening saturating doubling multiply subtract with scalar *********
14254*80a68eefSBob Badour //**********************************************************************************
14255*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VQDMLSL.S16 q0, d0, d0[0]
14256*80a68eefSBob Badour _NEON2SSE_INLINE int32x4_t vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c)
14257*80a68eefSBob Badour {
14258*80a68eefSBob Badour     int16x4_t vc;
14259*80a68eefSBob Badour     vc = vdup_n_s16(c);
14260*80a68eefSBob Badour     return vqdmlsl_s16(a, b, vc);
14261*80a68eefSBob Badour }
14262*80a68eefSBob Badour 
14263*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VQDMLSL.S32 q0, d0, d0[0]
14264*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c), _NEON2SSE_REASON_SLOW_SERIAL)
14265*80a68eefSBob Badour {
14266*80a68eefSBob Badour     int32x2_t vc;
14267*80a68eefSBob Badour     vc = vdup_n_s32(c);
14268*80a68eefSBob Badour     return vqdmlsl_s32(a, b, vc);
14269*80a68eefSBob Badour }
14270*80a68eefSBob Badour 
14271*80a68eefSBob Badour //*******************  Vector extract ***********************************************
14272*80a68eefSBob Badour //*************************************************************************************
14273*80a68eefSBob Badour //VEXT (Vector Extract) extracts  elements from the bottom end of the second operand
14274*80a68eefSBob Badour //vector and the top end of the first, concatenates them, and places the result in the destination vector
14275*80a68eefSBob Badour //c elements from the bottom end of the second operand and (8-c) from the top end of the first
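//Illustrative sketch (not part of the original header), for the 8x8-bit case with c==3:
//    int8x8_t r = vext_s8(a, b, 3);  //r = {a[3],a[4],a[5],a[6],a[7], b[0],b[1],b[2]}
//The 128-bit forms further below map to the _MM_ALIGNR_EPI8 helper (PALIGNR or its SSE2 emulation),
//with the byte shift scaled by the element size.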
14276*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vext_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0
14277*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vext_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c),_NEON2SSE_REASON_SLOW_SERIAL)
14278*80a68eefSBob Badour {
14279*80a68eefSBob Badour     int8x8_t res;
14280*80a68eefSBob Badour     int i;
14281*80a68eefSBob Badour     for (i = 0; i<8 - c; i++) {
14282*80a68eefSBob Badour         res.m64_i8[i] = a.m64_i8[i + c];
14283*80a68eefSBob Badour     }
14284*80a68eefSBob Badour     for(i = 0; i<c; i++) {
14285*80a68eefSBob Badour         res.m64_i8[8 - c + i] = b.m64_i8[i];
14286*80a68eefSBob Badour     }
14287*80a68eefSBob Badour     return res;
14288*80a68eefSBob Badour }
14289*80a68eefSBob Badour 
14290*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vext_u8(uint8x8_t a,  uint8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0
14291*80a68eefSBob Badour #define vext_u8 vext_s8
14292*80a68eefSBob Badour //same result tested
14293*80a68eefSBob Badour 
14294*80a68eefSBob Badour _NEON2SSESTORAGE poly8x8_t vext_p8(poly8x8_t a, poly8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0
14295*80a68eefSBob Badour #define vext_p8 vext_u8
14296*80a68eefSBob Badour 
14297*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vext_s16(int16x4_t a, int16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0
14298*80a68eefSBob Badour _NEON2SSE_INLINE int16x4_t  _NEON2SSE_PERFORMANCE_WARNING (vext_s16(int16x4_t a, int16x4_t b, __constrange(0,3) int c), _NEON2SSE_REASON_SLOW_SERIAL)
14299*80a68eefSBob Badour {
14300*80a68eefSBob Badour     int16x4_t res;
14301*80a68eefSBob Badour     int i;
14302*80a68eefSBob Badour     for (i = 0; i<4 - c; i++) {
14303*80a68eefSBob Badour         res.m64_i16[i] = a.m64_i16[i + c];
14304*80a68eefSBob Badour     }
14305*80a68eefSBob Badour     for(i = 0; i<c; i++) {
14306*80a68eefSBob Badour         res.m64_i16[4 - c + i] = b.m64_i16[i];
14307*80a68eefSBob Badour     }
14308*80a68eefSBob Badour     return res;
14309*80a68eefSBob Badour }
14310*80a68eefSBob Badour 
14311*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vext_u16(uint16x4_t a,  uint16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0
14312*80a68eefSBob Badour #define vext_u16 vext_s16
14313*80a68eefSBob Badour 
14314*80a68eefSBob Badour _NEON2SSESTORAGE poly16x4_t vext_p16(poly16x4_t a, poly16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0
14315*80a68eefSBob Badour #define vext_p16 vext_s16
14316*80a68eefSBob Badour 
14317*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vext_s32(int32x2_t a, int32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0
14318*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vext_s32(int32x2_t a, int32x2_t b, __constrange(0,1) int c), _NEON2SSE_REASON_SLOW_SERIAL)
14319*80a68eefSBob Badour {
14320*80a68eefSBob Badour     int32x2_t res;
14321*80a68eefSBob Badour     if (c==0) {
14322*80a68eefSBob Badour         res.m64_i32[0] = a.m64_i32[0];
14323*80a68eefSBob Badour         res.m64_i32[1] = a.m64_i32[1];
14324*80a68eefSBob Badour     } else {
14325*80a68eefSBob Badour         res.m64_i32[0] = a.m64_i32[1];
14326*80a68eefSBob Badour         res.m64_i32[1] = b.m64_i32[0];
14327*80a68eefSBob Badour     }
14328*80a68eefSBob Badour     return res;
14329*80a68eefSBob Badour }
14330*80a68eefSBob Badour 
14331*80a68eefSBob Badour _NEON2SSESTORAGE float32x2_t vext_f32(float32x2_t a, float32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0
14332*80a68eefSBob Badour _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(float32x2_t vext_f32(float32x2_t a, float32x2_t b, __constrange(0,1) int c), _NEON2SSE_REASON_SLOW_SERIAL)
14333*80a68eefSBob Badour {
14334*80a68eefSBob Badour     float32x2_t res;
14335*80a68eefSBob Badour     if (c==0) {
14336*80a68eefSBob Badour         res.m64_f32[0] = a.m64_f32[0];
14337*80a68eefSBob Badour         res.m64_f32[1] = a.m64_f32[1];
14338*80a68eefSBob Badour     } else {
14339*80a68eefSBob Badour         res.m64_f32[0] = a.m64_f32[1];
14340*80a68eefSBob Badour         res.m64_f32[1] = b.m64_f32[0];
14341*80a68eefSBob Badour     }
14342*80a68eefSBob Badour     return res;
14343*80a68eefSBob Badour }
14344*80a68eefSBob Badour 
14345*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vext_u32(uint32x2_t a,  uint32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0
14346*80a68eefSBob Badour #define vext_u32 vext_s32
14347*80a68eefSBob Badour 
14348*80a68eefSBob Badour 
14349*80a68eefSBob Badour _NEON2SSESTORAGE int64x1_t vext_s64(int64x1_t a, int64x1_t b, __constrange(0,0) int c); // VEXT.64 d0,d0,d0,#0
14350*80a68eefSBob Badour #define vext_s64(a,b,c) a
14351*80a68eefSBob Badour 
14352*80a68eefSBob Badour _NEON2SSESTORAGE uint64x1_t vext_u64(uint64x1_t a, uint64x1_t b, __constrange(0,0) int c); // VEXT.64 d0,d0,d0,#0
14353*80a68eefSBob Badour #define vext_u64(a,b,c) a
14354*80a68eefSBob Badour 
14355*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t vextq_s8(int8x16_t a, int8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
14356*80a68eefSBob Badour #define vextq_s8(a,b,c) _MM_ALIGNR_EPI8 (b,a,c)
14357*80a68eefSBob Badour 
14358*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vextq_u8(uint8x16_t a, uint8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
14359*80a68eefSBob Badour #define vextq_u8(a,b,c) _MM_ALIGNR_EPI8 (b,a,c)
14360*80a68eefSBob Badour 
14361*80a68eefSBob Badour _NEON2SSESTORAGE poly8x16_t vextq_p8(poly8x16_t a, poly8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
14362*80a68eefSBob Badour #define vextq_p8 vextq_s8
14363*80a68eefSBob Badour 
14364*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vextq_s16(int16x8_t a, int16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
14365*80a68eefSBob Badour #define vextq_s16(a,b,c) _MM_ALIGNR_EPI8 (b,a,c * 2)
14366*80a68eefSBob Badour 
14367*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vextq_u16(uint16x8_t a, uint16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
14368*80a68eefSBob Badour #define vextq_u16(a,b,c) _MM_ALIGNR_EPI8 (b,a,c * 2)
14369*80a68eefSBob Badour 
14370*80a68eefSBob Badour _NEON2SSESTORAGE poly16x8_t vextq_p16(poly16x8_t a, poly16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
14371*80a68eefSBob Badour #define vextq_p16 vextq_s16
14372*80a68eefSBob Badour 
14373*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vextq_s32(int32x4_t a, int32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0
14374*80a68eefSBob Badour #define vextq_s32(a,b,c) _MM_ALIGNR_EPI8 (b,a,c * 4)
14375*80a68eefSBob Badour 
14376*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vextq_u32(uint32x4_t a, uint32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0
14377*80a68eefSBob Badour #define vextq_u32(a,b,c) _MM_ALIGNR_EPI8 (b,a,c * 4)
14378*80a68eefSBob Badour 
14379*80a68eefSBob Badour _NEON2SSESTORAGE float32x4_t vextq_f32(float32x4_t a, float32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0
14380*80a68eefSBob Badour #define vextq_f32(a,b,c) _M128(vextq_s32(_M128i(a),_M128i(b),c) )
14381*80a68eefSBob Badour 
14382*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vextq_s64(int64x2_t a, int64x2_t b, __constrange(0,1) int c); // VEXT.64 q0,q0,q0,#0
14383*80a68eefSBob Badour #define vextq_s64(a,b,c) _MM_ALIGNR_EPI8(b,a,c * 8)
14384*80a68eefSBob Badour 
14385*80a68eefSBob Badour _NEON2SSESTORAGE uint64x2_t vextq_u64(uint64x2_t a, uint64x2_t b, __constrange(0,1) int c); // VEXT.64 q0,q0,q0,#0
14386*80a68eefSBob Badour #define vextq_u64(a,b,c) _MM_ALIGNR_EPI8(b,a,c * 8)
14387*80a68eefSBob Badour 
14388*80a68eefSBob Badour //************ Reverse vector elements (swap endianness)*****************
14389*80a68eefSBob Badour //*************************************************************************
14390*80a68eefSBob Badour //VREVn.m reverses the order of the m-bit lanes within a set that is n bits wide.
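//Illustrative sketch (not part of the original header):
//    vrev64_s16(v)  -> {v[3],v[2],v[1],v[0]}        //16-bit lanes reversed within each 64-bit set
//    vrev32q_s16(v) -> {v[1],v[0], v[3],v[2], ...}  //16-bit lanes swapped within each 32-bit set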
14391*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vrev64_s8(int8x8_t vec); // VREV64.8 d0,d0
14392*80a68eefSBob Badour _NEON2SSE_INLINE int8x8_t vrev64_s8(int8x8_t vec)
14393*80a68eefSBob Badour {
14394*80a68eefSBob Badour     int8x8_t res64;
14395*80a68eefSBob Badour     __m128i res;
14396*80a68eefSBob Badour     res = vrev64q_s8(_pM128i(vec));
14397*80a68eefSBob Badour     return64(res);
14398*80a68eefSBob Badour }
14399*80a68eefSBob Badour 
14400*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vrev64_s16(int16x4_t vec); // VREV64.16 d0,d0
14401*80a68eefSBob Badour _NEON2SSE_INLINE int16x4_t vrev64_s16(int16x4_t vec)
14402*80a68eefSBob Badour {
14403*80a68eefSBob Badour     int16x4_t res64;
14404*80a68eefSBob Badour     __m128i res;
14405*80a68eefSBob Badour     res = vrev64q_s16(_pM128i(vec));
14406*80a68eefSBob Badour     return64(res);
14407*80a68eefSBob Badour }
14408*80a68eefSBob Badour 
14409*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vrev64_s32(int32x2_t vec); // VREV64.32 d0,d0
14410*80a68eefSBob Badour _NEON2SSE_INLINE int32x2_t vrev64_s32(int32x2_t vec)
14411*80a68eefSBob Badour {
14412*80a68eefSBob Badour     int32x2_t res;
14413*80a68eefSBob Badour     res.m64_i32[0] = vec.m64_i32[1];
14414*80a68eefSBob Badour     res.m64_i32[1] = vec.m64_i32[0];
14415*80a68eefSBob Badour     return res;
14416*80a68eefSBob Badour }
14417*80a68eefSBob Badour 
14418*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vrev64_u8(uint8x8_t vec); // VREV64.8 d0,d0
14419*80a68eefSBob Badour #define vrev64_u8 vrev64_s8
14420*80a68eefSBob Badour 
14421*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vrev64_u16(uint16x4_t vec); // VREV64.16 d0,d0
14422*80a68eefSBob Badour #define vrev64_u16 vrev64_s16
14423*80a68eefSBob Badour 
14424*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vrev64_u32(uint32x2_t vec); // VREV64.32 d0,d0
14425*80a68eefSBob Badour #define vrev64_u32 vrev64_s32
14426*80a68eefSBob Badour 
14427*80a68eefSBob Badour _NEON2SSESTORAGE poly8x8_t vrev64_p8(poly8x8_t vec); // VREV64.8 d0,d0
14428*80a68eefSBob Badour #define vrev64_p8 vrev64_u8
14429*80a68eefSBob Badour 
14430*80a68eefSBob Badour _NEON2SSESTORAGE poly16x4_t vrev64_p16(poly16x4_t vec); // VREV64.16 d0,d0
14431*80a68eefSBob Badour #define vrev64_p16 vrev64_u16
14432*80a68eefSBob Badour 
14433*80a68eefSBob Badour _NEON2SSESTORAGE float32x2_t vrev64_f32(float32x2_t vec); // VREV64.32 d0,d0
14434*80a68eefSBob Badour _NEON2SSE_INLINE float32x2_t vrev64_f32(float32x2_t vec)
14435*80a68eefSBob Badour {
14436*80a68eefSBob Badour     float32x2_t res;
14437*80a68eefSBob Badour     res.m64_f32[0] = vec.m64_f32[1];
14438*80a68eefSBob Badour     res.m64_f32[1] = vec.m64_f32[0];
14439*80a68eefSBob Badour     return res;
14440*80a68eefSBob Badour }
14441*80a68eefSBob Badour 
14442*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t vrev64q_s8(int8x16_t vec); // VREV64.8 q0,q0
14443*80a68eefSBob Badour _NEON2SSE_INLINE int8x16_t vrev64q_s8(int8x16_t vec) // VREV64.8 q0,q0
14444*80a68eefSBob Badour {
14445*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 static const int8_t mask_rev_e8[16] = {7,6,5,4,3,2,1,0, 15,14,13,12,11,10,9, 8};
14446*80a68eefSBob Badour     return _mm_shuffle_epi8 (vec, *(__m128i*)  mask_rev_e8);
14447*80a68eefSBob Badour }
14448*80a68eefSBob Badour 
14449*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vrev64q_s16(int16x8_t vec); // VREV64.16 q0,q0
14450*80a68eefSBob Badour _NEON2SSE_INLINE int16x8_t vrev64q_s16(int16x8_t vec) // VREV64.16 q0,q0
14451*80a68eefSBob Badour {
14452*80a68eefSBob Badour     //there is no _mm_shuffle_epi16, so _mm_shuffle_epi8 is used with the corresponding mask
14453*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 static const int8_t mask_rev_e16[16] = {6,7, 4,5,2,3,0,1,14,15,12,13,10,11,8,9};
14454*80a68eefSBob Badour     return _mm_shuffle_epi8 (vec, *(__m128i*)mask_rev_e16);
14455*80a68eefSBob Badour }
14456*80a68eefSBob Badour 
14457*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vrev64q_s32(int32x4_t vec); // VREV64.32 q0,q0
14458*80a68eefSBob Badour _NEON2SSE_INLINE int32x4_t vrev64q_s32(int32x4_t vec) // VREV64.32 q0,q0
14459*80a68eefSBob Badour {
14460*80a68eefSBob Badour     return _mm_shuffle_epi32 (vec, 1 | (0 << 2) | (3 << 4) | (2 << 6) );
14461*80a68eefSBob Badour }
14462*80a68eefSBob Badour 
14463*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vrev64q_u8(uint8x16_t vec); // VREV64.8 q0,q0
14464*80a68eefSBob Badour #define vrev64q_u8 vrev64q_s8
14465*80a68eefSBob Badour 
14466*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vrev64q_u16(uint16x8_t vec); // VREV64.16 q0,q0
14467*80a68eefSBob Badour #define vrev64q_u16 vrev64q_s16
14468*80a68eefSBob Badour 
14469*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vrev64q_u32(uint32x4_t vec); // VREV64.32 q0,q0
14470*80a68eefSBob Badour #define vrev64q_u32 vrev64q_s32
14471*80a68eefSBob Badour 
14472*80a68eefSBob Badour _NEON2SSESTORAGE poly8x16_t vrev64q_p8(poly8x16_t vec); // VREV64.8 q0,q0
14473*80a68eefSBob Badour #define vrev64q_p8 vrev64q_u8
14474*80a68eefSBob Badour 
14475*80a68eefSBob Badour _NEON2SSESTORAGE poly16x8_t vrev64q_p16(poly16x8_t vec); // VREV64.16 q0,q0
14476*80a68eefSBob Badour #define vrev64q_p16 vrev64q_u16
14477*80a68eefSBob Badour 
14478*80a68eefSBob Badour _NEON2SSESTORAGE float32x4_t vrev64q_f32(float32x4_t vec); // VREV64.32 q0,q0
14479*80a68eefSBob Badour #define vrev64q_f32(vec) _mm_shuffle_ps (vec,  vec, _MM_SHUFFLE(2,3, 0,1))
14480*80a68eefSBob Badour 
14481*80a68eefSBob Badour //********************  32 bit shuffles **********************
14482*80a68eefSBob Badour //************************************************************
14483*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vrev32_s8(int8x8_t vec); // VREV32.8 d0,d0
14484*80a68eefSBob Badour _NEON2SSE_INLINE int8x8_t vrev32_s8(int8x8_t vec)
14485*80a68eefSBob Badour {
14486*80a68eefSBob Badour     int8x8_t res64;
14487*80a68eefSBob Badour     __m128i res;
14488*80a68eefSBob Badour     res = vrev32q_s8(_pM128i(vec));
14489*80a68eefSBob Badour     return64(res);
14490*80a68eefSBob Badour }
14491*80a68eefSBob Badour 
14492*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vrev32_s16(int16x4_t vec); // VREV32.16 d0,d0
14493*80a68eefSBob Badour _NEON2SSE_INLINE int16x4_t vrev32_s16(int16x4_t vec)
14494*80a68eefSBob Badour {
14495*80a68eefSBob Badour     int16x4_t res64;
14496*80a68eefSBob Badour     __m128i res;
14497*80a68eefSBob Badour     res = vrev32q_s16(_pM128i(vec));
14498*80a68eefSBob Badour     return64(res);
14499*80a68eefSBob Badour }
14500*80a68eefSBob Badour 
14501*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vrev32_u8(uint8x8_t vec); // VREV32.8 d0,d0
14502*80a68eefSBob Badour #define vrev32_u8 vrev32_s8
14503*80a68eefSBob Badour 
14504*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vrev32_u16(uint16x4_t vec); // VREV32.16 d0,d0
14505*80a68eefSBob Badour #define vrev32_u16 vrev32_s16
14506*80a68eefSBob Badour 
14507*80a68eefSBob Badour _NEON2SSESTORAGE poly8x8_t vrev32_p8(poly8x8_t vec); // VREV32.8 d0,d0
14508*80a68eefSBob Badour #define vrev32_p8 vrev32_u8
14509*80a68eefSBob Badour 
14510*80a68eefSBob Badour _NEON2SSESTORAGE poly16x4_t vrev32_p16(poly16x4_t vec); // VREV32.16 d0,d0
14511*80a68eefSBob Badour #define vrev32_p16 vrev32_u16
14512*80a68eefSBob Badour 
14513*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t vrev32q_s8(int8x16_t vec); // VREV32.8 q0,q0
14514*80a68eefSBob Badour _NEON2SSE_INLINE int8x16_t vrev32q_s8(int8x16_t vec) // VREV32.8 q0,q0
14515*80a68eefSBob Badour {
14516*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 static const int8_t mask_rev_e8[16] = {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12};
14517*80a68eefSBob Badour     return _mm_shuffle_epi8 (vec, *(__m128i*)  mask_rev_e8);
14518*80a68eefSBob Badour }
14519*80a68eefSBob Badour 
14520*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vrev32q_s16(int16x8_t vec); // VREV32.16 q0,q0
14521*80a68eefSBob Badour _NEON2SSE_INLINE int16x8_t vrev32q_s16(int16x8_t vec) // VREV32.16 q0,q0
14522*80a68eefSBob Badour {
14523*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 static const int8_t mask_rev_e8[16] = {2,3,0,1, 6,7, 4,5, 10,11, 8,9, 14,15,12,13};
14524*80a68eefSBob Badour     return _mm_shuffle_epi8 (vec, *(__m128i*)  mask_rev_e8);
14525*80a68eefSBob Badour }
14526*80a68eefSBob Badour 
14527*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vrev32q_u8(uint8x16_t vec); // VREV32.8 q0,q0
14528*80a68eefSBob Badour #define vrev32q_u8 vrev32q_s8
14529*80a68eefSBob Badour 
14530*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vrev32q_u16(uint16x8_t vec); // VREV32.16 q0,q0
14531*80a68eefSBob Badour #define vrev32q_u16 vrev32q_s16
14532*80a68eefSBob Badour 
14533*80a68eefSBob Badour _NEON2SSESTORAGE poly8x16_t vrev32q_p8(poly8x16_t vec); // VREV32.8 q0,q0
14534*80a68eefSBob Badour #define vrev32q_p8 vrev32q_u8
14535*80a68eefSBob Badour 
14536*80a68eefSBob Badour _NEON2SSESTORAGE poly16x8_t vrev32q_p16(poly16x8_t vec); // VREV32.16 q0,q0
14537*80a68eefSBob Badour #define vrev32q_p16 vrev32q_u16
14538*80a68eefSBob Badour 
14539*80a68eefSBob Badour //*************  16 bit shuffles **********************
14540*80a68eefSBob Badour //******************************************************
14541*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vrev16_s8(int8x8_t vec); // VREV16.8 d0,d0
14542*80a68eefSBob Badour _NEON2SSE_INLINE int8x8_t vrev16_s8(int8x8_t vec)
14543*80a68eefSBob Badour {
14544*80a68eefSBob Badour     int8x8_t res64;
14545*80a68eefSBob Badour     __m128i res;
14546*80a68eefSBob Badour     res = vrev16q_s8(_pM128i(vec));
14547*80a68eefSBob Badour     return64(res);
14548*80a68eefSBob Badour }
14549*80a68eefSBob Badour 
14550*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vrev16_u8(uint8x8_t vec); // VREV16.8 d0,d0
14551*80a68eefSBob Badour #define vrev16_u8 vrev16_s8
14552*80a68eefSBob Badour 
14553*80a68eefSBob Badour _NEON2SSESTORAGE poly8x8_t vrev16_p8(poly8x8_t vec); // VREV16.8 d0,d0
14554*80a68eefSBob Badour #define vrev16_p8 vrev16_u8
14555*80a68eefSBob Badour 
14556*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t vrev16q_s8(int8x16_t vec); // VREV16.8 q0,q0
14557*80a68eefSBob Badour _NEON2SSE_INLINE int8x16_t vrev16q_s8(int8x16_t vec) // VREV16.8 q0,q0
14558*80a68eefSBob Badour {
14559*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 static const int8_t mask_rev8[16] = {1,0, 3,2, 5,4, 7,6, 9,8, 11, 10, 13, 12, 15, 14};
14560*80a68eefSBob Badour     return _mm_shuffle_epi8 (vec, *(__m128i*)  mask_rev8);
14561*80a68eefSBob Badour }
14562*80a68eefSBob Badour 
14563*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vrev16q_u8(uint8x16_t vec); // VREV16.8 q0,q0
14564*80a68eefSBob Badour #define vrev16q_u8 vrev16q_s8
14565*80a68eefSBob Badour 
14566*80a68eefSBob Badour _NEON2SSESTORAGE poly8x16_t vrev16q_p8(poly8x16_t vec); // VREV16.8 q0,q0
14567*80a68eefSBob Badour #define vrev16q_p8 vrev16q_u8
14568*80a68eefSBob Badour 
14569*80a68eefSBob Badour //*********************************************************************
14570*80a68eefSBob Badour //**************** Other single operand arithmetic *******************
14571*80a68eefSBob Badour //*********************************************************************
14572*80a68eefSBob Badour 
14573*80a68eefSBob Badour //*********** Absolute: Vd[i] = |Va[i]| **********************************
14574*80a68eefSBob Badour //************************************************************************
14575*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t   vabs_s8(int8x8_t a); // VABS.S8 d0,d0
14576*80a68eefSBob Badour _NEON2SSE_INLINE int8x8_t   vabs_s8(int8x8_t a)
14577*80a68eefSBob Badour {
14578*80a68eefSBob Badour     int8x8_t res64;
14579*80a68eefSBob Badour     __m128i res;
14580*80a68eefSBob Badour     res = _mm_abs_epi8(_pM128i(a));
14581*80a68eefSBob Badour     return64(res);
14582*80a68eefSBob Badour }
14583*80a68eefSBob Badour 
14584*80a68eefSBob Badour 
14585*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t   vabs_s16(int16x4_t a); // VABS.S16 d0,d0
14586*80a68eefSBob Badour _NEON2SSE_INLINE int16x4_t   vabs_s16(int16x4_t a)
14587*80a68eefSBob Badour {
14588*80a68eefSBob Badour     int16x4_t res64;
14589*80a68eefSBob Badour     __m128i res;
14590*80a68eefSBob Badour     res = _mm_abs_epi16(_pM128i(a));
14591*80a68eefSBob Badour     return64(res);
14592*80a68eefSBob Badour }
14593*80a68eefSBob Badour 
14594*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t   vabs_s32(int32x2_t a); // VABS.S32 d0,d0
14595*80a68eefSBob Badour _NEON2SSE_INLINE int32x2_t   vabs_s32(int32x2_t a)
14596*80a68eefSBob Badour {
14597*80a68eefSBob Badour     int32x2_t res64;
14598*80a68eefSBob Badour     __m128i res;
14599*80a68eefSBob Badour     res = _mm_abs_epi32(_pM128i(a));
14600*80a68eefSBob Badour     return64(res);
14601*80a68eefSBob Badour }
14602*80a68eefSBob Badour 
14603*80a68eefSBob Badour _NEON2SSESTORAGE float32x2_t vabs_f32(float32x2_t a); // VABS.F32 d0,d0
14604*80a68eefSBob Badour _NEON2SSE_INLINE float32x2_t vabs_f32(float32x2_t a) // VABS.F32 d0,d0
14605*80a68eefSBob Badour {
14606*80a68eefSBob Badour     float32x4_t res;
14607*80a68eefSBob Badour     __m64_128 res64;
14608*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 static const int32_t c7fffffff[4] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff};
14609*80a68eefSBob Badour     res = _mm_and_ps (_pM128(a), *(__m128*)c7fffffff); //use 64 low bits only
14610*80a68eefSBob Badour     _M64f(res64, res);
14611*80a68eefSBob Badour     return res64;
14612*80a68eefSBob Badour }
14613*80a68eefSBob Badour 
14614*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t   vabsq_s8(int8x16_t a); // VABS.S8 q0,q0
14615*80a68eefSBob Badour #define vabsq_s8 _mm_abs_epi8
14616*80a68eefSBob Badour 
14617*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t   vabsq_s16(int16x8_t a); // VABS.S16 q0,q0
14618*80a68eefSBob Badour #define vabsq_s16 _mm_abs_epi16
14619*80a68eefSBob Badour 
14620*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t   vabsq_s32(int32x4_t a); // VABS.S32 q0,q0
14621*80a68eefSBob Badour #define vabsq_s32 _mm_abs_epi32
14622*80a68eefSBob Badour 
14623*80a68eefSBob Badour _NEON2SSESTORAGE float32x4_t vabsq_f32(float32x4_t a); // VABS.F32 q0,q0
14624*80a68eefSBob Badour _NEON2SSE_INLINE float32x4_t vabsq_f32(float32x4_t a) // VABS.F32 q0,q0
14625*80a68eefSBob Badour {
14626*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 static const int32_t c7fffffff[4] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff};
14627*80a68eefSBob Badour     return _mm_and_ps (a, *(__m128*)c7fffffff);
14628*80a68eefSBob Badour }
14629*80a68eefSBob Badour 
14630*80a68eefSBob Badour #ifdef _NEON2SSE_64BIT
14631*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vabsq_s64(int64x2_t a); // VABS.S64 q0,q0
14632*80a68eefSBob Badour _NEON2SSE_INLINE int64x2_t vabsq_s64(int64x2_t a) // VABS.S64 q0,q0
14633*80a68eefSBob Badour {
14634*80a68eefSBob Badour     __m128i sign = _mm_srai_epi32 (_mm_shuffle_epi32 (a, 0xf5), 31);
14635*80a68eefSBob Badour     return _mm_sub_epi64 (_mm_xor_si128 (a, sign), sign);
14636*80a68eefSBob Badour }
14637*80a68eefSBob Badour 
14638*80a68eefSBob Badour _NEON2SSESTORAGE float64x2_t vabsq_f64(float64x2_t a); // VABS.F64 q0,q0
14639*80a68eefSBob Badour _NEON2SSE_INLINE float64x2_t vabsq_f64(float64x2_t a) // VABS.F64 q0,q0
14640*80a68eefSBob Badour {
14641*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 static const int64_t mask[2] = {0x7fffffffffffffffLL, 0x7fffffffffffffffLL};
14642*80a68eefSBob Badour     return _mm_and_pd (a, *(__m128d*)mask);
14643*80a68eefSBob Badour }
14644*80a68eefSBob Badour #endif
14645*80a68eefSBob Badour 
14646*80a68eefSBob Badour //****** Saturating absolute: Vd[i] = sat(|Va[i]|) *********************
14647*80a68eefSBob Badour //**********************************************************************
14648*80a68eefSBob Badour //For signed-integer data types, the absolute value of the most negative value is not representable by the data type, so saturation takes place
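//Illustrative sketch (not part of the original header):
//    vqabsq_s8(vdupq_n_s8(-128)) yields +127 in every lane, while plain vabsq_s8 would return -128 again.
//The implementations below detect the 0x80.. pattern left by _mm_abs_* and xor it into the saturated maximum.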
14649*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vqabs_s8(int8x8_t a); // VQABS.S8 d0,d0
14650*80a68eefSBob Badour _NEON2SSE_INLINE int8x8_t vqabs_s8(int8x8_t a)
14651*80a68eefSBob Badour {
14652*80a68eefSBob Badour     int8x8_t res64;
14653*80a68eefSBob Badour     __m128i res;
14654*80a68eefSBob Badour     res = vqabsq_s8(_pM128i(a));
14655*80a68eefSBob Badour     return64(res);
14656*80a68eefSBob Badour }
14657*80a68eefSBob Badour 
14658*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vqabs_s16(int16x4_t a); // VQABS.S16 d0,d0
14659*80a68eefSBob Badour _NEON2SSE_INLINE int16x4_t vqabs_s16(int16x4_t a)
14660*80a68eefSBob Badour {
14661*80a68eefSBob Badour     int16x4_t res64;
14662*80a68eefSBob Badour     __m128i res;
14663*80a68eefSBob Badour     res = vqabsq_s16(_pM128i(a));
14664*80a68eefSBob Badour     return64(res);
14665*80a68eefSBob Badour }
14666*80a68eefSBob Badour 
14667*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vqabs_s32(int32x2_t a); // VQABS.S32 d0,d0
14668*80a68eefSBob Badour _NEON2SSE_INLINE int32x2_t vqabs_s32(int32x2_t a)
14669*80a68eefSBob Badour {
14670*80a68eefSBob Badour     int32x2_t res64;
14671*80a68eefSBob Badour     __m128i res;
14672*80a68eefSBob Badour     res = vqabsq_s32(_pM128i(a));
14673*80a68eefSBob Badour     return64(res);
14674*80a68eefSBob Badour }
14675*80a68eefSBob Badour 
14676*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t vqabsq_s8(int8x16_t a); // VQABS.S8 q0,q0
14677*80a68eefSBob Badour _NEON2SSE_INLINE int8x16_t vqabsq_s8(int8x16_t a) // VQABS.S8 q0,q0
14678*80a68eefSBob Badour {
14679*80a68eefSBob Badour     __m128i c_128, abs, abs_cmp;
14680*80a68eefSBob Badour     c_128 = _mm_set1_epi8 ((int8_t)0x80); //-128
14681*80a68eefSBob Badour     abs = _mm_abs_epi8 (a);
14682*80a68eefSBob Badour     abs_cmp = _mm_cmpeq_epi8 (abs, c_128);
14683*80a68eefSBob Badour     return _mm_xor_si128 (abs,  abs_cmp);
14684*80a68eefSBob Badour }
14685*80a68eefSBob Badour 
14686*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vqabsq_s16(int16x8_t a); // VQABS.S16 q0,q0
14687*80a68eefSBob Badour _NEON2SSE_INLINE int16x8_t vqabsq_s16(int16x8_t a) // VQABS.S16 q0,q0
14688*80a68eefSBob Badour {
14689*80a68eefSBob Badour     __m128i c_32768, abs, abs_cmp;
14690*80a68eefSBob Badour     c_32768 = _mm_set1_epi16 ((int16_t)0x8000); //-32768
14691*80a68eefSBob Badour     abs = _mm_abs_epi16 (a);
14692*80a68eefSBob Badour     abs_cmp = _mm_cmpeq_epi16 (abs, c_32768);
14693*80a68eefSBob Badour     return _mm_xor_si128 (abs,  abs_cmp);
14694*80a68eefSBob Badour }
14695*80a68eefSBob Badour 
14696*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vqabsq_s32(int32x4_t a); // VQABS.S32 q0,q0
14697*80a68eefSBob Badour _NEON2SSE_INLINE int32x4_t vqabsq_s32(int32x4_t a) // VQABS.S32 q0,q0
14698*80a68eefSBob Badour {
14699*80a68eefSBob Badour     __m128i c80000000, abs, abs_cmp;
14700*80a68eefSBob Badour     c80000000 = _mm_set1_epi32 (0x80000000); //most negative value
14701*80a68eefSBob Badour     abs = _mm_abs_epi32 (a);
14702*80a68eefSBob Badour     abs_cmp = _mm_cmpeq_epi32 (abs, c80000000);
14703*80a68eefSBob Badour     return _mm_xor_si128 (abs,  abs_cmp);
14704*80a68eefSBob Badour }
14705*80a68eefSBob Badour 
14706*80a68eefSBob Badour //*************** Negate: Vd[i] = - Va[i] *************************************
14707*80a68eefSBob Badour //*****************************************************************************
14708*80a68eefSBob Badour //several Negate implementations are possible for SIMD,
14709*80a68eefSBob Badour //e.g. the _mm_sign_epi8/16/32 (a, vector of negative values) functions, but the following one gives good performance:
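//Illustrative sketch (not part of the original header): integer negation below is a zero - a subtraction,
//while float negation just flips the sign bit, e.g.
//    int32x4_t   ni = vnegq_s32(v);   //_mm_sub_epi32(zero, v)
//    float32x4_t nf = vnegq_f32(x);   //_mm_xor_ps(x, sign-bit mask)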
14710*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vneg_s8(int8x8_t a); // VNE//d0,d0
14711*80a68eefSBob Badour _NEON2SSE_INLINE int8x8_t vneg_s8(int8x8_t a)
14712*80a68eefSBob Badour {
14713*80a68eefSBob Badour     int8x8_t res64;
14714*80a68eefSBob Badour     __m128i res;
14715*80a68eefSBob Badour     res = vnegq_s8(_pM128i(a));
14716*80a68eefSBob Badour     return64(res);
14717*80a68eefSBob Badour }
14718*80a68eefSBob Badour 
14719*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vneg_s16(int16x4_t a); // VNE//d0,d0
14720*80a68eefSBob Badour _NEON2SSE_INLINE int16x4_t vneg_s16(int16x4_t a)
14721*80a68eefSBob Badour {
14722*80a68eefSBob Badour     int16x4_t res64;
14723*80a68eefSBob Badour     __m128i res;
14724*80a68eefSBob Badour     res = vnegq_s16(_pM128i(a));
14725*80a68eefSBob Badour     return64(res);
14726*80a68eefSBob Badour }
14727*80a68eefSBob Badour 
14728*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vneg_s32(int32x2_t a); // VNE//d0,d0
14729*80a68eefSBob Badour _NEON2SSE_INLINE int32x2_t vneg_s32(int32x2_t a)
14730*80a68eefSBob Badour {
14731*80a68eefSBob Badour     int32x2_t res64;
14732*80a68eefSBob Badour     __m128i res;
14733*80a68eefSBob Badour     res = vnegq_s32(_pM128i(a));
14734*80a68eefSBob Badour     return64(res);
14735*80a68eefSBob Badour }
14736*80a68eefSBob Badour 
14737*80a68eefSBob Badour _NEON2SSESTORAGE float32x2_t vneg_f32(float32x2_t a); // VNE//d0,d0
14738*80a68eefSBob Badour _NEON2SSE_INLINE float32x2_t vneg_f32(float32x2_t a) // VNE//d0,d0
14739*80a68eefSBob Badour {
14740*80a68eefSBob Badour     float32x4_t res;
14741*80a68eefSBob Badour     __m64_128 res64;
14742*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 static const uint32_t c80000000[4] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
14743*80a68eefSBob Badour     res = _mm_xor_ps (_pM128(a), *(__m128*) c80000000); //use low 64 bits
14744*80a68eefSBob Badour     _M64f(res64, res);
14745*80a68eefSBob Badour     return res64;
14746*80a68eefSBob Badour }
14747*80a68eefSBob Badour 
14748*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t vnegq_s8(int8x16_t a); // VNE//q0,q0
14749*80a68eefSBob Badour _NEON2SSE_INLINE int8x16_t vnegq_s8(int8x16_t a) // VNE//q0,q0
14750*80a68eefSBob Badour {
14751*80a68eefSBob Badour     __m128i zero;
14752*80a68eefSBob Badour     zero = _mm_setzero_si128 ();
14753*80a68eefSBob Badour     return _mm_sub_epi8 (zero, a);
14754*80a68eefSBob Badour } //or _mm_sign_epi8 (a, negative numbers vector)
14755*80a68eefSBob Badour 
14756*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vnegq_s16(int16x8_t a); // VNE//q0,q0
14757*80a68eefSBob Badour _NEON2SSE_INLINE int16x8_t vnegq_s16(int16x8_t a) // VNE//q0,q0
14758*80a68eefSBob Badour {
14759*80a68eefSBob Badour     __m128i zero;
14760*80a68eefSBob Badour     zero = _mm_setzero_si128 ();
14761*80a68eefSBob Badour     return _mm_sub_epi16 (zero, a);
14762*80a68eefSBob Badour } //or _mm_sign_epi16 (a, negative numbers vector)
14763*80a68eefSBob Badour 
14764*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vnegq_s32(int32x4_t a); // VNE//q0,q0
14765*80a68eefSBob Badour _NEON2SSE_INLINE int32x4_t vnegq_s32(int32x4_t a) // VNE//q0,q0
14766*80a68eefSBob Badour {
14767*80a68eefSBob Badour     __m128i zero;
14768*80a68eefSBob Badour     zero = _mm_setzero_si128 ();
14769*80a68eefSBob Badour     return _mm_sub_epi32 (zero, a);
14770*80a68eefSBob Badour } //or _mm_sign_epi32 (a, negative numbers vector)
14771*80a68eefSBob Badour 
14772*80a68eefSBob Badour _NEON2SSESTORAGE float32x4_t vnegq_f32(float32x4_t a); // VNE//q0,q0
14773*80a68eefSBob Badour _NEON2SSE_INLINE float32x4_t vnegq_f32(float32x4_t a) // VNE//q0,q0
14774*80a68eefSBob Badour {
14775*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 static const uint32_t c80000000[4] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
14776*80a68eefSBob Badour     return _mm_xor_ps (a, *(__m128*) c80000000);
14777*80a68eefSBob Badour }
14778*80a68eefSBob Badour 
14779*80a68eefSBob Badour //************** Saturating Negate: sat(Vd[i] = - Va[i]) **************************
14780*80a68eefSBob Badour //***************************************************************************************
14781*80a68eefSBob Badour //For signed-integer data types, the negation of the most negative value cannot be produced without saturation; with saturation the result is the maximum positive value
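//Illustrative sketch (not part of the original header):
//    vqnegq_s16(vdupq_n_s16(-32768)) yields +32767 in every lane, while plain vnegq_s16 would return -32768 again.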
14782*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vqneg_s8(int8x8_t a); // VQNE//d0,d0
14783*80a68eefSBob Badour _NEON2SSE_INLINE int8x8_t vqneg_s8(int8x8_t a)
14784*80a68eefSBob Badour {
14785*80a68eefSBob Badour     int8x8_t res64;
14786*80a68eefSBob Badour     __m128i res;
14787*80a68eefSBob Badour     res = vqnegq_s8(_pM128i(a));
14788*80a68eefSBob Badour     return64(res);
14789*80a68eefSBob Badour }
14790*80a68eefSBob Badour 
14791*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vqneg_s16(int16x4_t a); // VQNE//d0,d0
14792*80a68eefSBob Badour _NEON2SSE_INLINE int16x4_t vqneg_s16(int16x4_t a)
14793*80a68eefSBob Badour {
14794*80a68eefSBob Badour     int16x4_t res64;
14795*80a68eefSBob Badour     __m128i res;
14796*80a68eefSBob Badour     res = vqnegq_s16(_pM128i(a));
14797*80a68eefSBob Badour     return64(res);
14798*80a68eefSBob Badour }
14799*80a68eefSBob Badour 
14800*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vqneg_s32(int32x2_t a); // VQNE//d0,d0
14801*80a68eefSBob Badour _NEON2SSE_INLINE int32x2_t vqneg_s32(int32x2_t a)
14802*80a68eefSBob Badour {
14803*80a68eefSBob Badour     int32x2_t res64;
14804*80a68eefSBob Badour     __m128i res;
14805*80a68eefSBob Badour     res = vqnegq_s32(_pM128i(a));
14806*80a68eefSBob Badour     return64(res);
14807*80a68eefSBob Badour }
14808*80a68eefSBob Badour 
14809*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t vqnegq_s8(int8x16_t a); // VQNE//q0,q0
14810*80a68eefSBob Badour _NEON2SSE_INLINE int8x16_t vqnegq_s8(int8x16_t a) // VQNE//q0,q0
14811*80a68eefSBob Badour {
14812*80a68eefSBob Badour     __m128i zero;
14813*80a68eefSBob Badour     zero = _mm_setzero_si128 ();
14814*80a68eefSBob Badour     return _mm_subs_epi8 (zero, a); //saturating subtraction
14815*80a68eefSBob Badour }
14816*80a68eefSBob Badour 
14817*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vqnegq_s16(int16x8_t a); // VQNE//q0,q0
14818*80a68eefSBob Badour _NEON2SSE_INLINE int16x8_t vqnegq_s16(int16x8_t a) // VQNE//q0,q0
14819*80a68eefSBob Badour {
14820*80a68eefSBob Badour     __m128i zero;
14821*80a68eefSBob Badour     zero = _mm_setzero_si128 ();
14822*80a68eefSBob Badour     return _mm_subs_epi16 (zero, a); //saturating subtraction
14823*80a68eefSBob Badour }
14824*80a68eefSBob Badour 
14825*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vqnegq_s32(int32x4_t a); // VQNE//q0,q0
14826*80a68eefSBob Badour _NEON2SSE_INLINE int32x4_t vqnegq_s32(int32x4_t a) // VQNE//q0,q0
14827*80a68eefSBob Badour {
14828*80a68eefSBob Badour     //this solution may not be optimal compared with a serial one (see the sketch after this function)
14829*80a68eefSBob Badour     __m128i c80000000, zero, sub, cmp;
14830*80a68eefSBob Badour     c80000000 = _mm_set1_epi32 (0x80000000); //most negative value
14831*80a68eefSBob Badour     zero = _mm_setzero_si128 ();
14832*80a68eefSBob Badour     sub =  _mm_sub_epi32 (zero, a); //subtraction
14833*80a68eefSBob Badour     cmp = _mm_cmpeq_epi32 (a, c80000000);
14834*80a68eefSBob Badour     return _mm_xor_si128 (sub,  cmp);
14835*80a68eefSBob Badour }
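//For reference, the serial per-lane version mentioned above might look like this sketch (illustrative only,
//helper name assumed); the SSE code gets the same result by XOR-ing (0 - a[i]) with an all-ones mask
//exactly in the lanes where a[i] == 0x80000000:
#if 0
#include <stdint.h>
static void serial_qneg_s32x4(const int32_t a[4], int32_t res[4])
{
    int i;
    for (i = 0; i < 4; i++)
        res[i] = (a[i] == INT32_MIN) ? INT32_MAX : -a[i];
}
#endif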
14836*80a68eefSBob Badour 
14837*80a68eefSBob Badour //****************** Count leading zeros ********************************
14838*80a68eefSBob Badour //**************************************************************************
14839*80a68eefSBob Badour //no corresponding vector intrinsic in IA32, so it has to be implemented here.  While the implementation is effective for 8 bits, it may not be for 16 and 32 bits (a scalar per-lane reference sketch follows)
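//Scalar per-lane reference for the 8-bit case (illustrative sketch, not part of the original header;
//helper name assumed):
#if 0
#include <stdint.h>
static int scalar_clz_u8(uint8_t x)
{
    int n = 0;
    if (x == 0) return 8;                                   //all 8 bits are leading zeros
    while ((x & 0x80) == 0) { n++; x = (uint8_t)(x << 1); }
    return n;
}
//e.g. scalar_clz_u8(0x01) == 7, scalar_clz_u8(0x80) == 0
#endif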
14840*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vclz_s8(int8x8_t a); // VCLZ.I8 d0,d0
14841*80a68eefSBob Badour _NEON2SSE_INLINE int8x8_t vclz_s8(int8x8_t a)
14842*80a68eefSBob Badour {
14843*80a68eefSBob Badour     int8x8_t res64;
14844*80a68eefSBob Badour     __m128i res;
14845*80a68eefSBob Badour     res = vclzq_s8(_pM128i(a));
14846*80a68eefSBob Badour     return64(res);
14847*80a68eefSBob Badour }
14848*80a68eefSBob Badour 
14849*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vclz_s16(int16x4_t a); // VCLZ.I16 d0,d0
14850*80a68eefSBob Badour _NEON2SSE_INLINE int16x4_t vclz_s16(int16x4_t a)
14851*80a68eefSBob Badour {
14852*80a68eefSBob Badour     int16x4_t res64;
14853*80a68eefSBob Badour     __m128i res;
14854*80a68eefSBob Badour     res = vclzq_s16(_pM128i(a));
14855*80a68eefSBob Badour     return64(res);
14856*80a68eefSBob Badour }
14857*80a68eefSBob Badour 
14858*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vclz_s32(int32x2_t a); // VCLZ.I32 d0,d0
14859*80a68eefSBob Badour _NEON2SSE_INLINE int32x2_t vclz_s32(int32x2_t a)
14860*80a68eefSBob Badour {
14861*80a68eefSBob Badour     int32x2_t res64;
14862*80a68eefSBob Badour     __m128i res;
14863*80a68eefSBob Badour     res = vclzq_s32(_pM128i(a));
14864*80a68eefSBob Badour     return64(res);
14865*80a68eefSBob Badour }
14866*80a68eefSBob Badour 
14867*80a68eefSBob Badour 
14868*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vclz_u8(uint8x8_t a); // VCLZ.I8 d0,d0
14869*80a68eefSBob Badour #define vclz_u8 vclz_s8
14870*80a68eefSBob Badour 
14871*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vclz_u16(uint16x4_t a); // VCLZ.I16 d0,d0
14872*80a68eefSBob Badour #define vclz_u16 vclz_s16
14873*80a68eefSBob Badour 
14874*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vclz_u32(uint32x2_t a); // VCLZ.I32 d0,d0
14875*80a68eefSBob Badour #define vclz_u32 vclz_s32
14876*80a68eefSBob Badour 
14877*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t vclzq_s8(int8x16_t a); // VCLZ.I8 q0,q0
14878*80a68eefSBob Badour _NEON2SSE_INLINE int8x16_t vclzq_s8(int8x16_t a)
14879*80a68eefSBob Badour {
14880*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 static const int8_t mask_CLZ[16] = { /* 0 */ 4,/* 1 */ 3,/* 2 */ 2,/* 3 */ 2,
14881*80a68eefSBob Badour                                                             /* 4 */ 1,/* 5 */ 1,/* 6 */ 1,/* 7 */ 1,
14882*80a68eefSBob Badour                                                             /* 8 */ 0,/* 9 */ 0,/* a */ 0,/* b */ 0,
14883*80a68eefSBob Badour                                                             /* c */ 0,/* d */ 0,/* e */ 0,/* f */ 0                          };
14884*80a68eefSBob Badour     __m128i maskLOW, c4, lowclz, mask, hiclz;
14885*80a68eefSBob Badour     maskLOW = _mm_set1_epi8(0x0f); //low 4 bits, don't need masking low to avoid zero if MSB is set - it happens automatically
14886*80a68eefSBob Badour     c4 = _mm_set1_epi8(4);
14887*80a68eefSBob Badour     lowclz = _mm_shuffle_epi8( *(__m128i*)mask_CLZ, a); //uses low 4 bits anyway
14888*80a68eefSBob Badour     mask =  _mm_srli_epi16(a, 4); //get high 4 bits as low bits
14889*80a68eefSBob Badour     mask = _mm_and_si128(mask, maskLOW); //low 4 bits, need masking to avoid zero if MSB is set
14890*80a68eefSBob Badour     hiclz = _mm_shuffle_epi8( *(__m128i*) mask_CLZ, mask); //uses low 4 bits anyway
14891*80a68eefSBob Badour     mask = _mm_cmpeq_epi8(hiclz, c4); // shows the need to add lowclz zeros
14892*80a68eefSBob Badour     lowclz = _mm_and_si128(lowclz,mask);
14893*80a68eefSBob Badour     return _mm_add_epi8(lowclz, hiclz);
14894*80a68eefSBob Badour }
14895*80a68eefSBob Badour 
14896*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vclzq_s16(int16x8_t a); // VCLZ.I16 q0,q0
14897*80a68eefSBob Badour _NEON2SSE_INLINE int16x8_t vclzq_s16(int16x8_t a)
14898*80a68eefSBob Badour {
14899*80a68eefSBob Badour     __m128i c7, res8x16, res8x16_swap;
14900*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 static const int8_t mask8_sab[16] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
14901*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 static const uint16_t mask8bit[8] = {0x00ff, 0x00ff, 0x00ff, 0x00ff,0x00ff, 0x00ff, 0x00ff, 0x00ff};
14902*80a68eefSBob Badour     c7 = _mm_srli_epi16(*(__m128i*)mask8bit, 5); //7
14903*80a68eefSBob Badour     res8x16 = vclzq_s8(a);
14904*80a68eefSBob Badour     res8x16_swap = _mm_shuffle_epi8 (res8x16, *(__m128i*) mask8_sab); //horizontal pairs swap
14905*80a68eefSBob Badour     res8x16 = _mm_and_si128(res8x16, *(__m128i*)mask8bit); //lowclz
14906*80a68eefSBob Badour     res8x16_swap = _mm_and_si128(res8x16_swap, *(__m128i*)mask8bit); //hiclz
14907*80a68eefSBob Badour     c7 = _mm_cmpgt_epi16(res8x16_swap, c7); // shows the need to add lowclz zeros
14908*80a68eefSBob Badour     res8x16 = _mm_and_si128(res8x16, c7); //lowclz
14909*80a68eefSBob Badour     return _mm_add_epi16(res8x16_swap, res8x16);
14910*80a68eefSBob Badour }
14911*80a68eefSBob Badour 
14912*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vclzq_s32(int32x4_t a); // VCLZ.I32 q0,q0
14913*80a68eefSBob Badour _NEON2SSE_INLINE int32x4_t vclzq_s32(int32x4_t a)
14914*80a68eefSBob Badour {
14915*80a68eefSBob Badour     __m128i c55555555, c33333333, c0f0f0f0f, c3f, c32, tmp, tmp1, res;
14916*80a68eefSBob Badour     c55555555 = _mm_set1_epi32(0x55555555);
14917*80a68eefSBob Badour     c33333333 = _mm_set1_epi32(0x33333333);
14918*80a68eefSBob Badour     c0f0f0f0f = _mm_set1_epi32(0x0f0f0f0f);
14919*80a68eefSBob Badour     c3f = _mm_set1_epi32(0x3f);
14920*80a68eefSBob Badour     c32 = _mm_set1_epi32(32);
14921*80a68eefSBob Badour     tmp = _mm_srli_epi32(a, 1);
14922*80a68eefSBob Badour     res = _mm_or_si128(tmp, a); //atmp[i] |= (atmp[i] >> 1);
14923*80a68eefSBob Badour     tmp = _mm_srli_epi32(res, 2);
14924*80a68eefSBob Badour     res = _mm_or_si128(tmp, res); //atmp[i] |= (atmp[i] >> 2);
14925*80a68eefSBob Badour     tmp = _mm_srli_epi32(res, 4);
14926*80a68eefSBob Badour     res = _mm_or_si128(tmp, res); //atmp[i] |= (atmp[i] >> 4);
14927*80a68eefSBob Badour     tmp = _mm_srli_epi32(res, 8);
14928*80a68eefSBob Badour     res = _mm_or_si128(tmp, res); //atmp[i] |= (atmp[i] >> 8);
14929*80a68eefSBob Badour     tmp = _mm_srli_epi32(res, 16);
14930*80a68eefSBob Badour     res = _mm_or_si128(tmp, res); //atmp[i] |= (atmp[i] >> 16);
14931*80a68eefSBob Badour 
14932*80a68eefSBob Badour     tmp = _mm_srli_epi32(res, 1);
14933*80a68eefSBob Badour     tmp = _mm_and_si128(tmp, c55555555);
14934*80a68eefSBob Badour     res = _mm_sub_epi32(res, tmp); //atmp[i] -= ((atmp[i] >> 1) & 0x55555555);
14935*80a68eefSBob Badour 
14936*80a68eefSBob Badour     tmp = _mm_srli_epi32(res, 2);
14937*80a68eefSBob Badour     tmp = _mm_and_si128(tmp, c33333333);
14938*80a68eefSBob Badour     tmp1 = _mm_and_si128(res, c33333333);
14939*80a68eefSBob Badour     res = _mm_add_epi32(tmp, tmp1); //atmp[i] = (((atmp[i] >> 2) & 0x33333333) + (atmp[i] & 0x33333333));
14940*80a68eefSBob Badour 
14941*80a68eefSBob Badour     tmp = _mm_srli_epi32(res, 4);
14942*80a68eefSBob Badour     tmp = _mm_add_epi32(tmp, res);
14943*80a68eefSBob Badour     res = _mm_and_si128(tmp, c0f0f0f0f); //atmp[i] = (((atmp[i] >> 4) + atmp[i]) & 0x0f0f0f0f);
14944*80a68eefSBob Badour 
14945*80a68eefSBob Badour     tmp = _mm_srli_epi32(res, 8);
14946*80a68eefSBob Badour     res = _mm_add_epi32(tmp, res); //atmp[i] += (atmp[i] >> 8);
14947*80a68eefSBob Badour 
14948*80a68eefSBob Badour     tmp = _mm_srli_epi32(res, 16);
14949*80a68eefSBob Badour     res = _mm_add_epi32(tmp, res); //atmp[i] += (atmp[i] >> 16);
14950*80a68eefSBob Badour 
14951*80a68eefSBob Badour     res = _mm_and_si128(res, c3f); //atmp[i] = atmp[i] & 0x0000003f;
14952*80a68eefSBob Badour 
14953*80a68eefSBob Badour     return _mm_sub_epi32(c32, res); //res[i] = 32 - atmp[i];
14954*80a68eefSBob Badour }
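//The same algorithm written serially for a single 32-bit lane (illustrative sketch, helper name assumed);
//it smears the highest set bit downwards, counts the set bits of the smeared value, and subtracts from 32:
#if 0
#include <stdint.h>
static int scalar_clz_u32(uint32_t x)
{
    x |= (x >> 1);  x |= (x >> 2);  x |= (x >> 4);
    x |= (x >> 8);  x |= (x >> 16);                       //smear the highest set bit downwards
    x -= ((x >> 1) & 0x55555555);                         //popcount of the smeared value
    x  = ((x >> 2) & 0x33333333) + (x & 0x33333333);
    x  = ((x >> 4) + x) & 0x0f0f0f0f;
    x += (x >> 8);
    x += (x >> 16);
    x &= 0x3f;
    return 32 - (int)x;
}
#endif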
14955*80a68eefSBob Badour 
14956*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vclzq_u8(uint8x16_t a); // VCLZ.I8 q0,q0
14957*80a68eefSBob Badour #define vclzq_u8 vclzq_s8
14958*80a68eefSBob Badour 
14959*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vclzq_u16(uint16x8_t a); // VCLZ.I16 q0,q0
14960*80a68eefSBob Badour #define vclzq_u16 vclzq_s16
14961*80a68eefSBob Badour 
14962*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vclzq_u32(uint32x4_t a); // VCLZ.I32 q0,q0
14963*80a68eefSBob Badour #define vclzq_u32 vclzq_s32
14964*80a68eefSBob Badour 
14965*80a68eefSBob Badour //************** Count leading sign bits **************************
14966*80a68eefSBob Badour //********************************************************************
14967*80a68eefSBob Badour //VCLS (Vector Count Leading Sign bits) counts the number of consecutive bits following
14968*80a68eefSBob Badour // the topmost bit that are the same as the topmost bit, in each element of a vector
14969*80a68eefSBob Badour //No corresponding vector intrinsics in IA32, need to implement it.
14970*80a68eefSBob Badour //While the implementation is effective for 8 bits, it may not be for 16 and 32 bits (a scalar per-lane reference sketch follows)
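//Scalar per-lane reference for the definition above, 8-bit case (illustrative sketch, helper name assumed):
#if 0
#include <stdint.h>
static int scalar_cls_s8(int8_t x)
{
    uint8_t v = (x < 0) ? (uint8_t)(~x) : (uint8_t)x;     //fold negative values so the bits to count become leading zeros
    int n = 0;
    while ((n < 8) && ((v & 0x80) == 0)) { n++; v = (uint8_t)(v << 1); }
    return n - 1;                                         //the sign bit itself is not counted
}
//e.g. scalar_cls_s8(1) == 6, scalar_cls_s8(-1) == 7, scalar_cls_s8(-128) == 0
#endif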
14971*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vcls_s8(int8x8_t a); // VCLS.S8 d0,d0
14972*80a68eefSBob Badour _NEON2SSE_INLINE int8x8_t vcls_s8(int8x8_t a)
14973*80a68eefSBob Badour {
14974*80a68eefSBob Badour     int8x8_t res64;
14975*80a68eefSBob Badour     __m128i res;
14976*80a68eefSBob Badour     res = vclsq_s8(_pM128i(a));
14977*80a68eefSBob Badour     return64(res);
14978*80a68eefSBob Badour }
14979*80a68eefSBob Badour 
14980*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vcls_s16(int16x4_t a); // VCLS.S16 d0,d0
14981*80a68eefSBob Badour _NEON2SSE_INLINE int16x4_t vcls_s16(int16x4_t a)
14982*80a68eefSBob Badour {
14983*80a68eefSBob Badour     int16x4_t res64;
14984*80a68eefSBob Badour     __m128i res;
14985*80a68eefSBob Badour     res = vclsq_s16(_pM128i(a));
14986*80a68eefSBob Badour     return64(res);
14987*80a68eefSBob Badour }
14988*80a68eefSBob Badour 
14989*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vcls_s32(int32x2_t a); // VCLS.S32 d0,d0
14990*80a68eefSBob Badour _NEON2SSE_INLINE int32x2_t vcls_s32(int32x2_t a)
14991*80a68eefSBob Badour {
14992*80a68eefSBob Badour     int32x2_t res64;
14993*80a68eefSBob Badour     __m128i res;
14994*80a68eefSBob Badour     res = vclsq_s32(_pM128i(a));
14995*80a68eefSBob Badour     return64(res);
14996*80a68eefSBob Badour }
14997*80a68eefSBob Badour 
14998*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t vclsq_s8(int8x16_t a); // VCLS.S8 q0,q0
14999*80a68eefSBob Badour _NEON2SSE_INLINE int8x16_t vclsq_s8(int8x16_t a)
15000*80a68eefSBob Badour {
15001*80a68eefSBob Badour     __m128i cff, c80, c1, a_mask, a_neg, a_pos, a_comb;
15002*80a68eefSBob Badour     cff = _mm_cmpeq_epi8 (a,a); //0xff
15003*80a68eefSBob Badour     c80 = _mm_set1_epi8((int8_t)0x80);
15004*80a68eefSBob Badour     c1 = _mm_set1_epi8(1);
15005*80a68eefSBob Badour     a_mask = _mm_and_si128(a, c80);
15006*80a68eefSBob Badour     a_mask = _mm_cmpeq_epi8(a_mask, c80); //0xff if negative input and 0 if positive
15007*80a68eefSBob Badour     a_neg = _mm_xor_si128(a, cff);
15008*80a68eefSBob Badour     a_neg = _mm_and_si128(a_mask, a_neg);
15009*80a68eefSBob Badour     a_pos = _mm_andnot_si128(a_mask, a);
15010*80a68eefSBob Badour     a_comb = _mm_or_si128(a_pos, a_neg);
15011*80a68eefSBob Badour     a_comb = vclzq_s8(a_comb);
15012*80a68eefSBob Badour     return _mm_sub_epi8(a_comb, c1);
15013*80a68eefSBob Badour }
15014*80a68eefSBob Badour 
15015*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vclsq_s16(int16x8_t a); // VCLS.S16 q0,q0
15016*80a68eefSBob Badour _NEON2SSE_INLINE int16x8_t vclsq_s16(int16x8_t a)
15017*80a68eefSBob Badour {
15018*80a68eefSBob Badour     __m128i cffff, c8000, c1, a_mask, a_neg, a_pos, a_comb;
15019*80a68eefSBob Badour     cffff = _mm_cmpeq_epi16(a,a);
15020*80a68eefSBob Badour     c8000 =  _mm_slli_epi16(cffff, 15); //0x8000
15021*80a68eefSBob Badour     c1 = _mm_srli_epi16(cffff,15); //0x1
15022*80a68eefSBob Badour     a_mask = _mm_and_si128(a, c8000);
15023*80a68eefSBob Badour     a_mask = _mm_cmpeq_epi16(a_mask, c8000); //0xffff if negative input and 0 if positive
15024*80a68eefSBob Badour     a_neg = _mm_xor_si128(a, cffff);
15025*80a68eefSBob Badour     a_neg = _mm_and_si128(a_mask, a_neg);
15026*80a68eefSBob Badour     a_pos = _mm_andnot_si128(a_mask, a);
15027*80a68eefSBob Badour     a_comb = _mm_or_si128(a_pos, a_neg);
15028*80a68eefSBob Badour     a_comb = vclzq_s16(a_comb);
15029*80a68eefSBob Badour     return _mm_sub_epi16(a_comb, c1);
15030*80a68eefSBob Badour }
15031*80a68eefSBob Badour 
15032*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vclsq_s32(int32x4_t a); // VCLS.S32 q0,q0
15033*80a68eefSBob Badour _NEON2SSE_INLINE int32x4_t vclsq_s32(int32x4_t a)
15034*80a68eefSBob Badour {
15035*80a68eefSBob Badour     __m128i cffffffff, c80000000, c1, a_mask, a_neg, a_pos, a_comb;
15036*80a68eefSBob Badour     cffffffff = _mm_cmpeq_epi32(a,a);
15037*80a68eefSBob Badour     c80000000 =  _mm_slli_epi32(cffffffff, 31); //0x80000000
15038*80a68eefSBob Badour     c1 = _mm_srli_epi32(cffffffff,31); //0x1
15039*80a68eefSBob Badour     a_mask = _mm_and_si128(a, c80000000);
15040*80a68eefSBob Badour     a_mask = _mm_cmpeq_epi32(a_mask, c80000000); //0xffffffff if negative input and 0 if positive
15041*80a68eefSBob Badour     a_neg = _mm_xor_si128(a, cffffffff);
15042*80a68eefSBob Badour     a_neg = _mm_and_si128(a_mask, a_neg);
15043*80a68eefSBob Badour     a_pos = _mm_andnot_si128(a_mask, a);
15044*80a68eefSBob Badour     a_comb = _mm_or_si128(a_pos, a_neg);
15045*80a68eefSBob Badour     a_comb = vclzq_s32(a_comb);
15046*80a68eefSBob Badour     return _mm_sub_epi32(a_comb, c1);
15047*80a68eefSBob Badour }
15048*80a68eefSBob Badour 
15049*80a68eefSBob Badour //************************* Count number of set bits   ********************************
15050*80a68eefSBob Badour //*************************************************************************************
15051*80a68eefSBob Badour //No corresponding SIMD solution. One option is to extract each element, zero-extend it to 32 bits and then use the SSE4.2 _mm_popcnt_u32 (unsigned int v) intrinsic per element (see the illustrative sketch below);
15052*80a68eefSBob Badour //another option is to do the following algorithm:
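//Illustrative sketch of the first option (per-element POPCNT, requires SSE4.2); not part of the original
//header, helper name assumed:
#if 0
#include <stdint.h>
#include <nmmintrin.h>
static void popcnt8x16_serial(const uint8_t a[16], uint8_t res[16])
{
    int i;
    for (i = 0; i < 16; i++)
        res[i] = (uint8_t)_mm_popcnt_u32((unsigned int)a[i]);
}
#endif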
15053*80a68eefSBob Badour 
15054*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vcnt_u8(uint8x8_t a); // VCNT.8 d0,d0
15055*80a68eefSBob Badour _NEON2SSE_INLINE uint8x8_t vcnt_u8(uint8x8_t a)
15056*80a68eefSBob Badour {
15057*80a68eefSBob Badour     uint8x8_t res64;
15058*80a68eefSBob Badour     __m128i res;
15059*80a68eefSBob Badour     res = vcntq_u8(_pM128i(a));
15060*80a68eefSBob Badour     return64(res);
15061*80a68eefSBob Badour }
15062*80a68eefSBob Badour 
15063*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vcnt_s8(int8x8_t a); // VCNT.8 d0,d0
15064*80a68eefSBob Badour #define vcnt_s8 vcnt_u8
15065*80a68eefSBob Badour 
15066*80a68eefSBob Badour _NEON2SSESTORAGE poly8x8_t vcnt_p8(poly8x8_t a); // VCNT.8 d0,d0
15067*80a68eefSBob Badour #define vcnt_p8 vcnt_u8
15068*80a68eefSBob Badour 
15069*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vcntq_u8(uint8x16_t a); // VCNT.8 q0,q0
15070*80a68eefSBob Badour _NEON2SSE_INLINE uint8x16_t vcntq_u8(uint8x16_t a)
15071*80a68eefSBob Badour {
15072*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 static const int8_t mask_POPCOUNT[16] = { /* 0 */ 0,/* 1 */ 1,/* 2 */ 1,/* 3 */ 2,
15073*80a68eefSBob Badour                                                                  /* 4 */ 1,/* 5 */ 2,/* 6 */ 2,/* 7 */ 3,
15074*80a68eefSBob Badour                                                                  /* 8 */ 1,/* 9 */ 2,/* a */ 2,/* b */ 3,
15075*80a68eefSBob Badour                                                                  /* c */ 2,/* d */ 3,/* e */ 3,/* f */ 4};
15076*80a68eefSBob Badour     __m128i maskLOW, mask, lowpopcnt, hipopcnt;
15077*80a68eefSBob Badour     maskLOW = _mm_set1_epi8(0x0f); //low 4 bits, need masking to avoid zero if MSB is set
15078*80a68eefSBob Badour     mask = _mm_and_si128(a, maskLOW);
15079*80a68eefSBob Badour     lowpopcnt = _mm_shuffle_epi8( *(__m128i*)mask_POPCOUNT, mask); //uses low 4 bits anyway
15080*80a68eefSBob Badour     mask =  _mm_srli_epi16(a, 4); //get high 4 bits as low bits
15081*80a68eefSBob Badour     mask = _mm_and_si128(mask, maskLOW); //low 4 bits, need masking to avoid zero if MSB is set
15082*80a68eefSBob Badour     hipopcnt = _mm_shuffle_epi8( *(__m128i*) mask_POPCOUNT, mask); //uses low 4 bits anyway
15083*80a68eefSBob Badour     return _mm_add_epi8(lowpopcnt, hipopcnt);
15084*80a68eefSBob Badour }
15085*80a68eefSBob Badour 
15086*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t vcntq_s8(int8x16_t a); // VCNT.8 q0,q0
15087*80a68eefSBob Badour #define vcntq_s8 vcntq_u8
15088*80a68eefSBob Badour 
15089*80a68eefSBob Badour _NEON2SSESTORAGE poly8x16_t vcntq_p8(poly8x16_t a); // VCNT.8 q0,q0
15090*80a68eefSBob Badour #define vcntq_p8 vcntq_u8
15091*80a68eefSBob Badour 
15092*80a68eefSBob Badour //**************************************************************************************
15093*80a68eefSBob Badour //*********************** Logical operations ****************************************
15094*80a68eefSBob Badour //**************************************************************************************
15095*80a68eefSBob Badour //************************** Bitwise not ***********************************
15096*80a68eefSBob Badour //several bitwise NOT implementations are possible for SIMD, e.g. XOR with all ones; the following ANDNOT-based one gives good performance (see the equivalence sketch below)
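//Equivalence sketch (illustrative only, helper names assumed): both forms compute a full bitwise NOT;
//the implementations below use the ANDNOT form:
#if 0
static __m128i not_via_xor(__m128i a)
{
    __m128i ones = _mm_cmpeq_epi8(a, a);      //all bits set
    return _mm_xor_si128(a, ones);            //~a = a ^ 0xff...ff
}
static __m128i not_via_andnot(__m128i a)
{
    __m128i ones = _mm_cmpeq_epi8(a, a);
    return _mm_andnot_si128(a, ones);         //~a = (~a) & 0xff...ff
}
#endif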
15097*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vmvn_s8(int8x8_t a); // VMVN d0,d0
15098*80a68eefSBob Badour _NEON2SSE_INLINE int8x8_t vmvn_s8(int8x8_t a)
15099*80a68eefSBob Badour {
15100*80a68eefSBob Badour     int8x8_t res64;
15101*80a68eefSBob Badour     __m128i res;
15102*80a68eefSBob Badour     res = vmvnq_s8(_pM128i(a));
15103*80a68eefSBob Badour     return64(res);
15104*80a68eefSBob Badour }
15105*80a68eefSBob Badour 
15106*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vmvn_s16(int16x4_t a); // VMVN d0,d0
15107*80a68eefSBob Badour _NEON2SSE_INLINE int16x4_t vmvn_s16(int16x4_t a)
15108*80a68eefSBob Badour {
15109*80a68eefSBob Badour     int16x4_t res64;
15110*80a68eefSBob Badour     __m128i res;
15111*80a68eefSBob Badour     res = vmvnq_s16(_pM128i(a));
15112*80a68eefSBob Badour     return64(res);
15113*80a68eefSBob Badour }
15114*80a68eefSBob Badour 
15115*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vmvn_s32(int32x2_t a); // VMVN d0,d0
15116*80a68eefSBob Badour _NEON2SSE_INLINE int32x2_t vmvn_s32(int32x2_t a)
15117*80a68eefSBob Badour {
15118*80a68eefSBob Badour     int32x2_t res64;
15119*80a68eefSBob Badour     __m128i res;
15120*80a68eefSBob Badour     res = vmvnq_s32(_pM128i(a));
15121*80a68eefSBob Badour     return64(res);
15122*80a68eefSBob Badour }
15123*80a68eefSBob Badour 
15124*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vmvn_u8(uint8x8_t a); // VMVN d0,d0
15125*80a68eefSBob Badour #define vmvn_u8 vmvn_s8
15126*80a68eefSBob Badour 
15127*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vmvn_u16(uint16x4_t a); // VMVN d0,d0
15128*80a68eefSBob Badour #define vmvn_u16 vmvn_s16
15129*80a68eefSBob Badour 
15130*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vmvn_u32(uint32x2_t a); // VMVN d0,d0
15131*80a68eefSBob Badour #define vmvn_u32 vmvn_s32
15132*80a68eefSBob Badour 
15133*80a68eefSBob Badour _NEON2SSESTORAGE poly8x8_t vmvn_p8(poly8x8_t a); // VMVN d0,d0
15134*80a68eefSBob Badour #define vmvn_p8 vmvn_u8
15135*80a68eefSBob Badour 
15136*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t vmvnq_s8(int8x16_t a); // VMVN q0,q0
15137*80a68eefSBob Badour _NEON2SSE_INLINE int8x16_t vmvnq_s8(int8x16_t a) // VMVN q0,q0
15138*80a68eefSBob Badour {
15139*80a68eefSBob Badour     __m128i c1;
15140*80a68eefSBob Badour     c1 = _mm_cmpeq_epi8 (a,a); //0xff
15141*80a68eefSBob Badour     return _mm_andnot_si128 (a, c1);
15142*80a68eefSBob Badour }
15143*80a68eefSBob Badour 
15144*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vmvnq_s16(int16x8_t a); // VMVN q0,q0
15145*80a68eefSBob Badour _NEON2SSE_INLINE int16x8_t vmvnq_s16(int16x8_t a) // VMVN q0,q0
15146*80a68eefSBob Badour {
15147*80a68eefSBob Badour     __m128i c1;
15148*80a68eefSBob Badour     c1 = _mm_cmpeq_epi16 (a,a); //0xffff
15149*80a68eefSBob Badour     return _mm_andnot_si128 (a, c1);
15150*80a68eefSBob Badour }
15151*80a68eefSBob Badour 
15152*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vmvnq_s32(int32x4_t a); // VMVN q0,q0
15153*80a68eefSBob Badour _NEON2SSE_INLINE int32x4_t vmvnq_s32(int32x4_t a) // VMVN q0,q0
15154*80a68eefSBob Badour {
15155*80a68eefSBob Badour     __m128i c1;
15156*80a68eefSBob Badour     c1 = _mm_cmpeq_epi32 (a,a); //0xffffffff
15157*80a68eefSBob Badour     return _mm_andnot_si128 (a, c1);
15158*80a68eefSBob Badour }
15159*80a68eefSBob Badour 
15160*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vmvnq_u8(uint8x16_t a); // VMVN q0,q0
15161*80a68eefSBob Badour #define vmvnq_u8 vmvnq_s8
15162*80a68eefSBob Badour 
15163*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vmvnq_u16(uint16x8_t a); // VMVN q0,q0
15164*80a68eefSBob Badour #define vmvnq_u16 vmvnq_s16
15165*80a68eefSBob Badour 
15166*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vmvnq_u32(uint32x4_t a); // VMVN q0,q0
15167*80a68eefSBob Badour #define vmvnq_u32 vmvnq_s32
15168*80a68eefSBob Badour 
15169*80a68eefSBob Badour _NEON2SSESTORAGE poly8x16_t vmvnq_p8(poly8x16_t a); // VMVN q0,q0
15170*80a68eefSBob Badour #define vmvnq_p8 vmvnq_u8
15171*80a68eefSBob Badour 
15172*80a68eefSBob Badour //****************** Bitwise and ***********************
15173*80a68eefSBob Badour //******************************************************
15174*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vand_s8(int8x8_t a, int8x8_t b); // VAND d0,d0,d0
15175*80a68eefSBob Badour _NEON2SSE_INLINE int8x8_t vand_s8(int8x8_t a, int8x8_t b)
15176*80a68eefSBob Badour {
15177*80a68eefSBob Badour     int8x8_t res64;
15178*80a68eefSBob Badour     return64(_mm_and_si128(_pM128i(a),_pM128i(b)));
15179*80a68eefSBob Badour }
15180*80a68eefSBob Badour 
15181*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vand_s16(int16x4_t a, int16x4_t b); // VAND d0,d0,d0
15182*80a68eefSBob Badour _NEON2SSE_INLINE int16x4_t vand_s16(int16x4_t a, int16x4_t b)
15183*80a68eefSBob Badour {
15184*80a68eefSBob Badour     int16x4_t res64;
15185*80a68eefSBob Badour     return64(_mm_and_si128(_pM128i(a),_pM128i(b)));
15186*80a68eefSBob Badour }
15187*80a68eefSBob Badour 
15188*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vand_s32(int32x2_t a, int32x2_t b); // VAND d0,d0,d0
15189*80a68eefSBob Badour _NEON2SSE_INLINE int32x2_t vand_s32(int32x2_t a, int32x2_t b)
15190*80a68eefSBob Badour {
15191*80a68eefSBob Badour     int32x2_t res64;
15192*80a68eefSBob Badour     return64(_mm_and_si128(_pM128i(a),_pM128i(b)));
15193*80a68eefSBob Badour }
15194*80a68eefSBob Badour 
15195*80a68eefSBob Badour 
15196*80a68eefSBob Badour _NEON2SSESTORAGE int64x1_t vand_s64(int64x1_t a,  int64x1_t b); // VAND d0,d0,d0
15197*80a68eefSBob Badour _NEON2SSE_INLINE int64x1_t vand_s64(int64x1_t a,  int64x1_t b)
15198*80a68eefSBob Badour {
15199*80a68eefSBob Badour     int64x1_t res;
15200*80a68eefSBob Badour     res.m64_i64[0] = a.m64_i64[0] & b.m64_i64[0];
15201*80a68eefSBob Badour     return res;
15202*80a68eefSBob Badour }
15203*80a68eefSBob Badour 
15204*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vand_u8(uint8x8_t a, uint8x8_t b); // VAND d0,d0,d0
15205*80a68eefSBob Badour #define vand_u8 vand_s8
15206*80a68eefSBob Badour 
15207*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vand_u16(uint16x4_t a, uint16x4_t b); // VAND d0,d0,d0
15208*80a68eefSBob Badour #define vand_u16 vand_s16
15209*80a68eefSBob Badour 
15210*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vand_u32(uint32x2_t a, uint32x2_t b); // VAND d0,d0,d0
15211*80a68eefSBob Badour #define vand_u32 vand_s32
15212*80a68eefSBob Badour 
15213*80a68eefSBob Badour _NEON2SSESTORAGE uint64x1_t vand_u64(uint64x1_t a,  uint64x1_t b); // VAND d0,d0,d0
15214*80a68eefSBob Badour #define vand_u64 vand_s64
15215*80a68eefSBob Badour 
15216*80a68eefSBob Badour 
15217*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t   vandq_s8(int8x16_t a, int8x16_t b); // VAND q0,q0,q0
15218*80a68eefSBob Badour #define vandq_s8 _mm_and_si128
15219*80a68eefSBob Badour 
15220*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t   vandq_s16(int16x8_t a, int16x8_t b); // VAND q0,q0,q0
15221*80a68eefSBob Badour #define vandq_s16 _mm_and_si128
15222*80a68eefSBob Badour 
15223*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t   vandq_s32(int32x4_t a, int32x4_t b); // VAND q0,q0,q0
15224*80a68eefSBob Badour #define vandq_s32 _mm_and_si128
15225*80a68eefSBob Badour 
15226*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t   vandq_s64(int64x2_t a, int64x2_t b); // VAND q0,q0,q0
15227*80a68eefSBob Badour #define vandq_s64 _mm_and_si128
15228*80a68eefSBob Badour 
15229*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t   vandq_u8(uint8x16_t a, uint8x16_t b); // VAND q0,q0,q0
15230*80a68eefSBob Badour #define vandq_u8 _mm_and_si128
15231*80a68eefSBob Badour 
15232*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t   vandq_u16(uint16x8_t a, uint16x8_t b); // VAND q0,q0,q0
15233*80a68eefSBob Badour #define vandq_u16 _mm_and_si128
15234*80a68eefSBob Badour 
15235*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t   vandq_u32(uint32x4_t a, uint32x4_t b); // VAND q0,q0,q0
15236*80a68eefSBob Badour #define vandq_u32 _mm_and_si128
15237*80a68eefSBob Badour 
15238*80a68eefSBob Badour _NEON2SSESTORAGE uint64x2_t   vandq_u64(uint64x2_t a, uint64x2_t b); // VAND q0,q0,q0
15239*80a68eefSBob Badour #define vandq_u64 _mm_and_si128
15240*80a68eefSBob Badour 
15241*80a68eefSBob Badour //******************** Bitwise or *********************************
15242*80a68eefSBob Badour //******************************************************************
15243*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vorr_s8(int8x8_t a, int8x8_t b); // VORR d0,d0,d0
15244*80a68eefSBob Badour _NEON2SSE_INLINE int8x8_t vorr_s8(int8x8_t a, int8x8_t b)
15245*80a68eefSBob Badour {
15246*80a68eefSBob Badour     int8x8_t res64;
15247*80a68eefSBob Badour     return64(_mm_or_si128(_pM128i(a),_pM128i(b)));
15248*80a68eefSBob Badour }
15249*80a68eefSBob Badour 
15250*80a68eefSBob Badour 
15251*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vorr_s16(int16x4_t a, int16x4_t b); // VORR d0,d0,d0
15252*80a68eefSBob Badour _NEON2SSE_INLINE int16x4_t vorr_s16(int16x4_t a, int16x4_t b)
15253*80a68eefSBob Badour {
15254*80a68eefSBob Badour     int16x4_t res64;
15255*80a68eefSBob Badour     return64(_mm_or_si128(_pM128i(a),_pM128i(b)));
15256*80a68eefSBob Badour }
15257*80a68eefSBob Badour 
15258*80a68eefSBob Badour 
15259*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vorr_s32(int32x2_t a, int32x2_t b); // VORR d0,d0,d0
15260*80a68eefSBob Badour _NEON2SSE_INLINE int32x2_t vorr_s32(int32x2_t a, int32x2_t b)
15261*80a68eefSBob Badour {
15262*80a68eefSBob Badour     int32x2_t res64;
15263*80a68eefSBob Badour     return64(_mm_or_si128(_pM128i(a),_pM128i(b)));
15264*80a68eefSBob Badour }
15265*80a68eefSBob Badour 
15266*80a68eefSBob Badour 
15267*80a68eefSBob Badour _NEON2SSESTORAGE int64x1_t vorr_s64(int64x1_t a,  int64x1_t b); // VORR d0,d0,d0
15268*80a68eefSBob Badour _NEON2SSE_INLINE int64x1_t vorr_s64(int64x1_t a,  int64x1_t b)
15269*80a68eefSBob Badour {
15270*80a68eefSBob Badour     int64x1_t res;
15271*80a68eefSBob Badour     res.m64_i64[0] = a.m64_i64[0] | b.m64_i64[0];
15272*80a68eefSBob Badour     return res;
15273*80a68eefSBob Badour }
15274*80a68eefSBob Badour 
15275*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vorr_u8(uint8x8_t a, uint8x8_t b); // VORR d0,d0,d0
15276*80a68eefSBob Badour #define vorr_u8 vorr_s8
15277*80a68eefSBob Badour 
15278*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vorr_u16(uint16x4_t a, uint16x4_t b); // VORR d0,d0,d0
15279*80a68eefSBob Badour #define vorr_u16 vorr_s16
15280*80a68eefSBob Badour 
15281*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vorr_u32(uint32x2_t a, uint32x2_t b); // VORR d0,d0,d0
15282*80a68eefSBob Badour #define vorr_u32 vorr_s32
15283*80a68eefSBob Badour 
15284*80a68eefSBob Badour _NEON2SSESTORAGE uint64x1_t vorr_u64(uint64x1_t a,  uint64x1_t b); // VORR d0,d0,d0
15285*80a68eefSBob Badour #define vorr_u64 vorr_s64
15286*80a68eefSBob Badour 
15287*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t   vorrq_s8(int8x16_t a, int8x16_t b); // VORR q0,q0,q0
15288*80a68eefSBob Badour #define vorrq_s8 _mm_or_si128
15289*80a68eefSBob Badour 
15290*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t   vorrq_s16(int16x8_t a, int16x8_t b); // VORR q0,q0,q0
15291*80a68eefSBob Badour #define vorrq_s16 _mm_or_si128
15292*80a68eefSBob Badour 
15293*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t   vorrq_s32(int32x4_t a, int32x4_t b); // VORR q0,q0,q0
15294*80a68eefSBob Badour #define vorrq_s32 _mm_or_si128
15295*80a68eefSBob Badour 
15296*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t   vorrq_s64(int64x2_t a, int64x2_t b); // VORR q0,q0,q0
15297*80a68eefSBob Badour #define vorrq_s64 _mm_or_si128
15298*80a68eefSBob Badour 
15299*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t   vorrq_u8(uint8x16_t a, uint8x16_t b); // VORR q0,q0,q0
15300*80a68eefSBob Badour #define vorrq_u8 _mm_or_si128
15301*80a68eefSBob Badour 
15302*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t   vorrq_u16(uint16x8_t a, uint16x8_t b); // VORR q0,q0,q0
15303*80a68eefSBob Badour #define vorrq_u16 _mm_or_si128
15304*80a68eefSBob Badour 
15305*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t   vorrq_u32(uint32x4_t a, uint32x4_t b); // VORR q0,q0,q0
15306*80a68eefSBob Badour #define vorrq_u32 _mm_or_si128
15307*80a68eefSBob Badour 
15308*80a68eefSBob Badour _NEON2SSESTORAGE uint64x2_t   vorrq_u64(uint64x2_t a, uint64x2_t b); // VORR q0,q0,q0
15309*80a68eefSBob Badour #define vorrq_u64 _mm_or_si128
15310*80a68eefSBob Badour 
15311*80a68eefSBob Badour //************* Bitwise exclusive or (EOR or XOR) ******************
15312*80a68eefSBob Badour //*******************************************************************
15313*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t veor_s8(int8x8_t a, int8x8_t b); // VEOR d0,d0,d0
15314*80a68eefSBob Badour _NEON2SSE_INLINE int8x8_t veor_s8(int8x8_t a, int8x8_t b)
15315*80a68eefSBob Badour {
15316*80a68eefSBob Badour     int8x8_t res64;
15317*80a68eefSBob Badour     return64(_mm_xor_si128(_pM128i(a),_pM128i(b)));
15318*80a68eefSBob Badour }
15319*80a68eefSBob Badour 
15320*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t veor_s16(int16x4_t a, int16x4_t b); // VEOR d0,d0,d0
15321*80a68eefSBob Badour #define veor_s16 veor_s8
15322*80a68eefSBob Badour 
15323*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t veor_s32(int32x2_t a, int32x2_t b); // VEOR d0,d0,d0
15324*80a68eefSBob Badour #define veor_s32 veor_s8
15325*80a68eefSBob Badour 
15326*80a68eefSBob Badour _NEON2SSESTORAGE int64x1_t veor_s64(int64x1_t a,  int64x1_t b); // VEOR d0,d0,d0
15327*80a68eefSBob Badour _NEON2SSE_INLINE int64x1_t veor_s64(int64x1_t a,  int64x1_t b)
15328*80a68eefSBob Badour {
15329*80a68eefSBob Badour     int64x1_t res;
15330*80a68eefSBob Badour     res.m64_i64[0] = a.m64_i64[0] ^ b.m64_i64[0];
15331*80a68eefSBob Badour     return res;
15332*80a68eefSBob Badour }
15333*80a68eefSBob Badour 
15334*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t veor_u8(uint8x8_t a, uint8x8_t b); // VEOR d0,d0,d0
15335*80a68eefSBob Badour #define veor_u8 veor_s8
15336*80a68eefSBob Badour 
15337*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t veor_u16(uint16x4_t a, uint16x4_t b); // VEOR d0,d0,d0
15338*80a68eefSBob Badour #define veor_u16 veor_s16
15339*80a68eefSBob Badour 
15340*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t veor_u32(uint32x2_t a, uint32x2_t b); // VEOR d0,d0,d0
15341*80a68eefSBob Badour #define veor_u32 veor_s32
15342*80a68eefSBob Badour 
15343*80a68eefSBob Badour _NEON2SSESTORAGE uint64x1_t veor_u64(uint64x1_t a,  uint64x1_t b); // VEOR d0,d0,d0
15344*80a68eefSBob Badour #define veor_u64 veor_s64
15345*80a68eefSBob Badour 
15346*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t   veorq_s8(int8x16_t a, int8x16_t b); // VEOR q0,q0,q0
15347*80a68eefSBob Badour #define veorq_s8 _mm_xor_si128
15348*80a68eefSBob Badour 
15349*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t   veorq_s16(int16x8_t a, int16x8_t b); // VEOR q0,q0,q0
15350*80a68eefSBob Badour #define veorq_s16 _mm_xor_si128
15351*80a68eefSBob Badour 
15352*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t   veorq_s32(int32x4_t a, int32x4_t b); // VEOR q0,q0,q0
15353*80a68eefSBob Badour #define veorq_s32 _mm_xor_si128
15354*80a68eefSBob Badour 
15355*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t   veorq_s64(int64x2_t a, int64x2_t b); // VEOR q0,q0,q0
15356*80a68eefSBob Badour #define veorq_s64 _mm_xor_si128
15357*80a68eefSBob Badour 
15358*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t   veorq_u8(uint8x16_t a, uint8x16_t b); // VEOR q0,q0,q0
15359*80a68eefSBob Badour #define veorq_u8 _mm_xor_si128
15360*80a68eefSBob Badour 
15361*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t   veorq_u16(uint16x8_t a, uint16x8_t b); // VEOR q0,q0,q0
15362*80a68eefSBob Badour #define veorq_u16 _mm_xor_si128
15363*80a68eefSBob Badour 
15364*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t   veorq_u32(uint32x4_t a, uint32x4_t b); // VEOR q0,q0,q0
15365*80a68eefSBob Badour #define veorq_u32 _mm_xor_si128
15366*80a68eefSBob Badour 
15367*80a68eefSBob Badour _NEON2SSESTORAGE uint64x2_t   veorq_u64(uint64x2_t a, uint64x2_t b); // VEOR q0,q0,q0
15368*80a68eefSBob Badour #define veorq_u64 _mm_xor_si128
15369*80a68eefSBob Badour 
15370*80a68eefSBob Badour //********************** Bit Clear **********************************
15371*80a68eefSBob Badour //*******************************************************************
15372*80a68eefSBob Badour //Logical AND complement (AND negation or AND NOT)
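//Operand-order sketch (illustrative only, helper name assumed): NEON BIC computes a & ~b, while
//_mm_andnot_si128(x, y) computes (~x) & y, so the SSE arguments must be swapped, as done in all implementations below:
#if 0
static __m128i bic_sketch(__m128i a, __m128i b)
{
    return _mm_andnot_si128(b, a);            //a & ~b
}
#endif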
15373*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t   vbic_s8(int8x8_t a, int8x8_t b); // VBIC d0,d0,d0
15374*80a68eefSBob Badour _NEON2SSE_INLINE int8x8_t   vbic_s8(int8x8_t a, int8x8_t b)
15375*80a68eefSBob Badour {
15376*80a68eefSBob Badour     int8x8_t res64;
15377*80a68eefSBob Badour     return64(_mm_andnot_si128(_pM128i(b),_pM128i(a))); //notice the arguments "swap"
15378*80a68eefSBob Badour }
15379*80a68eefSBob Badour 
15380*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t   vbic_s16(int16x4_t a, int16x4_t b); // VBIC d0,d0,d0
15381*80a68eefSBob Badour #define vbic_s16 vbic_s8
15382*80a68eefSBob Badour 
15383*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t   vbic_s32(int32x2_t a, int32x2_t b); // VBIC d0,d0,d0
15384*80a68eefSBob Badour #define vbic_s32 vbic_s8
15385*80a68eefSBob Badour 
15386*80a68eefSBob Badour _NEON2SSESTORAGE int64x1_t   vbic_s64(int64x1_t a, int64x1_t b); // VBIC d0,d0,d0
15387*80a68eefSBob Badour _NEON2SSE_INLINE int64x1_t   vbic_s64(int64x1_t a, int64x1_t b)
15388*80a68eefSBob Badour {
15389*80a68eefSBob Badour     int64x1_t res;
15390*80a68eefSBob Badour     res.m64_i64[0] = a.m64_i64[0] & (~b.m64_i64[0]);
15391*80a68eefSBob Badour     return res;
15392*80a68eefSBob Badour }
15393*80a68eefSBob Badour 
15394*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t   vbic_u8(uint8x8_t a, uint8x8_t b); // VBIC d0,d0,d0
15395*80a68eefSBob Badour #define vbic_u8 vbic_s8
15396*80a68eefSBob Badour 
15397*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t   vbic_u16(uint16x4_t a, uint16x4_t b); // VBIC d0,d0,d0
15398*80a68eefSBob Badour #define vbic_u16 vbic_s16
15399*80a68eefSBob Badour 
15400*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t   vbic_u32(uint32x2_t a, uint32x2_t b); // VBIC d0,d0,d0
15401*80a68eefSBob Badour #define vbic_u32 vbic_s32
15402*80a68eefSBob Badour 
15403*80a68eefSBob Badour _NEON2SSESTORAGE uint64x1_t   vbic_u64(uint64x1_t a, uint64x1_t b); // VBIC d0,d0,d0
15404*80a68eefSBob Badour #define vbic_u64 vbic_s64
15405*80a68eefSBob Badour 
15406*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t   vbicq_s8(int8x16_t a, int8x16_t b); // VBIC q0,q0,q0
15407*80a68eefSBob Badour #define vbicq_s8(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
15408*80a68eefSBob Badour 
15409*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t   vbicq_s16(int16x8_t a, int16x8_t b); // VBIC q0,q0,q0
15410*80a68eefSBob Badour #define vbicq_s16(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
15411*80a68eefSBob Badour 
15412*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t   vbicq_s32(int32x4_t a, int32x4_t b); // VBIC q0,q0,q0
15413*80a68eefSBob Badour #define vbicq_s32(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
15414*80a68eefSBob Badour 
15415*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t   vbicq_s64(int64x2_t a, int64x2_t b); // VBIC q0,q0,q0
15416*80a68eefSBob Badour #define vbicq_s64(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
15417*80a68eefSBob Badour 
15418*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t   vbicq_u8(uint8x16_t a, uint8x16_t b); // VBIC q0,q0,q0
15419*80a68eefSBob Badour #define vbicq_u8(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
15420*80a68eefSBob Badour 
15421*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t   vbicq_u16(uint16x8_t a, uint16x8_t b); // VBIC q0,q0,q0
15422*80a68eefSBob Badour #define vbicq_u16(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
15423*80a68eefSBob Badour 
15424*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t   vbicq_u32(uint32x4_t a, uint32x4_t b); // VBIC q0,q0,q0
15425*80a68eefSBob Badour #define vbicq_u32(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
15426*80a68eefSBob Badour 
15427*80a68eefSBob Badour _NEON2SSESTORAGE uint64x2_t   vbicq_u64(uint64x2_t a, uint64x2_t b); // VBIC q0,q0,q0
15428*80a68eefSBob Badour #define vbicq_u64(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
15429*80a68eefSBob Badour 
15430*80a68eefSBob Badour //**************** Bitwise OR complement ********************************
15431*80a68eefSBob Badour //************************************************************************
15432*80a68eefSBob Badour //no exact IA32 match, it needs to be implemented as follows
15433*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vorn_s8(int8x8_t a,  int8x8_t b); // VORN d0,d0,d0
15434*80a68eefSBob Badour _NEON2SSE_INLINE int8x8_t vorn_s8(int8x8_t a,  int8x8_t b)
15435*80a68eefSBob Badour {
15436*80a68eefSBob Badour     int8x8_t res64;
15437*80a68eefSBob Badour     return64(vornq_s8(_pM128i(a), _pM128i(b)));
15438*80a68eefSBob Badour }
15439*80a68eefSBob Badour 
15440*80a68eefSBob Badour 
15441*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vorn_s16(int16x4_t a,  int16x4_t b); // VORN d0,d0,d0
15442*80a68eefSBob Badour _NEON2SSE_INLINE int16x4_t vorn_s16(int16x4_t a,  int16x4_t b)
15443*80a68eefSBob Badour {
15444*80a68eefSBob Badour     int16x4_t res64;
15445*80a68eefSBob Badour     return64(vornq_s16(_pM128i(a), _pM128i(b)));
15446*80a68eefSBob Badour }
15447*80a68eefSBob Badour 
15448*80a68eefSBob Badour 
15449*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vorn_s32(int32x2_t a,  int32x2_t b); // VORN d0,d0,d0
15450*80a68eefSBob Badour _NEON2SSE_INLINE int32x2_t vorn_s32(int32x2_t a,  int32x2_t b)
15451*80a68eefSBob Badour {
15452*80a68eefSBob Badour     int32x2_t res64;
15453*80a68eefSBob Badour     return64(vornq_s32(_pM128i(a), _pM128i(b)));
15454*80a68eefSBob Badour }
15455*80a68eefSBob Badour 
15456*80a68eefSBob Badour 
15457*80a68eefSBob Badour _NEON2SSESTORAGE int64x1_t vorn_s64(int64x1_t a, int64x1_t b); // VORN d0,d0,d0
15458*80a68eefSBob Badour _NEON2SSE_INLINE int64x1_t vorn_s64(int64x1_t a, int64x1_t b)
15459*80a68eefSBob Badour {
15460*80a68eefSBob Badour     int64x1_t res;
15461*80a68eefSBob Badour     res.m64_i64[0] = a.m64_i64[0] | (~b.m64_i64[0]);
15462*80a68eefSBob Badour     return res;
15463*80a68eefSBob Badour }
15464*80a68eefSBob Badour 
15465*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vorn_u8(uint8x8_t a,  uint8x8_t b); // VORN d0,d0,d0
15466*80a68eefSBob Badour #define vorn_u8 vorn_s8
15467*80a68eefSBob Badour 
15468*80a68eefSBob Badour 
15469*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vorn_u16(uint16x4_t a,  uint16x4_t b); // VORN d0,d0,d0
15470*80a68eefSBob Badour #define vorn_u16 vorn_s16
15471*80a68eefSBob Badour 
15472*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vorn_u32(uint32x2_t a,  uint32x2_t b); // VORN d0,d0,d0
15473*80a68eefSBob Badour #define vorn_u32 vorn_s32
15474*80a68eefSBob Badour 
15475*80a68eefSBob Badour _NEON2SSESTORAGE uint64x1_t vorn_u64(uint64x1_t a, uint64x1_t b); // VORN d0,d0,d0
15476*80a68eefSBob Badour #define vorn_u64 vorn_s64
15477*80a68eefSBob Badour 
15478*80a68eefSBob Badour 
15479*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t vornq_s8(int8x16_t a, int8x16_t b); // VORN q0,q0,q0
15480*80a68eefSBob Badour _NEON2SSE_INLINE int8x16_t vornq_s8(int8x16_t a, int8x16_t b) // VORN q0,q0,q0
15481*80a68eefSBob Badour {
15482*80a68eefSBob Badour     __m128i b1;
15483*80a68eefSBob Badour     b1 = vmvnq_s8( b); //bitwise not for b
15484*80a68eefSBob Badour     return _mm_or_si128 (a, b1);
15485*80a68eefSBob Badour }
15486*80a68eefSBob Badour 
15487*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vornq_s16(int16x8_t a, int16x8_t b); // VORN q0,q0,q0
15488*80a68eefSBob Badour _NEON2SSE_INLINE int16x8_t vornq_s16(int16x8_t a, int16x8_t b) // VORN q0,q0,q0
15489*80a68eefSBob Badour {
15490*80a68eefSBob Badour     __m128i b1;
15491*80a68eefSBob Badour     b1 = vmvnq_s16( b); //bitwise not for b
15492*80a68eefSBob Badour     return _mm_or_si128 (a, b1);
15493*80a68eefSBob Badour }
15494*80a68eefSBob Badour 
15495*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vornq_s32(int32x4_t a, int32x4_t b); // VORN q0,q0,q0
15496*80a68eefSBob Badour _NEON2SSE_INLINE int32x4_t vornq_s32(int32x4_t a, int32x4_t b) // VORN q0,q0,q0
15497*80a68eefSBob Badour {
15498*80a68eefSBob Badour     __m128i b1;
15499*80a68eefSBob Badour     b1 = vmvnq_s32( b); //bitwise not for b
15500*80a68eefSBob Badour     return _mm_or_si128 (a, b1);
15501*80a68eefSBob Badour }
15502*80a68eefSBob Badour 
15503*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vornq_s64(int64x2_t a, int64x2_t b); // VORN q0,q0,q0
15504*80a68eefSBob Badour _NEON2SSE_INLINE int64x2_t vornq_s64(int64x2_t a, int64x2_t b)
15505*80a68eefSBob Badour {
15506*80a68eefSBob Badour     __m128i c1, b1;
15507*80a68eefSBob Badour     c1 = _mm_cmpeq_epi8 (a, a); //all ones 0xfffffff...fffff
15508*80a68eefSBob Badour     b1 = _mm_andnot_si128 (b, c1);
15509*80a68eefSBob Badour     return _mm_or_si128 (a, b1);
15510*80a68eefSBob Badour }
15511*80a68eefSBob Badour 
15512*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vornq_u8(uint8x16_t a, uint8x16_t b); // VORN q0,q0,q0
15513*80a68eefSBob Badour _NEON2SSE_INLINE uint8x16_t vornq_u8(uint8x16_t a, uint8x16_t b) // VORN q0,q0,q0
15514*80a68eefSBob Badour {
15515*80a68eefSBob Badour     __m128i b1;
15516*80a68eefSBob Badour     b1 = vmvnq_u8( b); //bitwise not for b
15517*80a68eefSBob Badour     return _mm_or_si128 (a, b1);
15518*80a68eefSBob Badour }
15519*80a68eefSBob Badour 
15520*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vornq_u16(uint16x8_t a, uint16x8_t b); // VORN q0,q0,q0
15521*80a68eefSBob Badour _NEON2SSE_INLINE uint16x8_t vornq_u16(uint16x8_t a, uint16x8_t b) // VORN q0,q0,q0
15522*80a68eefSBob Badour {
15523*80a68eefSBob Badour     __m128i b1;
15524*80a68eefSBob Badour     b1 = vmvnq_s16( b); //bitwise not for b
15525*80a68eefSBob Badour     return _mm_or_si128 (a, b1);
15526*80a68eefSBob Badour }
15527*80a68eefSBob Badour 
15528*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vornq_u32(uint32x4_t a, uint32x4_t b); // VORN q0,q0,q0
15529*80a68eefSBob Badour _NEON2SSE_INLINE uint32x4_t vornq_u32(uint32x4_t a, uint32x4_t b) // VORN q0,q0,q0
15530*80a68eefSBob Badour {
15531*80a68eefSBob Badour     __m128i b1;
15532*80a68eefSBob Badour     b1 = vmvnq_u32( b); //bitwise not for b
15533*80a68eefSBob Badour     return _mm_or_si128 (a, b1);
15534*80a68eefSBob Badour }
15535*80a68eefSBob Badour _NEON2SSESTORAGE uint64x2_t vornq_u64(uint64x2_t a, uint64x2_t b); // VORN q0,q0,q0
15536*80a68eefSBob Badour #define vornq_u64 vornq_s64
15537*80a68eefSBob Badour 
15538*80a68eefSBob Badour //********************* Bitwise Select *****************************
15539*80a68eefSBob Badour //******************************************************************
15540*80a68eefSBob Badour //Note: this intrinsic can compile to any of VBSL/VBIF/VBIT depending on register allocation (the exact instruction chosen is unspecified)
15541*80a68eefSBob Badour 
15542*80a68eefSBob Badour //VBSL (Bitwise Select) selects each bit for the destination from the first operand if the
15543*80a68eefSBob Badour //corresponding bit of the destination is 1, or from the second operand if the corresponding bit of the destination is 0.
15544*80a68eefSBob Badour 
15545*80a68eefSBob Badour //VBIF (Bitwise Insert if False) inserts each bit from the first operand into the destination
15546*80a68eefSBob Badour //if the corresponding bit of the second operand is 0, otherwise leaves the destination bit unchanged
15547*80a68eefSBob Badour 
15548*80a68eefSBob Badour //VBIT (Bitwise Insert if True) inserts each bit from the first operand into the destination
15549*80a68eefSBob Badour //if the corresponding bit of the second operand is 1, otherwise leaves the destination bit unchanged.
15550*80a68eefSBob Badour 
15551*80a68eefSBob Badour //Only VBSL is implemented for SIMD (see the usage sketch below)
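//Typical usage sketch (illustrative only, helper name assumed): the select mask usually comes from a
//comparison, giving a per-lane selection such as a maximum:
#if 0
static int32x4_t max_s32_sketch(int32x4_t x, int32x4_t y)
{
    uint32x4_t mask = _mm_cmpgt_epi32(x, y);  //all-ones lanes where x > y, zero lanes elsewhere
    return vbslq_s32(mask, x, y);             //bits of x where the mask is 1, bits of y where it is 0
}
#endif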
15552*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vbsl_s8(uint8x8_t a, int8x8_t b, int8x8_t c); // VBSL d0,d0,d0
15553*80a68eefSBob Badour _NEON2SSE_INLINE int8x8_t vbsl_s8(uint8x8_t a, int8x8_t b, int8x8_t c)
15554*80a68eefSBob Badour {
15555*80a68eefSBob Badour     int8x8_t res64;
15556*80a68eefSBob Badour     __m128i res;
15557*80a68eefSBob Badour     res = vbslq_s8(_pM128i(a), _pM128i(b), _pM128i(c));
15558*80a68eefSBob Badour     return64(res);
15559*80a68eefSBob Badour }
15560*80a68eefSBob Badour 
15561*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vbsl_s16(uint16x4_t a, int16x4_t b, int16x4_t c); // VBSL d0,d0,d0
15562*80a68eefSBob Badour #define vbsl_s16 vbsl_s8
15563*80a68eefSBob Badour 
15564*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vbsl_s32(uint32x2_t a, int32x2_t b, int32x2_t c); // VBSL d0,d0,d0
15565*80a68eefSBob Badour #define vbsl_s32 vbsl_s8
15566*80a68eefSBob Badour 
15567*80a68eefSBob Badour _NEON2SSESTORAGE int64x1_t vbsl_s64(uint64x1_t a, int64x1_t b, int64x1_t c); // VBSL d0,d0,d0
15568*80a68eefSBob Badour _NEON2SSE_INLINE int64x1_t vbsl_s64(uint64x1_t a, int64x1_t b, int64x1_t c)
15569*80a68eefSBob Badour {
15570*80a68eefSBob Badour     int64x1_t res;
15571*80a68eefSBob Badour     res.m64_i64[0] = (a.m64_i64[0] & b.m64_i64[0]) | ( (~a.m64_i64[0]) & c.m64_i64[0]);
15572*80a68eefSBob Badour     return res;
15573*80a68eefSBob Badour }
15574*80a68eefSBob Badour 
15575*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vbsl_u8(uint8x8_t a,  uint8x8_t b, uint8x8_t c); // VBSL d0,d0,d0
15576*80a68eefSBob Badour #define vbsl_u8 vbsl_s8
15577*80a68eefSBob Badour 
15578*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4_t vbsl_u16(uint16x4_t a,  uint16x4_t b, uint16x4_t c); // VBSL d0,d0,d0
15579*80a68eefSBob Badour #define vbsl_u16 vbsl_s8
15580*80a68eefSBob Badour 
15581*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2_t vbsl_u32(uint32x2_t a,  uint32x2_t b, uint32x2_t c); // VBSL d0,d0,d0
15582*80a68eefSBob Badour #define vbsl_u32 vbsl_s8
15583*80a68eefSBob Badour 
15584*80a68eefSBob Badour _NEON2SSESTORAGE uint64x1_t vbsl_u64(uint64x1_t a, uint64x1_t b, uint64x1_t c); // VBSL d0,d0,d0
15585*80a68eefSBob Badour #define vbsl_u64 vbsl_s64
15586*80a68eefSBob Badour 
15587*80a68eefSBob Badour _NEON2SSESTORAGE float32x2_t vbsl_f32(uint32x2_t a, float32x2_t b, float32x2_t c); // VBSL d0,d0,d0
15588*80a68eefSBob Badour _NEON2SSE_INLINE float32x2_t vbsl_f32(uint32x2_t a, float32x2_t b, float32x2_t c)
15589*80a68eefSBob Badour {
15590*80a68eefSBob Badour     __m128 sel1, sel2;
15591*80a68eefSBob Badour     __m64_128 res64;
15592*80a68eefSBob Badour     sel1 = _mm_and_ps   (_pM128(a), _pM128(b));
15593*80a68eefSBob Badour     sel2 = _mm_andnot_ps (_pM128(a), _pM128(c));
15594*80a68eefSBob Badour     sel1 = _mm_or_ps (sel1, sel2);
15595*80a68eefSBob Badour     _M64f(res64, sel1);
15596*80a68eefSBob Badour     return res64;
15597*80a68eefSBob Badour }
15598*80a68eefSBob Badour 
15599*80a68eefSBob Badour _NEON2SSESTORAGE poly8x8_t vbsl_p8(uint8x8_t a, poly8x8_t b, poly8x8_t c); // VBSL d0,d0,d0
15600*80a68eefSBob Badour #define  vbsl_p8 vbsl_s8
15601*80a68eefSBob Badour 
15602*80a68eefSBob Badour _NEON2SSESTORAGE poly16x4_t vbsl_p16(uint16x4_t a, poly16x4_t b, poly16x4_t c); // VBSL d0,d0,d0
15603*80a68eefSBob Badour #define  vbsl_p16 vbsl_s8
15604*80a68eefSBob Badour 
15605*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c); // VBSL q0,q0,q0
15606*80a68eefSBob Badour _NEON2SSE_INLINE int8x16_t vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c) // VBSL q0,q0,q0
15607*80a68eefSBob Badour {
15608*80a68eefSBob Badour     __m128i sel1, sel2;
15609*80a68eefSBob Badour     sel1 = _mm_and_si128   (a, b);
15610*80a68eefSBob Badour     sel2 = _mm_andnot_si128 (a, c);
15611*80a68eefSBob Badour     return _mm_or_si128 (sel1, sel2);
15612*80a68eefSBob Badour }
15613*80a68eefSBob Badour 
15614*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vbslq_s16(uint16x8_t a, int16x8_t b, int16x8_t c); // VBSL q0,q0,q0
15615*80a68eefSBob Badour #define vbslq_s16 vbslq_s8
15616*80a68eefSBob Badour 
15617*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vbslq_s32(uint32x4_t a, int32x4_t b, int32x4_t c); // VBSL q0,q0,q0
15618*80a68eefSBob Badour #define vbslq_s32 vbslq_s8
15619*80a68eefSBob Badour 
15620*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vbslq_s64(uint64x2_t a, int64x2_t b, int64x2_t c); // VBSL q0,q0,q0
15621*80a68eefSBob Badour #define vbslq_s64 vbslq_s8
15622*80a68eefSBob Badour 
15623*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vbslq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VBSL q0,q0,q0
15624*80a68eefSBob Badour #define vbslq_u8 vbslq_s8
15625*80a68eefSBob Badour 
15626*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8_t vbslq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VBSL q0,q0,q0
15627*80a68eefSBob Badour #define vbslq_u16 vbslq_s8
15628*80a68eefSBob Badour 
15629*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4_t vbslq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VBSL q0,q0,q0
15630*80a68eefSBob Badour #define vbslq_u32 vbslq_s8
15631*80a68eefSBob Badour 
15632*80a68eefSBob Badour _NEON2SSESTORAGE uint64x2_t vbslq_u64(uint64x2_t a, uint64x2_t b, uint64x2_t c); // VBSL q0,q0,q0
15633*80a68eefSBob Badour #define vbslq_u64 vbslq_s8
15634*80a68eefSBob Badour 
15635*80a68eefSBob Badour _NEON2SSESTORAGE float32x4_t vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c); // VBSL q0,q0,q0
15636*80a68eefSBob Badour _NEON2SSE_INLINE float32x4_t vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c) // VBSL q0,q0,q0
15637*80a68eefSBob Badour {
15638*80a68eefSBob Badour     __m128 sel1, sel2;
15639*80a68eefSBob Badour     sel1 = _mm_and_ps   (*(__m128*)&a, b);
15640*80a68eefSBob Badour     sel2 = _mm_andnot_ps (*(__m128*)&a, c);
15641*80a68eefSBob Badour     return _mm_or_ps (sel1, sel2);
15642*80a68eefSBob Badour }
15643*80a68eefSBob Badour 
15644*80a68eefSBob Badour _NEON2SSESTORAGE poly8x16_t vbslq_p8(uint8x16_t a, poly8x16_t b, poly8x16_t c); // VBSL q0,q0,q0
15645*80a68eefSBob Badour #define vbslq_p8 vbslq_u8
15646*80a68eefSBob Badour 
15647*80a68eefSBob Badour _NEON2SSESTORAGE poly16x8_t vbslq_p16(uint16x8_t a, poly16x8_t b, poly16x8_t c); // VBSL q0,q0,q0
15648*80a68eefSBob Badour #define vbslq_p16 vbslq_s8
15649*80a68eefSBob Badour 
15650*80a68eefSBob Badour //************************************************************************************
15651*80a68eefSBob Badour //**************** Transposition operations ****************************************
15652*80a68eefSBob Badour //************************************************************************************
15653*80a68eefSBob Badour //*****************  Vector Transpose ************************************************
15654*80a68eefSBob Badour //************************************************************************************
15655*80a68eefSBob Badour //VTRN (Vector Transpose) treats the elements of its operand vectors as elements of 2 x 2 matrices, and transposes the matrices.
15656*80a68eefSBob Badour // making the result look like (a0, b0, a2, b2, a4, b4, ...) and (a1, b1, a3, b3, a5, b5, ...) (a worked example follows)
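//Worked example for the d-register 16-bit case (illustrative only):
//  a = {1, 2, 3, 4},  b = {5, 6, 7, 8}
//  vtrn_s16(a, b).val[0] == {1, 5, 3, 7}   i.e. (a0, b0, a2, b2)
//  vtrn_s16(a, b).val[1] == {2, 6, 4, 8}   i.e. (a1, b1, a3, b3)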
15657*80a68eefSBob Badour _NEON2SSESTORAGE int8x8x2_t vtrn_s8(int8x8_t a, int8x8_t b); // VTRN.8 d0,d0
15658*80a68eefSBob Badour _NEON2SSE_INLINE int8x8x2_t vtrn_s8(int8x8_t a, int8x8_t b) // VTRN.8 d0,d0
15659*80a68eefSBob Badour {
15660*80a68eefSBob Badour     int8x8x2_t val;
15661*80a68eefSBob Badour     __m128i tmp, val0;
15662*80a68eefSBob Badour     tmp = _mm_unpacklo_epi8(_pM128i(a), _pM128i(b)); //a0,b0,a1,b1,a2,b2,a3,b3,...,a7,b7
15663*80a68eefSBob Badour     val0 = _mm_shuffle_epi8 (tmp, *(__m128i*)mask8_32_even_odd); //(a0, b0, a2, b2, a4, b4, a6, b6), (a1,b1, a3,b3, a5,b5, a7,b7)
15664*80a68eefSBob Badour     vst1q_s8 (val.val, val0); // _mm_shuffle_epi32 (val.val[0], _SWAP_HI_LOW32); //(a1,b1, a3,b3, a5,b5, a7,b7),(a0, b0, a2, b2, a4, b4, a6, b6),
15665*80a68eefSBob Badour     return val;
15666*80a68eefSBob Badour }
15667*80a68eefSBob Badour 
15668*80a68eefSBob Badour _NEON2SSESTORAGE int16x4x2_t vtrn_s16(int16x4_t a, int16x4_t b); // VTRN.16 d0,d0
15669*80a68eefSBob Badour _NEON2SSE_INLINE int16x4x2_t vtrn_s16(int16x4_t a, int16x4_t b) // VTRN.16 d0,d0
15670*80a68eefSBob Badour {
15671*80a68eefSBob Badour     int16x4x2_t val;
15672*80a68eefSBob Badour     __m128i tmp, val0;
15673*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 static const int8_t maskdlv16[16] = {0,1, 2,3, 8,9, 10,11, 4,5, 6,7, 12,13, 14, 15};
15674*80a68eefSBob Badour     tmp = _mm_unpacklo_epi16(_pM128i(a), _pM128i(b)); //a0,b0,a1,b1,a2,b2,a3,b3
15675*80a68eefSBob Badour     val0 =  _mm_shuffle_epi8 (tmp, *(__m128i*)maskdlv16); //a0, b0, a2, b2, a1,b1, a3, b3
15676*80a68eefSBob Badour     vst1q_s16(val.val, val0); // _mm_shuffle_epi32 (val.val[0], _SWAP_HI_LOW32); //(a1,b1, a3,b3),(a0, b0, a2, b2),
15677*80a68eefSBob Badour     return val;
15678*80a68eefSBob Badour }
15679*80a68eefSBob Badour 
15680*80a68eefSBob Badour _NEON2SSESTORAGE int32x2x2_t vtrn_s32(int32x2_t a, int32x2_t b); // VTRN.32 d0,d0
15681*80a68eefSBob Badour _NEON2SSE_INLINE int32x2x2_t vtrn_s32(int32x2_t a, int32x2_t b)
15682*80a68eefSBob Badour {
15683*80a68eefSBob Badour     int32x2x2_t val;
15684*80a68eefSBob Badour     __m128i val0;
15685*80a68eefSBob Badour     val0 = _mm_unpacklo_epi32(_pM128i(a), _pM128i(b)); //a0,b0,a1,b1
15686*80a68eefSBob Badour     vst1q_s32(val.val, val0); // _mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32); //a1,b1, a0,b0,
15687*80a68eefSBob Badour     return val;
15688*80a68eefSBob Badour }
15689*80a68eefSBob Badour 
15690*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8x2_t vtrn_u8(uint8x8_t a, uint8x8_t b); // VTRN.8 d0,d0
15691*80a68eefSBob Badour #define vtrn_u8 vtrn_s8
15692*80a68eefSBob Badour 
15693*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4x2_t vtrn_u16(uint16x4_t a, uint16x4_t b); // VTRN.16 d0,d0
15694*80a68eefSBob Badour #define vtrn_u16 vtrn_s16
15695*80a68eefSBob Badour 
15696*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2x2_t vtrn_u32(uint32x2_t a, uint32x2_t b); // VTRN.32 d0,d0
15697*80a68eefSBob Badour #define vtrn_u32 vtrn_s32
15698*80a68eefSBob Badour 
15699*80a68eefSBob Badour _NEON2SSESTORAGE float32x2x2_t vtrn_f32(float32x2_t a, float32x2_t b); // VTRN.32 d0,d0
15700*80a68eefSBob Badour _NEON2SSE_INLINE float32x2x2_t vtrn_f32(float32x2_t a, float32x2_t b)
15701*80a68eefSBob Badour {
15702*80a68eefSBob Badour     float32x2x2_t val;
15703*80a68eefSBob Badour     val.val[0].m64_f32[0] = a.m64_f32[0];
15704*80a68eefSBob Badour     val.val[0].m64_f32[1] = b.m64_f32[0];
15705*80a68eefSBob Badour     val.val[1].m64_f32[0] = a.m64_f32[1];
15706*80a68eefSBob Badour     val.val[1].m64_f32[1] = b.m64_f32[1];
15707*80a68eefSBob Badour     return val; //a0,b0,a1,b1
15708*80a68eefSBob Badour }
15709*80a68eefSBob Badour 
15710*80a68eefSBob Badour _NEON2SSESTORAGE poly8x8x2_t vtrn_p8(poly8x8_t a, poly8x8_t b); // VTRN.8 d0,d0
15711*80a68eefSBob Badour #define  vtrn_p8 vtrn_u8
15712*80a68eefSBob Badour 
15713*80a68eefSBob Badour _NEON2SSESTORAGE poly16x4x2_t vtrn_p16(poly16x4_t a, poly16x4_t b); // VTRN.16 d0,d0
15714*80a68eefSBob Badour #define  vtrn_p16 vtrn_s16
15715*80a68eefSBob Badour 
15716*80a68eefSBob Badour //int8x16x2_t vtrnq_s8(int8x16_t a, int8x16_t b); // VTRN.8 q0,q0
15717*80a68eefSBob Badour _NEON2SSE_INLINE int8x16x2_t vtrnq_s8(int8x16_t a, int8x16_t b) // VTRN.8 q0,q0
15718*80a68eefSBob Badour {
15719*80a68eefSBob Badour     int8x16x2_t r8x16;
15720*80a68eefSBob Badour     __m128i a_sh, b_sh;
15721*80a68eefSBob Badour     a_sh = _mm_shuffle_epi8 (a, *(__m128i*)mask8_16_even_odd); //a0, a2, a4, a6, a8, a10, a12, a14, a1, a3, a5, a7, a9, a11, a13, a15
15722*80a68eefSBob Badour     b_sh = _mm_shuffle_epi8 (b, *(__m128i*)mask8_16_even_odd); //b0, b2, b4, b6, b8, b10, b12, b14, b1, b3, b5, b7, b9, b11, b13, b15
15723*80a68eefSBob Badour 
15724*80a68eefSBob Badour     r8x16.val[0] =  _mm_unpacklo_epi8(a_sh, b_sh); //(a0, b0, a2, b2, a4, b4, a6, b6, a8,b8, a10,b10, a12,b12, a14,b14)
15725*80a68eefSBob Badour     r8x16.val[1] =  _mm_unpackhi_epi8(a_sh, b_sh); // (a1, b1, a3, b3, a5, b5, a7, b7, a9,b9, a11,b11, a13,b13, a15,b15)
15726*80a68eefSBob Badour     return r8x16;
15727*80a68eefSBob Badour }
15728*80a68eefSBob Badour 
15729*80a68eefSBob Badour _NEON2SSESTORAGE int16x8x2_t vtrnq_s16(int16x8_t a, int16x8_t b); // VTRN.16 q0,q0
15730*80a68eefSBob Badour _NEON2SSE_INLINE int16x8x2_t vtrnq_s16(int16x8_t a, int16x8_t b) // VTRN.16 q0,q0
15731*80a68eefSBob Badour {
15732*80a68eefSBob Badour     int16x8x2_t v16x8;
15733*80a68eefSBob Badour     __m128i a_sh, b_sh;
15734*80a68eefSBob Badour     a_sh = _mm_shuffle_epi8 (a, *(__m128i*) mask8_32_even_odd); //a0, a2, a4, a6,  a1, a3, a5, a7
15735*80a68eefSBob Badour     b_sh = _mm_shuffle_epi8 (b, *(__m128i*) mask8_32_even_odd); //b0, b2, b4, b6,  b1, b3, b5, b7
15736*80a68eefSBob Badour     v16x8.val[0] = _mm_unpacklo_epi16(a_sh, b_sh); //a0, b0, a2, b2, a4, b4, a6, b6
15737*80a68eefSBob Badour     v16x8.val[1] = _mm_unpackhi_epi16(a_sh, b_sh); //a1, b1, a3, b3, a5, b5, a7, b7
15738*80a68eefSBob Badour     return v16x8;
15739*80a68eefSBob Badour }
15740*80a68eefSBob Badour 
15741*80a68eefSBob Badour _NEON2SSESTORAGE int32x4x2_t vtrnq_s32(int32x4_t a, int32x4_t b); // VTRN.32 q0,q0
15742*80a68eefSBob Badour _NEON2SSE_INLINE int32x4x2_t vtrnq_s32(int32x4_t a, int32x4_t b) // VTRN.32 q0,q0
15743*80a68eefSBob Badour {
15744*80a68eefSBob Badour     //may not be the optimal solution compared with the serial one
15745*80a68eefSBob Badour     int32x4x2_t v32x4;
15746*80a68eefSBob Badour     __m128i a_sh, b_sh;
15747*80a68eefSBob Badour     a_sh = _mm_shuffle_epi32 (a, 216); //a0, a2, a1, a3
15748*80a68eefSBob Badour     b_sh = _mm_shuffle_epi32 (b, 216); //b0, b2, b1, b3
15749*80a68eefSBob Badour 
15750*80a68eefSBob Badour     v32x4.val[0] = _mm_unpacklo_epi32(a_sh, b_sh); //a0, b0, a2, b2
15751*80a68eefSBob Badour     v32x4.val[1] = _mm_unpackhi_epi32(a_sh, b_sh); //a1, b1, a3,  b3
15752*80a68eefSBob Badour     return v32x4;
15753*80a68eefSBob Badour }
15754*80a68eefSBob Badour 
15755*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16x2_t vtrnq_u8(uint8x16_t a, uint8x16_t b); // VTRN.8 q0,q0
15756*80a68eefSBob Badour #define vtrnq_u8 vtrnq_s8
15757*80a68eefSBob Badour 
15758*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8x2_t vtrnq_u16(uint16x8_t a, uint16x8_t b); // VTRN.16 q0,q0
15759*80a68eefSBob Badour #define vtrnq_u16 vtrnq_s16
15760*80a68eefSBob Badour 
15761*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4x2_t vtrnq_u32(uint32x4_t a, uint32x4_t b); // VTRN.32 q0,q0
15762*80a68eefSBob Badour #define vtrnq_u32 vtrnq_s32
15763*80a68eefSBob Badour 
15764*80a68eefSBob Badour _NEON2SSESTORAGE float32x4x2_t vtrnq_f32(float32x4_t a, float32x4_t b); // VTRN.32 q0,q0
15765*80a68eefSBob Badour _NEON2SSE_INLINE float32x4x2_t vtrnq_f32(float32x4_t a, float32x4_t b) // VTRN.32 q0,q0
15766*80a68eefSBob Badour {
15767*80a68eefSBob Badour     //may not be the optimal solution compared with the serial one
15768*80a68eefSBob Badour     float32x4x2_t f32x4;
15769*80a68eefSBob Badour     __m128 a_sh, b_sh;
15770*80a68eefSBob Badour     a_sh = _mm_shuffle_ps (a, a, _MM_SHUFFLE(3,1, 2, 0)); //a0, a2, a1, a3, need to check endianness
15771*80a68eefSBob Badour     b_sh = _mm_shuffle_ps (b, b, _MM_SHUFFLE(3,1, 2, 0)); //b0, b2, b1, b3, need to check endianness
15772*80a68eefSBob Badour 
15773*80a68eefSBob Badour     f32x4.val[0] = _mm_unpacklo_ps(a_sh, b_sh); //a0, b0, a2, b2
15774*80a68eefSBob Badour     f32x4.val[1] = _mm_unpackhi_ps(a_sh, b_sh); //a1, b1, a3,  b3
15775*80a68eefSBob Badour     return f32x4;
15776*80a68eefSBob Badour }
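//A minimal usage sketch, not part of the original header, guarded by the assumed
//NEON2SSE_EXAMPLES macro: given rows (a0,a1,a2,a3) and (b0,b1,b2,b3), vtrnq_f32 returns
//val[0] = (a0,b0,a2,b2) and val[1] = (a1,b1,a3,b3), i.e. every 2 x 2 block is transposed.
#ifdef NEON2SSE_EXAMPLES
_NEON2SSE_INLINE void example_trn_rows_f32(float32x4_t row_a, float32x4_t row_b, float32x4_t* even_pairs, float32x4_t* odd_pairs)
{
    float32x4x2_t t = vtrnq_f32(row_a, row_b);
    *even_pairs = t.val[0]; //a0, b0, a2, b2
    *odd_pairs  = t.val[1]; //a1, b1, a3, b3
}
#endif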
15777*80a68eefSBob Badour 
15778*80a68eefSBob Badour _NEON2SSESTORAGE poly8x16x2_t vtrnq_p8(poly8x16_t a, poly8x16_t b); // VTRN.8 q0,q0
15779*80a68eefSBob Badour #define vtrnq_p8 vtrnq_s8
15780*80a68eefSBob Badour 
15781*80a68eefSBob Badour _NEON2SSESTORAGE poly16x8x2_t vtrnq_p16(poly16x8_t a, poly16x8_t b); // VTRN.16 q0,q0
15782*80a68eefSBob Badour #define vtrnq_p16 vtrnq_s16
15783*80a68eefSBob Badour 
15784*80a68eefSBob Badour //***************** Interleave elements ***************************
15785*80a68eefSBob Badour //*****************************************************************
15786*80a68eefSBob Badour //output has (a0,b0,a1,b1, a2,b2,.....)
15787*80a68eefSBob Badour _NEON2SSESTORAGE int8x8x2_t vzip_s8(int8x8_t a, int8x8_t b); // VZIP.8 d0,d0
15788*80a68eefSBob Badour _NEON2SSE_INLINE int8x8x2_t vzip_s8(int8x8_t a, int8x8_t b) // VZIP.8 d0,d0
15789*80a68eefSBob Badour {
15790*80a68eefSBob Badour     int8x8x2_t val;
15791*80a68eefSBob Badour     __m128i val0;
15792*80a68eefSBob Badour     val0 = _mm_unpacklo_epi8(_pM128i(a), _pM128i(b));
15793*80a68eefSBob Badour     vst1q_s8(val.val, val0); //_mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32);
15794*80a68eefSBob Badour     return val;
15795*80a68eefSBob Badour }
15796*80a68eefSBob Badour 
15797*80a68eefSBob Badour _NEON2SSESTORAGE int16x4x2_t vzip_s16(int16x4_t a, int16x4_t b); // VZIP.16 d0,d0
15798*80a68eefSBob Badour _NEON2SSE_INLINE int16x4x2_t vzip_s16(int16x4_t a, int16x4_t b) // VZIP.16 d0,d0
15799*80a68eefSBob Badour {
15800*80a68eefSBob Badour     int16x4x2_t val;
15801*80a68eefSBob Badour     __m128i val0;
15802*80a68eefSBob Badour     val0 = _mm_unpacklo_epi16(_pM128i(a), _pM128i(b));
15803*80a68eefSBob Badour     vst1q_s16(val.val, val0); // _mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32);
15804*80a68eefSBob Badour     return val;
15805*80a68eefSBob Badour }
15806*80a68eefSBob Badour 
15807*80a68eefSBob Badour _NEON2SSESTORAGE int32x2x2_t vzip_s32(int32x2_t a, int32x2_t b); // VZIP.32 d0,d0
15808*80a68eefSBob Badour #define vzip_s32 vtrn_s32
15809*80a68eefSBob Badour 
15810*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8x2_t vzip_u8(uint8x8_t a, uint8x8_t b); // VZIP.8 d0,d0
15811*80a68eefSBob Badour #define vzip_u8 vzip_s8
15812*80a68eefSBob Badour 
15813*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4x2_t vzip_u16(uint16x4_t a, uint16x4_t b); // VZIP.16 d0,d0
15814*80a68eefSBob Badour #define vzip_u16 vzip_s16
15815*80a68eefSBob Badour 
15816*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2x2_t vzip_u32(uint32x2_t a, uint32x2_t b); // VZIP.32 d0,d0
15817*80a68eefSBob Badour #define vzip_u32 vzip_s32
15818*80a68eefSBob Badour 
15819*80a68eefSBob Badour _NEON2SSESTORAGE float32x2x2_t vzip_f32(float32x2_t a, float32x2_t b); // VZIP.32 d0,d0
15820*80a68eefSBob Badour #define vzip_f32 vtrn_f32
15821*80a68eefSBob Badour 
15822*80a68eefSBob Badour _NEON2SSESTORAGE poly8x8x2_t vzip_p8(poly8x8_t a, poly8x8_t b); // VZIP.8 d0,d0
15823*80a68eefSBob Badour #define vzip_p8 vzip_u8
15824*80a68eefSBob Badour 
15825*80a68eefSBob Badour _NEON2SSESTORAGE poly16x4x2_t vzip_p16(poly16x4_t a, poly16x4_t b); // VZIP.16 d0,d0
15826*80a68eefSBob Badour #define vzip_p16 vzip_u16
15827*80a68eefSBob Badour 
15828*80a68eefSBob Badour _NEON2SSESTORAGE int8x16x2_t vzipq_s8(int8x16_t a, int8x16_t b); // VZIP.8 q0,q0
15829*80a68eefSBob Badour _NEON2SSE_INLINE int8x16x2_t vzipq_s8(int8x16_t a, int8x16_t b) // VZIP.8 q0,q0
15830*80a68eefSBob Badour {
15831*80a68eefSBob Badour     int8x16x2_t r8x16;
15832*80a68eefSBob Badour     r8x16.val[0] =  _mm_unpacklo_epi8(a, b);
15833*80a68eefSBob Badour     r8x16.val[1] =  _mm_unpackhi_epi8(a, b);
15834*80a68eefSBob Badour     return r8x16;
15835*80a68eefSBob Badour }
15836*80a68eefSBob Badour 
15837*80a68eefSBob Badour _NEON2SSESTORAGE int16x8x2_t vzipq_s16(int16x8_t a, int16x8_t b); // VZIP.16 q0,q0
15838*80a68eefSBob Badour _NEON2SSE_INLINE int16x8x2_t vzipq_s16(int16x8_t a, int16x8_t b) // VZIP.16 q0,q0
15839*80a68eefSBob Badour {
15840*80a68eefSBob Badour     int16x8x2_t r16x8;
15841*80a68eefSBob Badour     r16x8.val[0] =  _mm_unpacklo_epi16(a, b);
15842*80a68eefSBob Badour     r16x8.val[1] =  _mm_unpackhi_epi16(a, b);
15843*80a68eefSBob Badour     return r16x8;
15844*80a68eefSBob Badour }
15845*80a68eefSBob Badour 
15846*80a68eefSBob Badour _NEON2SSESTORAGE int32x4x2_t vzipq_s32(int32x4_t a, int32x4_t b); // VZIP.32 q0,q0
15847*80a68eefSBob Badour _NEON2SSE_INLINE int32x4x2_t vzipq_s32(int32x4_t a, int32x4_t b) // VZIP.32 q0,q0
15848*80a68eefSBob Badour {
15849*80a68eefSBob Badour     int32x4x2_t r32x4;
15850*80a68eefSBob Badour     r32x4.val[0] =  _mm_unpacklo_epi32(a, b);
15851*80a68eefSBob Badour     r32x4.val[1] =  _mm_unpackhi_epi32(a, b);
15852*80a68eefSBob Badour     return r32x4;
15853*80a68eefSBob Badour }
15854*80a68eefSBob Badour 
15855*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16x2_t vzipq_u8(uint8x16_t a, uint8x16_t b); // VZIP.8 q0,q0
15856*80a68eefSBob Badour #define vzipq_u8 vzipq_s8
15857*80a68eefSBob Badour 
15858*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8x2_t vzipq_u16(uint16x8_t a, uint16x8_t b); // VZIP.16 q0,q0
15859*80a68eefSBob Badour #define vzipq_u16 vzipq_s16
15860*80a68eefSBob Badour 
15861*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4x2_t vzipq_u32(uint32x4_t a, uint32x4_t b); // VZIP.32 q0,q0
15862*80a68eefSBob Badour #define vzipq_u32 vzipq_s32
15863*80a68eefSBob Badour 
15864*80a68eefSBob Badour _NEON2SSESTORAGE float32x4x2_t vzipq_f32(float32x4_t a, float32x4_t b); // VZIP.32 q0,q0
15865*80a68eefSBob Badour _NEON2SSE_INLINE float32x4x2_t vzipq_f32(float32x4_t a, float32x4_t b) // VZIP.32 q0,q0
15866*80a68eefSBob Badour {
15867*80a68eefSBob Badour     float32x4x2_t f32x4;
15868*80a68eefSBob Badour     f32x4.val[0] =   _mm_unpacklo_ps ( a,  b);
15869*80a68eefSBob Badour     f32x4.val[1] =   _mm_unpackhi_ps ( a,  b);
15870*80a68eefSBob Badour     return f32x4;
15871*80a68eefSBob Badour }
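//A minimal usage sketch, not part of the original header, guarded by the assumed
//NEON2SSE_EXAMPLES macro: vzipq_f32 interleaves two planes into pairs, e.g. packing
//separate real and imaginary vectors into (re0,im0,re1,im1) and (re2,im2,re3,im3).
#ifdef NEON2SSE_EXAMPLES
_NEON2SSE_INLINE float32x4x2_t example_pack_complex_f32(float32x4_t re, float32x4_t im)
{
    return vzipq_f32(re, im); //val[0] = re0,im0,re1,im1;  val[1] = re2,im2,re3,im3
}
#endif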
15872*80a68eefSBob Badour 
15873*80a68eefSBob Badour _NEON2SSESTORAGE poly8x16x2_t vzipq_p8(poly8x16_t a, poly8x16_t b); // VZIP.8 q0,q0
15874*80a68eefSBob Badour #define vzipq_p8 vzipq_u8
15875*80a68eefSBob Badour 
15876*80a68eefSBob Badour _NEON2SSESTORAGE poly16x8x2_t vzipq_p16(poly16x8_t a, poly16x8_t b); // VZIP.16 q0,q0
15877*80a68eefSBob Badour #define vzipq_p16 vzipq_u16
15878*80a68eefSBob Badour 
15879*80a68eefSBob Badour //*********************** De-Interleave elements *************************
15880*80a68eefSBob Badour //*************************************************************************
15881*80a68eefSBob Badour //As the result of these functions the first val contains (a0,a2,a4,...,b0,b2,b4,...) and the second val contains (a1,a3,a5,...,b1,b3,b5,...)
15882*80a68eefSBob Badour //there are no such functions in IA32 SIMD, so a shuffle is required
15883*80a68eefSBob Badour _NEON2SSESTORAGE int8x8x2_t vuzp_s8(int8x8_t a, int8x8_t b); // VUZP.8 d0,d0
15884*80a68eefSBob Badour _NEON2SSE_INLINE int8x8x2_t vuzp_s8(int8x8_t a, int8x8_t b) // VUZP.8 d0,d0
15885*80a68eefSBob Badour {
15886*80a68eefSBob Badour     int8x8x2_t val;
15887*80a68eefSBob Badour     __m128i tmp, val0;
15888*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 static const int8_t maskdlv8[16] = { 0, 4, 8, 12, 1, 5, 9, 13,  2, 6, 10, 14, 3, 7, 11,15};
15889*80a68eefSBob Badour     tmp = _mm_unpacklo_epi8(_pM128i(a), _pM128i(b)); //a0,b0,a1,b1,a2,b2,a3,b3,...,a7,b7
15890*80a68eefSBob Badour     val0 = _mm_shuffle_epi8 (tmp, *(__m128i*)maskdlv8); //(a0, a2, a4, a6, b0, b2, b4, b6),  (a1, a3, a5, a7, b1,b3, b5, b7)
15891*80a68eefSBob Badour     vst1q_s8(val.val, val0); // _mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32);
15892*80a68eefSBob Badour     return val;
15893*80a68eefSBob Badour }
15894*80a68eefSBob Badour 
15895*80a68eefSBob Badour _NEON2SSESTORAGE int16x4x2_t vuzp_s16(int16x4_t a, int16x4_t b); // VUZP.16 d0,d0
15896*80a68eefSBob Badour _NEON2SSE_INLINE int16x4x2_t vuzp_s16(int16x4_t a, int16x4_t b) // VUZP.16 d0,d0
15897*80a68eefSBob Badour {
15898*80a68eefSBob Badour     int16x4x2_t val;
15899*80a68eefSBob Badour     __m128i tmp, val0;
15900*80a68eefSBob Badour     _NEON2SSE_ALIGN_16 static const int8_t maskdlv16[16] = {0,1,  8,9,  2,3, 10,11,  4,5, 12,13, 6,7, 14,15};
15901*80a68eefSBob Badour     tmp = _mm_unpacklo_epi16(_pM128i(a), _pM128i(b)); //a0,b0,a1,b1,a2,b2,a3,b3
15902*80a68eefSBob Badour     val0 = _mm_shuffle_epi8 (tmp, *(__m128i*)maskdlv16); //a0,a2, b0, b2, a1,a3, b1,b3
15903*80a68eefSBob Badour     vst1q_s16(val.val, val0); // _mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32);
15904*80a68eefSBob Badour     return val;
15905*80a68eefSBob Badour }
15906*80a68eefSBob Badour 
15907*80a68eefSBob Badour _NEON2SSESTORAGE int32x2x2_t vuzp_s32(int32x2_t a, int32x2_t b); // VUZP.32 d0,d0
15908*80a68eefSBob Badour _NEON2SSE_INLINE int32x2x2_t vuzp_s32(int32x2_t a, int32x2_t b) // VUZP.32 d0,d0
15909*80a68eefSBob Badour {
15910*80a68eefSBob Badour     int32x2x2_t val;
15911*80a68eefSBob Badour     __m128i val0;
15912*80a68eefSBob Badour     val0 = _mm_unpacklo_epi32(_pM128i(a), _pM128i(b)); //a0,b0, a1,b1
15913*80a68eefSBob Badour     vst1q_s32(val.val, val0); // _mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32);
15914*80a68eefSBob Badour     return val;
15915*80a68eefSBob Badour }
15916*80a68eefSBob Badour 
15917*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8x2_t vuzp_u8(uint8x8_t a, uint8x8_t b); // VUZP.8 d0,d0
15918*80a68eefSBob Badour #define vuzp_u8 vuzp_s8
15919*80a68eefSBob Badour 
15920*80a68eefSBob Badour _NEON2SSESTORAGE uint16x4x2_t vuzp_u16(uint16x4_t a, uint16x4_t b); // VUZP.16 d0,d0
15921*80a68eefSBob Badour #define vuzp_u16 vuzp_s16
15922*80a68eefSBob Badour 
15923*80a68eefSBob Badour _NEON2SSESTORAGE uint32x2x2_t vuzp_u32(uint32x2_t a, uint32x2_t b); // VUZP.32 d0,d0
15924*80a68eefSBob Badour #define vuzp_u32 vuzp_s32
15925*80a68eefSBob Badour 
15926*80a68eefSBob Badour _NEON2SSESTORAGE float32x2x2_t vuzp_f32(float32x2_t a, float32x2_t b); // VUZP.32 d0,d0
15927*80a68eefSBob Badour #define vuzp_f32 vzip_f32
15928*80a68eefSBob Badour 
15929*80a68eefSBob Badour _NEON2SSESTORAGE poly8x8x2_t vuzp_p8(poly8x8_t a, poly8x8_t b); // VUZP.8 d0,d0
15930*80a68eefSBob Badour #define vuzp_p8 vuzp_u8
15931*80a68eefSBob Badour 
15932*80a68eefSBob Badour _NEON2SSESTORAGE poly16x4x2_t vuzp_p16(poly16x4_t a, poly16x4_t b); // VUZP.16 d0,d0
15933*80a68eefSBob Badour #define vuzp_p16 vuzp_u16
15934*80a68eefSBob Badour 
15935*80a68eefSBob Badour _NEON2SSESTORAGE int8x16x2_t vuzpq_s8(int8x16_t a, int8x16_t b); // VUZP.8 q0,q0
15936*80a68eefSBob Badour _NEON2SSE_INLINE int8x16x2_t vuzpq_s8(int8x16_t a, int8x16_t b) // VUZP.8 q0,q0
15937*80a68eefSBob Badour {
15938*80a68eefSBob Badour     int8x16x2_t v8x16;
15939*80a68eefSBob Badour     __m128i a_sh, b_sh;
15940*80a68eefSBob Badour     a_sh = _mm_shuffle_epi8 (a, *(__m128i*)mask8_16_even_odd); //a0, a2, a4, a6, a8, a10, a12, a14, a1, a3, a5, a7, a9, a11, a13, a15
15941*80a68eefSBob Badour     b_sh = _mm_shuffle_epi8 (b, *(__m128i*)mask8_16_even_odd); //b0, b2, b4, b6, b8, b10, b12, b14, b1, b3, b5, b7, b9, b11, b13, b15
15942*80a68eefSBob Badour     //we need unpack64 to combine the lower (upper) 64 bits of a with the lower (upper) 64 bits of b
15943*80a68eefSBob Badour     v8x16.val[0] = _mm_unpacklo_epi64(a_sh, b_sh); ///a0, a2, a4, a6, a8, a10, a12, a14,  b0, b2, b4, b6, b8, b10, b12, b14,
15944*80a68eefSBob Badour     v8x16.val[1] = _mm_unpackhi_epi64(a_sh, b_sh); //a1, a3, a5, a7, a9, a11, a13, a15,  b1, b3, b5, b7, b9, b11, b13, b15
15945*80a68eefSBob Badour     return v8x16;
15946*80a68eefSBob Badour }
15947*80a68eefSBob Badour 
15948*80a68eefSBob Badour _NEON2SSESTORAGE int16x8x2_t vuzpq_s16(int16x8_t a, int16x8_t b); // VUZP.16 q0,q0
15949*80a68eefSBob Badour _NEON2SSE_INLINE int16x8x2_t vuzpq_s16(int16x8_t a, int16x8_t b) // VUZP.16 q0,q0
15950*80a68eefSBob Badour {
15951*80a68eefSBob Badour     int16x8x2_t v16x8;
15952*80a68eefSBob Badour     __m128i a_sh, b_sh;
15953*80a68eefSBob Badour     a_sh = _mm_shuffle_epi8 (a, *(__m128i*)mask8_32_even_odd); //a0, a2, a4, a6,  a1, a3, a5, a7
15954*80a68eefSBob Badour     b_sh = _mm_shuffle_epi8 (b, *(__m128i*)mask8_32_even_odd); //b0, b2, b4, b6,  b1, b3, b5, b7
15955*80a68eefSBob Badour     v16x8.val[0] = _mm_unpacklo_epi64(a_sh, b_sh); //a0, a2, a4, a6, b0, b2, b4, b6
15956*80a68eefSBob Badour     v16x8.val[1] = _mm_unpackhi_epi64(a_sh, b_sh); //a1, a3, a5, a7, b1, b3, b5, b7
15957*80a68eefSBob Badour     return v16x8;
15958*80a68eefSBob Badour }
15959*80a68eefSBob Badour 
15960*80a68eefSBob Badour _NEON2SSESTORAGE int32x4x2_t vuzpq_s32(int32x4_t a, int32x4_t b); // VUZP.32 q0,q0
15961*80a68eefSBob Badour _NEON2SSE_INLINE int32x4x2_t vuzpq_s32(int32x4_t a, int32x4_t b) // VUZP.32 q0,q0
15962*80a68eefSBob Badour {
15963*80a68eefSBob Badour     //may not be the optimal solution compared with the serial one
15964*80a68eefSBob Badour     int32x4x2_t v32x4;
15965*80a68eefSBob Badour     __m128i a_sh, b_sh;
15966*80a68eefSBob Badour     a_sh = _mm_shuffle_epi32 (a, 216); //a0, a2, a1, a3
15967*80a68eefSBob Badour     b_sh = _mm_shuffle_epi32 (b, 216); //b0, b2, b1, b3
15968*80a68eefSBob Badour 
15969*80a68eefSBob Badour     v32x4.val[0] = _mm_unpacklo_epi64(a_sh, b_sh); //a0, a2, b0, b2
15970*80a68eefSBob Badour     v32x4.val[1] = _mm_unpackhi_epi64(a_sh, b_sh); //a1, a3, b1, b3
15971*80a68eefSBob Badour     return v32x4;
15972*80a68eefSBob Badour }
15973*80a68eefSBob Badour 
15974*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16x2_t vuzpq_u8(uint8x16_t a, uint8x16_t b); // VUZP.8 q0,q0
15975*80a68eefSBob Badour #define vuzpq_u8 vuzpq_s8
15976*80a68eefSBob Badour 
15977*80a68eefSBob Badour _NEON2SSESTORAGE uint16x8x2_t vuzpq_u16(uint16x8_t a, uint16x8_t b); // VUZP.16 q0,q0
15978*80a68eefSBob Badour #define vuzpq_u16 vuzpq_s16
15979*80a68eefSBob Badour 
15980*80a68eefSBob Badour _NEON2SSESTORAGE uint32x4x2_t vuzpq_u32(uint32x4_t a, uint32x4_t b); // VUZP.32 q0,q0
15981*80a68eefSBob Badour #define vuzpq_u32 vuzpq_s32
15982*80a68eefSBob Badour 
15983*80a68eefSBob Badour _NEON2SSESTORAGE float32x4x2_t vuzpq_f32(float32x4_t a, float32x4_t b); // VUZP.32 q0,q0
15984*80a68eefSBob Badour _NEON2SSE_INLINE float32x4x2_t vuzpq_f32(float32x4_t a, float32x4_t b) // VUZP.32 q0,q0
15985*80a68eefSBob Badour {
15986*80a68eefSBob Badour     float32x4x2_t v32x4;
15987*80a68eefSBob Badour     v32x4.val[0] = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2,0, 2, 0)); //a0, a2, b0, b2, need to check endianness however
15988*80a68eefSBob Badour     v32x4.val[1] = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3,1, 3, 1)); //a1, a3, b1, b3, need to check endianness however
15989*80a68eefSBob Badour     return v32x4;
15990*80a68eefSBob Badour }
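//A minimal usage sketch, not part of the original header, guarded by the assumed
//NEON2SSE_EXAMPLES macro: vuzpq_f32 is the inverse of vzipq_f32 and splits packed
//(re,im) pairs back into separate real and imaginary planes.
#ifdef NEON2SSE_EXAMPLES
_NEON2SSE_INLINE void example_unpack_complex_f32(float32x4_t lo_pairs, float32x4_t hi_pairs, float32x4_t* re, float32x4_t* im)
{
    float32x4x2_t planes = vuzpq_f32(lo_pairs, hi_pairs);
    *re = planes.val[0]; //re0, re1, re2, re3
    *im = planes.val[1]; //im0, im1, im2, im3
}
#endif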
15991*80a68eefSBob Badour 
15992*80a68eefSBob Badour _NEON2SSESTORAGE poly8x16x2_t vuzpq_p8(poly8x16_t a, poly8x16_t b); // VUZP.8 q0,q0
15993*80a68eefSBob Badour #define vuzpq_p8 vuzpq_u8
15994*80a68eefSBob Badour 
15995*80a68eefSBob Badour _NEON2SSESTORAGE poly16x8x2_t vuzpq_p16(poly16x8_t a, poly16x8_t b); // VUZP.16 q0,q0
15996*80a68eefSBob Badour #define vuzpq_p16 vuzpq_u16
15997*80a68eefSBob Badour 
15998*80a68eefSBob Badour //##############################################################################################
15999*80a68eefSBob Badour //*********************** Reinterpret cast intrinsics.******************************************
16000*80a68eefSBob Badour //##############################################################################################
16001*80a68eefSBob Badour // Not a part of the official NEON instruction set but available in the gcc compiler *********************
16002*80a68eefSBob Badour _NEON2SSESTORAGE poly8x8_t vreinterpret_p8_u32 (uint32x2_t t);
16003*80a68eefSBob Badour #define vreinterpret_p8_u32
16004*80a68eefSBob Badour 
16005*80a68eefSBob Badour _NEON2SSESTORAGE poly8x8_t vreinterpret_p8_u16 (uint16x4_t t);
16006*80a68eefSBob Badour #define vreinterpret_p8_u16
16007*80a68eefSBob Badour 
16008*80a68eefSBob Badour _NEON2SSESTORAGE poly8x8_t vreinterpret_p8_u8 (uint8x8_t t);
16009*80a68eefSBob Badour #define vreinterpret_p8_u8
16010*80a68eefSBob Badour 
16011*80a68eefSBob Badour _NEON2SSESTORAGE poly8x8_t vreinterpret_p8_s32 (int32x2_t t);
16012*80a68eefSBob Badour #define vreinterpret_p8_s32
16013*80a68eefSBob Badour 
16014*80a68eefSBob Badour _NEON2SSESTORAGE poly8x8_t vreinterpret_p8_s16 (int16x4_t t);
16015*80a68eefSBob Badour #define vreinterpret_p8_s16
16016*80a68eefSBob Badour 
16017*80a68eefSBob Badour _NEON2SSESTORAGE poly8x8_t vreinterpret_p8_s8 (int8x8_t t);
16018*80a68eefSBob Badour #define vreinterpret_p8_s8
16019*80a68eefSBob Badour 
16020*80a68eefSBob Badour _NEON2SSESTORAGE poly8x8_t vreinterpret_p8_u64 (uint64x1_t t);
16021*80a68eefSBob Badour #define vreinterpret_p8_u64
16022*80a68eefSBob Badour 
16023*80a68eefSBob Badour _NEON2SSESTORAGE poly8x8_t vreinterpret_p8_s64 (int64x1_t t);
16024*80a68eefSBob Badour #define vreinterpret_p8_s64
16025*80a68eefSBob Badour 
16026*80a68eefSBob Badour _NEON2SSESTORAGE poly8x8_t vreinterpret_p8_f32 (float32x2_t t);
16027*80a68eefSBob Badour #define vreinterpret_p8_f32
16028*80a68eefSBob Badour 
16029*80a68eefSBob Badour _NEON2SSESTORAGE poly8x8_t vreinterpret_p8_p16 (poly16x4_t t);
16030*80a68eefSBob Badour #define vreinterpret_p8_p16
16031*80a68eefSBob Badour 
16032*80a68eefSBob Badour _NEON2SSESTORAGE poly8x16_t vreinterpretq_p8_u32 (uint32x4_t t);
16033*80a68eefSBob Badour #define vreinterpretq_p8_u32
16034*80a68eefSBob Badour 
16035*80a68eefSBob Badour _NEON2SSESTORAGE poly8x16_t vreinterpretq_p8_u16 (uint16x8_t t);
16036*80a68eefSBob Badour #define vreinterpretq_p8_u16
16037*80a68eefSBob Badour 
16038*80a68eefSBob Badour _NEON2SSESTORAGE poly8x16_t vreinterpretq_p8_u8 (uint8x16_t t);
16039*80a68eefSBob Badour #define vreinterpretq_p8_u8
16040*80a68eefSBob Badour 
16041*80a68eefSBob Badour _NEON2SSESTORAGE poly8x16_t vreinterpretq_p8_s32 (int32x4_t t);
16042*80a68eefSBob Badour #define vreinterpretq_p8_s32
16043*80a68eefSBob Badour 
16044*80a68eefSBob Badour _NEON2SSESTORAGE poly8x16_t vreinterpretq_p8_s16 (int16x8_t t);
16045*80a68eefSBob Badour #define vreinterpretq_p8_s16
16046*80a68eefSBob Badour 
16047*80a68eefSBob Badour _NEON2SSESTORAGE poly8x16_t vreinterpretq_p8_s8 (int8x16_t t);
16048*80a68eefSBob Badour #define vreinterpretq_p8_s8
16049*80a68eefSBob Badour 
16050*80a68eefSBob Badour _NEON2SSESTORAGE poly8x16_t vreinterpretq_p8_u64 (uint64x2_t t);
16051*80a68eefSBob Badour #define vreinterpretq_p8_u64
16052*80a68eefSBob Badour 
16053*80a68eefSBob Badour _NEON2SSESTORAGE poly8x16_t vreinterpretq_p8_s64 (int64x2_t t);
16054*80a68eefSBob Badour #define vreinterpretq_p8_s64
16055*80a68eefSBob Badour 
16056*80a68eefSBob Badour _NEON2SSESTORAGE poly8x16_t vreinterpretq_p8_f32 (float32x4_t t);
16057*80a68eefSBob Badour #define vreinterpretq_p8_f32(t) _M128i(t)
16058*80a68eefSBob Badour 
16059*80a68eefSBob Badour _NEON2SSESTORAGE poly8x16_t vreinterpretq_p8_p16 (poly16x8_t t);
16060*80a68eefSBob Badour #define vreinterpretq_p8_p16
16061*80a68eefSBob Badour 
16062*80a68eefSBob Badour _NEON2SSESTORAGE poly16x4_t vreinterpret_p16_u32 (uint32x2_t t);
16063*80a68eefSBob Badour #define vreinterpret_p16_u32
16064*80a68eefSBob Badour 
16065*80a68eefSBob Badour _NEON2SSESTORAGE poly16x4_t vreinterpret_p16_u16 (uint16x4_t t);
16066*80a68eefSBob Badour #define vreinterpret_p16_u16
16067*80a68eefSBob Badour 
16068*80a68eefSBob Badour _NEON2SSESTORAGE poly16x4_t vreinterpret_p16_u8 (uint8x8_t t);
16069*80a68eefSBob Badour #define vreinterpret_p16_u8
16070*80a68eefSBob Badour 
16071*80a68eefSBob Badour _NEON2SSESTORAGE poly16x4_t vreinterpret_p16_s32 (int32x2_t t);
16072*80a68eefSBob Badour #define vreinterpret_p16_s32
16073*80a68eefSBob Badour 
16074*80a68eefSBob Badour _NEON2SSESTORAGE poly16x4_t vreinterpret_p16_s16 (int16x4_t t);
16075*80a68eefSBob Badour #define vreinterpret_p16_s16
16076*80a68eefSBob Badour 
16077*80a68eefSBob Badour _NEON2SSESTORAGE poly16x4_t vreinterpret_p16_s8 (int8x8_t t);
16078*80a68eefSBob Badour #define vreinterpret_p16_s8
16079*80a68eefSBob Badour 
16080*80a68eefSBob Badour _NEON2SSESTORAGE poly16x4_t vreinterpret_p16_u64 (uint64x1_t t);
16081*80a68eefSBob Badour #define vreinterpret_p16_u64
16082*80a68eefSBob Badour 
16083*80a68eefSBob Badour _NEON2SSESTORAGE poly16x4_t vreinterpret_p16_s64 (int64x1_t t);
16084*80a68eefSBob Badour #define vreinterpret_p16_s64
16085*80a68eefSBob Badour 
16086*80a68eefSBob Badour _NEON2SSESTORAGE poly16x4_t vreinterpret_p16_f32 (float32x2_t t);
16087*80a68eefSBob Badour #define vreinterpret_p16_f32
16088*80a68eefSBob Badour 
16089*80a68eefSBob Badour _NEON2SSESTORAGE poly16x4_t vreinterpret_p16_p8 (poly8x8_t t);
16090*80a68eefSBob Badour #define vreinterpret_p16_p8
16091*80a68eefSBob Badour 
16092*80a68eefSBob Badour _NEON2SSESTORAGE poly16x8_t vreinterpretq_p16_u32 (uint32x4_t t);
16093*80a68eefSBob Badour #define vreinterpretq_p16_u32
16094*80a68eefSBob Badour 
16095*80a68eefSBob Badour _NEON2SSESTORAGE poly16x8_t vreinterpretq_p16_u16 (uint16x8_t t);
16096*80a68eefSBob Badour #define vreinterpretq_p16_u16
16097*80a68eefSBob Badour 
16098*80a68eefSBob Badour _NEON2SSESTORAGE poly16x8_t vreinterpretq_p16_s32 (int32x4_t t);
16099*80a68eefSBob Badour #define vreinterpretq_p16_s32
16100*80a68eefSBob Badour 
16101*80a68eefSBob Badour _NEON2SSESTORAGE poly16x8_t vreinterpretq_p16_s16 (int16x8_t t);
16102*80a68eefSBob Badour #define vreinterpretq_p16_s16
16103*80a68eefSBob Badour 
16104*80a68eefSBob Badour _NEON2SSESTORAGE poly16x8_t vreinterpretq_p16_s8 (int8x16_t t);
16105*80a68eefSBob Badour #define vreinterpretq_p16_s8
16106*80a68eefSBob Badour 
16107*80a68eefSBob Badour _NEON2SSESTORAGE poly16x8_t vreinterpretq_p16_u64 (uint64x2_t t);
16108*80a68eefSBob Badour #define vreinterpretq_p16_u64
16109*80a68eefSBob Badour 
16110*80a68eefSBob Badour _NEON2SSESTORAGE poly16x8_t vreinterpretq_p16_s64 (int64x2_t t);
16111*80a68eefSBob Badour #define vreinterpretq_p16_s64
16112*80a68eefSBob Badour 
16113*80a68eefSBob Badour _NEON2SSESTORAGE poly16x8_t vreinterpretq_p16_f32 (float32x4_t t);
16114*80a68eefSBob Badour #define vreinterpretq_p16_f32(t) _M128i(t)
16115*80a68eefSBob Badour 
16116*80a68eefSBob Badour _NEON2SSESTORAGE poly16x8_t vreinterpretq_p16_p8 (poly8x16_t t);
16117*80a68eefSBob Badour #define vreinterpretq_p16_p8  vreinterpretq_s16_p8
16118*80a68eefSBob Badour 
16119*80a68eefSBob Badour //****  Integer to float  ******
16120*80a68eefSBob Badour _NEON2SSESTORAGE float32x2_t vreinterpret_f32_u32 (uint32x2_t t);
16121*80a68eefSBob Badour _NEON2SSE_INLINE float32x2_t vreinterpret_f32_u32 (uint32x2_t t)
16122*80a68eefSBob Badour {
16123*80a68eefSBob Badour     return (*(__m64_128*)&(t));
16124*80a68eefSBob Badour }
16125*80a68eefSBob Badour 
16126*80a68eefSBob Badour _NEON2SSESTORAGE float32x2_t vreinterpret_f32_u16 (uint16x4_t t);
16127*80a68eefSBob Badour #define vreinterpret_f32_u16 vreinterpret_f32_u32
16128*80a68eefSBob Badour 
16129*80a68eefSBob Badour 
16130*80a68eefSBob Badour _NEON2SSESTORAGE float32x2_t vreinterpret_f32_u8 (uint8x8_t t);
16131*80a68eefSBob Badour #define vreinterpret_f32_u8 vreinterpret_f32_u32
16132*80a68eefSBob Badour 
16133*80a68eefSBob Badour 
16134*80a68eefSBob Badour _NEON2SSESTORAGE float32x2_t vreinterpret_f32_s32 (int32x2_t t);
16135*80a68eefSBob Badour #define vreinterpret_f32_s32 vreinterpret_f32_u32
16136*80a68eefSBob Badour 
16137*80a68eefSBob Badour 
16138*80a68eefSBob Badour _NEON2SSESTORAGE float32x2_t vreinterpret_f32_s16 (int16x4_t t);
16139*80a68eefSBob Badour #define vreinterpret_f32_s16 vreinterpret_f32_u32
16140*80a68eefSBob Badour 
16141*80a68eefSBob Badour _NEON2SSESTORAGE float32x2_t vreinterpret_f32_s8 (int8x8_t t);
16142*80a68eefSBob Badour #define vreinterpret_f32_s8 vreinterpret_f32_u32
16143*80a68eefSBob Badour 
16144*80a68eefSBob Badour 
16145*80a68eefSBob Badour _NEON2SSESTORAGE float32x2_t vreinterpret_f32_u64(uint64x1_t t);
16146*80a68eefSBob Badour #define vreinterpret_f32_u64 vreinterpret_f32_u32
16147*80a68eefSBob Badour 
16148*80a68eefSBob Badour 
16149*80a68eefSBob Badour _NEON2SSESTORAGE float32x2_t vreinterpret_f32_s64 (int64x1_t t);
16150*80a68eefSBob Badour #define vreinterpret_f32_s64 vreinterpret_f32_u32
16151*80a68eefSBob Badour 
16152*80a68eefSBob Badour 
16153*80a68eefSBob Badour _NEON2SSESTORAGE float32x2_t vreinterpret_f32_p16 (poly16x4_t t);
16154*80a68eefSBob Badour #define vreinterpret_f32_p16 vreinterpret_f32_u32
16155*80a68eefSBob Badour 
16156*80a68eefSBob Badour _NEON2SSESTORAGE float32x2_t vreinterpret_f32_p8 (poly8x8_t t);
16157*80a68eefSBob Badour #define vreinterpret_f32_p8 vreinterpret_f32_u32
16158*80a68eefSBob Badour 
16159*80a68eefSBob Badour _NEON2SSESTORAGE float32x4_t vreinterpretq_f32_u32 (uint32x4_t t);
16160*80a68eefSBob Badour #define  vreinterpretq_f32_u32(t) _M128(t)
16161*80a68eefSBob Badour 
16162*80a68eefSBob Badour _NEON2SSESTORAGE float32x4_t vreinterpretq_f32_u16 (uint16x8_t t);
16163*80a68eefSBob Badour #define vreinterpretq_f32_u16 vreinterpretq_f32_u32
16164*80a68eefSBob Badour 
16165*80a68eefSBob Badour _NEON2SSESTORAGE float32x4_t vreinterpretq_f32_u8 (uint8x16_t t);
16166*80a68eefSBob Badour #define vreinterpretq_f32_u8 vreinterpretq_f32_u32
16167*80a68eefSBob Badour 
16168*80a68eefSBob Badour _NEON2SSESTORAGE float32x4_t vreinterpretq_f32_s32 (int32x4_t t);
16169*80a68eefSBob Badour #define vreinterpretq_f32_s32 vreinterpretq_f32_u32
16170*80a68eefSBob Badour 
16171*80a68eefSBob Badour _NEON2SSESTORAGE float32x4_t vreinterpretq_f32_s16 (int16x8_t t);
16172*80a68eefSBob Badour #define vreinterpretq_f32_s16 vreinterpretq_f32_u32
16173*80a68eefSBob Badour 
16174*80a68eefSBob Badour _NEON2SSESTORAGE float32x4_t vreinterpretq_f32_s8 (int8x16_t t);
16175*80a68eefSBob Badour #define vreinterpretq_f32_s8 vreinterpretq_f32_u32
16176*80a68eefSBob Badour 
16177*80a68eefSBob Badour _NEON2SSESTORAGE float32x4_t vreinterpretq_f32_u64 (uint64x2_t t);
16178*80a68eefSBob Badour #define vreinterpretq_f32_u64 vreinterpretq_f32_u32
16179*80a68eefSBob Badour 
16180*80a68eefSBob Badour _NEON2SSESTORAGE float32x4_t vreinterpretq_f32_s64 (int64x2_t t);
16181*80a68eefSBob Badour #define vreinterpretq_f32_s64 vreinterpretq_f32_u32
16182*80a68eefSBob Badour 
16183*80a68eefSBob Badour _NEON2SSESTORAGE float32x4_t vreinterpretq_f32_p16 (poly16x8_t t);
16184*80a68eefSBob Badour #define vreinterpretq_f32_p16 vreinterpretq_f32_u32
16185*80a68eefSBob Badour 
16186*80a68eefSBob Badour _NEON2SSESTORAGE float32x4_t vreinterpretq_f32_p8 (poly8x16_t t);
16187*80a68eefSBob Badour #define vreinterpretq_f32_p8 vreinterpretq_f32_u32
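//A minimal usage sketch, not part of the original header, guarded by the assumed
//NEON2SSE_EXAMPLES macro: the reinterpret casts only retype the same 128 bits, which
//lets integer bit patterns be applied to float lanes, e.g. clearing the sign bit to
//get the absolute value (vdupq_n_u32 is provided earlier in this header).
#ifdef NEON2SSE_EXAMPLES
_NEON2SSE_INLINE float32x4_t example_abs_via_reinterpret_f32(float32x4_t x)
{
    uint32x4_t sign_bits = vdupq_n_u32(0x80000000); //only the sign bit set in every lane
    float32x4_t sign_mask = vreinterpretq_f32_u32(sign_bits); //same bits, viewed as floats
    return _mm_andnot_ps(sign_mask, x); //clear the sign bit in every lane -> |x|
}
#endif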
16188*80a68eefSBob Badour 
16189*80a68eefSBob Badour //*** Integer type conversions ******************
16190*80a68eefSBob Badour //no conversion is necessary for the following functions because the underlying data representation is the same
16191*80a68eefSBob Badour _NEON2SSESTORAGE int64x1_t vreinterpret_s64_u32 (uint32x2_t t);
16192*80a68eefSBob Badour #define vreinterpret_s64_u32
16193*80a68eefSBob Badour 
16194*80a68eefSBob Badour _NEON2SSESTORAGE int64x1_t vreinterpret_s64_u16 (uint16x4_t t);
16195*80a68eefSBob Badour #define vreinterpret_s64_u16
16196*80a68eefSBob Badour 
16197*80a68eefSBob Badour _NEON2SSESTORAGE int64x1_t vreinterpret_s64_u8 (uint8x8_t t);
16198*80a68eefSBob Badour #define vreinterpret_s64_u8
16199*80a68eefSBob Badour 
16200*80a68eefSBob Badour _NEON2SSESTORAGE int64x1_t vreinterpret_s64_s32 (int32x2_t t);
16201*80a68eefSBob Badour #define  vreinterpret_s64_s32
16202*80a68eefSBob Badour 
16203*80a68eefSBob Badour _NEON2SSESTORAGE int64x1_t vreinterpret_s64_s16 (int16x4_t t);
16204*80a68eefSBob Badour #define vreinterpret_s64_s16
16205*80a68eefSBob Badour 
16206*80a68eefSBob Badour _NEON2SSESTORAGE int64x1_t vreinterpret_s64_s8 (int8x8_t t);
16207*80a68eefSBob Badour #define  vreinterpret_s64_s8
16208*80a68eefSBob Badour 
16209*80a68eefSBob Badour _NEON2SSESTORAGE int64x1_t vreinterpret_s64_u64 (uint64x1_t t);
16210*80a68eefSBob Badour #define  vreinterpret_s64_u64
16211*80a68eefSBob Badour 
16212*80a68eefSBob Badour _NEON2SSESTORAGE int64x1_t vreinterpret_s64_f32 (float32x2_t t);
16213*80a68eefSBob Badour #define  vreinterpret_s64_f32
16214*80a68eefSBob Badour 
16215*80a68eefSBob Badour _NEON2SSESTORAGE int64x1_t vreinterpret_s64_p16 (poly16x4_t t);
16216*80a68eefSBob Badour #define vreinterpret_s64_p16
16217*80a68eefSBob Badour 
16218*80a68eefSBob Badour _NEON2SSESTORAGE int64x1_t vreinterpret_s64_p8 (poly8x8_t t);
16219*80a68eefSBob Badour #define vreinterpret_s64_p8
16220*80a68eefSBob Badour 
16221*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vreinterpretq_s64_u32 (uint32x4_t t);
16222*80a68eefSBob Badour #define vreinterpretq_s64_u32
16223*80a68eefSBob Badour 
16224*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vreinterpretq_s64_s16 (int16x8_t t);
16225*80a68eefSBob Badour #define vreinterpretq_s64_s16
16226*80a68eefSBob Badour 
16227*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vreinterpretq_s64_u8 (uint8x16_t t);
16228*80a68eefSBob Badour #define vreinterpretq_s64_u8
16229*80a68eefSBob Badour 
16230*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vreinterpretq_s64_s32 (int32x4_t t);
16231*80a68eefSBob Badour #define vreinterpretq_s64_s32
16232*80a68eefSBob Badour 
16233*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vreinterpretq_s64_u16 (uint16x8_t t);
16234*80a68eefSBob Badour #define vreinterpretq_s64_u16
16235*80a68eefSBob Badour 
16236*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vreinterpretq_s64_s8 (int8x16_t t);
16237*80a68eefSBob Badour #define vreinterpretq_s64_s8
16238*80a68eefSBob Badour 
16239*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vreinterpretq_s64_u64 (uint64x2_t t);
16240*80a68eefSBob Badour #define vreinterpretq_s64_u64
16241*80a68eefSBob Badour 
16242*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vreinterpretq_s64_f32 (float32x4_t t);
16243*80a68eefSBob Badour #define vreinterpretq_s64_f32(t) _M128i(t)
16244*80a68eefSBob Badour 
16245*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vreinterpretq_s64_p16 (poly16x8_t t);
16246*80a68eefSBob Badour #define vreinterpretq_s64_p16
16247*80a68eefSBob Badour 
16248*80a68eefSBob Badour _NEON2SSESTORAGE int64x2_t vreinterpretq_s64_p8 (poly8x16_t t);
16249*80a68eefSBob Badour #define vreinterpretq_s64_p8
16250*80a68eefSBob Badour 
16251*80a68eefSBob Badour _NEON2SSESTORAGE uint64x1_t vreinterpret_u64_u32 (uint32x2_t t);
16252*80a68eefSBob Badour #define vreinterpret_u64_u32
16253*80a68eefSBob Badour 
16254*80a68eefSBob Badour _NEON2SSESTORAGE uint64x1_t vreinterpret_u64_u16 (uint16x4_t t);
16255*80a68eefSBob Badour #define vreinterpret_u64_u16
16256*80a68eefSBob Badour 
16257*80a68eefSBob Badour _NEON2SSESTORAGE uint64x1_t vreinterpret_u64_u8 (uint8x8_t t);
16258*80a68eefSBob Badour #define vreinterpret_u64_u8
16259*80a68eefSBob Badour 
16260*80a68eefSBob Badour _NEON2SSESTORAGE uint64x1_t vreinterpret_u64_s32 (int32x2_t t);
16261*80a68eefSBob Badour #define vreinterpret_u64_s32
16262*80a68eefSBob Badour 
16263*80a68eefSBob Badour _NEON2SSESTORAGE uint64x1_t vreinterpret_u64_s16 (int16x4_t t);
16264*80a68eefSBob Badour #define vreinterpret_u64_s16
16265*80a68eefSBob Badour 
16266*80a68eefSBob Badour _NEON2SSESTORAGE uint64x1_t vreinterpret_u64_s8 (int8x8_t t);
16267*80a68eefSBob Badour #define vreinterpret_u64_s8
16268*80a68eefSBob Badour 
16269*80a68eefSBob Badour _NEON2SSESTORAGE uint64x1_t vreinterpret_u64_s64 (int64x1_t t);
16270*80a68eefSBob Badour #define vreinterpret_u64_s64
16271*80a68eefSBob Badour 
16272*80a68eefSBob Badour _NEON2SSESTORAGE uint64x1_t vreinterpret_u64_f32 (float32x2_t t);
16273*80a68eefSBob Badour #define vreinterpret_u64_f32
16274*80a68eefSBob Badour 
16275*80a68eefSBob Badour _NEON2SSESTORAGE uint64x1_t vreinterpret_u64_p16 (poly16x4_t t);
16276*80a68eefSBob Badour #define vreinterpret_u64_p16
16277*80a68eefSBob Badour 
16278*80a68eefSBob Badour _NEON2SSESTORAGE uint64x1_t vreinterpret_u64_p8 (poly8x8_t t);
16279*80a68eefSBob Badour #define vreinterpret_u64_p8
16280*80a68eefSBob Badour 
16281*80a68eefSBob Badour _NEON2SSESTORAGE uint64x2_t vreinterpretq_u64_u32 (uint32x4_t t);
16282*80a68eefSBob Badour #define vreinterpretq_u64_u32
16283*80a68eefSBob Badour 
16284*80a68eefSBob Badour _NEON2SSESTORAGE uint64x2_t vreinterpretq_u64_u16 (uint16x8_t t);
16285*80a68eefSBob Badour #define vreinterpretq_u64_u16
16286*80a68eefSBob Badour 
16287*80a68eefSBob Badour _NEON2SSESTORAGE uint64x2_t vreinterpretq_u64_u8 (uint8x16_t t);
16288*80a68eefSBob Badour #define vreinterpretq_u64_u8
16289*80a68eefSBob Badour 
16290*80a68eefSBob Badour _NEON2SSESTORAGE uint64x2_t vreinterpretq_u64_s32 (int32x4_t t);
16291*80a68eefSBob Badour #define vreinterpretq_u64_s32
16292*80a68eefSBob Badour 
16293*80a68eefSBob Badour _NEON2SSESTORAGE uint64x2_t vreinterpretq_u64_s16 (int16x8_t t);
16294*80a68eefSBob Badour #define vreinterpretq_u64_s16
16295*80a68eefSBob Badour 
16296*80a68eefSBob Badour _NEON2SSESTORAGE uint64x2_t vreinterpretq_u64_s8 (int8x16_t t);
16297*80a68eefSBob Badour #define vreinterpretq_u64_s8
16298*80a68eefSBob Badour 
16299*80a68eefSBob Badour _NEON2SSESTORAGE uint64x2_t vreinterpretq_u64_s64 (int64x2_t t);
16300*80a68eefSBob Badour #define vreinterpretq_u64_s64
16301*80a68eefSBob Badour 
16302*80a68eefSBob Badour _NEON2SSESTORAGE uint64x2_t vreinterpretq_u64_f32 (float32x4_t t);
16303*80a68eefSBob Badour #define vreinterpretq_u64_f32(t) _M128i(t)
16304*80a68eefSBob Badour 
16305*80a68eefSBob Badour _NEON2SSESTORAGE uint64x2_t vreinterpretq_u64_p16 (poly16x8_t t);
16306*80a68eefSBob Badour #define vreinterpretq_u64_p16
16307*80a68eefSBob Badour 
16308*80a68eefSBob Badour _NEON2SSESTORAGE uint64x2_t vreinterpretq_u64_p8 (poly8x16_t t);
16309*80a68eefSBob Badour #define vreinterpretq_u64_p8
16310*80a68eefSBob Badour 
16311*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vreinterpret_s8_u32 (uint32x2_t t);
16312*80a68eefSBob Badour #define vreinterpret_s8_u32
16313*80a68eefSBob Badour 
16314*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vreinterpret_s8_u16 (uint16x4_t t);
16315*80a68eefSBob Badour #define vreinterpret_s8_u16
16316*80a68eefSBob Badour 
16317*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vreinterpret_s8_u8 (uint8x8_t t);
16318*80a68eefSBob Badour #define vreinterpret_s8_u8
16319*80a68eefSBob Badour 
16320*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vreinterpret_s8_s32 (int32x2_t t);
16321*80a68eefSBob Badour #define vreinterpret_s8_s32
16322*80a68eefSBob Badour 
16323*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vreinterpret_s8_s16 (int16x4_t t);
16324*80a68eefSBob Badour #define vreinterpret_s8_s16
16325*80a68eefSBob Badour 
16326*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vreinterpret_s8_u64 (uint64x1_t t);
16327*80a68eefSBob Badour #define vreinterpret_s8_u64
16328*80a68eefSBob Badour 
16329*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vreinterpret_s8_s64 (int64x1_t t);
16330*80a68eefSBob Badour #define vreinterpret_s8_s64
16331*80a68eefSBob Badour 
16332*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vreinterpret_s8_f32 (float32x2_t t);
16333*80a68eefSBob Badour #define vreinterpret_s8_f32
16334*80a68eefSBob Badour 
16335*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vreinterpret_s8_p16 (poly16x4_t t);
16336*80a68eefSBob Badour #define vreinterpret_s8_p16
16337*80a68eefSBob Badour 
16338*80a68eefSBob Badour _NEON2SSESTORAGE int8x8_t vreinterpret_s8_p8 (poly8x8_t t);
16339*80a68eefSBob Badour #define vreinterpret_s8_p8
16340*80a68eefSBob Badour 
16341*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t vreinterpretq_s8_u32 (uint32x4_t t);
16342*80a68eefSBob Badour #define vreinterpretq_s8_u32
16343*80a68eefSBob Badour 
16344*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t vreinterpretq_s8_u16 (uint16x8_t t);
16345*80a68eefSBob Badour #define vreinterpretq_s8_u16
16346*80a68eefSBob Badour 
16347*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t vreinterpretq_s8_u8 (uint8x16_t t);
16348*80a68eefSBob Badour #define vreinterpretq_s8_u8
16349*80a68eefSBob Badour 
16350*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t vreinterpretq_s8_s32 (int32x4_t t);
16351*80a68eefSBob Badour #define vreinterpretq_s8_s32
16352*80a68eefSBob Badour 
16353*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t vreinterpretq_s8_s16 (int16x8_t t);
16354*80a68eefSBob Badour #define vreinterpretq_s8_s16
16355*80a68eefSBob Badour 
16356*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t vreinterpretq_s8_u64 (uint64x2_t t);
16357*80a68eefSBob Badour #define vreinterpretq_s8_u64
16358*80a68eefSBob Badour 
16359*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t vreinterpretq_s8_s64 (int64x2_t t);
16360*80a68eefSBob Badour #define vreinterpretq_s8_s64
16361*80a68eefSBob Badour 
16362*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t vreinterpretq_s8_f32 (float32x4_t t);
16363*80a68eefSBob Badour #define vreinterpretq_s8_f32(t) _M128i(t)
16364*80a68eefSBob Badour 
16365*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t vreinterpretq_s8_p16 (poly16x8_t t);
16366*80a68eefSBob Badour #define vreinterpretq_s8_p16
16367*80a68eefSBob Badour 
16368*80a68eefSBob Badour _NEON2SSESTORAGE int8x16_t vreinterpretq_s8_p8 (poly8x16_t t);
16369*80a68eefSBob Badour #define vreinterpretq_s8_p8
16370*80a68eefSBob Badour 
16371*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vreinterpret_s16_u32 (uint32x2_t t);
16372*80a68eefSBob Badour #define vreinterpret_s16_u32
16373*80a68eefSBob Badour 
16374*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vreinterpret_s16_u16 (uint16x4_t t);
16375*80a68eefSBob Badour #define vreinterpret_s16_u16
16376*80a68eefSBob Badour 
16377*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vreinterpret_s16_u8 (uint8x8_t t);
16378*80a68eefSBob Badour #define vreinterpret_s16_u8
16379*80a68eefSBob Badour 
16380*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vreinterpret_s16_s32 (int32x2_t t);
16381*80a68eefSBob Badour #define vreinterpret_s16_s32
16382*80a68eefSBob Badour 
16383*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vreinterpret_s16_s8 (int8x8_t t);
16384*80a68eefSBob Badour #define vreinterpret_s16_s8
16385*80a68eefSBob Badour 
16386*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vreinterpret_s16_u64 (uint64x1_t t);
16387*80a68eefSBob Badour #define vreinterpret_s16_u64
16388*80a68eefSBob Badour 
16389*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vreinterpret_s16_s64 (int64x1_t t);
16390*80a68eefSBob Badour #define vreinterpret_s16_s64
16391*80a68eefSBob Badour 
16392*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vreinterpret_s16_f32 (float32x2_t t);
16393*80a68eefSBob Badour #define vreinterpret_s16_f32
16394*80a68eefSBob Badour 
16395*80a68eefSBob Badour 
16396*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vreinterpret_s16_p16 (poly16x4_t t);
16397*80a68eefSBob Badour #define vreinterpret_s16_p16
16398*80a68eefSBob Badour 
16399*80a68eefSBob Badour _NEON2SSESTORAGE int16x4_t vreinterpret_s16_p8 (poly8x8_t t);
16400*80a68eefSBob Badour #define vreinterpret_s16_p8
16401*80a68eefSBob Badour 
16402*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vreinterpretq_s16_u32 (uint32x4_t t);
16403*80a68eefSBob Badour #define vreinterpretq_s16_u32
16404*80a68eefSBob Badour 
16405*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vreinterpretq_s16_u16 (uint16x8_t t);
16406*80a68eefSBob Badour #define vreinterpretq_s16_u16
16407*80a68eefSBob Badour 
16408*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vreinterpretq_s16_u8 (uint8x16_t t);
16409*80a68eefSBob Badour #define vreinterpretq_s16_u8
16410*80a68eefSBob Badour 
16411*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vreinterpretq_s16_s32 (int32x4_t t);
16412*80a68eefSBob Badour #define vreinterpretq_s16_s32
16413*80a68eefSBob Badour 
16414*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vreinterpretq_s16_s8 (int8x16_t t);
16415*80a68eefSBob Badour #define vreinterpretq_s16_s8
16416*80a68eefSBob Badour 
16417*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vreinterpretq_s16_u64 (uint64x2_t t);
16418*80a68eefSBob Badour #define vreinterpretq_s16_u64
16419*80a68eefSBob Badour 
16420*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vreinterpretq_s16_s64 (int64x2_t t);
16421*80a68eefSBob Badour #define vreinterpretq_s16_s64
16422*80a68eefSBob Badour 
16423*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vreinterpretq_s16_f32 (float32x4_t t);
16424*80a68eefSBob Badour #define vreinterpretq_s16_f32(t) _M128i(t)
16425*80a68eefSBob Badour 
16426*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vreinterpretq_s16_p16 (poly16x8_t t);
16427*80a68eefSBob Badour #define vreinterpretq_s16_p16
16428*80a68eefSBob Badour 
16429*80a68eefSBob Badour _NEON2SSESTORAGE int16x8_t vreinterpretq_s16_p8 (poly8x16_t t);
16430*80a68eefSBob Badour #define vreinterpretq_s16_p8
16431*80a68eefSBob Badour 
16432*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vreinterpret_s32_u32 (uint32x2_t t);
16433*80a68eefSBob Badour #define vreinterpret_s32_u32
16434*80a68eefSBob Badour 
16435*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vreinterpret_s32_u16 (uint16x4_t t);
16436*80a68eefSBob Badour #define vreinterpret_s32_u16
16437*80a68eefSBob Badour 
16438*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vreinterpret_s32_u8 (uint8x8_t t);
16439*80a68eefSBob Badour #define vreinterpret_s32_u8
16440*80a68eefSBob Badour 
16441*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vreinterpret_s32_s16 (int16x4_t t);
16442*80a68eefSBob Badour #define vreinterpret_s32_s16
16443*80a68eefSBob Badour 
16444*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vreinterpret_s32_s8 (int8x8_t t);
16445*80a68eefSBob Badour #define vreinterpret_s32_s8
16446*80a68eefSBob Badour 
16447*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vreinterpret_s32_u64 (uint64x1_t t);
16448*80a68eefSBob Badour #define vreinterpret_s32_u64
16449*80a68eefSBob Badour 
16450*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vreinterpret_s32_s64 (int64x1_t t);
16451*80a68eefSBob Badour #define vreinterpret_s32_s64
16452*80a68eefSBob Badour 
16453*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vreinterpret_s32_f32 (float32x2_t t);
16454*80a68eefSBob Badour #define vreinterpret_s32_f32
16455*80a68eefSBob Badour 
16456*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vreinterpret_s32_p16 (poly16x4_t t);
16457*80a68eefSBob Badour #define vreinterpret_s32_p16
16458*80a68eefSBob Badour 
16459*80a68eefSBob Badour _NEON2SSESTORAGE int32x2_t vreinterpret_s32_p8 (poly8x8_t t);
16460*80a68eefSBob Badour #define vreinterpret_s32_p8
16461*80a68eefSBob Badour 
16462*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vreinterpretq_s32_u32 (uint32x4_t t);
16463*80a68eefSBob Badour #define vreinterpretq_s32_u32
16464*80a68eefSBob Badour 
16465*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vreinterpretq_s32_u16 (uint16x8_t t);
16466*80a68eefSBob Badour #define vreinterpretq_s32_u16
16467*80a68eefSBob Badour 
16468*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vreinterpretq_s32_u8 (uint8x16_t t);
16469*80a68eefSBob Badour #define vreinterpretq_s32_u8
16470*80a68eefSBob Badour 
16471*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vreinterpretq_s32_s16 (int16x8_t t);
16472*80a68eefSBob Badour #define vreinterpretq_s32_s16
16473*80a68eefSBob Badour 
16474*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vreinterpretq_s32_s8 (int8x16_t t);
16475*80a68eefSBob Badour #define vreinterpretq_s32_s8
16476*80a68eefSBob Badour 
16477*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vreinterpretq_s32_u64 (uint64x2_t t);
16478*80a68eefSBob Badour #define vreinterpretq_s32_u64
16479*80a68eefSBob Badour 
16480*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vreinterpretq_s32_s64 (int64x2_t t);
16481*80a68eefSBob Badour #define vreinterpretq_s32_s64
16482*80a68eefSBob Badour 
16483*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vreinterpretq_s32_f32 (float32x4_t t);
16484*80a68eefSBob Badour #define vreinterpretq_s32_f32(t)  _M128i(t)
16485*80a68eefSBob Badour 
16486*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vreinterpretq_s32_p16 (poly16x8_t t);
16487*80a68eefSBob Badour #define vreinterpretq_s32_p16
16488*80a68eefSBob Badour 
16489*80a68eefSBob Badour _NEON2SSESTORAGE int32x4_t vreinterpretq_s32_p8 (poly8x16_t t);
16490*80a68eefSBob Badour #define vreinterpretq_s32_p8
16491*80a68eefSBob Badour 
16492*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vreinterpret_u8_u32 (uint32x2_t t);
16493*80a68eefSBob Badour #define vreinterpret_u8_u32
16494*80a68eefSBob Badour 
16495*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vreinterpret_u8_u16 (uint16x4_t t);
16496*80a68eefSBob Badour #define vreinterpret_u8_u16
16497*80a68eefSBob Badour 
16498*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vreinterpret_u8_s32 (int32x2_t t);
16499*80a68eefSBob Badour #define vreinterpret_u8_s32
16500*80a68eefSBob Badour 
16501*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vreinterpret_u8_s16 (int16x4_t t);
16502*80a68eefSBob Badour #define vreinterpret_u8_s16
16503*80a68eefSBob Badour 
16504*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vreinterpret_u8_s8 (int8x8_t t);
16505*80a68eefSBob Badour #define vreinterpret_u8_s8
16506*80a68eefSBob Badour 
16507*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vreinterpret_u8_u64 (uint64x1_t t);
16508*80a68eefSBob Badour #define vreinterpret_u8_u64
16509*80a68eefSBob Badour 
16510*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vreinterpret_u8_s64 (int64x1_t t);
16511*80a68eefSBob Badour #define vreinterpret_u8_s64
16512*80a68eefSBob Badour 
16513*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vreinterpret_u8_f32 (float32x2_t t);
16514*80a68eefSBob Badour #define vreinterpret_u8_f32
16515*80a68eefSBob Badour 
16516*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vreinterpret_u8_p16 (poly16x4_t t);
16517*80a68eefSBob Badour #define vreinterpret_u8_p16
16518*80a68eefSBob Badour 
16519*80a68eefSBob Badour _NEON2SSESTORAGE uint8x8_t vreinterpret_u8_p8 (poly8x8_t t);
16520*80a68eefSBob Badour #define vreinterpret_u8_p8
16521*80a68eefSBob Badour 
16522*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vreinterpretq_u8_u32 (uint32x4_t t);
16523*80a68eefSBob Badour #define vreinterpretq_u8_u32
16524*80a68eefSBob Badour 
16525*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vreinterpretq_u8_u16 (uint16x8_t t);
16526*80a68eefSBob Badour #define vreinterpretq_u8_u16
16527*80a68eefSBob Badour 
16528*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vreinterpretq_u8_s32 (int32x4_t t);
16529*80a68eefSBob Badour #define vreinterpretq_u8_s32
16530*80a68eefSBob Badour 
16531*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vreinterpretq_u8_s16 (int16x8_t t);
16532*80a68eefSBob Badour #define vreinterpretq_u8_s16
16533*80a68eefSBob Badour 
16534*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vreinterpretq_u8_s8 (int8x16_t t);
16535*80a68eefSBob Badour #define vreinterpretq_u8_s8
16536*80a68eefSBob Badour 
16537*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vreinterpretq_u8_u64 (uint64x2_t t);
16538*80a68eefSBob Badour #define vreinterpretq_u8_u64
16539*80a68eefSBob Badour 
16540*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vreinterpretq_u8_s64 (int64x2_t t);
16541*80a68eefSBob Badour #define vreinterpretq_u8_s64
16542*80a68eefSBob Badour 
16543*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vreinterpretq_u8_f32 (float32x4_t t);
16544*80a68eefSBob Badour #define vreinterpretq_u8_f32(t) _M128i(t)
16545*80a68eefSBob Badour 
16546*80a68eefSBob Badour 
16547*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vreinterpretq_u8_p16 (poly16x8_t t);
16548*80a68eefSBob Badour #define vreinterpretq_u8_p16
16549*80a68eefSBob Badour 
16550*80a68eefSBob Badour _NEON2SSESTORAGE uint8x16_t vreinterpretq_u8_p8 (poly8x16_t t);
16551*80a68eefSBob Badour #define vreinterpretq_u8_p8
16552*80a68eefSBob Badour 
_NEON2SSESTORAGE uint16x4_t vreinterpret_u16_u32 (uint32x2_t t);
#define vreinterpret_u16_u32

_NEON2SSESTORAGE uint16x4_t vreinterpret_u16_u8 (uint8x8_t t);
#define vreinterpret_u16_u8

_NEON2SSESTORAGE uint16x4_t vreinterpret_u16_s32 (int32x2_t t);
#define vreinterpret_u16_s32

_NEON2SSESTORAGE uint16x4_t vreinterpret_u16_s16 (int16x4_t t);
#define vreinterpret_u16_s16

_NEON2SSESTORAGE uint16x4_t vreinterpret_u16_s8 (int8x8_t t);
#define vreinterpret_u16_s8

_NEON2SSESTORAGE uint16x4_t vreinterpret_u16_u64 (uint64x1_t t);
#define vreinterpret_u16_u64

_NEON2SSESTORAGE uint16x4_t vreinterpret_u16_s64 (int64x1_t t);
#define vreinterpret_u16_s64

_NEON2SSESTORAGE uint16x4_t vreinterpret_u16_f32 (float32x2_t t);
#define vreinterpret_u16_f32

_NEON2SSESTORAGE uint16x4_t vreinterpret_u16_p16 (poly16x4_t t);
#define vreinterpret_u16_p16

_NEON2SSESTORAGE uint16x4_t vreinterpret_u16_p8 (poly8x8_t t);
#define vreinterpret_u16_p8

_NEON2SSESTORAGE uint16x8_t vreinterpretq_u16_u32 (uint32x4_t t);
#define vreinterpretq_u16_u32

_NEON2SSESTORAGE uint16x8_t vreinterpretq_u16_u8 (uint8x16_t t);
#define vreinterpretq_u16_u8

_NEON2SSESTORAGE uint16x8_t vreinterpretq_u16_s32 (int32x4_t t);
#define vreinterpretq_u16_s32

_NEON2SSESTORAGE uint16x8_t vreinterpretq_u16_s16 (int16x8_t t);
#define vreinterpretq_u16_s16

_NEON2SSESTORAGE uint16x8_t vreinterpretq_u16_s8 (int8x16_t t);
#define vreinterpretq_u16_s8

_NEON2SSESTORAGE uint16x8_t vreinterpretq_u16_u64 (uint64x2_t t);
#define vreinterpretq_u16_u64

_NEON2SSESTORAGE uint16x8_t vreinterpretq_u16_s64 (int64x2_t t);
#define vreinterpretq_u16_s64

_NEON2SSESTORAGE uint16x8_t vreinterpretq_u16_f32 (float32x4_t t);
#define vreinterpretq_u16_f32(t) _M128i(t)

_NEON2SSESTORAGE uint16x8_t vreinterpretq_u16_p16 (poly16x8_t t);
#define vreinterpretq_u16_p16

_NEON2SSESTORAGE uint16x8_t vreinterpretq_u16_p8 (poly8x16_t t);
#define vreinterpretq_u16_p8

_NEON2SSESTORAGE uint32x2_t vreinterpret_u32_u16 (uint16x4_t t);
#define vreinterpret_u32_u16

_NEON2SSESTORAGE uint32x2_t vreinterpret_u32_u8 (uint8x8_t t);
#define vreinterpret_u32_u8

_NEON2SSESTORAGE uint32x2_t vreinterpret_u32_s32 (int32x2_t t);
#define vreinterpret_u32_s32

_NEON2SSESTORAGE uint32x2_t vreinterpret_u32_s16 (int16x4_t t);
#define vreinterpret_u32_s16

_NEON2SSESTORAGE uint32x2_t vreinterpret_u32_s8 (int8x8_t t);
#define vreinterpret_u32_s8

_NEON2SSESTORAGE uint32x2_t vreinterpret_u32_u64 (uint64x1_t t);
#define vreinterpret_u32_u64

_NEON2SSESTORAGE uint32x2_t vreinterpret_u32_s64 (int64x1_t t);
#define vreinterpret_u32_s64

_NEON2SSESTORAGE uint32x2_t vreinterpret_u32_f32 (float32x2_t t);
#define vreinterpret_u32_f32

_NEON2SSESTORAGE uint32x2_t vreinterpret_u32_p16 (poly16x4_t t);
#define vreinterpret_u32_p16

_NEON2SSESTORAGE uint32x2_t vreinterpret_u32_p8 (poly8x8_t t);
#define vreinterpret_u32_p8

_NEON2SSESTORAGE uint32x4_t vreinterpretq_u32_u16 (uint16x8_t t);
#define vreinterpretq_u32_u16

_NEON2SSESTORAGE uint32x4_t vreinterpretq_u32_u8 (uint8x16_t t);
#define vreinterpretq_u32_u8

_NEON2SSESTORAGE uint32x4_t vreinterpretq_u32_s32 (int32x4_t t);
#define vreinterpretq_u32_s32

_NEON2SSESTORAGE uint32x4_t vreinterpretq_u32_s16 (int16x8_t t);
#define vreinterpretq_u32_s16

_NEON2SSESTORAGE uint32x4_t vreinterpretq_u32_s8 (int8x16_t t);
#define vreinterpretq_u32_s8

_NEON2SSESTORAGE uint32x4_t vreinterpretq_u32_u64 (uint64x2_t t);
#define vreinterpretq_u32_u64

_NEON2SSESTORAGE uint32x4_t vreinterpretq_u32_s64 (int64x2_t t);
#define vreinterpretq_u32_s64

_NEON2SSESTORAGE uint32x4_t vreinterpretq_u32_f32 (float32x4_t t);
#define vreinterpretq_u32_f32(t) _M128i(t)

_NEON2SSESTORAGE uint32x4_t vreinterpretq_u32_p16 (poly16x8_t t);
#define vreinterpretq_u32_p16

_NEON2SSESTORAGE uint32x4_t vreinterpretq_u32_p8 (poly8x16_t t);
#define vreinterpretq_u32_p8

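//Most of the vreinterpret* macros above are intentionally defined with no
//parameter list and no body: on the SSE side the source and destination vector
//types share one underlying representation, so a call such as
//vreinterpret_u8_s32(t) simply expands to (t) and the cast costs nothing.
//Only the quad-register casts *from* float32x4_t need a real body, because
//float32x4_t maps to __m128 while the integer quad types map to __m128i; the
//_M128i(t) macro performs that bit-pattern cast.
//
//A minimal usage sketch, kept disabled in "#if 0" so it is not part of the
//header itself; it assumes vdupq_n_f32 and vgetq_lane_u32 from this header:
#if 0
#include <stdio.h>
static void example_vreinterpret(void)
{
    float32x4_t f = vdupq_n_f32(1.0f);
    uint32x4_t  u = vreinterpretq_u32_f32(f);   /* same bits, new type */
    printf("0x%08x\n", (unsigned)vgetq_lane_u32(u, 0));   /* expected: 0x3f800000 */
}
#endif
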
//*************  Round ******************
_NEON2SSESTORAGE float32x4_t vrndnq_f32(float32x4_t a);
#ifdef USE_SSE4
#   define vrndnq_f32(a) _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)
#else
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(float32x4_t vrndnq_f32(float32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL)
{
    int i;
    _NEON2SSE_ALIGN_16 float32_t res[4];
    _mm_store_ps(res, a);
    for(i = 0; i<4; i++) {
        res[i] = nearbyintf(res[i]);
    }
    return _mm_load_ps(res);
}
#endif
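
//Like ARM's VRINTN, vrndnq_f32 rounds to nearest with ties to even. The serial
//fallback above relies on nearbyintf(), so it assumes the default FE_TONEAREST
//rounding mode is in effect. A minimal sketch, kept disabled in "#if 0" so it
//is not part of the header itself; it assumes vld1q_f32/vst1q_f32 from this
//header:
#if 0
#include <stdio.h>
static void example_vrndnq_f32(void)
{
    _NEON2SSE_ALIGN_16 float32_t in[4] = {0.5f, 1.5f, 2.5f, -1.5f};
    _NEON2SSE_ALIGN_16 float32_t out[4];
    vst1q_f32(out, vrndnq_f32(vld1q_f32(in)));
    /* ties to even: expected 0, 2, 2, -2 */
    printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);
}
#endif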


_NEON2SSESTORAGE float64x2_t vrndnq_f64(float64x2_t a);
#ifdef USE_SSE4
#   define vrndnq_f64(a) _mm_round_pd(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)
#else
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(float64x2_t vrndnq_f64(float64x2_t a), _NEON2SSE_REASON_SLOW_SERIAL)
{
    _NEON2SSE_ALIGN_16 float64_t res[2];
    _mm_store_pd(res, a);
    //use the double-precision nearbyint: nearbyintf would round through float and lose precision for large values
    res[0] = nearbyint(res[0]);
    res[1] = nearbyint(res[1]);
    return _mm_load_pd(res);
}
#endif


//************* Sqrt ******************
_NEON2SSESTORAGE float32x4_t vsqrtq_f32(float32x4_t a);
#define vsqrtq_f32 _mm_sqrt_ps

_NEON2SSESTORAGE float64x2_t vsqrtq_f64(float64x2_t a);
#define vsqrtq_f64 _mm_sqrt_pd
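
//vsqrtq_f32/vsqrtq_f64 map one-to-one onto the SSE/SSE2 square-root
//instructions. A minimal sketch, kept disabled in "#if 0" so it is not part of
//the header itself; it assumes vdupq_n_f32 and vgetq_lane_f32 from this header:
#if 0
#include <stdio.h>
static void example_vsqrtq_f32(void)
{
    float32x4_t v = vsqrtq_f32(vdupq_n_f32(9.0f));
    printf("%g\n", vgetq_lane_f32(v, 0));   /* expected: 3 */
}
#endif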


#endif /* NEON2SSE_H */