xref: /aosp_15_r20/external/pffft/simd/pf_sse2_double.h (revision 3f1979aa0d7ad34fcf3763de7b7b8f8cd67e5bdd)
1*3f1979aaSAndroid Build Coastguard Worker /*
2*3f1979aaSAndroid Build Coastguard Worker    Copyright (c) 2020  Dario Mambro ( [email protected] )
3*3f1979aaSAndroid Build Coastguard Worker */
4*3f1979aaSAndroid Build Coastguard Worker 
5*3f1979aaSAndroid Build Coastguard Worker /* Copyright (c) 2013  Julien Pommier ( [email protected] )
6*3f1979aaSAndroid Build Coastguard Worker 
7*3f1979aaSAndroid Build Coastguard Worker    Redistribution and use of the Software in source and binary forms,
8*3f1979aaSAndroid Build Coastguard Worker    with or without modification, is permitted provided that the
9*3f1979aaSAndroid Build Coastguard Worker    following conditions are met:
10*3f1979aaSAndroid Build Coastguard Worker 
11*3f1979aaSAndroid Build Coastguard Worker    - Neither the names of NCAR's Computational and Information Systems
12*3f1979aaSAndroid Build Coastguard Worker    Laboratory, the University Corporation for Atmospheric Research,
13*3f1979aaSAndroid Build Coastguard Worker    nor the names of its sponsors or contributors may be used to
14*3f1979aaSAndroid Build Coastguard Worker    endorse or promote products derived from this Software without
15*3f1979aaSAndroid Build Coastguard Worker    specific prior written permission.
16*3f1979aaSAndroid Build Coastguard Worker 
17*3f1979aaSAndroid Build Coastguard Worker    - Redistributions of source code must retain the above copyright
18*3f1979aaSAndroid Build Coastguard Worker    notices, this list of conditions, and the disclaimer below.
19*3f1979aaSAndroid Build Coastguard Worker 
20*3f1979aaSAndroid Build Coastguard Worker    - Redistributions in binary form must reproduce the above copyright
21*3f1979aaSAndroid Build Coastguard Worker    notice, this list of conditions, and the disclaimer below in the
22*3f1979aaSAndroid Build Coastguard Worker    documentation and/or other materials provided with the
23*3f1979aaSAndroid Build Coastguard Worker    distribution.
24*3f1979aaSAndroid Build Coastguard Worker 
25*3f1979aaSAndroid Build Coastguard Worker    THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
26*3f1979aaSAndroid Build Coastguard Worker    EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
27*3f1979aaSAndroid Build Coastguard Worker    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
28*3f1979aaSAndroid Build Coastguard Worker    NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
29*3f1979aaSAndroid Build Coastguard Worker    HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
30*3f1979aaSAndroid Build Coastguard Worker    EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
31*3f1979aaSAndroid Build Coastguard Worker    ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
32*3f1979aaSAndroid Build Coastguard Worker    CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
33*3f1979aaSAndroid Build Coastguard Worker    SOFTWARE.
34*3f1979aaSAndroid Build Coastguard Worker */
35*3f1979aaSAndroid Build Coastguard Worker 
36*3f1979aaSAndroid Build Coastguard Worker #ifndef PF_SSE2_DBL_H
37*3f1979aaSAndroid Build Coastguard Worker #define PF_SSE2_DBL_H
38*3f1979aaSAndroid Build Coastguard Worker 
/* Detect SSE2 support under MSVC.
   MSVC does not predefine __SSE2__; on 32-bit x86 it reports the selected
   /arch level through _M_IX86_FP, where the value 2 means SSE2 or better.
   The macro is given the value 1 (as GCC and Clang do) so that both
   "#ifdef __SSE2__" and "#if __SSE2__" style checks work. */
#if defined ( _M_IX86_FP ) && ( _M_IX86_FP == 2 ) && !defined(__SSE2__)
#  define __SSE2__ 1
#endif
47*3f1979aaSAndroid Build Coastguard Worker 
48*3f1979aaSAndroid Build Coastguard Worker /*
49*3f1979aaSAndroid Build Coastguard Worker   SSE2 64bit support macros
50*3f1979aaSAndroid Build Coastguard Worker */
51*3f1979aaSAndroid Build Coastguard Worker #if !defined(SIMD_SZ) && !defined(PFFFT_SIMD_DISABLE) && (defined( __SSE4_2__ ) |  defined( __SSE4_1__ ) || defined( __SSE3__ ) || defined( __SSE2__ ) || defined ( __x86_64__ ))
52*3f1979aaSAndroid Build Coastguard Worker #pragma message (__FILE__ ": SSE2 double macros are defined" )
53*3f1979aaSAndroid Build Coastguard Worker 
#include <assert.h>
#include <stdint.h>

#include <emmintrin.h>
55*3f1979aaSAndroid Build Coastguard Worker 
/* Emulated 256-bit vector of doubles, stored as two SSE2 128-bit halves. */
typedef struct {
    __m128d d128[2];
} m256d;

/* Vector type name used by the macros defined in this header. */
typedef m256d v4sf;

/* Number of double elements held by one v4sf. */
#  define SIMD_SZ 4

/* Overlays a v4sf with a plain array of SIMD_SZ doubles for per-element access. */
typedef union v4sf_union {
  v4sf   v;
  double f[SIMD_SZ];
} v4sf_union;
68*3f1979aaSAndroid Build Coastguard Worker 
69*3f1979aaSAndroid Build Coastguard Worker 
/* FORCE_INLINE: storage class plus the strongest inlining request the
   compiler offers, used for every helper function defined in this header. */
#if defined(__GNUC__) || defined(__clang__)

/* Save whatever FORCE_INLINE the including code may already have, then drop
   it so the definition below is not a conflicting macro redefinition.
   NOTE(review): no matching pop_macro appears in this header, so the saved
   definition is not restored here -- confirm whether that is intended. */
#pragma push_macro("FORCE_INLINE")
#undef FORCE_INLINE
#define FORCE_INLINE static inline __attribute__((always_inline))

#elif defined (_MSC_VER)
/* Drop any earlier definition to avoid a macro-redefinition diagnostic. */
#undef FORCE_INLINE
#define FORCE_INLINE static __forceinline

#else
#error "Macro name collisions may happens with unknown compiler"
/* Portable fallback; only takes effect if the #error above is removed. */
#ifdef FORCE_INLINE
#undef FORCE_INLINE
#endif
#define FORCE_INLINE static inline
#endif
85*3f1979aaSAndroid Build Coastguard Worker 
mm256_setzero_pd(void)86*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE m256d mm256_setzero_pd(void)
87*3f1979aaSAndroid Build Coastguard Worker {
88*3f1979aaSAndroid Build Coastguard Worker     m256d ret;
89*3f1979aaSAndroid Build Coastguard Worker     ret.d128[0] = ret.d128[1] = _mm_setzero_pd();
90*3f1979aaSAndroid Build Coastguard Worker     return ret;
91*3f1979aaSAndroid Build Coastguard Worker }
92*3f1979aaSAndroid Build Coastguard Worker 
mm256_mul_pd(m256d a,m256d b)93*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE m256d mm256_mul_pd(m256d a, m256d b)
94*3f1979aaSAndroid Build Coastguard Worker {
95*3f1979aaSAndroid Build Coastguard Worker     m256d ret;
96*3f1979aaSAndroid Build Coastguard Worker     ret.d128[0] = _mm_mul_pd(a.d128[0], b.d128[0]);
97*3f1979aaSAndroid Build Coastguard Worker     ret.d128[1] = _mm_mul_pd(a.d128[1], b.d128[1]);
98*3f1979aaSAndroid Build Coastguard Worker     return ret;
99*3f1979aaSAndroid Build Coastguard Worker }
100*3f1979aaSAndroid Build Coastguard Worker 
mm256_add_pd(m256d a,m256d b)101*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE m256d mm256_add_pd(m256d a, m256d b)
102*3f1979aaSAndroid Build Coastguard Worker {
103*3f1979aaSAndroid Build Coastguard Worker     m256d ret;
104*3f1979aaSAndroid Build Coastguard Worker     ret.d128[0] = _mm_add_pd(a.d128[0], b.d128[0]);
105*3f1979aaSAndroid Build Coastguard Worker     ret.d128[1] = _mm_add_pd(a.d128[1], b.d128[1]);
106*3f1979aaSAndroid Build Coastguard Worker     return ret;
107*3f1979aaSAndroid Build Coastguard Worker }
108*3f1979aaSAndroid Build Coastguard Worker 
mm256_sub_pd(m256d a,m256d b)109*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE m256d mm256_sub_pd(m256d a, m256d b)
110*3f1979aaSAndroid Build Coastguard Worker {
111*3f1979aaSAndroid Build Coastguard Worker     m256d ret;
112*3f1979aaSAndroid Build Coastguard Worker     ret.d128[0] = _mm_sub_pd(a.d128[0], b.d128[0]);
113*3f1979aaSAndroid Build Coastguard Worker     ret.d128[1] = _mm_sub_pd(a.d128[1], b.d128[1]);
114*3f1979aaSAndroid Build Coastguard Worker     return ret;
115*3f1979aaSAndroid Build Coastguard Worker }
116*3f1979aaSAndroid Build Coastguard Worker 
mm256_set1_pd(double a)117*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE m256d mm256_set1_pd(double a)
118*3f1979aaSAndroid Build Coastguard Worker {
119*3f1979aaSAndroid Build Coastguard Worker     m256d ret;
120*3f1979aaSAndroid Build Coastguard Worker     ret.d128[0] = ret.d128[1] = _mm_set1_pd(a);
121*3f1979aaSAndroid Build Coastguard Worker     return ret;
122*3f1979aaSAndroid Build Coastguard Worker }
123*3f1979aaSAndroid Build Coastguard Worker 
mm256_load_pd(double const * mem_addr)124*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE m256d mm256_load_pd (double const * mem_addr)
125*3f1979aaSAndroid Build Coastguard Worker {
126*3f1979aaSAndroid Build Coastguard Worker     m256d res;
127*3f1979aaSAndroid Build Coastguard Worker     res.d128[0] = _mm_load_pd((const double *)mem_addr);
128*3f1979aaSAndroid Build Coastguard Worker     res.d128[1] = _mm_load_pd((const double *)mem_addr + 2);
129*3f1979aaSAndroid Build Coastguard Worker     return res;
130*3f1979aaSAndroid Build Coastguard Worker }
mm256_loadu_pd(double const * mem_addr)131*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE m256d mm256_loadu_pd (double const * mem_addr)
132*3f1979aaSAndroid Build Coastguard Worker {
133*3f1979aaSAndroid Build Coastguard Worker     m256d res;
134*3f1979aaSAndroid Build Coastguard Worker     res.d128[0] = _mm_loadu_pd((const double *)mem_addr);
135*3f1979aaSAndroid Build Coastguard Worker     res.d128[1] = _mm_loadu_pd((const double *)mem_addr + 2);
136*3f1979aaSAndroid Build Coastguard Worker     return res;
137*3f1979aaSAndroid Build Coastguard Worker }
138*3f1979aaSAndroid Build Coastguard Worker 
139*3f1979aaSAndroid Build Coastguard Worker 
/* Name of the SIMD back end provided by this header. */
#  define VARCH "SSE2"
/* NOTE(review): presumably tells the caller that vector loads need aligned
   pointers -- confirm against the code that tests this macro. */
#  define VREQUIRES_ALIGN 1

/* Arithmetic on v4sf values, forwarded to the emulated 256-bit helpers. */
#  define VZERO()              mm256_setzero_pd()
#  define VMUL(a,b)            mm256_mul_pd(a,b)
#  define VADD(a,b)            mm256_add_pd(a,b)
#  define VSUB(a,b)            mm256_sub_pd(a,b)
/* a*b + c, as a separate multiply followed by an add. */
#  define VMADD(a,b,c)         mm256_add_pd(mm256_mul_pd(a,b), c)

/* Broadcast the scalar p to every element. */
#  define LD_PS1(p)            mm256_set1_pd(p)

/* Load four doubles from ptr, without / with the aligned load intrinsic. */
#  define VLOAD_UNALIGNED(ptr) mm256_loadu_pd(ptr)
#  define VLOAD_ALIGNED(ptr)   mm256_load_pd(ptr)
150*3f1979aaSAndroid Build Coastguard Worker 
151*3f1979aaSAndroid Build Coastguard Worker 
mm256_castpd256_pd128(m256d a)152*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128d mm256_castpd256_pd128(m256d a)
153*3f1979aaSAndroid Build Coastguard Worker {
154*3f1979aaSAndroid Build Coastguard Worker     return a.d128[0];
155*3f1979aaSAndroid Build Coastguard Worker }
156*3f1979aaSAndroid Build Coastguard Worker 
mm256_extractf128_pd(m256d a,const int imm8)157*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128d mm256_extractf128_pd (m256d a, const int imm8)
158*3f1979aaSAndroid Build Coastguard Worker {
159*3f1979aaSAndroid Build Coastguard Worker     assert(imm8 >= 0 && imm8 <= 1);
160*3f1979aaSAndroid Build Coastguard Worker     return a.d128[imm8];
161*3f1979aaSAndroid Build Coastguard Worker }
mm256_insertf128_pd_1(m256d a,__m128d b)162*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE m256d mm256_insertf128_pd_1(m256d a, __m128d b)
163*3f1979aaSAndroid Build Coastguard Worker {
164*3f1979aaSAndroid Build Coastguard Worker     m256d res;
165*3f1979aaSAndroid Build Coastguard Worker     res.d128[0] = a.d128[0];
166*3f1979aaSAndroid Build Coastguard Worker     res.d128[1] = b;
167*3f1979aaSAndroid Build Coastguard Worker     return res;
168*3f1979aaSAndroid Build Coastguard Worker }
mm256_castpd128_pd256(__m128d a)169*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE m256d mm256_castpd128_pd256(__m128d a)
170*3f1979aaSAndroid Build Coastguard Worker {
171*3f1979aaSAndroid Build Coastguard Worker     m256d res;
172*3f1979aaSAndroid Build Coastguard Worker     res.d128[0] = a;
173*3f1979aaSAndroid Build Coastguard Worker     return res;
174*3f1979aaSAndroid Build Coastguard Worker }
175*3f1979aaSAndroid Build Coastguard Worker 
mm256_shuffle_pd_00(m256d a,m256d b)176*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE m256d mm256_shuffle_pd_00(m256d a, m256d b)
177*3f1979aaSAndroid Build Coastguard Worker {
178*3f1979aaSAndroid Build Coastguard Worker     m256d res;
179*3f1979aaSAndroid Build Coastguard Worker     res.d128[0] = _mm_shuffle_pd(a.d128[0],b.d128[0],0);
180*3f1979aaSAndroid Build Coastguard Worker     res.d128[1] = _mm_shuffle_pd(a.d128[1],b.d128[1],0);
181*3f1979aaSAndroid Build Coastguard Worker     return res;
182*3f1979aaSAndroid Build Coastguard Worker }
183*3f1979aaSAndroid Build Coastguard Worker 
mm256_shuffle_pd_11(m256d a,m256d b)184*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE m256d mm256_shuffle_pd_11(m256d a, m256d b)
185*3f1979aaSAndroid Build Coastguard Worker {
186*3f1979aaSAndroid Build Coastguard Worker     m256d res;
187*3f1979aaSAndroid Build Coastguard Worker     res.d128[0] = _mm_shuffle_pd(a.d128[0],b.d128[0], 3);
188*3f1979aaSAndroid Build Coastguard Worker     res.d128[1] = _mm_shuffle_pd(a.d128[1],b.d128[1], 3);
189*3f1979aaSAndroid Build Coastguard Worker     return res;
190*3f1979aaSAndroid Build Coastguard Worker }
191*3f1979aaSAndroid Build Coastguard Worker 
mm256_permute2f128_pd_0x20(m256d a,m256d b)192*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE m256d mm256_permute2f128_pd_0x20(m256d a, m256d b) {
193*3f1979aaSAndroid Build Coastguard Worker     m256d res;
194*3f1979aaSAndroid Build Coastguard Worker     res.d128[0] = a.d128[0];
195*3f1979aaSAndroid Build Coastguard Worker     res.d128[1] = b.d128[0];
196*3f1979aaSAndroid Build Coastguard Worker     return res;
197*3f1979aaSAndroid Build Coastguard Worker }
198*3f1979aaSAndroid Build Coastguard Worker 
199*3f1979aaSAndroid Build Coastguard Worker 
mm256_permute2f128_pd_0x31(m256d a,m256d b)200*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE m256d mm256_permute2f128_pd_0x31(m256d a, m256d b)
201*3f1979aaSAndroid Build Coastguard Worker {
202*3f1979aaSAndroid Build Coastguard Worker     m256d res;
203*3f1979aaSAndroid Build Coastguard Worker     res.d128[0] = a.d128[1];
204*3f1979aaSAndroid Build Coastguard Worker     res.d128[1] = b.d128[1];
205*3f1979aaSAndroid Build Coastguard Worker     return res;
206*3f1979aaSAndroid Build Coastguard Worker }
207*3f1979aaSAndroid Build Coastguard Worker 
mm256_reverse(m256d x)208*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE m256d mm256_reverse(m256d x)
209*3f1979aaSAndroid Build Coastguard Worker {
210*3f1979aaSAndroid Build Coastguard Worker     m256d res;
211*3f1979aaSAndroid Build Coastguard Worker     res.d128[0] = _mm_shuffle_pd(x.d128[1],x.d128[1],1);
212*3f1979aaSAndroid Build Coastguard Worker     res.d128[1] = _mm_shuffle_pd(x.d128[0],x.d128[0],1);
213*3f1979aaSAndroid Build Coastguard Worker     return res;
214*3f1979aaSAndroid Build Coastguard Worker }
215*3f1979aaSAndroid Build Coastguard Worker 
/* INTERLEAVE2 (in1, in2, out1, out2) pseudo code:
out1 = [ in1[0], in2[0], in1[1], in2[1] ]
out2 = [ in1[2], in2[2], in1[3], in2[3] ]

in1 and in2 are each evaluated exactly once and are fully read before
either output is written, so an output may name the same object as an input.
*/
#  define INTERLEAVE2(in1, in2, out1, out2) {                               \
    const m256d src1__ = (in1);                                            \
    const m256d src2__ = (in2);                                            \
    __m128d low1__ = mm256_castpd256_pd128(src1__);                        \
    __m128d low2__ = mm256_castpd256_pd128(src2__);                        \
    __m128d high1__ = mm256_extractf128_pd(src1__, 1);                     \
    __m128d high2__ = mm256_extractf128_pd(src2__, 1);                     \
    m256d tmp__ = mm256_insertf128_pd_1(                                   \
        mm256_castpd128_pd256(_mm_shuffle_pd(low1__, low2__, 0)),          \
        _mm_shuffle_pd(low1__, low2__, 3));                                \
    out2 = mm256_insertf128_pd_1(                                          \
        mm256_castpd128_pd256(_mm_shuffle_pd(high1__, high2__, 0)),        \
        _mm_shuffle_pd(high1__, high2__, 3));                              \
    out1 = tmp__;                                                          \
}
233*3f1979aaSAndroid Build Coastguard Worker 
/* UNINTERLEAVE2(in1, in2, out1, out2) pseudo code:
out1 = [ in1[0], in1[2], in2[0], in2[2] ]
out2 = [ in1[1], in1[3], in2[1], in2[3] ]

in1 and in2 are each evaluated exactly once and are fully read before
either output is written, so an output may name the same object as an input.
*/
#  define UNINTERLEAVE2(in1, in2, out1, out2) {                             \
    const m256d src1__ = (in1);                                            \
    const m256d src2__ = (in2);                                            \
    __m128d low1__ = mm256_castpd256_pd128(src1__);                        \
    __m128d low2__ = mm256_castpd256_pd128(src2__);                        \
    __m128d high1__ = mm256_extractf128_pd(src1__, 1);                     \
    __m128d high2__ = mm256_extractf128_pd(src2__, 1);                     \
    m256d tmp__ = mm256_insertf128_pd_1(                                   \
        mm256_castpd128_pd256(_mm_shuffle_pd(low1__, high1__, 0)),         \
        _mm_shuffle_pd(low2__, high2__, 0));                               \
    out2 = mm256_insertf128_pd_1(                                          \
        mm256_castpd128_pd256(_mm_shuffle_pd(low1__, high1__, 3)),         \
        _mm_shuffle_pd(low2__, high2__, 3));                               \
    out1 = tmp__;                                                          \
}
251*3f1979aaSAndroid Build Coastguard Worker 
/* VTRANSPOSE4(row0, row1, row2, row3): in-place transpose of the 4x4 matrix
   whose rows are the four vectors, i.e. afterwards rowI[j] == old rowJ[i].
   The temporaries carry a "__" suffix, like those of the other macros in
   this header, so that a caller passing variables named tmp0..tmp3 is not
   silently captured by the macro's own locals. */
#  define VTRANSPOSE4(row0, row1, row2, row3) {                             \
        m256d tmp3__, tmp2__, tmp1__, tmp0__;                              \
                                                                           \
        tmp0__ = mm256_shuffle_pd_00((row0),(row1));                       \
        tmp2__ = mm256_shuffle_pd_11((row0),(row1));                       \
        tmp1__ = mm256_shuffle_pd_00((row2),(row3));                       \
        tmp3__ = mm256_shuffle_pd_11((row2),(row3));                       \
                                                                           \
        (row0) = mm256_permute2f128_pd_0x20(tmp0__, tmp1__);               \
        (row1) = mm256_permute2f128_pd_0x20(tmp2__, tmp3__);               \
        (row2) = mm256_permute2f128_pd_0x31(tmp0__, tmp1__);               \
        (row3) = mm256_permute2f128_pd_0x31(tmp2__, tmp3__);               \
    }
265*3f1979aaSAndroid Build Coastguard Worker 
/* VSWAPHL(a, b) pseudo code:
return [ b[0], b[1], a[2], a[3] ]
i.e. the lower half of b followed by the upper half of a.
*/
#  define VSWAPHL(a,b)	\
   mm256_insertf128_pd_1((b), mm256_extractf128_pd((a), 1))
271*3f1979aaSAndroid Build Coastguard Worker 
/* reverse/flip all four scalar elements: [ a[3], a[2], a[1], a[0] ] */
#  define VREV_S(a)   mm256_reverse(a)
274*3f1979aaSAndroid Build Coastguard Worker 
275*3f1979aaSAndroid Build Coastguard Worker /* reverse/flip complex floats */
276*3f1979aaSAndroid Build Coastguard Worker #  define VREV_C(a)    mm256_insertf128_pd_1(mm256_castpd128_pd256(mm256_extractf128_pd(a, 1)), mm256_castpd256_pd128(a))
277*3f1979aaSAndroid Build Coastguard Worker 
/* True when ptr is aligned on a 32-byte boundary (its low five address bits are zero). */
#  define VALIGNED(ptr) (0 == (((uintptr_t)(ptr)) & 0x1F))
279*3f1979aaSAndroid Build Coastguard Worker 
280*3f1979aaSAndroid Build Coastguard Worker #endif
281*3f1979aaSAndroid Build Coastguard Worker #endif
282