xref: /aosp_15_r20/external/pffft/simd/pf_neon_double.h (revision 3f1979aa0d7ad34fcf3763de7b7b8f8cd67e5bdd)
1*3f1979aaSAndroid Build Coastguard Worker /*
2*3f1979aaSAndroid Build Coastguard Worker    Copyright (c) 2020  Dario Mambro ( [email protected] )
3*3f1979aaSAndroid Build Coastguard Worker */
4*3f1979aaSAndroid Build Coastguard Worker 
5*3f1979aaSAndroid Build Coastguard Worker /* Copyright (c) 2013  Julien Pommier ( [email protected] )
6*3f1979aaSAndroid Build Coastguard Worker 
7*3f1979aaSAndroid Build Coastguard Worker    Redistribution and use of the Software in source and binary forms,
8*3f1979aaSAndroid Build Coastguard Worker    with or without modification, is permitted provided that the
9*3f1979aaSAndroid Build Coastguard Worker    following conditions are met:
10*3f1979aaSAndroid Build Coastguard Worker 
11*3f1979aaSAndroid Build Coastguard Worker    - Neither the names of NCAR's Computational and Information Systems
12*3f1979aaSAndroid Build Coastguard Worker    Laboratory, the University Corporation for Atmospheric Research,
13*3f1979aaSAndroid Build Coastguard Worker    nor the names of its sponsors or contributors may be used to
14*3f1979aaSAndroid Build Coastguard Worker    endorse or promote products derived from this Software without
15*3f1979aaSAndroid Build Coastguard Worker    specific prior written permission.
16*3f1979aaSAndroid Build Coastguard Worker 
17*3f1979aaSAndroid Build Coastguard Worker    - Redistributions of source code must retain the above copyright
18*3f1979aaSAndroid Build Coastguard Worker    notices, this list of conditions, and the disclaimer below.
19*3f1979aaSAndroid Build Coastguard Worker 
20*3f1979aaSAndroid Build Coastguard Worker    - Redistributions in binary form must reproduce the above copyright
21*3f1979aaSAndroid Build Coastguard Worker    notice, this list of conditions, and the disclaimer below in the
22*3f1979aaSAndroid Build Coastguard Worker    documentation and/or other materials provided with the
23*3f1979aaSAndroid Build Coastguard Worker    distribution.
24*3f1979aaSAndroid Build Coastguard Worker 
25*3f1979aaSAndroid Build Coastguard Worker    THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
26*3f1979aaSAndroid Build Coastguard Worker    EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
27*3f1979aaSAndroid Build Coastguard Worker    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
28*3f1979aaSAndroid Build Coastguard Worker    NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
29*3f1979aaSAndroid Build Coastguard Worker    HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
30*3f1979aaSAndroid Build Coastguard Worker    EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
31*3f1979aaSAndroid Build Coastguard Worker    ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
32*3f1979aaSAndroid Build Coastguard Worker    CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
33*3f1979aaSAndroid Build Coastguard Worker    SOFTWARE.
34*3f1979aaSAndroid Build Coastguard Worker */
35*3f1979aaSAndroid Build Coastguard Worker 
36*3f1979aaSAndroid Build Coastguard Worker #ifndef PF_NEON_DBL_H
37*3f1979aaSAndroid Build Coastguard Worker #define PF_NEON_DBL_H
38*3f1979aaSAndroid Build Coastguard Worker 
39*3f1979aaSAndroid Build Coastguard Worker /*
40*3f1979aaSAndroid Build Coastguard Worker   NEON 64bit support macros
41*3f1979aaSAndroid Build Coastguard Worker */
42*3f1979aaSAndroid Build Coastguard Worker #if !defined(PFFFT_SIMD_DISABLE) && defined(PFFFT_ENABLE_NEON) && (defined(__aarch64__) || defined(__arm64__))
43*3f1979aaSAndroid Build Coastguard Worker 
44*3f1979aaSAndroid Build Coastguard Worker #pragma message (__FILE__ ": NEON (from AVX) macros are defined" )
45*3f1979aaSAndroid Build Coastguard Worker 
46*3f1979aaSAndroid Build Coastguard Worker #include "pf_neon_double_from_avx.h"
47*3f1979aaSAndroid Build Coastguard Worker typedef __m256d v4sf;
48*3f1979aaSAndroid Build Coastguard Worker 
49*3f1979aaSAndroid Build Coastguard Worker /* 4 doubles by simd vector */
50*3f1979aaSAndroid Build Coastguard Worker #  define SIMD_SZ 4
51*3f1979aaSAndroid Build Coastguard Worker 
52*3f1979aaSAndroid Build Coastguard Worker typedef union v4sf_union {
53*3f1979aaSAndroid Build Coastguard Worker   v4sf  v;
54*3f1979aaSAndroid Build Coastguard Worker   double f[SIMD_SZ];
55*3f1979aaSAndroid Build Coastguard Worker } v4sf_union;
56*3f1979aaSAndroid Build Coastguard Worker 
/* Backend name and alignment requirement flag. */
#  define VARCH           "NEON"
#  define VREQUIRES_ALIGN 1

/* Element-wise arithmetic on v4sf values, forwarded to the AVX-named
   double-precision helpers. */
#  define VZERO()         _mm256_setzero_pd()
#  define VMUL(a,b)       _mm256_mul_pd((a),(b))
#  define VADD(a,b)       _mm256_add_pd((a),(b))
/* a*b + c, written as a separate multiply followed by an add */
#  define VMADD(a,b,c)    _mm256_add_pd(_mm256_mul_pd((a),(b)), (c))
#  define VSUB(a,b)       _mm256_sub_pd((a),(b))

/* Forwards the scalar p to _mm256_set1_pd. */
#  define LD_PS1(p)       _mm256_set1_pd((p))

/* Vector loads from a pointer, without / with an alignment requirement
   (as named by the underlying loadu / load helpers). */
#  define VLOAD_UNALIGNED(ptr)  _mm256_loadu_pd((ptr))
#  define VLOAD_ALIGNED(ptr)    _mm256_load_pd((ptr))
67*3f1979aaSAndroid Build Coastguard Worker 
_mm256_insertf128_pd_1(__m256d a,__m128d b)68*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m256d _mm256_insertf128_pd_1(__m256d a, __m128d b)
69*3f1979aaSAndroid Build Coastguard Worker {
70*3f1979aaSAndroid Build Coastguard Worker     __m256d res;
71*3f1979aaSAndroid Build Coastguard Worker     res.vect_f64[0] = a.vect_f64[0];
72*3f1979aaSAndroid Build Coastguard Worker     res.vect_f64[1] = b;
73*3f1979aaSAndroid Build Coastguard Worker     return res;
74*3f1979aaSAndroid Build Coastguard Worker }
75*3f1979aaSAndroid Build Coastguard Worker 
_mm_shuffle_pd_00(__m128d a,__m128d b)76*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128d _mm_shuffle_pd_00(__m128d a, __m128d b)
77*3f1979aaSAndroid Build Coastguard Worker {
78*3f1979aaSAndroid Build Coastguard Worker     float64x1_t al = vget_low_f64(a);
79*3f1979aaSAndroid Build Coastguard Worker     float64x1_t bl = vget_low_f64(b);
80*3f1979aaSAndroid Build Coastguard Worker     return vcombine_f64(al, bl);
81*3f1979aaSAndroid Build Coastguard Worker }
82*3f1979aaSAndroid Build Coastguard Worker 
_mm_shuffle_pd_11(__m128d a,__m128d b)83*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128d _mm_shuffle_pd_11(__m128d a, __m128d b)
84*3f1979aaSAndroid Build Coastguard Worker {
85*3f1979aaSAndroid Build Coastguard Worker     float64x1_t ah = vget_high_f64(a);
86*3f1979aaSAndroid Build Coastguard Worker     float64x1_t bh = vget_high_f64(b);
87*3f1979aaSAndroid Build Coastguard Worker     return vcombine_f64(ah, bh);
88*3f1979aaSAndroid Build Coastguard Worker }
89*3f1979aaSAndroid Build Coastguard Worker 
_mm256_shuffle_pd_00(__m256d a,__m256d b)90*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m256d _mm256_shuffle_pd_00(__m256d a, __m256d b)
91*3f1979aaSAndroid Build Coastguard Worker {
92*3f1979aaSAndroid Build Coastguard Worker     __m256d res;
93*3f1979aaSAndroid Build Coastguard Worker     res.vect_f64[0] = _mm_shuffle_pd_00(a.vect_f64[0],b.vect_f64[0]);
94*3f1979aaSAndroid Build Coastguard Worker     res.vect_f64[1] = _mm_shuffle_pd_00(a.vect_f64[1],b.vect_f64[1]);
95*3f1979aaSAndroid Build Coastguard Worker     return res;
96*3f1979aaSAndroid Build Coastguard Worker }
97*3f1979aaSAndroid Build Coastguard Worker 
_mm256_shuffle_pd_11(__m256d a,__m256d b)98*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m256d _mm256_shuffle_pd_11(__m256d a, __m256d b)
99*3f1979aaSAndroid Build Coastguard Worker {
100*3f1979aaSAndroid Build Coastguard Worker     __m256d res;
101*3f1979aaSAndroid Build Coastguard Worker     res.vect_f64[0] = _mm_shuffle_pd_11(a.vect_f64[0],b.vect_f64[0]);
102*3f1979aaSAndroid Build Coastguard Worker     res.vect_f64[1] = _mm_shuffle_pd_11(a.vect_f64[1],b.vect_f64[1]);
103*3f1979aaSAndroid Build Coastguard Worker     return res;
104*3f1979aaSAndroid Build Coastguard Worker }
105*3f1979aaSAndroid Build Coastguard Worker 
_mm256_permute2f128_pd_0x20(__m256d a,__m256d b)106*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m256d _mm256_permute2f128_pd_0x20(__m256d a, __m256d b) {
107*3f1979aaSAndroid Build Coastguard Worker     __m256d res;
108*3f1979aaSAndroid Build Coastguard Worker     res.vect_f64[0] = a.vect_f64[0];
109*3f1979aaSAndroid Build Coastguard Worker     res.vect_f64[1] = b.vect_f64[0];
110*3f1979aaSAndroid Build Coastguard Worker     return res;
111*3f1979aaSAndroid Build Coastguard Worker }
112*3f1979aaSAndroid Build Coastguard Worker 
113*3f1979aaSAndroid Build Coastguard Worker 
_mm256_permute2f128_pd_0x31(__m256d a,__m256d b)114*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m256d _mm256_permute2f128_pd_0x31(__m256d a, __m256d b)
115*3f1979aaSAndroid Build Coastguard Worker {
116*3f1979aaSAndroid Build Coastguard Worker     __m256d res;
117*3f1979aaSAndroid Build Coastguard Worker     res.vect_f64[0] = a.vect_f64[1];
118*3f1979aaSAndroid Build Coastguard Worker     res.vect_f64[1] = b.vect_f64[1];
119*3f1979aaSAndroid Build Coastguard Worker     return res;
120*3f1979aaSAndroid Build Coastguard Worker }
121*3f1979aaSAndroid Build Coastguard Worker 
_mm256_reverse(__m256d x)122*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m256d _mm256_reverse(__m256d x)
123*3f1979aaSAndroid Build Coastguard Worker {
124*3f1979aaSAndroid Build Coastguard Worker     __m256d res;
125*3f1979aaSAndroid Build Coastguard Worker     float64x2_t low = x.vect_f64[0];
126*3f1979aaSAndroid Build Coastguard Worker     float64x2_t high = x.vect_f64[1];
127*3f1979aaSAndroid Build Coastguard Worker     float64x1_t a = vget_low_f64(low);
128*3f1979aaSAndroid Build Coastguard Worker     float64x1_t b = vget_high_f64(low);
129*3f1979aaSAndroid Build Coastguard Worker     float64x1_t c = vget_low_f64(high);
130*3f1979aaSAndroid Build Coastguard Worker     float64x1_t d = vget_high_f64(high);
131*3f1979aaSAndroid Build Coastguard Worker     res.vect_f64[0] =  vcombine_f64(d, c);
132*3f1979aaSAndroid Build Coastguard Worker     res.vect_f64[1] =  vcombine_f64(b, a);
133*3f1979aaSAndroid Build Coastguard Worker     return res;
134*3f1979aaSAndroid Build Coastguard Worker }
135*3f1979aaSAndroid Build Coastguard Worker 
/* INTERLEAVE2(in1, in2, out1, out2) zips two vectors element by element:
     out1 = [ in1[0], in2[0], in1[1], in2[1] ]
     out2 = [ in1[2], in2[2], in1[3], in2[3] ]
   All four input halves are copied into locals before either output is
   written. Note that in1 and in2 each appear twice in the expansion.
*/
#  define INTERLEAVE2(in1, in2, out1, out2) {                              \
    __m128d a_lo__ = _mm256_castpd256_pd128(in1);                          \
    __m128d b_lo__ = _mm256_castpd256_pd128(in2);                          \
    __m128d a_hi__ = _mm256_extractf128_pd(in1, 1);                        \
    __m128d b_hi__ = _mm256_extractf128_pd(in2, 1);                        \
    __m256d first__ = _mm256_insertf128_pd_1(                              \
        _mm256_castpd128_pd256(_mm_shuffle_pd_00(a_lo__, b_lo__)),         \
        _mm_shuffle_pd_11(a_lo__, b_lo__));                                \
    __m256d second__ = _mm256_insertf128_pd_1(                             \
        _mm256_castpd128_pd256(_mm_shuffle_pd_00(a_hi__, b_hi__)),         \
        _mm_shuffle_pd_11(a_hi__, b_hi__));                                \
    out2 = second__;                                                       \
    out1 = first__;                                                        \
}
153*3f1979aaSAndroid Build Coastguard Worker 
/* UNINTERLEAVE2(in1, in2, out1, out2) separates even and odd elements:
     out1 = [ in1[0], in1[2], in2[0], in2[2] ]
     out2 = [ in1[1], in1[3], in2[1], in2[3] ]
   All four input halves are copied into locals before either output is
   written. Note that in1 and in2 each appear twice in the expansion.
*/
#  define UNINTERLEAVE2(in1, in2, out1, out2) {                            \
    __m128d a_lo__ = _mm256_castpd256_pd128(in1);                          \
    __m128d b_lo__ = _mm256_castpd256_pd128(in2);                          \
    __m128d a_hi__ = _mm256_extractf128_pd(in1, 1);                        \
    __m128d b_hi__ = _mm256_extractf128_pd(in2, 1);                        \
    __m256d evens__ = _mm256_insertf128_pd_1(                              \
        _mm256_castpd128_pd256(_mm_shuffle_pd_00(a_lo__, a_hi__)),         \
        _mm_shuffle_pd_00(b_lo__, b_hi__));                                \
    __m256d odds__ = _mm256_insertf128_pd_1(                               \
        _mm256_castpd128_pd256(_mm_shuffle_pd_11(a_lo__, a_hi__)),         \
        _mm_shuffle_pd_11(b_lo__, b_hi__));                                \
    out2 = odds__;                                                         \
    out1 = evens__;                                                        \
}
171*3f1979aaSAndroid Build Coastguard Worker 
/* VTRANSPOSE4(row0, row1, row2, row3) transposes, in place, the 4x4 matrix
   of doubles whose rows are the four vectors:
     row0 = [ row0[0], row1[0], row2[0], row3[0] ]
     row1 = [ row0[1], row1[1], row2[1], row3[1] ]
     row2 = [ row0[2], row1[2], row2[2], row3[2] ]
     row3 = [ row0[3], row1[3], row2[3], row3[3] ]
   The temporaries carry a trailing "__" (like the locals of INTERLEAVE2 /
   UNINTERLEAVE2) so that a caller's row variable named tmp0..tmp3 is not
   shadowed by the macro's own uninitialised locals.
*/
#  define VTRANSPOSE4(row0, row1, row2, row3) {                            \
        __m256d tmp3__, tmp2__, tmp1__, tmp0__;                            \
                                                                           \
        /* pair up even / odd elements of adjacent rows */                 \
        tmp0__ = _mm256_shuffle_pd_00((row0),(row1));                      \
        tmp2__ = _mm256_shuffle_pd_11((row0),(row1));                      \
        tmp1__ = _mm256_shuffle_pd_00((row2),(row3));                      \
        tmp3__ = _mm256_shuffle_pd_11((row2),(row3));                      \
                                                                           \
        /* stitch matching 128-bit halves together */                      \
        (row0) = _mm256_permute2f128_pd_0x20(tmp0__, tmp1__);              \
        (row1) = _mm256_permute2f128_pd_0x20(tmp2__, tmp3__);              \
        (row2) = _mm256_permute2f128_pd_0x31(tmp0__, tmp1__);              \
        (row3) = _mm256_permute2f128_pd_0x31(tmp2__, tmp3__);              \
    }
185*3f1979aaSAndroid Build Coastguard Worker 
/* VSWAPHL(a, b) pseudo code:
return [ b[0], b[1], a[2], a[3] ]
(lower 128-bit half from b, upper 128-bit half from a)
*/
#  define VSWAPHL(a,b)                                                     \
   _mm256_insertf128_pd_1(                                                 \
       _mm256_castpd128_pd256(_mm256_castpd256_pd128((b))),                \
       _mm256_extractf128_pd((a), 1))

/* reverse/flip all doubles: [ a[3], a[2], a[1], a[0] ] */
#  define VREV_S(a)   _mm256_reverse((a))

/* reverse/flip complex doubles, i.e. swap the two 128-bit halves:
   [ a[2], a[3], a[0], a[1] ]  (note: `a` appears twice in the expansion) */
#  define VREV_C(a)                                                        \
   _mm256_insertf128_pd_1(                                                 \
       _mm256_castpd128_pd256(_mm256_extractf128_pd((a), 1)),              \
       _mm256_castpd256_pd128((a)))
197*3f1979aaSAndroid Build Coastguard Worker 
/* non-zero when the five low address bits of ptr are clear, i.e. ptr is a
   multiple of 32 bytes */
#  define VALIGNED(ptr) (0 == (((uintptr_t)(ptr)) & 0x1F))
199*3f1979aaSAndroid Build Coastguard Worker 
200*3f1979aaSAndroid Build Coastguard Worker #endif
201*3f1979aaSAndroid Build Coastguard Worker 
#endif /* PF_NEON_DBL_H */
203*3f1979aaSAndroid Build Coastguard Worker 
204