1*3f1979aaSAndroid Build Coastguard Worker /*
2*3f1979aaSAndroid Build Coastguard Worker Copyright (c) 2020 Dario Mambro ( [email protected] )
3*3f1979aaSAndroid Build Coastguard Worker */
4*3f1979aaSAndroid Build Coastguard Worker
5*3f1979aaSAndroid Build Coastguard Worker /* Copyright (c) 2013 Julien Pommier ( [email protected] )
6*3f1979aaSAndroid Build Coastguard Worker
7*3f1979aaSAndroid Build Coastguard Worker Redistribution and use of the Software in source and binary forms,
8*3f1979aaSAndroid Build Coastguard Worker with or without modification, is permitted provided that the
9*3f1979aaSAndroid Build Coastguard Worker following conditions are met:
10*3f1979aaSAndroid Build Coastguard Worker
11*3f1979aaSAndroid Build Coastguard Worker - Neither the names of NCAR's Computational and Information Systems
12*3f1979aaSAndroid Build Coastguard Worker Laboratory, the University Corporation for Atmospheric Research,
13*3f1979aaSAndroid Build Coastguard Worker nor the names of its sponsors or contributors may be used to
14*3f1979aaSAndroid Build Coastguard Worker endorse or promote products derived from this Software without
15*3f1979aaSAndroid Build Coastguard Worker specific prior written permission.
16*3f1979aaSAndroid Build Coastguard Worker
17*3f1979aaSAndroid Build Coastguard Worker - Redistributions of source code must retain the above copyright
18*3f1979aaSAndroid Build Coastguard Worker notices, this list of conditions, and the disclaimer below.
19*3f1979aaSAndroid Build Coastguard Worker
20*3f1979aaSAndroid Build Coastguard Worker - Redistributions in binary form must reproduce the above copyright
21*3f1979aaSAndroid Build Coastguard Worker notice, this list of conditions, and the disclaimer below in the
22*3f1979aaSAndroid Build Coastguard Worker documentation and/or other materials provided with the
23*3f1979aaSAndroid Build Coastguard Worker distribution.
24*3f1979aaSAndroid Build Coastguard Worker
25*3f1979aaSAndroid Build Coastguard Worker THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
26*3f1979aaSAndroid Build Coastguard Worker EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
27*3f1979aaSAndroid Build Coastguard Worker MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
28*3f1979aaSAndroid Build Coastguard Worker NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
29*3f1979aaSAndroid Build Coastguard Worker HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
30*3f1979aaSAndroid Build Coastguard Worker EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
31*3f1979aaSAndroid Build Coastguard Worker ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
32*3f1979aaSAndroid Build Coastguard Worker CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
33*3f1979aaSAndroid Build Coastguard Worker SOFTWARE.
34*3f1979aaSAndroid Build Coastguard Worker */
35*3f1979aaSAndroid Build Coastguard Worker
36*3f1979aaSAndroid Build Coastguard Worker #ifndef PF_NEON_DBL_H
37*3f1979aaSAndroid Build Coastguard Worker #define PF_NEON_DBL_H
38*3f1979aaSAndroid Build Coastguard Worker
39*3f1979aaSAndroid Build Coastguard Worker /*
40*3f1979aaSAndroid Build Coastguard Worker NEON 64bit support macros
41*3f1979aaSAndroid Build Coastguard Worker */
42*3f1979aaSAndroid Build Coastguard Worker #if !defined(PFFFT_SIMD_DISABLE) && defined(PFFFT_ENABLE_NEON) && (defined(__aarch64__) || defined(__arm64__))
43*3f1979aaSAndroid Build Coastguard Worker
44*3f1979aaSAndroid Build Coastguard Worker #pragma message (__FILE__ ": NEON (from AVX) macros are defined" )
45*3f1979aaSAndroid Build Coastguard Worker
46*3f1979aaSAndroid Build Coastguard Worker #include "pf_neon_double_from_avx.h"
47*3f1979aaSAndroid Build Coastguard Worker typedef __m256d v4sf;
48*3f1979aaSAndroid Build Coastguard Worker
49*3f1979aaSAndroid Build Coastguard Worker /* 4 doubles by simd vector */
50*3f1979aaSAndroid Build Coastguard Worker # define SIMD_SZ 4
51*3f1979aaSAndroid Build Coastguard Worker
52*3f1979aaSAndroid Build Coastguard Worker typedef union v4sf_union {
53*3f1979aaSAndroid Build Coastguard Worker v4sf v;
54*3f1979aaSAndroid Build Coastguard Worker double f[SIMD_SZ];
55*3f1979aaSAndroid Build Coastguard Worker } v4sf_union;
56*3f1979aaSAndroid Build Coastguard Worker
/* Back-end identification. */
#define VARCH            "NEON"
#define VREQUIRES_ALIGN  1

/* Element-wise arithmetic on v4sf values, forwarded to the _mm256_*_pd
   routines. VMADD yields a*b + c, written as a multiply followed by an add. */
#define VZERO()       _mm256_setzero_pd()
#define VADD(a,b)     _mm256_add_pd(a,b)
#define VSUB(a,b)     _mm256_sub_pd(a,b)
#define VMUL(a,b)     _mm256_mul_pd(a,b)
#define VMADD(a,b,c)  _mm256_add_pd(_mm256_mul_pd(a,b), c)

/* LD_PS1 forwards a scalar to _mm256_set1_pd; the VLOAD_* macros forward a
   pointer to the unaligned / aligned load routine respectively. */
#define LD_PS1(p)             _mm256_set1_pd(p)
#define VLOAD_UNALIGNED(ptr)  _mm256_loadu_pd(ptr)
#define VLOAD_ALIGNED(ptr)    _mm256_load_pd(ptr)
67*3f1979aaSAndroid Build Coastguard Worker
_mm256_insertf128_pd_1(__m256d a,__m128d b)68*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m256d _mm256_insertf128_pd_1(__m256d a, __m128d b)
69*3f1979aaSAndroid Build Coastguard Worker {
70*3f1979aaSAndroid Build Coastguard Worker __m256d res;
71*3f1979aaSAndroid Build Coastguard Worker res.vect_f64[0] = a.vect_f64[0];
72*3f1979aaSAndroid Build Coastguard Worker res.vect_f64[1] = b;
73*3f1979aaSAndroid Build Coastguard Worker return res;
74*3f1979aaSAndroid Build Coastguard Worker }
75*3f1979aaSAndroid Build Coastguard Worker
_mm_shuffle_pd_00(__m128d a,__m128d b)76*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128d _mm_shuffle_pd_00(__m128d a, __m128d b)
77*3f1979aaSAndroid Build Coastguard Worker {
78*3f1979aaSAndroid Build Coastguard Worker float64x1_t al = vget_low_f64(a);
79*3f1979aaSAndroid Build Coastguard Worker float64x1_t bl = vget_low_f64(b);
80*3f1979aaSAndroid Build Coastguard Worker return vcombine_f64(al, bl);
81*3f1979aaSAndroid Build Coastguard Worker }
82*3f1979aaSAndroid Build Coastguard Worker
_mm_shuffle_pd_11(__m128d a,__m128d b)83*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m128d _mm_shuffle_pd_11(__m128d a, __m128d b)
84*3f1979aaSAndroid Build Coastguard Worker {
85*3f1979aaSAndroid Build Coastguard Worker float64x1_t ah = vget_high_f64(a);
86*3f1979aaSAndroid Build Coastguard Worker float64x1_t bh = vget_high_f64(b);
87*3f1979aaSAndroid Build Coastguard Worker return vcombine_f64(ah, bh);
88*3f1979aaSAndroid Build Coastguard Worker }
89*3f1979aaSAndroid Build Coastguard Worker
_mm256_shuffle_pd_00(__m256d a,__m256d b)90*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m256d _mm256_shuffle_pd_00(__m256d a, __m256d b)
91*3f1979aaSAndroid Build Coastguard Worker {
92*3f1979aaSAndroid Build Coastguard Worker __m256d res;
93*3f1979aaSAndroid Build Coastguard Worker res.vect_f64[0] = _mm_shuffle_pd_00(a.vect_f64[0],b.vect_f64[0]);
94*3f1979aaSAndroid Build Coastguard Worker res.vect_f64[1] = _mm_shuffle_pd_00(a.vect_f64[1],b.vect_f64[1]);
95*3f1979aaSAndroid Build Coastguard Worker return res;
96*3f1979aaSAndroid Build Coastguard Worker }
97*3f1979aaSAndroid Build Coastguard Worker
_mm256_shuffle_pd_11(__m256d a,__m256d b)98*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m256d _mm256_shuffle_pd_11(__m256d a, __m256d b)
99*3f1979aaSAndroid Build Coastguard Worker {
100*3f1979aaSAndroid Build Coastguard Worker __m256d res;
101*3f1979aaSAndroid Build Coastguard Worker res.vect_f64[0] = _mm_shuffle_pd_11(a.vect_f64[0],b.vect_f64[0]);
102*3f1979aaSAndroid Build Coastguard Worker res.vect_f64[1] = _mm_shuffle_pd_11(a.vect_f64[1],b.vect_f64[1]);
103*3f1979aaSAndroid Build Coastguard Worker return res;
104*3f1979aaSAndroid Build Coastguard Worker }
105*3f1979aaSAndroid Build Coastguard Worker
_mm256_permute2f128_pd_0x20(__m256d a,__m256d b)106*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m256d _mm256_permute2f128_pd_0x20(__m256d a, __m256d b) {
107*3f1979aaSAndroid Build Coastguard Worker __m256d res;
108*3f1979aaSAndroid Build Coastguard Worker res.vect_f64[0] = a.vect_f64[0];
109*3f1979aaSAndroid Build Coastguard Worker res.vect_f64[1] = b.vect_f64[0];
110*3f1979aaSAndroid Build Coastguard Worker return res;
111*3f1979aaSAndroid Build Coastguard Worker }
112*3f1979aaSAndroid Build Coastguard Worker
113*3f1979aaSAndroid Build Coastguard Worker
_mm256_permute2f128_pd_0x31(__m256d a,__m256d b)114*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m256d _mm256_permute2f128_pd_0x31(__m256d a, __m256d b)
115*3f1979aaSAndroid Build Coastguard Worker {
116*3f1979aaSAndroid Build Coastguard Worker __m256d res;
117*3f1979aaSAndroid Build Coastguard Worker res.vect_f64[0] = a.vect_f64[1];
118*3f1979aaSAndroid Build Coastguard Worker res.vect_f64[1] = b.vect_f64[1];
119*3f1979aaSAndroid Build Coastguard Worker return res;
120*3f1979aaSAndroid Build Coastguard Worker }
121*3f1979aaSAndroid Build Coastguard Worker
_mm256_reverse(__m256d x)122*3f1979aaSAndroid Build Coastguard Worker FORCE_INLINE __m256d _mm256_reverse(__m256d x)
123*3f1979aaSAndroid Build Coastguard Worker {
124*3f1979aaSAndroid Build Coastguard Worker __m256d res;
125*3f1979aaSAndroid Build Coastguard Worker float64x2_t low = x.vect_f64[0];
126*3f1979aaSAndroid Build Coastguard Worker float64x2_t high = x.vect_f64[1];
127*3f1979aaSAndroid Build Coastguard Worker float64x1_t a = vget_low_f64(low);
128*3f1979aaSAndroid Build Coastguard Worker float64x1_t b = vget_high_f64(low);
129*3f1979aaSAndroid Build Coastguard Worker float64x1_t c = vget_low_f64(high);
130*3f1979aaSAndroid Build Coastguard Worker float64x1_t d = vget_high_f64(high);
131*3f1979aaSAndroid Build Coastguard Worker res.vect_f64[0] = vcombine_f64(d, c);
132*3f1979aaSAndroid Build Coastguard Worker res.vect_f64[1] = vcombine_f64(b, a);
133*3f1979aaSAndroid Build Coastguard Worker return res;
134*3f1979aaSAndroid Build Coastguard Worker }
135*3f1979aaSAndroid Build Coastguard Worker
/* INTERLEAVE2(in1, in2, out1, out2) pseudo code:
     out1 = [ in1[0], in2[0], in1[1], in2[1] ]
     out2 = [ in1[2], in2[2], in1[3], in2[3] ]
   Both inputs are split into halves before either output is written, and
   out2 is assigned before out1. in1 and in2 are each expanded twice.
*/
#define INTERLEAVE2(in1, in2, out1, out2) {                              \
    __m128d a_lo__ = _mm256_castpd256_pd128(in1);                        \
    __m128d b_lo__ = _mm256_castpd256_pd128(in2);                        \
    __m128d a_hi__ = _mm256_extractf128_pd(in1, 1);                      \
    __m128d b_hi__ = _mm256_extractf128_pd(in2, 1);                      \
    __m256d first__ = _mm256_insertf128_pd_1(                            \
        _mm256_castpd128_pd256(_mm_shuffle_pd_00(a_lo__, b_lo__)),       \
        _mm_shuffle_pd_11(a_lo__, b_lo__));                              \
    __m256d second__ = _mm256_insertf128_pd_1(                           \
        _mm256_castpd128_pd256(_mm_shuffle_pd_00(a_hi__, b_hi__)),       \
        _mm_shuffle_pd_11(a_hi__, b_hi__));                              \
    out2 = second__;                                                     \
    out1 = first__;                                                      \
}
153*3f1979aaSAndroid Build Coastguard Worker
/* UNINTERLEAVE2(in1, in2, out1, out2) pseudo code:
     out1 = [ in1[0], in1[2], in2[0], in2[2] ]
     out2 = [ in1[1], in1[3], in2[1], in2[3] ]
   Both inputs are split into halves before either output is written, and
   out2 is assigned before out1. in1 and in2 are each expanded twice.
*/
#define UNINTERLEAVE2(in1, in2, out1, out2) {                            \
    __m128d a_lo__ = _mm256_castpd256_pd128(in1);                        \
    __m128d b_lo__ = _mm256_castpd256_pd128(in2);                        \
    __m128d a_hi__ = _mm256_extractf128_pd(in1, 1);                      \
    __m128d b_hi__ = _mm256_extractf128_pd(in2, 1);                      \
    __m256d evens__ = _mm256_insertf128_pd_1(                            \
        _mm256_castpd128_pd256(_mm_shuffle_pd_00(a_lo__, a_hi__)),       \
        _mm_shuffle_pd_00(b_lo__, b_hi__));                              \
    __m256d odds__ = _mm256_insertf128_pd_1(                             \
        _mm256_castpd128_pd256(_mm_shuffle_pd_11(a_lo__, a_hi__)),       \
        _mm_shuffle_pd_11(b_lo__, b_hi__));                              \
    out2 = odds__;                                                       \
    out1 = evens__;                                                      \
}
171*3f1979aaSAndroid Build Coastguard Worker
/* VTRANSPOSE4(row0, row1, row2, row3): in-place 4x4 transpose of four
   vectors of doubles. Afterwards, for k = 0..3:
     rowK = [ row0[k], row1[k], row2[k], row3[k] ]   (values before the call)
   where element k is lane (k % 2) of vect_f64[k / 2].
   The four arguments must be assignable; all of them are read into the
   temporaries before any of them is written. The temporaries carry a "__"
   suffix, like the locals of INTERLEAVE2/UNINTERLEAVE2, so that an argument
   named tmp0..tmp3 is not captured by the macro's own declarations.
*/
#define VTRANSPOSE4(row0, row1, row2, row3) {                            \
    __m256d tmp3__, tmp2__, tmp1__, tmp0__;                              \
                                                                         \
    /* pair up equal lanes of (row0,row1) and of (row2,row3) */          \
    tmp0__ = _mm256_shuffle_pd_00((row0),(row1));                        \
    tmp2__ = _mm256_shuffle_pd_11((row0),(row1));                        \
    tmp1__ = _mm256_shuffle_pd_00((row2),(row3));                        \
    tmp3__ = _mm256_shuffle_pd_11((row2),(row3));                        \
                                                                         \
    /* stitch the matching halves of the pairs back together */          \
    (row0) = _mm256_permute2f128_pd_0x20(tmp0__, tmp1__);                \
    (row1) = _mm256_permute2f128_pd_0x20(tmp2__, tmp3__);                \
    (row2) = _mm256_permute2f128_pd_0x31(tmp0__, tmp1__);                \
    (row3) = _mm256_permute2f128_pd_0x31(tmp2__, tmp3__);                \
}
185*3f1979aaSAndroid Build Coastguard Worker
/* VSWAPHL(a, b) pseudo code:
     return [ b[0], b[1], a[2], a[3] ]
   i.e. the first half of b followed by the second half of a.
*/
#define VSWAPHL(a,b)                                          \
    _mm256_insertf128_pd_1(                                   \
        _mm256_castpd128_pd256(_mm256_castpd256_pd128(b)),    \
        _mm256_extractf128_pd(a, 1))
191*3f1979aaSAndroid Build Coastguard Worker
/* reverse/flip all four doubles: [ a[3], a[2], a[1], a[0] ] */
#define VREV_S(a)    _mm256_reverse(a)
194*3f1979aaSAndroid Build Coastguard Worker
/* reverse/flip complex doubles: the two halves trade places while the order
   inside each half is kept, giving [ a[2], a[3], a[0], a[1] ].
   The argument is expanded twice. */
#define VREV_C(a)                                             \
    _mm256_insertf128_pd_1(                                   \
        _mm256_castpd128_pd256(_mm256_extractf128_pd(a, 1)),  \
        _mm256_castpd256_pd128(a))
197*3f1979aaSAndroid Build Coastguard Worker
/* Non-zero when the address held by ptr is a multiple of 32 bytes
   (its five lowest bits are all clear). */
#define VALIGNED(ptr)    ((((uintptr_t)(ptr)) & (uintptr_t)31) == 0)
199*3f1979aaSAndroid Build Coastguard Worker
200*3f1979aaSAndroid Build Coastguard Worker #endif
201*3f1979aaSAndroid Build Coastguard Worker
#endif /* PF_NEON_DBL_H */
203*3f1979aaSAndroid Build Coastguard Worker
204