/*
   Copyright (c) 2020  Dario Mambro ( [email protected] )
*/

/* Copyright (c) 2013  Julien Pommier ( [email protected] )

   Redistribution and use of the Software in source and binary forms,
   with or without modification, is permitted provided that the
   following conditions are met:

   - Neither the names of NCAR's Computational and Information Systems
   Laboratory, the University Corporation for Atmospheric Research,
   nor the names of its sponsors or contributors may be used to
   endorse or promote products derived from this Software without
   specific prior written permission.

   - Redistributions of source code must retain the above copyright
   notices, this list of conditions, and the disclaimer below.

   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions, and the disclaimer below in the
   documentation and/or other materials provided with the
   distribution.

   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
   SOFTWARE.
*/
35 
36 #ifndef PF_AVX_DBL_H
37 #define PF_AVX_DBL_H
38 
39 /*
40    vector support macros: the rest of the code is independant of
41    AVX -- adding support for other platforms with 4-element
42    vectors should be limited to these macros
43 */
44 

/*
  AVX support macros
*/
#if !defined(SIMD_SZ) && !defined(PFFFT_SIMD_DISABLE) && !defined(PFFFT_AVX_DISABLE) && defined(__AVX__)
#pragma message( __FILE__ ": AVX macros are defined" )

#include <immintrin.h>
#include <stdint.h>   /* uintptr_t, used by VALIGNED below */
typedef __m256d v4sf;

/* 4 doubles per SIMD vector */
#  define SIMD_SZ 4

typedef union v4sf_union {
  v4sf  v;
  double f[SIMD_SZ];
} v4sf_union;

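/* Illustrative sketch (not part of this header): v4sf_union gives scalar
   access to the four lanes of a vector, e.g. for debugging or scalar
   fallback code. The variable names below are hypothetical.

     v4sf_union u;
     u.v = _mm256_set1_pd(1.5);   // broadcast 1.5 into all four lanes
     double third = u.f[2];       // read lane 2 back as a plain double
*/
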
#  define VARCH "AVX"
#  define VREQUIRES_ALIGN 1
#  define VZERO() _mm256_setzero_pd()
#  define VMUL(a,b) _mm256_mul_pd(a,b)
#  define VADD(a,b) _mm256_add_pd(a,b)
#  define VMADD(a,b,c) _mm256_add_pd(_mm256_mul_pd(a,b), c)
#  define VSUB(a,b) _mm256_sub_pd(a,b)
#  define LD_PS1(p) _mm256_set1_pd(p)
#  define VLOAD_UNALIGNED(ptr)  _mm256_loadu_pd(ptr)
#  define VLOAD_ALIGNED(ptr)    _mm256_load_pd(ptr)

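/* Usage sketch (illustrative only, not part of the library): the wrappers
   above combine like ordinary arithmetic. The loop below accumulates
   acc[i] += a[i]*b[i]; the function name is hypothetical, n is assumed to be
   a multiple of SIMD_SZ, and all pointers are assumed 32-byte aligned.

     static void example_madd(const double *a, const double *b,
                              double *acc, int n) {
       int i;
       for (i = 0; i < n; i += SIMD_SZ) {
         v4sf va   = VLOAD_ALIGNED(a + i);
         v4sf vb   = VLOAD_ALIGNED(b + i);
         v4sf vacc = VLOAD_ALIGNED(acc + i);
         _mm256_store_pd(acc + i, VMADD(va, vb, vacc));
       }
     }
*/
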
/* INTERLEAVE2 (in1, in2, out1, out2) pseudo code:
out1 = [ in1[0], in2[0], in1[1], in2[1] ]
out2 = [ in1[2], in2[2], in1[3], in2[3] ]
*/
#  define INTERLEAVE2(in1, in2, out1, out2) {							\
	__m128d low1__ = _mm256_castpd256_pd128(in1);						\
	__m128d low2__ = _mm256_castpd256_pd128(in2);						\
	__m128d high1__ = _mm256_extractf128_pd(in1, 1);					\
	__m128d high2__ = _mm256_extractf128_pd(in2, 1);					\
	__m256d tmp__ = _mm256_insertf128_pd(								\
		_mm256_castpd128_pd256(_mm_shuffle_pd(low1__, low2__, 0)),		\
		_mm_shuffle_pd(low1__, low2__, 3),								\
		1);																\
	out2 = _mm256_insertf128_pd(										\
		_mm256_castpd128_pd256(_mm_shuffle_pd(high1__, high2__, 0)),	\
		_mm_shuffle_pd(high1__, high2__, 3),							\
		1);																\
	out1 = tmp__;														\
}

/* UNINTERLEAVE2(in1, in2, out1, out2) pseudo code:
out1 = [ in1[0], in1[2], in2[0], in2[2] ]
out2 = [ in1[1], in1[3], in2[1], in2[3] ]
*/
#  define UNINTERLEAVE2(in1, in2, out1, out2) {							\
	__m128d low1__ = _mm256_castpd256_pd128(in1);						\
	__m128d low2__ = _mm256_castpd256_pd128(in2);						\
	__m128d high1__ = _mm256_extractf128_pd(in1, 1);					\
	__m128d high2__ = _mm256_extractf128_pd(in2, 1);					\
	__m256d tmp__ = _mm256_insertf128_pd(								\
		_mm256_castpd128_pd256(_mm_shuffle_pd(low1__, high1__, 0)),		\
		_mm_shuffle_pd(low2__, high2__, 0),								\
		1);																\
	out2 = _mm256_insertf128_pd(										\
		_mm256_castpd128_pd256(_mm_shuffle_pd(low1__, high1__, 3)),		\
		_mm_shuffle_pd(low2__, high2__, 3),								\
		1);																\
	out1 = tmp__;														\
}

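/* Worked example (values only, illustrative): with in1 = [0,1,2,3] and
   in2 = [4,5,6,7], INTERLEAVE2 yields out1 = [0,4,1,5] and out2 = [2,6,3,7];
   applying UNINTERLEAVE2 to those outputs recovers the original pair, i.e.
   the two macros are inverses (interleaved vs. de-interleaved pair layout). */
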
/* VTRANSPOSE4(row0, row1, row2, row3) pseudo code:
transposes, in place, the 4x4 matrix of doubles whose rows are row0..row3:
row0 = [ row0[0], row1[0], row2[0], row3[0] ], and so on.
*/
#  define VTRANSPOSE4(row0, row1, row2, row3) {				\
        __m256d tmp3, tmp2, tmp1, tmp0;                     \
                                                            \
        tmp0 = _mm256_shuffle_pd((row0),(row1), 0x0);       \
        tmp2 = _mm256_shuffle_pd((row0),(row1), 0xF);       \
        tmp1 = _mm256_shuffle_pd((row2),(row3), 0x0);       \
        tmp3 = _mm256_shuffle_pd((row2),(row3), 0xF);       \
                                                            \
        (row0) = _mm256_permute2f128_pd(tmp0, tmp1, 0x20);	\
        (row1) = _mm256_permute2f128_pd(tmp2, tmp3, 0x20);  \
        (row2) = _mm256_permute2f128_pd(tmp0, tmp1, 0x31);  \
        (row3) = _mm256_permute2f128_pd(tmp2, tmp3, 0x31);  \
    }
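
/* Usage sketch (illustrative only): transposing a 4x4 block of doubles held
   in four vectors. The matrix m and its layout are hypothetical; m is
   assumed to be a 32-byte aligned, row-major double[4][4].

     v4sf r0 = VLOAD_ALIGNED(&m[0][0]);
     v4sf r1 = VLOAD_ALIGNED(&m[1][0]);
     v4sf r2 = VLOAD_ALIGNED(&m[2][0]);
     v4sf r3 = VLOAD_ALIGNED(&m[3][0]);
     VTRANSPOSE4(r0, r1, r2, r3);   // r0..r3 now hold the columns of m
*/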

/* VSWAPHL(a, b) pseudo code:
return [ b[0], b[1], a[2], a[3] ]
*/
#  define VSWAPHL(a,b)	\
   _mm256_insertf128_pd(_mm256_castpd128_pd256(_mm256_castpd256_pd128(b)), _mm256_extractf128_pd(a, 1), 1)

/* reverse/flip all doubles: VREV_S(a) = [ a[3], a[2], a[1], a[0] ] */
#  define VREV_S(a)    _mm256_insertf128_pd(_mm256_castpd128_pd256(_mm_permute_pd(_mm256_extractf128_pd(a, 1),1)), _mm_permute_pd(_mm256_castpd256_pd128(a), 1), 1)

/* reverse/flip complex doubles: VREV_C(a) = [ a[2], a[3], a[0], a[1] ] */
#  define VREV_C(a)    _mm256_insertf128_pd(_mm256_castpd128_pd256(_mm256_extractf128_pd(a, 1)), _mm256_castpd256_pd128(a), 1)

#  define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x1F) == 0)
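
/* Usage sketch (illustrative only): VALIGNED checks the 32-byte alignment
   that VLOAD_ALIGNED / _mm256_load_pd require; pointers that fail the check
   must go through VLOAD_UNALIGNED. The pointer p below is hypothetical.

     v4sf v = VALIGNED(p) ? VLOAD_ALIGNED(p) : VLOAD_UNALIGNED(p);
*/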

#endif

#endif /* PF_AVX_DBL_H */