/*
  Copyright (c) 2020  Dario Mambro ( [email protected] )
*/

/* Copyright (c) 2013  Julien Pommier ( [email protected] )

   Redistribution and use of the Software in source and binary forms,
   with or without modification, is permitted provided that the
   following conditions are met:

   - Neither the names of NCAR's Computational and Information Systems
   Laboratory, the University Corporation for Atmospheric Research,
   nor the names of its sponsors or contributors may be used to
   endorse or promote products derived from this Software without
   specific prior written permission.

   - Redistributions of source code must retain the above copyright
   notices, this list of conditions, and the disclaimer below.

   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions, and the disclaimer below in the
   documentation and/or other materials provided with the
   distribution.

   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
   SOFTWARE.
*/

#ifndef PF_AVX_DBL_H
#define PF_AVX_DBL_H

/*
  vector support macros: the rest of the code is independent of
  AVX -- adding support for other platforms with 4-element
  vectors should be limited to these macros
*/


/*
  AVX support macros
*/
#if !defined(SIMD_SZ) && !defined(PFFFT_SIMD_DISABLE) && !defined(PFFFT_AVX_DISABLE) && defined(__AVX__)
#pragma message( __FILE__ ": AVX macros are defined" )

#include <immintrin.h>
#include <stdint.h> /* uintptr_t, used by VALIGNED below */
typedef __m256d v4sf;

/* 4 doubles per SIMD vector */
# define SIMD_SZ 4

typedef union v4sf_union {
  v4sf   v;
  double f[SIMD_SZ];
} v4sf_union;

# define VARCH "AVX"
# define VREQUIRES_ALIGN 1
# define VZERO() _mm256_setzero_pd()
# define VMUL(a,b) _mm256_mul_pd(a,b)
# define VADD(a,b) _mm256_add_pd(a,b)
# define VMADD(a,b,c) _mm256_add_pd(_mm256_mul_pd(a,b), c)
# define VSUB(a,b) _mm256_sub_pd(a,b)
# define LD_PS1(p) _mm256_set1_pd(p)
# define VLOAD_UNALIGNED(ptr) _mm256_loadu_pd(ptr)
# define VLOAD_ALIGNED(ptr)   _mm256_load_pd(ptr)

/* INTERLEAVE2(in1, in2, out1, out2) pseudo code:
   out1 = [ in1[0], in2[0], in1[1], in2[1] ]
   out2 = [ in1[2], in2[2], in1[3], in2[3] ]
*/
# define INTERLEAVE2(in1, in2, out1, out2) {                          \
    __m128d low1__  = _mm256_castpd256_pd128(in1);                    \
    __m128d low2__  = _mm256_castpd256_pd128(in2);                    \
    __m128d high1__ = _mm256_extractf128_pd(in1, 1);                  \
    __m128d high2__ = _mm256_extractf128_pd(in2, 1);                  \
    __m256d tmp__ = _mm256_insertf128_pd(                             \
        _mm256_castpd128_pd256(_mm_shuffle_pd(low1__, low2__, 0)),    \
        _mm_shuffle_pd(low1__, low2__, 3),                            \
        1);                                                           \
    out2 = _mm256_insertf128_pd(                                      \
        _mm256_castpd128_pd256(_mm_shuffle_pd(high1__, high2__, 0)),  \
        _mm_shuffle_pd(high1__, high2__, 3),                          \
        1);                                                           \
    out1 = tmp__;                                                     \
  }
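/* Illustrative only: a scalar reference for INTERLEAVE2, restating the pseudo
   code above through v4sf_union lanes. It can help when checking the
   shuffle-based macro or when porting it to another 4-wide vector ISA. The
   helper name interleave2_ref__ is hypothetical and the block is kept out of
   the build with #if 0. */
#if 0
static void interleave2_ref__(v4sf in1, v4sf in2, v4sf *out1, v4sf *out2) {
  v4sf_union a, b, r1, r2;
  a.v = in1;
  b.v = in2;
  /* out1 = [ in1[0], in2[0], in1[1], in2[1] ] */
  r1.f[0] = a.f[0]; r1.f[1] = b.f[0]; r1.f[2] = a.f[1]; r1.f[3] = b.f[1];
  /* out2 = [ in1[2], in2[2], in1[3], in2[3] ] */
  r2.f[0] = a.f[2]; r2.f[1] = b.f[2]; r2.f[2] = a.f[3]; r2.f[3] = b.f[3];
  *out1 = r1.v;
  *out2 = r2.v;
}
#endif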
/* UNINTERLEAVE2(in1, in2, out1, out2) pseudo code:
   out1 = [ in1[0], in1[2], in2[0], in2[2] ]
   out2 = [ in1[1], in1[3], in2[1], in2[3] ]
*/
# define UNINTERLEAVE2(in1, in2, out1, out2) {                        \
    __m128d low1__  = _mm256_castpd256_pd128(in1);                    \
    __m128d low2__  = _mm256_castpd256_pd128(in2);                    \
    __m128d high1__ = _mm256_extractf128_pd(in1, 1);                  \
    __m128d high2__ = _mm256_extractf128_pd(in2, 1);                  \
    __m256d tmp__ = _mm256_insertf128_pd(                             \
        _mm256_castpd128_pd256(_mm_shuffle_pd(low1__, high1__, 0)),   \
        _mm_shuffle_pd(low2__, high2__, 0),                           \
        1);                                                           \
    out2 = _mm256_insertf128_pd(                                      \
        _mm256_castpd128_pd256(_mm_shuffle_pd(low1__, high1__, 3)),   \
        _mm_shuffle_pd(low2__, high2__, 3),                           \
        1);                                                           \
    out1 = tmp__;                                                     \
  }

# define VTRANSPOSE4(row0, row1, row2, row3) {                        \
    __m256d tmp3, tmp2, tmp1, tmp0;                                   \
                                                                      \
    tmp0 = _mm256_shuffle_pd((row0),(row1), 0x0);                     \
    tmp2 = _mm256_shuffle_pd((row0),(row1), 0xF);                     \
    tmp1 = _mm256_shuffle_pd((row2),(row3), 0x0);                     \
    tmp3 = _mm256_shuffle_pd((row2),(row3), 0xF);                     \
                                                                      \
    (row0) = _mm256_permute2f128_pd(tmp0, tmp1, 0x20);                \
    (row1) = _mm256_permute2f128_pd(tmp2, tmp3, 0x20);                \
    (row2) = _mm256_permute2f128_pd(tmp0, tmp1, 0x31);                \
    (row3) = _mm256_permute2f128_pd(tmp2, tmp3, 0x31);                \
  }

/* VSWAPHL(a, b) pseudo code:
   return [ b[0], b[1], a[2], a[3] ]
*/
# define VSWAPHL(a,b) \
   _mm256_insertf128_pd(_mm256_castpd128_pd256(_mm256_castpd256_pd128(b)), _mm256_extractf128_pd(a, 1), 1)

/* reverse/flip all doubles */
# define VREV_S(a) _mm256_insertf128_pd(_mm256_castpd128_pd256(_mm_permute_pd(_mm256_extractf128_pd(a, 1), 1)), _mm_permute_pd(_mm256_castpd256_pd128(a), 1), 1)

/* reverse/flip complex doubles */
# define VREV_C(a) _mm256_insertf128_pd(_mm256_castpd128_pd256(_mm256_extractf128_pd(a, 1)), _mm256_castpd256_pd128(a), 1)

# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x1F) == 0)

#endif

#endif /* PF_AVX_DBL_H */
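/* Illustrative only: a minimal, hypothetical test program sketching how the
   macros above are typically exercised -- load four rows of doubles, transpose
   them in registers with VTRANSPOSE4, and read the lanes back through
   v4sf_union. It assumes this header is available as "pf_avx_dbl.h" and an
   AVX-enabled build (e.g. gcc -mavx); the block is kept out of the build
   with #if 0. */
#if 0
#include <stdio.h>
#include "pf_avx_dbl.h"

int main(void) {
  double m[4][4];
  v4sf row[4];
  int i, j;

  /* fill m[i][j] with a recognizable value, e.g. m[2][3] = 23 */
  for (i = 0; i < 4; ++i)
    for (j = 0; j < 4; ++j)
      m[i][j] = 10.0 * i + j;

  /* stack arrays are not guaranteed to be 32-byte aligned, so use the
     unaligned load */
  for (i = 0; i < 4; ++i)
    row[i] = VLOAD_UNALIGNED(m[i]);

  VTRANSPOSE4(row[0], row[1], row[2], row[3]);

  for (i = 0; i < 4; ++i) {
    v4sf_union u;
    u.v = row[i];
    /* row i now holds column i of m: "0 10 20 30" for i = 0, and so on */
    printf("%g %g %g %g\n", u.f[0], u.f[1], u.f[2], u.f[3]);
  }
  return 0;
}
#endif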