/******************************************************************************
 *
 *  Copyright 2022 Google LLC
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at:
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 *
 ******************************************************************************/

#if __ARM_NEON && __ARM_ARCH_ISA_A64

#ifndef TEST_NEON
#include <arm_neon.h>
#endif /* TEST_NEON */


/**
 * FFT 5 Points
 * The number of interleaved transforms `n` is assumed to be even
 */
#ifndef fft_5
#define fft_5 neon_fft_5
LC3_HOT static inline void neon_fft_5(
    const struct lc3_complex *x, struct lc3_complex *y, int n)
{
    static const union { float f[2]; uint64_t u64; }
        __cos1 = { {  0.3090169944,  0.3090169944 } },
        __cos2 = { { -0.8090169944, -0.8090169944 } },
        __sin1 = { {  0.9510565163, -0.9510565163 } },
        __sin2 = { {  0.5877852523, -0.5877852523 } };

    float32x2_t sin1 = vcreate_f32(__sin1.u64);
    float32x2_t sin2 = vcreate_f32(__sin2.u64);
    float32x2_t cos1 = vcreate_f32(__cos1.u64);
    float32x2_t cos2 = vcreate_f32(__cos2.u64);

    float32x4_t sin1q = vcombine_f32(sin1, sin1);
    float32x4_t sin2q = vcombine_f32(sin2, sin2);
    float32x4_t cos1q = vcombine_f32(cos1, cos1);
    float32x4_t cos2q = vcombine_f32(cos2, cos2);

    for (int i = 0; i < n; i += 2, x += 2, y += 10) {

        float32x4_t y0, y1, y2, y3, y4;

        float32x4_t x0 = vld1q_f32( (float *)(x + 0*n) );
        float32x4_t x1 = vld1q_f32( (float *)(x + 1*n) );
        float32x4_t x2 = vld1q_f32( (float *)(x + 2*n) );
        float32x4_t x3 = vld1q_f32( (float *)(x + 3*n) );
        float32x4_t x4 = vld1q_f32( (float *)(x + 4*n) );

        float32x4_t s14 = vaddq_f32(x1, x4);
        float32x4_t s23 = vaddq_f32(x2, x3);

        float32x4_t d14 = vrev64q_f32( vsubq_f32(x1, x4) );
        float32x4_t d23 = vrev64q_f32( vsubq_f32(x2, x3) );

        y0 = vaddq_f32( x0, vaddq_f32(s14, s23) );

        y4 = vfmaq_f32( x0, s14, cos1q );
        y4 = vfmaq_f32( y4, s23, cos2q );

        y1 = vfmaq_f32( y4, d14, sin1q );
        y1 = vfmaq_f32( y1, d23, sin2q );

        y4 = vfmsq_f32( y4, d14, sin1q );
        y4 = vfmsq_f32( y4, d23, sin2q );

        y3 = vfmaq_f32( x0, s14, cos2q );
        y3 = vfmaq_f32( y3, s23, cos1q );

        y2 = vfmaq_f32( y3, d14, sin2q );
        y2 = vfmsq_f32( y2, d23, sin1q );

        y3 = vfmsq_f32( y3, d14, sin2q );
        y3 = vfmaq_f32( y3, d23, sin1q );

        vst1_f32( (float *)(y + 0), vget_low_f32(y0) );
        vst1_f32( (float *)(y + 1), vget_low_f32(y1) );
        vst1_f32( (float *)(y + 2), vget_low_f32(y2) );
        vst1_f32( (float *)(y + 3), vget_low_f32(y3) );
        vst1_f32( (float *)(y + 4), vget_low_f32(y4) );

        vst1_f32( (float *)(y + 5), vget_high_f32(y0) );
        vst1_f32( (float *)(y + 6), vget_high_f32(y1) );
        vst1_f32( (float *)(y + 7), vget_high_f32(y2) );
        vst1_f32( (float *)(y + 8), vget_high_f32(y3) );
        vst1_f32( (float *)(y + 9), vget_high_f32(y4) );
    }
}
#endif /* fft_5 */

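/*
 * Reference sketch (not used by the codec): a plain scalar version of the
 * 5-point kernel above, one transform per iteration instead of two
 * interleaved. It assumes `struct lc3_complex` exposes `re` / `im` float
 * fields, as used by the surrounding fft code. The constants are cos/sin
 * of 2*pi/5 and 4*pi/5; the NEON version keeps the same terms, with
 * `vrev64q` swapping re/im so that the +/- sin contributions land on the
 * right lanes.
 */
static inline void ref_fft_5_sketch(
    const struct lc3_complex *x, struct lc3_complex *y, int n)
{
    const float cos1 =  0.3090169944f, cos2 = -0.8090169944f;
    const float sin1 =  0.9510565163f, sin2 =  0.5877852523f;

    for (int i = 0; i < n; i++, x++, y += 5) {

        struct lc3_complex s14 = { .re = x[1*n].re + x[4*n].re,
                                   .im = x[1*n].im + x[4*n].im };
        struct lc3_complex s23 = { .re = x[2*n].re + x[3*n].re,
                                   .im = x[2*n].im + x[3*n].im };

        struct lc3_complex d14 = { .re = x[1*n].re - x[4*n].re,
                                   .im = x[1*n].im - x[4*n].im };
        struct lc3_complex d23 = { .re = x[2*n].re - x[3*n].re,
                                   .im = x[2*n].im - x[3*n].im };

        y[0].re = x[0].re + s14.re + s23.re;
        y[0].im = x[0].im + s14.im + s23.im;

        y[1].re = x[0].re + s14.re*cos1 + s23.re*cos2 + d14.im*sin1 + d23.im*sin2;
        y[1].im = x[0].im + s14.im*cos1 + s23.im*cos2 - d14.re*sin1 - d23.re*sin2;

        y[2].re = x[0].re + s14.re*cos2 + s23.re*cos1 + d14.im*sin2 - d23.im*sin1;
        y[2].im = x[0].im + s14.im*cos2 + s23.im*cos1 - d14.re*sin2 + d23.re*sin1;

        y[3].re = x[0].re + s14.re*cos2 + s23.re*cos1 - d14.im*sin2 + d23.im*sin1;
        y[3].im = x[0].im + s14.im*cos2 + s23.im*cos1 + d14.re*sin2 - d23.re*sin1;

        y[4].re = x[0].re + s14.re*cos1 + s23.re*cos2 - d14.im*sin1 - d23.im*sin2;
        y[4].im = x[0].im + s14.im*cos1 + s23.im*cos2 + d14.re*sin1 + d23.re*sin2;
    }
}
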
/**
 * FFT Butterfly 3 Points
 */
#ifndef fft_bf3
#define fft_bf3 neon_fft_bf3
LC3_HOT static inline void neon_fft_bf3(
    const struct lc3_fft_bf3_twiddles *twiddles,
    const struct lc3_complex *x, struct lc3_complex *y, int n)
{
    int n3 = twiddles->n3;
    const struct lc3_complex (*w0_ptr)[2] = twiddles->t;
    const struct lc3_complex (*w1_ptr)[2] = w0_ptr + n3;
    const struct lc3_complex (*w2_ptr)[2] = w1_ptr + n3;

    const struct lc3_complex *x0_ptr = x;
    const struct lc3_complex *x1_ptr = x0_ptr + n*n3;
    const struct lc3_complex *x2_ptr = x1_ptr + n*n3;

    struct lc3_complex *y0_ptr = y;
    struct lc3_complex *y1_ptr = y0_ptr + n3;
    struct lc3_complex *y2_ptr = y1_ptr + n3;

    for (int j, i = 0; i < n; i++,
            y0_ptr += 3*n3, y1_ptr += 3*n3, y2_ptr += 3*n3) {

        /* --- Process by pair --- */

        for (j = 0; j < (n3 >> 1); j++,
                x0_ptr += 2, x1_ptr += 2, x2_ptr += 2) {

            float32x4_t x0 = vld1q_f32( (float *)x0_ptr );
            float32x4_t x1 = vld1q_f32( (float *)x1_ptr );
            float32x4_t x2 = vld1q_f32( (float *)x2_ptr );

            float32x4_t x1r = vtrn1q_f32( vrev64q_f32(vnegq_f32(x1)), x1 );
            float32x4_t x2r = vtrn1q_f32( vrev64q_f32(vnegq_f32(x2)), x2 );

            float32x4x2_t wn;
            float32x4_t yn;

            wn = vld2q_f32( (float *)(w0_ptr + 2*j) );

            yn = vfmaq_f32( x0, x1 , vtrn1q_f32(wn.val[0], wn.val[0]) );
            yn = vfmaq_f32( yn, x1r, vtrn1q_f32(wn.val[1], wn.val[1]) );
            yn = vfmaq_f32( yn, x2 , vtrn2q_f32(wn.val[0], wn.val[0]) );
            yn = vfmaq_f32( yn, x2r, vtrn2q_f32(wn.val[1], wn.val[1]) );
            vst1q_f32( (float *)(y0_ptr + 2*j), yn );

            wn = vld2q_f32( (float *)(w1_ptr + 2*j) );

            yn = vfmaq_f32( x0, x1 , vtrn1q_f32(wn.val[0], wn.val[0]) );
            yn = vfmaq_f32( yn, x1r, vtrn1q_f32(wn.val[1], wn.val[1]) );
            yn = vfmaq_f32( yn, x2 , vtrn2q_f32(wn.val[0], wn.val[0]) );
            yn = vfmaq_f32( yn, x2r, vtrn2q_f32(wn.val[1], wn.val[1]) );
            vst1q_f32( (float *)(y1_ptr + 2*j), yn );

            wn = vld2q_f32( (float *)(w2_ptr + 2*j) );

            yn = vfmaq_f32( x0, x1 , vtrn1q_f32(wn.val[0], wn.val[0]) );
            yn = vfmaq_f32( yn, x1r, vtrn1q_f32(wn.val[1], wn.val[1]) );
            yn = vfmaq_f32( yn, x2 , vtrn2q_f32(wn.val[0], wn.val[0]) );
            yn = vfmaq_f32( yn, x2r, vtrn2q_f32(wn.val[1], wn.val[1]) );
            vst1q_f32( (float *)(y2_ptr + 2*j), yn );

        }

        /* --- Last iteration --- */

        if (n3 & 1) {

            float32x2x2_t wn;
            float32x2_t yn;

            float32x2_t x0 = vld1_f32( (float *)(x0_ptr++) );
            float32x2_t x1 = vld1_f32( (float *)(x1_ptr++) );
            float32x2_t x2 = vld1_f32( (float *)(x2_ptr++) );

            float32x2_t x1r = vtrn1_f32( vrev64_f32(vneg_f32(x1)), x1 );
            float32x2_t x2r = vtrn1_f32( vrev64_f32(vneg_f32(x2)), x2 );

            wn = vld2_f32( (float *)(w0_ptr + 2*j) );

            yn = vfma_f32( x0, x1 , vtrn1_f32(wn.val[0], wn.val[0]) );
            yn = vfma_f32( yn, x1r, vtrn1_f32(wn.val[1], wn.val[1]) );
            yn = vfma_f32( yn, x2 , vtrn2_f32(wn.val[0], wn.val[0]) );
            yn = vfma_f32( yn, x2r, vtrn2_f32(wn.val[1], wn.val[1]) );
            vst1_f32( (float *)(y0_ptr + 2*j), yn );

            wn = vld2_f32( (float *)(w1_ptr + 2*j) );

            yn = vfma_f32( x0, x1 , vtrn1_f32(wn.val[0], wn.val[0]) );
            yn = vfma_f32( yn, x1r, vtrn1_f32(wn.val[1], wn.val[1]) );
            yn = vfma_f32( yn, x2 , vtrn2_f32(wn.val[0], wn.val[0]) );
            yn = vfma_f32( yn, x2r, vtrn2_f32(wn.val[1], wn.val[1]) );
            vst1_f32( (float *)(y1_ptr + 2*j), yn );

            wn = vld2_f32( (float *)(w2_ptr + 2*j) );

            yn = vfma_f32( x0, x1 , vtrn1_f32(wn.val[0], wn.val[0]) );
            yn = vfma_f32( yn, x1r, vtrn1_f32(wn.val[1], wn.val[1]) );
            yn = vfma_f32( yn, x2 , vtrn2_f32(wn.val[0], wn.val[0]) );
            yn = vfma_f32( yn, x2r, vtrn2_f32(wn.val[1], wn.val[1]) );
            vst1_f32( (float *)(y2_ptr + 2*j), yn );
        }

    }
}
#endif /* fft_bf3 */

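/*
 * Reference sketch (not used by the codec): the scalar arithmetic behind
 * the 3-point butterfly above. Each output point is
 *
 *   y0[j] = x0[j] + w0[j][0]*x1[j] + w0[j][1]*x2[j]
 *
 * and likewise for y1/y2 with the w1/w2 twiddle blocks. The NEON code
 * expands each complex product with `vrev64`/`vtrn1` building (-im, re)
 * pairs, so every product becomes two fused multiply-accumulates.
 * Assumes `struct lc3_complex` exposes `re` / `im` fields.
 */
static inline struct lc3_complex ref_cmac_sketch(
    struct lc3_complex acc, struct lc3_complex w, struct lc3_complex x)
{
    /* acc += w * x, complex product expanded on re/im */
    acc.re += x.re * w.re - x.im * w.im;
    acc.im += x.im * w.re + x.re * w.im;
    return acc;
}

static inline void ref_fft_bf3_sketch(
    const struct lc3_fft_bf3_twiddles *twiddles,
    const struct lc3_complex *x, struct lc3_complex *y, int n)
{
    int n3 = twiddles->n3;
    const struct lc3_complex (*w0)[2] = twiddles->t;
    const struct lc3_complex (*w1)[2] = w0 + n3;
    const struct lc3_complex (*w2)[2] = w1 + n3;

    for (int i = 0; i < n; i++, x += n3, y += 3*n3)
        for (int j = 0; j < n3; j++) {
            struct lc3_complex x0 = x[j];
            struct lc3_complex x1 = x[j + n*n3], x2 = x[j + 2*n*n3];

            y[0*n3 + j] = ref_cmac_sketch(
                ref_cmac_sketch(x0, w0[j][0], x1), w0[j][1], x2);
            y[1*n3 + j] = ref_cmac_sketch(
                ref_cmac_sketch(x0, w1[j][0], x1), w1[j][1], x2);
            y[2*n3 + j] = ref_cmac_sketch(
                ref_cmac_sketch(x0, w2[j][0], x1), w2[j][1], x2);
        }
}
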
/**
 * FFT Butterfly 2 Points
 */
#ifndef fft_bf2
#define fft_bf2 neon_fft_bf2
LC3_HOT static inline void neon_fft_bf2(
    const struct lc3_fft_bf2_twiddles *twiddles,
    const struct lc3_complex *x, struct lc3_complex *y, int n)
{
    int n2 = twiddles->n2;
    const struct lc3_complex *w_ptr = twiddles->t;

    const struct lc3_complex *x0_ptr = x;
    const struct lc3_complex *x1_ptr = x0_ptr + n*n2;

    struct lc3_complex *y0_ptr = y;
    struct lc3_complex *y1_ptr = y0_ptr + n2;

    for (int j, i = 0; i < n; i++, y0_ptr += 2*n2, y1_ptr += 2*n2) {

        /* --- Process by pair --- */

        for (j = 0; j < (n2 >> 1); j++, x0_ptr += 2, x1_ptr += 2) {

            float32x4_t x0 = vld1q_f32( (float *)x0_ptr );
            float32x4_t x1 = vld1q_f32( (float *)x1_ptr );
            float32x4_t y0, y1;

            float32x4_t x1r = vtrn1q_f32( vrev64q_f32(vnegq_f32(x1)), x1 );

            float32x4_t w = vld1q_f32( (float *)(w_ptr + 2*j) );
            float32x4_t w_re = vtrn1q_f32(w, w);
            float32x4_t w_im = vtrn2q_f32(w, w);

            y0 = vfmaq_f32( x0, x1 , w_re );
            y0 = vfmaq_f32( y0, x1r, w_im );
            vst1q_f32( (float *)(y0_ptr + 2*j), y0 );

            y1 = vfmsq_f32( x0, x1 , w_re );
            y1 = vfmsq_f32( y1, x1r, w_im );
            vst1q_f32( (float *)(y1_ptr + 2*j), y1 );
        }

        /* --- Last iteration --- */

        if (n2 & 1) {

            float32x2_t x0 = vld1_f32( (float *)(x0_ptr++) );
            float32x2_t x1 = vld1_f32( (float *)(x1_ptr++) );
            float32x2_t y0, y1;

            float32x2_t x1r = vtrn1_f32( vrev64_f32(vneg_f32(x1)), x1 );

            float32x2_t w = vld1_f32( (float *)(w_ptr + 2*j) );
            float32x2_t w_re = vtrn1_f32(w, w);
            float32x2_t w_im = vtrn2_f32(w, w);

            y0 = vfma_f32( x0, x1 , w_re );
            y0 = vfma_f32( y0, x1r, w_im );
            vst1_f32( (float *)(y0_ptr + 2*j), y0 );

            y1 = vfms_f32( x0, x1 , w_re );
            y1 = vfms_f32( y1, x1r, w_im );
            vst1_f32( (float *)(y1_ptr + 2*j), y1 );
        }
    }
}
#endif /* fft_bf2 */

#endif /* __ARM_NEON && __ARM_ARCH_ISA_A64 */
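
/*
 * Reference sketch (not used by the codec): scalar form of the 2-point
 * butterfly implemented above. It uses no NEON intrinsics, so it sits
 * after the A64 guard; it assumes `struct lc3_complex` exposes `re` / `im`
 * fields. Per point, with w = t[j]:
 *
 *   y0[j] = x0[j] + w * x1[j]
 *   y1[j] = x0[j] - w * x1[j]
 *
 * which is what the vfma/vfms pairs above compute on (re, im) lanes.
 */
static inline void ref_fft_bf2_sketch(
    const struct lc3_fft_bf2_twiddles *twiddles,
    const struct lc3_complex *x, struct lc3_complex *y, int n)
{
    int n2 = twiddles->n2;
    const struct lc3_complex *w = twiddles->t;

    for (int i = 0; i < n; i++, x += n2, y += 2*n2)
        for (int j = 0; j < n2; j++) {
            struct lc3_complex x0 = x[j], x1 = x[j + n*n2];

            /* Complex product w * x1, expanded on re/im */
            float re = x1.re * w[j].re - x1.im * w[j].im;
            float im = x1.im * w[j].re + x1.re * w[j].im;

            y[j]      = (struct lc3_complex){ .re = x0.re + re, .im = x0.im + im };
            y[j + n2] = (struct lc3_complex){ .re = x0.re - re, .im = x0.im - im };
        }
}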