/******************************************************************************
 *
 *  Copyright 2022 Google LLC
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at:
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 *
 ******************************************************************************/

#if __ARM_NEON && __ARM_ARCH_ISA_A64 && \
    !defined(TEST_ARM) || defined(TEST_NEON)

#ifndef TEST_NEON
#include <arm_neon.h>
#endif /* TEST_NEON */


/**
 * FFT 5 Points
 * The number of interleaved transforms `n` is assumed to be even
 */
#ifndef fft_5

LC3_HOT static inline void neon_fft_5(
    const struct lc3_complex *x, struct lc3_complex *y, int n)
{
    static const union { float f[2]; uint64_t u64; }
        __cos1 = { {  0.3090169944,  0.3090169944 } },
        __cos2 = { { -0.8090169944, -0.8090169944 } },
        __sin1 = { {  0.9510565163, -0.9510565163 } },
        __sin2 = { {  0.5877852523, -0.5877852523 } };

    float32x2_t sin1 = vcreate_f32(__sin1.u64);
    float32x2_t sin2 = vcreate_f32(__sin2.u64);
    float32x2_t cos1 = vcreate_f32(__cos1.u64);
    float32x2_t cos2 = vcreate_f32(__cos2.u64);

    float32x4_t sin1q = vcombine_f32(sin1, sin1);
    float32x4_t sin2q = vcombine_f32(sin2, sin2);
    float32x4_t cos1q = vcombine_f32(cos1, cos1);
    float32x4_t cos2q = vcombine_f32(cos2, cos2);

    for (int i = 0; i < n; i += 2, x += 2, y += 10) {

        float32x4_t y0, y1, y2, y3, y4;

        float32x4_t x0 = vld1q_f32( (float *)(x + 0*n) );
        float32x4_t x1 = vld1q_f32( (float *)(x + 1*n) );
        float32x4_t x2 = vld1q_f32( (float *)(x + 2*n) );
        float32x4_t x3 = vld1q_f32( (float *)(x + 3*n) );
        float32x4_t x4 = vld1q_f32( (float *)(x + 4*n) );

        float32x4_t s14 = vaddq_f32(x1, x4);
        float32x4_t s23 = vaddq_f32(x2, x3);

        float32x4_t d14 = vrev64q_f32( vsubq_f32(x1, x4) );
        float32x4_t d23 = vrev64q_f32( vsubq_f32(x2, x3) );

        y0 = vaddq_f32( x0, vaddq_f32(s14, s23) );

        y4 = vfmaq_f32( x0, s14, cos1q );
        y4 = vfmaq_f32( y4, s23, cos2q );

        y1 = vfmaq_f32( y4, d14, sin1q );
        y1 = vfmaq_f32( y1, d23, sin2q );

        y4 = vfmsq_f32( y4, d14, sin1q );
        y4 = vfmsq_f32( y4, d23, sin2q );

        y3 = vfmaq_f32( x0, s14, cos2q );
        y3 = vfmaq_f32( y3, s23, cos1q );

        y2 = vfmaq_f32( y3, d14, sin2q );
        y2 = vfmsq_f32( y2, d23, sin1q );

        y3 = vfmsq_f32( y3, d14, sin2q );
        y3 = vfmaq_f32( y3, d23, sin1q );

        vst1_f32( (float *)(y + 0), vget_low_f32(y0) );
        vst1_f32( (float *)(y + 1), vget_low_f32(y1) );
        vst1_f32( (float *)(y + 2), vget_low_f32(y2) );
        vst1_f32( (float *)(y + 3), vget_low_f32(y3) );
        vst1_f32( (float *)(y + 4), vget_low_f32(y4) );

        vst1_f32( (float *)(y + 5), vget_high_f32(y0) );
        vst1_f32( (float *)(y + 6), vget_high_f32(y1) );
        vst1_f32( (float *)(y + 7), vget_high_f32(y2) );
        vst1_f32( (float *)(y + 8), vget_high_f32(y3) );
        vst1_f32( (float *)(y + 9), vget_high_f32(y4) );
    }
}

#ifndef TEST_NEON
#define fft_5 neon_fft_5
#endif

#endif /* fft_5 */

/**
 * FFT Butterfly 3 Points
 */
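/*
 * (Informal scalar sketch, inferred from the vector code below rather than
 *  quoted from a reference: assuming each entry of `twiddles->t` holds the
 *  pair of coefficients applied to `x1` and `x2`, one radix-3 butterfly
 *  output is roughly
 *
 *      y_m[j] = x0[j] + w_m[j][0] * x1[j] + w_m[j][1] * x2[j],   m = 0, 1, 2
 *
 *  The complex products are expanded into fused multiply-adds on (re, im)
 *  lane pairs; `x1r` and `x2r` below hold (-im, re) of `x1` and `x2`, so the
 *  imaginary part of each twiddle is applied with a single FMA per lane pair.)
 */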
#ifndef fft_bf3

LC3_HOT static inline void neon_fft_bf3(
    const struct lc3_fft_bf3_twiddles *twiddles,
    const struct lc3_complex *x, struct lc3_complex *y, int n)
{
    int n3 = twiddles->n3;
    const struct lc3_complex (*w0_ptr)[2] = twiddles->t;
    const struct lc3_complex (*w1_ptr)[2] = w0_ptr + n3;
    const struct lc3_complex (*w2_ptr)[2] = w1_ptr + n3;

    const struct lc3_complex *x0_ptr = x;
    const struct lc3_complex *x1_ptr = x0_ptr + n*n3;
    const struct lc3_complex *x2_ptr = x1_ptr + n*n3;

    struct lc3_complex *y0_ptr = y;
    struct lc3_complex *y1_ptr = y0_ptr + n3;
    struct lc3_complex *y2_ptr = y1_ptr + n3;

    for (int j, i = 0; i < n; i++,
            y0_ptr += 3*n3, y1_ptr += 3*n3, y2_ptr += 3*n3) {

        /* --- Process by pair --- */

        for (j = 0; j < (n3 >> 1); j++,
                x0_ptr += 2, x1_ptr += 2, x2_ptr += 2) {

            float32x4_t x0 = vld1q_f32( (float *)x0_ptr );
            float32x4_t x1 = vld1q_f32( (float *)x1_ptr );
            float32x4_t x2 = vld1q_f32( (float *)x2_ptr );

            float32x4_t x1r = vtrn1q_f32( vrev64q_f32(vnegq_f32(x1)), x1 );
            float32x4_t x2r = vtrn1q_f32( vrev64q_f32(vnegq_f32(x2)), x2 );

            float32x4x2_t wn;
            float32x4_t yn;

            wn = vld2q_f32( (float *)(w0_ptr + 2*j) );

            yn = vfmaq_f32( x0, x1 , vtrn1q_f32(wn.val[0], wn.val[0]) );
            yn = vfmaq_f32( yn, x1r, vtrn1q_f32(wn.val[1], wn.val[1]) );
            yn = vfmaq_f32( yn, x2 , vtrn2q_f32(wn.val[0], wn.val[0]) );
            yn = vfmaq_f32( yn, x2r, vtrn2q_f32(wn.val[1], wn.val[1]) );
            vst1q_f32( (float *)(y0_ptr + 2*j), yn );

            wn = vld2q_f32( (float *)(w1_ptr + 2*j) );

            yn = vfmaq_f32( x0, x1 , vtrn1q_f32(wn.val[0], wn.val[0]) );
            yn = vfmaq_f32( yn, x1r, vtrn1q_f32(wn.val[1], wn.val[1]) );
            yn = vfmaq_f32( yn, x2 , vtrn2q_f32(wn.val[0], wn.val[0]) );
            yn = vfmaq_f32( yn, x2r, vtrn2q_f32(wn.val[1], wn.val[1]) );
            vst1q_f32( (float *)(y1_ptr + 2*j), yn );

            wn = vld2q_f32( (float *)(w2_ptr + 2*j) );

            yn = vfmaq_f32( x0, x1 , vtrn1q_f32(wn.val[0], wn.val[0]) );
            yn = vfmaq_f32( yn, x1r, vtrn1q_f32(wn.val[1], wn.val[1]) );
            yn = vfmaq_f32( yn, x2 , vtrn2q_f32(wn.val[0], wn.val[0]) );
            yn = vfmaq_f32( yn, x2r, vtrn2q_f32(wn.val[1], wn.val[1]) );
            vst1q_f32( (float *)(y2_ptr + 2*j), yn );

        }

        /* --- Last iteration --- */

        if (n3 & 1) {

            float32x2x2_t wn;
            float32x2_t yn;

            float32x2_t x0 = vld1_f32( (float *)(x0_ptr++) );
            float32x2_t x1 = vld1_f32( (float *)(x1_ptr++) );
            float32x2_t x2 = vld1_f32( (float *)(x2_ptr++) );

            float32x2_t x1r = vtrn1_f32( vrev64_f32(vneg_f32(x1)), x1 );
            float32x2_t x2r = vtrn1_f32( vrev64_f32(vneg_f32(x2)), x2 );

            wn = vld2_f32( (float *)(w0_ptr + 2*j) );

            yn = vfma_f32( x0, x1 , vtrn1_f32(wn.val[0], wn.val[0]) );
            yn = vfma_f32( yn, x1r, vtrn1_f32(wn.val[1], wn.val[1]) );
            yn = vfma_f32( yn, x2 , vtrn2_f32(wn.val[0], wn.val[0]) );
            yn = vfma_f32( yn, x2r, vtrn2_f32(wn.val[1], wn.val[1]) );
            vst1_f32( (float *)(y0_ptr + 2*j), yn );

            wn = vld2_f32( (float *)(w1_ptr + 2*j) );

            yn = vfma_f32( x0, x1 , vtrn1_f32(wn.val[0], wn.val[0]) );
            yn = vfma_f32( yn, x1r, vtrn1_f32(wn.val[1], wn.val[1]) );
            yn = vfma_f32( yn, x2 , vtrn2_f32(wn.val[0], wn.val[0]) );
            yn = vfma_f32( yn, x2r, vtrn2_f32(wn.val[1], wn.val[1]) );
            vst1_f32( (float *)(y1_ptr + 2*j), yn );

            wn = vld2_f32( (float *)(w2_ptr + 2*j) );

            yn = vfma_f32( x0, x1 , vtrn1_f32(wn.val[0], wn.val[0]) );
            yn = vfma_f32( yn, x1r, vtrn1_f32(wn.val[1], wn.val[1]) );
            yn = vfma_f32( yn, x2 , vtrn2_f32(wn.val[0], wn.val[0]) );
            yn = vfma_f32( yn, x2r, vtrn2_f32(wn.val[1], wn.val[1]) );
            vst1_f32( (float *)(y2_ptr + 2*j), yn );
        }

    }
}

#ifndef TEST_NEON
#define fft_bf3 neon_fft_bf3
#endif

#endif /* fft_bf3 */

/**
 * FFT Butterfly 2 Points
 */
#ifndef fft_bf2

LC3_HOT static inline void neon_fft_bf2(
    const struct lc3_fft_bf2_twiddles *twiddles,
    const struct lc3_complex *x, struct lc3_complex *y, int n)
{
    int n2 = twiddles->n2;
    const struct lc3_complex *w_ptr = twiddles->t;

    const struct lc3_complex *x0_ptr = x;
    const struct lc3_complex *x1_ptr = x0_ptr + n*n2;

    struct lc3_complex *y0_ptr = y;
    struct lc3_complex *y1_ptr = y0_ptr + n2;

    for (int j, i = 0; i < n; i++, y0_ptr += 2*n2, y1_ptr += 2*n2) {

        /* --- Process by pair --- */

        for (j = 0; j < (n2 >> 1); j++, x0_ptr += 2, x1_ptr += 2) {

            float32x4_t x0 = vld1q_f32( (float *)x0_ptr );
            float32x4_t x1 = vld1q_f32( (float *)x1_ptr );
            float32x4_t y0, y1;

            float32x4_t x1r = vtrn1q_f32( vrev64q_f32(vnegq_f32(x1)), x1 );

            float32x4_t w = vld1q_f32( (float *)(w_ptr + 2*j) );
            float32x4_t w_re = vtrn1q_f32(w, w);
            float32x4_t w_im = vtrn2q_f32(w, w);

            y0 = vfmaq_f32( x0, x1 , w_re );
            y0 = vfmaq_f32( y0, x1r, w_im );
            vst1q_f32( (float *)(y0_ptr + 2*j), y0 );

            y1 = vfmsq_f32( x0, x1 , w_re );
            y1 = vfmsq_f32( y1, x1r, w_im );
            vst1q_f32( (float *)(y1_ptr + 2*j), y1 );
        }

        /* --- Last iteration --- */

        if (n2 & 1) {

            float32x2_t x0 = vld1_f32( (float *)(x0_ptr++) );
            float32x2_t x1 = vld1_f32( (float *)(x1_ptr++) );
            float32x2_t y0, y1;

            float32x2_t x1r = vtrn1_f32( vrev64_f32(vneg_f32(x1)), x1 );

            float32x2_t w = vld1_f32( (float *)(w_ptr + 2*j) );
            float32x2_t w_re = vtrn1_f32(w, w);
            float32x2_t w_im = vtrn2_f32(w, w);

            y0 = vfma_f32( x0, x1 , w_re );
            y0 = vfma_f32( y0, x1r, w_im );
            vst1_f32( (float *)(y0_ptr + 2*j), y0 );

            y1 = vfms_f32( x0, x1 , w_re );
            y1 = vfms_f32( y1, x1r, w_im );
            vst1_f32( (float *)(y1_ptr + 2*j), y1 );
        }
    }
}

#ifndef TEST_NEON
#define fft_bf2 neon_fft_bf2
#endif

#endif /* fft_bf2 */

#endif /* __ARM_NEON && __ARM_ARCH_ISA_A64 */
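/*
 * (Informal note, inferred from the vector code of `neon_fft_bf2` above and
 *  not taken from a reference: per twiddle w[j] and per interleaved
 *  transform, the radix-2 butterfly computes roughly the scalar equivalent
 *
 *      y0[j] = x0[j] + w[j] * x1[j]
 *      y1[j] = x0[j] - w[j] * x1[j]
 *
 *  where the complex product is realized as x1 * Re(w) + x1r * Im(w), with
 *  `x1r` holding (-im, re) of x1.)
 */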