/******************************************************************************
 *
 * Copyright 2022 Google LLC
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 ******************************************************************************/

#if (__ARM_NEON && __ARM_ARCH_ISA_A64 && \
        !defined(TEST_ARM)) || defined(TEST_NEON)

#ifndef TEST_NEON
#include <arm_neon.h>
#endif /* TEST_NEON */
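
/*
 * Note: when TEST_NEON is defined, <arm_neon.h> is not included and the
 * generic `fft_*` names below are left unmapped. The intent, inferred from
 * the TEST_* guards in this file, is that a test harness then provides the
 * intrinsics so the `neon_*` kernels can be exercised next to the generic
 * implementations.
 */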


/**
 * FFT 5 Points
 * The number of interleaved transforms `n` is assumed to be even
 */
#ifndef fft_5

LC3_HOT static inline void neon_fft_5(
    const struct lc3_complex *x, struct lc3_complex *y, int n)
{
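    /* Twiddles of the 5-point DFT, duplicated in both lanes:
     *   cos1 = cos(2*pi/5), cos2 = cos(4*pi/5),
     *   sin1 = sin(2*pi/5), sin2 = sin(4*pi/5).
     * The negated second lane of the sine pairs, combined with the
     * (re, im) lane swap applied to the differences below, implements
     * the multiplications by -j. */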
    static const union { float f[2]; uint64_t u64; }
        __cos1 = { {  0.3090169944,  0.3090169944 } },
        __cos2 = { { -0.8090169944, -0.8090169944 } },
        __sin1 = { {  0.9510565163, -0.9510565163 } },
        __sin2 = { {  0.5877852523, -0.5877852523 } };

    float32x2_t sin1 = vcreate_f32(__sin1.u64);
    float32x2_t sin2 = vcreate_f32(__sin2.u64);
    float32x2_t cos1 = vcreate_f32(__cos1.u64);
    float32x2_t cos2 = vcreate_f32(__cos2.u64);

    float32x4_t sin1q = vcombine_f32(sin1, sin1);
    float32x4_t sin2q = vcombine_f32(sin2, sin2);
    float32x4_t cos1q = vcombine_f32(cos1, cos1);
    float32x4_t cos2q = vcombine_f32(cos2, cos2);
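
    /* Each iteration computes two interleaved 5-point transforms, with
     * s14 = x1 + x4, s23 = x2 + x3, d14 = x1 - x4, d23 = x2 - x3:
     *
     *   y0 = x0 + s14 + s23
     *   y1 = x0 + cos1 s14 + cos2 s23 - j (sin1 d14 + sin2 d23)
     *   y2 = x0 + cos2 s14 + cos1 s23 - j (sin2 d14 - sin1 d23)
     *   y3 = x0 + cos2 s14 + cos1 s23 + j (sin2 d14 - sin1 d23)
     *   y4 = x0 + cos1 s14 + cos2 s23 + j (sin1 d14 + sin2 d23)
     */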

    for (int i = 0; i < n; i += 2, x += 2, y += 10) {

        float32x4_t y0, y1, y2, y3, y4;

        float32x4_t x0 = vld1q_f32( (float *)(x + 0*n) );
        float32x4_t x1 = vld1q_f32( (float *)(x + 1*n) );
        float32x4_t x2 = vld1q_f32( (float *)(x + 2*n) );
        float32x4_t x3 = vld1q_f32( (float *)(x + 3*n) );
        float32x4_t x4 = vld1q_f32( (float *)(x + 4*n) );

        float32x4_t s14 = vaddq_f32(x1, x4);
        float32x4_t s23 = vaddq_f32(x2, x3);

        float32x4_t d14 = vrev64q_f32( vsubq_f32(x1, x4) );
        float32x4_t d23 = vrev64q_f32( vsubq_f32(x2, x3) );

        y0 = vaddq_f32( x0, vaddq_f32(s14, s23) );

        y4 = vfmaq_f32( x0, s14, cos1q );
        y4 = vfmaq_f32( y4, s23, cos2q );

        y1 = vfmaq_f32( y4, d14, sin1q );
        y1 = vfmaq_f32( y1, d23, sin2q );

        y4 = vfmsq_f32( y4, d14, sin1q );
        y4 = vfmsq_f32( y4, d23, sin2q );

        y3 = vfmaq_f32( x0, s14, cos2q );
        y3 = vfmaq_f32( y3, s23, cos1q );

        y2 = vfmaq_f32( y3, d14, sin2q );
        y2 = vfmsq_f32( y2, d23, sin1q );

        y3 = vfmsq_f32( y3, d14, sin2q );
        y3 = vfmaq_f32( y3, d23, sin1q );

        vst1_f32( (float *)(y + 0), vget_low_f32(y0) );
        vst1_f32( (float *)(y + 1), vget_low_f32(y1) );
        vst1_f32( (float *)(y + 2), vget_low_f32(y2) );
        vst1_f32( (float *)(y + 3), vget_low_f32(y3) );
        vst1_f32( (float *)(y + 4), vget_low_f32(y4) );

        vst1_f32( (float *)(y + 5), vget_high_f32(y0) );
        vst1_f32( (float *)(y + 6), vget_high_f32(y1) );
        vst1_f32( (float *)(y + 7), vget_high_f32(y2) );
        vst1_f32( (float *)(y + 8), vget_high_f32(y3) );
        vst1_f32( (float *)(y + 9), vget_high_f32(y4) );
    }
}

#ifndef TEST_NEON
#define fft_5 neon_fft_5
#endif

#endif /* fft_5 */

/**
 * FFT Butterfly 3 Points
 */
#ifndef fft_bf3

LC3_HOT static inline void neon_fft_bf3(
    const struct lc3_fft_bf3_twiddles *twiddles,
    const struct lc3_complex *x, struct lc3_complex *y, int n)
{
    int n3 = twiddles->n3;
    const struct lc3_complex (*w0_ptr)[2] = twiddles->t;
    const struct lc3_complex (*w1_ptr)[2] = w0_ptr + n3;
    const struct lc3_complex (*w2_ptr)[2] = w1_ptr + n3;

    const struct lc3_complex *x0_ptr = x;
    const struct lc3_complex *x1_ptr = x0_ptr + n*n3;
    const struct lc3_complex *x2_ptr = x1_ptr + n*n3;

    struct lc3_complex *y0_ptr = y;
    struct lc3_complex *y1_ptr = y0_ptr + n3;
    struct lc3_complex *y2_ptr = y1_ptr + n3;
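
    /* For each output row m = 0..2, the butterfly computes
     *
     *   ym[j] = x0[j] + wm[j][0] * x1[j] + wm[j][1] * x2[j]
     *
     * Each complex product w * x is expanded into two real FMAs,
     * re(w) * x + im(w) * (j x), with j x precomputed as the
     * (-im, re) swap of x. */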

    for (int j, i = 0; i < n; i++,
            y0_ptr += 3*n3, y1_ptr += 3*n3, y2_ptr += 3*n3) {

        /* --- Process by pair --- */

        for (j = 0; j < (n3 >> 1); j++,
                x0_ptr += 2, x1_ptr += 2, x2_ptr += 2) {

            float32x4_t x0 = vld1q_f32( (float *)x0_ptr );
            float32x4_t x1 = vld1q_f32( (float *)x1_ptr );
            float32x4_t x2 = vld1q_f32( (float *)x2_ptr );

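            /* Build x1r = j * x1 and x2r = j * x2,
             * i.e. (re, im) -> (-im, re) in each complex lane */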
            float32x4_t x1r = vtrn1q_f32( vrev64q_f32(vnegq_f32(x1)), x1 );
            float32x4_t x2r = vtrn1q_f32( vrev64q_f32(vnegq_f32(x2)), x2 );

            float32x4x2_t wn;
            float32x4_t yn;

            wn = vld2q_f32( (float *)(w0_ptr + 2*j) );

            yn = vfmaq_f32( x0, x1 , vtrn1q_f32(wn.val[0], wn.val[0]) );
            yn = vfmaq_f32( yn, x1r, vtrn1q_f32(wn.val[1], wn.val[1]) );
            yn = vfmaq_f32( yn, x2 , vtrn2q_f32(wn.val[0], wn.val[0]) );
            yn = vfmaq_f32( yn, x2r, vtrn2q_f32(wn.val[1], wn.val[1]) );
            vst1q_f32( (float *)(y0_ptr + 2*j), yn );

            wn = vld2q_f32( (float *)(w1_ptr + 2*j) );

            yn = vfmaq_f32( x0, x1 , vtrn1q_f32(wn.val[0], wn.val[0]) );
            yn = vfmaq_f32( yn, x1r, vtrn1q_f32(wn.val[1], wn.val[1]) );
            yn = vfmaq_f32( yn, x2 , vtrn2q_f32(wn.val[0], wn.val[0]) );
            yn = vfmaq_f32( yn, x2r, vtrn2q_f32(wn.val[1], wn.val[1]) );
            vst1q_f32( (float *)(y1_ptr + 2*j), yn );

            wn = vld2q_f32( (float *)(w2_ptr + 2*j) );

            yn = vfmaq_f32( x0, x1 , vtrn1q_f32(wn.val[0], wn.val[0]) );
            yn = vfmaq_f32( yn, x1r, vtrn1q_f32(wn.val[1], wn.val[1]) );
            yn = vfmaq_f32( yn, x2 , vtrn2q_f32(wn.val[0], wn.val[0]) );
            yn = vfmaq_f32( yn, x2r, vtrn2q_f32(wn.val[1], wn.val[1]) );
            vst1q_f32( (float *)(y2_ptr + 2*j), yn );

        }

        /* --- Last iteration, when `n3` is odd --- */

        if (n3 & 1) {

            float32x2x2_t wn;
            float32x2_t yn;

            float32x2_t x0 = vld1_f32( (float *)(x0_ptr++) );
            float32x2_t x1 = vld1_f32( (float *)(x1_ptr++) );
            float32x2_t x2 = vld1_f32( (float *)(x2_ptr++) );

            float32x2_t x1r = vtrn1_f32( vrev64_f32(vneg_f32(x1)), x1 );
            float32x2_t x2r = vtrn1_f32( vrev64_f32(vneg_f32(x2)), x2 );

            wn = vld2_f32( (float *)(w0_ptr + 2*j) );

            yn = vfma_f32( x0, x1 , vtrn1_f32(wn.val[0], wn.val[0]) );
            yn = vfma_f32( yn, x1r, vtrn1_f32(wn.val[1], wn.val[1]) );
            yn = vfma_f32( yn, x2 , vtrn2_f32(wn.val[0], wn.val[0]) );
            yn = vfma_f32( yn, x2r, vtrn2_f32(wn.val[1], wn.val[1]) );
            vst1_f32( (float *)(y0_ptr + 2*j), yn );

            wn = vld2_f32( (float *)(w1_ptr + 2*j) );

            yn = vfma_f32( x0, x1 , vtrn1_f32(wn.val[0], wn.val[0]) );
            yn = vfma_f32( yn, x1r, vtrn1_f32(wn.val[1], wn.val[1]) );
            yn = vfma_f32( yn, x2 , vtrn2_f32(wn.val[0], wn.val[0]) );
            yn = vfma_f32( yn, x2r, vtrn2_f32(wn.val[1], wn.val[1]) );
            vst1_f32( (float *)(y1_ptr + 2*j), yn );

            wn = vld2_f32( (float *)(w2_ptr + 2*j) );

            yn = vfma_f32( x0, x1 , vtrn1_f32(wn.val[0], wn.val[0]) );
            yn = vfma_f32( yn, x1r, vtrn1_f32(wn.val[1], wn.val[1]) );
            yn = vfma_f32( yn, x2 , vtrn2_f32(wn.val[0], wn.val[0]) );
            yn = vfma_f32( yn, x2r, vtrn2_f32(wn.val[1], wn.val[1]) );
            vst1_f32( (float *)(y2_ptr + 2*j), yn );
        }

    }
}

#ifndef TEST_NEON
#define fft_bf3 neon_fft_bf3
#endif

#endif /* fft_bf3 */

/**
 * FFT Butterfly 2 Points
 */
#ifndef fft_bf2

LC3_HOT static inline void neon_fft_bf2(
    const struct lc3_fft_bf2_twiddles *twiddles,
    const struct lc3_complex *x, struct lc3_complex *y, int n)
{
    int n2 = twiddles->n2;
    const struct lc3_complex *w_ptr = twiddles->t;

    const struct lc3_complex *x0_ptr = x;
    const struct lc3_complex *x1_ptr = x0_ptr + n*n2;

    struct lc3_complex *y0_ptr = y;
    struct lc3_complex *y1_ptr = y0_ptr + n2;
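
    /* Radix-2 butterfly, for j = 0 .. n2-1:
     *
     *   y0[j] = x0[j] + w[j] * x1[j]
     *   y1[j] = x0[j] - w[j] * x1[j]
     *
     * The complex product is expanded as re(w) * x1 + im(w) * (j x1),
     * with j x1 precomputed as the (-im, re) swap of x1. */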

    for (int j, i = 0; i < n; i++, y0_ptr += 2*n2, y1_ptr += 2*n2) {

        /* --- Process by pair --- */

        for (j = 0; j < (n2 >> 1); j++, x0_ptr += 2, x1_ptr += 2) {

            float32x4_t x0 = vld1q_f32( (float *)x0_ptr );
            float32x4_t x1 = vld1q_f32( (float *)x1_ptr );
            float32x4_t y0, y1;

            float32x4_t x1r = vtrn1q_f32( vrev64q_f32(vnegq_f32(x1)), x1 );

            float32x4_t w = vld1q_f32( (float *)(w_ptr + 2*j) );
            float32x4_t w_re = vtrn1q_f32(w, w);
            float32x4_t w_im = vtrn2q_f32(w, w);

            y0 = vfmaq_f32( x0, x1 , w_re );
            y0 = vfmaq_f32( y0, x1r, w_im );
            vst1q_f32( (float *)(y0_ptr + 2*j), y0 );

            y1 = vfmsq_f32( x0, x1 , w_re );
            y1 = vfmsq_f32( y1, x1r, w_im );
            vst1q_f32( (float *)(y1_ptr + 2*j), y1 );
        }

        /* --- Last iteration, when `n2` is odd --- */

        if (n2 & 1) {

            float32x2_t x0 = vld1_f32( (float *)(x0_ptr++) );
            float32x2_t x1 = vld1_f32( (float *)(x1_ptr++) );
            float32x2_t y0, y1;

            float32x2_t x1r = vtrn1_f32( vrev64_f32(vneg_f32(x1)), x1 );

            float32x2_t w = vld1_f32( (float *)(w_ptr + 2*j) );
            float32x2_t w_re = vtrn1_f32(w, w);
            float32x2_t w_im = vtrn2_f32(w, w);

            y0 = vfma_f32( x0, x1 , w_re );
            y0 = vfma_f32( y0, x1r, w_im );
            vst1_f32( (float *)(y0_ptr + 2*j), y0 );

            y1 = vfms_f32( x0, x1 , w_re );
            y1 = vfms_f32( y1, x1r, w_im );
            vst1_f32( (float *)(y1_ptr + 2*j), y1 );
        }
    }
}

#ifndef TEST_NEON
#define fft_bf2 neon_fft_bf2
#endif

#endif /* fft_bf2 */

#endif /* __ARM_NEON && __ARM_ARCH_ISA_A64 */