14930cef6SMatthias Ringwald /******************************************************************************
24930cef6SMatthias Ringwald *
34930cef6SMatthias Ringwald * Copyright 2022 Google LLC
44930cef6SMatthias Ringwald *
54930cef6SMatthias Ringwald * Licensed under the Apache License, Version 2.0 (the "License");
64930cef6SMatthias Ringwald * you may not use this file except in compliance with the License.
74930cef6SMatthias Ringwald * You may obtain a copy of the License at:
84930cef6SMatthias Ringwald *
94930cef6SMatthias Ringwald * http://www.apache.org/licenses/LICENSE-2.0
104930cef6SMatthias Ringwald *
114930cef6SMatthias Ringwald * Unless required by applicable law or agreed to in writing, software
124930cef6SMatthias Ringwald * distributed under the License is distributed on an "AS IS" BASIS,
134930cef6SMatthias Ringwald * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
144930cef6SMatthias Ringwald * See the License for the specific language governing permissions and
154930cef6SMatthias Ringwald * limitations under the License.
164930cef6SMatthias Ringwald *
174930cef6SMatthias Ringwald ******************************************************************************/
184930cef6SMatthias Ringwald
19*4c4eb519SMatthias Ringwald #if __ARM_NEON && __ARM_ARCH_ISA_A64 && \
20*4c4eb519SMatthias Ringwald !defined(TEST_ARM) || defined(TEST_NEON)
214930cef6SMatthias Ringwald
224930cef6SMatthias Ringwald #ifndef TEST_NEON
234930cef6SMatthias Ringwald #include <arm_neon.h>
244930cef6SMatthias Ringwald #endif /* TEST_NEON */
254930cef6SMatthias Ringwald
264930cef6SMatthias Ringwald
274930cef6SMatthias Ringwald /**
284930cef6SMatthias Ringwald * Import
294930cef6SMatthias Ringwald */
304930cef6SMatthias Ringwald
314930cef6SMatthias Ringwald static inline int32_t filter_hp50(struct lc3_ltpf_hp50_state *, int32_t);
324930cef6SMatthias Ringwald
334930cef6SMatthias Ringwald
/**
 * Resample from 16 kHz to 12.8 kHz
 */
374930cef6SMatthias Ringwald #ifndef resample_16k_12k8
38*4c4eb519SMatthias Ringwald
neon_resample_16k_12k8(struct lc3_ltpf_hp50_state * hp50,const int16_t * x,int16_t * y,int n)394930cef6SMatthias Ringwald LC3_HOT static void neon_resample_16k_12k8(
404930cef6SMatthias Ringwald struct lc3_ltpf_hp50_state *hp50, const int16_t *x, int16_t *y, int n)
414930cef6SMatthias Ringwald {
424930cef6SMatthias Ringwald static const int16_t h[4][20] = {
434930cef6SMatthias Ringwald
444930cef6SMatthias Ringwald { -61, 214, -398, 417, 0, -1052, 2686, -4529, 5997, 26233,
454930cef6SMatthias Ringwald 5997, -4529, 2686, -1052, 0, 417, -398, 214, -61, 0 },
464930cef6SMatthias Ringwald
474930cef6SMatthias Ringwald { -79, 180, -213, 0, 598, -1522, 2389, -2427, 0, 24506,
484930cef6SMatthias Ringwald 13068, -5289, 1873, 0, -752, 763, -457, 156, 0, -28 },
494930cef6SMatthias Ringwald
504930cef6SMatthias Ringwald { -61, 92, 0, -323, 861, -1361, 1317, 0, -3885, 19741,
514930cef6SMatthias Ringwald 19741, -3885, 0, 1317, -1361, 861, -323, 0, 92, -61 },
524930cef6SMatthias Ringwald
534930cef6SMatthias Ringwald { -28, 0, 156, -457, 763, -752, 0, 1873, -5289, 13068,
544930cef6SMatthias Ringwald 24506, 0, -2427, 2389, -1522, 598, 0, -213, 180, -79 },
554930cef6SMatthias Ringwald
564930cef6SMatthias Ringwald };
574930cef6SMatthias Ringwald
584930cef6SMatthias Ringwald x -= 20 - 1;
594930cef6SMatthias Ringwald
604930cef6SMatthias Ringwald for (int i = 0; i < 5*n; i += 5) {
614930cef6SMatthias Ringwald const int16_t *hn = h[i & 3];
624930cef6SMatthias Ringwald const int16_t *xn = x + (i >> 2);
634930cef6SMatthias Ringwald int32x4_t un;
644930cef6SMatthias Ringwald
654930cef6SMatthias Ringwald un = vmull_s16( vld1_s16(xn), vld1_s16(hn)), xn += 4, hn += 4;
664930cef6SMatthias Ringwald un = vmlal_s16(un, vld1_s16(xn), vld1_s16(hn)), xn += 4, hn += 4;
674930cef6SMatthias Ringwald un = vmlal_s16(un, vld1_s16(xn), vld1_s16(hn)), xn += 4, hn += 4;
684930cef6SMatthias Ringwald un = vmlal_s16(un, vld1_s16(xn), vld1_s16(hn)), xn += 4, hn += 4;
694930cef6SMatthias Ringwald un = vmlal_s16(un, vld1_s16(xn), vld1_s16(hn)), xn += 4, hn += 4;
704930cef6SMatthias Ringwald
714930cef6SMatthias Ringwald int32_t yn = filter_hp50(hp50, vaddvq_s32(un));
724930cef6SMatthias Ringwald *(y++) = (yn + (1 << 15)) >> 16;
734930cef6SMatthias Ringwald }
744930cef6SMatthias Ringwald }
75*4c4eb519SMatthias Ringwald
76*4c4eb519SMatthias Ringwald #ifndef TEST_NEON
77*4c4eb519SMatthias Ringwald #define resample_16k_12k8 neon_resample_16k_12k8
78*4c4eb519SMatthias Ringwald #endif
79*4c4eb519SMatthias Ringwald
804930cef6SMatthias Ringwald #endif /* resample_16k_12k8 */
814930cef6SMatthias Ringwald
/**
 * Resample from 32 kHz to 12.8 kHz
 */
854930cef6SMatthias Ringwald #ifndef resample_32k_12k8
86*4c4eb519SMatthias Ringwald
neon_resample_32k_12k8(struct lc3_ltpf_hp50_state * hp50,const int16_t * x,int16_t * y,int n)874930cef6SMatthias Ringwald LC3_HOT static void neon_resample_32k_12k8(
884930cef6SMatthias Ringwald struct lc3_ltpf_hp50_state *hp50, const int16_t *x, int16_t *y, int n)
894930cef6SMatthias Ringwald {
904930cef6SMatthias Ringwald x -= 40 - 1;
914930cef6SMatthias Ringwald
924930cef6SMatthias Ringwald static const int16_t h[2][40] = {
934930cef6SMatthias Ringwald
944930cef6SMatthias Ringwald { -30, -31, 46, 107, 0, -199, -162, 209, 430, 0,
954930cef6SMatthias Ringwald -681, -526, 658, 1343, 0, -2264, -1943, 2999, 9871, 13116,
964930cef6SMatthias Ringwald 9871, 2999, -1943, -2264, 0, 1343, 658, -526, -681, 0,
974930cef6SMatthias Ringwald 430, 209, -162, -199, 0, 107, 46, -31, -30, 0 },
984930cef6SMatthias Ringwald
994930cef6SMatthias Ringwald { -14, -39, 0, 90, 78, -106, -229, 0, 382, 299,
1004930cef6SMatthias Ringwald -376, -761, 0, 1194, 937, -1214, -2644, 0, 6534, 12253,
1014930cef6SMatthias Ringwald 12253, 6534, 0, -2644, -1214, 937, 1194, 0, -761, -376,
1024930cef6SMatthias Ringwald 299, 382, 0, -229, -106, 78, 90, 0, -39, -14 },
1034930cef6SMatthias Ringwald
1044930cef6SMatthias Ringwald };
1054930cef6SMatthias Ringwald
1064930cef6SMatthias Ringwald for (int i = 0; i < 5*n; i += 5) {
1074930cef6SMatthias Ringwald const int16_t *hn = h[i & 1];
1084930cef6SMatthias Ringwald const int16_t *xn = x + (i >> 1);
1094930cef6SMatthias Ringwald
1104930cef6SMatthias Ringwald int32x4_t un = vmull_s16(vld1_s16(xn), vld1_s16(hn));
1114930cef6SMatthias Ringwald xn += 4, hn += 4;
1124930cef6SMatthias Ringwald
1134930cef6SMatthias Ringwald for (int i = 1; i < 10; i++)
1144930cef6SMatthias Ringwald un = vmlal_s16(un, vld1_s16(xn), vld1_s16(hn)), xn += 4, hn += 4;
1154930cef6SMatthias Ringwald
1164930cef6SMatthias Ringwald int32_t yn = filter_hp50(hp50, vaddvq_s32(un));
1174930cef6SMatthias Ringwald *(y++) = (yn + (1 << 15)) >> 16;
1184930cef6SMatthias Ringwald }
1194930cef6SMatthias Ringwald }
120*4c4eb519SMatthias Ringwald
121*4c4eb519SMatthias Ringwald #ifndef TEST_NEON
122*4c4eb519SMatthias Ringwald #define resample_32k_12k8 neon_resample_32k_12k8
123*4c4eb519SMatthias Ringwald #endif
124*4c4eb519SMatthias Ringwald
1254930cef6SMatthias Ringwald #endif /* resample_32k_12k8 */
1264930cef6SMatthias Ringwald
/**
 * Resample from 48 kHz to 12.8 kHz
 */
1304930cef6SMatthias Ringwald #ifndef resample_48k_12k8
131*4c4eb519SMatthias Ringwald
neon_resample_48k_12k8(struct lc3_ltpf_hp50_state * hp50,const int16_t * x,int16_t * y,int n)1324930cef6SMatthias Ringwald LC3_HOT static void neon_resample_48k_12k8(
1334930cef6SMatthias Ringwald struct lc3_ltpf_hp50_state *hp50, const int16_t *x, int16_t *y, int n)
1344930cef6SMatthias Ringwald {
1354930cef6SMatthias Ringwald static const int16_t alignas(16) h[4][64] = {
1364930cef6SMatthias Ringwald
1374930cef6SMatthias Ringwald { -13, -25, -20, 10, 51, 71, 38, -47, -133, -145,
1384930cef6SMatthias Ringwald -42, 139, 277, 242, 0, -329, -511, -351, 144, 698,
1394930cef6SMatthias Ringwald 895, 450, -535, -1510, -1697, -521, 1999, 5138, 7737, 8744,
1404930cef6SMatthias Ringwald 7737, 5138, 1999, -521, -1697, -1510, -535, 450, 895, 698,
1414930cef6SMatthias Ringwald 144, -351, -511, -329, 0, 242, 277, 139, -42, -145,
1424930cef6SMatthias Ringwald -133, -47, 38, 71, 51, 10, -20, -25, -13, 0 },
1434930cef6SMatthias Ringwald
1444930cef6SMatthias Ringwald { -9, -23, -24, 0, 41, 71, 52, -23, -115, -152,
1454930cef6SMatthias Ringwald -78, 92, 254, 272, 76, -251, -493, -427, 0, 576,
1464930cef6SMatthias Ringwald 900, 624, -262, -1309, -1763, -954, 1272, 4356, 7203, 8679,
1474930cef6SMatthias Ringwald 8169, 5886, 2767, 0, -1542, -1660, -809, 240, 848, 796,
1484930cef6SMatthias Ringwald 292, -252, -507, -398, -82, 199, 288, 183, 0, -130,
1494930cef6SMatthias Ringwald -145, -71, 20, 69, 60, 20, -15, -26, -17, -3 },
1504930cef6SMatthias Ringwald
1514930cef6SMatthias Ringwald { -6, -20, -26, -8, 31, 67, 62, 0, -94, -152,
1524930cef6SMatthias Ringwald -108, 45, 223, 287, 143, -167, -454, -480, -134, 439,
1534930cef6SMatthias Ringwald 866, 758, 0, -1071, -1748, -1295, 601, 3559, 6580, 8485,
1544930cef6SMatthias Ringwald 8485, 6580, 3559, 601, -1295, -1748, -1071, 0, 758, 866,
1554930cef6SMatthias Ringwald 439, -134, -480, -454, -167, 143, 287, 223, 45, -108,
1564930cef6SMatthias Ringwald -152, -94, 0, 62, 67, 31, -8, -26, -20, -6 },
1574930cef6SMatthias Ringwald
1584930cef6SMatthias Ringwald { -3, -17, -26, -15, 20, 60, 69, 20, -71, -145,
1594930cef6SMatthias Ringwald -130, 0, 183, 288, 199, -82, -398, -507, -252, 292,
1604930cef6SMatthias Ringwald 796, 848, 240, -809, -1660, -1542, 0, 2767, 5886, 8169,
1614930cef6SMatthias Ringwald 8679, 7203, 4356, 1272, -954, -1763, -1309, -262, 624, 900,
1624930cef6SMatthias Ringwald 576, 0, -427, -493, -251, 76, 272, 254, 92, -78,
1634930cef6SMatthias Ringwald -152, -115, -23, 52, 71, 41, 0, -24, -23, -9 },
1644930cef6SMatthias Ringwald
1654930cef6SMatthias Ringwald };
1664930cef6SMatthias Ringwald
1674930cef6SMatthias Ringwald x -= 60 - 1;
1684930cef6SMatthias Ringwald
1694930cef6SMatthias Ringwald for (int i = 0; i < 15*n; i += 15) {
1704930cef6SMatthias Ringwald const int16_t *hn = h[i & 3];
1714930cef6SMatthias Ringwald const int16_t *xn = x + (i >> 2);
1724930cef6SMatthias Ringwald
1734930cef6SMatthias Ringwald int32x4_t un = vmull_s16(vld1_s16(xn), vld1_s16(hn));
1744930cef6SMatthias Ringwald xn += 4, hn += 4;
1754930cef6SMatthias Ringwald
1764930cef6SMatthias Ringwald for (int i = 1; i < 15; i++)
1774930cef6SMatthias Ringwald un = vmlal_s16(un, vld1_s16(xn), vld1_s16(hn)), xn += 4, hn += 4;
1784930cef6SMatthias Ringwald
1794930cef6SMatthias Ringwald int32_t yn = filter_hp50(hp50, vaddvq_s32(un));
1804930cef6SMatthias Ringwald *(y++) = (yn + (1 << 15)) >> 16;
1814930cef6SMatthias Ringwald }
1824930cef6SMatthias Ringwald }
183*4c4eb519SMatthias Ringwald
184*4c4eb519SMatthias Ringwald #ifndef TEST_NEON
185*4c4eb519SMatthias Ringwald #define resample_48k_12k8 neon_resample_48k_12k8
186*4c4eb519SMatthias Ringwald #endif
187*4c4eb519SMatthias Ringwald
1884930cef6SMatthias Ringwald #endif /* resample_48k_12k8 */
1894930cef6SMatthias Ringwald
1904930cef6SMatthias Ringwald /**
1914930cef6SMatthias Ringwald * Return dot product of 2 vectors
1924930cef6SMatthias Ringwald */
1934930cef6SMatthias Ringwald #ifndef dot
194*4c4eb519SMatthias Ringwald
neon_dot(const int16_t * a,const int16_t * b,int n)1954930cef6SMatthias Ringwald LC3_HOT static inline float neon_dot(const int16_t *a, const int16_t *b, int n)
1964930cef6SMatthias Ringwald {
1974930cef6SMatthias Ringwald int64x2_t v = vmovq_n_s64(0);
1984930cef6SMatthias Ringwald
1994930cef6SMatthias Ringwald for (int i = 0; i < (n >> 4); i++) {
2004930cef6SMatthias Ringwald int32x4_t u;
2014930cef6SMatthias Ringwald
2024930cef6SMatthias Ringwald u = vmull_s16( vld1_s16(a), vld1_s16(b)), a += 4, b += 4;
2034930cef6SMatthias Ringwald u = vmlal_s16(u, vld1_s16(a), vld1_s16(b)), a += 4, b += 4;
2044930cef6SMatthias Ringwald v = vpadalq_s32(v, u);
2054930cef6SMatthias Ringwald
2064930cef6SMatthias Ringwald u = vmull_s16( vld1_s16(a), vld1_s16(b)), a += 4, b += 4;
2074930cef6SMatthias Ringwald u = vmlal_s16(u, vld1_s16(a), vld1_s16(b)), a += 4, b += 4;
2084930cef6SMatthias Ringwald v = vpadalq_s32(v, u);
2094930cef6SMatthias Ringwald }
2104930cef6SMatthias Ringwald
2114930cef6SMatthias Ringwald int32_t v32 = (vaddvq_s64(v) + (1 << 5)) >> 6;
2124930cef6SMatthias Ringwald return (float)v32;
2134930cef6SMatthias Ringwald }
214*4c4eb519SMatthias Ringwald
215*4c4eb519SMatthias Ringwald #ifndef TEST_NEON
216*4c4eb519SMatthias Ringwald #define dot neon_dot
217*4c4eb519SMatthias Ringwald #endif
218*4c4eb519SMatthias Ringwald
2194930cef6SMatthias Ringwald #endif /* dot */
2204930cef6SMatthias Ringwald
2214930cef6SMatthias Ringwald /**
2224930cef6SMatthias Ringwald * Return vector of correlations
2234930cef6SMatthias Ringwald */
2244930cef6SMatthias Ringwald #ifndef correlate
225*4c4eb519SMatthias Ringwald
neon_correlate(const int16_t * a,const int16_t * b,int n,float * y,int nc)2264930cef6SMatthias Ringwald LC3_HOT static void neon_correlate(
2274930cef6SMatthias Ringwald const int16_t *a, const int16_t *b, int n, float *y, int nc)
2284930cef6SMatthias Ringwald {
2294930cef6SMatthias Ringwald for ( ; nc >= 4; nc -= 4, b -= 4) {
2304930cef6SMatthias Ringwald const int16_t *an = (const int16_t *)a;
2314930cef6SMatthias Ringwald const int16_t *bn = (const int16_t *)b;
2324930cef6SMatthias Ringwald
2334930cef6SMatthias Ringwald int64x2_t v0 = vmovq_n_s64(0), v1 = v0, v2 = v0, v3 = v0;
2344930cef6SMatthias Ringwald int16x4_t ax, b0, b1;
2354930cef6SMatthias Ringwald
2364930cef6SMatthias Ringwald b0 = vld1_s16(bn-4);
2374930cef6SMatthias Ringwald
2384930cef6SMatthias Ringwald for (int i=0; i < (n >> 4); i++ )
2394930cef6SMatthias Ringwald for (int j = 0; j < 2; j++) {
2404930cef6SMatthias Ringwald int32x4_t u0, u1, u2, u3;
2414930cef6SMatthias Ringwald
2424930cef6SMatthias Ringwald b1 = b0;
2434930cef6SMatthias Ringwald b0 = vld1_s16(bn), bn += 4;
2444930cef6SMatthias Ringwald ax = vld1_s16(an), an += 4;
2454930cef6SMatthias Ringwald
2464930cef6SMatthias Ringwald u0 = vmull_s16(ax, b0);
2474930cef6SMatthias Ringwald u1 = vmull_s16(ax, vext_s16(b1, b0, 3));
2484930cef6SMatthias Ringwald u2 = vmull_s16(ax, vext_s16(b1, b0, 2));
2494930cef6SMatthias Ringwald u3 = vmull_s16(ax, vext_s16(b1, b0, 1));
2504930cef6SMatthias Ringwald
2514930cef6SMatthias Ringwald b1 = b0;
2524930cef6SMatthias Ringwald b0 = vld1_s16(bn), bn += 4;
2534930cef6SMatthias Ringwald ax = vld1_s16(an), an += 4;
2544930cef6SMatthias Ringwald
2554930cef6SMatthias Ringwald u0 = vmlal_s16(u0, ax, b0);
2564930cef6SMatthias Ringwald u1 = vmlal_s16(u1, ax, vext_s16(b1, b0, 3));
2574930cef6SMatthias Ringwald u2 = vmlal_s16(u2, ax, vext_s16(b1, b0, 2));
2584930cef6SMatthias Ringwald u3 = vmlal_s16(u3, ax, vext_s16(b1, b0, 1));
2594930cef6SMatthias Ringwald
2604930cef6SMatthias Ringwald v0 = vpadalq_s32(v0, u0);
2614930cef6SMatthias Ringwald v1 = vpadalq_s32(v1, u1);
2624930cef6SMatthias Ringwald v2 = vpadalq_s32(v2, u2);
2634930cef6SMatthias Ringwald v3 = vpadalq_s32(v3, u3);
2644930cef6SMatthias Ringwald }
2654930cef6SMatthias Ringwald
2664930cef6SMatthias Ringwald *(y++) = (float)((int32_t)((vaddvq_s64(v0) + (1 << 5)) >> 6));
2674930cef6SMatthias Ringwald *(y++) = (float)((int32_t)((vaddvq_s64(v1) + (1 << 5)) >> 6));
2684930cef6SMatthias Ringwald *(y++) = (float)((int32_t)((vaddvq_s64(v2) + (1 << 5)) >> 6));
2694930cef6SMatthias Ringwald *(y++) = (float)((int32_t)((vaddvq_s64(v3) + (1 << 5)) >> 6));
2704930cef6SMatthias Ringwald }
2714930cef6SMatthias Ringwald
2724930cef6SMatthias Ringwald for ( ; nc > 0; nc--)
2734930cef6SMatthias Ringwald *(y++) = neon_dot(a, b--, n);
2744930cef6SMatthias Ringwald }
2754930cef6SMatthias Ringwald #endif /* correlate */
2764930cef6SMatthias Ringwald
277*4c4eb519SMatthias Ringwald #ifndef TEST_NEON
278*4c4eb519SMatthias Ringwald #define correlate neon_correlate
279*4c4eb519SMatthias Ringwald #endif
280*4c4eb519SMatthias Ringwald
2814930cef6SMatthias Ringwald #endif /* __ARM_NEON && __ARM_ARCH_ISA_A64 */
282