/******************************************************************************
 *
 *  Copyright 2022 Google LLC
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at:
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 *
 ******************************************************************************/

/* AArch64 NEON implementations of the LTPF resampling / correlation
 * primitives. Each kernel is exposed through an `#ifndef` / `#define`
 * pair so that a platform can pre-empt it with its own version. */

#if __ARM_NEON && __ARM_ARCH_ISA_A64

#ifndef TEST_NEON
#include <arm_neon.h>
#endif /* TEST_NEON */


/**
 * Import
 */

static inline int32_t filter_hp50(struct lc3_ltpf_hp50_state *, int32_t);


/**
 * Resample from 16 Khz to 12.8 KHz
 *
 * hp50   State of the 50 Hz high-pass filter applied to each output
 * x      Input samples; the kernel reads 19 samples of history before `x`
 * y      Output buffer, receives `n` samples at 12.8 KHz
 * n      Number of output samples to produce
 *
 * Polyphase 5/4 decimation: 4 phases of a 20-tap filter, input advancing
 * by i/4 while the phase cycles through i%4. Each accumulated sample is
 * high-pass filtered, then rounded down to 16-bit range.
 */
#ifndef resample_16k_12k8
#define resample_16k_12k8 neon_resample_16k_12k8
LC3_HOT static void neon_resample_16k_12k8(
    struct lc3_ltpf_hp50_state *hp50, const int16_t *x, int16_t *y, int n)
{
    static const int16_t h[4][20] = {

    {   -61,   214,  -398,   417,     0, -1052,  2686, -4529,  5997, 26233,
       5997, -4529,  2686, -1052,     0,   417,  -398,   214,   -61,     0 },

    {   -79,   180,  -213,     0,   598, -1522,  2389, -2427,     0, 24506,
      13068, -5289,  1873,     0,  -752,   763,  -457,   156,     0,   -28 },

    {   -61,    92,     0,  -323,   861, -1361,  1317,     0, -3885, 19741,
      19741, -3885,     0,  1317, -1361,   861,  -323,     0,    92,   -61 },

    {   -28,     0,   156,  -457,   763,  -752,     0,  1873, -5289, 13068,
      24506,     0, -2427,  2389, -1522,   598,     0,  -213,   180,   -79 },

    };

    /* Rewind to the start of the 20-tap filter history */
    x -= 20 - 1;

    for (int i = 0; i < 5*n; i += 5) {
        const int16_t *hn = h[i & 3];
        const int16_t *xn = x + (i >> 2);
        int32x4_t un;

        /* 20 taps = 5 groups of 4 multiply-accumulates */
        un = vmull_s16(    vld1_s16(xn), vld1_s16(hn)), xn += 4, hn += 4;
        un = vmlal_s16(un, vld1_s16(xn), vld1_s16(hn)), xn += 4, hn += 4;
        un = vmlal_s16(un, vld1_s16(xn), vld1_s16(hn)), xn += 4, hn += 4;
        un = vmlal_s16(un, vld1_s16(xn), vld1_s16(hn)), xn += 4, hn += 4;
        un = vmlal_s16(un, vld1_s16(xn), vld1_s16(hn)), xn += 4, hn += 4;

        int32_t yn = filter_hp50(hp50, vaddvq_s32(un));
        *(y++) = (yn + (1 << 15)) >> 16;
    }
}
#endif /* resample_16k_12k8 */

/**
 * Resample from 32 Khz to 12.8 KHz
 *
 * hp50   State of the 50 Hz high-pass filter applied to each output
 * x      Input samples; the kernel reads 39 samples of history before `x`
 * y      Output buffer, receives `n` samples at 12.8 KHz
 * n      Number of output samples to produce
 *
 * Polyphase 5/2 decimation: 2 phases of a 40-tap filter, input advancing
 * by i/2 while the phase alternates with i%2.
 */
#ifndef resample_32k_12k8
#define resample_32k_12k8 neon_resample_32k_12k8
LC3_HOT static void neon_resample_32k_12k8(
    struct lc3_ltpf_hp50_state *hp50, const int16_t *x, int16_t *y, int n)
{
    /* Rewind to the start of the 40-tap filter history */
    x -= 40 - 1;

    static const int16_t h[2][40] = {

    {   -30,   -31,    46,   107,     0,  -199,  -162,   209,   430,     0,
       -681,  -526,   658,  1343,     0, -2264, -1943,  2999,  9871, 13116,
       9871,  2999, -1943, -2264,     0,  1343,   658,  -526,  -681,     0,
        430,   209,  -162,  -199,     0,   107,    46,   -31,   -30,     0 },

    {   -14,   -39,     0,    90,    78,  -106,  -229,     0,   382,   299,
       -376,  -761,     0,  1194,   937, -1214, -2644,     0,  6534, 12253,
      12253,  6534,     0, -2644, -1214,   937,  1194,     0,  -761,  -376,
        299,   382,     0,  -229,  -106,    78,    90,     0,   -39,   -14 },

    };

    for (int i = 0; i < 5*n; i += 5) {
        const int16_t *hn = h[i & 1];
        const int16_t *xn = x + (i >> 1);

        /* 40 taps = 10 groups of 4 multiply-accumulates */
        int32x4_t un = vmull_s16(vld1_s16(xn), vld1_s16(hn));
        xn += 4, hn += 4;

        for (int i = 1; i < 10; i++)
            un = vmlal_s16(un, vld1_s16(xn), vld1_s16(hn)), xn += 4, hn += 4;

        int32_t yn = filter_hp50(hp50, vaddvq_s32(un));
        *(y++) = (yn + (1 << 15)) >> 16;
    }
}
#endif /* resample_32k_12k8 */

/**
 * Resample from 48 Khz to 12.8 KHz
 *
 * hp50   State of the 50 Hz high-pass filter applied to each output
 * x      Input samples; the kernel reads 59 samples of history before `x`
 * y      Output buffer, receives `n` samples at 12.8 KHz
 * n      Number of output samples to produce
 *
 * Polyphase 15/4 decimation: 4 phases of a 60-tap filter (rows padded to
 * 64 entries for alignment), input advancing by i/4 with phase i%4.
 */
#ifndef resample_48k_12k8
#define resample_48k_12k8 neon_resample_48k_12k8
LC3_HOT static void neon_resample_48k_12k8(
    struct lc3_ltpf_hp50_state *hp50, const int16_t *x, int16_t *y, int n)
{
    static const int16_t alignas(16) h[4][64] = {

    {   -13,   -25,   -20,    10,    51,    71,    38,   -47,  -133,  -145,
        -42,   139,   277,   242,     0,  -329,  -511,  -351,   144,   698,
        895,   450,  -535, -1510, -1697,  -521,  1999,  5138,  7737,  8744,
       7737,  5138,  1999,  -521, -1697, -1510,  -535,   450,   895,   698,
        144,  -351,  -511,  -329,     0,   242,   277,   139,   -42,  -145,
       -133,   -47,    38,    71,    51,    10,   -20,   -25,   -13,     0 },

    {    -9,   -23,   -24,     0,    41,    71,    52,   -23,  -115,  -152,
        -78,    92,   254,   272,    76,  -251,  -493,  -427,     0,   576,
        900,   624,  -262, -1309, -1763,  -954,  1272,  4356,  7203,  8679,
       8169,  5886,  2767,     0, -1542, -1660,  -809,   240,   848,   796,
        292,  -252,  -507,  -398,   -82,   199,   288,   183,     0,  -130,
       -145,   -71,    20,    69,    60,    20,   -15,   -26,   -17,    -3 },

    {    -6,   -20,   -26,    -8,    31,    67,    62,     0,   -94,  -152,
       -108,    45,   223,   287,   143,  -167,  -454,  -480,  -134,   439,
        866,   758,     0, -1071, -1748, -1295,   601,  3559,  6580,  8485,
       8485,  6580,  3559,   601, -1295, -1748, -1071,     0,   758,   866,
        439,  -134,  -480,  -454,  -167,   143,   287,   223,    45,  -108,
       -152,   -94,     0,    62,    67,    31,    -8,   -26,   -20,    -6 },

    {    -3,   -17,   -26,   -15,    20,    60,    69,    20,   -71,  -145,
       -130,     0,   183,   288,   199,   -82,  -398,  -507,  -252,   292,
        796,   848,   240,  -809, -1660, -1542,     0,  2767,  5886,  8169,
       8679,  7203,  4356,  1272,  -954, -1763, -1309,  -262,   624,   900,
        576,     0,  -427,  -493,  -251,    76,   272,   254,    92,   -78,
       -152,  -115,   -23,    52,    71,    41,     0,   -24,   -23,    -9 },

    };

    /* Rewind to the start of the 60-tap filter history */
    x -= 60 - 1;

    for (int i = 0; i < 15*n; i += 15) {
        const int16_t *hn = h[i & 3];
        const int16_t *xn = x + (i >> 2);

        /* 60 taps = 15 groups of 4 multiply-accumulates */
        int32x4_t un = vmull_s16(vld1_s16(xn), vld1_s16(hn));
        xn += 4, hn += 4;

        for (int i = 1; i < 15; i++)
            un = vmlal_s16(un, vld1_s16(xn), vld1_s16(hn)), xn += 4, hn += 4;

        int32_t yn = filter_hp50(hp50, vaddvq_s32(un));
        *(y++) = (yn + (1 << 15)) >> 16;
    }
}
#endif /* resample_48k_12k8 */

/**
 * Return dot product of 2 vectors
 *
 * a, b   The 2 vectors of 16-bit samples
 * n      Number of samples, processed in chunks of 16
 *
 * The 64-bit accumulation is rounded and scaled down by 2^6 before
 * conversion to floating point.
 */
#ifndef dot
#define dot neon_dot
LC3_HOT static inline float neon_dot(const int16_t *a, const int16_t *b, int n)
{
    int64x2_t v = vmovq_n_s64(0);

    for (int i = 0; i < (n >> 4); i++) {
        int32x4_t u;

        /* Widen 16 products into 32-bit lanes, then fold pairwise
         * into the 64-bit accumulator to avoid 32-bit overflow */
        u = vmull_s16(   vld1_s16(a), vld1_s16(b)), a += 4, b += 4;
        u = vmlal_s16(u, vld1_s16(a), vld1_s16(b)), a += 4, b += 4;
        v = vpadalq_s32(v, u);

        u = vmull_s16(   vld1_s16(a), vld1_s16(b)), a += 4, b += 4;
        u = vmlal_s16(u, vld1_s16(a), vld1_s16(b)), a += 4, b += 4;
        v = vpadalq_s32(v, u);
    }

    int32_t v32 = (vaddvq_s64(v) + (1 << 5)) >> 6;
    return (float)v32;
}
#endif /* dot */

/**
 * Return vector of correlations
 *
 * a, b   The 2 vectors of 16-bit samples, `b` moving backward by one
 *        sample per correlation lag
 * n      Number of samples, processed in chunks of 16
 * y      Output, the `nc` correlation values y[k] = dot(a, b - k, n)
 * nc     Number of correlation lags to compute
 *
 * Four lags are computed per pass: `b` is loaded once and the three
 * shifted copies are reconstructed with `vext` from consecutive loads.
 * Remaining lags (nc % 4) fall back to the scalar-per-lag `neon_dot`.
 */
#ifndef correlate
#define correlate neon_correlate
LC3_HOT static void neon_correlate(
    const int16_t *a, const int16_t *b, int n, float *y, int nc)
{
    for ( ; nc >= 4; nc -= 4, b -= 4) {
        const int16_t *an = (const int16_t *)a;
        const int16_t *bn = (const int16_t *)b;

        int64x2_t v0 = vmovq_n_s64(0), v1 = v0, v2 = v0, v3 = v0;
        int16x4_t ax, b0, b1;

        b0 = vld1_s16(bn-4);

        for (int i=0; i < (n >> 4); i++ )
            for (int j = 0; j < 2; j++) {
                int32x4_t u0, u1, u2, u3;

                b1 = b0;
                b0 = vld1_s16(bn), bn += 4;
                ax = vld1_s16(an), an += 4;

                u0 = vmull_s16(ax, b0);
                u1 = vmull_s16(ax, vext_s16(b1, b0, 3));
                u2 = vmull_s16(ax, vext_s16(b1, b0, 2));
                u3 = vmull_s16(ax, vext_s16(b1, b0, 1));

                b1 = b0;
                b0 = vld1_s16(bn), bn += 4;
                ax = vld1_s16(an), an += 4;

                u0 = vmlal_s16(u0, ax, b0);
                u1 = vmlal_s16(u1, ax, vext_s16(b1, b0, 3));
                u2 = vmlal_s16(u2, ax, vext_s16(b1, b0, 2));
                u3 = vmlal_s16(u3, ax, vext_s16(b1, b0, 1));

                v0 = vpadalq_s32(v0, u0);
                v1 = vpadalq_s32(v1, u1);
                v2 = vpadalq_s32(v2, u2);
                v3 = vpadalq_s32(v3, u3);
            }

        /* Same rounding and 2^-6 scaling as `neon_dot` */
        *(y++) = (float)((int32_t)((vaddvq_s64(v0) + (1 << 5)) >> 6));
        *(y++) = (float)((int32_t)((vaddvq_s64(v1) + (1 << 5)) >> 6));
        *(y++) = (float)((int32_t)((vaddvq_s64(v2) + (1 << 5)) >> 6));
        *(y++) = (float)((int32_t)((vaddvq_s64(v3) + (1 << 5)) >> 6));
    }

    for ( ; nc > 0; nc--)
        *(y++) = neon_dot(a, b--, n);
}
#endif /* correlate */

#endif /* __ARM_NEON && __ARM_ARCH_ISA_A64 */