xref: /aosp_15_r20/external/libxaac/encoder/ixheaace_sbr_hbe_fft_ifft_32x32.c (revision 15dc779a375ca8b5125643b829a8aa4b70d7f451)
1 /******************************************************************************
2  *                                                                            *
3  * Copyright (C) 2023 The Android Open Source Project
4  *
5  * Licensed under the Apache License, Version 2.0 (the "License");
6  * you may not use this file except in compliance with the License.
7  * You may obtain a copy of the License at:
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  *
17  *****************************************************************************
18  * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19  */
20 
21 #include <stdlib.h>
22 #include <ixheaac_type_def.h>
23 #include "ixheaac_constants.h"
24 #include "ixheaace_constants.h"
25 #include "iusace_basic_ops_flt.h"
26 #include "ixheaace_common_utils.h"
27 #include "ixheaac_fft_ifft_rom.h"
28 #include "ixheaac_basic_ops32.h"
29 #include "ixheaac_basic_ops40.h"
30 #include "ixheaac_basic_ops.h"
31 
/* Radix-4 digit reversal used for FFT input reordering: swaps adjacent
   2-bit pairs, then nibbles, then bytes -- i.e. it reverses the base-4
   digits within each 16-bit half of (i) -- and finally shifts right by
   (m) so only the significant digits remain.  Each argument is expanded
   exactly once into a local, so side-effecting arguments are safe. */
#define DIG_REV(i, m, j)                                              \
  do {                                                                \
    unsigned int dr = (i);                                            \
    dr = ((dr & 0x33333333) << 2) | ((dr & 0xCCCCCCCCu) >> 2);        \
    dr = ((dr & 0x0F0F0F0F) << 4) | ((dr & 0xF0F0F0F0u) >> 4);        \
    dr = ((dr & 0x00FF00FF) << 8) | ((dr & 0xFF00FF00u) >> 8);        \
    (j) = dr >> (m);                                                  \
  } while (0)
40 
/* Complex multiply for the forward FFT path:
   (re + j*im) = (a + j*b) * (c + j*d).
   Every parameter is parenthesized in the expansion so compound arguments
   (e.g. `x + y`) combine correctly with the multiplications.  Note that
   a, b, c, d are each evaluated twice -- avoid side-effecting arguments. */
#define CPLX_MPY_FFT(re, im, a, b, c, d) \
  do {                                   \
    (re) = ((a) * (c)) - ((b) * (d));    \
    (im) = ((a) * (d)) + ((b) * (c));    \
  } while (0)
46 
/* Complex multiply for the inverse FFT path (conjugated twiddle):
   (re + j*im) = (a + j*b) * (c - j*d).
   Every parameter is parenthesized in the expansion so compound arguments
   (e.g. `x + y`) combine correctly with the multiplications.  Note that
   a, b, c, d are each evaluated twice -- avoid side-effecting arguments. */
#define CPLX_MPY_IFFT(re, im, a, b, c, d) \
  do {                                    \
    (re) = ((a) * (c)) + ((b) * (d));     \
    (im) = -((a) * (d)) + ((b) * (c));    \
  } while (0)
52 
/*
 * 7-point IFFT realized with the Winograd small-FFT factorization.
 *
 * ptr_inp : 7 complex input samples, interleaved re/im (14 FLOAT32 values).
 * ptr_op  : 7 complex output samples, interleaved re/im (14 FLOAT32 values).
 *
 * The transform is evaluated as five "nodes": two stages of pre-additions,
 * one stage of pointwise multiplications by the coefficients C70..C77, and
 * two stages of post-additions.  The matrix implemented by each node is
 * shown in the comment preceding it.
 * NOTE(review): C70..C77 are assumed to be the Winograd 7-point coefficient
 * constants supplied by the included constants headers -- confirm there.
 */
VOID ixheaace_hbe_apply_ifft_7(FLOAT32 *ptr_inp, FLOAT32 *ptr_op) {
  FLOAT32 x0r, x1r, x2r, x3r, x4r, x5r, x6r, x7r, x8r;
  FLOAT32 x0i, x1i, x2i, x3i, x4i, x5i, x6i, x7i, x8i;
  FLOAT32 y0r, y1r, y2r, y3r, y4r, y5r, y6r, y7r, y8r;
  FLOAT32 y0i, y1i, y2i, y3i, y4i, y5i, y6i, y7i, y8i;

  /*
   * Node 1 of Winograd FFT for 7 point
   *
   * 1   0   0   0   0   0   0
   * 0   1   0   0   0   0   1
   * 0   1   0   0   0   0  -1
   * 0   0   1   0   0   1   0
   * 0   0   1   0   0  -1   0
   * 0   0   0   1   1   0   0
   * 0   0   0  -1   1   0   0
   *
   */

  /* Pair samples k and 7-k into sums and differences (DC term passes through). */
  x0r = ptr_inp[0];
  x0i = ptr_inp[1];
  x1r = ptr_inp[2] + ptr_inp[12];
  x1i = ptr_inp[3] + ptr_inp[13];
  x2r = ptr_inp[2] - ptr_inp[12];
  x2i = ptr_inp[3] - ptr_inp[13];
  x3r = ptr_inp[4] + ptr_inp[10];
  x3i = ptr_inp[5] + ptr_inp[11];
  x4r = ptr_inp[4] - ptr_inp[10];
  x4i = ptr_inp[5] - ptr_inp[11];
  x5r = ptr_inp[8] + ptr_inp[6];
  x5i = ptr_inp[9] + ptr_inp[7];
  x6r = ptr_inp[8] - ptr_inp[6];
  x6i = ptr_inp[9] - ptr_inp[7];

  /*
   * Node 2 of Winograd FFT for 7 point
   *
   * 1   0   0   0   0   0   0
   * 0   1   0   1   0   1   0
   * 0   1   0  -1   0   0   0
   * 0  -1   0   0   0   1   0
   * 0   0   0   1   0  -1   0
   * 0   0   1   0   1   0   1
   * 0   0   1   0  -1   0   0
   * 0   0  -1   0   0   0   1
   * 0   0   0   0   1   0  -1
   *
   */

  y0r = x0r;
  y0i = x0i;
  y1r = x1r + x3r + x5r;
  y1i = x1i + x3i + x5i;
  y2r = x1r - x3r;
  y2i = x1i - x3i;
  y3r = x5r - x1r;
  y3i = x5i - x1i;
  y4r = x3r - x5r;
  y4i = x3i - x5i;
  y5r = x2r + x4r + x6r;
  y5i = x2i + x4i + x6i;
  y6r = x2r - x4r;
  y6i = x2i - x4i;
  y7r = x6r - x2r;
  y7i = x6i - x2i;
  y8r = x4r - x6r;
  y8i = x4i - x6i;

  /*
   * Node 3 of Winograd FFT for 7 point
   *
   * 1    1    0    0    0     0     0     0     0
   * 1  c70    0    0    0     0     0     0     0
   * 0    0  c71    0    0     0     0     0     0
   * 0    0    0  c72    0     0     0     0     0
   * 0    0    0    0  c73     0     0     0     0
   * 0    0    0    0    0  jc74     0     0     0
   * 0    0    0    0    0     0  jc75     0     0
   * 0    0    0    0    0     0     0  jc76     0
   * 0    0    0    0    0     0     0     0  jc77
   *
   */
  x0r = y0r + y1r;
  x0i = y0i + y1i;
  x1r = y0r + C70 * y1r;
  x1i = y0i + C70 * y1i;
  x2r = C71 * y2r;
  x2i = C71 * y2i;
  x3r = C72 * y3r;
  x3i = C72 * y3i;
  x4r = C73 * y4r;
  x4i = C73 * y4i;
  /* jc74..jc77 rows: multiplying by a purely imaginary coefficient swaps
     the re/im parts and negates the new imaginary part. */
  x5r = C74 * y5i;
  x5i = -C74 * y5r;
  x6r = C75 * y6i;
  x6i = -C75 * y6r;
  x7r = C76 * y7i;
  x7i = -C76 * y7r;
  x8r = C77 * y8i;
  x8i = -C77 * y8r;

  /*
   * Node 4 of Winograd FFT for 7 point
   *
   * 1   0   0   0   0   0   0   0   0
   * 0   1   1   0   1   0   0   0   0
   * 0   1  -1  -1   0   0   0   0   0
   * 0   1   0   1  -1   0   0   0   0
   * 0   0   0   0   0   1   1   0   1
   * 0   0   0   0   0   1  -1  -1   0
   * 0   0   0   0   0   1   0   1  -1
   *
   */

  y0r = x0r;
  y0i = x0i;
  y1r = x1r + x2r + x4r;
  y1i = x1i + x2i + x4i;
  y2r = x1r - x2r - x3r;
  y2i = x1i - x2i - x3i;
  y3r = x1r + x3r - x4r;
  y3i = x1i + x3i - x4i;
  y4r = x5r + x6r + x8r;
  y4i = x5i + x6i + x8i;
  y5r = x5r - x6r - x7r;
  y5i = x5i - x6i - x7i;
  y6r = x5r + x7r - x8r;
  y6i = x5i + x7i - x8i;

  /*
   * Node 5 of Winograd FFT for 7 point
   *
   * 1   0   0   0   0   0   0
   * 0   1   0   0   1   0   0
   * 0   0   0   1   0   0   1
   * 0   0   1   0   0  -1   0
   * 0   0   1   0   0   1   0
   * 0   0   0   1   0   0  -1
   * 0   1   0   0  -1   0   0
   *
   */
  x0r = y0r;
  x0i = y0i;
  x1r = y1r + y4r;
  x1i = y1i + y4i;
  x2r = y3r + y6r;
  x2i = y3i + y6i;
  x3r = y2r - y5r;
  x3i = y2i - y5i;
  x4r = y2r + y5r;
  x4i = y2i + y5i;
  x5r = y3r - y6r;
  x5i = y3i - y6i;
  x6r = y1r - y4r;
  x6i = y1i - y4i;

  /* Write the 7 complex results back in interleaved re/im order. */
  ptr_op[0] = x0r;
  ptr_op[1] = x0i;
  ptr_op[2] = x1r;
  ptr_op[3] = x1i;
  ptr_op[4] = x2r;
  ptr_op[5] = x2i;
  ptr_op[6] = x3r;
  ptr_op[7] = x3i;
  ptr_op[8] = x4r;
  ptr_op[9] = x4i;
  ptr_op[10] = x5r;
  ptr_op[11] = x5i;
  ptr_op[12] = x6r;
  ptr_op[13] = x6i;
}
224 
ixheaace_hbe_apply_fft_3(FLOAT32 * ptr_inp,FLOAT32 * ptr_op,WORD32 i_sign)225 VOID ixheaace_hbe_apply_fft_3(FLOAT32 *ptr_inp, FLOAT32 *ptr_op, WORD32 i_sign) {
226   FLOAT32 add_r, sub_r;
227   FLOAT32 add_i, sub_i;
228   FLOAT32 x_01_r, x_01_i, temp;
229 
230   FLOAT32 p1, p2, p3, p4;
231 
232   /* mu = PI / 3; The cos and sin values are in Q31
233      cosmu is 0.5 so used >> 1 instead of multiplication */
234 
235   FLOAT64 sinmu;
236   sinmu = -0.866025403784439 * (FLOAT64)i_sign;
237 
238   x_01_r = ptr_inp[0] + ptr_inp[2];
239   x_01_i = ptr_inp[1] + ptr_inp[3];
240 
241   add_r = ptr_inp[2] + ptr_inp[4];
242   add_i = ptr_inp[3] + ptr_inp[5];
243 
244   sub_r = ptr_inp[2] - ptr_inp[4];
245   sub_i = ptr_inp[3] - ptr_inp[5];
246 
247   p1 = add_r / (FLOAT32)2.0;
248   p4 = add_i / (FLOAT32)2.0;
249   p2 = (FLOAT32)((FLOAT64)sub_i * sinmu);
250   p3 = (FLOAT32)((FLOAT64)sub_r * sinmu);
251 
252   temp = ptr_inp[0] - p1;
253 
254   ptr_op[0] = x_01_r + ptr_inp[4];
255   ptr_op[1] = x_01_i + ptr_inp[5];
256   ptr_op[2] = temp + p2;
257   ptr_op[3] = (ptr_inp[1] - p3) - p4;
258   ptr_op[4] = temp - p2;
259   ptr_op[5] = (ptr_inp[1] + p3) - p4;
260 }
261 
ixheaace_hbe_apply_tw_mult_ifft(FLOAT32 * ptr_inp,FLOAT32 * ptr_op,WORD32 dim1,WORD32 dim2,const FLOAT32 * ptr_tw)262 VOID ixheaace_hbe_apply_tw_mult_ifft(FLOAT32 *ptr_inp, FLOAT32 *ptr_op, WORD32 dim1, WORD32 dim2,
263                                      const FLOAT32 *ptr_tw) {
264   FLOAT32 accu1, accu2;
265   WORD32 i, j;
266   WORD32 step_val = (dim2 - 1) << 1;
267   for (i = 0; i < (dim2); i++) {
268     ptr_op[0] = ptr_inp[0];
269     ptr_op[1] = ptr_inp[1];
270     ptr_op += 2;
271     ptr_inp += 2;
272   }
273 
274   for (j = 0; j < (dim1 - 1); j++) {
275     ptr_op[0] = ptr_inp[0];
276     ptr_op[1] = ptr_inp[1];
277     ptr_inp += 2;
278     ptr_op += 2;
279     for (i = 0; i < (dim2 - 1); i++) {
280       CPLX_MPY_IFFT(accu1, accu2, ptr_inp[2 * i + 0], ptr_inp[2 * i + 1], ptr_tw[2 * i + 1],
281                     ptr_tw[2 * i]);
282       ptr_op[2 * i + 0] = accu1;
283       ptr_op[2 * i + 1] = accu2;
284     }
285     ptr_inp += step_val;
286     ptr_op += step_val;
287     ptr_tw += (dim2 - 1) * 2;
288   }
289 }
290 
ixheaace_hbe_apply_tw_mult_fft(FLOAT32 * ptr_inp,FLOAT32 * ptr_op,WORD32 dim1,WORD32 dim2,const FLOAT32 * ptr_tw)291 VOID ixheaace_hbe_apply_tw_mult_fft(FLOAT32 *ptr_inp, FLOAT32 *ptr_op, WORD32 dim1, WORD32 dim2,
292                                     const FLOAT32 *ptr_tw) {
293   FLOAT32 accu1, accu2;
294   WORD32 i, j;
295   WORD32 step_val = (dim2 - 1) << 1;
296   for (i = 0; i < (dim2); i++) {
297     ptr_op[0] = ptr_inp[0];
298     ptr_op[1] = ptr_inp[1];
299     ptr_op += 2;
300     ptr_inp += 2;
301   }
302 
303   for (j = 0; j < (dim1 - 1); j++) {
304     ptr_op[0] = ptr_inp[0];
305     ptr_op[1] = ptr_inp[1];
306     ptr_inp += 2;
307     ptr_op += 2;
308     for (i = 0; i < (dim2 - 1); i++) {
309       CPLX_MPY_FFT(accu1, accu2, ptr_inp[2 * i + 0], ptr_inp[2 * i + 1], ptr_tw[2 * i + 1],
310                    ptr_tw[2 * i]);
311       ptr_op[2 * i + 0] = accu1;
312       ptr_op[2 * i + 1] = accu2;
313     }
314     ptr_inp += step_val;
315     ptr_op += step_val;
316     ptr_tw += (dim2 - 1) * 2;
317   }
318 }
319 
ixheaace_hbe_apply_cfftn(FLOAT32 re[],FLOAT32 * ptr_scratch,WORD32 n_pass,WORD32 i_sign)320 VOID ixheaace_hbe_apply_cfftn(FLOAT32 re[], FLOAT32 *ptr_scratch, WORD32 n_pass, WORD32 i_sign) {
321   WORD32 i, j, k, n_stages, h2;
322   FLOAT32 x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
323   WORD32 del, nodespacing, in_loop_cnt;
324   WORD32 not_power_4;
325   WORD32 dig_rev_shift;
326   WORD32 mpass = n_pass;
327   WORD32 npoints = n_pass;
328   const FLOAT64 *ptr_w;
329   FLOAT32 *ptr_x = ptr_scratch;
330   FLOAT32 *y = ptr_scratch + (2 * n_pass);
331   FLOAT32 *ptr_y = y;
332 
333   dig_rev_shift = ixheaac_norm32(mpass) + 1 - 16;
334   n_stages = 30 - ixheaac_norm32(mpass); /* log2(npoints), if npoints=2^m */
335   not_power_4 = n_stages & 1;
336 
337   n_stages = n_stages >> 1;
338 
339   ptr_w = ixheaac_twid_tbl_fft_double;
340   ptr_x = re;
341 
342   dig_rev_shift = MAX(dig_rev_shift, 0);
343 
344   if (i_sign == -1) {
345     for (i = 0; i < npoints; i += 4) {
346       FLOAT32 *ptr_inp = ptr_x;
347       FLOAT32 tmk;
348 
349       DIG_REV(i, dig_rev_shift, h2);
350       if (not_power_4) {
351         h2 += 1;
352         h2 &= ~1;
353       }
354       ptr_inp += (h2);
355 
356       x0r = *ptr_inp;
357       x0i = *(ptr_inp + 1);
358       ptr_inp += (npoints >> 1);
359 
360       x1r = *ptr_inp;
361       x1i = *(ptr_inp + 1);
362       ptr_inp += (npoints >> 1);
363 
364       x2r = *ptr_inp;
365       x2i = *(ptr_inp + 1);
366       ptr_inp += (npoints >> 1);
367 
368       x3r = *ptr_inp;
369       x3i = *(ptr_inp + 1);
370 
371       x0r = x0r + x2r;
372       x0i = x0i + x2i;
373 
374       tmk = x0r - x2r;
375       x2r = tmk - x2r;
376       tmk = x0i - x2i;
377       x2i = tmk - x2i;
378 
379       x1r = x1r + x3r;
380       x1i = x1i + x3i;
381 
382       tmk = x1r - x3r;
383       x3r = tmk - x3r;
384       tmk = x1i - x3i;
385       x3i = tmk - x3i;
386 
387       x0r = x0r + x1r;
388       x0i = x0i + x1i;
389 
390       tmk = x0r - x1r;
391       x1r = tmk - x1r;
392       tmk = x0i - x1i;
393       x1i = tmk - x1i;
394 
395       x2r = x2r + x3i;
396       x2i = x2i - x3r;
397 
398       tmk = x2r - x3i;
399       x3i = tmk - x3i;
400       tmk = x2i + x3r;
401       x3r = tmk + x3r;
402 
403       *ptr_y++ = x0r;
404       *ptr_y++ = x0i;
405       *ptr_y++ = x2r;
406       *ptr_y++ = x2i;
407       *ptr_y++ = x1r;
408       *ptr_y++ = x1i;
409       *ptr_y++ = x3i;
410       *ptr_y++ = x3r;
411     }
412     ptr_y -= 2 * npoints;
413     del = 4;
414     nodespacing = 64;
415     in_loop_cnt = npoints >> 4;
416     for (i = n_stages - 1; i > 0; i--) {
417       const FLOAT64 *ptr_twiddle = ptr_w;
418       FLOAT32 *data = ptr_y;
419       FLOAT64 w_1, w_2, w_3, w_4, w_5, w_6;
420       WORD32 sec_loop_cnt;
421 
422       for (k = in_loop_cnt; k != 0; k--) {
423         x0r = (*data);
424         x0i = (*(data + 1));
425         data += ((SIZE_T)del << 1);
426 
427         x1r = (*data);
428         x1i = (*(data + 1));
429         data += ((SIZE_T)del << 1);
430 
431         x2r = (*data);
432         x2i = (*(data + 1));
433         data += ((SIZE_T)del << 1);
434 
435         x3r = (*data);
436         x3i = (*(data + 1));
437         data -= 3 * (del << 1);
438 
439         x0r = x0r + x2r;
440         x0i = x0i + x2i;
441         x2r = x0r - (x2r * 2);
442         x2i = x0i - (x2i * 2);
443         x1r = x1r + x3r;
444         x1i = x1i + x3i;
445         x3r = x1r - (x3r * 2);
446         x3i = x1i - (x3i * 2);
447 
448         x0r = x0r + x1r;
449         x0i = x0i + x1i;
450         x1r = x0r - (x1r * 2);
451         x1i = x0i - (x1i * 2);
452         x2r = x2r + x3i;
453         x2i = x2i - x3r;
454         x3i = x2r - (x3i * 2);
455         x3r = x2i + (x3r * 2);
456 
457         *data = x0r;
458         *(data + 1) = x0i;
459         data += ((SIZE_T)del << 1);
460 
461         *data = x2r;
462         *(data + 1) = x2i;
463         data += ((SIZE_T)del << 1);
464 
465         *data = x1r;
466         *(data + 1) = x1i;
467         data += ((SIZE_T)del << 1);
468 
469         *data = x3i;
470         *(data + 1) = x3r;
471         data += ((SIZE_T)del << 1);
472       }
473       data = ptr_y + 2;
474 
475       sec_loop_cnt = (nodespacing * del);
476       sec_loop_cnt = (sec_loop_cnt / 4) + (sec_loop_cnt / 8) - (sec_loop_cnt / 16) +
477                      (sec_loop_cnt / 32) - (sec_loop_cnt / 64) + (sec_loop_cnt / 128) -
478                      (sec_loop_cnt / 256);
479 
480       for (j = nodespacing; j <= sec_loop_cnt; j += nodespacing) {
481         w_1 = *(ptr_twiddle + j);
482         w_4 = *(ptr_twiddle + j + 257);
483         w_2 = *(ptr_twiddle + ((SIZE_T)j << 1));
484         w_5 = *(ptr_twiddle + ((SIZE_T)j << 1) + 257);
485         w_3 = *(ptr_twiddle + j + ((SIZE_T)j << 1));
486         w_6 = *(ptr_twiddle + j + ((SIZE_T)j << 1) + 257);
487 
488         for (k = in_loop_cnt; k != 0; k--) {
489           FLOAT32 tmp;
490           FLOAT32 x0r1, x0i1, x1r1, x1i1, x2r1, x2i1, x3r1, x3i1;
491           /*x0 is loaded later to avoid register crunch*/
492 
493           data += ((SIZE_T)del << 1);
494 
495           x1r1 = *data;
496           x1i1 = *(data + 1);
497           data += ((SIZE_T)del << 1);
498 
499           x2r1 = *data;
500           x2i1 = *(data + 1);
501           data += ((SIZE_T)del << 1);
502 
503           x3r1 = *data;
504           x3i1 = *(data + 1);
505           data -= 3 * (del << 1);
506 
507           tmp =
508               (FLOAT32)(ixheaace_dmult((FLOAT64)x1r1, w_1) - ixheaace_dmult((FLOAT64)x1i1, w_4));
509           x1i1 = (FLOAT32)ixheaace_dmac(ixheaace_dmult((FLOAT64)x1r1, w_4), (FLOAT64)x1i1, w_1);
510           x1r1 = tmp;
511 
512           tmp =
513               (FLOAT32)(ixheaace_dmult((FLOAT64)x2r1, w_2) - ixheaace_dmult((FLOAT64)x2i1, w_5));
514           x2i1 = (FLOAT32)ixheaace_dmac(ixheaace_dmult((FLOAT64)x2r1, w_5), (FLOAT64)x2i1, w_2);
515           x2r1 = tmp;
516 
517           tmp =
518               (FLOAT32)(ixheaace_dmult((FLOAT64)x3r1, w_3) - ixheaace_dmult((FLOAT64)x3i1, w_6));
519           x3i1 = (FLOAT32)ixheaace_dmac(ixheaace_dmult((FLOAT64)x3r1, w_6), (FLOAT64)x3i1, w_3);
520           x3r1 = tmp;
521 
522           x0r1 = (*data);
523           x0i1 = (*(data + 1));
524 
525           x0r1 = x0r1 + (x2r1);
526           x0i1 = x0i1 + (x2i1);
527           x2r1 = x0r1 - (x2r1 * 2);
528           x2i1 = x0i1 - (x2i1 * 2);
529           x1r1 = x1r1 + x3r1;
530           x1i1 = x1i1 + x3i1;
531           x3r1 = x1r1 - (x3r1 * 2);
532           x3i1 = x1i1 - (x3i1 * 2);
533 
534           x0r1 = x0r1 + (x1r1);
535           x0i1 = x0i1 + (x1i1);
536           x1r1 = x0r1 - (x1r1 * 2);
537           x1i1 = x0i1 - (x1i1 * 2);
538           x2r1 = x2r1 + (x3i1);
539           x2i1 = x2i1 - (x3r1);
540           x3i1 = x2r1 - (x3i1 * 2);
541           x3r1 = x2i1 + (x3r1 * 2);
542 
543           *data = x0r1;
544           *(data + 1) = x0i1;
545           data += ((SIZE_T)del << 1);
546 
547           *data = x2r1;
548           *(data + 1) = x2i1;
549           data += ((SIZE_T)del << 1);
550 
551           *data = x1r1;
552           *(data + 1) = x1i1;
553           data += ((SIZE_T)del << 1);
554 
555           *data = x3i1;
556           *(data + 1) = x3r1;
557           data += ((SIZE_T)del << 1);
558         }
559         data -= 2 * npoints;
560         data += 2;
561       }
562       for (; j <= (nodespacing * del) >> 1; j += nodespacing) {
563         w_1 = *(ptr_twiddle + j);
564         w_4 = *(ptr_twiddle + j + 257);
565         w_2 = *(ptr_twiddle + ((SIZE_T)j << 1));
566         w_5 = *(ptr_twiddle + ((SIZE_T)j << 1) + 257);
567         w_3 = *(ptr_twiddle + j + ((SIZE_T)j << 1) - 256);
568         w_6 = *(ptr_twiddle + j + ((SIZE_T)j << 1) + 1);
569 
570         for (k = in_loop_cnt; k != 0; k--) {
571           FLOAT32 tmp;
572           FLOAT32 x0r1, x0i1, x1r1, x1i1, x2r1, x2i1, x3r1, x3i1;
573           /*x0 is loaded later to avoid register crunch*/
574 
575           data += ((SIZE_T)del << 1);
576 
577           x1r1 = *data;
578           x1i1 = *(data + 1);
579           data += ((SIZE_T)del << 1);
580 
581           x2r1 = *data;
582           x2i1 = *(data + 1);
583           data += ((SIZE_T)del << 1);
584 
585           x3r1 = *data;
586           x3i1 = *(data + 1);
587           data -= 3 * (del << 1);
588 
589           tmp =
590               (FLOAT32)(ixheaace_dmult((FLOAT64)x1r1, w_1) - ixheaace_dmult((FLOAT64)x1i1, w_4));
591           x1i1 = (FLOAT32)ixheaace_dmac(ixheaace_dmult((FLOAT64)x1r1, w_4), (FLOAT64)x1i1, w_1);
592           x1r1 = tmp;
593 
594           tmp =
595               (FLOAT32)(ixheaace_dmult((FLOAT64)x2r1, w_2) - ixheaace_dmult((FLOAT64)x2i1, w_5));
596           x2i1 = (FLOAT32)ixheaace_dmac(ixheaace_dmult((FLOAT64)x2r1, w_5), (FLOAT64)x2i1, w_2);
597           x2r1 = tmp;
598 
599           tmp =
600               (FLOAT32)(ixheaace_dmult((FLOAT64)x3r1, w_6) + ixheaace_dmult((FLOAT64)x3i1, w_3));
601           x3i1 =
602               (FLOAT32)(-ixheaace_dmult((FLOAT64)x3r1, w_3) + ixheaace_dmult((FLOAT64)x3i1, w_6));
603           x3r1 = tmp;
604 
605           x0r1 = (*data);
606           x0i1 = (*(data + 1));
607 
608           x0r1 = x0r1 + (x2r1);
609           x0i1 = x0i1 + (x2i1);
610           x2r1 = x0r1 - (x2r1 * 2);
611           x2i1 = x0i1 - (x2i1 * 2);
612           x1r1 = x1r1 + x3r1;
613           x1i1 = x1i1 + x3i1;
614           x3r1 = x1r1 - (x3r1 * 2);
615           x3i1 = x1i1 - (x3i1 * 2);
616 
617           x0r1 = x0r1 + (x1r1);
618           x0i1 = x0i1 + (x1i1);
619           x1r1 = x0r1 - (x1r1 * 2);
620           x1i1 = x0i1 - (x1i1 * 2);
621           x2r1 = x2r1 + (x3i1);
622           x2i1 = x2i1 - (x3r1);
623           x3i1 = x2r1 - (x3i1 * 2);
624           x3r1 = x2i1 + (x3r1 * 2);
625 
626           *data = x0r1;
627           *(data + 1) = x0i1;
628           data += ((SIZE_T)del << 1);
629 
630           *data = x2r1;
631           *(data + 1) = x2i1;
632           data += ((SIZE_T)del << 1);
633 
634           *data = x1r1;
635           *(data + 1) = x1i1;
636           data += ((SIZE_T)del << 1);
637 
638           *data = x3i1;
639           *(data + 1) = x3r1;
640           data += ((SIZE_T)del << 1);
641         }
642         data -= 2 * npoints;
643         data += 2;
644       }
645       for (; j <= sec_loop_cnt * 2; j += nodespacing) {
646         w_1 = *(ptr_twiddle + j);
647         w_4 = *(ptr_twiddle + j + 257);
648         w_2 = *(ptr_twiddle + ((SIZE_T)j << 1) - 256);
649         w_5 = *(ptr_twiddle + ((SIZE_T)j << 1) + 1);
650         w_3 = *(ptr_twiddle + j + ((SIZE_T)j << 1) - 256);
651         w_6 = *(ptr_twiddle + j + ((SIZE_T)j << 1) + 1);
652 
653         for (k = in_loop_cnt; k != 0; k--) {
654           FLOAT32 tmp;
655           FLOAT32 x0r1, x0i1, x1r1, x1i1, x2r1, x2i1, x3r1, x3i1;
656           /*x0 is loaded later to avoid register crunch*/
657 
658           data += ((SIZE_T)del << 1);
659 
660           x1r1 = *data;
661           x1i1 = *(data + 1);
662           data += ((SIZE_T)del << 1);
663 
664           x2r1 = *data;
665           x2i1 = *(data + 1);
666           data += ((SIZE_T)del << 1);
667 
668           x3r1 = *data;
669           x3i1 = *(data + 1);
670           data -= 3 * (del << 1);
671 
672           tmp =
673               (FLOAT32)(ixheaace_dmult((FLOAT64)x1r1, w_1) - ixheaace_dmult((FLOAT64)x1i1, w_4));
674           x1i1 = (FLOAT32)ixheaace_dmac(ixheaace_dmult(x1r1, w_4), x1i1, w_1);
675           x1r1 = tmp;
676 
677           tmp =
678               (FLOAT32)(ixheaace_dmult((FLOAT64)x2r1, w_5) + ixheaace_dmult((FLOAT64)x2i1, w_2));
679           x2i1 = (FLOAT32)(-ixheaace_dmult(x2r1, w_2) + ixheaace_dmult(x2i1, w_5));
680           x2r1 = tmp;
681 
682           tmp =
683               (FLOAT32)(ixheaace_dmult((FLOAT64)x3r1, w_6) + ixheaace_dmult((FLOAT64)x3i1, w_3));
684           x3i1 =
685               (FLOAT32)(-ixheaace_dmult((FLOAT64)x3r1, w_3) + ixheaace_dmult((FLOAT64)x3i1, w_6));
686           x3r1 = tmp;
687 
688           x0r1 = (*data);
689           x0i1 = (*(data + 1));
690 
691           x0r1 = x0r1 + (x2r1);
692           x0i1 = x0i1 + (x2i1);
693           x2r1 = x0r1 - (x2r1 * 2);
694           x2i1 = x0i1 - (x2i1 * 2);
695           x1r1 = x1r1 + x3r1;
696           x1i1 = x1i1 + x3i1;
697           x3r1 = x1r1 - (x3r1 * 2);
698           x3i1 = x1i1 - (x3i1 * 2);
699 
700           x0r1 = x0r1 + (x1r1);
701           x0i1 = x0i1 + (x1i1);
702           x1r1 = x0r1 - (x1r1 * 2);
703           x1i1 = x0i1 - (x1i1 * 2);
704           x2r1 = x2r1 + (x3i1);
705           x2i1 = x2i1 - (x3r1);
706           x3i1 = x2r1 - (x3i1 * 2);
707           x3r1 = x2i1 + (x3r1 * 2);
708 
709           *data = x0r1;
710           *(data + 1) = x0i1;
711           data += ((SIZE_T)del << 1);
712 
713           *data = x2r1;
714           *(data + 1) = x2i1;
715           data += ((SIZE_T)del << 1);
716 
717           *data = x1r1;
718           *(data + 1) = x1i1;
719           data += ((SIZE_T)del << 1);
720 
721           *data = x3i1;
722           *(data + 1) = x3r1;
723           data += ((SIZE_T)del << 1);
724         }
725         data -= 2 * npoints;
726         data += 2;
727       }
728       for (; j < nodespacing * del; j += nodespacing) {
729         w_1 = *(ptr_twiddle + j);
730         w_4 = *(ptr_twiddle + j + 257);
731         w_2 = *(ptr_twiddle + ((SIZE_T)j << 1) - 256);
732         w_5 = *(ptr_twiddle + ((SIZE_T)j << 1) + 1);
733         w_3 = *(ptr_twiddle + j + ((SIZE_T)j << 1) - 512);
734         w_6 = *(ptr_twiddle + j + ((SIZE_T)j << 1) - 512 + 257);
735 
736         for (k = in_loop_cnt; k != 0; k--) {
737           FLOAT32 tmp;
738           FLOAT32 x0r1, x0i1, x1r1, x1i1, x2r1, x2i1, x3r1, x3i1;
739           /*x0 is loaded later to avoid register crunch*/
740 
741           data += ((SIZE_T)del << 1);
742 
743           x1r1 = *data;
744           x1i1 = *(data + 1);
745           data += ((SIZE_T)del << 1);
746 
747           x2r1 = *data;
748           x2i1 = *(data + 1);
749           data += ((SIZE_T)del << 1);
750 
751           x3r1 = *data;
752           x3i1 = *(data + 1);
753           data -= 3 * (del << 1);
754 
755           tmp =
756               (FLOAT32)(ixheaace_dmult((FLOAT64)x1r1, w_1) - ixheaace_dmult((FLOAT64)x1i1, w_4));
757           x1i1 = (FLOAT32)ixheaace_dmac(ixheaace_dmult((FLOAT64)x1r1, w_4), (FLOAT64)x1i1, w_1);
758           x1r1 = tmp;
759 
760           tmp =
761               (FLOAT32)(ixheaace_dmult((FLOAT64)x2r1, w_5) + ixheaace_dmult((FLOAT64)x2i1, w_2));
762           x2i1 =
763               (FLOAT32)(-ixheaace_dmult((FLOAT64)x2r1, w_2) + ixheaace_dmult((FLOAT64)x2i1, w_5));
764           x2r1 = tmp;
765 
766           tmp =
767               (FLOAT32)(-ixheaace_dmult((FLOAT64)x3r1, w_3) + ixheaace_dmult((FLOAT64)x3i1, w_6));
768           x3i1 = (FLOAT32)ixheaace_dmac(ixheaace_dmult((FLOAT64)x3r1, w_6), (FLOAT64)x3i1, w_3);
769           x3r1 = tmp;
770 
771           x0r1 = (*data);
772           x0i1 = (*(data + 1));
773 
774           x0r1 = x0r1 + (x2r1);
775           x0i1 = x0i1 + (x2i1);
776           x2r1 = x0r1 - (x2r1 * 2);
777           x2i1 = x0i1 - (x2i1 * 2);
778           x1r1 = x1r1 + x3r1;
779           x1i1 = x1i1 - x3i1;
780           x3r1 = x1r1 - (x3r1 * 2);
781           x3i1 = x1i1 + (x3i1 * 2);
782 
783           x0r1 = x0r1 + (x1r1);
784           x0i1 = x0i1 + (x1i1);
785           x1r1 = x0r1 - (x1r1 * 2);
786           x1i1 = x0i1 - (x1i1 * 2);
787           x2r1 = x2r1 + (x3i1);
788           x2i1 = x2i1 - (x3r1);
789           x3i1 = x2r1 - (x3i1 * 2);
790           x3r1 = x2i1 + (x3r1 * 2);
791 
792           *data = x0r1;
793           *(data + 1) = x0i1;
794           data += ((SIZE_T)del << 1);
795 
796           *data = x2r1;
797           *(data + 1) = x2i1;
798           data += ((SIZE_T)del << 1);
799 
800           *data = x1r1;
801           *(data + 1) = x1i1;
802           data += ((SIZE_T)del << 1);
803 
804           *data = x3i1;
805           *(data + 1) = x3r1;
806           data += ((SIZE_T)del << 1);
807         }
808         data -= 2 * npoints;
809         data += 2;
810       }
811       nodespacing >>= 2;
812       del <<= 2;
813       in_loop_cnt >>= 2;
814     }
815     if (not_power_4) {
816       const double *ptr_twiddle = ptr_w;
817       nodespacing <<= 1;
818 
819       for (j = del / 2; j != 0; j--) {
820         FLOAT64 w_1 = *ptr_twiddle;
821         FLOAT64 w_4 = *(ptr_twiddle + 257);
822         FLOAT32 tmp;
823         ptr_twiddle += nodespacing;
824 
825         x0r = *ptr_y;
826         x0i = *(ptr_y + 1);
827         ptr_y += ((SIZE_T)del << 1);
828 
829         x1r = *ptr_y;
830         x1i = *(ptr_y + 1);
831 
832         tmp = (FLOAT32)(ixheaace_dmult((FLOAT64)x1r, w_1) - ixheaace_dmult((FLOAT64)x1i, w_4));
833         x1i = (FLOAT32)ixheaace_dmac(ixheaace_dmult((FLOAT64)x1r, w_4), (FLOAT64)x1i, w_1);
834         x1r = tmp;
835 
836         *ptr_y = (x0r) - (x1r);
837         *(ptr_y + 1) = (x0i) - (x1i);
838         ptr_y -= ((SIZE_T)del << 1);
839 
840         *ptr_y = (x0r) + (x1r);
841         *(ptr_y + 1) = (x0i) + (x1i);
842         ptr_y += 2;
843       }
844       ptr_twiddle = ptr_w;
845       for (j = del / 2; j != 0; j--) {
846         FLOAT64 w_1 = *ptr_twiddle;
847         FLOAT64 w_4 = *(ptr_twiddle + 257);
848         FLOAT32 tmp;
849         ptr_twiddle += nodespacing;
850 
851         x0r = *ptr_y;
852         x0i = *(ptr_y + 1);
853         ptr_y += ((SIZE_T)del << 1);
854 
855         x1r = *ptr_y;
856         x1i = *(ptr_y + 1);
857 
858         tmp = (FLOAT32)(ixheaace_dmult((FLOAT64)x1r, w_4) + ixheaace_dmult((FLOAT64)x1i, w_1));
859         x1i = (FLOAT32)(-ixheaace_dmult((FLOAT64)x1r, w_1) + ixheaace_dmult((FLOAT64)x1i, w_4));
860         x1r = tmp;
861 
862         *ptr_y = (x0r) - (x1r);
863         *(ptr_y + 1) = (x0i) - (x1i);
864         ptr_y -= ((SIZE_T)del << 1);
865 
866         *ptr_y = (x0r) + (x1r);
867         *(ptr_y + 1) = (x0i) + (x1i);
868         ptr_y += 2;
869       }
870     }
871   }
872 
873   /**********************IFFT******************************************/
874 
875   else {
876     for (i = 0; i < npoints; i += 4) {
877       FLOAT32 *ptr_inp = ptr_x;
878 
879       DIG_REV(i, dig_rev_shift, h2);
880       if (not_power_4) {
881         h2 += 1;
882         h2 &= ~1;
883       }
884       ptr_inp += (h2);
885 
886       x0r = *ptr_inp;
887       x0i = *(ptr_inp + 1);
888       ptr_inp += (npoints >> 1);
889 
890       x1r = *ptr_inp;
891       x1i = *(ptr_inp + 1);
892       ptr_inp += (npoints >> 1);
893 
894       x2r = *ptr_inp;
895       x2i = *(ptr_inp + 1);
896       ptr_inp += (npoints >> 1);
897 
898       x3r = *ptr_inp;
899       x3i = *(ptr_inp + 1);
900 
901       x0r = x0r + x2r;
902       x0i = x0i + x2i;
903       x2r = x0r - (x2r * 2);
904       x2i = x0i - (x2i * 2);
905       x1r = x1r + x3r;
906       x1i = x1i + x3i;
907       x3r = x1r - (x3r * 2);
908       x3i = x1i - (x3i * 2);
909 
910       x0r = x0r + x1r;
911       x0i = x0i + x1i;
912       x1r = x0r - (x1r * 2);
913       x1i = x0i - (x1i * 2);
914       x2r = x2r - x3i;
915       x2i = x2i + x3r;
916       x3i = x2r + (x3i * 2);
917       x3r = x2i - (x3r * 2);
918 
919       *ptr_y++ = x0r;
920       *ptr_y++ = x0i;
921       *ptr_y++ = x2r;
922       *ptr_y++ = x2i;
923       *ptr_y++ = x1r;
924       *ptr_y++ = x1i;
925       *ptr_y++ = x3i;
926       *ptr_y++ = x3r;
927     }
928     ptr_y -= 2 * npoints;
929     del = 4;
930     nodespacing = 64;
931     in_loop_cnt = npoints >> 4;
932     for (i = n_stages - 1; i > 0; i--) {
933       const double *ptr_twiddle = ptr_w;
934       float *data = ptr_y;
935       double w_1, w_2, w_3, w_4, w_5, w_6;
936       int sec_loop_cnt;
937 
938       for (k = in_loop_cnt; k != 0; k--) {
939         x0r = (*data);
940         x0i = (*(data + 1));
941         data += ((SIZE_T)del << 1);
942 
943         x1r = (*data);
944         x1i = (*(data + 1));
945         data += ((SIZE_T)del << 1);
946 
947         x2r = (*data);
948         x2i = (*(data + 1));
949         data += ((SIZE_T)del << 1);
950 
951         x3r = (*data);
952         x3i = (*(data + 1));
953         data -= 3 * (del << 1);
954 
955         x0r = x0r + x2r;
956         x0i = x0i + x2i;
957         x2r = x0r - (x2r * 2);
958         x2i = x0i - (x2i * 2);
959         x1r = x1r + x3r;
960         x1i = x1i + x3i;
961         x3r = x1r - (x3r * 2);
962         x3i = x1i - (x3i * 2);
963 
964         x0r = x0r + x1r;
965         x0i = x0i + x1i;
966         x1r = x0r - (x1r * 2);
967         x1i = x0i - (x1i * 2);
968         x2r = x2r - x3i;
969         x2i = x2i + x3r;
970         x3i = x2r + (x3i * 2);
971         x3r = x2i - (x3r * 2);
972 
973         *data = x0r;
974         *(data + 1) = x0i;
975         data += ((SIZE_T)del << 1);
976 
977         *data = x2r;
978         *(data + 1) = x2i;
979         data += ((SIZE_T)del << 1);
980 
981         *data = x1r;
982         *(data + 1) = x1i;
983         data += ((SIZE_T)del << 1);
984 
985         *data = x3i;
986         *(data + 1) = x3r;
987         data += ((SIZE_T)del << 1);
988       }
989       data = ptr_y + 2;
990 
991       sec_loop_cnt = (nodespacing * del);
992       sec_loop_cnt = (sec_loop_cnt / 4) + (sec_loop_cnt / 8) - (sec_loop_cnt / 16) +
993                      (sec_loop_cnt / 32) - (sec_loop_cnt / 64) + (sec_loop_cnt / 128) -
994                      (sec_loop_cnt / 256);
995 
996       for (j = nodespacing; j <= sec_loop_cnt; j += nodespacing) {
997         w_1 = *(ptr_twiddle + j);
998         w_4 = *(ptr_twiddle + j + 257);
999         w_2 = *(ptr_twiddle + ((SIZE_T)j << 1));
1000         w_5 = *(ptr_twiddle + ((SIZE_T)j << 1) + 257);
1001         w_3 = *(ptr_twiddle + j + ((SIZE_T)j << 1));
1002         w_6 = *(ptr_twiddle + j + ((SIZE_T)j << 1) + 257);
1003 
1004         for (k = in_loop_cnt; k != 0; k--) {
1005           FLOAT32 tmp;
1006           FLOAT32 x0r1, x0i1, x1r1, x1i1, x2r1, x2i1, x3r1, x3i1;
1007           /*x0 is loaded later to avoid register crunch*/
1008 
1009           data += ((SIZE_T)del << 1);
1010 
1011           x1r1 = *data;
1012           x1i1 = *(data + 1);
1013           data += ((SIZE_T)del << 1);
1014 
1015           x2r1 = *data;
1016           x2i1 = *(data + 1);
1017           data += ((SIZE_T)del << 1);
1018 
1019           x3r1 = *data;
1020           x3i1 = *(data + 1);
1021           data -= 3 * (del << 1);
1022 
1023           tmp = (FLOAT32)(((FLOAT64)x1r1 * w_1) + ((FLOAT64)x1i1 * w_4));
1024           x1i1 = (FLOAT32)(-((FLOAT64)x1r1 * w_4) + (FLOAT64)x1i1 * w_1);
1025           x1r1 = tmp;
1026 
1027           tmp = (FLOAT32)(((FLOAT64)x2r1 * w_2) + ((FLOAT64)x2i1 * w_5));
1028           x2i1 = (FLOAT32)(-((FLOAT64)x2r1 * w_5) + (FLOAT64)x2i1 * w_2);
1029           x2r1 = tmp;
1030 
1031           tmp = (FLOAT32)(((FLOAT64)x3r1 * w_3) + ((FLOAT64)x3i1 * w_6));
1032           x3i1 = (FLOAT32)(-((FLOAT64)x3r1 * w_6) + (FLOAT64)x3i1 * w_3);
1033           x3r1 = tmp;
1034 
1035           x0r1 = (*data);
1036           x0i1 = (*(data + 1));
1037 
1038           x0r1 = x0r1 + (x2r1);
1039           x0i1 = x0i1 + (x2i1);
1040           x2r1 = x0r1 - (x2r1 * 2);
1041           x2i1 = x0i1 - (x2i1 * 2);
1042           x1r1 = x1r1 + x3r1;
1043           x1i1 = x1i1 + x3i1;
1044           x3r1 = x1r1 - (x3r1 * 2);
1045           x3i1 = x1i1 - (x3i1 * 2);
1046 
1047           x0r1 = x0r1 + (x1r1);
1048           x0i1 = x0i1 + (x1i1);
1049           x1r1 = x0r1 - (x1r1 * 2);
1050           x1i1 = x0i1 - (x1i1 * 2);
1051           x2r1 = x2r1 - (x3i1);
1052           x2i1 = x2i1 + (x3r1);
1053           x3i1 = x2r1 + (x3i1 * 2);
1054           x3r1 = x2i1 - (x3r1 * 2);
1055 
1056           *data = x0r1;
1057           *(data + 1) = x0i1;
1058           data += ((SIZE_T)del << 1);
1059 
1060           *data = x2r1;
1061           *(data + 1) = x2i1;
1062           data += ((SIZE_T)del << 1);
1063 
1064           *data = x1r1;
1065           *(data + 1) = x1i1;
1066           data += ((SIZE_T)del << 1);
1067 
1068           *data = x3i1;
1069           *(data + 1) = x3r1;
1070           data += ((SIZE_T)del << 1);
1071         }
1072         data -= 2 * npoints;
1073         data += 2;
1074       }
1075       for (; j <= (nodespacing * del) >> 1; j += nodespacing) {
1076         w_1 = *(ptr_twiddle + j);
1077         w_4 = *(ptr_twiddle + j + 257);
1078         w_2 = *(ptr_twiddle + ((SIZE_T)j << 1));
1079         w_5 = *(ptr_twiddle + ((SIZE_T)j << 1) + 257);
1080         w_3 = *(ptr_twiddle + j + ((SIZE_T)j << 1) - 256);
1081         w_6 = *(ptr_twiddle + j + ((SIZE_T)j << 1) + 1);
1082 
1083         for (k = in_loop_cnt; k != 0; k--) {
1084           FLOAT32 tmp;
1085           FLOAT32 x0r1, x0i1, x1r1, x1i1, x2r1, x2i1, x3r1, x3i1;
1086           /*x0 is loaded later to avoid register crunch*/
1087 
1088           data += ((SIZE_T)del << 1);
1089 
1090           x1r1 = *data;
1091           x1i1 = *(data + 1);
1092           data += ((SIZE_T)del << 1);
1093 
1094           x2r1 = *data;
1095           x2i1 = *(data + 1);
1096           data += ((SIZE_T)del << 1);
1097 
1098           x3r1 = *data;
1099           x3i1 = *(data + 1);
1100           data -= 3 * (del << 1);
1101 
1102           tmp = (FLOAT32)(((FLOAT64)x1r1 * w_1) + ((FLOAT64)x1i1 * w_4));
1103           x1i1 = (FLOAT32)(-((FLOAT64)x1r1 * w_4) + (FLOAT64)x1i1 * w_1);
1104           x1r1 = tmp;
1105 
1106           tmp = (FLOAT32)(((FLOAT64)x2r1 * w_2) + ((FLOAT64)x2i1 * w_5));
1107           x2i1 = (FLOAT32)(-((FLOAT64)x2r1 * w_5) + (FLOAT64)x2i1 * w_2);
1108           x2r1 = tmp;
1109 
1110           tmp = (FLOAT32)(((FLOAT64)x3r1 * w_6) - ((FLOAT64)x3i1 * w_3));
1111           x3i1 = (FLOAT32)(((FLOAT64)x3r1 * w_3) + ((FLOAT64)x3i1 * w_6));
1112           x3r1 = tmp;
1113 
1114           x0r1 = (*data);
1115           x0i1 = (*(data + 1));
1116 
1117           x0r1 = x0r1 + (x2r1);
1118           x0i1 = x0i1 + (x2i1);
1119           x2r1 = x0r1 - (x2r1 * 2);
1120           x2i1 = x0i1 - (x2i1 * 2);
1121           x1r1 = x1r1 + x3r1;
1122           x1i1 = x1i1 + x3i1;
1123           x3r1 = x1r1 - (x3r1 * 2);
1124           x3i1 = x1i1 - (x3i1 * 2);
1125 
1126           x0r1 = x0r1 + (x1r1);
1127           x0i1 = x0i1 + (x1i1);
1128           x1r1 = x0r1 - (x1r1 * 2);
1129           x1i1 = x0i1 - (x1i1 * 2);
1130           x2r1 = x2r1 - (x3i1);
1131           x2i1 = x2i1 + (x3r1);
1132           x3i1 = x2r1 + (x3i1 * 2);
1133           x3r1 = x2i1 - (x3r1 * 2);
1134 
1135           *data = x0r1;
1136           *(data + 1) = x0i1;
1137           data += ((SIZE_T)del << 1);
1138 
1139           *data = x2r1;
1140           *(data + 1) = x2i1;
1141           data += ((SIZE_T)del << 1);
1142 
1143           *data = x1r1;
1144           *(data + 1) = x1i1;
1145           data += ((SIZE_T)del << 1);
1146 
1147           *data = x3i1;
1148           *(data + 1) = x3r1;
1149           data += ((SIZE_T)del << 1);
1150         }
1151         data -= 2 * npoints;
1152         data += 2;
1153       }
1154       for (; j <= sec_loop_cnt * 2; j += nodespacing) {
1155         w_1 = *(ptr_twiddle + j);
1156         w_4 = *(ptr_twiddle + j + 257);
1157         w_2 = *(ptr_twiddle + ((SIZE_T)j << 1) - 256);
1158         w_5 = *(ptr_twiddle + ((SIZE_T)j << 1) + 1);
1159         w_3 = *(ptr_twiddle + j + ((SIZE_T)j << 1) - 256);
1160         w_6 = *(ptr_twiddle + j + ((SIZE_T)j << 1) + 1);
1161 
1162         for (k = in_loop_cnt; k != 0; k--) {
1163           FLOAT32 tmp;
1164           FLOAT32 x0r1, x0i1, x1r1, x1i1, x2r1, x2i1, x3r1, x3i1;
1165           /*x0 is loaded later to avoid register crunch*/
1166 
1167           data += ((SIZE_T)del << 1);
1168 
1169           x1r1 = *data;
1170           x1i1 = *(data + 1);
1171           data += ((SIZE_T)del << 1);
1172 
1173           x2r1 = *data;
1174           x2i1 = *(data + 1);
1175           data += ((SIZE_T)del << 1);
1176 
1177           x3r1 = *data;
1178           x3i1 = *(data + 1);
1179           data -= 3 * (del << 1);
1180 
1181           tmp = (FLOAT32)(((FLOAT64)x1r1 * w_1) + ((FLOAT64)x1i1 * w_4));
1182           x1i1 = (FLOAT32)(-((FLOAT64)x1r1 * w_4) + (FLOAT64)x1i1 * w_1);
1183           x1r1 = tmp;
1184 
1185           tmp = (FLOAT32)(((FLOAT64)x2r1 * w_5) - ((FLOAT64)x2i1 * w_2));
1186           x2i1 = (FLOAT32)(((FLOAT64)x2r1 * w_2) + ((FLOAT64)x2i1 * w_5));
1187           x2r1 = tmp;
1188 
1189           tmp = (FLOAT32)(((FLOAT64)x3r1 * w_6) - ((FLOAT64)x3i1 * w_3));
1190           x3i1 = (FLOAT32)(((FLOAT64)x3r1 * w_3) + ((FLOAT64)x3i1 * w_6));
1191           x3r1 = tmp;
1192 
1193           x0r1 = (*data);
1194           x0i1 = (*(data + 1));
1195 
1196           x0r1 = x0r1 + (x2r1);
1197           x0i1 = x0i1 + (x2i1);
1198           x2r1 = x0r1 - (x2r1 * 2);
1199           x2i1 = x0i1 - (x2i1 * 2);
1200           x1r1 = x1r1 + x3r1;
1201           x1i1 = x1i1 + x3i1;
1202           x3r1 = x1r1 - (x3r1 * 2);
1203           x3i1 = x1i1 - (x3i1 * 2);
1204 
1205           x0r1 = x0r1 + (x1r1);
1206           x0i1 = x0i1 + (x1i1);
1207           x1r1 = x0r1 - (x1r1 * 2);
1208           x1i1 = x0i1 - (x1i1 * 2);
1209           x2r1 = x2r1 - (x3i1);
1210           x2i1 = x2i1 + (x3r1);
1211           x3i1 = x2r1 + (x3i1 * 2);
1212           x3r1 = x2i1 - (x3r1 * 2);
1213 
1214           *data = x0r1;
1215           *(data + 1) = x0i1;
1216           data += ((SIZE_T)del << 1);
1217 
1218           *data = x2r1;
1219           *(data + 1) = x2i1;
1220           data += ((SIZE_T)del << 1);
1221 
1222           *data = x1r1;
1223           *(data + 1) = x1i1;
1224           data += ((SIZE_T)del << 1);
1225 
1226           *data = x3i1;
1227           *(data + 1) = x3r1;
1228           data += ((SIZE_T)del << 1);
1229         }
1230         data -= 2 * npoints;
1231         data += 2;
1232       }
1233       for (; j < nodespacing * del; j += nodespacing) {
1234         w_1 = *(ptr_twiddle + j);
1235         w_4 = *(ptr_twiddle + j + 257);
1236         w_2 = *(ptr_twiddle + ((SIZE_T)j << 1) - 256);
1237         w_5 = *(ptr_twiddle + ((SIZE_T)j << 1) + 1);
1238         w_3 = *(ptr_twiddle + j + ((SIZE_T)j << 1) - 512);
1239         w_6 = *(ptr_twiddle + j + ((SIZE_T)j << 1) - 512 + 257);
1240 
1241         for (k = in_loop_cnt; k != 0; k--) {
1242           FLOAT32 tmp;
1243           FLOAT32 x0r1, x0i1, x1r1, x1i1, x2r1, x2i1, x3r1, x3i1;
1244           /*x0 is loaded later to avoid register crunch*/
1245 
1246           data += ((SIZE_T)del << 1);
1247 
1248           x1r1 = *data;
1249           x1i1 = *(data + 1);
1250           data += ((SIZE_T)del << 1);
1251 
1252           x2r1 = *data;
1253           x2i1 = *(data + 1);
1254           data += ((SIZE_T)del << 1);
1255 
1256           x3r1 = *data;
1257           x3i1 = *(data + 1);
1258           data -= 3 * (del << 1);
1259 
1260           tmp = (FLOAT32)(((FLOAT64)x1r1 * w_1) + ((FLOAT64)x1i1 * w_4));
1261           x1i1 = (FLOAT32)(-((FLOAT64)x1r1 * w_4) + (FLOAT64)x1i1 * w_1);
1262           x1r1 = tmp;
1263 
1264           tmp = (FLOAT32)(((FLOAT64)x2r1 * w_5) - ((FLOAT64)x2i1 * w_2));
1265           x2i1 = (FLOAT32)(((FLOAT64)x2r1 * w_2) + ((FLOAT64)x2i1 * w_5));
1266           x2r1 = tmp;
1267 
1268           tmp = (FLOAT32)(-((FLOAT64)x3r1 * w_3) - ((FLOAT64)x3i1 * w_6));
1269           x3i1 = (FLOAT32)(-((FLOAT64)x3r1 * w_6) + (FLOAT64)x3i1 * w_3);
1270           x3r1 = tmp;
1271 
1272           x0r1 = (*data);
1273           x0i1 = (*(data + 1));
1274 
1275           x0r1 = x0r1 + (x2r1);
1276           x0i1 = x0i1 + (x2i1);
1277           x2r1 = x0r1 - (x2r1 * 2);
1278           x2i1 = x0i1 - (x2i1 * 2);
1279           x1r1 = x1r1 + x3r1;
1280           x1i1 = x1i1 - x3i1;
1281           x3r1 = x1r1 - (x3r1 * 2);
1282           x3i1 = x1i1 + (x3i1 * 2);
1283 
1284           x0r1 = x0r1 + (x1r1);
1285           x0i1 = x0i1 + (x1i1);
1286           x1r1 = x0r1 - (x1r1 * 2);
1287           x1i1 = x0i1 - (x1i1 * 2);
1288           x2r1 = x2r1 - (x3i1);
1289           x2i1 = x2i1 + (x3r1);
1290           x3i1 = x2r1 + (x3i1 * 2);
1291           x3r1 = x2i1 - (x3r1 * 2);
1292 
1293           *data = x0r1;
1294           *(data + 1) = x0i1;
1295           data += ((SIZE_T)del << 1);
1296 
1297           *data = x2r1;
1298           *(data + 1) = x2i1;
1299           data += ((SIZE_T)del << 1);
1300 
1301           *data = x1r1;
1302           *(data + 1) = x1i1;
1303           data += ((SIZE_T)del << 1);
1304 
1305           *data = x3i1;
1306           *(data + 1) = x3r1;
1307           data += ((SIZE_T)del << 1);
1308         }
1309         data -= 2 * npoints;
1310         data += 2;
1311       }
1312       nodespacing >>= 2;
1313       del <<= 2;
1314       in_loop_cnt >>= 2;
1315     }
1316 
1317     if (not_power_4) {
1318       const FLOAT64 *ptr_twiddle = ptr_w;
1319       nodespacing <<= 1;
1320 
1321       for (j = del / 2; j != 0; j--) {
1322         FLOAT64 w_1 = *ptr_twiddle;
1323         FLOAT64 w_4 = *(ptr_twiddle + 257);
1324         FLOAT32 tmp;
1325         ptr_twiddle += nodespacing;
1326 
1327         x0r = *ptr_y;
1328         x0i = *(ptr_y + 1);
1329         ptr_y += ((SIZE_T)del << 1);
1330 
1331         x1r = *ptr_y;
1332         x1i = *(ptr_y + 1);
1333 
1334         tmp = (FLOAT32)(((FLOAT64)x1r * w_1) + ((FLOAT64)x1i * w_4));
1335         x1i = (FLOAT32)(-((FLOAT64)x1r * w_4) + (FLOAT64)x1i * w_1);
1336         x1r = tmp;
1337 
1338         *ptr_y = (x0r) - (x1r);
1339         *(ptr_y + 1) = (x0i) - (x1i);
1340         ptr_y -= ((SIZE_T)del << 1);
1341 
1342         *ptr_y = (x0r) + (x1r);
1343         *(ptr_y + 1) = (x0i) + (x1i);
1344         ptr_y += 2;
1345       }
1346       ptr_twiddle = ptr_w;
1347       for (j = del / 2; j != 0; j--) {
1348         FLOAT64 w_1 = *ptr_twiddle;
1349         FLOAT64 w_4 = *(ptr_twiddle + 257);
1350         FLOAT32 tmp;
1351         ptr_twiddle += nodespacing;
1352 
1353         x0r = *ptr_y;
1354         x0i = *(ptr_y + 1);
1355         ptr_y += ((SIZE_T)del << 1);
1356 
1357         x1r = *ptr_y;
1358         x1i = *(ptr_y + 1);
1359 
1360         tmp = (FLOAT32)(((FLOAT64)x1r * w_4) - ((FLOAT64)x1i * w_1));
1361         x1i = (FLOAT32)(((FLOAT64)x1r * w_1) + ((FLOAT64)x1i * w_4));
1362         x1r = tmp;
1363 
1364         *ptr_y = (x0r) - (x1r);
1365         *(ptr_y + 1) = (x0i) - (x1i);
1366         ptr_y -= ((SIZE_T)del << 1);
1367 
1368         *ptr_y = (x0r) + (x1r);
1369         *(ptr_y + 1) = (x0i) + (x1i);
1370         ptr_y += 2;
1371       }
1372     }
1373   }
1374 
1375   for (i = 0; i < n_pass; i++) {
1376     re[2 * i + 0] = y[2 * i + 0];
1377     re[2 * i + 1] = y[2 * i + 1];
1378   }
1379 }
1380 
/* Complex FFT for lengths containing a factor of 3 (e.g. 96 = 32 * 3).
 * The transform is decomposed as mpass x 3: the residual (typically
 * power-of-two) part is handled by ixheaace_hbe_apply_cfftn(), the factor
 * of 3 by ixheaace_hbe_apply_fft_3(), with a twiddle multiply in between.
 *
 * in          - interleaved re/im pairs; transformed in place.
 * ptr_scratch - caller scratch; this function carves 8 * n_pass floats
 *               out of it (ptr_x, y, re_3) and passes the remainder on
 *               to the inner FFT kernels.
 * n_pass      - transform length (number of complex points).
 * i_sign      - i_sign < 0 applies the forward twiddle, otherwise the
 *               conjugate (inverse) twiddle.
 */
VOID ixheaace_hbe_apply_cfftn_gen(FLOAT32 in[], FLOAT32 *ptr_scratch, WORD32 n_pass,
                                  WORD32 i_sign) {
  WORD32 i, j;
  WORD32 m_points = n_pass;
  FLOAT32 *y, *re_3;
  FLOAT32 *ptr_x, *ptr_y;
  /* Carve three work areas out of the scratch buffer:
   * ptr_x (2*n), y (4*n), re_3 (2*n). */
  ptr_x = ptr_scratch;
  ptr_scratch += 2 * m_points;
  ptr_y = y = ptr_scratch;
  ptr_scratch += 4 * m_points;
  re_3 = ptr_scratch;
  ptr_scratch += 2 * m_points;
  WORD32 cnfac;
  WORD32 cnfac;
  WORD32 mpass = n_pass;

  /* Strip out every factor of 3: n_pass == mpass * 3^cnfac. */
  cnfac = 0;
  while (mpass % 3 == 0) {
    mpass /= 3;
    cnfac++;
  }

  /* Stage 1: mpass-point FFT on each of the 3*cnfac interleaved columns.
   * NOTE(review): the stride 6 and the later radix-3 recombination only
   * line up when cnfac == 1 (single factor of 3), as in the 96-point
   * use — TODO confirm for other call sites. */
  for (i = 0; i < 3 * cnfac; i++) {
    for (j = 0; j < mpass; j++) {
      re_3[2 * j + 0] = in[6 * j + 2 * i + 0];
      re_3[2 * j + 1] = in[6 * j + 2 * i + 1];
    }

    ixheaace_hbe_apply_cfftn(re_3, ptr_scratch, mpass, i_sign);

    for (j = 0; j < mpass; j++) {
      in[6 * j + 2 * i + 0] = re_3[2 * j + 0];
      in[6 * j + 2 * i + 1] = re_3[2 * j + 1];
    }
  }

  /* Stage 2: multiply the intermediate spectrum by the inter-stage
   * twiddle factors, unrolled by 3 complex samples per iteration. */
  {
    FLOAT64 *ptr_w1r, *ptr_w1i;
    FLOAT32 tmp;
    /* Casts drop the tables' qualifiers; the data is only read here.
     * NOTE(review): the stride below implies the tables are laid out for
     * a maximum mpass of 128 — TODO confirm against the ROM tables. */
    ptr_w1r = (FLOAT64 *)ixheaac_twid_tbl_fft_ntwt3r;
    ptr_w1i = (FLOAT64 *)ixheaac_twid_tbl_fft_ntwt3i;

    if (i_sign < 0) {
      /* Forward twiddle: (re + j*im) * (wr + j*wi). */
      i = 0;
      while (i < n_pass) {
        tmp =
            (FLOAT32)((FLOAT64)in[2 * i + 0] * (*ptr_w1r) - (FLOAT64)in[2 * i + 1] * (*ptr_w1i));
        in[2 * i + 1] =
            (FLOAT32)((FLOAT64)in[2 * i + 0] * (*ptr_w1i) + (FLOAT64)in[2 * i + 1] * (*ptr_w1r));
        in[2 * i + 0] = tmp;

        ptr_w1r++;
        ptr_w1i++;

        tmp =
            (FLOAT32)((FLOAT64)in[2 * i + 2] * (*ptr_w1r) - (FLOAT64)in[2 * i + 3] * (*ptr_w1i));
        in[2 * i + 3] =
            (FLOAT32)((FLOAT64)in[2 * i + 2] * (*ptr_w1i) + (FLOAT64)in[2 * i + 3] * (*ptr_w1r));
        in[2 * i + 2] = tmp;

        ptr_w1r++;
        ptr_w1i++;

        tmp =
            (FLOAT32)((FLOAT64)in[2 * i + 4] * (*ptr_w1r) - (FLOAT64)in[2 * i + 5] * (*ptr_w1i));
        in[2 * i + 5] =
            (FLOAT32)((FLOAT64)in[2 * i + 4] * (*ptr_w1i) + (FLOAT64)in[2 * i + 5] * (*ptr_w1r));
        in[2 * i + 4] = tmp;

        /* Skip ahead to the twiddle entries of the next group of 3. */
        ptr_w1r += 3 * (128 / mpass - 1) + 1;
        ptr_w1i += 3 * (128 / mpass - 1) + 1;
        i += 3;
      }
    }

    else {
      /* Inverse twiddle: (re + j*im) * (wr - j*wi) (conjugate). */
      i = 0;
      while (i < n_pass) {
        tmp =
            (FLOAT32)((FLOAT64)in[2 * i + 0] * (*ptr_w1r) + (FLOAT64)in[2 * i + 1] * (*ptr_w1i));
        in[2 * i + 1] =
            (FLOAT32)(-(FLOAT64)in[2 * i + 0] * (*ptr_w1i) + (FLOAT64)in[2 * i + 1] * (*ptr_w1r));
        in[2 * i + 0] = tmp;

        ptr_w1r++;
        ptr_w1i++;

        tmp =
            (FLOAT32)((FLOAT64)in[2 * i + 2] * (*ptr_w1r) + (FLOAT64)in[2 * i + 3] * (*ptr_w1i));
        in[2 * i + 3] =
            (FLOAT32)(-(FLOAT64)in[2 * i + 2] * (*ptr_w1i) + (FLOAT64)in[2 * i + 3] * (*ptr_w1r));
        in[2 * i + 2] = tmp;

        ptr_w1r++;
        ptr_w1i++;

        tmp =
            (FLOAT32)((FLOAT64)in[2 * i + 4] * (*ptr_w1r) + (FLOAT64)in[2 * i + 5] * (*ptr_w1i));
        in[2 * i + 5] =
            (FLOAT32)(-(FLOAT64)in[2 * i + 4] * (*ptr_w1i) + (FLOAT64)in[2 * i + 5] * (*ptr_w1r));
        in[2 * i + 4] = tmp;

        ptr_w1r += 3 * (128 / mpass - 1) + 1;
        ptr_w1i += 3 * (128 / mpass - 1) + 1;
        i += 3;
      }
    }
  }

  /* Stage 3: radix-3 butterflies on consecutive triples. */
  for (i = 0; i < n_pass; i++) {
    ptr_x[2 * i + 0] = in[2 * i + 0];
    ptr_x[2 * i + 1] = in[2 * i + 1];
  }
  for (i = 0; i < mpass; i++) {
    ixheaace_hbe_apply_fft_3(ptr_x, ptr_y, i_sign);

    ptr_x = ptr_x + 6;
    ptr_y = ptr_y + 6;
  }

  /* Stage 4: gather the three butterfly outputs into the three output
   * thirds of 'in' (natural order). */
  for (i = 0; i < mpass; i++) {
    in[2 * i + 0] = y[6 * i + 0];
    in[2 * i + 1] = y[6 * i + 1];
  }

  for (i = 0; i < mpass; i++) {
    in[2 * mpass + 2 * i + 0] = y[6 * i + 2];
    in[2 * mpass + 2 * i + 1] = y[6 * i + 3];
  }

  for (i = 0; i < mpass; i++) {
    in[4 * mpass + 2 * i + 0] = y[6 * i + 4];
    in[4 * mpass + 2 * i + 1] = y[6 * i + 5];
  }
}
1515 
ixheaace_hbe_apply_fft_288(FLOAT32 * ptr_inp,FLOAT32 * ptr_scratch,WORD32 len,WORD32 i_sign)1516 VOID ixheaace_hbe_apply_fft_288(FLOAT32 *ptr_inp, FLOAT32 *ptr_scratch, WORD32 len,
1517                                 WORD32 i_sign) {
1518   /* Dividing the 288-point FFT into 96x3 i.e nx3*/
1519   FLOAT32 *ptr_op = ptr_scratch;
1520   WORD32 mpoints = len / 96;
1521   WORD32 fpoints = len / 3;
1522   WORD32 ii, jj;
1523   ptr_scratch += 2 * len;
1524 
1525   for (ii = 0; ii < mpoints; ii++) {
1526     for (jj = 0; jj < fpoints; jj++) {
1527       ptr_op[2 * jj + 0] = ptr_inp[2 * mpoints * jj + 2 * ii];
1528       ptr_op[2 * jj + 1] = ptr_inp[2 * mpoints * jj + 2 * ii + 1];
1529     }
1530 
1531     /* 96-point (32x3-point) of FFT */
1532     if (fpoints & (fpoints - 1))
1533       ixheaace_hbe_apply_cfftn_gen(ptr_op, ptr_scratch, fpoints, i_sign);
1534     else
1535       ixheaace_hbe_apply_cfftn(ptr_op, ptr_scratch, fpoints, i_sign);
1536 
1537     for (jj = 0; jj < fpoints; jj++) {
1538       ptr_inp[mpoints * 2 * jj + 2 * ii + 0] = ptr_op[2 * jj + 0];
1539       ptr_inp[mpoints * 2 * jj + 2 * ii + 1] = ptr_op[2 * jj + 1];
1540     }
1541   }
1542 
1543   /* Multiplication FFT with twiddle table */
1544   ixheaace_hbe_apply_tw_mult_fft(ptr_inp, ptr_op, fpoints, mpoints, ixheaac_twid_tbl_fft_288);
1545 
1546   for (ii = 0; ii < fpoints; ii++) {
1547     /* 3-point of FFT */
1548     ixheaace_hbe_apply_fft_3(ptr_op, ptr_scratch, i_sign);
1549     ptr_op = ptr_op + (mpoints * 2);
1550     ptr_scratch = ptr_scratch + (mpoints * 2);
1551   }
1552 
1553   ptr_scratch -= fpoints * mpoints * 2;
1554 
1555   for (jj = 0; jj < fpoints; jj++) {
1556     ptr_inp[2 * jj + 0] = ptr_scratch[6 * jj];
1557     ptr_inp[2 * jj + 1] = ptr_scratch[6 * jj + 1];
1558   }
1559   for (jj = 0; jj < fpoints; jj++) {
1560     ptr_inp[2 * fpoints + 2 * jj + 0] = ptr_scratch[6 * jj + 2];
1561     ptr_inp[2 * fpoints + 2 * jj + 1] = ptr_scratch[6 * jj + 3];
1562   }
1563   for (jj = 0; jj < fpoints; jj++) {
1564     ptr_inp[4 * fpoints + 2 * jj + 0] = ptr_scratch[6 * jj + 4];
1565     ptr_inp[4 * fpoints + 2 * jj + 1] = ptr_scratch[6 * jj + 5];
1566   }
1567 }
1568 
ixheaace_hbe_apply_ifft_224(FLOAT32 * ptr_inp,FLOAT32 * ptr_scratch,WORD32 len,WORD32 i_sign)1569 VOID ixheaace_hbe_apply_ifft_224(FLOAT32 *ptr_inp, FLOAT32 *ptr_scratch, WORD32 len,
1570                                  WORD32 i_sign) {
1571   /* Dividing 224-point IFFT into 32x7 */
1572   WORD32 mpoints = len / 32;
1573   WORD32 fpoints = len / 7;
1574   WORD32 ii, jj;
1575   FLOAT32 *ptr_op = ptr_scratch;
1576   ptr_scratch += 2 * len;
1577 
1578   for (ii = 0; ii < mpoints; ii++) {
1579     for (jj = 0; jj < fpoints; jj++) {
1580       ptr_op[2 * jj + 0] = ptr_inp[2 * mpoints * jj + 2 * ii];
1581       ptr_op[2 * jj + 1] = ptr_inp[2 * mpoints * jj + 2 * ii + 1];
1582     }
1583 
1584     /* 32-point of IFFT*/
1585     if (fpoints & (fpoints - 1))
1586       ixheaace_hbe_apply_cfftn_gen(ptr_op, ptr_scratch, fpoints, i_sign);
1587     else
1588       ixheaace_hbe_apply_cfftn(ptr_op, ptr_scratch, fpoints, i_sign);
1589 
1590     for (jj = 0; jj < fpoints; jj++) {
1591       ptr_inp[mpoints * 2 * jj + 2 * ii + 0] = ptr_op[2 * jj + 0];
1592       ptr_inp[mpoints * 2 * jj + 2 * ii + 1] = ptr_op[2 * jj + 1];
1593     }
1594   }
1595 
1596   /* Multiplication IFFT with twiddle table */
1597   ixheaace_hbe_apply_tw_mult_ifft(ptr_inp, ptr_op, fpoints, mpoints, ixheaac_twid_tbl_fft_224);
1598 
1599   for (ii = 0; ii < fpoints; ii++) {
1600     /* 7-point of IFFT */
1601     ixheaace_hbe_apply_ifft_7(ptr_op, ptr_scratch);
1602     ptr_scratch += (mpoints * 2);
1603     ptr_op += (mpoints * 2);
1604   }
1605 
1606   ptr_scratch -= fpoints * mpoints * 2;
1607 
1608   for (jj = 0; jj < fpoints; jj++) {
1609     for (ii = 0; ii < mpoints; ii++) {
1610       ptr_inp[fpoints * ii * 2 + 2 * jj + 0] = ptr_scratch[mpoints * jj * 2 + 2 * ii + 0];
1611       ptr_inp[fpoints * ii * 2 + 2 * jj + 1] = ptr_scratch[mpoints * jj * 2 + 2 * ii + 1];
1612     }
1613   }
1614 }
1615 
ixheaace_hbe_apply_ifft_336(FLOAT32 * ptr_inp,FLOAT32 * ptr_scratch,WORD32 len,WORD32 i_sign)1616 VOID ixheaace_hbe_apply_ifft_336(FLOAT32 *ptr_inp, FLOAT32 *ptr_scratch, WORD32 len,
1617                                  WORD32 i_sign) {
1618   WORD32 i, j;
1619   WORD32 m_points = len / 7;
1620   WORD32 n_points = len / 48;
1621   FLOAT32 *ptr_real, *ptr_imag, *ptr_real_1, *ptr_scratch_local;
1622   ptr_real = ptr_scratch;
1623   ptr_scratch += 2 * len;
1624   ptr_imag = ptr_scratch;
1625   ptr_scratch += len;
1626   ptr_scratch_local = ptr_scratch;
1627   ptr_scratch += len;
1628   ptr_real_1 = ptr_scratch;
1629   ptr_scratch += len;
1630 
1631   for (i = 0; i < len; i++) {
1632     ptr_real[i] = ptr_inp[2 * i + 0];
1633     ptr_imag[i] = ptr_inp[2 * i + 1];
1634   }
1635 
1636   for (i = 0; i < m_points; i++) {
1637     for (j = 0; j < n_points; j++) {
1638       ptr_real_1[2 * j + 0] = ptr_inp[m_points * 2 * j + 2 * i + 0];
1639       ptr_real_1[2 * j + 1] = ptr_inp[m_points * 2 * j + 2 * i + 1];
1640     }
1641 
1642     ixheaace_hbe_apply_ifft_7(ptr_real_1, ptr_scratch);
1643 
1644     for (j = 0; j < n_points; j++) {
1645       ptr_inp[m_points * 2 * j + 2 * i + 0] = ptr_scratch[2 * j + 0];
1646       ptr_inp[m_points * 2 * j + 2 * i + 1] = ptr_scratch[2 * j + 1];
1647     }
1648   }
1649 
1650   switch (m_points) {
1651     case 48:
1652       ixheaace_hbe_apply_tw_mult_ifft(ptr_inp, ptr_scratch_local, n_points, m_points,
1653                                       ixheaac_twid_tbl_fft_336);
1654       break;
1655 
1656     default:
1657       ixheaace_hbe_apply_tw_mult_ifft(ptr_inp, ptr_scratch_local, n_points, m_points,
1658                                       ixheaac_twid_tbl_fft_168);
1659       break;
1660   }
1661   for (i = 0; i < len; i++) {
1662     ptr_real[2 * i + 0] = ptr_scratch_local[2 * i + 0];
1663     ptr_real[2 * i + 1] = ptr_scratch_local[2 * i + 1];
1664   }
1665 
1666   for (i = 0; i < n_points; i++) {
1667     ixheaace_hbe_apply_cfftn_gen(ptr_real, ptr_scratch, m_points, i_sign);
1668     ptr_real += (2 * m_points);
1669   }
1670 
1671   ptr_real -= n_points * 2 * m_points;
1672 
1673   for (j = 0; j < n_points; j++) {
1674     for (i = 0; i < m_points; i++) {
1675       ptr_inp[n_points * 2 * i + 2 * j + 0] = ptr_real[2 * m_points * j + 2 * i + 0];
1676       ptr_inp[n_points * 2 * i + 2 * j + 1] = ptr_real[2 * m_points * j + 2 * i + 1];
1677     }
1678   }
1679 }
1680