xref: /aosp_15_r20/external/libxaac/encoder/iusace_fft.c (revision 15dc779a375ca8b5125643b829a8aa4b70d7f451)
1 /******************************************************************************
2  *                                                                            *
3  * Copyright (C) 2023 The Android Open Source Project
4  *
5  * Licensed under the Apache License, Version 2.0 (the "License");
6  * you may not use this file except in compliance with the License.
7  * You may obtain a copy of the License at:
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  *
17  *****************************************************************************
18  * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19  */
20 
21 #include <string.h>
22 #include "ixheaac_type_def.h"
23 #include "ixheaace_adjust_threshold_data.h"
24 #include "iusace_cnst.h"
25 #include "iusace_block_switch_const.h"
26 #include "iusace_rom.h"
27 #include "iusace_bitbuffer.h"
28 
29 /* DRC */
30 #include "impd_drc_common_enc.h"
31 #include "impd_drc_uni_drc.h"
32 #include "impd_drc_tables.h"
33 #include "impd_drc_api.h"
34 #include "impd_drc_uni_drc_eq.h"
35 #include "impd_drc_uni_drc_filter_bank.h"
36 #include "impd_drc_gain_enc.h"
37 #include "impd_drc_struct_def.h"
38 
39 #include "iusace_tns_usac.h"
40 #include "iusace_psy_mod.h"
41 #include "iusace_config.h"
42 #include "iusace_fft.h"
43 #include "iusace_basic_ops_flt.h"
44 #include "ixheaac_constants.h"
45 #include "ixheaace_aac_constants.h"
46 #include "ixheaac_basic_ops32.h"
47 #include "ixheaace_common_utils.h"
48 #include "ixheaac_error_standards.h"
49 #include "ixheaace_error_codes.h"
50 
/* Digit-reverse helper used to reorder FFT butterfly input.
 * Reverses the order of 2-bit digits of (i) within a 16-bit field by three
 * successive group swaps (2-bit pairs, nibbles, bytes), then shifts right by
 * (m) so that (j) receives the digit-reversed index scaled to the transform
 * length. Wrapped in do/while(0) so it expands as a single statement. */
#define DIG_REV(i, m, j)                                    \
  do {                                                      \
    unsigned _ = (i);                                       \
    _ = ((_ & 0x33333333) << 2) | ((_ & ~0x33333333) >> 2); \
    _ = ((_ & 0x0F0F0F0F) << 4) | ((_ & ~0x0F0F0F0F) >> 4); \
    _ = ((_ & 0x00FF00FF) << 8) | ((_ & ~0x00FF00FF) >> 8); \
    (j) = _ >> (m);                                         \
  } while (0)
59 
iusace_calc_norm(WORD32 a)60 static PLATFORM_INLINE WORD8 iusace_calc_norm(WORD32 a) {
61   WORD8 norm_val;
62 
63   if (a == 0) {
64     norm_val = 31;
65   } else {
66     if (a == (WORD32)0xffffffffL) {
67       norm_val = 31;
68     } else {
69       if (a < 0) {
70         a = ~a;
71       }
72       for (norm_val = 0; a < (WORD32)0x40000000L; norm_val++) {
73         a <<= 1;
74       }
75     }
76   }
77 
78   return norm_val;
79 }
80 
iusace_complex_3point_fft(FLOAT32 * ptr_in,FLOAT32 * ptr_out)81 static PLATFORM_INLINE VOID iusace_complex_3point_fft(FLOAT32 *ptr_in, FLOAT32 *ptr_out) {
82   FLOAT32 add_r, sub_r;
83   FLOAT32 add_i, sub_i;
84   FLOAT32 x01r, x01i, temp;
85   FLOAT32 p1, p2, p3, p4;
86   FLOAT64 sinmu;
87 
88   sinmu = 0.866025403784439;
89 
90   x01r = ptr_in[0] + ptr_in[2];
91   x01i = ptr_in[1] + ptr_in[3];
92 
93   add_r = ptr_in[2] + ptr_in[4];
94   add_i = ptr_in[3] + ptr_in[5];
95 
96   sub_r = ptr_in[2] - ptr_in[4];
97   sub_i = ptr_in[3] - ptr_in[5];
98 
99   p1 = add_r / (FLOAT32)2.0;
100   p4 = add_i / (FLOAT32)2.0;
101   p2 = (FLOAT32)((FLOAT64)sub_i * sinmu);
102   p3 = (FLOAT32)((FLOAT64)sub_r * sinmu);
103 
104   temp = ptr_in[0] - p1;
105 
106   ptr_out[0] = x01r + ptr_in[4];
107   ptr_out[1] = x01i + ptr_in[5];
108   ptr_out[2] = temp + p2;
109   ptr_out[3] = (ptr_in[1] - p3) - p4;
110   ptr_out[4] = temp - p2;
111   ptr_out[5] = (ptr_in[1] + p3) - p4;
112 
113   return;
114 }
115 
/* In-place complex FFT for power-of-two lengths on interleaved re/im data.
 * ptr_x            : 2*nlength floats, input and final output.
 * nlength          : number of complex points (power of two).
 * scratch_fft_p2_y : 2*nlength floats of work space; intermediate stages
 *                    are written here and copied back at the end.
 * Implemented as radix-4 decimation-in-time stages, with one extra radix-2
 * stage when nlength is a power of two but not a power of four. */
VOID iusace_complex_fft_p2(FLOAT32 *ptr_x, WORD32 nlength, FLOAT32 *scratch_fft_p2_y) {
  WORD32 i, j, k, n_stages, h2;
  FLOAT32 x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
  FLOAT32 tmp;
  WORD32 del, nodespacing, in_loop_cnt;
  WORD32 not_power_4;
  WORD32 dig_rev_shift;
  FLOAT32 *y = scratch_fft_p2_y;
  WORD32 mpass = nlength;
  WORD32 npoints = nlength;
  FLOAT32 *ptr_y = y;
  const FLOAT64 *ptr_w;

  /* log2(nlength) = 30 - norm; an odd bit count means "not a power of 4". */
  dig_rev_shift = iusace_calc_norm(mpass) + 1 - 16;
  n_stages = 30 - iusace_calc_norm(mpass);
  not_power_4 = n_stages & 1;

  n_stages = n_stages >> 1; /* number of radix-4 stages */

  ptr_w = iusace_twiddle_table_fft_32x32;

  if (dig_rev_shift < 0) {
    dig_rev_shift = 0;
  }

  /* First radix-4 stage: gather four digit-reversed inputs, run one
   * butterfly, and store the results contiguously into the scratch buffer. */
  for (i = 0; i < npoints; i += 4) {
    FLOAT32 *inp = ptr_x;
    FLOAT32 tmk;

    DIG_REV(i, dig_rev_shift, h2);
    if (not_power_4) {
      /* round the reversed index up to even for the mixed radix-2 case */
      h2 += 1;
      h2 &= ~1;
    }
    inp += (h2);

    x0r = *inp;
    x0i = *(inp + 1);
    inp += (npoints >> 1);

    x1r = *inp;
    x1i = *(inp + 1);
    inp += (npoints >> 1);

    x2r = *inp;
    x2i = *(inp + 1);
    inp += (npoints >> 1);

    x3r = *inp;
    x3i = *(inp + 1);

    /* radix-4 butterfly; "tmk - x" sequences compute (old_sum - 2*x)
     * without a separate copy of the pre-sum value */
    x0r = x0r + x2r;
    x0i = x0i + x2i;

    tmk = x0r - x2r;
    x2r = tmk - x2r;
    tmk = x0i - x2i;
    x2i = tmk - x2i;

    x1r = x1r + x3r;
    x1i = x1i + x3i;

    tmk = x1r - x3r;
    x3r = tmk - x3r;
    tmk = x1i - x3i;
    x3i = tmk - x3i;

    x0r = x0r + x1r;
    x0i = x0i + x1i;

    tmk = x0r - x1r;
    x1r = tmk - x1r;
    tmk = x0i - x1i;
    x1i = tmk - x1i;

    x2r = x2r + x3i;
    x2i = x2i - x3r;

    tmk = x2r - x3i;
    x3i = tmk - x3i;
    tmk = x2i + x3r;
    x3r = tmk + x3r;

    *ptr_y++ = x0r;
    *ptr_y++ = x0i;
    *ptr_y++ = x2r;
    *ptr_y++ = x2i;
    *ptr_y++ = x1r;
    *ptr_y++ = x1i;
    *ptr_y++ = x3i;
    *ptr_y++ = x3r;
  }
  ptr_y -= 2 * npoints; /* rewind to the start of the scratch buffer */
  del = 4;              /* butterfly span (complex elements) for next stage */
  nodespacing = 64;     /* twiddle-table stride for the next stage */
  in_loop_cnt = npoints >> 4;
  /* Remaining radix-4 stages, all operating in-place on the scratch buffer.
   * Each stage first handles the twiddle-free j==0 butterflies, then four
   * loops covering successive regions of the twiddle table (the table is
   * stored in octant form, hence the differing index offsets and the
   * sign/order variations in the twiddle multiplies below). */
  for (i = n_stages - 1; i > 0; i--) {
    const FLOAT64 *twiddles = ptr_w;
    FLOAT32 *data = ptr_y;
    FLOAT64 w_1, w_2, w_3, w_4, w_5, w_6;
    WORD32 sec_loop_cnt;

    /* j == 0: no twiddle factors needed */
    for (k = in_loop_cnt; k != 0; k--) {
      x0r = (*data);
      x0i = (*(data + 1));
      data += ((SIZE_T)del << 1);

      x1r = (*data);
      x1i = (*(data + 1));
      data += ((SIZE_T)del << 1);

      x2r = (*data);
      x2i = (*(data + 1));
      data += ((SIZE_T)del << 1);

      x3r = (*data);
      x3i = (*(data + 1));
      data -= 3 * (del << 1);

      x0r = x0r + x2r;
      x0i = x0i + x2i;
      x2r = x0r - (x2r * 2);
      x2i = x0i - (x2i * 2);
      x1r = x1r + x3r;
      x1i = x1i + x3i;
      x3r = x1r - (x3r * 2);
      x3i = x1i - (x3i * 2);

      x0r = x0r + x1r;
      x0i = x0i + x1i;
      x1r = x0r - (x1r * 2);
      x1i = x0i - (x1i * 2);
      x2r = x2r + x3i;
      x2i = x2i - x3r;
      x3i = x2r - (x3i * 2);
      x3r = x2i + (x3r * 2);

      *data = x0r;
      *(data + 1) = x0i;
      data += ((SIZE_T)del << 1);

      *data = x2r;
      *(data + 1) = x2i;
      data += ((SIZE_T)del << 1);

      *data = x1r;
      *(data + 1) = x1i;
      data += ((SIZE_T)del << 1);

      *data = x3i;
      *(data + 1) = x3r;
      data += ((SIZE_T)del << 1);
    }
    data = ptr_y + 2;

    /* Integer series approximating (nodespacing * del) / 3: boundary of the
     * first twiddle-table region (NOTE(review): value is ~0.332 * span;
     * presumably chosen to match the octant layout of the table). */
    sec_loop_cnt = (nodespacing * del);
    sec_loop_cnt = (sec_loop_cnt / 4) + (sec_loop_cnt / 8) - (sec_loop_cnt / 16) +
                   (sec_loop_cnt / 32) - (sec_loop_cnt / 64) + (sec_loop_cnt / 128) -
                   (sec_loop_cnt / 256);

    /* Region 1: w, 2w and 3w all fall in the first table section. */
    for (j = nodespacing; j <= sec_loop_cnt; j += nodespacing) {
      w_1 = *(twiddles + j);
      w_4 = *(twiddles + j + 257);
      w_2 = *(twiddles + ((SIZE_T)j << 1));
      w_5 = *(twiddles + ((SIZE_T)j << 1) + 257);
      w_3 = *(twiddles + j + ((SIZE_T)j << 1));
      w_6 = *(twiddles + j + ((SIZE_T)j << 1) + 257);

      for (k = in_loop_cnt; k != 0; k--) {
        data += ((SIZE_T)del << 1);

        x1r = *data;
        x1i = *(data + 1);
        data += ((SIZE_T)del << 1);

        x2r = *data;
        x2i = *(data + 1);
        data += ((SIZE_T)del << 1);

        x3r = *data;
        x3i = *(data + 1);
        data -= 3 * (del << 1);

        /* complex multiply of each leg by its twiddle factor */
        tmp = (FLOAT32)(ixheaace_dmult((FLOAT64)x1r, w_1) - ixheaace_dmult((FLOAT64)x1i, w_4));
        x1i = (FLOAT32)ixheaace_dmac(ixheaace_dmult((FLOAT64)x1r, w_4), (FLOAT64)x1i, w_1);
        x1r = tmp;

        tmp = (FLOAT32)(ixheaace_dmult((FLOAT64)x2r, w_2) - ixheaace_dmult((FLOAT64)x2i, w_5));
        x2i = (FLOAT32)ixheaace_dmac(ixheaace_dmult((FLOAT64)x2r, w_5), (FLOAT64)x2i, w_2);
        x2r = tmp;

        tmp = (FLOAT32)(ixheaace_dmult((FLOAT64)x3r, w_3) - ixheaace_dmult((FLOAT64)x3i, w_6));
        x3i = (FLOAT32)ixheaace_dmac(ixheaace_dmult((FLOAT64)x3r, w_6), (FLOAT64)x3i, w_3);
        x3r = tmp;

        x0r = (*data);
        x0i = (*(data + 1));

        x0r = x0r + (x2r);
        x0i = x0i + (x2i);
        x2r = x0r - (x2r * 2);
        x2i = x0i - (x2i * 2);
        x1r = x1r + x3r;
        x1i = x1i + x3i;
        x3r = x1r - (x3r * 2);
        x3i = x1i - (x3i * 2);

        x0r = x0r + (x1r);
        x0i = x0i + (x1i);
        x1r = x0r - (x1r * 2);
        x1i = x0i - (x1i * 2);
        x2r = x2r + (x3i);
        x2i = x2i - (x3r);
        x3i = x2r - (x3i * 2);
        x3r = x2i + (x3r * 2);

        *data = x0r;
        *(data + 1) = x0i;
        data += ((SIZE_T)del << 1);

        *data = x2r;
        *(data + 1) = x2i;
        data += ((SIZE_T)del << 1);

        *data = x1r;
        *(data + 1) = x1i;
        data += ((SIZE_T)del << 1);

        *data = x3i;
        *(data + 1) = x3r;
        data += ((SIZE_T)del << 1);
      }
      data -= 2 * npoints;
      data += 2;
    }
    /* Region 2: 3w wraps into the mirrored table section (offsets -256/+1),
     * so the 3w complex multiply changes sign pattern. */
    for (; j <= (nodespacing * del) >> 1; j += nodespacing) {
      w_1 = *(twiddles + j);
      w_4 = *(twiddles + j + 257);
      w_2 = *(twiddles + ((SIZE_T)j << 1));
      w_5 = *(twiddles + ((SIZE_T)j << 1) + 257);
      w_3 = *(twiddles + j + ((SIZE_T)j << 1) - 256);
      w_6 = *(twiddles + j + ((SIZE_T)j << 1) + 1);

      for (k = in_loop_cnt; k != 0; k--) {
        data += ((SIZE_T)del << 1);

        x1r = *data;
        x1i = *(data + 1);
        data += ((SIZE_T)del << 1);

        x2r = *data;
        x2i = *(data + 1);
        data += ((SIZE_T)del << 1);

        x3r = *data;
        x3i = *(data + 1);
        data -= 3 * (del << 1);

        tmp = (FLOAT32)(ixheaace_dmult((FLOAT64)x1r, w_1) - ixheaace_dmult((FLOAT64)x1i, w_4));
        x1i = (FLOAT32)ixheaace_dmac(ixheaace_dmult((FLOAT64)x1r, w_4), (FLOAT64)x1i, w_1);
        x1r = tmp;

        tmp = (FLOAT32)(ixheaace_dmult((FLOAT64)x2r, w_2) - ixheaace_dmult((FLOAT64)x2i, w_5));
        x2i = (FLOAT32)ixheaace_dmac(ixheaace_dmult((FLOAT64)x2r, w_5), (FLOAT64)x2i, w_2);
        x2r = tmp;

        tmp = (FLOAT32)(ixheaace_dmult((FLOAT64)x3r, w_6) + ixheaace_dmult((FLOAT64)x3i, w_3));
        x3i = (FLOAT32)(-ixheaace_dmult((FLOAT64)x3r, w_3) + ixheaace_dmult((FLOAT64)x3i, w_6));
        x3r = tmp;

        x0r = (*data);
        x0i = (*(data + 1));

        x0r = x0r + (x2r);
        x0i = x0i + (x2i);
        x2r = x0r - (x2r * 2);
        x2i = x0i - (x2i * 2);
        x1r = x1r + x3r;
        x1i = x1i + x3i;
        x3r = x1r - (x3r * 2);
        x3i = x1i - (x3i * 2);

        x0r = x0r + (x1r);
        x0i = x0i + (x1i);
        x1r = x0r - (x1r * 2);
        x1i = x0i - (x1i * 2);
        x2r = x2r + (x3i);
        x2i = x2i - (x3r);
        x3i = x2r - (x3i * 2);
        x3r = x2i + (x3r * 2);

        *data = x0r;
        *(data + 1) = x0i;
        data += ((SIZE_T)del << 1);

        *data = x2r;
        *(data + 1) = x2i;
        data += ((SIZE_T)del << 1);

        *data = x1r;
        *(data + 1) = x1i;
        data += ((SIZE_T)del << 1);

        *data = x3i;
        *(data + 1) = x3r;
        data += ((SIZE_T)del << 1);
      }
      data -= 2 * npoints;
      data += 2;
    }
    /* Region 3: both 2w and 3w come from the mirrored section. */
    for (; j <= sec_loop_cnt * 2; j += nodespacing) {
      w_1 = *(twiddles + j);
      w_4 = *(twiddles + j + 257);
      w_2 = *(twiddles + ((SIZE_T)j << 1) - 256);
      w_5 = *(twiddles + ((SIZE_T)j << 1) + 1);
      w_3 = *(twiddles + j + ((SIZE_T)j << 1) - 256);
      w_6 = *(twiddles + j + ((SIZE_T)j << 1) + 1);

      for (k = in_loop_cnt; k != 0; k--) {
        data += ((SIZE_T)del << 1);

        x1r = *data;
        x1i = *(data + 1);
        data += ((SIZE_T)del << 1);

        x2r = *data;
        x2i = *(data + 1);
        data += ((SIZE_T)del << 1);

        x3r = *data;
        x3i = *(data + 1);
        data -= 3 * (del << 1);

        tmp = (FLOAT32)(ixheaace_dmult((FLOAT64)x1r, w_1) - ixheaace_dmult((FLOAT64)x1i, w_4));
        x1i = (FLOAT32)ixheaace_dmac(ixheaace_dmult(x1r, w_4), x1i, w_1);
        x1r = tmp;

        tmp = (FLOAT32)(ixheaace_dmult((FLOAT64)x2r, w_5) + ixheaace_dmult((FLOAT64)x2i, w_2));
        x2i = (FLOAT32)(-ixheaace_dmult(x2r, w_2) + ixheaace_dmult(x2i, w_5));
        x2r = tmp;

        tmp = (FLOAT32)(ixheaace_dmult((FLOAT64)x3r, w_6) + ixheaace_dmult((FLOAT64)x3i, w_3));
        x3i = (FLOAT32)(-ixheaace_dmult((FLOAT64)x3r, w_3) + ixheaace_dmult((FLOAT64)x3i, w_6));
        x3r = tmp;

        x0r = (*data);
        x0i = (*(data + 1));

        x0r = x0r + (x2r);
        x0i = x0i + (x2i);
        x2r = x0r - (x2r * 2);
        x2i = x0i - (x2i * 2);
        x1r = x1r + x3r;
        x1i = x1i + x3i;
        x3r = x1r - (x3r * 2);
        x3i = x1i - (x3i * 2);

        x0r = x0r + (x1r);
        x0i = x0i + (x1i);
        x1r = x0r - (x1r * 2);
        x1i = x0i - (x1i * 2);
        x2r = x2r + (x3i);
        x2i = x2i - (x3r);
        x3i = x2r - (x3i * 2);
        x3r = x2i + (x3r * 2);

        *data = x0r;
        *(data + 1) = x0i;
        data += ((SIZE_T)del << 1);

        *data = x2r;
        *(data + 1) = x2i;
        data += ((SIZE_T)del << 1);

        *data = x1r;
        *(data + 1) = x1i;
        data += ((SIZE_T)del << 1);

        *data = x3i;
        *(data + 1) = x3r;
        data += ((SIZE_T)del << 1);
      }
      data -= 2 * npoints;
      data += 2;
    }
    /* Region 4: 3w wraps a full period (offset -512); the x1i/x3i butterfly
     * signs flip accordingly. */
    for (; j < nodespacing * del; j += nodespacing) {
      w_1 = *(twiddles + j);
      w_4 = *(twiddles + j + 257);
      w_2 = *(twiddles + ((SIZE_T)j << 1) - 256);
      w_5 = *(twiddles + ((SIZE_T)j << 1) + 1);
      w_3 = *(twiddles + j + ((SIZE_T)j << 1) - 512);
      w_6 = *(twiddles + j + ((SIZE_T)j << 1) - 512 + 257);

      for (k = in_loop_cnt; k != 0; k--) {
        data += ((SIZE_T)del << 1);

        x1r = *data;
        x1i = *(data + 1);
        data += ((SIZE_T)del << 1);

        x2r = *data;
        x2i = *(data + 1);
        data += ((SIZE_T)del << 1);

        x3r = *data;
        x3i = *(data + 1);
        data -= 3 * ((SIZE_T)del << 1);

        tmp = (FLOAT32)(ixheaace_dmult((FLOAT64)x1r, w_1) - ixheaace_dmult((FLOAT64)x1i, w_4));
        x1i = (FLOAT32)ixheaace_dmac(ixheaace_dmult((FLOAT64)x1r, w_4), (FLOAT64)x1i, w_1);
        x1r = tmp;

        tmp = (FLOAT32)(ixheaace_dmult((FLOAT64)x2r, w_5) + ixheaace_dmult((FLOAT64)x2i, w_2));
        x2i = (FLOAT32)(-ixheaace_dmult((FLOAT64)x2r, w_2) + ixheaace_dmult((FLOAT64)x2i, w_5));
        x2r = tmp;

        tmp = (FLOAT32)(-ixheaace_dmult((FLOAT64)x3r, w_3) + ixheaace_dmult((FLOAT64)x3i, w_6));
        x3i = (FLOAT32)ixheaace_dmac(ixheaace_dmult((FLOAT64)x3r, w_6), (FLOAT64)x3i, w_3);
        x3r = tmp;

        x0r = (*data);
        x0i = (*(data + 1));

        x0r = x0r + (x2r);
        x0i = x0i + (x2i);
        x2r = x0r - (x2r * 2);
        x2i = x0i - (x2i * 2);
        x1r = x1r + x3r;
        x1i = x1i - x3i;
        x3r = x1r - (x3r * 2);
        x3i = x1i + (x3i * 2);

        x0r = x0r + (x1r);
        x0i = x0i + (x1i);
        x1r = x0r - (x1r * 2);
        x1i = x0i - (x1i * 2);
        x2r = x2r + (x3i);
        x2i = x2i - (x3r);
        x3i = x2r - (x3i * 2);
        x3r = x2i + (x3r * 2);

        *data = x0r;
        *(data + 1) = x0i;
        data += ((SIZE_T)del << 1);

        *data = x2r;
        *(data + 1) = x2i;
        data += ((SIZE_T)del << 1);

        *data = x1r;
        *(data + 1) = x1i;
        data += ((SIZE_T)del << 1);

        *data = x3i;
        *(data + 1) = x3r;
        data += ((SIZE_T)del << 1);
      }
      data -= 2 * npoints;
      data += 2;
    }
    nodespacing >>= 2;
    del <<= 2;
    in_loop_cnt >>= 2;
  }
  /* Final radix-2 stage when nlength is 2 * a power of four: two passes of
   * del/2 twiddled two-point butterflies over the scratch buffer. */
  if (not_power_4) {
    const FLOAT64 *twiddles = ptr_w;
    nodespacing <<= 1;

    for (j = del / 2; j != 0; j--) {
      FLOAT64 w_1 = *twiddles;
      FLOAT64 w_4 = *(twiddles + 257);
      twiddles += nodespacing;

      x0r = *ptr_y;
      x0i = *(ptr_y + 1);
      ptr_y += ((SIZE_T)del << 1);

      x1r = *ptr_y;
      x1i = *(ptr_y + 1);

      tmp = (FLOAT32)(ixheaace_dmult((FLOAT64)x1r, w_1) - ixheaace_dmult((FLOAT64)x1i, w_4));
      x1i = (FLOAT32)ixheaace_dmac(ixheaace_dmult((FLOAT64)x1r, w_4), (FLOAT64)x1i, w_1);
      x1r = tmp;

      *ptr_y = (x0r) - (x1r);
      *(ptr_y + 1) = (x0i) - (x1i);
      ptr_y -= ((SIZE_T)del << 1);

      *ptr_y = (x0r) + (x1r);
      *(ptr_y + 1) = (x0i) + (x1i);
      ptr_y += 2;
    }
    twiddles = ptr_w;
    /* second half: conjugate-style twiddle multiply */
    for (j = del / 2; j != 0; j--) {
      FLOAT64 w_1 = *twiddles;
      FLOAT64 w_4 = *(twiddles + 257);
      twiddles += nodespacing;

      x0r = *ptr_y;
      x0i = *(ptr_y + 1);
      ptr_y += ((SIZE_T)del << 1);

      x1r = *ptr_y;
      x1i = *(ptr_y + 1);

      tmp = (FLOAT32)(ixheaace_dmult((FLOAT64)x1r, w_4) + ixheaace_dmult((FLOAT64)x1i, w_1));
      x1i = (FLOAT32)(-ixheaace_dmult((FLOAT64)x1r, w_1) + ixheaace_dmult((FLOAT64)x1i, w_4));
      x1r = tmp;

      *ptr_y = (x0r) - (x1r);
      *(ptr_y + 1) = (x0i) - (x1i);
      ptr_y -= ((SIZE_T)del << 1);

      *ptr_y = (x0r) + (x1r);
      *(ptr_y + 1) = (x0i) + (x1i);
      ptr_y += 2;
    }
  }

  /* Copy the finished spectrum from scratch back into the caller's buffer. */
  for (i = 0; i < nlength; i++) {
    *(ptr_x + 2 * i) = y[2 * i];
    *(ptr_x + 2 * i + 1) = y[2 * i + 1];
  }
}
640 
/* In-place complex FFT for lengths divisible by 3 (mixed radix 3 x 2^k).
 * data         : 2*nlength interleaved re/im floats, input and output.
 * nlength      : number of complex points (nlength = 3^cnfac * mpass,
 *                mpass a power of two).
 * pstr_scratch : provides the de-interleave buffer, the 3-point output
 *                buffer and the power-of-two FFT scratch.
 * Strategy: run power-of-two FFTs on each of the three interleaved
 * sub-sequences, twiddle, then combine with 3-point DFTs. */
static VOID iusace_complex_fft_p3(FLOAT32 *data, WORD32 nlength,
                                  iusace_scratch_mem *pstr_scratch) {
  WORD32 i, j;
  FLOAT32 *data_3 = pstr_scratch->p_fft_p3_data_3;
  FLOAT32 *y = pstr_scratch->p_fft_p3_y;
  WORD32 cnfac;
  WORD32 mpass = nlength;
  FLOAT32 *ptr_x = data;
  FLOAT32 *ptr_y = y;

  /* Factor out all powers of 3; mpass keeps the power-of-two remainder. */
  cnfac = 0;
  while (mpass % 3 == 0) {
    mpass /= 3;
    cnfac++;
  }

  /* For each of the 3*cnfac interleaved sub-sequences: gather every third
   * complex sample, FFT it, and scatter the result back in place. */
  for (i = 0; i < 3 * cnfac; i++) {
    for (j = 0; j < mpass; j++) {
      data_3[2 * j] = data[3 * (2 * j) + (2 * i)];
      data_3[2 * j + 1] = data[3 * (2 * j) + 1 + (2 * i)];
    }
    iusace_complex_fft_p2(data_3, mpass, pstr_scratch->p_fft_p2_y);

    for (j = 0; j < mpass; j++) {
      data[3 * (2 * j) + (2 * i)] = data_3[2 * j];
      data[3 * (2 * j) + 1 + (2 * i)] = data_3[2 * j + 1];
    }
  }

  /* Twiddle each group of three consecutive complex samples before the
   * radix-3 combine stage. */
  {
    const FLOAT64 *w1r, *w1i;
    FLOAT32 tmp;
    w1r = iusace_twiddle_table_3pr;
    w1i = iusace_twiddle_table_3pi;

    for (i = 0; i < nlength; i += 3) {
      tmp = (FLOAT32)((FLOAT64)data[2 * i] * (*w1r) - (FLOAT64)data[2 * i + 1] * (*w1i));
      data[2 * i + 1] =
          (FLOAT32)((FLOAT64)data[2 * i] * (*w1i) + (FLOAT64)data[2 * i + 1] * (*w1r));
      data[2 * i] = tmp;

      w1r++;
      w1i++;

      tmp = (FLOAT32)((FLOAT64)data[2 * (i + 1)] * (*w1r) -
                      (FLOAT64)data[2 * (i + 1) + 1] * (*w1i));
      data[2 * (i + 1) + 1] = (FLOAT32)((FLOAT64)data[2 * (i + 1)] * (*w1i) +
                                        (FLOAT64)data[2 * (i + 1) + 1] * (*w1r));
      data[2 * (i + 1)] = tmp;

      w1r++;
      w1i++;

      tmp = (FLOAT32)((FLOAT64)data[2 * (i + 2)] * (*w1r) -
                      (FLOAT64)data[2 * (i + 2) + 1] * (*w1i));
      data[2 * (i + 2) + 1] = (FLOAT32)((FLOAT64)data[2 * (i + 2)] * (*w1i) +
                                        (FLOAT64)data[2 * (i + 2) + 1] * (*w1r));
      data[2 * (i + 2)] = tmp;

      /* advance to the next used table row; table appears sized for
       * mpass up to 128 — NOTE(review): confirm against iusace_rom */
      w1r += 3 * (128 / mpass - 1) + 1;
      w1i += 3 * (128 / mpass - 1) + 1;
    }
  }

  /* Radix-3 combine: a 3-point DFT per group of three complex samples. */
  for (i = 0; i < mpass; i++) {
    iusace_complex_3point_fft(ptr_x, ptr_y);

    ptr_x = ptr_x + 6;
    ptr_y = ptr_y + 6;
  }

  /* De-interleave the 3-point outputs into the three output thirds. */
  for (i = 0; i < mpass; i++) {
    data[2 * i] = y[6 * i];
    data[2 * i + 1] = y[6 * i + 1];
  }

  for (i = 0; i < mpass; i++) {
    data[2 * (i + mpass)] = y[6 * i + 2];
    data[2 * (i + mpass) + 1] = y[6 * i + 3];
  }

  for (i = 0; i < mpass; i++) {
    data[2 * (i + 2 * mpass)] = y[6 * i + 4];
    data[2 * (i + 2 * mpass) + 1] = y[6 * i + 5];
  }
}
727 
/* Scratch-free variant of iusace_complex_fft_p3: identical algorithm but
 * uses fixed-size stack buffers instead of the shared scratch structure.
 * data    : 2*nlength interleaved re/im floats, input and output.
 * nlength : number of complex points divisible by 3; the stack buffers
 *           bound the supported size (data_3: 400 complex, y: 512 complex).
 * NOTE(review): ~7.5 KB of stack — verify callers run with enough stack. */
VOID iusace_complex_fft_p3_no_scratch(FLOAT32 *data, WORD32 nlength) {
  WORD32 i, j;

  FLOAT32 data_3[800];
  FLOAT32 y[1024];
  FLOAT32 p_fft_p2_y[2048];
  WORD32 cnfac;
  WORD32 mpass = nlength;
  FLOAT32 *ptr_x = data;
  FLOAT32 *ptr_y = y;

  /* Factor out all powers of 3; mpass keeps the power-of-two remainder. */
  cnfac = 0;
  while (mpass % 3 == 0) {
    mpass /= 3;
    cnfac++;
  }

  /* FFT each interleaved sub-sequence of every third complex sample. */
  for (i = 0; i < 3 * cnfac; i++) {
    for (j = 0; j < mpass; j++) {
      data_3[2 * j] = data[3 * (2 * j) + (2 * i)];
      data_3[2 * j + 1] = data[3 * (2 * j) + 1 + (2 * i)];
    }
    iusace_complex_fft_p2(data_3, mpass, p_fft_p2_y);

    for (j = 0; j < mpass; j++) {
      data[3 * (2 * j) + (2 * i)] = data_3[2 * j];
      data[3 * (2 * j) + 1 + (2 * i)] = data_3[2 * j + 1];
    }
  }

  /* Twiddle groups of three consecutive complex samples. */
  {
    const FLOAT64 *w1r, *w1i;
    FLOAT32 tmp;
    w1r = iusace_twiddle_table_3pr;
    w1i = iusace_twiddle_table_3pi;

    for (i = 0; i < nlength; i += 3) {
      tmp = (FLOAT32)((FLOAT64)data[2 * i] * (*w1r) - (FLOAT64)data[2 * i + 1] * (*w1i));
      data[2 * i + 1] =
          (FLOAT32)((FLOAT64)data[2 * i] * (*w1i) + (FLOAT64)data[2 * i + 1] * (*w1r));
      data[2 * i] = tmp;

      w1r++;
      w1i++;

      tmp = (FLOAT32)((FLOAT64)data[2 * (i + 1)] * (*w1r) -
                      (FLOAT64)data[2 * (i + 1) + 1] * (*w1i));
      data[2 * (i + 1) + 1] = (FLOAT32)((FLOAT64)data[2 * (i + 1)] * (*w1i) +
                                        (FLOAT64)data[2 * (i + 1) + 1] * (*w1r));
      data[2 * (i + 1)] = tmp;

      w1r++;
      w1i++;

      tmp = (FLOAT32)((FLOAT64)data[2 * (i + 2)] * (*w1r) -
                      (FLOAT64)data[2 * (i + 2) + 1] * (*w1i));
      data[2 * (i + 2) + 1] = (FLOAT32)((FLOAT64)data[2 * (i + 2)] * (*w1i) +
                                        (FLOAT64)data[2 * (i + 2) + 1] * (*w1r));
      data[2 * (i + 2)] = tmp;

      /* skip the unused table rows for smaller transform sizes */
      w1r += 3 * (128 / mpass - 1) + 1;
      w1i += 3 * (128 / mpass - 1) + 1;
    }
  }

  /* Radix-3 combine stage. */
  for (i = 0; i < mpass; i++) {
    iusace_complex_3point_fft(ptr_x, ptr_y);

    ptr_x = ptr_x + 6;
    ptr_y = ptr_y + 6;
  }

  /* De-interleave the 3-point outputs into the three output thirds. */
  for (i = 0; i < mpass; i++) {
    data[2 * i] = y[6 * i];
    data[2 * i + 1] = y[6 * i + 1];
  }

  for (i = 0; i < mpass; i++) {
    data[2 * (i + mpass)] = y[6 * i + 2];
    data[2 * (i + mpass) + 1] = y[6 * i + 3];
  }

  for (i = 0; i < mpass; i++) {
    data[2 * (i + 2 * mpass)] = y[6 * i + 4];
    data[2 * (i + 2 * mpass) + 1] = y[6 * i + 5];
  }
}
815 
/* Pre-twiddle for the FFT-based MDCT: folds the 'npoints' time-domain input
 * into npoints/4 complex values and rotates each by (cos, sin) so that a
 * quarter-length complex FFT computes the MDCT.
 * ptr_in  : npoints doubles of windowed input; reversed in place when
 *           tx_flag == 0 (MDST-style reuse of the MDCT path).
 * fft_ptr : out, 2*(npoints/4) interleaved re/im floats for the FFT.
 * cos_ptr/sin_ptr : per-bin twiddle tables, consumed sequentially. */
static VOID iusace_calc_pre_twid_enc(FLOAT64 *ptr_in, FLOAT32 *fft_ptr, WORD32 npoints,
                                     const FLOAT64 *cos_ptr, const FLOAT64 *sin_ptr,
                                     const WORD32 tx_flag) {
  WORD32 i, n;
  WORD32 b = npoints >> 1;
  WORD32 a = npoints - b; /* equals b for even npoints */
  WORD32 nlength = npoints >> 2;
  FLOAT64 tempr, tempi;

  if (tx_flag == 0) {
    FLOAT64 norm;
    /* reuse the MDCT path: time-reverse the whole input buffer */
    for (i = 0; i < b; i++) {
      norm = ptr_in[i];
      ptr_in[i] = ptr_in[npoints - 1 - i];
      ptr_in[npoints - 1 - i] = norm;
    }
  }
  for (i = 0; i < nlength; i++) {
    /* real part: fold from the descending index n */
    n = npoints / 2 - 1 - 2 * i;
    if (i < b / 4) {
      tempr = ptr_in[a / 2 + n] + ptr_in[npoints + a / 2 - 1 - n];
    } else {
      tempr = ptr_in[a / 2 + n] - ptr_in[a / 2 - 1 - n];
    }
    /* imaginary part: fold from the ascending index n */
    n = 2 * i;
    if (i < a / 4) {
      tempi = ptr_in[a / 2 + n] - ptr_in[a / 2 - 1 - n];
    } else {
      tempi = ptr_in[a / 2 + n] + ptr_in[npoints + a / 2 - 1 - n];
    }

    /* rotate by the pre-twiddle factor and narrow to FLOAT32 */
    fft_ptr[2 * i] = (FLOAT32)(tempr * (*cos_ptr) + tempi * (*sin_ptr));
    fft_ptr[2 * i + 1] = (FLOAT32)(tempi * (*cos_ptr++) - tempr * (*sin_ptr++));
  }
}
851 
iusace_complex_fft(FLOAT32 * data,WORD32 nlength,iusace_scratch_mem * pstr_scratch)852 VOID iusace_complex_fft(FLOAT32 *data, WORD32 nlength, iusace_scratch_mem *pstr_scratch) {
853   if (nlength & (nlength - 1)) {
854     iusace_complex_fft_p3(data, nlength, pstr_scratch);
855   } else {
856     iusace_complex_fft_p2(data, nlength, pstr_scratch->p_fft_p2_y);
857   }
858 }
859 
iusace_calc_post_twid_enc(FLOAT64 * ptr_out,FLOAT32 * fft_ptr,WORD32 npoints,const FLOAT64 * cos_ptr,const FLOAT64 * sin_ptr,const WORD32 tx_flag)860 static VOID iusace_calc_post_twid_enc(FLOAT64 *ptr_out, FLOAT32 *fft_ptr, WORD32 npoints,
861                                       const FLOAT64 *cos_ptr, const FLOAT64 *sin_ptr,
862                                       const WORD32 tx_flag) {
863   WORD32 i;
864   WORD32 nlength = npoints >> 2;
865   FLOAT64 tempr, tempi;
866 
867   /* post-twiddle FFT output and then get output data */
868   for (i = 0; i < nlength; i++) {
869     tempr =
870         2 * ((FLOAT64)(fft_ptr[2 * i]) * (*cos_ptr) + (FLOAT64)(fft_ptr[2 * i + 1]) * (*sin_ptr));
871     tempi = 2 * ((FLOAT64)(fft_ptr[2 * i + 1]) * (*cos_ptr++) -
872                  (FLOAT64)(fft_ptr[2 * i]) * (*sin_ptr++));
873 
874     ptr_out[2 * i] = -tempr;
875     ptr_out[npoints / 2 - 1 - 2 * i] = tempi;
876     ptr_out[npoints / 2 + 2 * i] = -tempi;
877     ptr_out[npoints - 1 - 2 * i] = tempr;
878   }
879   if (tx_flag == 0) {
880     for (i = 0; i < npoints; i += 2) {
881       ptr_out[i] *= -1; /* reuse MDCT: flip signs at odd indices */
882     }
883   }
884 }
885 
iusace_fft_based_mdct(FLOAT64 * ptr_in,FLOAT64 * ptr_out,WORD32 npoints,const WORD32 tx_flag,iusace_scratch_mem * pstr_scratch)886 IA_ERRORCODE iusace_fft_based_mdct(FLOAT64 *ptr_in, FLOAT64 *ptr_out, WORD32 npoints,
887                                    const WORD32 tx_flag, iusace_scratch_mem *pstr_scratch) {
888   FLOAT32 *ptr_scratch1 = pstr_scratch->p_fft_mdct_buf;
889   const FLOAT64 *cos_ptr = NULL;
890   const FLOAT64 *sin_ptr = NULL;
891   WORD32 nlength = npoints >> 1;
892   WORD32 n_total = npoints << 1;
893 
894   memset(ptr_scratch1, 0, ((SIZE_T)n_total << 1) * sizeof(*ptr_scratch1));
895 
896   switch (npoints) {
897     case (96):
898       cos_ptr = iexheaac_pre_post_twid_cos_192;
899       sin_ptr = iexheaac_pre_post_twid_sin_192;
900       break;
901     case (128):
902       cos_ptr = iusace_pre_post_twid_cos_256;
903       sin_ptr = iusace_pre_post_twid_sin_256;
904       break;
905     case (768):
906       cos_ptr = iexheaac_pre_post_twid_cos_1536;
907       sin_ptr = iexheaac_pre_post_twid_sin_1536;
908       break;
909     case (1024):
910       cos_ptr = iusace_pre_post_twid_cos_2048;
911       sin_ptr = iusace_pre_post_twid_sin_2048;
912       break;
913     default:
914       return IA_EXHEAACE_EXE_FATAL_USAC_INVALID_WINDOW_LENGTH;
915   }
916 
917   /* pre-twiddle */
918   iusace_calc_pre_twid_enc(ptr_in, ptr_scratch1, npoints << 1, cos_ptr, sin_ptr, tx_flag);
919 
920   /* complex FFT */
921   iusace_complex_fft(ptr_scratch1, nlength, pstr_scratch);
922 
923   /* post-twiddle */
924   iusace_calc_post_twid_enc(ptr_out, ptr_scratch1, npoints << 1, cos_ptr, sin_ptr, tx_flag);
925 
926   return IA_NO_ERROR;
927 }
928 
iusace_complex_fft_2048(FLOAT32 * ptr_x,FLOAT32 * scratch_fft)929 VOID iusace_complex_fft_2048(FLOAT32 *ptr_x, FLOAT32 *scratch_fft) {
930   WORD32 i;
931   FLOAT32 re, im, c_v, s_v, tmp_re, tmp_im;
932   FLOAT32 *ptr_re, *ptr_im, *ptr_re_h, *ptr_im_h;
933   FLOAT32 *ptr_cos_val, *ptr_sin_val;
934   iusace_complex_fft_p2(ptr_x, 1024, scratch_fft);
935   iusace_complex_fft_p2(ptr_x + 2048, 1024, scratch_fft);
936 
937   ptr_re = ptr_x;
938   ptr_im = ptr_x + 1;
939   ptr_re_h = ptr_x + 2048;
940   ptr_im_h = ptr_x + 2048 + 1;
941   ptr_cos_val = (FLOAT32 *)&iusace_twiddle_cos_2048[0];
942   ptr_sin_val = (FLOAT32 *)&iusace_twiddle_sin_2048[0];
943   for (i = 0; i < 1024; i++) {
944     re = *ptr_re_h;
945     im = *ptr_im_h;
946     c_v = ptr_cos_val[i];
947     s_v = ptr_sin_val[i];
948     tmp_re = (re * c_v) + (im * s_v);
949     tmp_im = -(re * s_v) + (im * c_v);
950     re = *ptr_re;
951     im = *ptr_im;
952 
953     *ptr_re = re + tmp_re;
954     *ptr_im = im + tmp_im;
955     *ptr_re_h = re - tmp_re;
956     *ptr_im_h = im - tmp_im;
957 
958     ptr_re += 2;
959     ptr_im += 2;
960     ptr_re_h += 2;
961     ptr_im_h += 2;
962   }
963 }
/* In-place complex FFT built from radix-4 decimation-in-time stages, with an
 * optional trailing radix-2 stage when log2(n_points) is odd.
 *
 * ptr_real / ptr_imag : n_points real and imaginary samples; overwritten
 *                       with the transform result.
 * n_points            : transform length, a power of two.
 *                       NOTE(review): the scratch split (y = ptr_scratch + 2048)
 *                       implies n_points <= 1024 — confirm with callers.
 * ptr_scratch         : workspace; the first 2*n_points floats hold the
 *                       interleaved input copy, the next 2*n_points the
 *                       working/output buffer.
 *
 * Twiddles come from ia_fft_twiddle_table_float; the recurring "+257" offset
 * appears to select the companion (sine) half of that table — layout is
 * defined in the ROM tables, verify there.
 */
static VOID ixheaace_rad2_cplx_fft(FLOAT32 *ptr_real, FLOAT32 *ptr_imag, WORD32 n_points,
                                   FLOAT32 *ptr_scratch) {
  WORD32 i, j, k, n_stages, h2;
  FLOAT32 x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
  WORD32 del, nodespacing, in_loop_cnt;
  WORD32 not_power_4;
  WORD32 dig_rev_shift;
  WORD32 m_points = n_points;
  FLOAT32 *ptr_x = ptr_scratch;
  FLOAT32 *y = ptr_scratch + 2048;
  FLOAT32 *ptr_y = y;
  const FLOAT32 *ptr_w;

  dig_rev_shift = ixheaac_norm32(m_points) + 1 - 16;
  /* n_stages = log2(n_points); an odd value means one radix-2 stage remains
     after all radix-4 stages */
  n_stages = 30 - ixheaac_norm32(m_points);
  not_power_4 = n_stages & 1;

  n_stages = n_stages >> 1; /* number of radix-4 stages */

  ptr_w = ia_fft_twiddle_table_float;

  /* copy input into scratch as interleaved {re, im} pairs */
  for (i = 0; i < n_points; i++) {
    ptr_x[2 * i] = ptr_real[i];
    ptr_x[2 * i + 1] = ptr_imag[i];
  }
  dig_rev_shift = max(dig_rev_shift, 0);
  /* first radix-4 stage: gather inputs in digit-reversed order, write
     butterflied outputs sequentially to ptr_y */
  for (i = 0; i < n_points; i += 4) {
    FLOAT32 *inp = ptr_x;
    FLOAT32 tmk;

    DIG_REV(i, dig_rev_shift, h2);
    if (not_power_4) {
      h2 += 1;
      h2 &= ~1;
    }
    inp += (h2);

    x0r = *inp;
    x0i = *(inp + 1);
    inp += (n_points >> 1);

    x1r = *inp;
    x1i = *(inp + 1);
    inp += (n_points >> 1);

    x2r = *inp;
    x2i = *(inp + 1);
    inp += (n_points >> 1);

    x3r = *inp;
    x3i = *(inp + 1);

    /* radix-4 butterfly; "tmk = a - b; b = tmk - b" computes a - 2b, i.e. the
       difference leg, without reloading the pre-sum values */
    x0r = ia_add_flt(x0r, x2r);
    x0i = ia_add_flt(x0i, x2i);

    tmk = ia_sub_flt(x0r, x2r);
    x2r = ia_sub_flt(tmk, x2r);
    tmk = ia_sub_flt(x0i, x2i);
    x2i = ia_sub_flt(tmk, x2i);

    x1r = ia_add_flt(x1r, x3r);
    x1i = ia_add_flt(x1i, x3i);

    tmk = ia_sub_flt(x1r, x3r);
    x3r = ia_sub_flt(tmk, x3r);
    tmk = ia_sub_flt(x1i, x3i);
    x3i = ia_sub_flt(tmk, x3i);

    x0r = ia_add_flt(x0r, x1r);
    x0i = ia_add_flt(x0i, x1i);

    tmk = ia_sub_flt(x0r, x1r);
    x1r = ia_sub_flt(tmk, x1r);
    tmk = ia_sub_flt(x0i, x1i);
    x1i = ia_sub_flt(tmk, x1i);

    x2r = ia_add_flt(x2r, x3i);
    x2i = ia_sub_flt(x2i, x3r);

    tmk = ia_sub_flt(x2r, x3i);
    x3i = ia_sub_flt(tmk, x3i);
    tmk = ia_add_flt(x2i, x3r);
    x3r = ia_add_flt(tmk, x3r);

    *ptr_y++ = x0r;
    *ptr_y++ = x0i;
    *ptr_y++ = x2r;
    *ptr_y++ = x2i;
    *ptr_y++ = x1r;
    *ptr_y++ = x1i;
    *ptr_y++ = x3i;
    *ptr_y++ = x3r;
  }
  ptr_y -= 2 * n_points; /* rewind to the start of the working buffer */
  del = 4;               /* butterfly span, quadruples per stage */
  nodespacing = 64;      /* twiddle-table stride, shrinks per stage */
  in_loop_cnt = n_points >> 4;
  /* remaining radix-4 stages (the j == 0 column needs no twiddles; the four
     j-loops below cover successive twiddle-index ranges with the sign/offset
     conventions of the packed twiddle table) */
  for (i = n_stages - 1; i > 0; i--) {
    const FLOAT32 *twiddles = ptr_w;
    FLOAT32 *data = ptr_y;
    FLOAT32 w_1, w_2, w_3, w_4, w_5, w_6;
    WORD32 sec_loop_cnt;

    /* twiddle-free butterflies (twiddle factor 1 + 0j) */
    for (k = in_loop_cnt; k != 0; k--) {
      x0r = (*data);
      x0i = (*(data + 1));
      data += ((SIZE_T)del << 1);

      x1r = (*data);
      x1i = (*(data + 1));
      data += ((SIZE_T)del << 1);

      x2r = (*data);
      x2i = (*(data + 1));
      data += ((SIZE_T)del << 1);

      x3r = (*data);
      x3i = (*(data + 1));
      data -= 3 * (del << 1);

      /* ia_msu_flt(a, b, 2) = a - 2b: difference leg from the updated sum */
      x0r = ia_add_flt(x0r, x2r);
      x0i = ia_add_flt(x0i, x2i);
      x2r = ia_msu_flt(x0r, x2r, 2);
      x2i = ia_msu_flt(x0i, x2i, 2);
      x1r = ia_add_flt(x1r, x3r);
      x1i = ia_add_flt(x1i, x3i);
      x3r = ia_msu_flt(x1r, x3r, 2);
      x3i = ia_msu_flt(x1i, x3i, 2);

      x0r = ia_add_flt(x0r, x1r);
      x0i = ia_add_flt(x0i, x1i);
      x1r = ia_msu_flt(x0r, x1r, 2);
      x1i = ia_msu_flt(x0i, x1i, 2);
      x2r = ia_add_flt(x2r, x3i);
      x2i = ia_sub_flt(x2i, x3r);
      x3i = ia_msu_flt(x2r, x3i, 2);
      x3r = ia_mac_flt(x2i, x3r, 2);

      *data = x0r;
      *(data + 1) = x0i;
      data += ((SIZE_T)del << 1);

      *data = x2r;
      *(data + 1) = x2i;
      data += ((SIZE_T)del << 1);

      *data = x1r;
      *(data + 1) = x1i;
      data += ((SIZE_T)del << 1);

      *data = x3i;
      *(data + 1) = x3r;
      data += ((SIZE_T)del << 1);
    }
    data = ptr_y + 2;

    /* boundary index where the 3j twiddle leaves the first table segment;
       the alternating-fraction sum approximates (nodespacing*del)/3 */
    sec_loop_cnt = (nodespacing * del);
    sec_loop_cnt = (sec_loop_cnt / 4) + (sec_loop_cnt / 8) - (sec_loop_cnt / 16) +
                   (sec_loop_cnt / 32) - (sec_loop_cnt / 64) + (sec_loop_cnt / 128) -
                   (sec_loop_cnt / 256);

    /* range 1: j, 2j and 3j all index the first table segment directly */
    for (j = nodespacing; j <= sec_loop_cnt; j += nodespacing) {
      w_1 = *(twiddles + j);
      w_4 = *(twiddles + j + 257);
      w_2 = *(twiddles + ((SIZE_T)j << 1));
      w_5 = *(twiddles + ((SIZE_T)j << 1) + 257);
      w_3 = *(twiddles + j + ((SIZE_T)j << 1));
      w_6 = *(twiddles + j + ((SIZE_T)j << 1) + 257);

      for (k = in_loop_cnt; k != 0; k--) {
        FLOAT32 tmp;
        /*x0 is loaded later to avoid register crunch*/

        data += ((SIZE_T)del << 1);

        x1r = *data;
        x1i = *(data + 1);
        data += ((SIZE_T)del << 1);

        x2r = *data;
        x2i = *(data + 1);
        data += ((SIZE_T)del << 1);

        x3r = *data;
        x3i = *(data + 1);
        data -= 3 * (del << 1);

        /* complex multiply of each leg by its twiddle factor */
        tmp = ia_sub_flt(ia_mul_flt(x1r, w_1), ia_mul_flt(x1i, w_4));
        x1i = ia_mac_flt(ia_mul_flt(x1r, w_4), x1i, w_1);
        x1r = tmp;

        tmp = ia_sub_flt(ia_mul_flt(x2r, w_2), ia_mul_flt(x2i, w_5));
        x2i = ia_mac_flt(ia_mul_flt(x2r, w_5), x2i, w_2);
        x2r = tmp;

        tmp = ia_sub_flt(ia_mul_flt(x3r, w_3), ia_mul_flt(x3i, w_6));
        x3i = ia_mac_flt(ia_mul_flt(x3r, w_6), x3i, w_3);
        x3r = tmp;

        x0r = (*data);
        x0i = (*(data + 1));

        x0r = ia_add_flt(x0r, (x2r));
        x0i = ia_add_flt(x0i, (x2i));
        x2r = ia_msu_flt(x0r, x2r, 2);
        x2i = ia_msu_flt(x0i, x2i, 2);
        x1r = ia_add_flt(x1r, x3r);
        x1i = ia_add_flt(x1i, x3i);
        x3r = ia_msu_flt(x1r, x3r, 2);
        x3i = ia_msu_flt(x1i, x3i, 2);

        x0r = ia_add_flt(x0r, (x1r));
        x0i = ia_add_flt(x0i, (x1i));
        x1r = ia_msu_flt(x0r, x1r, 2);
        x1i = ia_msu_flt(x0i, x1i, 2);
        x2r = ia_add_flt(x2r, (x3i));
        x2i = ia_sub_flt(x2i, (x3r));
        x3i = ia_msu_flt(x2r, x3i, 2);
        x3r = ia_mac_flt(x2i, x3r, 2);

        *data = x0r;
        *(data + 1) = x0i;
        data += ((SIZE_T)del << 1);

        *data = x2r;
        *(data + 1) = x2i;
        data += ((SIZE_T)del << 1);

        *data = x1r;
        *(data + 1) = x1i;
        data += ((SIZE_T)del << 1);

        *data = x3i;
        *(data + 1) = x3r;
        data += ((SIZE_T)del << 1);
      }
      data -= 2 * n_points;
      data += 2;
    }
    /* range 2: 3j wraps the first table segment — w_3/w_6 are fetched with
       folded offsets and the x3 rotation uses the mirrored sign convention */
    for (; j <= (nodespacing * del) >> 1; j += nodespacing) {
      w_1 = *(twiddles + j);
      w_4 = *(twiddles + j + 257);
      w_2 = *(twiddles + ((SIZE_T)j << 1));
      w_5 = *(twiddles + ((SIZE_T)j << 1) + 257);
      w_3 = *(twiddles + j + ((SIZE_T)j << 1) - 256);
      w_6 = *(twiddles + j + ((SIZE_T)j << 1) + 1);

      for (k = in_loop_cnt; k != 0; k--) {
        FLOAT32 tmp;
        /*x0 is loaded later to avoid register crunch*/

        data += ((SIZE_T)del << 1);

        x1r = *data;
        x1i = *(data + 1);
        data += ((SIZE_T)del << 1);

        x2r = *data;
        x2i = *(data + 1);
        data += ((SIZE_T)del << 1);

        x3r = *data;
        x3i = *(data + 1);
        data -= 3 * (del << 1);

        tmp = ia_sub_flt(ia_mul_flt(x1r, w_1), ia_mul_flt(x1i, w_4));
        x1i = ia_mac_flt(ia_mul_flt(x1r, w_4), x1i, w_1);
        x1r = tmp;

        tmp = ia_sub_flt(ia_mul_flt(x2r, w_2), ia_mul_flt(x2i, w_5));
        x2i = ia_mac_flt(ia_mul_flt(x2r, w_5), x2i, w_2);
        x2r = tmp;

        /* mirrored quadrant: cos/sin roles of w_3/w_6 are swapped */
        tmp = ia_add_flt(ia_mul_flt(x3r, w_6), ia_mul_flt(x3i, w_3));
        x3i = ia_add_flt(ia_negate_flt(ia_mul_flt(x3r, w_3)), ia_mul_flt(x3i, w_6));
        x3r = tmp;

        x0r = (*data);
        x0i = (*(data + 1));

        x0r = ia_add_flt(x0r, (x2r));
        x0i = ia_add_flt(x0i, (x2i));
        x2r = ia_msu_flt(x0r, x2r, 2);
        x2i = ia_msu_flt(x0i, x2i, 2);
        x1r = ia_add_flt(x1r, x3r);
        x1i = ia_add_flt(x1i, x3i);
        x3r = ia_msu_flt(x1r, x3r, 2);
        x3i = ia_msu_flt(x1i, x3i, 2);

        x0r = ia_add_flt(x0r, (x1r));
        x0i = ia_add_flt(x0i, (x1i));
        x1r = ia_msu_flt(x0r, x1r, 2);
        x1i = ia_msu_flt(x0i, x1i, 2);
        x2r = ia_add_flt(x2r, (x3i));
        x2i = ia_sub_flt(x2i, (x3r));
        x3i = ia_msu_flt(x2r, x3i, 2);
        x3r = ia_mac_flt(x2i, x3r, 2);

        *data = x0r;
        *(data + 1) = x0i;
        data += ((SIZE_T)del << 1);

        *data = x2r;
        *(data + 1) = x2i;
        data += ((SIZE_T)del << 1);

        *data = x1r;
        *(data + 1) = x1i;
        data += ((SIZE_T)del << 1);

        *data = x3i;
        *(data + 1) = x3r;
        data += ((SIZE_T)del << 1);
      }
      data -= 2 * n_points;
      data += 2;
    }
    /* range 3: both 2j and 3j use folded offsets */
    for (; j <= sec_loop_cnt * 2; j += nodespacing) {
      w_1 = *(twiddles + j);
      w_4 = *(twiddles + j + 257);
      w_2 = *(twiddles + ((SIZE_T)j << 1) - 256);
      w_5 = *(twiddles + ((SIZE_T)j << 1) + 1);
      w_3 = *(twiddles + j + ((SIZE_T)j << 1) - 256);
      w_6 = *(twiddles + j + ((SIZE_T)j << 1) + 1);

      for (k = in_loop_cnt; k != 0; k--) {
        FLOAT32 tmp;
        /*x0 is loaded later to avoid register crunch*/

        data += ((SIZE_T)del << 1);

        x1r = *data;
        x1i = *(data + 1);
        data += ((SIZE_T)del << 1);

        x2r = *data;
        x2i = *(data + 1);
        data += ((SIZE_T)del << 1);

        x3r = *data;
        x3i = *(data + 1);
        data -= 3 * (del << 1);

        tmp = ia_sub_flt(ia_mul_flt(x1r, w_1), ia_mul_flt(x1i, w_4));
        x1i = ia_mac_flt(ia_mul_flt(x1r, w_4), x1i, w_1);
        x1r = tmp;

        tmp = ia_add_flt(ia_mul_flt(x2r, w_5), ia_mul_flt(x2i, w_2));
        x2i = ia_add_flt(ia_negate_flt(ia_mul_flt(x2r, w_2)), ia_mul_flt(x2i, w_5));
        x2r = tmp;

        tmp = ia_add_flt(ia_mul_flt(x3r, w_6), ia_mul_flt(x3i, w_3));
        x3i = ia_add_flt(ia_negate_flt(ia_mul_flt(x3r, w_3)), ia_mul_flt(x3i, w_6));
        x3r = tmp;

        x0r = (*data);
        x0i = (*(data + 1));

        x0r = ia_add_flt(x0r, (x2r));
        x0i = ia_add_flt(x0i, (x2i));
        x2r = ia_msu_flt(x0r, x2r, 2);
        x2i = ia_msu_flt(x0i, x2i, 2);
        x1r = ia_add_flt(x1r, x3r);
        x1i = ia_add_flt(x1i, x3i);
        x3r = ia_msu_flt(x1r, x3r, 2);
        x3i = ia_msu_flt(x1i, x3i, 2);

        x0r = ia_add_flt(x0r, (x1r));
        x0i = ia_add_flt(x0i, (x1i));
        x1r = ia_msu_flt(x0r, x1r, 2);
        x1i = ia_msu_flt(x0i, x1i, 2);
        x2r = ia_add_flt(x2r, (x3i));
        x2i = ia_sub_flt(x2i, (x3r));
        x3i = ia_msu_flt(x2r, x3i, 2);
        x3r = ia_mac_flt(x2i, x3r, 2);

        *data = x0r;
        *(data + 1) = x0i;
        data += ((SIZE_T)del << 1);

        *data = x2r;
        *(data + 1) = x2i;
        data += ((SIZE_T)del << 1);

        *data = x1r;
        *(data + 1) = x1i;
        data += ((SIZE_T)del << 1);

        *data = x3i;
        *(data + 1) = x3r;
        data += ((SIZE_T)del << 1);
      }
      data -= 2 * n_points;
      data += 2;
    }
    /* range 4: 3j wraps twice (-512 fold); note the sign flips in the x1/x3
       combine compared to the earlier ranges */
    for (; j < nodespacing * del; j += nodespacing) {
      w_1 = *(twiddles + j);
      w_4 = *(twiddles + j + 257);
      w_2 = *(twiddles + ((SIZE_T)j << 1) - 256);
      w_5 = *(twiddles + ((SIZE_T)j << 1) + 1);
      w_3 = *(twiddles + j + ((SIZE_T)j << 1) - 512);
      w_6 = *(twiddles + j + ((SIZE_T)j << 1) - 512 + 257);

      for (k = in_loop_cnt; k != 0; k--) {
        FLOAT32 tmp;
        /*x0 is loaded later to avoid register crunch*/

        data += ((SIZE_T)del << 1);

        x1r = *data;
        x1i = *(data + 1);
        data += ((SIZE_T)del << 1);

        x2r = *data;
        x2i = *(data + 1);
        data += ((SIZE_T)del << 1);

        x3r = *data;
        x3i = *(data + 1);
        data -= 3 * (del << 1);

        tmp = ia_sub_flt(ia_mul_flt(x1r, w_1), ia_mul_flt(x1i, w_4));
        x1i = ia_mac_flt(ia_mul_flt(x1r, w_4), x1i, w_1);
        x1r = tmp;

        tmp = ia_add_flt(ia_mul_flt(x2r, w_5), ia_mul_flt(x2i, w_2));
        x2i = ia_add_flt(ia_negate_flt(ia_mul_flt(x2r, w_2)), ia_mul_flt(x2i, w_5));
        x2r = tmp;

        tmp = ia_add_flt(ia_negate_flt(ia_mul_flt(x3r, w_3)), ia_mul_flt(x3i, w_6));
        x3i = ia_mac_flt(ia_mul_flt(x3r, w_6), x3i, w_3);
        x3r = tmp;

        x0r = (*data);
        x0i = (*(data + 1));

        x0r = ia_add_flt(x0r, (x2r));
        x0i = ia_add_flt(x0i, (x2i));
        x2r = ia_msu_flt(x0r, x2r, 2);
        x2i = ia_msu_flt(x0i, x2i, 2);
        x1r = ia_add_flt(x1r, x3r);
        x1i = ia_sub_flt(x1i, x3i);
        x3r = ia_msu_flt(x1r, x3r, 2);
        x3i = ia_mac_flt(x1i, x3i, 2);

        x0r = ia_add_flt(x0r, (x1r));
        x0i = ia_add_flt(x0i, (x1i));
        x1r = ia_msu_flt(x0r, x1r, 2);
        x1i = ia_msu_flt(x0i, x1i, 2);
        x2r = ia_add_flt(x2r, (x3i));
        x2i = ia_sub_flt(x2i, (x3r));
        x3i = ia_msu_flt(x2r, x3i, 2);
        x3r = ia_mac_flt(x2i, x3r, 2);

        *data = x0r;
        *(data + 1) = x0i;
        data += ((SIZE_T)del << 1);

        *data = x2r;
        *(data + 1) = x2i;
        data += ((SIZE_T)del << 1);

        *data = x1r;
        *(data + 1) = x1i;
        data += ((SIZE_T)del << 1);

        *data = x3i;
        *(data + 1) = x3r;
        data += ((SIZE_T)del << 1);
      }
      data -= 2 * n_points;
      data += 2;
    }
    nodespacing >>= 2;
    del <<= 2;
    in_loop_cnt >>= 2;
  }
  /* trailing radix-2 stage when log2(n_points) is odd; split into two loops
     covering the two halves of the twiddle circle */
  if (not_power_4) {
    const FLOAT32 *twiddles = ptr_w;
    nodespacing <<= 1;

    for (j = del / 2; j != 0; j--) {
      FLOAT32 w_1 = *twiddles;
      FLOAT32 w_4 = *(twiddles + 257);
      FLOAT32 tmp;
      twiddles += nodespacing;

      x0r = *ptr_y;
      x0i = *(ptr_y + 1);
      ptr_y += ((SIZE_T)del << 1);

      x1r = *ptr_y;
      x1i = *(ptr_y + 1);

      tmp = ia_sub_flt(ia_mul_flt(x1r, w_1), ia_mul_flt(x1i, w_4));
      x1i = (FLOAT32)ia_mac_flt(ia_mul_flt(x1r, w_4), x1i, w_1);
      x1r = tmp;

      *ptr_y = ia_sub_flt((x0r), (x1r));
      *(ptr_y + 1) = ia_sub_flt((x0i), (x1i));
      ptr_y -= ((SIZE_T)del << 1);

      *ptr_y = ia_add_flt((x0r), (x1r));
      *(ptr_y + 1) = ia_add_flt((x0i), (x1i));
      ptr_y += 2;
    }
    twiddles = ptr_w;
    for (j = del / 2; j != 0; j--) {
      FLOAT32 w_1 = *twiddles;
      FLOAT32 w_4 = *(twiddles + 257);
      FLOAT32 tmp;
      twiddles += nodespacing;

      x0r = *ptr_y;
      x0i = *(ptr_y + 1);
      ptr_y += ((SIZE_T)del << 1);

      x1r = *ptr_y;
      x1i = *(ptr_y + 1);

      /* second half-circle: swapped/negated twiddle roles */
      tmp = ia_add_flt(ia_mul_flt(x1r, w_4), ia_mul_flt(x1i, w_1));
      x1i = ia_add_flt(ia_negate_flt(ia_mul_flt(x1r, w_1)), ia_mul_flt(x1i, w_4));
      x1r = tmp;

      *ptr_y = ia_sub_flt((x0r), (x1r));
      *(ptr_y + 1) = ia_sub_flt((x0i), (x1i));
      ptr_y -= ((SIZE_T)del << 1);

      *ptr_y = ia_add_flt((x0r), (x1r));
      *(ptr_y + 1) = ia_add_flt((x0i), (x1i));
      ptr_y += 2;
    }
  }

  /* de-interleave the result back into the caller's buffers */
  for (i = 0; i < n_points; i++) {
    ptr_real[i] = y[2 * i];
    ptr_imag[i] = y[2 * i + 1];
  }
}
ixheaace_cplx_fft_4(FLOAT32 * x_r,FLOAT32 * x_i)1503 static VOID ixheaace_cplx_fft_4(FLOAT32 *x_r, FLOAT32 *x_i) {
1504   FLOAT32 x_0, x_1, x_2, x_3;
1505   FLOAT32 x_4, x_5, x_6, x_7;
1506   FLOAT32 x0r, x1r, x2r, x3r;
1507   FLOAT32 x0i, x1i, x2i, x3i;
1508 
1509   // 4 Point FFT
1510   x_0 = x_r[0];
1511   x_1 = x_i[0];
1512   x_2 = x_r[1];
1513   x_3 = x_i[1];
1514   x_4 = x_r[2];
1515   x_5 = x_i[2];
1516   x_6 = x_r[3];
1517   x_7 = x_i[3];
1518 
1519   x0r = ia_add_flt(x_0, x_4);
1520   x0i = ia_add_flt(x_1, x_5);
1521   x2r = ia_sub_flt(x_0, x_4);
1522   x2i = ia_sub_flt(x_1, x_5);
1523   x1r = ia_add_flt(x_2, x_6);
1524   x1i = ia_add_flt(x_3, x_7);
1525   x3r = ia_sub_flt(x_2, x_6);
1526   x3i = ia_sub_flt(x_3, x_7);
1527 
1528   x_r[0] = ia_add_flt(x0r, x1r);
1529   x_i[0] = ia_add_flt(x0i, x1i);
1530   x_r[2] = ia_sub_flt(x0r, x1r);
1531   x_i[2] = ia_sub_flt(x0i, x1i);
1532   x_r[1] = ia_add_flt(x2r, x3i);
1533   x_i[1] = ia_sub_flt(x2i, x3r);
1534   x_r[3] = ia_sub_flt(x2r, x3i);
1535   x_i[3] = ia_add_flt(x2i, x3r);
1536   return;
1537 }
iusace_complex_fft_4096(FLOAT32 * ptr_x_r,FLOAT32 * ptr_x_i,FLOAT32 * ptr_scratch_buf)1538 VOID iusace_complex_fft_4096(FLOAT32 *ptr_x_r, FLOAT32 *ptr_x_i, FLOAT32 *ptr_scratch_buf) {
1539   FLOAT32 *ptr_data_r;
1540   FLOAT32 *ptr_data_i;
1541   WORD32 fft_len = 4096;
1542   FLOAT32 *ptr_fft_interim_buf = &ptr_scratch_buf[2 * fft_len];
1543   WORD32 i, j;
1544   WORD32 dim2 = fft_len >> 10;
1545   WORD32 dim1 = fft_len / dim2;
1546   WORD32 fac = 4;
1547 
1548   for (i = 0; i < dim2; i++) {
1549     ptr_data_r = &ptr_scratch_buf[(2 * i + 0) * dim1];
1550     ptr_data_i = &ptr_scratch_buf[(2 * i + 1) * dim1];
1551     for (j = 0; j < dim1; j++) {
1552       ptr_data_r[j] = ptr_x_r[(dim2 * j + i)];
1553       ptr_data_i[j] = 0;
1554     }
1555     ixheaace_rad2_cplx_fft(ptr_data_r, ptr_data_i, dim1, ptr_fft_interim_buf);
1556   }
1557   ptr_data_r = &ptr_scratch_buf[0];
1558   ptr_data_i = &ptr_scratch_buf[0];
1559   for (i = 0; i < dim1; i++) {
1560     FLOAT32 *ptr_cos_val = (FLOAT32 *)&ia_mixed_rad_twiddle_cos[i * dim2 * fac];
1561     FLOAT32 *ptr_sin_val = (FLOAT32 *)&ia_mixed_rad_twiddle_sin[i * dim2 * fac];
1562     for (j = 0; j < dim2; j++) {
1563       FLOAT32 real = ptr_data_r[(2 * j + 0) * dim1 + i];
1564       FLOAT32 imag = ptr_data_i[(2 * j + 1) * dim1 + i];
1565       FLOAT32 cos_val = ptr_cos_val[j * fac];
1566       FLOAT32 sin_val = ptr_sin_val[j * fac];
1567       FLOAT32 temp_real = (FLOAT32)(real * cos_val + imag * sin_val);
1568       FLOAT32 temp_imag = (FLOAT32)(imag * cos_val - real * sin_val);
1569       ptr_fft_interim_buf[(2 * i + 0) * dim2 + j] = temp_real;
1570       ptr_fft_interim_buf[(2 * i + 1) * dim2 + j] = temp_imag;
1571     }
1572   }
1573   for (i = 0; i < dim1; i++) {
1574     ptr_data_r = &ptr_fft_interim_buf[(2 * i + 0) * dim2];
1575     ptr_data_i = &ptr_fft_interim_buf[(2 * i + 1) * dim2];
1576     ixheaace_cplx_fft_4(ptr_data_r, ptr_data_i);
1577   }
1578   ptr_data_r = &ptr_fft_interim_buf[0];
1579   ptr_data_i = &ptr_fft_interim_buf[0];
1580   for (i = 0; i < dim1; i++) {
1581     for (j = 0; j < dim2; j++) {
1582       ptr_x_r[(j * dim1 + i)] = ptr_data_r[(2 * i + 0) * dim2 + j];
1583       ptr_x_i[(j * dim1 + i)] = ptr_data_i[(2 * i + 1) * dim2 + j];
1584     }
1585   }
1586 }