xref: /aosp_15_r20/external/libaom/aom_dsp/x86/fft_sse2.c (revision 77c1e3ccc04c968bd2bc212e87364f250e820521)
/*
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
11*77c1e3ccSAndroid Build Coastguard Worker 
12*77c1e3ccSAndroid Build Coastguard Worker #include <xmmintrin.h>
13*77c1e3ccSAndroid Build Coastguard Worker 
14*77c1e3ccSAndroid Build Coastguard Worker #include "config/aom_dsp_rtcd.h"
15*77c1e3ccSAndroid Build Coastguard Worker #include "aom_dsp/aom_dsp_common.h"
16*77c1e3ccSAndroid Build Coastguard Worker #include "aom_dsp/fft_common.h"
17*77c1e3ccSAndroid Build Coastguard Worker 
// Transposes a single 4x4 tile of floats.
//
// Reads four rows of four floats starting at A (consecutive rows are lda
// floats apart) and writes the transposed tile to B (consecutive rows are ldb
// floats apart). All four rows are loaded before anything is stored.
//
// _mm_load_ps / _mm_store_ps are the aligned variants, so every row address
// (A + k * lda and B + k * ldb for k = 0..3) must be 16-byte aligned.
static inline void transpose4x4(const float *A, float *B, const int lda,
                                const int ldb) {
  __m128 r0 = _mm_load_ps(A);
  __m128 r1 = _mm_load_ps(A + lda);
  __m128 r2 = _mm_load_ps(A + 2 * lda);
  __m128 r3 = _mm_load_ps(A + 3 * lda);

  // In-register 4x4 transpose: afterwards rK holds column K of the input.
  _MM_TRANSPOSE4_PS(r0, r1, r2, r3);

  _mm_store_ps(B, r0);
  _mm_store_ps(B + ldb, r1);
  _mm_store_ps(B + 2 * ldb, r2);
  _mm_store_ps(B + 3 * ldb, r3);
}
30*77c1e3ccSAndroid Build Coastguard Worker 
// Referenced by fft_avx2.c.
void aom_transpose_float_sse2(const float *A, float *B, int n);

// Writes the transpose of the n x n row-major float matrix A into B, one 4x4
// tile at a time.
//
// Both loops advance in steps of 4 and every tile is a full 4x4, so n is
// expected to be a multiple of 4; any other positive n would touch elements
// outside the n x n area. The tile helper uses aligned loads/stores, so A and
// B presumably must be 16-byte aligned -- confirm against callers.
void aom_transpose_float_sse2(const float *A, float *B, int n) {
  for (int row = 0; row < n; row += 4) {
    const float *src_band = A + row * n;  // rows row..row+3 of A
    float *dst_band = B + row;            // columns row..row+3 of B
    for (int col = 0; col < n; col += 4) {
      transpose4x4(src_band + col, dst_band + col * n, n, n);
    }
  }
}
41*77c1e3ccSAndroid Build Coastguard Worker 
// Referenced by fft_avx2.c.
void aom_fft_unpack_2d_output_sse2(const float *packed, float *output, int n);

// Expands the n x n real array `packed` (row-major) into interleaved
// (real, imaginary) pairs in `output`, whose rows are 2 * n floats long.
//
// Only columns 0..n/2 of each output row are written. With h = n / 2:
//   - entries (0,0), (h,0), (0,h) and (h,h) get imaginary part 0;
//   - rows 0 and h take the real part from column c and the imaginary part
//     from column c + h of the same packed row;
//   - rows 1..h-1 combine packed row r with packed row r + h;
//   - rows h+1..n-1 are built from packed rows (n - row) and (n - row) + h,
//     with the signs of the cross terms flipped.
//
// Columns 1..3 are handled with scalar code; columns 4..h-1 are processed
// four at a time with aligned SSE loads/stores, which only happens when
// h > 4. For that path `packed` and `output` must be 16-byte aligned and n a
// multiple of 4 so that every vector address is aligned.
void aom_fft_unpack_2d_output_sse2(const float *packed, float *output, int n) {
  const int half = n / 2;
  // First column that is NOT handled by the scalar loops (same value as
  // AOMMIN(half, 4)).
  const int scalar_end = half < 4 ? half : 4;
  const float *const mid_in = packed + half * n;
  float *const mid_out = output + 2 * (half * n);

  // The four purely real entries.
  output[0] = packed[0];
  output[1] = 0;
  mid_out[0] = mid_in[0];
  mid_out[1] = 0;

  output[2 * half] = packed[half];
  output[2 * half + 1] = 0;
  mid_out[2 * half] = mid_in[half];
  mid_out[2 * half + 1] = 0;

  // Rows 0 and half: real and imaginary parts sit in the same packed row.
  for (int col = 1; col < half; ++col) {
    output[2 * col] = packed[col];
    output[2 * col + 1] = packed[col + half];
    mid_out[2 * col] = mid_in[col];
    mid_out[2 * col + 1] = mid_in[col + half];
  }

  for (int row = 1; row < half; ++row) {
    // ---- Output row `row`, built from packed rows row and row + half. ----
    const float *const lo = packed + row * n;
    const float *const hi = packed + (row + half) * n;
    float *const dst = output + 2 * (row * n);

    dst[0] = lo[0];
    dst[1] = hi[0];
    dst[2 * half] = lo[half];
    dst[2 * half + 1] = hi[half];

    for (int col = 1; col < scalar_end; ++col) {
      dst[2 * col] = lo[col] - hi[col + half];
      dst[2 * col + 1] = hi[col] + lo[col + half];
    }

    for (int col = 4; col < half; col += 4) {
      const __m128 re =
          _mm_sub_ps(_mm_load_ps(lo + col), _mm_load_ps(hi + col + half));
      const __m128 im =
          _mm_add_ps(_mm_load_ps(hi + col), _mm_load_ps(lo + col + half));
      // unpacklo/unpackhi interleave re/im into (re0, im0, re1, im1) and
      // (re2, im2, re3, im3).
      _mm_store_ps(dst + 2 * col, _mm_unpacklo_ps(re, im));
      _mm_store_ps(dst + 2 * (col + 2), _mm_unpackhi_ps(re, im));
    }

    // ---- Output row row + half, built from packed rows src_row and ----
    // ---- src_row + half, where src_row = n - (row + half).         ----
    const int mirror_row = row + half;
    const int src_row = n - mirror_row;
    const float *const mlo = packed + src_row * n;
    const float *const mhi = packed + (src_row + half) * n;
    float *const mdst = output + 2 * (mirror_row * n);

    mdst[0] = mlo[0];
    mdst[1] = -mhi[0];
    mdst[2 * half] = mlo[half];
    mdst[2 * half + 1] = -mhi[half];

    for (int col = 1; col < scalar_end; ++col) {
      mdst[2 * col] = mlo[col] + mhi[col + half];
      mdst[2 * col + 1] = -mhi[col] + mlo[col + half];
    }

    for (int col = 4; col < half; col += 4) {
      const __m128 re =
          _mm_add_ps(_mm_load_ps(mlo + col), _mm_load_ps(mhi + col + half));
      const __m128 im =
          _mm_sub_ps(_mm_load_ps(mlo + col + half), _mm_load_ps(mhi + col));
      _mm_store_ps(mdst + 2 * col, _mm_unpacklo_ps(re, im));
      _mm_store_ps(mdst + 2 * (col + 2), _mm_unpackhi_ps(re, im));
    }
  }
}
112*77c1e3ccSAndroid Build Coastguard Worker 
// Generate definitions for 1d transforms using float and __m128.
//
// Each invocation passes the function qualifiers (static inline void), the
// suffix sse2, the scalar type float, the vector type __m128 and the SSE
// intrinsics to use for load, store, broadcast (set1), add and subtract. The
// 8-, 16- and 32-point variants additionally receive _mm_mul_ps; the 4-point
// variant does not.
// NOTE(review): the macros are not defined in this file (presumably in
// aom_dsp/fft_common.h) and presumably emit the aom_fft1d_{4,8,16,32}_sse2
// kernels used by the wrappers in this file -- confirm against the macro
// definitions.
GEN_FFT_4(static inline void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
          _mm_set1_ps, _mm_add_ps, _mm_sub_ps)
GEN_FFT_8(static inline void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
          _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps)
GEN_FFT_16(static inline void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
           _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps)
GEN_FFT_32(static inline void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
           _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps)
122*77c1e3ccSAndroid Build Coastguard Worker 
// SSE2 entry point for the 4x4 float transform.
//
// Forwards input/temp/output to aom_fft_2d_gen with size 4, the 1-D kernel
// aom_fft1d_4_sse2, the SSE2 transpose and the SSE2 output unpacker.
// NOTE(review): the trailing 4 is presumably the vector width in floats (one
// __m128) -- confirm against aom_fft_2d_gen. The helpers use aligned
// loads/stores, so the buffers presumably need 16-byte alignment -- confirm.
void aom_fft4x4_float_sse2(const float *input, float *temp, float *output) {
  aom_fft_2d_gen(input, temp, output, 4, aom_fft1d_4_sse2,
                 aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 4);
}
127*77c1e3ccSAndroid Build Coastguard Worker 
// SSE2 entry point for the 8x8 float transform.
//
// Forwards input/temp/output to aom_fft_2d_gen with size 8, the 1-D kernel
// aom_fft1d_8_sse2, the SSE2 transpose and the SSE2 output unpacker.
// NOTE(review): the trailing 4 is presumably the vector width in floats (one
// __m128) -- confirm against aom_fft_2d_gen. The helpers use aligned
// loads/stores, so the buffers presumably need 16-byte alignment -- confirm.
void aom_fft8x8_float_sse2(const float *input, float *temp, float *output) {
  aom_fft_2d_gen(input, temp, output, 8, aom_fft1d_8_sse2,
                 aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 4);
}
132*77c1e3ccSAndroid Build Coastguard Worker 
// SSE2 entry point for the 16x16 float transform.
//
// Forwards input/temp/output to aom_fft_2d_gen with size 16, the 1-D kernel
// aom_fft1d_16_sse2, the SSE2 transpose and the SSE2 output unpacker.
// NOTE(review): the trailing 4 is presumably the vector width in floats (one
// __m128) -- confirm against aom_fft_2d_gen. The helpers use aligned
// loads/stores, so the buffers presumably need 16-byte alignment -- confirm.
void aom_fft16x16_float_sse2(const float *input, float *temp, float *output) {
  aom_fft_2d_gen(input, temp, output, 16, aom_fft1d_16_sse2,
                 aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 4);
}
137*77c1e3ccSAndroid Build Coastguard Worker 
// SSE2 entry point for the 32x32 float transform.
//
// Forwards input/temp/output to aom_fft_2d_gen with size 32, the 1-D kernel
// aom_fft1d_32_sse2, the SSE2 transpose and the SSE2 output unpacker.
// NOTE(review): the trailing 4 is presumably the vector width in floats (one
// __m128) -- confirm against aom_fft_2d_gen. The helpers use aligned
// loads/stores, so the buffers presumably need 16-byte alignment -- confirm.
void aom_fft32x32_float_sse2(const float *input, float *temp, float *output) {
  aom_fft_2d_gen(input, temp, output, 32, aom_fft1d_32_sse2,
                 aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 4);
}
142*77c1e3ccSAndroid Build Coastguard Worker 
// Generate definitions for 1d inverse transforms using float and __m128.
//
// The arguments mirror the forward GEN_FFT_* invocations: function qualifiers
// (static inline void), the suffix sse2, scalar type float, vector type
// __m128, and the SSE intrinsics for load, store, broadcast (set1), add and
// subtract. The 8-, 16- and 32-point variants additionally receive
// _mm_mul_ps; the 4-point variant does not.
// NOTE(review): the macros are not defined in this file and presumably emit
// the aom_ifft1d_{4,8,16,32}_sse2 kernels used by the wrappers in this file
// -- confirm against the macro definitions.
GEN_IFFT_4(static inline void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
           _mm_set1_ps, _mm_add_ps, _mm_sub_ps)
GEN_IFFT_8(static inline void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
           _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps)
GEN_IFFT_16(static inline void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
            _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps)
GEN_IFFT_32(static inline void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
            _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps)
152*77c1e3ccSAndroid Build Coastguard Worker 
// SSE2 entry point for the 4x4 float inverse transform.
//
// Forwards input/temp/output to aom_ifft_2d_gen with size 4 and three 1-D
// kernels: aom_fft1d_4_float (not defined in this file), aom_fft1d_4_sse2
// and aom_ifft1d_4_sse2, followed by the SSE2 transpose.
// NOTE(review): the trailing 4 is presumably the vector width in floats (one
// __m128), and how aom_ifft_2d_gen uses each kernel is not visible here --
// confirm against its definition.
void aom_ifft4x4_float_sse2(const float *input, float *temp, float *output) {
  aom_ifft_2d_gen(input, temp, output, 4, aom_fft1d_4_float, aom_fft1d_4_sse2,
                  aom_ifft1d_4_sse2, aom_transpose_float_sse2, 4);
}
157*77c1e3ccSAndroid Build Coastguard Worker 
// SSE2 entry point for the 8x8 float inverse transform.
//
// Forwards input/temp/output to aom_ifft_2d_gen with size 8 and three 1-D
// kernels: aom_fft1d_8_float (not defined in this file), aom_fft1d_8_sse2
// and aom_ifft1d_8_sse2, followed by the SSE2 transpose.
// NOTE(review): the trailing 4 is presumably the vector width in floats (one
// __m128), and how aom_ifft_2d_gen uses each kernel is not visible here --
// confirm against its definition.
void aom_ifft8x8_float_sse2(const float *input, float *temp, float *output) {
  aom_ifft_2d_gen(input, temp, output, 8, aom_fft1d_8_float, aom_fft1d_8_sse2,
                  aom_ifft1d_8_sse2, aom_transpose_float_sse2, 4);
}
162*77c1e3ccSAndroid Build Coastguard Worker 
// SSE2 entry point for the 16x16 float inverse transform.
//
// Forwards input/temp/output to aom_ifft_2d_gen with size 16 and three 1-D
// kernels: aom_fft1d_16_float (not defined in this file), aom_fft1d_16_sse2
// and aom_ifft1d_16_sse2, followed by the SSE2 transpose.
// NOTE(review): the trailing 4 is presumably the vector width in floats (one
// __m128), and how aom_ifft_2d_gen uses each kernel is not visible here --
// confirm against its definition.
void aom_ifft16x16_float_sse2(const float *input, float *temp, float *output) {
  aom_ifft_2d_gen(input, temp, output, 16, aom_fft1d_16_float,
                  aom_fft1d_16_sse2, aom_ifft1d_16_sse2,
                  aom_transpose_float_sse2, 4);
}
168*77c1e3ccSAndroid Build Coastguard Worker 
// SSE2 entry point for the 32x32 float inverse transform.
//
// Forwards input/temp/output to aom_ifft_2d_gen with size 32 and three 1-D
// kernels: aom_fft1d_32_float (not defined in this file), aom_fft1d_32_sse2
// and aom_ifft1d_32_sse2, followed by the SSE2 transpose.
// NOTE(review): the trailing 4 is presumably the vector width in floats (one
// __m128), and how aom_ifft_2d_gen uses each kernel is not visible here --
// confirm against its definition.
void aom_ifft32x32_float_sse2(const float *input, float *temp, float *output) {
  aom_ifft_2d_gen(input, temp, output, 32, aom_fft1d_32_float,
                  aom_fft1d_32_sse2, aom_ifft1d_32_sse2,
                  aom_transpose_float_sse2, 4);
}
174