xref: /aosp_15_r20/external/libaom/aom_dsp/fwd_txfm.c (revision 77c1e3ccc04c968bd2bc212e87364f250e820521)
1 /*
2  * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  */
11 
12 #include <assert.h>
13 #include "aom_dsp/txfm_common.h"
14 #include "config/aom_dsp_rtcd.h"
15 
// Forward 4x4 DCT, plain C version.
//
// Reads a 4x4 block of samples from `input`, where consecutive rows are
// `stride` elements apart, and writes 16 coefficients to `output`, which is
// addressed as a dense 4x4 array (row pitch 4).
void aom_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) {
  // The same 1-D butterfly is applied twice. Pass 0 walks the columns of
  // `input` and stores every result contiguously in `scratch`, which
  // transposes the block. Pass 1 walks the columns of `scratch` and stores
  // every result as a column of `output`.
  tran_low_t scratch[4 * 4];
  const tran_low_t *src = NULL;
  tran_low_t *dst = scratch;

  for (int pass = 0; pass < 2; ++pass) {
    const int first_pass = (pass == 0);
    // Spacing between the four results of one 1-D transform, and between the
    // start of two consecutive 1-D results, inside the destination buffer.
    const int coef_step = first_pass ? 1 : 4;
    const int vec_step = first_pass ? 4 : 1;

    for (int col = 0; col < 4; ++col) {
      tran_high_t v[4];  // canbe16

      // Gather the four inputs of this 1-D transform.
      if (first_pass) {
        for (int k = 0; k < 4; ++k) v[k] = input[k * stride] * 16;
        // Only the first column's top entry is bumped, and only if non-zero.
        if (col == 0 && v[0]) ++v[0];
        ++input;  // Advance to the next column.
      } else {
        assert(src != NULL);
        for (int k = 0; k < 4; ++k) v[k] = src[k * 4];
        ++src;  // Advance to the next column (a transposed row).
      }

      // Butterfly.
      const tran_high_t sum_outer = v[0] + v[3];   // canbe16
      const tran_high_t sum_inner = v[1] + v[2];   // canbe16
      const tran_high_t diff_inner = v[1] - v[2];  // canbe16
      const tran_high_t diff_outer = v[0] - v[3];  // canbe16

      const tran_low_t c0 =
          (tran_low_t)fdct_round_shift((sum_outer + sum_inner) * cospi_16_64);
      const tran_low_t c2 =
          (tran_low_t)fdct_round_shift((sum_outer - sum_inner) * cospi_16_64);
      const tran_low_t c1 = (tran_low_t)fdct_round_shift(
          diff_inner * cospi_24_64 + diff_outer * cospi_8_64);
      const tran_low_t c3 = (tran_low_t)fdct_round_shift(
          -diff_inner * cospi_8_64 + diff_outer * cospi_24_64);

      // Scatter the results; the step values decide whether this transposes.
      dst[0 * coef_step] = c0;
      dst[1 * coef_step] = c1;
      dst[2 * coef_step] = c2;
      dst[3 * coef_step] = c3;
      dst += vec_step;
    }

    // The second pass consumes the scratch block and fills the caller's buffer.
    src = scratch;
    dst = output;
  }

  // Final scaling: add one, then arithmetic shift right by two, per coefficient.
  for (int idx = 0; idx < 4 * 4; ++idx) {
    output[idx] = (output[idx] + 1) >> 2;
  }
}
84 
aom_fdct4x4_lp_c(const int16_t * input,int16_t * output,int stride)85 void aom_fdct4x4_lp_c(const int16_t *input, int16_t *output, int stride) {
86   // The 2D transform is done with two passes which are actually pretty
87   // similar. In the first one, we transform the columns and transpose
88   // the results. In the second one, we transform the rows.
89   // We need an intermediate buffer between passes.
90   int16_t intermediate[4 * 4];
91   const int16_t *in_low = NULL;
92   int16_t *out = intermediate;
93   // Do the two transform passes
94   for (int pass = 0; pass < 2; ++pass) {
95     int32_t in_high[4];  // canbe16
96     int32_t step[4];     // canbe16
97     int16_t temp[4];
98     for (int i = 0; i < 4; ++i) {
99       // Load inputs.
100       if (pass == 0) {
101         in_high[0] = input[0 * stride] * 16;
102         in_high[1] = input[1 * stride] * 16;
103         in_high[2] = input[2 * stride] * 16;
104         in_high[3] = input[3 * stride] * 16;
105         ++input;
106         if (i == 0 && in_high[0]) {
107           ++in_high[0];
108         }
109       } else {
110         assert(in_low != NULL);
111         in_high[0] = in_low[0 * 4];
112         in_high[1] = in_low[1 * 4];
113         in_high[2] = in_low[2 * 4];
114         in_high[3] = in_low[3 * 4];
115         ++in_low;
116       }
117       // Transform.
118       step[0] = in_high[0] + in_high[3];
119       step[1] = in_high[1] + in_high[2];
120       step[2] = in_high[1] - in_high[2];
121       step[3] = in_high[0] - in_high[3];
122       temp[0] = (int16_t)fdct_round_shift((step[0] + step[1]) * cospi_16_64);
123       temp[2] = (int16_t)fdct_round_shift((step[0] - step[1]) * cospi_16_64);
124       temp[1] = (int16_t)fdct_round_shift(step[2] * cospi_24_64 +
125                                           step[3] * cospi_8_64);
126       temp[3] = (int16_t)fdct_round_shift(-step[2] * cospi_8_64 +
127                                           step[3] * cospi_24_64);
128       // Only transpose the first pass.
129       if (pass == 0) {
130         out[0] = temp[0];
131         out[1] = temp[1];
132         out[2] = temp[2];
133         out[3] = temp[3];
134         out += 4;
135       } else {
136         out[0 * 4] = temp[0];
137         out[1 * 4] = temp[1];
138         out[2 * 4] = temp[2];
139         out[3 * 4] = temp[3];
140         ++out;
141       }
142     }
143     // Setup in/out for next pass.
144     in_low = intermediate;
145     out = output;
146   }
147 
148   for (int i = 0; i < 4; ++i) {
149     for (int j = 0; j < 4; ++j)
150       output[j + i * 4] = (output[j + i * 4] + 1) >> 2;
151   }
152 }
153 
154 #if CONFIG_INTERNAL_STATS
// Forward 8x8 DCT, plain C version.
//
// Reads an 8x8 block of samples from `input`, where consecutive rows are
// `stride` elements apart, and writes 64 coefficients to `final_output`,
// which is addressed as a dense 8x8 array (row pitch 8).
void aom_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) {
  // Two passes of the same 1-D transform. Each pass reads a column of its
  // source and writes the eight results contiguously, so each pass transposes.
  // Pass 0 goes from `input` to `scratch`, pass 1 from `scratch` to
  // `final_output`.
  tran_low_t scratch[64];
  tran_low_t *dst = scratch;
  const tran_low_t *src = NULL;

  for (int pass = 0; pass < 2; ++pass) {
    for (int col = 0; col < 8; ++col) {
      tran_high_t s[8];  // canbe16

      // Stage 1: mirrored sums go to s[0..3], mirrored differences to s[7..4].
      if (pass == 0) {
        for (int k = 0; k < 4; ++k) {
          const int16_t top = input[k * stride];
          const int16_t bottom = input[(7 - k) * stride];
          // First-pass inputs are scaled by 4.
          s[k] = (top + bottom) * 4;
          s[7 - k] = (top - bottom) * 4;
        }
        ++input;
      } else {
        for (int k = 0; k < 4; ++k) {
          const tran_low_t top = src[k * 8];
          const tran_low_t bottom = src[(7 - k) * 8];
          s[k] = top + bottom;
          s[7 - k] = top - bottom;
        }
        ++src;
      }

      // Even-indexed outputs: a 4-point transform on s[0..3].
      const tran_high_t e0 = s[0] + s[3];  // canbe16
      const tran_high_t e1 = s[1] + s[2];  // canbe16
      const tran_high_t e2 = s[1] - s[2];  // canbe16
      const tran_high_t e3 = s[0] - s[3];  // canbe16
      dst[0] = (tran_low_t)fdct_round_shift((e0 + e1) * cospi_16_64);
      dst[2] =
          (tran_low_t)fdct_round_shift(e2 * cospi_24_64 + e3 * cospi_8_64);
      dst[4] = (tran_low_t)fdct_round_shift((e0 - e1) * cospi_16_64);
      dst[6] =
          (tran_low_t)fdct_round_shift(-e2 * cospi_8_64 + e3 * cospi_24_64);

      // Stage 2: rotate the two middle differences.
      const tran_high_t r_diff =
          fdct_round_shift((s[6] - s[5]) * cospi_16_64);  // needs32
      const tran_high_t r_sum =
          fdct_round_shift((s[6] + s[5]) * cospi_16_64);  // needs32

      // Stage 3: combine with the outer differences.
      const tran_high_t o0 = s[4] + r_diff;  // canbe16
      const tran_high_t o1 = s[4] - r_diff;  // canbe16
      const tran_high_t o2 = s[7] - r_sum;   // canbe16
      const tran_high_t o3 = s[7] + r_sum;   // canbe16

      // Stage 4: odd-indexed outputs.
      dst[1] =
          (tran_low_t)fdct_round_shift(o0 * cospi_28_64 + o3 * cospi_4_64);
      dst[3] =
          (tran_low_t)fdct_round_shift(o2 * cospi_12_64 + o1 * -cospi_20_64);
      dst[5] =
          (tran_low_t)fdct_round_shift(o1 * cospi_12_64 + o2 * cospi_20_64);
      dst[7] =
          (tran_low_t)fdct_round_shift(o3 * cospi_28_64 + o0 * -cospi_4_64);

      dst += 8;
    }

    // The second pass consumes the scratch block and fills the caller's buffer.
    src = scratch;
    dst = final_output;
  }

  // Final scaling: halve every coefficient (integer division).
  for (int idx = 0; idx < 64; ++idx) {
    final_output[idx] /= 2;
  }
}
238 #endif  // CONFIG_INTERNAL_STATS
239 
240 #if CONFIG_AV1_HIGHBITDEPTH && CONFIG_INTERNAL_STATS
// High-bitdepth entry point for the forward 8x8 DCT. It forwards all of its
// arguments unchanged to aom_fdct8x8_c().
void aom_highbd_fdct8x8_c(const int16_t *input, tran_low_t *final_output,
                          int stride) {
  aom_fdct8x8_c(input, final_output, stride);
}
245 #endif
246