xref: /aosp_15_r20/external/libvpx/vp9/encoder/mips/msa/vp9_fdct16x16_msa.c (revision fb1b10ab9aebc7c7068eedab379b749d7e3900be)
/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10 
11 #include <assert.h>
12 
13 #include "./vp9_rtcd.h"
14 #include "vp9/common/vp9_enums.h"
15 #include "vp9/encoder/mips/msa/vp9_fdct_msa.h"
16 #include "vpx_dsp/mips/fwd_txfm_msa.h"
17 
fadst16_cols_step1_msa(const int16_t * input,int32_t stride,const int32_t * const0,int16_t * int_buf)18 static void fadst16_cols_step1_msa(const int16_t *input, int32_t stride,
19                                    const int32_t *const0, int16_t *int_buf) {
20   v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;
21   v8i16 tp0, tp1, tp2, tp3, g0, g1, g2, g3, g8, g9, g10, g11, h0, h1, h2, h3;
22   v4i32 k0, k1, k2, k3;
23 
24   /* load input data */
25   r0 = LD_SH(input);
26   r15 = LD_SH(input + 15 * stride);
27   r7 = LD_SH(input + 7 * stride);
28   r8 = LD_SH(input + 8 * stride);
29   SLLI_4V(r0, r15, r7, r8, 2);
30 
31   /* stage 1 */
32   LD_SW2(const0, 4, k0, k1);
33   LD_SW2(const0 + 8, 4, k2, k3);
34   MADD_BF(r15, r0, r7, r8, k0, k1, k2, k3, g0, g1, g2, g3);
35 
36   r3 = LD_SH(input + 3 * stride);
37   r4 = LD_SH(input + 4 * stride);
38   r11 = LD_SH(input + 11 * stride);
39   r12 = LD_SH(input + 12 * stride);
40   SLLI_4V(r3, r4, r11, r12, 2);
41 
42   LD_SW2(const0 + 4 * 4, 4, k0, k1);
43   LD_SW2(const0 + 4 * 6, 4, k2, k3);
44   MADD_BF(r11, r4, r3, r12, k0, k1, k2, k3, g8, g9, g10, g11);
45 
46   /* stage 2 */
47   BUTTERFLY_4(g0, g2, g10, g8, tp0, tp2, tp3, tp1);
48   ST_SH2(tp0, tp2, int_buf, 8);
49   ST_SH2(tp1, tp3, int_buf + 4 * 8, 8);
50 
51   LD_SW2(const0 + 4 * 8, 4, k0, k1);
52   k2 = LD_SW(const0 + 4 * 10);
53   MADD_BF(g1, g3, g9, g11, k0, k1, k2, k0, h0, h1, h2, h3);
54 
55   ST_SH2(h0, h1, int_buf + 8 * 8, 8);
56   ST_SH2(h3, h2, int_buf + 12 * 8, 8);
57 
58   r9 = LD_SH(input + 9 * stride);
59   r6 = LD_SH(input + 6 * stride);
60   r1 = LD_SH(input + stride);
61   r14 = LD_SH(input + 14 * stride);
62   SLLI_4V(r9, r6, r1, r14, 2);
63 
64   LD_SW2(const0 + 4 * 11, 4, k0, k1);
65   LD_SW2(const0 + 4 * 13, 4, k2, k3);
66   MADD_BF(r9, r6, r1, r14, k0, k1, k2, k3, g0, g1, g2, g3);
67 
68   ST_SH2(g1, g3, int_buf + 3 * 8, 4 * 8);
69 
70   r13 = LD_SH(input + 13 * stride);
71   r2 = LD_SH(input + 2 * stride);
72   r5 = LD_SH(input + 5 * stride);
73   r10 = LD_SH(input + 10 * stride);
74   SLLI_4V(r13, r2, r5, r10, 2);
75 
76   LD_SW2(const0 + 4 * 15, 4, k0, k1);
77   LD_SW2(const0 + 4 * 17, 4, k2, k3);
78   MADD_BF(r13, r2, r5, r10, k0, k1, k2, k3, h0, h1, h2, h3);
79 
80   ST_SH2(h1, h3, int_buf + 11 * 8, 4 * 8);
81 
82   BUTTERFLY_4(h0, h2, g2, g0, tp0, tp1, tp2, tp3);
83   ST_SH4(tp0, tp1, tp2, tp3, int_buf + 2 * 8, 4 * 8);
84 }
85 
fadst16_cols_step2_msa(int16_t * int_buf,const int32_t * const0,int16_t * out)86 static void fadst16_cols_step2_msa(int16_t *int_buf, const int32_t *const0,
87                                    int16_t *out) {
88   int16_t *out_ptr = out + 128;
89   v8i16 tp0, tp1, tp2, tp3, g5, g7, g13, g15;
90   v8i16 h0, h1, h2, h3, h4, h5, h6, h7, h10, h11;
91   v8i16 out0, out1, out2, out3, out4, out5, out6, out7;
92   v8i16 out8, out9, out10, out11, out12, out13, out14, out15;
93   v4i32 k0, k1, k2, k3;
94 
95   LD_SH2(int_buf + 3 * 8, 4 * 8, g13, g15);
96   LD_SH2(int_buf + 11 * 8, 4 * 8, g5, g7);
97   LD_SW2(const0 + 4 * 19, 4, k0, k1);
98   k2 = LD_SW(const0 + 4 * 21);
99   MADD_BF(g7, g5, g15, g13, k0, k1, k2, k0, h4, h5, h6, h7);
100 
101   tp0 = LD_SH(int_buf + 4 * 8);
102   tp1 = LD_SH(int_buf + 5 * 8);
103   tp3 = LD_SH(int_buf + 10 * 8);
104   tp2 = LD_SH(int_buf + 14 * 8);
105   LD_SW2(const0 + 4 * 22, 4, k0, k1);
106   k2 = LD_SW(const0 + 4 * 24);
107   MADD_BF(tp0, tp1, tp2, tp3, k0, k1, k2, k0, out4, out6, out5, out7);
108   out4 = -out4;
109   ST_SH(out4, (out + 3 * 16));
110   ST_SH(out5, (out_ptr + 4 * 16));
111 
112   h1 = LD_SH(int_buf + 9 * 8);
113   h3 = LD_SH(int_buf + 12 * 8);
114   MADD_BF(h1, h3, h5, h7, k0, k1, k2, k0, out12, out14, out13, out15);
115   out13 = -out13;
116   ST_SH(out12, (out + 2 * 16));
117   ST_SH(out13, (out_ptr + 5 * 16));
118 
119   tp0 = LD_SH(int_buf);
120   tp1 = LD_SH(int_buf + 8);
121   tp2 = LD_SH(int_buf + 2 * 8);
122   tp3 = LD_SH(int_buf + 6 * 8);
123 
124   BUTTERFLY_4(tp0, tp1, tp3, tp2, out0, out1, h11, h10);
125   out1 = -out1;
126   ST_SH(out0, (out));
127   ST_SH(out1, (out_ptr + 7 * 16));
128 
129   h0 = LD_SH(int_buf + 8 * 8);
130   h2 = LD_SH(int_buf + 13 * 8);
131 
132   BUTTERFLY_4(h0, h2, h6, h4, out8, out9, out11, out10);
133   out8 = -out8;
134   ST_SH(out8, (out + 16));
135   ST_SH(out9, (out_ptr + 6 * 16));
136 
137   /* stage 4 */
138   LD_SW2(const0 + 4 * 25, 4, k0, k1);
139   LD_SW2(const0 + 4 * 27, 4, k2, k3);
140   MADD_SHORT(h10, h11, k1, k2, out2, out3);
141   ST_SH(out2, (out + 7 * 16));
142   ST_SH(out3, (out_ptr));
143 
144   MADD_SHORT(out6, out7, k0, k3, out6, out7);
145   ST_SH(out6, (out + 4 * 16));
146   ST_SH(out7, (out_ptr + 3 * 16));
147 
148   MADD_SHORT(out10, out11, k0, k3, out10, out11);
149   ST_SH(out10, (out + 6 * 16));
150   ST_SH(out11, (out_ptr + 16));
151 
152   MADD_SHORT(out14, out15, k1, k2, out14, out15);
153   ST_SH(out14, (out + 5 * 16));
154   ST_SH(out15, (out_ptr + 2 * 16));
155 }
156 
fadst16_transpose_postproc_msa(int16_t * input,int16_t * out)157 static void fadst16_transpose_postproc_msa(int16_t *input, int16_t *out) {
158   v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;
159   v8i16 l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13, l14, l15;
160 
161   /* load input data */
162   LD_SH8(input, 16, l0, l1, l2, l3, l4, l5, l6, l7);
163   TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7, r0, r1, r2, r3, r4, r5, r6,
164                      r7);
165   FDCT_POSTPROC_2V_NEG_H(r0, r1);
166   FDCT_POSTPROC_2V_NEG_H(r2, r3);
167   FDCT_POSTPROC_2V_NEG_H(r4, r5);
168   FDCT_POSTPROC_2V_NEG_H(r6, r7);
169   ST_SH8(r0, r1, r2, r3, r4, r5, r6, r7, out, 8);
170   out += 64;
171 
172   LD_SH8(input + 8, 16, l8, l9, l10, l11, l12, l13, l14, l15);
173   TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15, r8, r9, r10, r11,
174                      r12, r13, r14, r15);
175   FDCT_POSTPROC_2V_NEG_H(r8, r9);
176   FDCT_POSTPROC_2V_NEG_H(r10, r11);
177   FDCT_POSTPROC_2V_NEG_H(r12, r13);
178   FDCT_POSTPROC_2V_NEG_H(r14, r15);
179   ST_SH8(r8, r9, r10, r11, r12, r13, r14, r15, out, 8);
180   out += 64;
181 
182   /* load input data */
183   input += 128;
184   LD_SH8(input, 16, l0, l1, l2, l3, l4, l5, l6, l7);
185   TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7, r0, r1, r2, r3, r4, r5, r6,
186                      r7);
187   FDCT_POSTPROC_2V_NEG_H(r0, r1);
188   FDCT_POSTPROC_2V_NEG_H(r2, r3);
189   FDCT_POSTPROC_2V_NEG_H(r4, r5);
190   FDCT_POSTPROC_2V_NEG_H(r6, r7);
191   ST_SH8(r0, r1, r2, r3, r4, r5, r6, r7, out, 8);
192   out += 64;
193 
194   LD_SH8(input + 8, 16, l8, l9, l10, l11, l12, l13, l14, l15);
195   TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15, r8, r9, r10, r11,
196                      r12, r13, r14, r15);
197   FDCT_POSTPROC_2V_NEG_H(r8, r9);
198   FDCT_POSTPROC_2V_NEG_H(r10, r11);
199   FDCT_POSTPROC_2V_NEG_H(r12, r13);
200   FDCT_POSTPROC_2V_NEG_H(r14, r15);
201   ST_SH8(r8, r9, r10, r11, r12, r13, r14, r15, out, 8);
202 }
203 
fadst16_rows_step1_msa(int16_t * input,const int32_t * const0,int16_t * int_buf)204 static void fadst16_rows_step1_msa(int16_t *input, const int32_t *const0,
205                                    int16_t *int_buf) {
206   v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;
207   v8i16 tp0, tp1, tp2, tp3, g0, g1, g2, g3, g8, g9, g10, g11, h0, h1, h2, h3;
208   v4i32 k0, k1, k2, k3;
209 
210   /* load input data */
211   r0 = LD_SH(input);
212   r7 = LD_SH(input + 7 * 8);
213   r8 = LD_SH(input + 8 * 8);
214   r15 = LD_SH(input + 15 * 8);
215 
216   /* stage 1 */
217   LD_SW2(const0, 4, k0, k1);
218   LD_SW2(const0 + 4 * 2, 4, k2, k3);
219   MADD_BF(r15, r0, r7, r8, k0, k1, k2, k3, g0, g1, g2, g3);
220 
221   r3 = LD_SH(input + 3 * 8);
222   r4 = LD_SH(input + 4 * 8);
223   r11 = LD_SH(input + 11 * 8);
224   r12 = LD_SH(input + 12 * 8);
225 
226   LD_SW2(const0 + 4 * 4, 4, k0, k1);
227   LD_SW2(const0 + 4 * 6, 4, k2, k3);
228   MADD_BF(r11, r4, r3, r12, k0, k1, k2, k3, g8, g9, g10, g11);
229 
230   /* stage 2 */
231   BUTTERFLY_4(g0, g2, g10, g8, tp0, tp2, tp3, tp1);
232   ST_SH2(tp0, tp1, int_buf, 4 * 8);
233   ST_SH2(tp2, tp3, int_buf + 8, 4 * 8);
234 
235   LD_SW2(const0 + 4 * 8, 4, k0, k1);
236   k2 = LD_SW(const0 + 4 * 10);
237   MADD_BF(g1, g3, g9, g11, k0, k1, k2, k0, h0, h1, h2, h3);
238   ST_SH2(h0, h3, int_buf + 8 * 8, 4 * 8);
239   ST_SH2(h1, h2, int_buf + 9 * 8, 4 * 8);
240 
241   r1 = LD_SH(input + 8);
242   r6 = LD_SH(input + 6 * 8);
243   r9 = LD_SH(input + 9 * 8);
244   r14 = LD_SH(input + 14 * 8);
245 
246   LD_SW2(const0 + 4 * 11, 4, k0, k1);
247   LD_SW2(const0 + 4 * 13, 4, k2, k3);
248   MADD_BF(r9, r6, r1, r14, k0, k1, k2, k3, g0, g1, g2, g3);
249   ST_SH2(g1, g3, int_buf + 3 * 8, 4 * 8);
250 
251   r2 = LD_SH(input + 2 * 8);
252   r5 = LD_SH(input + 5 * 8);
253   r10 = LD_SH(input + 10 * 8);
254   r13 = LD_SH(input + 13 * 8);
255 
256   LD_SW2(const0 + 4 * 15, 4, k0, k1);
257   LD_SW2(const0 + 4 * 17, 4, k2, k3);
258   MADD_BF(r13, r2, r5, r10, k0, k1, k2, k3, h0, h1, h2, h3);
259   ST_SH2(h1, h3, int_buf + 11 * 8, 4 * 8);
260   BUTTERFLY_4(h0, h2, g2, g0, tp0, tp1, tp2, tp3);
261   ST_SH4(tp0, tp1, tp2, tp3, int_buf + 2 * 8, 4 * 8);
262 }
263 
fadst16_rows_step2_msa(int16_t * int_buf,const int32_t * const0,int16_t * out)264 static void fadst16_rows_step2_msa(int16_t *int_buf, const int32_t *const0,
265                                    int16_t *out) {
266   int16_t *out_ptr = out + 8;
267   v8i16 tp0, tp1, tp2, tp3, g5, g7, g13, g15;
268   v8i16 h0, h1, h2, h3, h4, h5, h6, h7, h10, h11;
269   v8i16 out0, out1, out2, out3, out4, out5, out6, out7;
270   v8i16 out8, out9, out10, out11, out12, out13, out14, out15;
271   v4i32 k0, k1, k2, k3;
272 
273   g13 = LD_SH(int_buf + 3 * 8);
274   g15 = LD_SH(int_buf + 7 * 8);
275   g5 = LD_SH(int_buf + 11 * 8);
276   g7 = LD_SH(int_buf + 15 * 8);
277 
278   LD_SW2(const0 + 4 * 19, 4, k0, k1);
279   k2 = LD_SW(const0 + 4 * 21);
280   MADD_BF(g7, g5, g15, g13, k0, k1, k2, k0, h4, h5, h6, h7);
281 
282   tp0 = LD_SH(int_buf + 4 * 8);
283   tp1 = LD_SH(int_buf + 5 * 8);
284   tp3 = LD_SH(int_buf + 10 * 8);
285   tp2 = LD_SH(int_buf + 14 * 8);
286 
287   LD_SW2(const0 + 4 * 22, 4, k0, k1);
288   k2 = LD_SW(const0 + 4 * 24);
289   MADD_BF(tp0, tp1, tp2, tp3, k0, k1, k2, k0, out4, out6, out5, out7);
290   out4 = -out4;
291   ST_SH(out4, (out + 3 * 16));
292   ST_SH(out5, (out_ptr + 4 * 16));
293 
294   h1 = LD_SH(int_buf + 9 * 8);
295   h3 = LD_SH(int_buf + 12 * 8);
296   MADD_BF(h1, h3, h5, h7, k0, k1, k2, k0, out12, out14, out13, out15);
297   out13 = -out13;
298   ST_SH(out12, (out + 2 * 16));
299   ST_SH(out13, (out_ptr + 5 * 16));
300 
301   tp0 = LD_SH(int_buf);
302   tp1 = LD_SH(int_buf + 8);
303   tp2 = LD_SH(int_buf + 2 * 8);
304   tp3 = LD_SH(int_buf + 6 * 8);
305 
306   BUTTERFLY_4(tp0, tp1, tp3, tp2, out0, out1, h11, h10);
307   out1 = -out1;
308   ST_SH(out0, (out));
309   ST_SH(out1, (out_ptr + 7 * 16));
310 
311   h0 = LD_SH(int_buf + 8 * 8);
312   h2 = LD_SH(int_buf + 13 * 8);
313   BUTTERFLY_4(h0, h2, h6, h4, out8, out9, out11, out10);
314   out8 = -out8;
315   ST_SH(out8, (out + 16));
316   ST_SH(out9, (out_ptr + 6 * 16));
317 
318   /* stage 4 */
319   LD_SW2(const0 + 4 * 25, 4, k0, k1);
320   LD_SW2(const0 + 4 * 27, 4, k2, k3);
321   MADD_SHORT(h10, h11, k1, k2, out2, out3);
322   ST_SH(out2, (out + 7 * 16));
323   ST_SH(out3, (out_ptr));
324 
325   MADD_SHORT(out6, out7, k0, k3, out6, out7);
326   ST_SH(out6, (out + 4 * 16));
327   ST_SH(out7, (out_ptr + 3 * 16));
328 
329   MADD_SHORT(out10, out11, k0, k3, out10, out11);
330   ST_SH(out10, (out + 6 * 16));
331   ST_SH(out11, (out_ptr + 16));
332 
333   MADD_SHORT(out14, out15, k1, k2, out14, out15);
334   ST_SH(out14, (out + 5 * 16));
335   ST_SH(out15, (out_ptr + 2 * 16));
336 }
337 
fadst16_transpose_msa(int16_t * input,int16_t * out)338 static void fadst16_transpose_msa(int16_t *input, int16_t *out) {
339   v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;
340   v8i16 l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13, l14, l15;
341 
342   /* load input data */
343   LD_SH16(input, 8, l0, l8, l1, l9, l2, l10, l3, l11, l4, l12, l5, l13, l6, l14,
344           l7, l15);
345   TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7, r0, r1, r2, r3, r4, r5, r6,
346                      r7);
347   TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15, r8, r9, r10, r11,
348                      r12, r13, r14, r15);
349   ST_SH8(r0, r8, r1, r9, r2, r10, r3, r11, out, 8);
350   ST_SH8(r4, r12, r5, r13, r6, r14, r7, r15, (out + 64), 8);
351   out += 16 * 8;
352 
353   /* load input data */
354   input += 128;
355   LD_SH16(input, 8, l0, l8, l1, l9, l2, l10, l3, l11, l4, l12, l5, l13, l6, l14,
356           l7, l15);
357   TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7, r0, r1, r2, r3, r4, r5, r6,
358                      r7);
359   TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15, r8, r9, r10, r11,
360                      r12, r13, r14, r15);
361   ST_SH8(r0, r8, r1, r9, r2, r10, r3, r11, out, 8);
362   ST_SH8(r4, r12, r5, r13, r6, r14, r7, r15, (out + 64), 8);
363 }
364 
postproc_fdct16x8_1d_row(int16_t * intermediate,int16_t * output)365 static void postproc_fdct16x8_1d_row(int16_t *intermediate, int16_t *output) {
366   int16_t *temp = intermediate;
367   int16_t *out = output;
368   v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
369   v8i16 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11;
370   v8i16 in12, in13, in14, in15;
371 
372   LD_SH8(temp, 16, in0, in1, in2, in3, in4, in5, in6, in7);
373   temp = intermediate + 8;
374   LD_SH8(temp, 16, in8, in9, in10, in11, in12, in13, in14, in15);
375   TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
376                      in4, in5, in6, in7);
377   TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
378                      in10, in11, in12, in13, in14, in15);
379   FDCT_POSTPROC_2V_NEG_H(in0, in1);
380   FDCT_POSTPROC_2V_NEG_H(in2, in3);
381   FDCT_POSTPROC_2V_NEG_H(in4, in5);
382   FDCT_POSTPROC_2V_NEG_H(in6, in7);
383   FDCT_POSTPROC_2V_NEG_H(in8, in9);
384   FDCT_POSTPROC_2V_NEG_H(in10, in11);
385   FDCT_POSTPROC_2V_NEG_H(in12, in13);
386   FDCT_POSTPROC_2V_NEG_H(in14, in15);
387   BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11,
388                in12, in13, in14, in15, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6,
389                tmp7, in8, in9, in10, in11, in12, in13, in14, in15);
390   temp = intermediate;
391   ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, temp, 16);
392   FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp0, tmp1,
393                 tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
394   temp = intermediate;
395   LD_SH8(temp, 16, in8, in9, in10, in11, in12, in13, in14, in15);
396   FDCT8x16_ODD(in8, in9, in10, in11, in12, in13, in14, in15, in0, in1, in2, in3,
397                in4, in5, in6, in7);
398   TRANSPOSE8x8_SH_SH(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, tmp0, in0,
399                      tmp1, in1, tmp2, in2, tmp3, in3);
400   ST_SH8(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, out, 16);
401   TRANSPOSE8x8_SH_SH(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, tmp4, in4,
402                      tmp5, in5, tmp6, in6, tmp7, in7);
403   out = output + 8;
404   ST_SH8(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, out, 16);
405 }
406 
vp9_fht16x16_msa(const int16_t * input,int16_t * output,int32_t stride,int32_t tx_type)407 void vp9_fht16x16_msa(const int16_t *input, int16_t *output, int32_t stride,
408                       int32_t tx_type) {
409   DECLARE_ALIGNED(32, int16_t, tmp[256]);
410   DECLARE_ALIGNED(32, int16_t, trans_buf[256]);
411   DECLARE_ALIGNED(32, int16_t, tmp_buf[128]);
412   int32_t i;
413   int16_t *ptmpbuf = &tmp_buf[0];
414   int16_t *trans = &trans_buf[0];
415   const int32_t const_arr[29 * 4] = {
416     52707308,    52707308,    52707308,    52707308,    -1072430300,
417     -1072430300, -1072430300, -1072430300, 795618043,   795618043,
418     795618043,   795618043,   -721080468,  -721080468,  -721080468,
419     -721080468,  459094491,   459094491,   459094491,   459094491,
420     -970646691,  -970646691,  -970646691,  -970646691,  1010963856,
421     1010963856,  1010963856,  1010963856,  -361743294,  -361743294,
422     -361743294,  -361743294,  209469125,   209469125,   209469125,
423     209469125,   -1053094788, -1053094788, -1053094788, -1053094788,
424     1053160324,  1053160324,  1053160324,  1053160324,  639644520,
425     639644520,   639644520,   639644520,   -862444000,  -862444000,
426     -862444000,  -862444000,  1062144356,  1062144356,  1062144356,
427     1062144356,  -157532337,  -157532337,  -157532337,  -157532337,
428     260914709,   260914709,   260914709,   260914709,   -1041559667,
429     -1041559667, -1041559667, -1041559667, 920985831,   920985831,
430     920985831,   920985831,   -551995675,  -551995675,  -551995675,
431     -551995675,  596522295,   596522295,   596522295,   596522295,
432     892853362,   892853362,   892853362,   892853362,   -892787826,
433     -892787826,  -892787826,  -892787826,  410925857,   410925857,
434     410925857,   410925857,   -992012162,  -992012162,  -992012162,
435     -992012162,  992077698,   992077698,   992077698,   992077698,
436     759246145,   759246145,   759246145,   759246145,   -759180609,
437     -759180609,  -759180609,  -759180609,  -759222975,  -759222975,
438     -759222975,  -759222975,  759288511,   759288511,   759288511,
439     759288511
440   };
441 
442   switch (tx_type) {
443     case DCT_DCT:
444       /* column transform */
445       for (i = 0; i < 2; ++i) {
446         fdct8x16_1d_column(input + 8 * i, tmp + 8 * i, stride);
447       }
448 
449       /* row transform */
450       for (i = 0; i < 2; ++i) {
451         fdct16x8_1d_row(tmp + (128 * i), output + (128 * i));
452       }
453       break;
454     case ADST_DCT:
455       /* column transform */
456       for (i = 0; i < 2; ++i) {
457         fadst16_cols_step1_msa(input + (i << 3), stride, const_arr, ptmpbuf);
458         fadst16_cols_step2_msa(ptmpbuf, const_arr, tmp + (i << 3));
459       }
460 
461       /* row transform */
462       for (i = 0; i < 2; ++i) {
463         postproc_fdct16x8_1d_row(tmp + (128 * i), output + (128 * i));
464       }
465       break;
466     case DCT_ADST:
467       /* column transform */
468       for (i = 0; i < 2; ++i) {
469         fdct8x16_1d_column(input + 8 * i, tmp + 8 * i, stride);
470       }
471 
472       fadst16_transpose_postproc_msa(tmp, trans);
473 
474       /* row transform */
475       for (i = 0; i < 2; ++i) {
476         fadst16_rows_step1_msa(trans + (i << 7), const_arr, ptmpbuf);
477         fadst16_rows_step2_msa(ptmpbuf, const_arr, tmp + (i << 7));
478       }
479 
480       fadst16_transpose_msa(tmp, output);
481       break;
482     case ADST_ADST:
483       /* column transform */
484       for (i = 0; i < 2; ++i) {
485         fadst16_cols_step1_msa(input + (i << 3), stride, const_arr, ptmpbuf);
486         fadst16_cols_step2_msa(ptmpbuf, const_arr, tmp + (i << 3));
487       }
488 
489       fadst16_transpose_postproc_msa(tmp, trans);
490 
491       /* row transform */
492       for (i = 0; i < 2; ++i) {
493         fadst16_rows_step1_msa(trans + (i << 7), const_arr, ptmpbuf);
494         fadst16_rows_step2_msa(ptmpbuf, const_arr, tmp + (i << 7));
495       }
496 
497       fadst16_transpose_msa(tmp, output);
498       break;
499     default: assert(0); break;
500   }
501 }
502