/*
 *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/loongarch/fwd_txfm_lsx.h"

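/* Unpack one vector of 16 unsigned bytes into two vectors of eight 16-bit
 * halfwords: _out0 receives the low eight bytes zero-extended, _out1 the
 * high eight. Used by the DC-only path below to widen destination pixels
 * before adding the DC term. */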
#define UNPCK_UB_SH(_in, _out0, _out1)   \
  do {                                   \
    _out0 = __lsx_vsllwil_hu_bu(_in, 0); \
    _out1 = __lsx_vexth_hu_bu(_in);      \
  } while (0)

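/* Load a 32x8 block of coefficients (each __m128i holds eight int16 values,
 * so vld/vst byte offsets step in units of 16), transpose it as four 8x8
 * tiles and store the transposed vectors contiguously in tmp_buf. */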
static void idct32x8_row_transpose_store(const int16_t *input,
                                         int16_t *tmp_buf) {
  __m128i m0, m1, m2, m3, m4, m5, m6, m7;
  __m128i n0, n1, n2, n3, n4, n5, n6, n7;

  /* 1st & 2nd 8x8 */
  DUP4_ARG2(__lsx_vld, input, 0, input, 64, input, 128, input, 192, m0, n0, m1,
            n1);
  DUP4_ARG2(__lsx_vld, input, 256, input, 320, input, 384, input, 448, m2, n2,
            m3, n3);
  DUP4_ARG2(__lsx_vld, input, 16, input, 80, input, 144, input, 208, m4, n4, m5,
            n5);
  DUP4_ARG2(__lsx_vld, input, 272, input, 336, input, 400, input, 464, m6, n6,
            m7, n7);

  LSX_TRANSPOSE8x8_H(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3,
                     n3);
  LSX_TRANSPOSE8x8_H(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7,
                     n7);

  __lsx_vst(m0, tmp_buf, 0);
  __lsx_vst(n0, tmp_buf, 16);
  __lsx_vst(m1, tmp_buf, 32);
  __lsx_vst(n1, tmp_buf, 48);
  __lsx_vst(m2, tmp_buf, 64);
  __lsx_vst(n2, tmp_buf, 80);
  __lsx_vst(m3, tmp_buf, 96);
  __lsx_vst(n3, tmp_buf, 112);
  __lsx_vst(m4, tmp_buf, 128);
  __lsx_vst(n4, tmp_buf, 144);
  __lsx_vst(m5, tmp_buf, 160);
  __lsx_vst(n5, tmp_buf, 176);
  __lsx_vst(m6, tmp_buf, 192);
  __lsx_vst(n6, tmp_buf, 208);
  __lsx_vst(m7, tmp_buf, 224);
  __lsx_vst(n7, tmp_buf, 240);

  /* 3rd & 4th 8x8 */
  DUP4_ARG2(__lsx_vld, input, 32, input, 96, input, 160, input, 224, m0, n0, m1,
            n1);
  DUP4_ARG2(__lsx_vld, input, 288, input, 352, input, 416, input, 480, m2, n2,
            m3, n3);
  DUP4_ARG2(__lsx_vld, input, 48, input, 112, input, 176, input, 240, m4, n4,
            m5, n5);
  DUP4_ARG2(__lsx_vld, input, 304, input, 368, input, 432, input, 496, m6, n6,
            m7, n7);

  LSX_TRANSPOSE8x8_H(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3,
                     n3);
  LSX_TRANSPOSE8x8_H(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7,
                     n7);

  __lsx_vst(m0, tmp_buf, 256);
  __lsx_vst(n0, tmp_buf, 272);
  __lsx_vst(m1, tmp_buf, 288);
  __lsx_vst(n1, tmp_buf, 304);
  __lsx_vst(m2, tmp_buf, 320);
  __lsx_vst(n2, tmp_buf, 336);
  __lsx_vst(m3, tmp_buf, 352);
  __lsx_vst(n3, tmp_buf, 368);
  __lsx_vst(m4, tmp_buf, 384);
  __lsx_vst(n4, tmp_buf, 400);
  __lsx_vst(m5, tmp_buf, 416);
  __lsx_vst(n5, tmp_buf, 432);
  __lsx_vst(m6, tmp_buf, 448);
  __lsx_vst(n6, tmp_buf, 464);
  __lsx_vst(m7, tmp_buf, 480);
  __lsx_vst(n7, tmp_buf, 496);
}

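/* Even half of the 32-point row IDCT: the 16 even-indexed inputs are read
 * from the transposed buffer, passed through the even butterfly stages, and
 * the 16 results are stored to tmp_eve_buf. */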
static void idct32x8_row_even_process_store(int16_t *tmp_buf,
                                            int16_t *tmp_eve_buf) {
  __m128i vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
  __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
  __m128i stp0, stp1, stp2, stp3, stp4, stp5, stp6, stp7;
  __m128i tmp0;

  /* Even stage 1 */
  DUP4_ARG2(__lsx_vld, tmp_buf, 0, tmp_buf, 64, tmp_buf, 128, tmp_buf, 192,
            reg0, reg1, reg2, reg3);
  DUP4_ARG2(__lsx_vld, tmp_buf, 256, tmp_buf, 320, tmp_buf, 384, tmp_buf, 448,
            reg4, reg5, reg6, reg7);

  DOTP_CONST_PAIR(reg1, reg7, cospi_28_64, cospi_4_64, reg1, reg7);
  DOTP_CONST_PAIR(reg5, reg3, cospi_12_64, cospi_20_64, reg5, reg3);
  LSX_BUTTERFLY_4_H(reg1, reg7, reg3, reg5, vec1, vec3, vec2, vec0);
  DOTP_CONST_PAIR(vec2, vec0, cospi_16_64, cospi_16_64, loc2, loc3);

  loc1 = vec3;
  loc0 = vec1;

  DOTP_CONST_PAIR(reg0, reg4, cospi_16_64, cospi_16_64, reg0, reg4);
  DOTP_CONST_PAIR(reg2, reg6, cospi_24_64, cospi_8_64, reg2, reg6);
  LSX_BUTTERFLY_4_H(reg4, reg0, reg2, reg6, vec1, vec3, vec2, vec0);
  LSX_BUTTERFLY_4_H(vec0, vec1, loc1, loc0, stp3, stp0, stp7, stp4);
  LSX_BUTTERFLY_4_H(vec2, vec3, loc3, loc2, stp2, stp1, stp6, stp5);

  /* Even stage 2 */
  DUP4_ARG2(__lsx_vld, tmp_buf, 32, tmp_buf, 96, tmp_buf, 160, tmp_buf, 224,
            reg0, reg1, reg2, reg3);
  DUP4_ARG2(__lsx_vld, tmp_buf, 288, tmp_buf, 352, tmp_buf, 416, tmp_buf, 480,
            reg4, reg5, reg6, reg7);
  DOTP_CONST_PAIR(reg0, reg7, cospi_30_64, cospi_2_64, reg0, reg7);
  DOTP_CONST_PAIR(reg4, reg3, cospi_14_64, cospi_18_64, reg4, reg3);
  DOTP_CONST_PAIR(reg2, reg5, cospi_22_64, cospi_10_64, reg2, reg5);
  DOTP_CONST_PAIR(reg6, reg1, cospi_6_64, cospi_26_64, reg6, reg1);

  vec0 = __lsx_vadd_h(reg0, reg4);
  reg0 = __lsx_vsub_h(reg0, reg4);
  reg4 = __lsx_vadd_h(reg6, reg2);
  reg6 = __lsx_vsub_h(reg6, reg2);
  reg2 = __lsx_vadd_h(reg1, reg5);
  reg1 = __lsx_vsub_h(reg1, reg5);
  reg5 = __lsx_vadd_h(reg7, reg3);
  reg7 = __lsx_vsub_h(reg7, reg3);
  reg3 = vec0;

  vec1 = reg2;
  reg2 = __lsx_vadd_h(reg3, reg4);
  reg3 = __lsx_vsub_h(reg3, reg4);
  reg4 = __lsx_vsub_h(reg5, vec1);
  reg5 = __lsx_vadd_h(reg5, vec1);

  tmp0 = __lsx_vneg_h(reg6);
  DOTP_CONST_PAIR(reg7, reg0, cospi_24_64, cospi_8_64, reg0, reg7);
  DOTP_CONST_PAIR(tmp0, reg1, cospi_24_64, cospi_8_64, reg6, reg1);

  vec0 = __lsx_vsub_h(reg0, reg6);
  reg0 = __lsx_vadd_h(reg0, reg6);
  vec1 = __lsx_vsub_h(reg7, reg1);
  reg7 = __lsx_vadd_h(reg7, reg1);

  DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, reg6, reg1);
  DOTP_CONST_PAIR(reg4, reg3, cospi_16_64, cospi_16_64, reg3, reg4);

  /* Even stage 3 : Dependency on Even stage 1 & Even stage 2 */
  LSX_BUTTERFLY_4_H(stp0, stp1, reg7, reg5, loc1, loc3, loc2, loc0);
  __lsx_vst(loc0, tmp_eve_buf, 240);
  __lsx_vst(loc1, tmp_eve_buf, 0);
  __lsx_vst(loc2, tmp_eve_buf, 224);
  __lsx_vst(loc3, tmp_eve_buf, 16);

  LSX_BUTTERFLY_4_H(stp2, stp3, reg4, reg1, loc1, loc3, loc2, loc0);
  __lsx_vst(loc0, tmp_eve_buf, 208);
  __lsx_vst(loc1, tmp_eve_buf, 32);
  __lsx_vst(loc2, tmp_eve_buf, 192);
  __lsx_vst(loc3, tmp_eve_buf, 48);

  /* Store 8 */
  LSX_BUTTERFLY_4_H(stp4, stp5, reg6, reg3, loc1, loc3, loc2, loc0);
  __lsx_vst(loc0, tmp_eve_buf, 176);
  __lsx_vst(loc1, tmp_eve_buf, 64);
  __lsx_vst(loc2, tmp_eve_buf, 160);
  __lsx_vst(loc3, tmp_eve_buf, 80);

  LSX_BUTTERFLY_4_H(stp6, stp7, reg2, reg0, loc1, loc3, loc2, loc0);
  __lsx_vst(loc0, tmp_eve_buf, 144);
  __lsx_vst(loc1, tmp_eve_buf, 96);
  __lsx_vst(loc2, tmp_eve_buf, 128);
  __lsx_vst(loc3, tmp_eve_buf, 112);
}

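/* Odd half of the 32-point row IDCT: the 16 odd-indexed inputs are combined
 * through the odd butterfly stages; partial results are kept in tmp_odd_buf
 * and refined in place by the third odd stage. */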
static void idct32x8_row_odd_process_store(int16_t *tmp_buf,
                                           int16_t *tmp_odd_buf) {
  __m128i vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
  __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;

  /* Odd stage 1 */
  DUP4_ARG2(__lsx_vld, tmp_buf, 16, tmp_buf, 112, tmp_buf, 144, tmp_buf, 240,
            reg0, reg1, reg2, reg3);
  DUP4_ARG2(__lsx_vld, tmp_buf, 272, tmp_buf, 368, tmp_buf, 400, tmp_buf, 496,
            reg4, reg5, reg6, reg7);

  DOTP_CONST_PAIR(reg0, reg7, cospi_31_64, cospi_1_64, reg0, reg7);
  DOTP_CONST_PAIR(reg4, reg3, cospi_15_64, cospi_17_64, reg3, reg4);
  DOTP_CONST_PAIR(reg2, reg5, cospi_23_64, cospi_9_64, reg2, reg5);
  DOTP_CONST_PAIR(reg6, reg1, cospi_7_64, cospi_25_64, reg1, reg6);

  vec0 = __lsx_vadd_h(reg0, reg3);
  reg0 = __lsx_vsub_h(reg0, reg3);
  reg3 = __lsx_vadd_h(reg7, reg4);
  reg7 = __lsx_vsub_h(reg7, reg4);
  reg4 = __lsx_vadd_h(reg1, reg2);
  reg1 = __lsx_vsub_h(reg1, reg2);
  reg2 = __lsx_vadd_h(reg6, reg5);
  reg6 = __lsx_vsub_h(reg6, reg5);
  reg5 = vec0;

  /* 4 Stores */
  DUP2_ARG2(__lsx_vadd_h, reg5, reg4, reg3, reg2, vec0, vec1);
  __lsx_vst(vec0, tmp_odd_buf, 64);
  __lsx_vst(vec1, tmp_odd_buf, 80);

  DUP2_ARG2(__lsx_vsub_h, reg5, reg4, reg3, reg2, vec0, vec1);
  DOTP_CONST_PAIR(vec1, vec0, cospi_24_64, cospi_8_64, vec0, vec1);
  __lsx_vst(vec0, tmp_odd_buf, 0);
  __lsx_vst(vec1, tmp_odd_buf, 16);

  /* 4 Stores */
  DOTP_CONST_PAIR(reg7, reg0, cospi_28_64, cospi_4_64, reg0, reg7);
  DOTP_CONST_PAIR(reg6, reg1, -cospi_4_64, cospi_28_64, reg1, reg6);
  LSX_BUTTERFLY_4_H(reg0, reg7, reg6, reg1, vec0, vec1, vec2, vec3);
  __lsx_vst(vec0, tmp_odd_buf, 96);
  __lsx_vst(vec1, tmp_odd_buf, 112);

  DOTP_CONST_PAIR(vec2, vec3, cospi_24_64, cospi_8_64, vec2, vec3);
  __lsx_vst(vec2, tmp_odd_buf, 32);
  __lsx_vst(vec3, tmp_odd_buf, 48);

  /* Odd stage 2 */
  /* 8 loads */
  DUP4_ARG2(__lsx_vld, tmp_buf, 48, tmp_buf, 80, tmp_buf, 176, tmp_buf, 208,
            reg0, reg1, reg2, reg3);
  DUP4_ARG2(__lsx_vld, tmp_buf, 304, tmp_buf, 336, tmp_buf, 432, tmp_buf, 464,
            reg4, reg5, reg6, reg7);

  DOTP_CONST_PAIR(reg1, reg6, cospi_27_64, cospi_5_64, reg1, reg6);
  DOTP_CONST_PAIR(reg5, reg2, cospi_11_64, cospi_21_64, reg2, reg5);
  DOTP_CONST_PAIR(reg3, reg4, cospi_19_64, cospi_13_64, reg3, reg4);
  DOTP_CONST_PAIR(reg7, reg0, cospi_3_64, cospi_29_64, reg0, reg7);

  /* 4 Stores */
  DUP4_ARG2(__lsx_vsub_h, reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, vec0,
            vec1, vec2, vec3);
  DOTP_CONST_PAIR(vec1, vec0, cospi_12_64, cospi_20_64, loc0, loc1);
  DOTP_CONST_PAIR(vec3, vec2, -cospi_20_64, cospi_12_64, loc2, loc3);

  LSX_BUTTERFLY_4_H(loc3, loc2, loc0, loc1, vec1, vec0, vec2, vec3);
  __lsx_vst(vec0, tmp_odd_buf, 192);
  __lsx_vst(vec1, tmp_odd_buf, 240);

  DOTP_CONST_PAIR(vec3, vec2, -cospi_8_64, cospi_24_64, vec0, vec1);
  __lsx_vst(vec0, tmp_odd_buf, 160);
  __lsx_vst(vec1, tmp_odd_buf, 176);

  /* 4 Stores */
  DUP4_ARG2(__lsx_vadd_h, reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, vec1,
            vec2, vec0, vec3);
  LSX_BUTTERFLY_4_H(vec0, vec3, vec2, vec1, reg0, reg1, reg3, reg2);
  __lsx_vst(reg0, tmp_odd_buf, 208);
  __lsx_vst(reg1, tmp_odd_buf, 224);

  DOTP_CONST_PAIR(reg3, reg2, -cospi_8_64, cospi_24_64, reg0, reg1);
  __lsx_vst(reg0, tmp_odd_buf, 128);
  __lsx_vst(reg1, tmp_odd_buf, 144);

  /* Odd stage 3 : Dependency on Odd stage 1 & Odd stage 2 */

  /* Load 8 & Store 8 */
  DUP4_ARG2(__lsx_vld, tmp_odd_buf, 0, tmp_odd_buf, 16, tmp_odd_buf, 32,
            tmp_odd_buf, 48, reg0, reg1, reg2, reg3);
  DUP4_ARG2(__lsx_vld, tmp_odd_buf, 128, tmp_odd_buf, 144, tmp_odd_buf, 160,
            tmp_odd_buf, 176, reg4, reg5, reg6, reg7);
  DUP4_ARG2(__lsx_vadd_h, reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0,
            loc1, loc2, loc3);
  __lsx_vst(loc0, tmp_odd_buf, 0);
  __lsx_vst(loc1, tmp_odd_buf, 16);
  __lsx_vst(loc2, tmp_odd_buf, 32);
  __lsx_vst(loc3, tmp_odd_buf, 48);

  DUP2_ARG2(__lsx_vsub_h, reg0, reg4, reg1, reg5, vec0, vec1);
  DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1);

  DUP2_ARG2(__lsx_vsub_h, reg2, reg6, reg3, reg7, vec0, vec1);
  DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
  __lsx_vst(loc0, tmp_odd_buf, 128);
  __lsx_vst(loc1, tmp_odd_buf, 144);
  __lsx_vst(loc2, tmp_odd_buf, 160);
  __lsx_vst(loc3, tmp_odd_buf, 176);

  /* Load 8 & Store 8 */
  DUP4_ARG2(__lsx_vld, tmp_odd_buf, 64, tmp_odd_buf, 80, tmp_odd_buf, 96,
            tmp_odd_buf, 112, reg1, reg2, reg0, reg3);
  DUP4_ARG2(__lsx_vld, tmp_odd_buf, 192, tmp_odd_buf, 208, tmp_odd_buf, 224,
            tmp_odd_buf, 240, reg4, reg5, reg6, reg7);

  DUP4_ARG2(__lsx_vadd_h, reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0,
            loc1, loc2, loc3);
  __lsx_vst(loc0, tmp_odd_buf, 64);
  __lsx_vst(loc1, tmp_odd_buf, 80);
  __lsx_vst(loc2, tmp_odd_buf, 96);
  __lsx_vst(loc3, tmp_odd_buf, 112);

  DUP2_ARG2(__lsx_vsub_h, reg0, reg4, reg3, reg7, vec0, vec1);
  DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1);
  DUP2_ARG2(__lsx_vsub_h, reg1, reg5, reg2, reg6, vec0, vec1);
  DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
  __lsx_vst(loc0, tmp_odd_buf, 192);
  __lsx_vst(loc1, tmp_odd_buf, 208);
  __lsx_vst(loc2, tmp_odd_buf, 224);
  __lsx_vst(loc3, tmp_odd_buf, 240);
}

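/* Final butterfly of the row IDCT: the even and odd halves are added and
 * subtracted to form all 32 outputs of each row; the second half of the
 * outputs is staged in tmp_buf before its transpose. The 32x8 result is
 * then transposed back and written to dst as part of the 32x32
 * intermediate buffer. */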
static void idct_butterfly_transpose_store(int16_t *tmp_buf,
                                           int16_t *tmp_eve_buf,
                                           int16_t *tmp_odd_buf, int16_t *dst) {
  __m128i vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
  __m128i m0, m1, m2, m3, m4, m5, m6, m7;
  __m128i n0, n1, n2, n3, n4, n5, n6, n7;
  __m128i reg0, reg1, reg2, reg3;

  /* FINAL BUTTERFLY : Dependency on Even & Odd */
  DUP4_ARG2(__lsx_vld, tmp_odd_buf, 0, tmp_odd_buf, 144, tmp_odd_buf, 224,
            tmp_odd_buf, 96, vec0, vec1, vec2, vec3);
  DUP4_ARG2(__lsx_vld, tmp_eve_buf, 0, tmp_eve_buf, 128, tmp_eve_buf, 64,
            tmp_eve_buf, 192, loc0, loc1, loc2, loc3);

  DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m0,
            m4, m2, m6);
  DUP4_ARG2(__lsx_vsub_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, reg0,
            reg1, reg2, reg3);
  __lsx_vst(reg0, tmp_buf, 496);
  __lsx_vst(reg1, tmp_buf, 368);
  __lsx_vst(reg2, tmp_buf, 432);
  __lsx_vst(reg3, tmp_buf, 304);

  /* Load 8 & Store 8 */
  DUP4_ARG2(__lsx_vld, tmp_odd_buf, 64, tmp_odd_buf, 208, tmp_odd_buf, 160,
            tmp_odd_buf, 48, vec0, vec1, vec2, vec3);
  DUP4_ARG2(__lsx_vld, tmp_eve_buf, 32, tmp_eve_buf, 160, tmp_eve_buf, 96,
            tmp_eve_buf, 224, loc0, loc1, loc2, loc3);

  DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m1,
            m5, m3, m7);
  DUP4_ARG2(__lsx_vsub_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, reg0,
            reg1, reg2, reg3);
  __lsx_vst(reg0, tmp_buf, 464);
  __lsx_vst(reg1, tmp_buf, 336);
  __lsx_vst(reg2, tmp_buf, 400);
  __lsx_vst(reg3, tmp_buf, 272);

  /* Load 8 & Store 8 */
  DUP4_ARG2(__lsx_vld, tmp_odd_buf, 32, tmp_odd_buf, 176, tmp_odd_buf, 192,
            tmp_odd_buf, 112, vec0, vec1, vec2, vec3);
  DUP4_ARG2(__lsx_vld, tmp_eve_buf, 16, tmp_eve_buf, 144, tmp_eve_buf, 80,
            tmp_eve_buf, 208, loc0, loc1, loc2, loc3);

  DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n0,
            n4, n2, n6);
  DUP4_ARG2(__lsx_vsub_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, reg0,
            reg1, reg2, reg3);
  __lsx_vst(reg0, tmp_buf, 480);
  __lsx_vst(reg1, tmp_buf, 352);
  __lsx_vst(reg2, tmp_buf, 416);
  __lsx_vst(reg3, tmp_buf, 288);

  /* Load 8 & Store 8 */
  DUP4_ARG2(__lsx_vld, tmp_odd_buf, 80, tmp_odd_buf, 240, tmp_odd_buf, 128,
            tmp_odd_buf, 16, vec0, vec1, vec2, vec3);
  DUP4_ARG2(__lsx_vld, tmp_eve_buf, 48, tmp_eve_buf, 176, tmp_eve_buf, 112,
            tmp_eve_buf, 240, loc0, loc1, loc2, loc3);
  DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n1,
            n5, n3, n7);
  DUP4_ARG2(__lsx_vsub_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, reg0,
            reg1, reg2, reg3);
  __lsx_vst(reg0, tmp_buf, 448);
  __lsx_vst(reg1, tmp_buf, 320);
  __lsx_vst(reg2, tmp_buf, 384);
  __lsx_vst(reg3, tmp_buf, 256);

  /* Transpose : 16 vectors */
  /* 1st & 2nd 8x8 */
  LSX_TRANSPOSE8x8_H(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3,
                     n3);
  __lsx_vst(m0, dst, 0);
  __lsx_vst(n0, dst, 64);
  __lsx_vst(m1, dst, 128);
  __lsx_vst(n1, dst, 192);
  __lsx_vst(m2, dst, 256);
  __lsx_vst(n2, dst, 320);
  __lsx_vst(m3, dst, 384);
  __lsx_vst(n3, dst, 448);

  LSX_TRANSPOSE8x8_H(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7,
                     n7);
  __lsx_vst(m4, dst, 16);
  __lsx_vst(n4, dst, 80);
  __lsx_vst(m5, dst, 144);
  __lsx_vst(n5, dst, 208);
  __lsx_vst(m6, dst, 272);
  __lsx_vst(n6, dst, 336);
  __lsx_vst(m7, dst, 400);
  __lsx_vst(n7, dst, 464);

  /* 3rd & 4th 8x8 */
  DUP4_ARG2(__lsx_vld, tmp_buf, 256, tmp_buf, 272, tmp_buf, 288, tmp_buf, 304,
            m0, n0, m1, n1);
  DUP4_ARG2(__lsx_vld, tmp_buf, 320, tmp_buf, 336, tmp_buf, 352, tmp_buf, 368,
            m2, n2, m3, n3);
  DUP4_ARG2(__lsx_vld, tmp_buf, 384, tmp_buf, 400, tmp_buf, 416, tmp_buf, 432,
            m4, n4, m5, n5);
  DUP4_ARG2(__lsx_vld, tmp_buf, 448, tmp_buf, 464, tmp_buf, 480, tmp_buf, 496,
            m6, n6, m7, n7);
  LSX_TRANSPOSE8x8_H(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3,
                     n3);
  LSX_TRANSPOSE8x8_H(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7,
                     n7);
  __lsx_vst(m0, dst, 32);
  __lsx_vst(n0, dst, 96);
  __lsx_vst(m1, dst, 160);
  __lsx_vst(n1, dst, 224);
  __lsx_vst(m2, dst, 288);
  __lsx_vst(n2, dst, 352);
  __lsx_vst(m3, dst, 416);
  __lsx_vst(n3, dst, 480);
  __lsx_vst(m4, dst, 48);
  __lsx_vst(n4, dst, 112);
  __lsx_vst(m5, dst, 176);
  __lsx_vst(n5, dst, 240);
  __lsx_vst(m6, dst, 304);
  __lsx_vst(n6, dst, 368);
  __lsx_vst(m7, dst, 432);
  __lsx_vst(n7, dst, 496);
}

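/* One pass of the row transform: a 32x8 slice of the input is transposed,
 * split into even/odd halves, recombined and written to output. */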
static void idct32x8_1d_rows_lsx(const int16_t *input, int16_t *output) {
  DECLARE_ALIGNED(32, int16_t, tmp_buf[8 * 32]);
  DECLARE_ALIGNED(32, int16_t, tmp_odd_buf[16 * 8]);
  DECLARE_ALIGNED(32, int16_t, tmp_eve_buf[16 * 8]);

  idct32x8_row_transpose_store(input, &tmp_buf[0]);
  idct32x8_row_even_process_store(&tmp_buf[0], &tmp_eve_buf[0]);
  idct32x8_row_odd_process_store(&tmp_buf[0], &tmp_odd_buf[0]);
  idct_butterfly_transpose_store(&tmp_buf[0], &tmp_eve_buf[0], &tmp_odd_buf[0],
                                 output);
}

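/* Even half of the 32-point column IDCT. The columns are already in natural
 * order after the row pass, so the even-indexed rows are loaded directly:
 * one 32-coefficient row is 64 bytes, hence the byte stride of 256 (four
 * rows) between loads. */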
static void idct8x32_column_even_process_store(int16_t *tmp_buf,
                                               int16_t *tmp_eve_buf) {
  __m128i vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
  __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
  __m128i stp0, stp1, stp2, stp3, stp4, stp5, stp6, stp7;
  __m128i tmp0;

  /* Even stage 1 */
  DUP4_ARG2(__lsx_vld, tmp_buf, 0, tmp_buf, 256, tmp_buf, 512, tmp_buf, 768,
            reg0, reg1, reg2, reg3);
  DUP4_ARG2(__lsx_vld, tmp_buf, 1024, tmp_buf, 1280, tmp_buf, 1536, tmp_buf,
            1792, reg4, reg5, reg6, reg7);
  tmp_buf += 64;

  DOTP_CONST_PAIR(reg1, reg7, cospi_28_64, cospi_4_64, reg1, reg7);
  DOTP_CONST_PAIR(reg5, reg3, cospi_12_64, cospi_20_64, reg5, reg3);
  LSX_BUTTERFLY_4_H(reg1, reg7, reg3, reg5, vec1, vec3, vec2, vec0);
  DOTP_CONST_PAIR(vec2, vec0, cospi_16_64, cospi_16_64, loc2, loc3);

  loc1 = vec3;
  loc0 = vec1;

  DOTP_CONST_PAIR(reg0, reg4, cospi_16_64, cospi_16_64, reg0, reg4);
  DOTP_CONST_PAIR(reg2, reg6, cospi_24_64, cospi_8_64, reg2, reg6);
  LSX_BUTTERFLY_4_H(reg4, reg0, reg2, reg6, vec1, vec3, vec2, vec0);
  LSX_BUTTERFLY_4_H(vec0, vec1, loc1, loc0, stp3, stp0, stp7, stp4);
  LSX_BUTTERFLY_4_H(vec2, vec3, loc3, loc2, stp2, stp1, stp6, stp5);

  /* Even stage 2 */
  /* Load 8 */
  DUP4_ARG2(__lsx_vld, tmp_buf, 0, tmp_buf, 256, tmp_buf, 512, tmp_buf, 768,
            reg0, reg1, reg2, reg3);
  DUP4_ARG2(__lsx_vld, tmp_buf, 1024, tmp_buf, 1280, tmp_buf, 1536, tmp_buf,
            1792, reg4, reg5, reg6, reg7);
  DOTP_CONST_PAIR(reg0, reg7, cospi_30_64, cospi_2_64, reg0, reg7);
  DOTP_CONST_PAIR(reg4, reg3, cospi_14_64, cospi_18_64, reg4, reg3);
  DOTP_CONST_PAIR(reg2, reg5, cospi_22_64, cospi_10_64, reg2, reg5);
  DOTP_CONST_PAIR(reg6, reg1, cospi_6_64, cospi_26_64, reg6, reg1);

  vec0 = __lsx_vadd_h(reg0, reg4);
  reg0 = __lsx_vsub_h(reg0, reg4);
  reg4 = __lsx_vadd_h(reg6, reg2);
  reg6 = __lsx_vsub_h(reg6, reg2);
  reg2 = __lsx_vadd_h(reg1, reg5);
  reg1 = __lsx_vsub_h(reg1, reg5);
  reg5 = __lsx_vadd_h(reg7, reg3);
  reg7 = __lsx_vsub_h(reg7, reg3);
  reg3 = vec0;

  vec1 = reg2;
  reg2 = __lsx_vadd_h(reg3, reg4);
  reg3 = __lsx_vsub_h(reg3, reg4);
  reg4 = __lsx_vsub_h(reg5, vec1);
  reg5 = __lsx_vadd_h(reg5, vec1);

  tmp0 = __lsx_vneg_h(reg6);
  DOTP_CONST_PAIR(reg7, reg0, cospi_24_64, cospi_8_64, reg0, reg7);
  DOTP_CONST_PAIR(tmp0, reg1, cospi_24_64, cospi_8_64, reg6, reg1);

  vec0 = __lsx_vsub_h(reg0, reg6);
  reg0 = __lsx_vadd_h(reg0, reg6);
  vec1 = __lsx_vsub_h(reg7, reg1);
  reg7 = __lsx_vadd_h(reg7, reg1);

  DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, reg6, reg1);
  DOTP_CONST_PAIR(reg4, reg3, cospi_16_64, cospi_16_64, reg3, reg4);

  /* Even stage 3 : Dependency on Even stage 1 & Even stage 2 */
  /* Store 8 */
  LSX_BUTTERFLY_4_H(stp0, stp1, reg7, reg5, loc1, loc3, loc2, loc0);
  __lsx_vst(loc1, tmp_eve_buf, 0);
  __lsx_vst(loc3, tmp_eve_buf, 16);
  __lsx_vst(loc2, tmp_eve_buf, 224);
  __lsx_vst(loc0, tmp_eve_buf, 240);

  LSX_BUTTERFLY_4_H(stp2, stp3, reg4, reg1, loc1, loc3, loc2, loc0);
  __lsx_vst(loc1, tmp_eve_buf, 32);
  __lsx_vst(loc3, tmp_eve_buf, 48);
  __lsx_vst(loc2, tmp_eve_buf, 192);
  __lsx_vst(loc0, tmp_eve_buf, 208);

  /* Store 8 */
  LSX_BUTTERFLY_4_H(stp4, stp5, reg6, reg3, loc1, loc3, loc2, loc0);
  __lsx_vst(loc1, tmp_eve_buf, 64);
  __lsx_vst(loc3, tmp_eve_buf, 80);
  __lsx_vst(loc2, tmp_eve_buf, 160);
  __lsx_vst(loc0, tmp_eve_buf, 176);

  LSX_BUTTERFLY_4_H(stp6, stp7, reg2, reg0, loc1, loc3, loc2, loc0);
  __lsx_vst(loc1, tmp_eve_buf, 96);
  __lsx_vst(loc3, tmp_eve_buf, 112);
  __lsx_vst(loc2, tmp_eve_buf, 128);
  __lsx_vst(loc0, tmp_eve_buf, 144);
}

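/* Odd half of the 32-point column IDCT: processes the odd-indexed rows of
 * the row-pass output; it mirrors idct32x8_row_odd_process_store but with
 * column (whole-row) byte strides. */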
static void idct8x32_column_odd_process_store(int16_t *tmp_buf,
                                              int16_t *tmp_odd_buf) {
  __m128i vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
  __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;

  /* Odd stage 1 */
  DUP4_ARG2(__lsx_vld, tmp_buf, 64, tmp_buf, 448, tmp_buf, 576, tmp_buf, 960,
            reg0, reg1, reg2, reg3);
  DUP4_ARG2(__lsx_vld, tmp_buf, 1088, tmp_buf, 1472, tmp_buf, 1600, tmp_buf,
            1984, reg4, reg5, reg6, reg7);

  DOTP_CONST_PAIR(reg0, reg7, cospi_31_64, cospi_1_64, reg0, reg7);
  DOTP_CONST_PAIR(reg4, reg3, cospi_15_64, cospi_17_64, reg3, reg4);
  DOTP_CONST_PAIR(reg2, reg5, cospi_23_64, cospi_9_64, reg2, reg5);
  DOTP_CONST_PAIR(reg6, reg1, cospi_7_64, cospi_25_64, reg1, reg6);

  vec0 = __lsx_vadd_h(reg0, reg3);
  reg0 = __lsx_vsub_h(reg0, reg3);
  reg3 = __lsx_vadd_h(reg7, reg4);
  reg7 = __lsx_vsub_h(reg7, reg4);
  reg4 = __lsx_vadd_h(reg1, reg2);
  reg1 = __lsx_vsub_h(reg1, reg2);
  reg2 = __lsx_vadd_h(reg6, reg5);
  reg6 = __lsx_vsub_h(reg6, reg5);
  reg5 = vec0;

  /* 4 Stores */
  DUP2_ARG2(__lsx_vadd_h, reg5, reg4, reg3, reg2, vec0, vec1);
  __lsx_vst(vec0, tmp_odd_buf, 64);
  __lsx_vst(vec1, tmp_odd_buf, 80);
  DUP2_ARG2(__lsx_vsub_h, reg5, reg4, reg3, reg2, vec0, vec1);
  DOTP_CONST_PAIR(vec1, vec0, cospi_24_64, cospi_8_64, vec0, vec1);
  __lsx_vst(vec0, tmp_odd_buf, 0);
  __lsx_vst(vec1, tmp_odd_buf, 16);

  /* 4 Stores */
  DOTP_CONST_PAIR(reg7, reg0, cospi_28_64, cospi_4_64, reg0, reg7);
  DOTP_CONST_PAIR(reg6, reg1, -cospi_4_64, cospi_28_64, reg1, reg6);
  LSX_BUTTERFLY_4_H(reg0, reg7, reg6, reg1, vec0, vec1, vec2, vec3);
  DOTP_CONST_PAIR(vec2, vec3, cospi_24_64, cospi_8_64, vec2, vec3);
  __lsx_vst(vec0, tmp_odd_buf, 96);
  __lsx_vst(vec1, tmp_odd_buf, 112);
  __lsx_vst(vec2, tmp_odd_buf, 32);
  __lsx_vst(vec3, tmp_odd_buf, 48);

  /* Odd stage 2 */
  /* 8 loads */
  DUP4_ARG2(__lsx_vld, tmp_buf, 192, tmp_buf, 320, tmp_buf, 704, tmp_buf, 832,
            reg0, reg1, reg2, reg3);
  DUP4_ARG2(__lsx_vld, tmp_buf, 1216, tmp_buf, 1344, tmp_buf, 1728, tmp_buf,
            1856, reg4, reg5, reg6, reg7);
  DOTP_CONST_PAIR(reg1, reg6, cospi_27_64, cospi_5_64, reg1, reg6);
  DOTP_CONST_PAIR(reg5, reg2, cospi_11_64, cospi_21_64, reg2, reg5);
  DOTP_CONST_PAIR(reg3, reg4, cospi_19_64, cospi_13_64, reg3, reg4);
  DOTP_CONST_PAIR(reg7, reg0, cospi_3_64, cospi_29_64, reg0, reg7);

  /* 4 Stores */
  DUP4_ARG2(__lsx_vsub_h, reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, vec0,
            vec1, vec2, vec3);
  DOTP_CONST_PAIR(vec1, vec0, cospi_12_64, cospi_20_64, loc0, loc1);
  DOTP_CONST_PAIR(vec3, vec2, -cospi_20_64, cospi_12_64, loc2, loc3);
  LSX_BUTTERFLY_4_H(loc2, loc3, loc1, loc0, vec0, vec1, vec3, vec2);
  __lsx_vst(vec0, tmp_odd_buf, 192);
  __lsx_vst(vec1, tmp_odd_buf, 240);
  DOTP_CONST_PAIR(vec3, vec2, -cospi_8_64, cospi_24_64, vec0, vec1);
  __lsx_vst(vec0, tmp_odd_buf, 160);
  __lsx_vst(vec1, tmp_odd_buf, 176);

  /* 4 Stores */
  DUP4_ARG2(__lsx_vadd_h, reg0, reg3, reg1, reg2, reg5, reg6, reg4, reg7, vec0,
            vec1, vec2, vec3);
  LSX_BUTTERFLY_4_H(vec0, vec3, vec2, vec1, reg0, reg1, reg3, reg2);
  __lsx_vst(reg0, tmp_odd_buf, 208);
  __lsx_vst(reg1, tmp_odd_buf, 224);
  DOTP_CONST_PAIR(reg3, reg2, -cospi_8_64, cospi_24_64, reg0, reg1);
  __lsx_vst(reg0, tmp_odd_buf, 128);
  __lsx_vst(reg1, tmp_odd_buf, 144);

  /* Odd stage 3 : Dependency on Odd stage 1 & Odd stage 2 */
  /* Load 8 & Store 8 */
  DUP4_ARG2(__lsx_vld, tmp_odd_buf, 0, tmp_odd_buf, 16, tmp_odd_buf, 32,
            tmp_odd_buf, 48, reg0, reg1, reg2, reg3);
  DUP4_ARG2(__lsx_vld, tmp_odd_buf, 128, tmp_odd_buf, 144, tmp_odd_buf, 160,
            tmp_odd_buf, 176, reg4, reg5, reg6, reg7);
  DUP4_ARG2(__lsx_vadd_h, reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0,
            loc1, loc2, loc3);
  __lsx_vst(loc0, tmp_odd_buf, 0);
  __lsx_vst(loc1, tmp_odd_buf, 16);
  __lsx_vst(loc2, tmp_odd_buf, 32);
  __lsx_vst(loc3, tmp_odd_buf, 48);

  DUP2_ARG2(__lsx_vsub_h, reg0, reg4, reg1, reg5, vec0, vec1);
  DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1);
  DUP2_ARG2(__lsx_vsub_h, reg2, reg6, reg3, reg7, vec0, vec1);
  DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
  __lsx_vst(loc0, tmp_odd_buf, 128);
  __lsx_vst(loc1, tmp_odd_buf, 144);
  __lsx_vst(loc2, tmp_odd_buf, 160);
  __lsx_vst(loc3, tmp_odd_buf, 176);

  /* Load 8 & Store 8 */
  DUP4_ARG2(__lsx_vld, tmp_odd_buf, 64, tmp_odd_buf, 80, tmp_odd_buf, 96,
            tmp_odd_buf, 112, reg1, reg2, reg0, reg3);
  DUP4_ARG2(__lsx_vld, tmp_odd_buf, 192, tmp_odd_buf, 208, tmp_odd_buf, 224,
            tmp_odd_buf, 240, reg4, reg5, reg6, reg7);
  DUP4_ARG2(__lsx_vadd_h, reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0,
            loc1, loc2, loc3);
  __lsx_vst(loc0, tmp_odd_buf, 64);
  __lsx_vst(loc1, tmp_odd_buf, 80);
  __lsx_vst(loc2, tmp_odd_buf, 96);
  __lsx_vst(loc3, tmp_odd_buf, 112);

  DUP2_ARG2(__lsx_vsub_h, reg0, reg4, reg3, reg7, vec0, vec1);
  DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1);
  DUP2_ARG2(__lsx_vsub_h, reg1, reg5, reg2, reg6, vec0, vec1);
  DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
  __lsx_vst(loc0, tmp_odd_buf, 192);
  __lsx_vst(loc1, tmp_odd_buf, 208);
  __lsx_vst(loc2, tmp_odd_buf, 224);
  __lsx_vst(loc3, tmp_odd_buf, 240);
}

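/* Final butterfly of the column IDCT: even and odd halves are combined,
 * scaled down by 64 with rounding (__lsx_vsrari_h by 6) and added to the
 * destination pixels, four rows of eight at a time, via
 * VP9_ADDBLK_ST8x4_UB. */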
static void idct8x32_column_butterfly_addblk(int16_t *tmp_eve_buf,
                                             int16_t *tmp_odd_buf, uint8_t *dst,
                                             int32_t dst_stride) {
  __m128i vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
  __m128i m0, m1, m2, m3, m4, m5, m6, m7;
  __m128i n0, n1, n2, n3, n4, n5, n6, n7;
  int32_t stride = dst_stride << 2;
  int32_t stride2 = stride << 1;
  int32_t stride3 = stride + stride2;

  /* FINAL BUTTERFLY : Dependency on Even & Odd */
  DUP4_ARG2(__lsx_vld, tmp_odd_buf, 0, tmp_odd_buf, 144, tmp_odd_buf, 224,
            tmp_odd_buf, 96, vec0, vec1, vec2, vec3);
  DUP4_ARG2(__lsx_vld, tmp_eve_buf, 0, tmp_eve_buf, 128, tmp_eve_buf, 64,
            tmp_eve_buf, 192, loc0, loc1, loc2, loc3);

  DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m0,
            m4, m2, m6);
  DUP4_ARG2(__lsx_vsrari_h, m0, 6, m2, 6, m4, 6, m6, 6, m0, m2, m4, m6);
  VP9_ADDBLK_ST8x4_UB(dst, stride, stride2, stride3, m0, m2, m4, m6);
  DUP4_ARG2(__lsx_vsub_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m6,
            m2, m4, m0);
  DUP4_ARG2(__lsx_vsrari_h, m0, 6, m2, 6, m4, 6, m6, 6, m0, m2, m4, m6);
  VP9_ADDBLK_ST8x4_UB((dst + 19 * dst_stride), stride, stride2, stride3, m0, m2,
                      m4, m6);

  /* Load 8 & Store 8 */
  DUP4_ARG2(__lsx_vld, tmp_odd_buf, 64, tmp_odd_buf, 208, tmp_odd_buf, 160,
            tmp_odd_buf, 48, vec0, vec1, vec2, vec3);
  DUP4_ARG2(__lsx_vld, tmp_eve_buf, 32, tmp_eve_buf, 160, tmp_eve_buf, 96,
            tmp_eve_buf, 224, loc0, loc1, loc2, loc3);

  DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m1,
            m5, m3, m7);
  DUP4_ARG2(__lsx_vsrari_h, m1, 6, m3, 6, m5, 6, m7, 6, m1, m3, m5, m7);
  VP9_ADDBLK_ST8x4_UB((dst + 2 * dst_stride), stride, stride2, stride3, m1, m3,
                      m5, m7);
  DUP4_ARG2(__lsx_vsub_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m7,
            m3, m5, m1);
  DUP4_ARG2(__lsx_vsrari_h, m1, 6, m3, 6, m5, 6, m7, 6, m1, m3, m5, m7);
  VP9_ADDBLK_ST8x4_UB((dst + 17 * dst_stride), stride, stride2, stride3, m1, m3,
                      m5, m7);

  /* Load 8 & Store 8 */
  DUP4_ARG2(__lsx_vld, tmp_odd_buf, 32, tmp_odd_buf, 176, tmp_odd_buf, 192,
            tmp_odd_buf, 112, vec0, vec1, vec2, vec3);
  DUP4_ARG2(__lsx_vld, tmp_eve_buf, 16, tmp_eve_buf, 144, tmp_eve_buf, 80,
            tmp_eve_buf, 208, loc0, loc1, loc2, loc3);
  DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n0,
            n4, n2, n6);
  DUP4_ARG2(__lsx_vsrari_h, n0, 6, n2, 6, n4, 6, n6, 6, n0, n2, n4, n6);
  VP9_ADDBLK_ST8x4_UB((dst + dst_stride), stride, stride2, stride3, n0, n2, n4,
                      n6);
  DUP4_ARG2(__lsx_vsub_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n6,
            n2, n4, n0);
  DUP4_ARG2(__lsx_vsrari_h, n0, 6, n2, 6, n4, 6, n6, 6, n0, n2, n4, n6);
  VP9_ADDBLK_ST8x4_UB((dst + 18 * dst_stride), stride, stride2, stride3, n0, n2,
                      n4, n6);

  /* Load 8 & Store 8 */
  DUP4_ARG2(__lsx_vld, tmp_odd_buf, 80, tmp_odd_buf, 240, tmp_odd_buf, 128,
            tmp_odd_buf, 16, vec0, vec1, vec2, vec3);
  DUP4_ARG2(__lsx_vld, tmp_eve_buf, 48, tmp_eve_buf, 176, tmp_eve_buf, 112,
            tmp_eve_buf, 240, loc0, loc1, loc2, loc3);
  DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n1,
            n5, n3, n7);
  DUP4_ARG2(__lsx_vsrari_h, n1, 6, n3, 6, n5, 6, n7, 6, n1, n3, n5, n7);
  VP9_ADDBLK_ST8x4_UB((dst + 3 * dst_stride), stride, stride2, stride3, n1, n3,
                      n5, n7);
  DUP4_ARG2(__lsx_vsub_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n7,
            n3, n5, n1);
  DUP4_ARG2(__lsx_vsrari_h, n1, 6, n3, 6, n5, 6, n7, 6, n1, n3, n5, n7);
  VP9_ADDBLK_ST8x4_UB((dst + 16 * dst_stride), stride, stride2, stride3, n1, n3,
                      n5, n7);
}

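/* One pass of the column transform: an 8-column slice of the intermediate
 * buffer is split into even/odd halves, recombined and added to dst. */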
static void idct8x32_1d_columns_addblk_lsx(int16_t *input, uint8_t *dst,
                                           int32_t dst_stride) {
  DECLARE_ALIGNED(32, int16_t, tmp_odd_buf[16 * 8]);
  DECLARE_ALIGNED(32, int16_t, tmp_eve_buf[16 * 8]);

  idct8x32_column_even_process_store(input, &tmp_eve_buf[0]);
  idct8x32_column_odd_process_store(input, &tmp_odd_buf[0]);
  idct8x32_column_butterfly_addblk(&tmp_eve_buf[0], &tmp_odd_buf[0], dst,
                                   dst_stride);
}

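/* Full 32x32 inverse transform (up to 1024 non-zero coefficients): four
 * 32x8 row passes into out_arr, then four 8x32 column passes that add the
 * reconstruction to dst. */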
void vpx_idct32x32_1024_add_lsx(const int16_t *input, uint8_t *dst,
                                int32_t dst_stride) {
  int32_t i;
  DECLARE_ALIGNED(32, int16_t, out_arr[32 * 32]);
  int16_t *out_ptr = out_arr;

  /* transform rows */
  for (i = 0; i < 4; ++i) {
    /* process 32 * 8 block */
    idct32x8_1d_rows_lsx((input + (i << 8)), (out_ptr + (i << 8)));
  }

  /* transform columns */
  for (i = 0; i < 4; ++i) {
    /* process 8 * 32 block */
    idct8x32_1d_columns_addblk_lsx((out_ptr + (i << 3)), (dst + (i << 3)),
                                   dst_stride);
  }
}

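/* 32x32 inverse transform when at most 34 coefficients are non-zero: they
 * all fall in the upper-left 8x8 corner, so a single row pass suffices and
 * the rest of the intermediate buffer is cleared first. */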
void vpx_idct32x32_34_add_lsx(const int16_t *input, uint8_t *dst,
                              int32_t dst_stride) {
  int32_t i;
  DECLARE_ALIGNED(32, int16_t, out_arr[32 * 32]);
  int16_t *out_ptr = out_arr;
  __m128i zero = __lsx_vldi(0);

  for (i = 32; i--;) {
    __lsx_vst(zero, out_ptr, 0);
    __lsx_vst(zero, out_ptr, 16);
    __lsx_vst(zero, out_ptr, 32);
    __lsx_vst(zero, out_ptr, 48);
    out_ptr += 32;
  }

  out_ptr = out_arr;

  /* rows: only upper-left 8x8 has non-zero coeff */
  idct32x8_1d_rows_lsx(input, out_ptr);

  /* transform columns */
  for (i = 0; i < 4; ++i) {
    /* process 8 * 32 block */
    idct8x32_1d_columns_addblk_lsx((out_ptr + (i << 3)), (dst + (i << 3)),
                                   dst_stride);
  }
}

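/* DC-only 32x32 inverse transform: the single DC coefficient is scaled
 * twice by cospi_16_64 (with DCT_CONST_BITS rounding), scaled down by 2^6
 * with rounding, replicated across a vector and added to every pixel of
 * the 32x32 block with unsigned saturation, two rows per loop iteration. */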
void vpx_idct32x32_1_add_lsx(const int16_t *input, uint8_t *dst,
                             int32_t dst_stride) {
  int32_t i;
  int16_t out;
  __m128i dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3;
  __m128i res0, res1, res2, res3, res4, res5, res6, res7, vec;

  out = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), DCT_CONST_BITS);
  out = ROUND_POWER_OF_TWO((out * cospi_16_64), DCT_CONST_BITS);
  out = ROUND_POWER_OF_TWO(out, 6);

  vec = __lsx_vreplgr2vr_h(out);

  for (i = 16; i--;) {
    DUP2_ARG2(__lsx_vld, dst, 0, dst, 16, dst0, dst1);
    dst2 = __lsx_vldx(dst, dst_stride);
    dst3 = __lsx_vldx(dst + 16, dst_stride);

    UNPCK_UB_SH(dst0, res0, res4);
    UNPCK_UB_SH(dst1, res1, res5);
    UNPCK_UB_SH(dst2, res2, res6);
    UNPCK_UB_SH(dst3, res3, res7);

    DUP4_ARG2(__lsx_vadd_h, res0, vec, res1, vec, res2, vec, res3, vec, res0,
              res1, res2, res3);
    DUP4_ARG2(__lsx_vadd_h, res4, vec, res5, vec, res6, vec, res7, vec, res4,
              res5, res6, res7);
    DUP4_ARG3(__lsx_vssrarni_bu_h, res4, res0, 0, res5, res1, 0, res6, res2, 0,
              res7, res3, 0, tmp0, tmp1, tmp2, tmp3);
    __lsx_vst(tmp0, dst, 0);
    __lsx_vst(tmp1, dst, 16);
    dst += dst_stride;
    __lsx_vst(tmp2, dst, 0);
    __lsx_vst(tmp3, dst, 16);
    dst += dst_stride;
  }
}