/*
 *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/loongarch/fwd_txfm_lsx.h"

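/* In-register transpose of 4x4 tiles of 16-bit elements: each output vector
 * gathers one column of the low 4x4 tile in its low half and the matching
 * column of the high tile in its high half (behavior inferred from the
 * interleave/pick sequence below). */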
#define LSX_TRANSPOSE4x4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  do {                                                                         \
    __m128i _s0, _s1, _s2, _s3, _t0, _t1, _t2, _t3;                            \
                                                                               \
    DUP2_ARG2(__lsx_vilvl_h, _in2, _in0, _in3, _in1, _s0, _s1);                \
    DUP2_ARG2(__lsx_vilvh_h, _in2, _in0, _in3, _in1, _s2, _s3);                \
    _t0 = __lsx_vilvl_h(_s1, _s0);                                             \
    _t1 = __lsx_vilvh_h(_s1, _s0);                                             \
    _t2 = __lsx_vilvl_h(_s3, _s2);                                             \
    _t3 = __lsx_vilvh_h(_s3, _s2);                                             \
    DUP2_ARG2(__lsx_vpickev_d, _t2, _t0, _t3, _t1, _out0, _out2);              \
    DUP2_ARG2(__lsx_vpickod_d, _t2, _t0, _t3, _t1, _out1, _out3);              \
  } while (0)

#if !CONFIG_VP9_HIGHBITDEPTH
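/* Column pass of the 16x16 forward DCT over an 8-column slice: 16 rows are
 * loaded at src_stride, pre-scaled by 4, split into even/odd halves via
 * butterflies, and the resulting coefficient vectors are stored to tmp_ptr
 * for the row pass (see vpx_fdct16x16_lsx below). */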
void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr,
                        int32_t src_stride) {
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
  __m128i in8, in9, in10, in11, in12, in13, in14, in15;
  __m128i stp21, stp22, stp23, stp24, stp25, stp26, stp30;
  __m128i stp31, stp32, stp33, stp34, stp35, stp36, stp37;
  __m128i vec0, vec1, vec2, vec3, vec4, vec5, cnst0, cnst1, cnst4, cnst5;
  __m128i coeff = { 0x187e3b21d2bf2d41, 0x238e3537e782c4df };
  __m128i coeff1 = { 0x289a317906463fb1, 0x12943d3f1e2b3871 };
  __m128i coeff2 = { 0xed6cd766c78fc04f, 0x0 };

  int32_t src_stride2 = src_stride << 1;
  int32_t src_stride4 = src_stride2 << 1;
  int32_t src_stride6 = src_stride4 + src_stride2;
  int32_t src_stride8 = src_stride4 << 1;
  int16_t *input_tmp = (int16_t *)input;
  in0 = __lsx_vld(input_tmp, 0);
  DUP4_ARG2(__lsx_vldx, input_tmp, src_stride2, input_tmp, src_stride4,
            input_tmp, src_stride6, input_tmp, src_stride8, in1, in2, in3, in4);
  input_tmp += src_stride4;
  DUP4_ARG2(__lsx_vldx, input_tmp, src_stride2, input_tmp, src_stride4,
            input_tmp, src_stride6, input_tmp, src_stride8, in5, in6, in7, in8);
  input_tmp += src_stride4;
  DUP4_ARG2(__lsx_vldx, input_tmp, src_stride2, input_tmp, src_stride4,
            input_tmp, src_stride6, input_tmp, src_stride8, in9, in10, in11,
            in12);
  input_tmp += src_stride4;
  DUP2_ARG2(__lsx_vldx, input_tmp, src_stride2, input_tmp, src_stride4, in13,
            in14);
  input_tmp += src_stride2;
  in15 = __lsx_vldx(input_tmp, src_stride2);

  DUP4_ARG2(__lsx_vslli_h, in0, 2, in1, 2, in2, 2, in3, 2, in0, in1, in2, in3);
  DUP4_ARG2(__lsx_vslli_h, in4, 2, in5, 2, in6, 2, in7, 2, in4, in5, in6, in7);
  DUP4_ARG2(__lsx_vslli_h, in8, 2, in9, 2, in10, 2, in11, 2, in8, in9, in10,
            in11);
  DUP4_ARG2(__lsx_vslli_h, in12, 2, in13, 2, in14, 2, in15, 2, in12, in13, in14,
            in15);
  DUP4_ARG2(__lsx_vadd_h, in0, in15, in1, in14, in2, in13, in3, in12, tmp0,
            tmp1, tmp2, tmp3);
  DUP4_ARG2(__lsx_vadd_h, in4, in11, in5, in10, in6, in9, in7, in8, tmp4, tmp5,
            tmp6, tmp7);
  FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp0, tmp1,
                tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
  __lsx_vst(tmp0, tmp_ptr, 0);
  __lsx_vst(tmp1, tmp_ptr, 64);
  __lsx_vst(tmp2, tmp_ptr, 128);
  __lsx_vst(tmp3, tmp_ptr, 192);
  __lsx_vst(tmp4, tmp_ptr, 256);
  __lsx_vst(tmp5, tmp_ptr, 320);
  __lsx_vst(tmp6, tmp_ptr, 384);
  __lsx_vst(tmp7, tmp_ptr, 448);
  DUP4_ARG2(__lsx_vsub_h, in0, in15, in1, in14, in2, in13, in3, in12, in15,
            in14, in13, in12);
  DUP4_ARG2(__lsx_vsub_h, in4, in11, in5, in10, in6, in9, in7, in8, in11, in10,
            in9, in8);

  tmp_ptr += 16;

  /* stp 1 */
  DUP2_ARG2(__lsx_vilvh_h, in10, in13, in11, in12, vec2, vec4);
  DUP2_ARG2(__lsx_vilvl_h, in10, in13, in11, in12, vec3, vec5);

  cnst4 = __lsx_vreplvei_h(coeff, 0);
  DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst4, stp25);

  cnst5 = __lsx_vreplvei_h(coeff, 1);
  cnst5 = __lsx_vpackev_h(cnst5, cnst4);
  DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst5, stp22);
  DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst4, stp24);
  DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst5, stp23);

  /* stp2 */
  LSX_BUTTERFLY_4_H(in8, in9, stp22, stp23, stp30, stp31, stp32, stp33);
  LSX_BUTTERFLY_4_H(in15, in14, stp25, stp24, stp37, stp36, stp35, stp34);
  DUP2_ARG2(__lsx_vilvh_h, stp36, stp31, stp35, stp32, vec2, vec4);
  DUP2_ARG2(__lsx_vilvl_h, stp36, stp31, stp35, stp32, vec3, vec5);
  DUP2_ARG2(__lsx_vreplvei_h, coeff, 2, coeff, 3, cnst0, cnst1);
  cnst0 = __lsx_vpackev_h(cnst0, cnst1);
  DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst0, stp26);

  cnst0 = __lsx_vreplvei_h(coeff, 4);
  cnst1 = __lsx_vpackev_h(cnst1, cnst0);
  DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst1, stp21);

  LSX_BUTTERFLY_4_H(stp30, stp37, stp26, stp21, in8, in15, in14, in9);
  vec1 = __lsx_vilvl_h(in15, in8);
  vec0 = __lsx_vilvh_h(in15, in8);

  DUP2_ARG2(__lsx_vreplvei_h, coeff1, 0, coeff1, 1, cnst0, cnst1);
  cnst0 = __lsx_vpackev_h(cnst0, cnst1);

  DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0, in8);
  __lsx_vst(in8, tmp_ptr, 0);

  cnst0 = __lsx_vreplvei_h(coeff2, 0);
  cnst0 = __lsx_vpackev_h(cnst1, cnst0);
  DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0, in8);
  __lsx_vst(in8, tmp_ptr, 448);

  vec1 = __lsx_vilvl_h(in14, in9);
  vec0 = __lsx_vilvh_h(in14, in9);
  DUP2_ARG2(__lsx_vreplvei_h, coeff1, 2, coeff1, 3, cnst0, cnst1);
  cnst1 = __lsx_vpackev_h(cnst1, cnst0);

  DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst1, in8);
  __lsx_vst(in8, tmp_ptr, 256);

  cnst1 = __lsx_vreplvei_h(coeff2, 2);
  cnst0 = __lsx_vpackev_h(cnst0, cnst1);
  DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0, in8);
  __lsx_vst(in8, tmp_ptr, 192);

  DUP2_ARG2(__lsx_vreplvei_h, coeff, 2, coeff, 5, cnst0, cnst1);
  cnst1 = __lsx_vpackev_h(cnst1, cnst0);
  DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst1, stp25);

  cnst1 = __lsx_vreplvei_h(coeff, 3);
  cnst1 = __lsx_vpackev_h(cnst0, cnst1);
  DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst1, stp22);

  /* stp4 */
  DUP2_ARG2(__lsx_vadd_h, stp34, stp25, stp33, stp22, in13, in10);

  vec1 = __lsx_vilvl_h(in13, in10);
  vec0 = __lsx_vilvh_h(in13, in10);
  DUP2_ARG2(__lsx_vreplvei_h, coeff1, 4, coeff1, 5, cnst0, cnst1);
  cnst0 = __lsx_vpackev_h(cnst0, cnst1);
  DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0, in8);
  __lsx_vst(in8, tmp_ptr, 128);

  cnst0 = __lsx_vreplvei_h(coeff2, 1);
  cnst0 = __lsx_vpackev_h(cnst1, cnst0);
  DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0, in8);
  __lsx_vst(in8, tmp_ptr, 320);

  DUP2_ARG2(__lsx_vsub_h, stp34, stp25, stp33, stp22, in12, in11);
  vec1 = __lsx_vilvl_h(in12, in11);
  vec0 = __lsx_vilvh_h(in12, in11);
  DUP2_ARG2(__lsx_vreplvei_h, coeff1, 6, coeff1, 7, cnst0, cnst1);
  cnst1 = __lsx_vpackev_h(cnst1, cnst0);

  DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst1, in8);
  __lsx_vst(in8, tmp_ptr, 384);

  cnst1 = __lsx_vreplvei_h(coeff2, 3);
  cnst0 = __lsx_vpackev_h(cnst0, cnst1);
  DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0, in8);
  __lsx_vst(in8, tmp_ptr, 64);
}

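/* Row pass of the 16x16 forward DCT: an 8x16 slice of intermediate
 * coefficients is transposed, rounded and shifted down by 2, run through the
 * even/odd 16-point kernels, then transposed back and stored to output. */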
void fdct16x8_1d_row(int16_t *input, int16_t *output) {
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
  __m128i in8, in9, in10, in11, in12, in13, in14, in15;
  int16_t *input_tmp = input;

  DUP4_ARG2(__lsx_vld, input, 0, input, 32, input, 64, input, 96, in0, in1, in2,
            in3);
  DUP4_ARG2(__lsx_vld, input, 128, input, 160, input, 192, input, 224, in4, in5,
            in6, in7);
  DUP4_ARG2(__lsx_vld, input_tmp, 16, input_tmp, 48, input_tmp, 80, input_tmp,
            112, in8, in9, in10, in11);
  DUP4_ARG2(__lsx_vld, input_tmp, 144, input_tmp, 176, input_tmp, 208,
            input_tmp, 240, in12, in13, in14, in15);

  LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
                     in4, in5, in6, in7);
  LSX_TRANSPOSE8x8_H(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
                     in10, in11, in12, in13, in14, in15);
  DUP4_ARG2(__lsx_vaddi_hu, in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3);
  DUP4_ARG2(__lsx_vaddi_hu, in4, 1, in5, 1, in6, 1, in7, 1, in4, in5, in6, in7);
  DUP4_ARG2(__lsx_vaddi_hu, in8, 1, in9, 1, in10, 1, in11, 1, in8, in9, in10,
            in11);
  DUP4_ARG2(__lsx_vaddi_hu, in12, 1, in13, 1, in14, 1, in15, 1, in12, in13,
            in14, in15);

  DUP4_ARG2(__lsx_vsrai_h, in0, 2, in1, 2, in2, 2, in3, 2, in0, in1, in2, in3);
  DUP4_ARG2(__lsx_vsrai_h, in4, 2, in5, 2, in6, 2, in7, 2, in4, in5, in6, in7);
  DUP4_ARG2(__lsx_vsrai_h, in8, 2, in9, 2, in10, 2, in11, 2, in8, in9, in10,
            in11);
  DUP4_ARG2(__lsx_vsrai_h, in12, 2, in13, 2, in14, 2, in15, 2, in12, in13, in14,
            in15);
  LSX_BUTTERFLY_16_H(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10,
                     in11, in12, in13, in14, in15, tmp0, tmp1, tmp2, tmp3, tmp4,
                     tmp5, tmp6, tmp7, in8, in9, in10, in11, in12, in13, in14,
                     in15);
  __lsx_vst(in8, input, 0);
  __lsx_vst(in9, input, 32);
  __lsx_vst(in10, input, 64);
  __lsx_vst(in11, input, 96);
  __lsx_vst(in12, input, 128);
  __lsx_vst(in13, input, 160);
  __lsx_vst(in14, input, 192);
  __lsx_vst(in15, input, 224);

  FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp0, tmp1,
                tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
  DUP4_ARG2(__lsx_vld, input, 0, input, 32, input, 64, input, 96, in8, in9,
            in10, in11);
  DUP4_ARG2(__lsx_vld, input, 128, input, 160, input, 192, input, 224, in12,
            in13, in14, in15);
  FDCT8x16_ODD(in8, in9, in10, in11, in12, in13, in14, in15, in0, in1, in2, in3,
               in4, in5, in6, in7);
  LSX_TRANSPOSE8x8_H(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, tmp0, in0,
                     tmp1, in1, tmp2, in2, tmp3, in3);
  __lsx_vst(tmp0, output, 0);
  __lsx_vst(in0, output, 32);
  __lsx_vst(tmp1, output, 64);
  __lsx_vst(in1, output, 96);
  __lsx_vst(tmp2, output, 128);
  __lsx_vst(in2, output, 160);
  __lsx_vst(tmp3, output, 192);
  __lsx_vst(in3, output, 224);

  LSX_TRANSPOSE8x8_H(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, tmp4, in4,
                     tmp5, in5, tmp6, in6, tmp7, in7);
  __lsx_vst(tmp4, output, 16);
  __lsx_vst(in4, output, 48);
  __lsx_vst(tmp5, output, 80);
  __lsx_vst(in5, output, 112);
  __lsx_vst(tmp6, output, 144);
  __lsx_vst(in6, output, 176);
  __lsx_vst(tmp7, output, 208);
  __lsx_vst(in7, output, 240);
}

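/* 4x4 forward DCT: inputs are pre-scaled by 16 (with the usual nudge of the
 * first coefficient when it is nonzero), run through two 4-point passes with
 * transposes in between, then rounded, shifted down and stored packed. */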
void vpx_fdct4x4_lsx(const int16_t *input, int16_t *output,
                     int32_t src_stride) {
  __m128i in0, in1, in2, in3;

  int32_t src_stride2 = src_stride << 1;
  int32_t src_stride4 = src_stride2 << 1;
  int32_t src_stride6 = src_stride4 + src_stride2;

  in0 = __lsx_vld(input, 0);
  DUP2_ARG2(__lsx_vldx, input, src_stride2, input, src_stride4, in1, in2);
  in3 = __lsx_vldx(input, src_stride6);

  /* fdct4 pre-process */
  {
    __m128i vec, mask;
    __m128i zero = __lsx_vldi(0);

    mask = __lsx_vinsgr2vr_b(zero, 1, 0);
    DUP4_ARG2(__lsx_vslli_h, in0, 4, in1, 4, in2, 4, in3, 4, in0, in1, in2,
              in3);
    vec = __lsx_vseqi_h(in0, 0);
    vec = __lsx_vxori_b(vec, 255);
    vec = __lsx_vand_v(mask, vec);
    in0 = __lsx_vadd_h(in0, vec);
  }

  VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3);
  LSX_TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3);
  VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3);
  LSX_TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3);
  DUP4_ARG2(__lsx_vaddi_hu, in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3);
  DUP4_ARG2(__lsx_vsrai_h, in0, 2, in1, 2, in2, 2, in3, 2, in0, in1, in2, in3);
  DUP2_ARG2(__lsx_vpickev_d, in1, in0, in3, in2, in0, in2);
  __lsx_vst(in0, output, 0);
  __lsx_vst(in2, output, 16);
}

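/* 8x8 forward DCT: 8 rows are loaded at src_stride, pre-scaled by 4, run
 * through two 8-point passes with a transpose between them, then scaled down
 * via SRLI_AVE_S_4V_H before being stored to output. */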
void vpx_fdct8x8_lsx(const int16_t *input, int16_t *output,
                     int32_t src_stride) {
  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
  int32_t src_stride2 = src_stride << 1;
  int32_t src_stride4 = src_stride2 << 1;
  int32_t src_stride6 = src_stride4 + src_stride2;
  int16_t *input_tmp = (int16_t *)input;

  in0 = __lsx_vld(input_tmp, 0);
  DUP2_ARG2(__lsx_vldx, input_tmp, src_stride2, input_tmp, src_stride4, in1,
            in2);
  in3 = __lsx_vldx(input_tmp, src_stride6);
  input_tmp += src_stride4;
  in4 = __lsx_vld(input_tmp, 0);
  DUP2_ARG2(__lsx_vldx, input_tmp, src_stride2, input_tmp, src_stride4, in5,
            in6);
  in7 = __lsx_vldx(input_tmp, src_stride6);

  DUP4_ARG2(__lsx_vslli_h, in0, 2, in1, 2, in2, 2, in3, 2, in0, in1, in2, in3);
  DUP4_ARG2(__lsx_vslli_h, in4, 2, in5, 2, in6, 2, in7, 2, in4, in5, in6, in7);

  VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4,
            in5, in6, in7);
  LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
                     in4, in5, in6, in7);
  VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4,
            in5, in6, in7);
  LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
                     in4, in5, in6, in7);
  SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7);

  __lsx_vst(in0, output, 0);
  __lsx_vst(in1, output, 16);
  __lsx_vst(in2, output, 32);
  __lsx_vst(in3, output, 48);
  __lsx_vst(in4, output, 64);
  __lsx_vst(in5, output, 80);
  __lsx_vst(in6, output, 96);
  __lsx_vst(in7, output, 112);
}

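/* 16x16 forward DCT: a column pass into a temporary 16x16 buffer (two
 * 8-column slices), followed by a row pass from that buffer into output. */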
void vpx_fdct16x16_lsx(const int16_t *input, int16_t *output,
                       int32_t src_stride) {
  int32_t i;
  DECLARE_ALIGNED(32, int16_t, tmp_buf[16 * 16]);

  /* column transform */
  for (i = 0; i < 2; ++i) {
    fdct8x16_1d_column((input + 8 * i), (&tmp_buf[0] + 8 * i), src_stride);
  }

  /* row transform */
  for (i = 0; i < 2; ++i) {
    fdct16x8_1d_row((&tmp_buf[0] + (128 * i)), (output + (128 * i)));
  }
}
#endif  // !CONFIG_VP9_HIGHBITDEPTH