/*
 *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#ifndef VPX_VPX_DSP_LOONGARCH_FWD_TXFM_LSX_H_
#define VPX_VPX_DSP_LOONGARCH_FWD_TXFM_LSX_H_

#include "vpx_dsp/loongarch/txfm_macros_lsx.h"
#include "vpx_dsp/txfm_common.h"

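/* Forward 4-point DCT over four vectors of eight int16_t elements
 * (eight columns transformed per invocation). coeff_m packs, in
 * halfword order, cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64
 * and -cospi_8_64 from vpx_dsp/txfm_common.h.
 *
 * A minimal usage sketch (hypothetical variable names):
 *
 *   __m128i r0, r1, r2, r3;  // rows 0..3 of eight 4-sample columns
 *   VP9_FDCT4(r0, r1, r2, r3, r0, r1, r2, r3);
 */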
#define VP9_FDCT4(in0, in1, in2, in3, out0, out1, out2, out3)                 \
  do {                                                                        \
    __m128i cnst0_m, cnst1_m, cnst2_m, cnst3_m;                               \
    __m128i vec0_m, vec1_m, vec2_m, vec3_m;                                   \
    __m128i vec4_m, vec5_m, vec6_m, vec7_m;                                   \
    __m128i coeff_m = { 0x187e3b21d2bf2d41, 0x000000000000c4df };             \
                                                                              \
    LSX_BUTTERFLY_4_H(in0, in1, in2, in3, vec0_m, vec1_m, vec2_m, vec3_m);    \
    DUP2_ARG2(__lsx_vilvl_h, vec1_m, vec0_m, vec3_m, vec2_m, vec0_m, vec2_m); \
    DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 0, coeff_m, 1, cnst0_m, cnst1_m);    \
    cnst1_m = __lsx_vpackev_h(cnst1_m, cnst0_m);                              \
    vec5_m = __lsx_vdp2_w_h(vec0_m, cnst1_m);                                 \
    DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 4, coeff_m, 3, cnst2_m, cnst3_m);    \
    cnst2_m = __lsx_vpackev_h(cnst3_m, cnst2_m);                              \
    vec7_m = __lsx_vdp2_w_h(vec2_m, cnst2_m);                                 \
                                                                              \
    vec4_m = __lsx_vdp2_w_h(vec0_m, cnst0_m);                                 \
    cnst2_m = __lsx_vreplvei_h(coeff_m, 2);                                   \
    cnst2_m = __lsx_vpackev_h(cnst2_m, cnst3_m);                              \
    vec6_m = __lsx_vdp2_w_h(vec2_m, cnst2_m);                                 \
                                                                              \
    DUP4_ARG3(__lsx_vssrarni_h_w, vec4_m, vec4_m, DCT_CONST_BITS, vec5_m,     \
              vec5_m, DCT_CONST_BITS, vec6_m, vec6_m, DCT_CONST_BITS, vec7_m, \
              vec7_m, DCT_CONST_BITS, out0, out2, out1, out3);                \
  } while (0)

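/* Forward 8-point DCT of eight row vectors, producing out0..out7 in
 * natural coefficient order. The second doubleword of coeff_m adds
 * cospi_4_64, cospi_28_64, cospi_12_64 and cospi_20_64 for the
 * odd-output stages.
 */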
#define VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \
                  out3, out4, out5, out6, out7)                             \
  do {                                                                      \
    __m128i s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m;                       \
    __m128i s7_m, x0_m, x1_m, x2_m, x3_m;                                   \
    __m128i coeff_m = { 0x187e3b21d2bf2d41, 0x238e35370c7c3ec5 };           \
                                                                            \
    /* FDCT stage1 */                                                       \
    LSX_BUTTERFLY_8_H(in0, in1, in2, in3, in4, in5, in6, in7, s0_m, s1_m,   \
                      s2_m, s3_m, s4_m, s5_m, s6_m, s7_m);                  \
    LSX_BUTTERFLY_4_H(s0_m, s1_m, s2_m, s3_m, x0_m, x1_m, x2_m, x3_m);      \
    DUP2_ARG2(__lsx_vilvh_h, x1_m, x0_m, x3_m, x2_m, s0_m, s2_m);           \
    DUP2_ARG2(__lsx_vilvl_h, x1_m, x0_m, x3_m, x2_m, s1_m, s3_m);           \
    DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 0, coeff_m, 1, x0_m, x1_m);        \
    x1_m = __lsx_vpackev_h(x1_m, x0_m);                                     \
    DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m, out4);                          \
                                                                            \
    DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 2, coeff_m, 3, x2_m, x3_m);        \
    x2_m = __lsx_vneg_h(x2_m);                                              \
    x2_m = __lsx_vpackev_h(x3_m, x2_m);                                     \
    DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m, out6);                          \
                                                                            \
    DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m, out0);                          \
    x2_m = __lsx_vreplvei_h(coeff_m, 2);                                    \
    x2_m = __lsx_vpackev_h(x2_m, x3_m);                                     \
    DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m, out2);                          \
                                                                            \
    /* stage2 */                                                            \
    s1_m = __lsx_vilvl_h(s5_m, s6_m);                                       \
    s0_m = __lsx_vilvh_h(s5_m, s6_m);                                       \
                                                                            \
    DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m, s6_m);                          \
    DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m, s5_m);                          \
                                                                            \
    /* stage3 */                                                            \
    LSX_BUTTERFLY_4_H(s4_m, s7_m, s6_m, s5_m, x0_m, x3_m, x2_m, x1_m);      \
                                                                            \
    /* stage4 */                                                            \
    DUP2_ARG2(__lsx_vilvh_h, x3_m, x0_m, x2_m, x1_m, s4_m, s6_m);           \
    DUP2_ARG2(__lsx_vilvl_h, x3_m, x0_m, x2_m, x1_m, s5_m, s7_m);           \
                                                                            \
    DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 4, coeff_m, 5, x0_m, x1_m);        \
    x1_m = __lsx_vpackev_h(x0_m, x1_m);                                     \
    DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x1_m, out1);                          \
                                                                            \
    DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 6, coeff_m, 7, x2_m, x3_m);        \
    x2_m = __lsx_vpackev_h(x3_m, x2_m);                                     \
    DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m, out5);                          \
                                                                            \
    x1_m = __lsx_vreplvei_h(coeff_m, 5);                                    \
    x0_m = __lsx_vneg_h(x0_m);                                              \
    x0_m = __lsx_vpackev_h(x1_m, x0_m);                                     \
    DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x0_m, out7);                          \
    x2_m = __lsx_vreplvei_h(coeff_m, 6);                                    \
    x3_m = __lsx_vneg_h(x3_m);                                              \
    x2_m = __lsx_vpackev_h(x2_m, x3_m);                                     \
    DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m, out3);                          \
  } while (0)

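/* Halve eight vectors in place. Adding the logical sign bit
 * (in >> 15) before the averaging shift makes the divide by two round
 * negative values toward zero: in = (in + (in < 0)) >> 1.
 */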
#define SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7)             \
  do {                                                                      \
    __m128i vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
                                                                            \
    DUP4_ARG2(__lsx_vsrli_h, in0, 15, in1, 15, in2, 15, in3, 15, vec0_m,    \
              vec1_m, vec2_m, vec3_m);                                      \
    DUP4_ARG2(__lsx_vsrli_h, in4, 15, in5, 15, in6, 15, in7, 15, vec4_m,    \
              vec5_m, vec6_m, vec7_m);                                      \
    DUP4_ARG2(__lsx_vavg_h, vec0_m, in0, vec1_m, in1, vec2_m, in2, vec3_m,  \
              in3, in0, in1, in2, in3);                                     \
    DUP4_ARG2(__lsx_vavg_h, vec4_m, in4, vec5_m, in5, vec6_m, in6, vec7_m,  \
              in7, in4, in5, in6, in7);                                     \
  } while (0)

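/* Post-process two coefficient vectors: vec = (vec + 1 + (vec > 0)) >> 2.
 * vslei_h builds a vec <= 0 mask, which vxori_b(., 255) inverts to a
 * vec > 0 mask before vand_v reduces it to 0/1 per lane.
 */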
#define FDCT32_POSTPROC_2V_POS_H(vec0, vec1) \
  do {                                       \
    __m128i tp0_m, tp1_m;                    \
    __m128i one = __lsx_vreplgr2vr_h(1);     \
                                             \
    tp0_m = __lsx_vslei_h(vec0, 0);          \
    tp1_m = __lsx_vslei_h(vec1, 0);          \
    tp0_m = __lsx_vxori_b(tp0_m, 255);       \
    tp1_m = __lsx_vxori_b(tp1_m, 255);       \
    vec0 = __lsx_vadd_h(vec0, one);          \
    vec1 = __lsx_vadd_h(vec1, one);          \
    tp0_m = __lsx_vand_v(one, tp0_m);        \
    tp1_m = __lsx_vand_v(one, tp1_m);        \
    vec0 = __lsx_vadd_h(vec0, tp0_m);        \
    vec1 = __lsx_vadd_h(vec1, tp1_m);        \
    vec0 = __lsx_vsrai_h(vec0, 2);           \
    vec1 = __lsx_vsrai_h(vec1, 2);           \
  } while (0)

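/* Same post-processing with the bias applied to negative lanes instead:
 * vec = (vec + 1 + (vec < 0)) >> 2. __lsx_vldi(0x401) replicates the
 * halfword constant 1.
 */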
#define FDCT_POSTPROC_2V_NEG_H(vec0, vec1) \
  do {                                     \
    __m128i tp0_m, tp1_m;                  \
    __m128i one_m = __lsx_vldi(0x401);     \
                                           \
    tp0_m = __lsx_vslti_h(vec0, 0);        \
    tp1_m = __lsx_vslti_h(vec1, 0);        \
    vec0 = __lsx_vadd_h(vec0, one_m);      \
    vec1 = __lsx_vadd_h(vec1, one_m);      \
    tp0_m = __lsx_vand_v(one_m, tp0_m);    \
    tp1_m = __lsx_vand_v(one_m, tp1_m);    \
    vec0 = __lsx_vadd_h(vec0, tp0_m);      \
    vec1 = __lsx_vadd_h(vec1, tp1_m);      \
    vec0 = __lsx_vsrai_h(vec0, 2);         \
    vec1 = __lsx_vsrai_h(vec1, 2);         \
  } while (0)

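/* 32-bit variant of the step above: vec = (vec + 1 + (vec < 0)) >> 2. */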
#define FDCT32_POSTPROC_NEG_W(vec)         \
  do {                                     \
    __m128i temp_m;                        \
    __m128i one_m = __lsx_vreplgr2vr_w(1); \
                                           \
    temp_m = __lsx_vslti_w(vec, 0);        \
    vec = __lsx_vadd_w(vec, one_m);        \
    temp_m = __lsx_vand_v(one_m, temp_m);  \
    vec = __lsx_vadd_w(vec, temp_m);       \
    vec = __lsx_vsrai_w(vec, 2);           \
  } while (0)

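/* Butterfly rotation kept at 32-bit precision (likely for the wide
 * 32-point paths, where 16-bit intermediates could overflow). For the
 * left and right register pairs it computes, with rounding by
 * DCT_CONST_BITS:
 *   out0, out1 = reg0 * const0 - reg1 * const1
 *   out2, out3 = reg1 * const0 + reg0 * const1
 */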
#define DOTP_CONST_PAIR_W(reg0_left, reg1_left, reg0_right, reg1_right,       \
                          const0, const1, out0, out1, out2, out3)             \
  do {                                                                        \
    __m128i s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m;                   \
    __m128i tp0_m, tp1_m, tp2_m, tp3_m, _tmp0, _tmp1;                         \
    __m128i k0_m = __lsx_vreplgr2vr_w((int32_t)const0);                       \
                                                                              \
    s0_m = __lsx_vreplgr2vr_w((int32_t)const1);                               \
    k0_m = __lsx_vpackev_w(s0_m, k0_m);                                       \
                                                                              \
    DUP2_ARG1(__lsx_vneg_w, reg1_left, reg1_right, _tmp0, _tmp1);             \
    s1_m = __lsx_vilvl_w(_tmp0, reg0_left);                                   \
    s0_m = __lsx_vilvh_w(_tmp0, reg0_left);                                   \
    s3_m = __lsx_vilvl_w(reg0_left, reg1_left);                               \
    s2_m = __lsx_vilvh_w(reg0_left, reg1_left);                               \
    s5_m = __lsx_vilvl_w(_tmp1, reg0_right);                                  \
    s4_m = __lsx_vilvh_w(_tmp1, reg0_right);                                  \
    s7_m = __lsx_vilvl_w(reg0_right, reg1_right);                             \
    s6_m = __lsx_vilvh_w(reg0_right, reg1_right);                             \
    DUP2_ARG2(__lsx_vdp2_d_w, s0_m, k0_m, s1_m, k0_m, tp0_m, tp1_m);          \
    DUP2_ARG2(__lsx_vdp2_d_w, s4_m, k0_m, s5_m, k0_m, tp2_m, tp3_m);          \
    DUP2_ARG3(__lsx_vssrarni_w_d, tp0_m, tp1_m, DCT_CONST_BITS, tp2_m, tp3_m, \
              DCT_CONST_BITS, out0, out1);                                    \
    DUP2_ARG2(__lsx_vdp2_d_w, s2_m, k0_m, s3_m, k0_m, tp0_m, tp1_m);          \
    DUP2_ARG2(__lsx_vdp2_d_w, s6_m, k0_m, s7_m, k0_m, tp2_m, tp3_m);          \
    DUP2_ARG3(__lsx_vssrarni_w_d, tp0_m, tp1_m, DCT_CONST_BITS, tp2_m, tp3_m, \
              DCT_CONST_BITS, out2, out3);                                    \
  } while (0)

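/* Load four 8-pixel rows from dst, add the int16_t residuals in0..in3,
 * saturate to unsigned 8-bit and store the rows back.
 */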
#define VP9_ADDBLK_ST8x4_UB(dst, _stride, _stride2, _stride3, in0, in1, in2,   \
                            in3)                                               \
  do {                                                                         \
    __m128i dst0_m, dst1_m, dst2_m, dst3_m;                                    \
    __m128i tmp0_m, tmp1_m;                                                    \
    __m128i res0_m, res1_m, res2_m, res3_m;                                    \
                                                                               \
    dst0_m = __lsx_vld(dst, 0);                                                \
    DUP2_ARG2(__lsx_vldx, dst, _stride, dst, _stride2, dst1_m, dst2_m);        \
    dst3_m = __lsx_vldx(dst, _stride3);                                        \
    DUP4_ARG2(__lsx_vsllwil_hu_bu, dst0_m, 0, dst1_m, 0, dst2_m, 0, dst3_m, 0, \
              res0_m, res1_m, res2_m, res3_m);                                 \
    DUP4_ARG2(__lsx_vadd_h, res0_m, in0, res1_m, in1, res2_m, in2, res3_m,     \
              in3, res0_m, res1_m, res2_m, res3_m);                            \
    DUP2_ARG3(__lsx_vssrarni_bu_h, res1_m, res0_m, 0, res3_m, res2_m, 0,       \
              tmp0_m, tmp1_m);                                                 \
    __lsx_vstelm_d(tmp0_m, dst, 0, 0);                                         \
    __lsx_vstelm_d(tmp0_m, dst + _stride, 0, 1);                               \
    __lsx_vstelm_d(tmp1_m, dst + _stride2, 0, 0);                              \
    __lsx_vstelm_d(tmp1_m, dst + _stride3, 0, 1);                              \
  } while (0)

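/* Even half of the 16-point forward DCT: the even-indexed coefficients
 * of a length-16 DCT are the 8-point DCT of the stage-1 sums, so this
 * body matches VP9_FDCT8.
 */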
#define FDCT8x16_EVEN(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                      out2, out3, out4, out5, out6, out7)                 \
  do {                                                                    \
    __m128i s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m;               \
    __m128i x0_m, x1_m, x2_m, x3_m;                                       \
    __m128i coeff_m = { 0x187e3b21d2bf2d41, 0x238e35370c7c3ec5 };         \
                                                                          \
    /* FDCT stage1 */                                                     \
    LSX_BUTTERFLY_8_H(in0, in1, in2, in3, in4, in5, in6, in7, s0_m, s1_m, \
                      s2_m, s3_m, s4_m, s5_m, s6_m, s7_m);                \
    LSX_BUTTERFLY_4_H(s0_m, s1_m, s2_m, s3_m, x0_m, x1_m, x2_m, x3_m);    \
    DUP2_ARG2(__lsx_vilvh_h, x1_m, x0_m, x3_m, x2_m, s0_m, s2_m);         \
    DUP2_ARG2(__lsx_vilvl_h, x1_m, x0_m, x3_m, x2_m, s1_m, s3_m);         \
    DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 0, coeff_m, 1, x0_m, x1_m);      \
    x1_m = __lsx_vpackev_h(x1_m, x0_m);                                   \
    DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m, out4);                        \
                                                                          \
    DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 2, coeff_m, 3, x2_m, x3_m);      \
    x2_m = __lsx_vneg_h(x2_m);                                            \
    x2_m = __lsx_vpackev_h(x3_m, x2_m);                                   \
    DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m, out6);                        \
                                                                          \
    DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m, out0);                        \
    x2_m = __lsx_vreplvei_h(coeff_m, 2);                                  \
    x2_m = __lsx_vpackev_h(x2_m, x3_m);                                   \
    DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m, out2);                        \
                                                                          \
    /* stage2 */                                                          \
    s1_m = __lsx_vilvl_h(s5_m, s6_m);                                     \
    s0_m = __lsx_vilvh_h(s5_m, s6_m);                                     \
                                                                          \
    DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m, s6_m);                        \
    DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m, s5_m);                        \
                                                                          \
    /* stage3 */                                                          \
    LSX_BUTTERFLY_4_H(s4_m, s7_m, s6_m, s5_m, x0_m, x3_m, x2_m, x1_m);    \
                                                                          \
    /* stage4 */                                                          \
    DUP2_ARG2(__lsx_vilvh_h, x3_m, x0_m, x2_m, x1_m, s4_m, s6_m);         \
    DUP2_ARG2(__lsx_vilvl_h, x3_m, x0_m, x2_m, x1_m, s5_m, s7_m);         \
                                                                          \
    DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 4, coeff_m, 5, x0_m, x1_m);      \
    x1_m = __lsx_vpackev_h(x0_m, x1_m);                                   \
    DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x1_m, out1);                        \
                                                                          \
    DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 6, coeff_m, 7, x2_m, x3_m);      \
    x2_m = __lsx_vpackev_h(x3_m, x2_m);                                   \
    DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m, out5);                        \
                                                                          \
    x1_m = __lsx_vreplvei_h(coeff_m, 5);                                  \
    x0_m = __lsx_vneg_h(x0_m);                                            \
    x0_m = __lsx_vpackev_h(x1_m, x0_m);                                   \
    DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x0_m, out7);                        \
                                                                          \
    x2_m = __lsx_vreplvei_h(coeff_m, 6);                                  \
    x3_m = __lsx_vneg_h(x3_m);                                            \
    x2_m = __lsx_vpackev_h(x2_m, x3_m);                                   \
    DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m, out3);                        \
  } while (0)

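/* Odd half of the 16-point forward DCT: consumes the eight stage-1
 * differences and emits the odd-indexed coefficients out1..out15.
 * coeff1_m packs cospi_2/30/10/22/6/26/14/18_64; coeff2_m holds negated
 * copies of cospi_2_64, cospi_6_64, cospi_22_64 and cospi_18_64.
 */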
#define FDCT8x16_ODD(input0, input1, input2, input3, input4, input5, input6,  \
                     input7, out1, out3, out5, out7, out9, out11, out13,      \
                     out15)                                                   \
  do {                                                                        \
    __m128i stp21_m, stp22_m, stp23_m, stp24_m, stp25_m, stp26_m;             \
    __m128i stp30_m, stp31_m, stp32_m, stp33_m, stp34_m, stp35_m;             \
    __m128i stp36_m, stp37_m, vec0_m, vec1_m;                                 \
    __m128i vec2_m, vec3_m, vec4_m, vec5_m, vec6_m;                           \
    __m128i cnst0_m, cnst1_m, cnst4_m, cnst5_m;                               \
    __m128i coeff_m = { 0x187e3b21d2bf2d41, 0x238e3537e782c4df };             \
    __m128i coeff1_m = { 0x289a317906463fb1, 0x12943d3f1e2b3871 };            \
    __m128i coeff2_m = { 0xed6cd766c78fc04f, 0x0 };                           \
                                                                              \
    /* stp 1 */                                                               \
    DUP2_ARG2(__lsx_vilvh_h, input2, input5, input3, input4, vec2_m, vec4_m); \
    DUP2_ARG2(__lsx_vilvl_h, input2, input5, input3, input4, vec3_m, vec5_m); \
                                                                              \
    cnst4_m = __lsx_vreplvei_h(coeff_m, 0);                                   \
    DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst4_m, stp25_m);                  \
                                                                              \
    cnst5_m = __lsx_vreplvei_h(coeff_m, 1);                                   \
    cnst5_m = __lsx_vpackev_h(cnst5_m, cnst4_m);                              \
    DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst5_m, stp22_m);                  \
    DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst4_m, stp24_m);                  \
    DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst5_m, stp23_m);                  \
                                                                              \
    /* stp2 */                                                                \
    LSX_BUTTERFLY_4_H(input0, input1, stp22_m, stp23_m, stp30_m, stp31_m,     \
                      stp32_m, stp33_m);                                      \
    LSX_BUTTERFLY_4_H(input7, input6, stp25_m, stp24_m, stp37_m, stp36_m,     \
                      stp35_m, stp34_m);                                      \
                                                                              \
    DUP2_ARG2(__lsx_vilvh_h, stp36_m, stp31_m, stp35_m, stp32_m, vec2_m,      \
              vec4_m);                                                        \
    DUP2_ARG2(__lsx_vilvl_h, stp36_m, stp31_m, stp35_m, stp32_m, vec3_m,      \
              vec5_m);                                                        \
                                                                              \
    DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 2, coeff_m, 3, cnst0_m, cnst1_m);    \
    cnst0_m = __lsx_vpackev_h(cnst0_m, cnst1_m);                              \
    DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst0_m, stp26_m);                  \
                                                                              \
    cnst0_m = __lsx_vreplvei_h(coeff_m, 4);                                   \
    cnst1_m = __lsx_vpackev_h(cnst1_m, cnst0_m);                              \
    DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst1_m, stp21_m);                  \
                                                                              \
    DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 5, coeff_m, 2, cnst0_m, cnst1_m);    \
    cnst1_m = __lsx_vpackev_h(cnst0_m, cnst1_m);                              \
    DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst1_m, stp25_m);                  \
                                                                              \
    cnst0_m = __lsx_vreplvei_h(coeff_m, 3);                                   \
    cnst1_m = __lsx_vpackev_h(cnst1_m, cnst0_m);                              \
    DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst1_m, stp22_m);                  \
                                                                              \
    /* stp4 */                                                                \
    LSX_BUTTERFLY_4_H(stp30_m, stp37_m, stp26_m, stp21_m, vec6_m, vec2_m,     \
                      vec4_m, vec5_m);                                        \
    LSX_BUTTERFLY_4_H(stp33_m, stp34_m, stp25_m, stp22_m, stp21_m, stp23_m,   \
                      stp24_m, stp31_m);                                      \
                                                                              \
    vec1_m = __lsx_vilvl_h(vec2_m, vec6_m);                                   \
    vec0_m = __lsx_vilvh_h(vec2_m, vec6_m);                                   \
    DUP2_ARG2(__lsx_vreplvei_h, coeff1_m, 0, coeff1_m, 1, cnst0_m, cnst1_m);  \
    cnst0_m = __lsx_vpackev_h(cnst0_m, cnst1_m);                              \
                                                                              \
    DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m, out1);                     \
                                                                              \
    cnst0_m = __lsx_vreplvei_h(coeff2_m, 0);                                  \
    cnst0_m = __lsx_vpackev_h(cnst1_m, cnst0_m);                              \
    DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m, out15);                    \
                                                                              \
    vec1_m = __lsx_vilvl_h(vec4_m, vec5_m);                                   \
    vec0_m = __lsx_vilvh_h(vec4_m, vec5_m);                                   \
    DUP2_ARG2(__lsx_vreplvei_h, coeff1_m, 2, coeff1_m, 3, cnst0_m, cnst1_m);  \
    cnst1_m = __lsx_vpackev_h(cnst1_m, cnst0_m);                              \
                                                                              \
    DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m, out9);                     \
                                                                              \
    cnst1_m = __lsx_vreplvei_h(coeff2_m, 2);                                  \
    cnst0_m = __lsx_vpackev_h(cnst0_m, cnst1_m);                              \
    DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m, out7);                     \
                                                                              \
    vec1_m = __lsx_vilvl_h(stp23_m, stp21_m);                                 \
    vec0_m = __lsx_vilvh_h(stp23_m, stp21_m);                                 \
    DUP2_ARG2(__lsx_vreplvei_h, coeff1_m, 4, coeff1_m, 5, cnst0_m, cnst1_m);  \
    cnst0_m = __lsx_vpackev_h(cnst0_m, cnst1_m);                              \
    DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m, out5);                     \
                                                                              \
    cnst0_m = __lsx_vreplvei_h(coeff2_m, 1);                                  \
    cnst0_m = __lsx_vpackev_h(cnst1_m, cnst0_m);                              \
    DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m, out11);                    \
                                                                              \
    vec1_m = __lsx_vilvl_h(stp24_m, stp31_m);                                 \
    vec0_m = __lsx_vilvh_h(stp24_m, stp31_m);                                 \
    DUP2_ARG2(__lsx_vreplvei_h, coeff1_m, 6, coeff1_m, 7, cnst0_m, cnst1_m);  \
    cnst1_m = __lsx_vpackev_h(cnst1_m, cnst0_m);                              \
                                                                              \
    DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m, out13);                    \
                                                                              \
    cnst1_m = __lsx_vreplvei_h(coeff2_m, 3);                                  \
    cnst0_m = __lsx_vpackev_h(cnst0_m, cnst1_m);                              \
    DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m, out3);                     \
  } while (0)

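/* 1-D column and row passes shared by the LSX 16x16 forward transform,
 * presumably defined in the matching fwd_txfm_lsx.c.
 */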
void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr,
                        int32_t src_stride);
void fdct16x8_1d_row(int16_t *input, int16_t *output);
#endif  // VPX_VPX_DSP_LOONGARCH_FWD_TXFM_LSX_H_