/*
 *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#ifndef VPX_VPX_DSP_LOONGARCH_FWD_TXFM_LSX_H_
#define VPX_VPX_DSP_LOONGARCH_FWD_TXFM_LSX_H_

#include "vpx_dsp/loongarch/txfm_macros_lsx.h"
#include "vpx_dsp/txfm_common.h"

/* 4-point forward 1-D DCT on 16-bit data (used by the 4x4 transform). */
#define VP9_FDCT4(in0, in1, in2, in3, out0, out1, out2, out3) \
  do { \
    __m128i cnst0_m, cnst1_m, cnst2_m, cnst3_m; \
    __m128i vec0_m, vec1_m, vec2_m, vec3_m; \
    __m128i vec4_m, vec5_m, vec6_m, vec7_m; \
    __m128i coeff_m = { 0x187e3b21d2bf2d41, 0x000000000000c4df }; \
    \
    LSX_BUTTERFLY_4_H(in0, in1, in2, in3, vec0_m, vec1_m, vec2_m, vec3_m); \
    DUP2_ARG2(__lsx_vilvl_h, vec1_m, vec0_m, vec3_m, vec2_m, vec0_m, vec2_m); \
    DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 0, coeff_m, 1, cnst0_m, cnst1_m); \
    cnst1_m = __lsx_vpackev_h(cnst1_m, cnst0_m); \
    vec5_m = __lsx_vdp2_w_h(vec0_m, cnst1_m); \
    DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 4, coeff_m, 3, cnst2_m, cnst3_m); \
    cnst2_m = __lsx_vpackev_h(cnst3_m, cnst2_m); \
    vec7_m = __lsx_vdp2_w_h(vec2_m, cnst2_m); \
    \
    vec4_m = __lsx_vdp2_w_h(vec0_m, cnst0_m); \
    cnst2_m = __lsx_vreplvei_h(coeff_m, 2); \
    cnst2_m = __lsx_vpackev_h(cnst2_m, cnst3_m); \
    vec6_m = __lsx_vdp2_w_h(vec2_m, cnst2_m); \
    \
    DUP4_ARG3(__lsx_vssrarni_h_w, vec4_m, vec4_m, DCT_CONST_BITS, vec5_m, \
              vec5_m, DCT_CONST_BITS, vec6_m, vec6_m, DCT_CONST_BITS, vec7_m, \
              vec7_m, DCT_CONST_BITS, out0, out2, out1, out3); \
  } while (0)

/* 8-point forward 1-D DCT on eight columns of 16-bit data. */
#define VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \
                  out3, out4, out5, out6, out7) \
  do { \
    __m128i s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m; \
    __m128i s7_m, x0_m, x1_m, x2_m, x3_m; \
    __m128i coeff_m = { 0x187e3b21d2bf2d41, 0x238e35370c7c3ec5 }; \
    \
    /* FDCT stage1 */ \
    LSX_BUTTERFLY_8_H(in0, in1, in2, in3, in4, in5, in6, in7, s0_m, s1_m, \
                      s2_m, s3_m, s4_m, s5_m, s6_m, s7_m); \
    LSX_BUTTERFLY_4_H(s0_m, s1_m, s2_m, s3_m, x0_m, x1_m, x2_m, x3_m); \
    DUP2_ARG2(__lsx_vilvh_h, x1_m, x0_m, x3_m, x2_m, s0_m, s2_m); \
    DUP2_ARG2(__lsx_vilvl_h, x1_m, x0_m, x3_m, x2_m, s1_m, s3_m); \
    DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 0, coeff_m, 1, x0_m, x1_m); \
    x1_m = __lsx_vpackev_h(x1_m, x0_m); \
    DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m, out4); \
    \
    DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 2, coeff_m, 3, x2_m, x3_m); \
    x2_m = __lsx_vneg_h(x2_m); \
    x2_m = __lsx_vpackev_h(x3_m, x2_m); \
    DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m, out6); \
    \
    DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m, out0); \
    x2_m = __lsx_vreplvei_h(coeff_m, 2); \
    x2_m = __lsx_vpackev_h(x2_m, x3_m); \
    DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m, out2); \
    \
    /* stage2 */ \
    s1_m = __lsx_vilvl_h(s5_m, s6_m); \
    s0_m = __lsx_vilvh_h(s5_m, s6_m); \
    \
    DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m, s6_m); \
    DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m, s5_m); \
    \
    /* stage3 */ \
    LSX_BUTTERFLY_4_H(s4_m, s7_m, s6_m, s5_m, x0_m, x3_m, x2_m, x1_m); \
    \
    /* stage4 */ \
    DUP2_ARG2(__lsx_vilvh_h, x3_m, x0_m, x2_m, x1_m, s4_m, s6_m); \
    DUP2_ARG2(__lsx_vilvl_h, x3_m, x0_m, x2_m, x1_m, s5_m, s7_m); \
    \
    DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 4, coeff_m, 5, x0_m, x1_m); \
    x1_m = __lsx_vpackev_h(x0_m, x1_m); \
    DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x1_m, out1); \
    \
    DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 6, coeff_m, 7, x2_m, x3_m); \
    x2_m = __lsx_vpackev_h(x3_m, x2_m); \
    DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m, out5); \
    \
    x1_m = __lsx_vreplvei_h(coeff_m, 5); \
    x0_m = __lsx_vneg_h(x0_m); \
    x0_m = __lsx_vpackev_h(x1_m, x0_m); \
    DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x0_m, out7); \
    x2_m = __lsx_vreplvei_h(coeff_m, 6); \
    x3_m = __lsx_vneg_h(x3_m); \
    x2_m = __lsx_vpackev_h(x2_m, x3_m); \
    DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m, out3); \
  } while (0)

/* In place: divide each 16-bit element of in0..in7 by 2, rounding toward
   zero. */
#define SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7) \
  do { \
    __m128i vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
    \
    DUP4_ARG2(__lsx_vsrli_h, in0, 15, in1, 15, in2, 15, in3, 15, vec0_m, \
              vec1_m, vec2_m, vec3_m); \
    DUP4_ARG2(__lsx_vsrli_h, in4, 15, in5, 15, in6, 15, in7, 15, vec4_m, \
              vec5_m, vec6_m, vec7_m); \
    DUP4_ARG2(__lsx_vavg_h, vec0_m, in0, vec1_m, in1, vec2_m, in2, vec3_m, \
              in3, in0, in1, in2, in3); \
    DUP4_ARG2(__lsx_vavg_h, vec4_m, in4, vec5_m, in5, vec6_m, in6, vec7_m, \
              in7, in4, in5, in6, in7); \
  } while (0)

/* In place: vec = (vec + 1 + (vec > 0)) >> 2 for each 16-bit element. */
#define FDCT32_POSTPROC_2V_POS_H(vec0, vec1) \
  do { \
    __m128i tp0_m, tp1_m; \
    __m128i one = __lsx_vreplgr2vr_h(1); \
    \
    tp0_m = __lsx_vslei_h(vec0, 0); \
    tp1_m = __lsx_vslei_h(vec1, 0); \
    tp0_m = __lsx_vxori_b(tp0_m, 255); \
    tp1_m = __lsx_vxori_b(tp1_m, 255); \
    vec0 = __lsx_vadd_h(vec0, one); \
    vec1 = __lsx_vadd_h(vec1, one); \
    tp0_m = __lsx_vand_v(one, tp0_m); \
    tp1_m = __lsx_vand_v(one, tp1_m); \
    vec0 = __lsx_vadd_h(vec0, tp0_m); \
    vec1 = __lsx_vadd_h(vec1, tp1_m); \
    vec0 = __lsx_vsrai_h(vec0, 2); \
    vec1 = __lsx_vsrai_h(vec1, 2); \
  } while (0)

/* In place: vec = (vec + 1 + (vec < 0)) >> 2 for each 16-bit element. */
#define FDCT_POSTPROC_2V_NEG_H(vec0, vec1) \
  do { \
    __m128i tp0_m, tp1_m; \
    __m128i one_m = __lsx_vldi(0x401); \
    \
    tp0_m = __lsx_vslti_h(vec0, 0); \
    tp1_m = __lsx_vslti_h(vec1, 0); \
    vec0 = __lsx_vadd_h(vec0, one_m); \
    vec1 = __lsx_vadd_h(vec1, one_m); \
    tp0_m = __lsx_vand_v(one_m, tp0_m); \
    tp1_m = __lsx_vand_v(one_m, tp1_m); \
    vec0 = __lsx_vadd_h(vec0, tp0_m); \
    vec1 = __lsx_vadd_h(vec1, tp1_m); \
    vec0 = __lsx_vsrai_h(vec0, 2); \
    vec1 = __lsx_vsrai_h(vec1, 2); \
  } while (0)

/* In place: vec = (vec + 1 + (vec < 0)) >> 2 for each 32-bit element. */
#define FDCT32_POSTPROC_NEG_W(vec) \
  do { \
    __m128i temp_m; \
    __m128i one_m = __lsx_vreplgr2vr_w(1); \
    \
    temp_m = __lsx_vslti_w(vec, 0); \
    vec = __lsx_vadd_w(vec, one_m); \
    temp_m = __lsx_vand_v(one_m, temp_m); \
    vec = __lsx_vadd_w(vec, temp_m); \
    vec = __lsx_vsrai_w(vec, 2); \
  } while (0)

/* Butterfly rotation of two pairs of 32-bit vectors by the constant pair
   (const0, const1), with a rounding right shift by DCT_CONST_BITS. */
#define DOTP_CONST_PAIR_W(reg0_left, reg1_left, reg0_right, reg1_right, \
                          const0, const1, out0, out1, out2, out3) \
  do { \
    __m128i s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m; \
    __m128i tp0_m, tp1_m, tp2_m, tp3_m, _tmp0, _tmp1; \
    __m128i k0_m = __lsx_vreplgr2vr_w((int32_t)const0); \
    \
    s0_m = __lsx_vreplgr2vr_w((int32_t)const1); \
    k0_m = __lsx_vpackev_w(s0_m, k0_m); \
    \
    DUP2_ARG1(__lsx_vneg_w, reg1_left, reg1_right, _tmp0, _tmp1); \
    s1_m = __lsx_vilvl_w(_tmp0, reg0_left); \
    s0_m = __lsx_vilvh_w(_tmp0, reg0_left); \
    s3_m = __lsx_vilvl_w(reg0_left, reg1_left); \
    s2_m = __lsx_vilvh_w(reg0_left, reg1_left); \
    s5_m = __lsx_vilvl_w(_tmp1, reg0_right); \
    s4_m = __lsx_vilvh_w(_tmp1, reg0_right); \
    s7_m = __lsx_vilvl_w(reg0_right, reg1_right); \
    s6_m = __lsx_vilvh_w(reg0_right, reg1_right); \
    DUP2_ARG2(__lsx_vdp2_d_w, s0_m, k0_m, s1_m, k0_m, tp0_m, tp1_m); \
    DUP2_ARG2(__lsx_vdp2_d_w, s4_m, k0_m, s5_m, k0_m, tp2_m, tp3_m); \
    DUP2_ARG3(__lsx_vssrarni_w_d, tp0_m, tp1_m, DCT_CONST_BITS, tp2_m, tp3_m, \
              DCT_CONST_BITS, out0, out1); \
    DUP2_ARG2(__lsx_vdp2_d_w, s2_m, k0_m, s3_m, k0_m, tp0_m, tp1_m); \
    DUP2_ARG2(__lsx_vdp2_d_w, s6_m, k0_m, s7_m, k0_m, tp2_m, tp3_m); \
    DUP2_ARG3(__lsx_vssrarni_w_d, tp0_m, tp1_m, DCT_CONST_BITS, tp2_m, tp3_m, \
              DCT_CONST_BITS, out2, out3); \
  } while (0)

/* Add four rows of eight 16-bit residuals (in0..in3) to the 8-bit pixels at
   dst and store the saturated result back. */
#define VP9_ADDBLK_ST8x4_UB(dst, _stride, _stride2, _stride3, in0, in1, in2, \
                            in3) \
  do { \
    __m128i dst0_m, dst1_m, dst2_m, dst3_m; \
    __m128i tmp0_m, tmp1_m; \
    __m128i res0_m, res1_m, res2_m, res3_m; \
    \
    dst0_m = __lsx_vld(dst, 0); \
    DUP2_ARG2(__lsx_vldx, dst, _stride, dst, _stride2, dst1_m, dst2_m); \
    dst3_m = __lsx_vldx(dst, _stride3); \
    DUP4_ARG2(__lsx_vsllwil_hu_bu, dst0_m, 0, dst1_m, 0, dst2_m, 0, dst3_m, 0, \
              res0_m, res1_m, res2_m, res3_m); \
    DUP4_ARG2(__lsx_vadd_h, res0_m, in0, res1_m, in1, res2_m, in2, res3_m, \
              in3, res0_m, res1_m, res2_m, res3_m); \
    DUP2_ARG3(__lsx_vssrarni_bu_h, res1_m, res0_m, 0, res3_m, res2_m, 0, \
              tmp0_m, tmp1_m); \
    __lsx_vstelm_d(tmp0_m, dst, 0, 0); \
    __lsx_vstelm_d(tmp0_m, dst + _stride, 0, 1); \
    __lsx_vstelm_d(tmp1_m, dst + _stride2, 0, 0); \
    __lsx_vstelm_d(tmp1_m, dst + _stride3, 0, 1); \
  } while (0)

/* Even-output half of the 8x16 forward DCT (same flow as VP9_FDCT8). */
#define FDCT8x16_EVEN(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                      out2, out3, out4, out5, out6, out7) \
  do { \
    __m128i s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m; \
    __m128i x0_m, x1_m, x2_m, x3_m; \
    __m128i coeff_m = { 0x187e3b21d2bf2d41, 0x238e35370c7c3ec5 }; \
    \
    /* FDCT stage1 */ \
    LSX_BUTTERFLY_8_H(in0, in1, in2, in3, in4, in5, in6, in7, s0_m, s1_m, \
                      s2_m, s3_m, s4_m, s5_m, s6_m, s7_m); \
    LSX_BUTTERFLY_4_H(s0_m, s1_m, s2_m, s3_m, x0_m, x1_m, x2_m, x3_m); \
    DUP2_ARG2(__lsx_vilvh_h, x1_m, x0_m, x3_m, x2_m, s0_m, s2_m); \
    DUP2_ARG2(__lsx_vilvl_h, x1_m, x0_m, x3_m, x2_m, s1_m, s3_m); \
    DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 0, coeff_m, 1, x0_m, x1_m); \
    x1_m = __lsx_vpackev_h(x1_m, x0_m); \
    DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m, out4); \
    \
    DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 2, coeff_m, 3, x2_m, x3_m); \
    x2_m = __lsx_vneg_h(x2_m); \
    x2_m = __lsx_vpackev_h(x3_m, x2_m); \
    DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m, out6); \
    \
    DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m, out0); \
    x2_m = __lsx_vreplvei_h(coeff_m, 2); \
    x2_m = __lsx_vpackev_h(x2_m, x3_m); \
    DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m, out2); \
    \
    /* stage2 */ \
    s1_m = __lsx_vilvl_h(s5_m, s6_m); \
    s0_m = __lsx_vilvh_h(s5_m, s6_m); \
    \
    DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m, s6_m); \
    DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m, s5_m); \
    \
    /* stage3 */ \
    LSX_BUTTERFLY_4_H(s4_m, s7_m, s6_m, s5_m, x0_m, x3_m, x2_m, x1_m); \
    \
    /* stage4 */ \
    DUP2_ARG2(__lsx_vilvh_h, x3_m, x0_m, x2_m, x1_m, s4_m, s6_m); \
    DUP2_ARG2(__lsx_vilvl_h, x3_m, x0_m, x2_m, x1_m, s5_m, s7_m); \
    \
    DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 4, coeff_m, 5, x0_m, x1_m); \
    x1_m = __lsx_vpackev_h(x0_m, x1_m); \
    DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x1_m, out1); \
    \
    DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 6, coeff_m, 7, x2_m, x3_m); \
    x2_m = __lsx_vpackev_h(x3_m, x2_m); \
    DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m, out5); \
    \
    x1_m = __lsx_vreplvei_h(coeff_m, 5); \
    x0_m = __lsx_vneg_h(x0_m); \
    x0_m = __lsx_vpackev_h(x1_m, x0_m); \
    DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x0_m, out7); \
    \
    x2_m = __lsx_vreplvei_h(coeff_m, 6); \
    x3_m = __lsx_vneg_h(x3_m); \
    x2_m = __lsx_vpackev_h(x2_m, x3_m); \
    DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m, out3); \
  } while (0)

/* Odd-output half of the 8x16 forward DCT. */
#define FDCT8x16_ODD(input0, input1, input2, input3, input4, input5, input6, \
                     input7, out1, out3, out5, out7, out9, out11, out13, \
                     out15) \
  do { \
    __m128i stp21_m, stp22_m, stp23_m, stp24_m, stp25_m, stp26_m; \
    __m128i stp30_m, stp31_m, stp32_m, stp33_m, stp34_m, stp35_m; \
    __m128i stp36_m, stp37_m, vec0_m, vec1_m; \
    __m128i vec2_m, vec3_m, vec4_m, vec5_m, vec6_m; \
    __m128i cnst0_m, cnst1_m, cnst4_m, cnst5_m; \
    __m128i coeff_m = { 0x187e3b21d2bf2d41, 0x238e3537e782c4df }; \
    __m128i coeff1_m = { 0x289a317906463fb1, 0x12943d3f1e2b3871 }; \
    __m128i coeff2_m = { 0xed6cd766c78fc04f, 0x0 }; \
    \
    /* stp 1 */ \
    DUP2_ARG2(__lsx_vilvh_h, input2, input5, input3, input4, vec2_m, vec4_m); \
    DUP2_ARG2(__lsx_vilvl_h, input2, input5, input3, input4, vec3_m, vec5_m); \
    \
    cnst4_m = __lsx_vreplvei_h(coeff_m, 0); \
    DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst4_m, stp25_m); \
    \
    cnst5_m = __lsx_vreplvei_h(coeff_m, 1); \
    cnst5_m = __lsx_vpackev_h(cnst5_m, cnst4_m); \
    DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst5_m, stp22_m); \
    DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst4_m, stp24_m); \
    DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst5_m, stp23_m); \
    \
    /* stp2 */ \
    LSX_BUTTERFLY_4_H(input0, input1, stp22_m, stp23_m, stp30_m, stp31_m, \
                      stp32_m, stp33_m); \
    LSX_BUTTERFLY_4_H(input7, input6, stp25_m, stp24_m, stp37_m, stp36_m, \
                      stp35_m, stp34_m); \
    \
    DUP2_ARG2(__lsx_vilvh_h, stp36_m, stp31_m, stp35_m, stp32_m, vec2_m, \
              vec4_m); \
    DUP2_ARG2(__lsx_vilvl_h, stp36_m, stp31_m, stp35_m, stp32_m, vec3_m, \
              vec5_m); \
    \
    DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 2, coeff_m, 3, cnst0_m, cnst1_m); \
    cnst0_m = __lsx_vpackev_h(cnst0_m, cnst1_m); \
    DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst0_m, stp26_m); \
    \
    cnst0_m = __lsx_vreplvei_h(coeff_m, 4); \
    cnst1_m = __lsx_vpackev_h(cnst1_m, cnst0_m); \
    DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst1_m, stp21_m); \
    \
    DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 5, coeff_m, 2, cnst0_m, cnst1_m); \
    cnst1_m = __lsx_vpackev_h(cnst0_m, cnst1_m); \
    DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst1_m, stp25_m); \
    \
    cnst0_m = __lsx_vreplvei_h(coeff_m, 3); \
    cnst1_m = __lsx_vpackev_h(cnst1_m, cnst0_m); \
    DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst1_m, stp22_m); \
    \
    /* stp4 */ \
    LSX_BUTTERFLY_4_H(stp30_m, stp37_m, stp26_m, stp21_m, vec6_m, vec2_m, \
                      vec4_m, vec5_m); \
    LSX_BUTTERFLY_4_H(stp33_m, stp34_m, stp25_m, stp22_m, stp21_m, stp23_m, \
                      stp24_m, stp31_m); \
    \
    vec1_m = __lsx_vilvl_h(vec2_m, vec6_m); \
    vec0_m = __lsx_vilvh_h(vec2_m, vec6_m); \
    DUP2_ARG2(__lsx_vreplvei_h, coeff1_m, 0, coeff1_m, 1, cnst0_m, cnst1_m); \
    cnst0_m = __lsx_vpackev_h(cnst0_m, cnst1_m); \
    \
    DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m, out1); \
    \
    cnst0_m = __lsx_vreplvei_h(coeff2_m, 0); \
    cnst0_m = __lsx_vpackev_h(cnst1_m, cnst0_m); \
    DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m, out15); \
    \
    vec1_m = __lsx_vilvl_h(vec4_m, vec5_m); \
    vec0_m = __lsx_vilvh_h(vec4_m, vec5_m); \
    DUP2_ARG2(__lsx_vreplvei_h, coeff1_m, 2, coeff1_m, 3, cnst0_m, cnst1_m); \
    cnst1_m = __lsx_vpackev_h(cnst1_m, cnst0_m); \
    \
    DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m, out9); \
    \
    cnst1_m = __lsx_vreplvei_h(coeff2_m, 2); \
    cnst0_m = __lsx_vpackev_h(cnst0_m, cnst1_m); \
    DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m, out7); \
    \
    vec1_m = __lsx_vilvl_h(stp23_m, stp21_m); \
    vec0_m = __lsx_vilvh_h(stp23_m, stp21_m); \
    DUP2_ARG2(__lsx_vreplvei_h, coeff1_m, 4, coeff1_m, 5, cnst0_m, cnst1_m); \
    cnst0_m = __lsx_vpackev_h(cnst0_m, cnst1_m); \
    DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m, out5); \
    \
    cnst0_m = __lsx_vreplvei_h(coeff2_m, 1); \
    cnst0_m = __lsx_vpackev_h(cnst1_m, cnst0_m); \
    DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m, out11); \
    \
    vec1_m = __lsx_vilvl_h(stp24_m, stp31_m); \
    vec0_m = __lsx_vilvh_h(stp24_m, stp31_m); \
    DUP2_ARG2(__lsx_vreplvei_h, coeff1_m, 6, coeff1_m, 7, cnst0_m, cnst1_m); \
    cnst1_m = __lsx_vpackev_h(cnst1_m, cnst0_m); \
    \
    DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m, out13); \
    \
    cnst1_m = __lsx_vreplvei_h(coeff2_m, 3); \
    cnst0_m = __lsx_vpackev_h(cnst0_m, cnst1_m); \
    DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m, out3); \
  } while (0)

/* 1-D forward DCT passes on 16-bit data: column transform (with a source
   stride) and row transform. */
void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr,
                        int32_t src_stride);
void fdct16x8_1d_row(int16_t *input, int16_t *output);
#endif  // VPX_VPX_DSP_LOONGARCH_FWD_TXFM_LSX_H_
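
/* Illustrative usage sketch (not part of the library): assuming four rows of
 * 16-bit input have been loaded into LSX vectors, one 1-D 4-point pass with
 * VP9_FDCT4 could look like the lines below. The names src (pointer to
 * int16_t rows) and stride (a byte stride) are assumptions made for this
 * example; only the low four lanes of each input vector contribute to the
 * result.
 *
 *   __m128i r0, r1, r2, r3, o0, o1, o2, o3;
 *   r0 = __lsx_vld(src, 0);
 *   r1 = __lsx_vldx(src, stride);
 *   r2 = __lsx_vldx(src, stride * 2);
 *   r3 = __lsx_vldx(src, stride * 3);
 *   VP9_FDCT4(r0, r1, r2, r3, o0, o1, o2, o3);
 */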