xref: /aosp_15_r20/external/libvpx/vp8/encoder/loongarch/dct_lsx.c (revision fb1b10ab9aebc7c7068eedab379b749d7e3900be)
/*
 *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10*fb1b10abSAndroid Build Coastguard Worker 
11*fb1b10abSAndroid Build Coastguard Worker #include <stdint.h>
12*fb1b10abSAndroid Build Coastguard Worker #include "./vp8_rtcd.h"
13*fb1b10abSAndroid Build Coastguard Worker #include "vpx_util/loongson_intrinsics.h"
14*fb1b10abSAndroid Build Coastguard Worker 
/*
 * Transposes two 4x4 blocks of 16-bit lanes at once: one held in the low
 * 64 bits of _in0.._in3 and one held in the high 64 bits.
 *
 * With rows a = _in0, b = _in1, c = _in2, d = _in3, the results are
 *   _outN = { aN, bN, cN, dN, a(N+4), b(N+4), c(N+4), d(N+4) }  for N = 0..3.
 *
 * Every input is read into a temporary before any output is written, so the
 * outputs may name the same variables as the inputs.
 */
#define LSX_TRANSPOSE4x4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  {                                                                            \
    __m128i _s0, _s1, _s2, _s3, _t0, _t1, _t2, _t3;                            \
                                                                               \
    DUP2_ARG2(__lsx_vilvl_h, _in2, _in0, _in3, _in1, _s0, _s1);                \
    DUP2_ARG2(__lsx_vilvh_h, _in2, _in0, _in3, _in1, _s2, _s3);                \
    _t0 = __lsx_vilvl_h(_s1, _s0);                                             \
    _t1 = __lsx_vilvh_h(_s1, _s0);                                             \
    _t2 = __lsx_vilvl_h(_s3, _s2);                                             \
    _t3 = __lsx_vilvh_h(_s3, _s2);                                             \
    DUP2_ARG2(__lsx_vpickev_d, _t2, _t0, _t3, _t1, _out0, _out2);              \
    DUP2_ARG2(__lsx_vpickod_d, _t2, _t0, _t3, _t1, _out1, _out3);              \
  }
28*fb1b10abSAndroid Build Coastguard Worker 
/*
 * Builds the two multiplier vectors consumed by the halfword dot-product
 * steps below.  val0, val1 and val2 are 16-bit lane indices into coeff and
 * are passed straight to __lsx_vreplvei_h.
 *
 *   const1: even 16-bit lanes = coeff[val0], odd lanes = coeff[val1]
 *   const2: even 16-bit lanes = coeff[val2], odd lanes = coeff[val0]
 */
#define SET_DOTP_VALUES(coeff, val0, val1, val2, const1, const2)           \
  {                                                                        \
    __m128i tmp0_m, tmp1_m, tmp2_m;                                        \
                                                                           \
    tmp0_m = __lsx_vreplvei_h(coeff, val0);                                \
    DUP2_ARG2(__lsx_vreplvei_h, coeff, val1, coeff, val2, tmp1_m, tmp2_m); \
    DUP2_ARG2(__lsx_vpackev_h, tmp1_m, tmp0_m, tmp0_m, tmp2_m, const1,     \
              const2);                                                     \
  }
38*fb1b10abSAndroid Build Coastguard Worker 
/*
 * Statement expression (GNU extension) yielding a vector whose 16-bit lanes
 * are 1 where the matching lane of _in is non-zero and 0 where it is zero.
 *
 * __lsx_vseqi_h sets a lane to all ones when it equals 0; the XOR with the
 * all-ones vector inverts that mask and the AND with 1 reduces it to 0 or 1.
 */
#define RET_1_IF_NZERO_H(_in)           \
  ({                                    \
    __m128i tmp_m;                      \
    __m128i one_m = __lsx_vldi(0x401);  \
    __m128i max_m = __lsx_vldi(0xFF);   \
                                        \
    tmp_m = __lsx_vseqi_h(_in, 0);      \
    tmp_m = __lsx_vxor_v(tmp_m, max_m); \
    tmp_m = __lsx_vand_v(tmp_m, one_m); \
                                        \
    tmp_m;                              \
  })
51*fb1b10abSAndroid Build Coastguard Worker 
vp8_short_fdct4x4_lsx(int16_t * input,int16_t * output,int32_t pitch)52*fb1b10abSAndroid Build Coastguard Worker void vp8_short_fdct4x4_lsx(int16_t *input, int16_t *output, int32_t pitch) {
53*fb1b10abSAndroid Build Coastguard Worker   __m128i in0, in1, in2, in3;
54*fb1b10abSAndroid Build Coastguard Worker   __m128i tmp0, tmp1, tmp2, tmp3, const0, const1;
55*fb1b10abSAndroid Build Coastguard Worker   __m128i coeff = { 0x38a4eb1814e808a9, 0x659061a82ee01d4c };
56*fb1b10abSAndroid Build Coastguard Worker   __m128i out0, out1, out2, out3;
57*fb1b10abSAndroid Build Coastguard Worker   __m128i zero = __lsx_vldi(0);
58*fb1b10abSAndroid Build Coastguard Worker   int32_t pitch2 = pitch << 1;
59*fb1b10abSAndroid Build Coastguard Worker   int32_t pitch3 = pitch2 + pitch;
60*fb1b10abSAndroid Build Coastguard Worker 
61*fb1b10abSAndroid Build Coastguard Worker   in0 = __lsx_vld(input, 0);
62*fb1b10abSAndroid Build Coastguard Worker   DUP2_ARG2(__lsx_vldx, input, pitch, input, pitch2, in1, in2);
63*fb1b10abSAndroid Build Coastguard Worker   in3 = __lsx_vldx(input, pitch3);
64*fb1b10abSAndroid Build Coastguard Worker 
65*fb1b10abSAndroid Build Coastguard Worker   LSX_TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3);
66*fb1b10abSAndroid Build Coastguard Worker   LSX_BUTTERFLY_4_H(in0, in1, in2, in3, tmp0, tmp1, in1, in3);
67*fb1b10abSAndroid Build Coastguard Worker   DUP4_ARG2(__lsx_vslli_h, tmp0, 3, tmp1, 3, in1, 3, in3, 3, tmp0, tmp1, in1,
68*fb1b10abSAndroid Build Coastguard Worker             in3);
69*fb1b10abSAndroid Build Coastguard Worker   in0 = __lsx_vadd_h(tmp0, tmp1);
70*fb1b10abSAndroid Build Coastguard Worker   in2 = __lsx_vsub_h(tmp0, tmp1);
71*fb1b10abSAndroid Build Coastguard Worker   SET_DOTP_VALUES(coeff, 0, 1, 2, const0, const1);
72*fb1b10abSAndroid Build Coastguard Worker   tmp0 = __lsx_vilvl_h(in3, in1);
73*fb1b10abSAndroid Build Coastguard Worker   in1 = __lsx_vreplvei_h(coeff, 3);
74*fb1b10abSAndroid Build Coastguard Worker   out0 = __lsx_vpackev_h(zero, in1);
75*fb1b10abSAndroid Build Coastguard Worker   coeff = __lsx_vilvl_h(zero, coeff);
76*fb1b10abSAndroid Build Coastguard Worker   out1 = __lsx_vreplvei_w(coeff, 0);
77*fb1b10abSAndroid Build Coastguard Worker   DUP2_ARG3(__lsx_vdp2add_w_h, out0, tmp0, const0, out1, tmp0, const1, out0,
78*fb1b10abSAndroid Build Coastguard Worker             out1);
79*fb1b10abSAndroid Build Coastguard Worker   DUP2_ARG3(__lsx_vsrani_h_w, out0, out0, 12, out1, out1, 12, in1, in3);
80*fb1b10abSAndroid Build Coastguard Worker   LSX_TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3);
81*fb1b10abSAndroid Build Coastguard Worker   LSX_BUTTERFLY_4_H(in0, in1, in2, in3, tmp0, tmp1, in1, in3);
82*fb1b10abSAndroid Build Coastguard Worker   tmp2 = __lsx_vadd_h(tmp0, tmp1);
83*fb1b10abSAndroid Build Coastguard Worker   tmp3 = __lsx_vsub_h(tmp0, tmp1);
84*fb1b10abSAndroid Build Coastguard Worker   DUP2_ARG2(__lsx_vaddi_hu, tmp2, 7, tmp3, 7, in0, in2);
85*fb1b10abSAndroid Build Coastguard Worker   DUP2_ARG2(__lsx_vsrai_h, in0, 4, in2, 4, in0, in2);
86*fb1b10abSAndroid Build Coastguard Worker   DUP2_ARG2(__lsx_vilvl_h, zero, in0, zero, in2, out0, out2);
87*fb1b10abSAndroid Build Coastguard Worker   tmp1 = RET_1_IF_NZERO_H(in3);
88*fb1b10abSAndroid Build Coastguard Worker   DUP2_ARG2(__lsx_vilvl_h, zero, tmp1, in3, in1, tmp1, tmp0);
89*fb1b10abSAndroid Build Coastguard Worker   DUP2_ARG2(__lsx_vreplvei_w, coeff, 2, coeff, 3, out3, out1);
90*fb1b10abSAndroid Build Coastguard Worker   out3 = __lsx_vadd_w(out3, out1);
91*fb1b10abSAndroid Build Coastguard Worker   out1 = __lsx_vreplvei_w(coeff, 1);
92*fb1b10abSAndroid Build Coastguard Worker   DUP2_ARG3(__lsx_vdp2add_w_h, out1, tmp0, const0, out3, tmp0, const1, out1,
93*fb1b10abSAndroid Build Coastguard Worker             out3);
94*fb1b10abSAndroid Build Coastguard Worker   DUP2_ARG2(__lsx_vsrai_w, out1, 16, out3, 16, out1, out3);
95*fb1b10abSAndroid Build Coastguard Worker   out1 = __lsx_vadd_w(out1, tmp1);
96*fb1b10abSAndroid Build Coastguard Worker   DUP2_ARG2(__lsx_vpickev_h, out1, out0, out3, out2, in0, in2);
97*fb1b10abSAndroid Build Coastguard Worker   __lsx_vst(in0, output, 0);
98*fb1b10abSAndroid Build Coastguard Worker   __lsx_vst(in2, output, 16);
99*fb1b10abSAndroid Build Coastguard Worker }
100*fb1b10abSAndroid Build Coastguard Worker 
vp8_short_fdct8x4_lsx(int16_t * input,int16_t * output,int32_t pitch)101*fb1b10abSAndroid Build Coastguard Worker void vp8_short_fdct8x4_lsx(int16_t *input, int16_t *output, int32_t pitch) {
102*fb1b10abSAndroid Build Coastguard Worker   __m128i in0, in1, in2, in3, temp0, temp1, tmp0, tmp1;
103*fb1b10abSAndroid Build Coastguard Worker   __m128i const0, const1, const2, vec0_w, vec1_w, vec2_w, vec3_w;
104*fb1b10abSAndroid Build Coastguard Worker   __m128i coeff = { 0x38a4eb1814e808a9, 0x659061a82ee01d4c };
105*fb1b10abSAndroid Build Coastguard Worker   __m128i zero = __lsx_vldi(0);
106*fb1b10abSAndroid Build Coastguard Worker   int32_t pitch2 = pitch << 1;
107*fb1b10abSAndroid Build Coastguard Worker   int32_t pitch3 = pitch2 + pitch;
108*fb1b10abSAndroid Build Coastguard Worker 
109*fb1b10abSAndroid Build Coastguard Worker   in0 = __lsx_vld(input, 0);
110*fb1b10abSAndroid Build Coastguard Worker   DUP2_ARG2(__lsx_vldx, input, pitch, input, pitch2, in1, in2);
111*fb1b10abSAndroid Build Coastguard Worker   in3 = __lsx_vldx(input, pitch3);
112*fb1b10abSAndroid Build Coastguard Worker   LSX_TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3);
113*fb1b10abSAndroid Build Coastguard Worker 
114*fb1b10abSAndroid Build Coastguard Worker   LSX_BUTTERFLY_4_H(in0, in1, in2, in3, temp0, temp1, in1, in3);
115*fb1b10abSAndroid Build Coastguard Worker   DUP4_ARG2(__lsx_vslli_h, temp0, 3, temp1, 3, in1, 3, in3, 3, temp0, temp1,
116*fb1b10abSAndroid Build Coastguard Worker             in1, in3);
117*fb1b10abSAndroid Build Coastguard Worker   in0 = __lsx_vadd_h(temp0, temp1);
118*fb1b10abSAndroid Build Coastguard Worker   in2 = __lsx_vsub_h(temp0, temp1);
119*fb1b10abSAndroid Build Coastguard Worker   SET_DOTP_VALUES(coeff, 0, 1, 2, const1, const2);
120*fb1b10abSAndroid Build Coastguard Worker   temp0 = __lsx_vreplvei_h(coeff, 3);
121*fb1b10abSAndroid Build Coastguard Worker   vec1_w = __lsx_vpackev_h(zero, temp0);
122*fb1b10abSAndroid Build Coastguard Worker   coeff = __lsx_vilvh_h(zero, coeff);
123*fb1b10abSAndroid Build Coastguard Worker   vec3_w = __lsx_vreplvei_w(coeff, 0);
124*fb1b10abSAndroid Build Coastguard Worker   tmp1 = __lsx_vilvl_h(in3, in1);
125*fb1b10abSAndroid Build Coastguard Worker   tmp0 = __lsx_vilvh_h(in3, in1);
126*fb1b10abSAndroid Build Coastguard Worker   vec0_w = vec1_w;
127*fb1b10abSAndroid Build Coastguard Worker   vec2_w = vec3_w;
128*fb1b10abSAndroid Build Coastguard Worker   DUP4_ARG3(__lsx_vdp2add_w_h, vec0_w, tmp1, const1, vec1_w, tmp0, const1,
129*fb1b10abSAndroid Build Coastguard Worker             vec2_w, tmp1, const2, vec3_w, tmp0, const2, vec0_w, vec1_w, vec2_w,
130*fb1b10abSAndroid Build Coastguard Worker             vec3_w);
131*fb1b10abSAndroid Build Coastguard Worker   DUP2_ARG3(__lsx_vsrani_h_w, vec1_w, vec0_w, 12, vec3_w, vec2_w, 12, in1, in3);
132*fb1b10abSAndroid Build Coastguard Worker   LSX_TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3);
133*fb1b10abSAndroid Build Coastguard Worker 
134*fb1b10abSAndroid Build Coastguard Worker   LSX_BUTTERFLY_4_H(in0, in1, in2, in3, temp0, temp1, in1, in3);
135*fb1b10abSAndroid Build Coastguard Worker   in0 = __lsx_vadd_h(temp0, temp1);
136*fb1b10abSAndroid Build Coastguard Worker   in0 = __lsx_vaddi_hu(in0, 7);
137*fb1b10abSAndroid Build Coastguard Worker   in2 = __lsx_vsub_h(temp0, temp1);
138*fb1b10abSAndroid Build Coastguard Worker   in2 = __lsx_vaddi_hu(in2, 7);
139*fb1b10abSAndroid Build Coastguard Worker   in0 = __lsx_vsrai_h(in0, 4);
140*fb1b10abSAndroid Build Coastguard Worker   in2 = __lsx_vsrai_h(in2, 4);
141*fb1b10abSAndroid Build Coastguard Worker   DUP2_ARG2(__lsx_vreplvei_w, coeff, 2, coeff, 3, vec3_w, vec1_w);
142*fb1b10abSAndroid Build Coastguard Worker   vec3_w = __lsx_vadd_w(vec3_w, vec1_w);
143*fb1b10abSAndroid Build Coastguard Worker   vec1_w = __lsx_vreplvei_w(coeff, 1);
144*fb1b10abSAndroid Build Coastguard Worker   const0 = RET_1_IF_NZERO_H(in3);
145*fb1b10abSAndroid Build Coastguard Worker   tmp1 = __lsx_vilvl_h(in3, in1);
146*fb1b10abSAndroid Build Coastguard Worker   tmp0 = __lsx_vilvh_h(in3, in1);
147*fb1b10abSAndroid Build Coastguard Worker   vec0_w = vec1_w;
148*fb1b10abSAndroid Build Coastguard Worker   vec2_w = vec3_w;
149*fb1b10abSAndroid Build Coastguard Worker   DUP4_ARG3(__lsx_vdp2add_w_h, vec0_w, tmp1, const1, vec1_w, tmp0, const1,
150*fb1b10abSAndroid Build Coastguard Worker             vec2_w, tmp1, const2, vec3_w, tmp0, const2, vec0_w, vec1_w, vec2_w,
151*fb1b10abSAndroid Build Coastguard Worker             vec3_w);
152*fb1b10abSAndroid Build Coastguard Worker   DUP2_ARG3(__lsx_vsrani_h_w, vec1_w, vec0_w, 16, vec3_w, vec2_w, 16, in1, in3);
153*fb1b10abSAndroid Build Coastguard Worker   in1 = __lsx_vadd_h(in1, const0);
154*fb1b10abSAndroid Build Coastguard Worker   DUP2_ARG2(__lsx_vpickev_d, in1, in0, in3, in2, temp0, temp1);
155*fb1b10abSAndroid Build Coastguard Worker   __lsx_vst(temp0, output, 0);
156*fb1b10abSAndroid Build Coastguard Worker   __lsx_vst(temp1, output, 16);
157*fb1b10abSAndroid Build Coastguard Worker 
158*fb1b10abSAndroid Build Coastguard Worker   DUP2_ARG2(__lsx_vpickod_d, in1, in0, in3, in2, in0, in2);
159*fb1b10abSAndroid Build Coastguard Worker   __lsx_vst(in0, output, 32);
160*fb1b10abSAndroid Build Coastguard Worker   __lsx_vst(in2, output, 48);
161*fb1b10abSAndroid Build Coastguard Worker }
162