1 /*
2 * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include <stdint.h>
12 #include "./vp8_rtcd.h"
13 #include "vpx_util/loongson_intrinsics.h"
14
/* Transpose 16-bit data held in four LSX vectors.  Both 64-bit halves of
 * each register are processed, so the macro transposes two side-by-side
 * 4x4 halfword blocks at once: the low four lanes of _out0.._out3 are the
 * transpose of the low lanes of _in0.._in3, and likewise for the high
 * lanes.  This lets it serve both the 4-wide and the 8-wide FDCT paths in
 * this file.  Output arguments may alias the inputs (the callers pass the
 * same registers for both).  Implemented as two rounds of halfword
 * interleaves followed by a 64-bit even/odd pick. */
#define LSX_TRANSPOSE4x4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  {                                                                            \
    __m128i _s0, _s1, _s2, _s3, _t0, _t1, _t2, _t3;                            \
                                                                               \
    DUP2_ARG2(__lsx_vilvl_h, _in2, _in0, _in3, _in1, _s0, _s1);                \
    DUP2_ARG2(__lsx_vilvh_h, _in2, _in0, _in3, _in1, _s2, _s3);                \
    _t0 = __lsx_vilvl_h(_s1, _s0);                                             \
    _t1 = __lsx_vilvh_h(_s1, _s0);                                             \
    _t2 = __lsx_vilvl_h(_s3, _s2);                                             \
    _t3 = __lsx_vilvh_h(_s3, _s2);                                             \
    DUP2_ARG2(__lsx_vpickev_d, _t2, _t0, _t3, _t1, _out0, _out2);              \
    DUP2_ARG2(__lsx_vpickod_d, _t2, _t0, _t3, _t1, _out1, _out3);              \
  }
28
/* Build the paired multiplier constants consumed by __lsx_vdp2add_w_h.
 * val0..val2 are immediate lane indices into `coeff`.  On exit:
 *   const1 lanes alternate (coeff[val0], coeff[val1]) - even lane val0,
 *   const2 lanes alternate (coeff[val2], coeff[val0]) - even lane val2,
 * so each 32-bit dot-product lane computes
 *   even*coeff[val0] + odd*coeff[val1]   (const1)
 *   even*coeff[val2] + odd*coeff[val0]   (const2). */
#define SET_DOTP_VALUES(coeff, val0, val1, val2, const1, const2)           \
  {                                                                        \
    __m128i tmp0_m, tmp1_m, tmp2_m;                                        \
                                                                           \
    tmp0_m = __lsx_vreplvei_h(coeff, val0);                                \
    DUP2_ARG2(__lsx_vreplvei_h, coeff, val1, coeff, val2, tmp1_m, tmp2_m); \
    DUP2_ARG2(__lsx_vpackev_h, tmp1_m, tmp0_m, tmp0_m, tmp2_m, const1,     \
              const2);                                                     \
  }
38
/* Per-lane predicate (GCC/Clang statement expression): evaluates to a
 * vector with 1 in each 16-bit lane where _in is nonzero and 0 where it
 * is zero.  vldi(0x401) broadcasts the halfword value 1; vldi(0xFF)
 * replicates byte 0xFF, i.e. all bits set.  vseqi gives all-ones lanes
 * where _in == 0; the xor inverts that mask and the and reduces each
 * nonzero lane to exactly 1.  Used for the reference FDCT's
 * "+ (d1 != 0)" bias on the second-row coefficients. */
#define RET_1_IF_NZERO_H(_in)           \
  ({                                    \
    __m128i tmp_m;                      \
    __m128i one_m = __lsx_vldi(0x401);  \
    __m128i max_m = __lsx_vldi(0xFF);   \
                                        \
    tmp_m = __lsx_vseqi_h(_in, 0);      \
    tmp_m = __lsx_vxor_v(tmp_m, max_m); \
    tmp_m = __lsx_vand_v(tmp_m, one_m); \
                                        \
    tmp_m;                              \
  })
51
vp8_short_fdct4x4_lsx(int16_t * input,int16_t * output,int32_t pitch)52 void vp8_short_fdct4x4_lsx(int16_t *input, int16_t *output, int32_t pitch) {
53 __m128i in0, in1, in2, in3;
54 __m128i tmp0, tmp1, tmp2, tmp3, const0, const1;
55 __m128i coeff = { 0x38a4eb1814e808a9, 0x659061a82ee01d4c };
56 __m128i out0, out1, out2, out3;
57 __m128i zero = __lsx_vldi(0);
58 int32_t pitch2 = pitch << 1;
59 int32_t pitch3 = pitch2 + pitch;
60
61 in0 = __lsx_vld(input, 0);
62 DUP2_ARG2(__lsx_vldx, input, pitch, input, pitch2, in1, in2);
63 in3 = __lsx_vldx(input, pitch3);
64
65 LSX_TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3);
66 LSX_BUTTERFLY_4_H(in0, in1, in2, in3, tmp0, tmp1, in1, in3);
67 DUP4_ARG2(__lsx_vslli_h, tmp0, 3, tmp1, 3, in1, 3, in3, 3, tmp0, tmp1, in1,
68 in3);
69 in0 = __lsx_vadd_h(tmp0, tmp1);
70 in2 = __lsx_vsub_h(tmp0, tmp1);
71 SET_DOTP_VALUES(coeff, 0, 1, 2, const0, const1);
72 tmp0 = __lsx_vilvl_h(in3, in1);
73 in1 = __lsx_vreplvei_h(coeff, 3);
74 out0 = __lsx_vpackev_h(zero, in1);
75 coeff = __lsx_vilvl_h(zero, coeff);
76 out1 = __lsx_vreplvei_w(coeff, 0);
77 DUP2_ARG3(__lsx_vdp2add_w_h, out0, tmp0, const0, out1, tmp0, const1, out0,
78 out1);
79 DUP2_ARG3(__lsx_vsrani_h_w, out0, out0, 12, out1, out1, 12, in1, in3);
80 LSX_TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3);
81 LSX_BUTTERFLY_4_H(in0, in1, in2, in3, tmp0, tmp1, in1, in3);
82 tmp2 = __lsx_vadd_h(tmp0, tmp1);
83 tmp3 = __lsx_vsub_h(tmp0, tmp1);
84 DUP2_ARG2(__lsx_vaddi_hu, tmp2, 7, tmp3, 7, in0, in2);
85 DUP2_ARG2(__lsx_vsrai_h, in0, 4, in2, 4, in0, in2);
86 DUP2_ARG2(__lsx_vilvl_h, zero, in0, zero, in2, out0, out2);
87 tmp1 = RET_1_IF_NZERO_H(in3);
88 DUP2_ARG2(__lsx_vilvl_h, zero, tmp1, in3, in1, tmp1, tmp0);
89 DUP2_ARG2(__lsx_vreplvei_w, coeff, 2, coeff, 3, out3, out1);
90 out3 = __lsx_vadd_w(out3, out1);
91 out1 = __lsx_vreplvei_w(coeff, 1);
92 DUP2_ARG3(__lsx_vdp2add_w_h, out1, tmp0, const0, out3, tmp0, const1, out1,
93 out3);
94 DUP2_ARG2(__lsx_vsrai_w, out1, 16, out3, 16, out1, out3);
95 out1 = __lsx_vadd_w(out1, tmp1);
96 DUP2_ARG2(__lsx_vpickev_h, out1, out0, out3, out2, in0, in2);
97 __lsx_vst(in0, output, 0);
98 __lsx_vst(in2, output, 16);
99 }
100
vp8_short_fdct8x4_lsx(int16_t * input,int16_t * output,int32_t pitch)101 void vp8_short_fdct8x4_lsx(int16_t *input, int16_t *output, int32_t pitch) {
102 __m128i in0, in1, in2, in3, temp0, temp1, tmp0, tmp1;
103 __m128i const0, const1, const2, vec0_w, vec1_w, vec2_w, vec3_w;
104 __m128i coeff = { 0x38a4eb1814e808a9, 0x659061a82ee01d4c };
105 __m128i zero = __lsx_vldi(0);
106 int32_t pitch2 = pitch << 1;
107 int32_t pitch3 = pitch2 + pitch;
108
109 in0 = __lsx_vld(input, 0);
110 DUP2_ARG2(__lsx_vldx, input, pitch, input, pitch2, in1, in2);
111 in3 = __lsx_vldx(input, pitch3);
112 LSX_TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3);
113
114 LSX_BUTTERFLY_4_H(in0, in1, in2, in3, temp0, temp1, in1, in3);
115 DUP4_ARG2(__lsx_vslli_h, temp0, 3, temp1, 3, in1, 3, in3, 3, temp0, temp1,
116 in1, in3);
117 in0 = __lsx_vadd_h(temp0, temp1);
118 in2 = __lsx_vsub_h(temp0, temp1);
119 SET_DOTP_VALUES(coeff, 0, 1, 2, const1, const2);
120 temp0 = __lsx_vreplvei_h(coeff, 3);
121 vec1_w = __lsx_vpackev_h(zero, temp0);
122 coeff = __lsx_vilvh_h(zero, coeff);
123 vec3_w = __lsx_vreplvei_w(coeff, 0);
124 tmp1 = __lsx_vilvl_h(in3, in1);
125 tmp0 = __lsx_vilvh_h(in3, in1);
126 vec0_w = vec1_w;
127 vec2_w = vec3_w;
128 DUP4_ARG3(__lsx_vdp2add_w_h, vec0_w, tmp1, const1, vec1_w, tmp0, const1,
129 vec2_w, tmp1, const2, vec3_w, tmp0, const2, vec0_w, vec1_w, vec2_w,
130 vec3_w);
131 DUP2_ARG3(__lsx_vsrani_h_w, vec1_w, vec0_w, 12, vec3_w, vec2_w, 12, in1, in3);
132 LSX_TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3);
133
134 LSX_BUTTERFLY_4_H(in0, in1, in2, in3, temp0, temp1, in1, in3);
135 in0 = __lsx_vadd_h(temp0, temp1);
136 in0 = __lsx_vaddi_hu(in0, 7);
137 in2 = __lsx_vsub_h(temp0, temp1);
138 in2 = __lsx_vaddi_hu(in2, 7);
139 in0 = __lsx_vsrai_h(in0, 4);
140 in2 = __lsx_vsrai_h(in2, 4);
141 DUP2_ARG2(__lsx_vreplvei_w, coeff, 2, coeff, 3, vec3_w, vec1_w);
142 vec3_w = __lsx_vadd_w(vec3_w, vec1_w);
143 vec1_w = __lsx_vreplvei_w(coeff, 1);
144 const0 = RET_1_IF_NZERO_H(in3);
145 tmp1 = __lsx_vilvl_h(in3, in1);
146 tmp0 = __lsx_vilvh_h(in3, in1);
147 vec0_w = vec1_w;
148 vec2_w = vec3_w;
149 DUP4_ARG3(__lsx_vdp2add_w_h, vec0_w, tmp1, const1, vec1_w, tmp0, const1,
150 vec2_w, tmp1, const2, vec3_w, tmp0, const2, vec0_w, vec1_w, vec2_w,
151 vec3_w);
152 DUP2_ARG3(__lsx_vsrani_h_w, vec1_w, vec0_w, 16, vec3_w, vec2_w, 16, in1, in3);
153 in1 = __lsx_vadd_h(in1, const0);
154 DUP2_ARG2(__lsx_vpickev_d, in1, in0, in3, in2, temp0, temp1);
155 __lsx_vst(temp0, output, 0);
156 __lsx_vst(temp1, output, 16);
157
158 DUP2_ARG2(__lsx_vpickod_d, in1, in0, in3, in2, in0, in2);
159 __lsx_vst(in0, output, 32);
160 __lsx_vst(in2, output, 48);
161 }
162