/*
 * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/loongarch/vpx_convolve_lsx.h"

static const uint8_t mc_filt_mask_arr[16 * 3] = {
  /* 8 width cases */
  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
  /* 4 width cases */
  0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
  /* 4 width cases */
  8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
};

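/* Horizontal 8-tap followed by vertical 8-tap filtering of a 4-pixel-wide
 * block, producing four output rows per loop iteration. */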
static void common_hv_8ht_8vt_4w_lsx(const uint8_t *src, int32_t src_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     int8_t *filter_horiz, int8_t *filter_vert,
                                     int32_t height) {
  uint32_t loop_cnt = (height >> 2);
  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
  __m128i filt_hz0, filt_hz1, filt_hz2, filt_hz3;
  __m128i filt_vt0, filt_vt1, filt_vt2, filt_vt3;
  __m128i mask0, mask1, mask2, mask3;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
  __m128i out0, out1;
  __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };

  mask0 = __lsx_vld(mc_filt_mask_arr, 16);
  src -= (3 + 3 * src_stride);
  DUP4_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filter_horiz, 4,
            filter_horiz, 6, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
  mask3 = __lsx_vaddi_bu(mask0, 6);

  LSX_LD_4(src, src_stride, src0, src1, src2, src3);
  src += src_stride;
  src4 = __lsx_vld(src, 0);
  src += src_stride;
  src5 = __lsx_vld(src, 0);
  src += src_stride;
  src6 = __lsx_vld(src, 0);
  src += src_stride;
  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
            src1, src2, src3);
  DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5);
  src6 = __lsx_vxori_b(src6, 128);

  tmp0 = horiz_8tap_filt(src0, src1, mask0, mask1, mask2, mask3, filt_hz0,
                         filt_hz1, filt_hz2, filt_hz3);
  tmp2 = horiz_8tap_filt(src2, src3, mask0, mask1, mask2, mask3, filt_hz0,
                         filt_hz1, filt_hz2, filt_hz3);
  tmp4 = horiz_8tap_filt(src4, src5, mask0, mask1, mask2, mask3, filt_hz0,
                         filt_hz1, filt_hz2, filt_hz3);
  tmp5 = horiz_8tap_filt(src5, src6, mask0, mask1, mask2, mask3, filt_hz0,
                         filt_hz1, filt_hz2, filt_hz3);
  DUP2_ARG3(__lsx_vshuf_b, tmp2, tmp0, shuff, tmp4, tmp2, shuff, tmp1, tmp3);
  DUP4_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filter_vert, 4,
            filter_vert, 6, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
  DUP2_ARG2(__lsx_vpackev_b, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
  tmp2 = __lsx_vpackev_b(tmp5, tmp4);

  for (; loop_cnt--;) {
    LSX_LD_4(src, src_stride, src7, src8, src9, src10);
    src += src_stride;
    DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7,
              src8, src9, src10);
    tmp3 = horiz_8tap_filt(src7, src8, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    tmp4 = __lsx_vshuf_b(tmp3, tmp5, shuff);
    tmp4 = __lsx_vpackev_b(tmp3, tmp4);
    out0 = filt_8tap_dpadd_s_h(tmp0, tmp1, tmp2, tmp4, filt_vt0, filt_vt1,
                               filt_vt2, filt_vt3);
    src1 = horiz_8tap_filt(src9, src10, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    src0 = __lsx_vshuf_b(src1, tmp3, shuff);
    src0 = __lsx_vpackev_b(src1, src0);
    out1 = filt_8tap_dpadd_s_h(tmp1, tmp2, tmp4, src0, filt_vt0, filt_vt1,
                               filt_vt2, filt_vt3);
    out0 = __lsx_vssrarni_b_h(out1, out0, 7);
    out0 = __lsx_vxori_b(out0, 128);
    __lsx_vstelm_w(out0, dst, 0, 0);
    dst += dst_stride;
    __lsx_vstelm_w(out0, dst, 0, 1);
    dst += dst_stride;
    __lsx_vstelm_w(out0, dst, 0, 2);
    dst += dst_stride;
    __lsx_vstelm_w(out0, dst, 0, 3);
    dst += dst_stride;

    tmp5 = src1;
    tmp0 = tmp2;
    tmp1 = tmp4;
    tmp2 = src0;
  }
}

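/* Horizontal 8-tap followed by vertical 8-tap filtering of an 8-pixel-wide
 * block, producing four output rows per loop iteration. */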
static void common_hv_8ht_8vt_8w_lsx(const uint8_t *src, int32_t src_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     int8_t *filter_horiz, int8_t *filter_vert,
                                     int32_t height) {
  uint32_t loop_cnt = (height >> 2);
  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
  __m128i filt_hz0, filt_hz1, filt_hz2, filt_hz3;
  __m128i filt_vt0, filt_vt1, filt_vt2, filt_vt3;
  __m128i mask0, mask1, mask2, mask3;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
  __m128i out0, out1;

  mask0 = __lsx_vld(mc_filt_mask_arr, 0);
  src -= (3 + 3 * src_stride);
  DUP4_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filter_horiz, 4,
            filter_horiz, 6, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
  mask3 = __lsx_vaddi_bu(mask0, 6);

  LSX_LD_4(src, src_stride, src0, src1, src2, src3);
  src += src_stride;
  src4 = __lsx_vld(src, 0);
  src += src_stride;
  src5 = __lsx_vld(src, 0);
  src += src_stride;
  src6 = __lsx_vld(src, 0);
  src += src_stride;
  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
            src1, src2, src3);
  DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5);
  src6 = __lsx_vxori_b(src6, 128);

  src0 = horiz_8tap_filt(src0, src0, mask0, mask1, mask2, mask3, filt_hz0,
                         filt_hz1, filt_hz2, filt_hz3);
  src1 = horiz_8tap_filt(src1, src1, mask0, mask1, mask2, mask3, filt_hz0,
                         filt_hz1, filt_hz2, filt_hz3);
  src2 = horiz_8tap_filt(src2, src2, mask0, mask1, mask2, mask3, filt_hz0,
                         filt_hz1, filt_hz2, filt_hz3);
  src3 = horiz_8tap_filt(src3, src3, mask0, mask1, mask2, mask3, filt_hz0,
                         filt_hz1, filt_hz2, filt_hz3);
  src4 = horiz_8tap_filt(src4, src4, mask0, mask1, mask2, mask3, filt_hz0,
                         filt_hz1, filt_hz2, filt_hz3);
  src5 = horiz_8tap_filt(src5, src5, mask0, mask1, mask2, mask3, filt_hz0,
                         filt_hz1, filt_hz2, filt_hz3);
  src6 = horiz_8tap_filt(src6, src6, mask0, mask1, mask2, mask3, filt_hz0,
                         filt_hz1, filt_hz2, filt_hz3);

  DUP4_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filter_vert, 4,
            filter_vert, 6, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
  DUP4_ARG2(__lsx_vpackev_b, src1, src0, src3, src2, src5, src4, src2, src1,
            tmp0, tmp1, tmp2, tmp4);
  DUP2_ARG2(__lsx_vpackev_b, src4, src3, src6, src5, tmp5, tmp6);

  for (; loop_cnt--;) {
    LSX_LD_4(src, src_stride, src7, src8, src9, src10);
    src += src_stride;
    DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7,
              src8, src9, src10);
    src7 = horiz_8tap_filt(src7, src7, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    tmp3 = __lsx_vpackev_b(src7, src6);
    out0 = filt_8tap_dpadd_s_h(tmp0, tmp1, tmp2, tmp3, filt_vt0, filt_vt1,
                               filt_vt2, filt_vt3);
    src8 = horiz_8tap_filt(src8, src8, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    src0 = __lsx_vpackev_b(src8, src7);
    out1 = filt_8tap_dpadd_s_h(tmp4, tmp5, tmp6, src0, filt_vt0, filt_vt1,
                               filt_vt2, filt_vt3);
    src9 = horiz_8tap_filt(src9, src9, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    src1 = __lsx_vpackev_b(src9, src8);
    src3 = filt_8tap_dpadd_s_h(tmp1, tmp2, tmp3, src1, filt_vt0, filt_vt1,
                               filt_vt2, filt_vt3);
    src10 = horiz_8tap_filt(src10, src10, mask0, mask1, mask2, mask3, filt_hz0,
                            filt_hz1, filt_hz2, filt_hz3);
    src2 = __lsx_vpackev_b(src10, src9);
    src4 = filt_8tap_dpadd_s_h(tmp5, tmp6, src0, src2, filt_vt0, filt_vt1,
                               filt_vt2, filt_vt3);
    DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, src4, src3, 7, out0, out1);
    DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
    __lsx_vstelm_d(out0, dst, 0, 0);
    dst += dst_stride;
    __lsx_vstelm_d(out0, dst, 0, 1);
    dst += dst_stride;
    __lsx_vstelm_d(out1, dst, 0, 0);
    dst += dst_stride;
    __lsx_vstelm_d(out1, dst, 0, 1);
    dst += dst_stride;

    src6 = src10;
    tmp0 = tmp2;
    tmp1 = tmp3;
    tmp2 = src1;
    tmp4 = tmp6;
    tmp5 = src0;
    tmp6 = src2;
  }
}

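/* 16-wide 8-tap case: processed as two adjacent 8-wide columns. */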
static void common_hv_8ht_8vt_16w_lsx(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      int8_t *filter_horiz, int8_t *filter_vert,
                                      int32_t height) {
  common_hv_8ht_8vt_8w_lsx(src, src_stride, dst, dst_stride, filter_horiz,
                           filter_vert, height);
  src += 8;
  dst += 8;

  common_hv_8ht_8vt_8w_lsx(src, src_stride, dst, dst_stride, filter_horiz,
                           filter_vert, height);
  src += 8;
  dst += 8;
}

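/* 32-wide 8-tap case: four adjacent 8-wide columns. */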
static void common_hv_8ht_8vt_32w_lsx(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      int8_t *filter_horiz, int8_t *filter_vert,
                                      int32_t height) {
  int32_t multiple8_cnt;
  for (multiple8_cnt = 4; multiple8_cnt--;) {
    common_hv_8ht_8vt_8w_lsx(src, src_stride, dst, dst_stride, filter_horiz,
                             filter_vert, height);
    src += 8;
    dst += 8;
  }
}

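/* 64-wide 8-tap case: eight adjacent 8-wide columns. */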
static void common_hv_8ht_8vt_64w_lsx(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      int8_t *filter_horiz, int8_t *filter_vert,
                                      int32_t height) {
  int32_t multiple8_cnt;
  for (multiple8_cnt = 8; multiple8_cnt--;) {
    common_hv_8ht_8vt_8w_lsx(src, src_stride, dst, dst_stride, filter_horiz,
                             filter_vert, height);
    src += 8;
    dst += 8;
  }
}

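/* Bilinear (2-tap) horizontal + vertical filtering of a 4x4 block. */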
static void common_hv_2ht_2vt_4x4_lsx(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      int8_t *filter_horiz,
                                      int8_t *filter_vert) {
  __m128i src0, src1, src2, src3, src4, mask;
  __m128i filt_vt, filt_hz, vec0, vec1;
  __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1;
  __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };

  int32_t src_stride2 = src_stride << 1;
  int32_t src_stride3 = src_stride + src_stride2;
  int32_t src_stride4 = src_stride2 << 1;

  int32_t dst_stride2 = dst_stride << 1;
  int32_t dst_stride3 = dst_stride2 + dst_stride;

  mask = __lsx_vld(mc_filt_mask_arr, 16);

  /* rearranging filter */
  filt_hz = __lsx_vldrepl_h(filter_horiz, 0);
  filt_vt = __lsx_vldrepl_h(filter_vert, 0);

  src0 = __lsx_vld(src, 0);
  DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
            src, src_stride4, src1, src2, src3, src4);
  hz_out0 = horiz_2tap_filt_uh(src0, src1, mask, filt_hz);
  hz_out2 = horiz_2tap_filt_uh(src2, src3, mask, filt_hz);
  hz_out4 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz);

  hz_out1 = __lsx_vshuf_b(hz_out2, hz_out0, shuff);
  hz_out3 = __lsx_vpickod_d(hz_out4, hz_out2);

  DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
  DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1);
  DUP2_ARG3(__lsx_vssrarni_bu_h, tmp0, tmp0, FILTER_BITS, tmp1, tmp1,
            FILTER_BITS, tmp0, tmp1);

  __lsx_vstelm_w(tmp0, dst, 0, 0);
  __lsx_vstelm_w(tmp0, dst + dst_stride, 0, 1);
  __lsx_vstelm_w(tmp1, dst + dst_stride2, 0, 0);
  __lsx_vstelm_w(tmp1, dst + dst_stride3, 0, 1);
}

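/* Bilinear (2-tap) horizontal + vertical filtering of a 4x8 block. */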
static void common_hv_2ht_2vt_4x8_lsx(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      int8_t *filter_horiz,
                                      int8_t *filter_vert) {
  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
  __m128i filt_hz, filt_vt, vec0, vec1, vec2, vec3;
  __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
  __m128i hz_out7, hz_out8, vec4, vec5, vec6, vec7;
  __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };

  int32_t src_stride2 = src_stride << 1;
  int32_t src_stride3 = src_stride2 + src_stride;
  int32_t src_stride4 = src_stride2 << 1;

  int32_t dst_stride2 = dst_stride << 1;
  int32_t dst_stride3 = dst_stride2 + dst_stride;
  int32_t dst_stride4 = dst_stride2 << 1;

  mask = __lsx_vld(mc_filt_mask_arr, 16);

  /* rearranging filter */
  DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_vert, 0, filt_hz, filt_vt);

  src0 = __lsx_vld(src, 0);
  DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
            src, src_stride4, src1, src2, src3, src4);
  src += src_stride4;
  DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
            src, src_stride4, src5, src6, src7, src8);
  src += src_stride4;

  hz_out0 = horiz_2tap_filt_uh(src0, src1, mask, filt_hz);
  hz_out2 = horiz_2tap_filt_uh(src2, src3, mask, filt_hz);
  hz_out4 = horiz_2tap_filt_uh(src4, src5, mask, filt_hz);
  hz_out6 = horiz_2tap_filt_uh(src6, src7, mask, filt_hz);
  hz_out8 = horiz_2tap_filt_uh(src8, src8, mask, filt_hz);

  DUP2_ARG3(__lsx_vshuf_b, hz_out2, hz_out0, shuff, hz_out4, hz_out2, shuff,
            hz_out1, hz_out3);
  hz_out5 = __lsx_vshuf_b(hz_out6, hz_out4, shuff);
  hz_out7 = __lsx_vpickod_d(hz_out8, hz_out6);
  DUP4_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, hz_out5,
            hz_out4, hz_out7, hz_out6, vec0, vec1, vec2, vec3);
  DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, vec2, filt_vt, vec3,
            filt_vt, vec4, vec5, vec6, vec7);
  DUP4_ARG3(__lsx_vssrarni_bu_h, vec4, vec4, FILTER_BITS, vec5, vec5,
            FILTER_BITS, vec6, vec6, FILTER_BITS, vec7, vec7, FILTER_BITS, vec4,
            vec5, vec6, vec7);

  __lsx_vstelm_w(vec4, dst, 0, 0);
  __lsx_vstelm_w(vec4, dst + dst_stride, 0, 1);
  __lsx_vstelm_w(vec5, dst + dst_stride2, 0, 0);
  __lsx_vstelm_w(vec5, dst + dst_stride3, 0, 1);
  dst += dst_stride4;
  __lsx_vstelm_w(vec6, dst, 0, 0);
  __lsx_vstelm_w(vec6, dst + dst_stride, 0, 1);
  __lsx_vstelm_w(vec7, dst + dst_stride2, 0, 0);
  __lsx_vstelm_w(vec7, dst + dst_stride3, 0, 1);
}

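/* 4-wide bilinear case: dispatch on height (4 or 8 rows). */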
static void common_hv_2ht_2vt_4w_lsx(const uint8_t *src, int32_t src_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     int8_t *filter_horiz, int8_t *filter_vert,
                                     int32_t height) {
  if (height == 4) {
    common_hv_2ht_2vt_4x4_lsx(src, src_stride, dst, dst_stride, filter_horiz,
                              filter_vert);
  } else if (height == 8) {
    common_hv_2ht_2vt_4x8_lsx(src, src_stride, dst, dst_stride, filter_horiz,
                              filter_vert);
  }
}

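/* Bilinear (2-tap) horizontal + vertical filtering of an 8x4 block. */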
static void common_hv_2ht_2vt_8x4_lsx(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      int8_t *filter_horiz,
                                      int8_t *filter_vert) {
  __m128i src0, src1, src2, src3, src4, mask;
  __m128i filt_hz, filt_vt, vec0, vec1, vec2, vec3;
  __m128i hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;

  int32_t src_stride2 = src_stride << 1;
  int32_t src_stride3 = src_stride2 + src_stride;
  int32_t src_stride4 = src_stride2 << 1;

  int32_t dst_stride2 = dst_stride << 1;
  int32_t dst_stride3 = dst_stride2 + dst_stride;

  mask = __lsx_vld(mc_filt_mask_arr, 0);

  /* rearranging filter */
  DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_vert, 0, filt_hz, filt_vt);

  src0 = __lsx_vld(src, 0);
  DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
            src, src_stride4, src1, src2, src3, src4);

  hz_out0 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz);
  hz_out1 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz);
  vec0 = __lsx_vpackev_b(hz_out1, hz_out0);
  tmp0 = __lsx_vdp2_h_bu(vec0, filt_vt);

  hz_out0 = horiz_2tap_filt_uh(src2, src2, mask, filt_hz);
  vec1 = __lsx_vpackev_b(hz_out0, hz_out1);
  tmp1 = __lsx_vdp2_h_bu(vec1, filt_vt);

  hz_out1 = horiz_2tap_filt_uh(src3, src3, mask, filt_hz);
  vec2 = __lsx_vpackev_b(hz_out1, hz_out0);
  tmp2 = __lsx_vdp2_h_bu(vec2, filt_vt);

  hz_out0 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz);
  vec3 = __lsx_vpackev_b(hz_out0, hz_out1);
  tmp3 = __lsx_vdp2_h_bu(vec3, filt_vt);

  DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2,
            FILTER_BITS, tmp0, tmp1);

  __lsx_vstelm_d(tmp0, dst, 0, 0);
  __lsx_vstelm_d(tmp0, dst + dst_stride, 0, 1);
  __lsx_vstelm_d(tmp1, dst + dst_stride2, 0, 0);
  __lsx_vstelm_d(tmp1, dst + dst_stride3, 0, 1);
}

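/* Bilinear (2-tap) horizontal + vertical filtering of an 8-wide block,
 * producing eight output rows per loop iteration. */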
static void common_hv_2ht_2vt_8x8mult_lsx(const uint8_t *src,
                                          int32_t src_stride, uint8_t *dst,
                                          int32_t dst_stride,
                                          int8_t *filter_horiz,
                                          int8_t *filter_vert, int32_t height) {
  uint32_t loop_cnt = (height >> 3);
  __m128i src0, src1, src2, src3, src4, mask;
  __m128i filt_hz, filt_vt, vec0;
  __m128i hz_out0, hz_out1, tmp1, tmp2, tmp3, tmp4;

  int32_t src_stride2 = src_stride << 1;
  int32_t src_stride3 = src_stride2 + src_stride;
  int32_t src_stride4 = src_stride2 << 1;

  mask = __lsx_vld(mc_filt_mask_arr, 0);

  /* rearranging filter */
  DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_vert, 0, filt_hz, filt_vt);

  src0 = __lsx_vld(src, 0);
  src += src_stride;

  hz_out0 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz);

  for (; loop_cnt--;) {
    src1 = __lsx_vld(src, 0);
    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3);
    src4 = __lsx_vldx(src, src_stride3);
    src += src_stride4;

    hz_out1 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz);
    vec0 = __lsx_vpackev_b(hz_out1, hz_out0);
    tmp1 = __lsx_vdp2_h_bu(vec0, filt_vt);

    hz_out0 = horiz_2tap_filt_uh(src2, src2, mask, filt_hz);
    vec0 = __lsx_vpackev_b(hz_out0, hz_out1);
    tmp2 = __lsx_vdp2_h_bu(vec0, filt_vt);

    hz_out1 = horiz_2tap_filt_uh(src3, src3, mask, filt_hz);
    vec0 = __lsx_vpackev_b(hz_out1, hz_out0);
    tmp3 = __lsx_vdp2_h_bu(vec0, filt_vt);

    hz_out0 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz);
    src1 = __lsx_vld(src, 0);
    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3);
    src4 = __lsx_vldx(src, src_stride3);
    src += src_stride4;
    vec0 = __lsx_vpackev_b(hz_out0, hz_out1);
    tmp4 = __lsx_vdp2_h_bu(vec0, filt_vt);

    DUP2_ARG3(__lsx_vssrarni_bu_h, tmp2, tmp1, FILTER_BITS, tmp4, tmp3,
              FILTER_BITS, tmp1, tmp2);

    __lsx_vstelm_d(tmp1, dst, 0, 0);
    dst += dst_stride;
    __lsx_vstelm_d(tmp1, dst, 0, 1);
    dst += dst_stride;
    __lsx_vstelm_d(tmp2, dst, 0, 0);
    dst += dst_stride;
    __lsx_vstelm_d(tmp2, dst, 0, 1);
    dst += dst_stride;

    hz_out1 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz);
    vec0 = __lsx_vpackev_b(hz_out1, hz_out0);
    tmp1 = __lsx_vdp2_h_bu(vec0, filt_vt);

    hz_out0 = horiz_2tap_filt_uh(src2, src2, mask, filt_hz);
    vec0 = __lsx_vpackev_b(hz_out0, hz_out1);
    tmp2 = __lsx_vdp2_h_bu(vec0, filt_vt);

    hz_out1 = horiz_2tap_filt_uh(src3, src3, mask, filt_hz);
    vec0 = __lsx_vpackev_b(hz_out1, hz_out0);
    tmp3 = __lsx_vdp2_h_bu(vec0, filt_vt);

    hz_out0 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz);
    vec0 = __lsx_vpackev_b(hz_out0, hz_out1);
    tmp4 = __lsx_vdp2_h_bu(vec0, filt_vt);

    DUP2_ARG3(__lsx_vssrarni_bu_h, tmp2, tmp1, FILTER_BITS, tmp4, tmp3,
              FILTER_BITS, tmp1, tmp2);

    __lsx_vstelm_d(tmp1, dst, 0, 0);
    dst += dst_stride;
    __lsx_vstelm_d(tmp1, dst, 0, 1);
    dst += dst_stride;
    __lsx_vstelm_d(tmp2, dst, 0, 0);
    dst += dst_stride;
    __lsx_vstelm_d(tmp2, dst, 0, 1);
    dst += dst_stride;
  }
}

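/* 8-wide bilinear case: an 8x4 block or a multiple of eight rows. */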
static void common_hv_2ht_2vt_8w_lsx(const uint8_t *src, int32_t src_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     int8_t *filter_horiz, int8_t *filter_vert,
                                     int32_t height) {
  if (height == 4) {
    common_hv_2ht_2vt_8x4_lsx(src, src_stride, dst, dst_stride, filter_horiz,
                              filter_vert);
  } else {
    common_hv_2ht_2vt_8x8mult_lsx(src, src_stride, dst, dst_stride,
                                  filter_horiz, filter_vert, height);
  }
}

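/* Bilinear (2-tap) horizontal + vertical filtering of a 16-wide block,
 * producing four output rows per loop iteration. */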
static void common_hv_2ht_2vt_16w_lsx(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      int8_t *filter_horiz, int8_t *filter_vert,
                                      int32_t height) {
  uint32_t loop_cnt = (height >> 2);
  __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask;
  __m128i filt_hz, filt_vt, vec0, vec1;
  __m128i tmp, tmp1, tmp2, hz_out0, hz_out1, hz_out2, hz_out3;

  int32_t src_stride2 = src_stride << 1;
  int32_t src_stride3 = src_stride2 + src_stride;
  int32_t src_stride4 = src_stride2 << 1;

  mask = __lsx_vld(mc_filt_mask_arr, 0);

  /* rearranging filter */
  DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_vert, 0, filt_hz, filt_vt);

  DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1);
  src += src_stride;

  hz_out0 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz);
  hz_out2 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz);

  for (; loop_cnt--;) {
    const uint8_t *src_tmp0 = src + 8;

    DUP2_ARG2(__lsx_vld, src, 0, src_tmp0, 0, src0, src1);
    DUP4_ARG2(__lsx_vldx, src, src_stride, src_tmp0, src_stride, src,
              src_stride2, src_tmp0, src_stride2, src2, src3, src4, src5);
    DUP2_ARG2(__lsx_vldx, src, src_stride3, src_tmp0, src_stride3, src6, src7);
    src += src_stride4;

    hz_out1 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz);
    hz_out3 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz);
    DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp1, tmp2);
    tmp = __lsx_vssrarni_bu_h(tmp2, tmp1, FILTER_BITS);
    __lsx_vst(tmp, dst, 0);
    dst += dst_stride;

    hz_out0 = horiz_2tap_filt_uh(src2, src2, mask, filt_hz);
    hz_out2 = horiz_2tap_filt_uh(src3, src3, mask, filt_hz);
    DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp1, tmp2);
    tmp = __lsx_vssrarni_bu_h(tmp2, tmp1, FILTER_BITS);
    __lsx_vst(tmp, dst, 0);
    dst += dst_stride;

    hz_out1 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz);
    hz_out3 = horiz_2tap_filt_uh(src5, src5, mask, filt_hz);
    DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp1, tmp2);
    tmp = __lsx_vssrarni_bu_h(tmp2, tmp1, FILTER_BITS);
    __lsx_vst(tmp, dst, 0);
    dst += dst_stride;

    hz_out0 = horiz_2tap_filt_uh(src6, src6, mask, filt_hz);
    hz_out2 = horiz_2tap_filt_uh(src7, src7, mask, filt_hz);
    DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp1, tmp2);
    tmp = __lsx_vssrarni_bu_h(tmp2, tmp1, FILTER_BITS);
    __lsx_vst(tmp, dst, 0);
    dst += dst_stride;
  }
}

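/* 32-wide bilinear case: two adjacent 16-wide columns. */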
static void common_hv_2ht_2vt_32w_lsx(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      int8_t *filter_horiz, int8_t *filter_vert,
                                      int32_t height) {
  common_hv_2ht_2vt_16w_lsx(src, src_stride, dst, dst_stride, filter_horiz,
                            filter_vert, height);
  src += 16;
  dst += 16;

  common_hv_2ht_2vt_16w_lsx(src, src_stride, dst, dst_stride, filter_horiz,
                            filter_vert, height);
}

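/* 64-wide bilinear case: four adjacent 16-wide columns. */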
static void common_hv_2ht_2vt_64w_lsx(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      int8_t *filter_horiz, int8_t *filter_vert,
                                      int32_t height) {
  int32_t multiple8_cnt;
  for (multiple8_cnt = 4; multiple8_cnt--;) {
    common_hv_2ht_2vt_16w_lsx(src, src_stride, dst, dst_stride, filter_horiz,
                              filter_vert, height);
    src += 16;
    dst += 16;
  }
}

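/* Top-level 8-tap convolution entry point. Selects the bilinear (2-tap)
 * path, the full 8-tap path, or the C fallback based on the filter taps
 * and the block width. */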
void vpx_convolve8_lsx(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                       ptrdiff_t dst_stride, const InterpKernel *filter,
                       int x0_q4, int32_t x_step_q4, int y0_q4,
                       int32_t y_step_q4, int32_t w, int32_t h) {
  const int16_t *const filter_x = filter[x0_q4];
  const int16_t *const filter_y = filter[y0_q4];
  int8_t cnt, filt_hor[8], filt_ver[8];

  assert(x_step_q4 == 16);
  assert(y_step_q4 == 16);
  assert(((const int32_t *)filter_x)[1] != 0x800000);
  assert(((const int32_t *)filter_y)[1] != 0x800000);

  for (cnt = 0; cnt < 8; ++cnt) {
    filt_hor[cnt] = filter_x[cnt];
    filt_ver[cnt] = filter_y[cnt];
  }

  if (vpx_get_filter_taps(filter_x) == 2 &&
      vpx_get_filter_taps(filter_y) == 2) {
    switch (w) {
      case 4:
        common_hv_2ht_2vt_4w_lsx(src, (int32_t)src_stride, dst,
                                 (int32_t)dst_stride, &filt_hor[3],
                                 &filt_ver[3], (int32_t)h);
        break;
      case 8:
        common_hv_2ht_2vt_8w_lsx(src, (int32_t)src_stride, dst,
                                 (int32_t)dst_stride, &filt_hor[3],
                                 &filt_ver[3], (int32_t)h);
        break;
      case 16:
        common_hv_2ht_2vt_16w_lsx(src, (int32_t)src_stride, dst,
                                  (int32_t)dst_stride, &filt_hor[3],
                                  &filt_ver[3], (int32_t)h);
        break;
      case 32:
        common_hv_2ht_2vt_32w_lsx(src, (int32_t)src_stride, dst,
                                  (int32_t)dst_stride, &filt_hor[3],
                                  &filt_ver[3], (int32_t)h);
        break;
      case 64:
        common_hv_2ht_2vt_64w_lsx(src, (int32_t)src_stride, dst,
                                  (int32_t)dst_stride, &filt_hor[3],
                                  &filt_ver[3], (int32_t)h);
        break;
      default:
        vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4,
                        x_step_q4, y0_q4, y_step_q4, w, h);
        break;
    }
  } else if (vpx_get_filter_taps(filter_x) == 2 ||
             vpx_get_filter_taps(filter_y) == 2) {
    vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4,
                    y0_q4, y_step_q4, w, h);
  } else {
    switch (w) {
      case 4:
        common_hv_8ht_8vt_4w_lsx(src, (int32_t)src_stride, dst,
                                 (int32_t)dst_stride, filt_hor, filt_ver,
                                 (int32_t)h);
        break;
      case 8:
        common_hv_8ht_8vt_8w_lsx(src, (int32_t)src_stride, dst,
                                 (int32_t)dst_stride, filt_hor, filt_ver,
                                 (int32_t)h);
        break;
      case 16:
        common_hv_8ht_8vt_16w_lsx(src, (int32_t)src_stride, dst,
                                  (int32_t)dst_stride, filt_hor, filt_ver,
                                  (int32_t)h);
        break;
      case 32:
        common_hv_8ht_8vt_32w_lsx(src, (int32_t)src_stride, dst,
                                  (int32_t)dst_stride, filt_hor, filt_ver,
                                  (int32_t)h);
        break;
      case 64:
        common_hv_8ht_8vt_64w_lsx(src, (int32_t)src_stride, dst,
                                  (int32_t)dst_stride, filt_hor, filt_ver,
                                  (int32_t)h);
        break;
      default:
        vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4,
                        x_step_q4, y0_q4, y_step_q4, w, h);
        break;
    }
  }
}