/*
 *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/loongarch/vpx_convolve_lsx.h"

/*
 * Byte-shuffle index tables, 16 bytes per row. The kernels below load one row
 * with __lsx_vld at byte offset 0 or 16 and derive further masks by adding
 * 2, 4 and 6 to every index. Each row lists overlapping index pairs
 * (n, n + 1). The row at offset 32 is not read by the code in this file.
 */
static const uint8_t mc_filt_mask_arr[16 * 3] = {
  /* 8 width cases (byte offset 0) */
  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
  /* 4 width cases (byte offset 16); indices >= 16 presumably address the
   * second shuffle source so that two rows share one vector */
  0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
  /* 4 width cases (byte offset 32); same layout shifted by 8 bytes */
  8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
};

common_hv_8ht_8vt_4w_lsx(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,int8_t * filter_horiz,int8_t * filter_vert,int32_t height)24 static void common_hv_8ht_8vt_4w_lsx(const uint8_t *src, int32_t src_stride,
25                                      uint8_t *dst, int32_t dst_stride,
26                                      int8_t *filter_horiz, int8_t *filter_vert,
27                                      int32_t height) {
28   uint32_t loop_cnt = (height >> 2);
29   __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
30   __m128i filt_hz0, filt_hz1, filt_hz2, filt_hz3;
31   __m128i filt_vt0, filt_vt1, filt_vt2, filt_vt3;
32   __m128i mask0, mask1, mask2, mask3;
33   __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
34   __m128i out0, out1;
35   __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
36 
37   mask0 = __lsx_vld(mc_filt_mask_arr, 16);
38   src -= (3 + 3 * src_stride);
39   DUP4_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filter_horiz, 4,
40             filter_horiz, 6, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
41   DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
42   mask3 = __lsx_vaddi_bu(mask0, 6);
43 
44   LSX_LD_4(src, src_stride, src0, src1, src2, src3);
45   src += src_stride;
46   src4 = __lsx_vld(src, 0);
47   src += src_stride;
48   src5 = __lsx_vld(src, 0);
49   src += src_stride;
50   src6 = __lsx_vld(src, 0);
51   src += src_stride;
52   DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
53             src1, src2, src3);
54   DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5);
55   src6 = __lsx_vxori_b(src6, 128);
56 
57   tmp0 = horiz_8tap_filt(src0, src1, mask0, mask1, mask2, mask3, filt_hz0,
58                          filt_hz1, filt_hz2, filt_hz3);
59   tmp2 = horiz_8tap_filt(src2, src3, mask0, mask1, mask2, mask3, filt_hz0,
60                          filt_hz1, filt_hz2, filt_hz3);
61   tmp4 = horiz_8tap_filt(src4, src5, mask0, mask1, mask2, mask3, filt_hz0,
62                          filt_hz1, filt_hz2, filt_hz3);
63   tmp5 = horiz_8tap_filt(src5, src6, mask0, mask1, mask2, mask3, filt_hz0,
64                          filt_hz1, filt_hz2, filt_hz3);
65   DUP2_ARG3(__lsx_vshuf_b, tmp2, tmp0, shuff, tmp4, tmp2, shuff, tmp1, tmp3);
66   DUP4_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filter_vert, 4,
67             filter_vert, 6, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
68   DUP2_ARG2(__lsx_vpackev_b, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
69   tmp2 = __lsx_vpackev_b(tmp5, tmp4);
70 
71   for (; loop_cnt--;) {
72     LSX_LD_4(src, src_stride, src7, src8, src9, src10);
73     src += src_stride;
74     DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7,
75               src8, src9, src10);
76     tmp3 = horiz_8tap_filt(src7, src8, mask0, mask1, mask2, mask3, filt_hz0,
77                            filt_hz1, filt_hz2, filt_hz3);
78     tmp4 = __lsx_vshuf_b(tmp3, tmp5, shuff);
79     tmp4 = __lsx_vpackev_b(tmp3, tmp4);
80     out0 = filt_8tap_dpadd_s_h(tmp0, tmp1, tmp2, tmp4, filt_vt0, filt_vt1,
81                                filt_vt2, filt_vt3);
82     src1 = horiz_8tap_filt(src9, src10, mask0, mask1, mask2, mask3, filt_hz0,
83                            filt_hz1, filt_hz2, filt_hz3);
84     src0 = __lsx_vshuf_b(src1, tmp3, shuff);
85     src0 = __lsx_vpackev_b(src1, src0);
86     out1 = filt_8tap_dpadd_s_h(tmp1, tmp2, tmp4, src0, filt_vt0, filt_vt1,
87                                filt_vt2, filt_vt3);
88     out0 = __lsx_vssrarni_b_h(out1, out0, 7);
89     out0 = __lsx_vxori_b(out0, 128);
90     __lsx_vstelm_w(out0, dst, 0, 0);
91     dst += dst_stride;
92     __lsx_vstelm_w(out0, dst, 0, 1);
93     dst += dst_stride;
94     __lsx_vstelm_w(out0, dst, 0, 2);
95     dst += dst_stride;
96     __lsx_vstelm_w(out0, dst, 0, 3);
97     dst += dst_stride;
98 
99     tmp5 = src1;
100     tmp0 = tmp2;
101     tmp1 = tmp4;
102     tmp2 = src0;
103   }
104 }
105 
common_hv_8ht_8vt_8w_lsx(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,int8_t * filter_horiz,int8_t * filter_vert,int32_t height)106 static void common_hv_8ht_8vt_8w_lsx(const uint8_t *src, int32_t src_stride,
107                                      uint8_t *dst, int32_t dst_stride,
108                                      int8_t *filter_horiz, int8_t *filter_vert,
109                                      int32_t height) {
110   uint32_t loop_cnt = (height >> 2);
111   __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
112   __m128i filt_hz0, filt_hz1, filt_hz2, filt_hz3;
113   __m128i filt_vt0, filt_vt1, filt_vt2, filt_vt3;
114   __m128i mask0, mask1, mask2, mask3;
115   __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
116   __m128i out0, out1;
117 
118   mask0 = __lsx_vld(mc_filt_mask_arr, 0);
119   src -= (3 + 3 * src_stride);
120   DUP4_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filter_horiz, 4,
121             filter_horiz, 6, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
122   DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
123   mask3 = __lsx_vaddi_bu(mask0, 6);
124 
125   LSX_LD_4(src, src_stride, src0, src1, src2, src3);
126   src += src_stride;
127   src4 = __lsx_vld(src, 0);
128   src += src_stride;
129   src5 = __lsx_vld(src, 0);
130   src += src_stride;
131   src6 = __lsx_vld(src, 0);
132   src += src_stride;
133   DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
134             src1, src2, src3);
135   DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5);
136   src6 = __lsx_vxori_b(src6, 128);
137 
138   src0 = horiz_8tap_filt(src0, src0, mask0, mask1, mask2, mask3, filt_hz0,
139                          filt_hz1, filt_hz2, filt_hz3);
140   src1 = horiz_8tap_filt(src1, src1, mask0, mask1, mask2, mask3, filt_hz0,
141                          filt_hz1, filt_hz2, filt_hz3);
142   src2 = horiz_8tap_filt(src2, src2, mask0, mask1, mask2, mask3, filt_hz0,
143                          filt_hz1, filt_hz2, filt_hz3);
144   src3 = horiz_8tap_filt(src3, src3, mask0, mask1, mask2, mask3, filt_hz0,
145                          filt_hz1, filt_hz2, filt_hz3);
146   src4 = horiz_8tap_filt(src4, src4, mask0, mask1, mask2, mask3, filt_hz0,
147                          filt_hz1, filt_hz2, filt_hz3);
148   src5 = horiz_8tap_filt(src5, src5, mask0, mask1, mask2, mask3, filt_hz0,
149                          filt_hz1, filt_hz2, filt_hz3);
150   src6 = horiz_8tap_filt(src6, src6, mask0, mask1, mask2, mask3, filt_hz0,
151                          filt_hz1, filt_hz2, filt_hz3);
152 
153   DUP4_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filter_vert, 4,
154             filter_vert, 6, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
155   DUP4_ARG2(__lsx_vpackev_b, src1, src0, src3, src2, src5, src4, src2, src1,
156             tmp0, tmp1, tmp2, tmp4);
157   DUP2_ARG2(__lsx_vpackev_b, src4, src3, src6, src5, tmp5, tmp6);
158 
159   for (; loop_cnt--;) {
160     LSX_LD_4(src, src_stride, src7, src8, src9, src10);
161     src += src_stride;
162     DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7,
163               src8, src9, src10);
164     src7 = horiz_8tap_filt(src7, src7, mask0, mask1, mask2, mask3, filt_hz0,
165                            filt_hz1, filt_hz2, filt_hz3);
166     tmp3 = __lsx_vpackev_b(src7, src6);
167     out0 = filt_8tap_dpadd_s_h(tmp0, tmp1, tmp2, tmp3, filt_vt0, filt_vt1,
168                                filt_vt2, filt_vt3);
169     src8 = horiz_8tap_filt(src8, src8, mask0, mask1, mask2, mask3, filt_hz0,
170                            filt_hz1, filt_hz2, filt_hz3);
171     src0 = __lsx_vpackev_b(src8, src7);
172     out1 = filt_8tap_dpadd_s_h(tmp4, tmp5, tmp6, src0, filt_vt0, filt_vt1,
173                                filt_vt2, filt_vt3);
174     src9 = horiz_8tap_filt(src9, src9, mask0, mask1, mask2, mask3, filt_hz0,
175                            filt_hz1, filt_hz2, filt_hz3);
176     src1 = __lsx_vpackev_b(src9, src8);
177     src3 = filt_8tap_dpadd_s_h(tmp1, tmp2, tmp3, src1, filt_vt0, filt_vt1,
178                                filt_vt2, filt_vt3);
179     src10 = horiz_8tap_filt(src10, src10, mask0, mask1, mask2, mask3, filt_hz0,
180                             filt_hz1, filt_hz2, filt_hz3);
181     src2 = __lsx_vpackev_b(src10, src9);
182     src4 = filt_8tap_dpadd_s_h(tmp5, tmp6, src0, src2, filt_vt0, filt_vt1,
183                                filt_vt2, filt_vt3);
184     DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, src4, src3, 7, out0, out1);
185     DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
186     __lsx_vstelm_d(out0, dst, 0, 0);
187     dst += dst_stride;
188     __lsx_vstelm_d(out0, dst, 0, 1);
189     dst += dst_stride;
190     __lsx_vstelm_d(out1, dst, 0, 0);
191     dst += dst_stride;
192     __lsx_vstelm_d(out1, dst, 0, 1);
193     dst += dst_stride;
194 
195     src6 = src10;
196     tmp0 = tmp2;
197     tmp1 = tmp3;
198     tmp2 = src1;
199     tmp4 = tmp6;
200     tmp5 = src0;
201     tmp6 = src2;
202   }
203 }
204 
/*
 * 8-tap horizontal + 8-tap vertical filtering of a 16-pixel-wide block,
 * done as two adjacent 8-pixel-wide columns. Parameters are forwarded
 * unchanged to common_hv_8ht_8vt_8w_lsx for each column.
 */
static void common_hv_8ht_8vt_16w_lsx(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      int8_t *filter_horiz, int8_t *filter_vert,
                                      int32_t height) {
  int32_t multiple8_cnt;
  /* Same column loop as the 32- and 64-wide variants, with two columns. */
  for (multiple8_cnt = 2; multiple8_cnt--;) {
    common_hv_8ht_8vt_8w_lsx(src, src_stride, dst, dst_stride, filter_horiz,
                             filter_vert, height);
    src += 8;
    dst += 8;
  }
}

/*
 * 8-tap horizontal + 8-tap vertical filtering of a 32-pixel-wide block,
 * done as four adjacent 8-pixel-wide columns. Parameters are forwarded
 * unchanged to common_hv_8ht_8vt_8w_lsx for each column.
 */
static void common_hv_8ht_8vt_32w_lsx(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      int8_t *filter_horiz, int8_t *filter_vert,
                                      int32_t height) {
  int32_t multiple8_cnt;
  for (multiple8_cnt = 4; multiple8_cnt--;) {
    common_hv_8ht_8vt_8w_lsx(src, src_stride, dst, dst_stride, filter_horiz,
                             filter_vert, height);
    /* Advance to the next 8-pixel column. */
    src += 8;
    dst += 8;
  }
}

/*
 * 8-tap horizontal + 8-tap vertical filtering of a 64-pixel-wide block,
 * done as eight adjacent 8-pixel-wide columns. Parameters are forwarded
 * unchanged to common_hv_8ht_8vt_8w_lsx for each column.
 */
static void common_hv_8ht_8vt_64w_lsx(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      int8_t *filter_horiz, int8_t *filter_vert,
                                      int32_t height) {
  int32_t multiple8_cnt;
  for (multiple8_cnt = 8; multiple8_cnt--;) {
    common_hv_8ht_8vt_8w_lsx(src, src_stride, dst, dst_stride, filter_horiz,
                             filter_vert, height);
    /* Advance to the next 8-pixel column. */
    src += 8;
    dst += 8;
  }
}

common_hv_2ht_2vt_4x4_lsx(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,int8_t * filter_horiz,int8_t * filter_vert)246 static void common_hv_2ht_2vt_4x4_lsx(const uint8_t *src, int32_t src_stride,
247                                       uint8_t *dst, int32_t dst_stride,
248                                       int8_t *filter_horiz,
249                                       int8_t *filter_vert) {
250   __m128i src0, src1, src2, src3, src4, mask;
251   __m128i filt_vt, filt_hz, vec0, vec1;
252   __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1;
253   __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
254 
255   int32_t src_stride2 = src_stride << 1;
256   int32_t src_stride3 = src_stride + src_stride2;
257   int32_t src_stride4 = src_stride2 << 1;
258 
259   int32_t dst_stride2 = dst_stride << 1;
260   int32_t dst_stride3 = dst_stride2 + dst_stride;
261   mask = __lsx_vld(mc_filt_mask_arr, 16);
262 
263   /* rearranging filter */
264   filt_hz = __lsx_vldrepl_h(filter_horiz, 0);
265   filt_vt = __lsx_vldrepl_h(filter_vert, 0);
266 
267   src0 = __lsx_vld(src, 0);
268   DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
269             src, src_stride4, src1, src2, src3, src4);
270   hz_out0 = horiz_2tap_filt_uh(src0, src1, mask, filt_hz);
271   hz_out2 = horiz_2tap_filt_uh(src2, src3, mask, filt_hz);
272   hz_out4 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz);
273 
274   hz_out1 = __lsx_vshuf_b(hz_out2, hz_out0, shuff);
275   hz_out3 = __lsx_vpickod_d(hz_out4, hz_out2);
276 
277   DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
278   DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1);
279   DUP2_ARG3(__lsx_vssrarni_bu_h, tmp0, tmp0, FILTER_BITS, tmp1, tmp1,
280             FILTER_BITS, tmp0, tmp1);
281 
282   __lsx_vstelm_w(tmp0, dst, 0, 0);
283   __lsx_vstelm_w(tmp0, dst + dst_stride, 0, 1);
284   __lsx_vstelm_w(tmp1, dst + dst_stride2, 0, 0);
285   __lsx_vstelm_w(tmp1, dst + dst_stride3, 0, 1);
286 }
287 
common_hv_2ht_2vt_4x8_lsx(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,int8_t * filter_horiz,int8_t * filter_vert)288 static void common_hv_2ht_2vt_4x8_lsx(const uint8_t *src, int32_t src_stride,
289                                       uint8_t *dst, int32_t dst_stride,
290                                       int8_t *filter_horiz,
291                                       int8_t *filter_vert) {
292   __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
293   __m128i filt_hz, filt_vt, vec0, vec1, vec2, vec3;
294   __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
295   __m128i hz_out7, hz_out8, vec4, vec5, vec6, vec7;
296   __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
297 
298   int32_t src_stride2 = src_stride << 1;
299   int32_t src_stride3 = src_stride2 + src_stride;
300   int32_t src_stride4 = src_stride2 << 1;
301 
302   int32_t dst_stride2 = dst_stride << 1;
303   int32_t dst_stride3 = dst_stride2 + dst_stride;
304   int32_t dst_stride4 = dst_stride2 << 1;
305 
306   mask = __lsx_vld(mc_filt_mask_arr, 16);
307 
308   /* rearranging filter */
309   DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_vert, 0, filt_hz, filt_vt);
310 
311   src0 = __lsx_vld(src, 0);
312   DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
313             src, src_stride4, src1, src2, src3, src4);
314   src += src_stride4;
315   DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
316             src, src_stride4, src5, src6, src7, src8);
317   src += src_stride4;
318 
319   hz_out0 = horiz_2tap_filt_uh(src0, src1, mask, filt_hz);
320   hz_out2 = horiz_2tap_filt_uh(src2, src3, mask, filt_hz);
321   hz_out4 = horiz_2tap_filt_uh(src4, src5, mask, filt_hz);
322   hz_out6 = horiz_2tap_filt_uh(src6, src7, mask, filt_hz);
323   hz_out8 = horiz_2tap_filt_uh(src8, src8, mask, filt_hz);
324 
325   DUP2_ARG3(__lsx_vshuf_b, hz_out2, hz_out0, shuff, hz_out4, hz_out2, shuff,
326             hz_out1, hz_out3);
327   hz_out5 = __lsx_vshuf_b(hz_out6, hz_out4, shuff);
328   hz_out7 = __lsx_vpickod_d(hz_out8, hz_out6);
329   DUP4_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, hz_out5,
330             hz_out4, hz_out7, hz_out6, vec0, vec1, vec2, vec3);
331   DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, vec2, filt_vt, vec3,
332             filt_vt, vec4, vec5, vec6, vec7);
333   DUP4_ARG3(__lsx_vssrarni_bu_h, vec4, vec4, FILTER_BITS, vec5, vec5,
334             FILTER_BITS, vec6, vec6, FILTER_BITS, vec7, vec7, FILTER_BITS, vec4,
335             vec5, vec6, vec7);
336 
337   __lsx_vstelm_w(vec4, dst, 0, 0);
338   __lsx_vstelm_w(vec4, dst + dst_stride, 0, 1);
339   __lsx_vstelm_w(vec5, dst + dst_stride2, 0, 0);
340   __lsx_vstelm_w(vec5, dst + dst_stride3, 0, 1);
341   dst += dst_stride4;
342   __lsx_vstelm_w(vec6, dst, 0, 0);
343   __lsx_vstelm_w(vec6, dst + dst_stride, 0, 1);
344   __lsx_vstelm_w(vec7, dst + dst_stride2, 0, 0);
345   __lsx_vstelm_w(vec7, dst + dst_stride3, 0, 1);
346 }
347 
/*
 * 2-tap horizontal + 2-tap vertical filtering of a 4-pixel-wide block.
 * Dispatches on height to the 4x4 or 4x8 kernel. Any other height falls
 * through without writing to dst.
 */
static void common_hv_2ht_2vt_4w_lsx(const uint8_t *src, int32_t src_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     int8_t *filter_horiz, int8_t *filter_vert,
                                     int32_t height) {
  if (height == 4) {
    common_hv_2ht_2vt_4x4_lsx(src, src_stride, dst, dst_stride, filter_horiz,
                              filter_vert);
  } else if (height == 8) {
    common_hv_2ht_2vt_4x8_lsx(src, src_stride, dst, dst_stride, filter_horiz,
                              filter_vert);
  }
}

common_hv_2ht_2vt_8x4_lsx(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,int8_t * filter_horiz,int8_t * filter_vert)361 static void common_hv_2ht_2vt_8x4_lsx(const uint8_t *src, int32_t src_stride,
362                                       uint8_t *dst, int32_t dst_stride,
363                                       int8_t *filter_horiz,
364                                       int8_t *filter_vert) {
365   __m128i src0, src1, src2, src3, src4, mask;
366   __m128i filt_hz, filt_vt, vec0, vec1, vec2, vec3;
367   __m128i hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
368 
369   int32_t src_stride2 = src_stride << 1;
370   int32_t src_stride3 = src_stride2 + src_stride;
371   int32_t src_stride4 = src_stride2 << 1;
372 
373   int32_t dst_stride2 = dst_stride << 1;
374   int32_t dst_stride3 = dst_stride2 + dst_stride;
375 
376   mask = __lsx_vld(mc_filt_mask_arr, 0);
377 
378   /* rearranging filter */
379   DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_vert, 0, filt_hz, filt_vt);
380 
381   src0 = __lsx_vld(src, 0);
382   DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
383             src, src_stride4, src1, src2, src3, src4);
384 
385   hz_out0 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz);
386   hz_out1 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz);
387   vec0 = __lsx_vpackev_b(hz_out1, hz_out0);
388   tmp0 = __lsx_vdp2_h_bu(vec0, filt_vt);
389 
390   hz_out0 = horiz_2tap_filt_uh(src2, src2, mask, filt_hz);
391   vec1 = __lsx_vpackev_b(hz_out0, hz_out1);
392   tmp1 = __lsx_vdp2_h_bu(vec1, filt_vt);
393 
394   hz_out1 = horiz_2tap_filt_uh(src3, src3, mask, filt_hz);
395   vec2 = __lsx_vpackev_b(hz_out1, hz_out0);
396   tmp2 = __lsx_vdp2_h_bu(vec2, filt_vt);
397 
398   hz_out0 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz);
399   vec3 = __lsx_vpackev_b(hz_out0, hz_out1);
400   tmp3 = __lsx_vdp2_h_bu(vec3, filt_vt);
401 
402   DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2,
403             FILTER_BITS, tmp0, tmp1);
404 
405   __lsx_vstelm_d(tmp0, dst, 0, 0);
406   __lsx_vstelm_d(tmp0, dst + dst_stride, 0, 1);
407   __lsx_vstelm_d(tmp1, dst + dst_stride2, 0, 0);
408   __lsx_vstelm_d(tmp1, dst + dst_stride3, 0, 1);
409 }
410 
common_hv_2ht_2vt_8x8mult_lsx(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,int8_t * filter_horiz,int8_t * filter_vert,int32_t height)411 static void common_hv_2ht_2vt_8x8mult_lsx(const uint8_t *src,
412                                           int32_t src_stride, uint8_t *dst,
413                                           int32_t dst_stride,
414                                           int8_t *filter_horiz,
415                                           int8_t *filter_vert, int32_t height) {
416   uint32_t loop_cnt = (height >> 3);
417   __m128i src0, src1, src2, src3, src4, mask;
418   __m128i filt_hz, filt_vt, vec0;
419   __m128i hz_out0, hz_out1, tmp1, tmp2, tmp3, tmp4;
420 
421   int32_t src_stride2 = src_stride << 1;
422   int32_t src_stride3 = src_stride2 + src_stride;
423   int32_t src_stride4 = src_stride2 << 1;
424 
425   mask = __lsx_vld(mc_filt_mask_arr, 0);
426 
427   /* rearranging filter */
428   DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_vert, 0, filt_hz, filt_vt);
429 
430   src0 = __lsx_vld(src, 0);
431   src += src_stride;
432 
433   hz_out0 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz);
434 
435   for (; loop_cnt--;) {
436     src1 = __lsx_vld(src, 0);
437     DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3);
438     src4 = __lsx_vldx(src, src_stride3);
439     src += src_stride4;
440 
441     hz_out1 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz);
442     vec0 = __lsx_vpackev_b(hz_out1, hz_out0);
443     tmp1 = __lsx_vdp2_h_bu(vec0, filt_vt);
444 
445     hz_out0 = horiz_2tap_filt_uh(src2, src2, mask, filt_hz);
446     vec0 = __lsx_vpackev_b(hz_out0, hz_out1);
447     tmp2 = __lsx_vdp2_h_bu(vec0, filt_vt);
448 
449     hz_out1 = horiz_2tap_filt_uh(src3, src3, mask, filt_hz);
450     vec0 = __lsx_vpackev_b(hz_out1, hz_out0);
451     tmp3 = __lsx_vdp2_h_bu(vec0, filt_vt);
452 
453     hz_out0 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz);
454     src1 = __lsx_vld(src, 0);
455     DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3);
456     src4 = __lsx_vldx(src, src_stride3);
457     src += src_stride4;
458     vec0 = __lsx_vpackev_b(hz_out0, hz_out1);
459     tmp4 = __lsx_vdp2_h_bu(vec0, filt_vt);
460 
461     DUP2_ARG3(__lsx_vssrarni_bu_h, tmp2, tmp1, FILTER_BITS, tmp4, tmp3,
462               FILTER_BITS, tmp1, tmp2);
463 
464     __lsx_vstelm_d(tmp1, dst, 0, 0);
465     dst += dst_stride;
466     __lsx_vstelm_d(tmp1, dst, 0, 1);
467     dst += dst_stride;
468     __lsx_vstelm_d(tmp2, dst, 0, 0);
469     dst += dst_stride;
470     __lsx_vstelm_d(tmp2, dst, 0, 1);
471     dst += dst_stride;
472 
473     hz_out1 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz);
474     vec0 = __lsx_vpackev_b(hz_out1, hz_out0);
475     tmp1 = __lsx_vdp2_h_bu(vec0, filt_vt);
476 
477     hz_out0 = horiz_2tap_filt_uh(src2, src2, mask, filt_hz);
478     vec0 = __lsx_vpackev_b(hz_out0, hz_out1);
479     tmp2 = __lsx_vdp2_h_bu(vec0, filt_vt);
480 
481     hz_out1 = horiz_2tap_filt_uh(src3, src3, mask, filt_hz);
482     vec0 = __lsx_vpackev_b(hz_out1, hz_out0);
483     tmp3 = __lsx_vdp2_h_bu(vec0, filt_vt);
484 
485     hz_out0 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz);
486     vec0 = __lsx_vpackev_b(hz_out0, hz_out1);
487     tmp4 = __lsx_vdp2_h_bu(vec0, filt_vt);
488 
489     DUP2_ARG3(__lsx_vssrarni_bu_h, tmp2, tmp1, FILTER_BITS, tmp4, tmp3,
490               FILTER_BITS, tmp1, tmp2);
491 
492     __lsx_vstelm_d(tmp1, dst, 0, 0);
493     dst += dst_stride;
494     __lsx_vstelm_d(tmp1, dst, 0, 1);
495     dst += dst_stride;
496     __lsx_vstelm_d(tmp2, dst, 0, 0);
497     dst += dst_stride;
498     __lsx_vstelm_d(tmp2, dst, 0, 1);
499     dst += dst_stride;
500   }
501 }
502 
/*
 * 2-tap horizontal + 2-tap vertical filtering of an 8-pixel-wide block.
 * A height of exactly 4 uses the 8x4 kernel. Every other height is handed to
 * the 8x8-multiple kernel, which produces rows in groups of eight.
 */
static void common_hv_2ht_2vt_8w_lsx(const uint8_t *src, int32_t src_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     int8_t *filter_horiz, int8_t *filter_vert,
                                     int32_t height) {
  if (height == 4) {
    common_hv_2ht_2vt_8x4_lsx(src, src_stride, dst, dst_stride, filter_horiz,
                              filter_vert);
  } else {
    common_hv_2ht_2vt_8x8mult_lsx(src, src_stride, dst, dst_stride,
                                  filter_horiz, filter_vert, height);
  }
}

common_hv_2ht_2vt_16w_lsx(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,int8_t * filter_horiz,int8_t * filter_vert,int32_t height)516 static void common_hv_2ht_2vt_16w_lsx(const uint8_t *src, int32_t src_stride,
517                                       uint8_t *dst, int32_t dst_stride,
518                                       int8_t *filter_horiz, int8_t *filter_vert,
519                                       int32_t height) {
520   uint32_t loop_cnt = (height >> 2);
521   __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask;
522   __m128i filt_hz, filt_vt, vec0, vec1;
523   __m128i tmp, tmp1, tmp2, hz_out0, hz_out1, hz_out2, hz_out3;
524 
525   int32_t src_stride2 = src_stride << 1;
526   int32_t src_stride3 = src_stride2 + src_stride;
527   int32_t src_stride4 = src_stride2 << 1;
528 
529   mask = __lsx_vld(mc_filt_mask_arr, 0);
530 
531   /* rearranging filter */
532   DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_vert, 0, filt_hz, filt_vt);
533 
534   DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1);
535   src += src_stride;
536 
537   hz_out0 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz);
538   hz_out2 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz);
539 
540   for (; loop_cnt--;) {
541     uint8_t *src_tmp0 = src + 8;
542 
543     DUP2_ARG2(__lsx_vld, src, 0, src_tmp0, 0, src0, src1);
544     DUP4_ARG2(__lsx_vldx, src, src_stride, src_tmp0, src_stride, src,
545               src_stride2, src_tmp0, src_stride2, src2, src3, src4, src5);
546     DUP2_ARG2(__lsx_vldx, src, src_stride3, src_tmp0, src_stride3, src6, src7);
547     src += src_stride4;
548 
549     hz_out1 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz);
550     hz_out3 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz);
551     DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
552     DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp1, tmp2);
553     tmp = __lsx_vssrarni_bu_h(tmp2, tmp1, FILTER_BITS);
554     __lsx_vst(tmp, dst, 0);
555     dst += dst_stride;
556 
557     hz_out0 = horiz_2tap_filt_uh(src2, src2, mask, filt_hz);
558     hz_out2 = horiz_2tap_filt_uh(src3, src3, mask, filt_hz);
559     DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
560     DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp1, tmp2);
561     tmp = __lsx_vssrarni_bu_h(tmp2, tmp1, FILTER_BITS);
562     __lsx_vst(tmp, dst, 0);
563     dst += dst_stride;
564 
565     hz_out1 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz);
566     hz_out3 = horiz_2tap_filt_uh(src5, src5, mask, filt_hz);
567     DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
568     DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp1, tmp2);
569     tmp = __lsx_vssrarni_bu_h(tmp2, tmp1, FILTER_BITS);
570     __lsx_vst(tmp, dst, 0);
571     dst += dst_stride;
572 
573     hz_out0 = horiz_2tap_filt_uh(src6, src6, mask, filt_hz);
574     hz_out2 = horiz_2tap_filt_uh(src7, src7, mask, filt_hz);
575     DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
576     DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp1, tmp2);
577     tmp = __lsx_vssrarni_bu_h(tmp2, tmp1, FILTER_BITS);
578     __lsx_vst(tmp, dst, 0);
579     dst += dst_stride;
580   }
581 }
582 
/*
 * 2-tap horizontal + 2-tap vertical filtering of a 32-pixel-wide block,
 * done as two adjacent 16-pixel-wide columns. Parameters are forwarded
 * unchanged to common_hv_2ht_2vt_16w_lsx for each column.
 */
static void common_hv_2ht_2vt_32w_lsx(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      int8_t *filter_horiz, int8_t *filter_vert,
                                      int32_t height) {
  /* Left 16 columns. */
  common_hv_2ht_2vt_16w_lsx(src, src_stride, dst, dst_stride, filter_horiz,
                            filter_vert, height);
  src += 16;
  dst += 16;

  /* Right 16 columns. */
  common_hv_2ht_2vt_16w_lsx(src, src_stride, dst, dst_stride, filter_horiz,
                            filter_vert, height);
}

/*
 * 2-tap horizontal + 2-tap vertical filtering of a 64-pixel-wide block,
 * done as four adjacent 16-pixel-wide columns. Parameters are forwarded
 * unchanged to common_hv_2ht_2vt_16w_lsx for each column.
 */
static void common_hv_2ht_2vt_64w_lsx(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      int8_t *filter_horiz, int8_t *filter_vert,
                                      int32_t height) {
  int32_t multiple16_cnt;
  for (multiple16_cnt = 4; multiple16_cnt--;) {
    common_hv_2ht_2vt_16w_lsx(src, src_stride, dst, dst_stride, filter_horiz,
                              filter_vert, height);
    /* Advance to the next 16-pixel column. */
    src += 16;
    dst += 16;
  }
}

/*
 * Two-dimensional convolution entry point for LoongArch LSX.
 *
 * src, src_stride   source pixels and row pitch.
 * dst, dst_stride   destination pixels and row pitch.
 * filter            kernel table; rows x0_q4 and y0_q4 are used as the
 *                   horizontal and vertical kernels.
 * x_step_q4,        must both be 16 (asserted); also forwarded to the C
 * y_step_q4         fallback.
 * w, h              block width and height.
 *
 * The LSX kernels are used when both kernels report 2 taps or neither does,
 * and w is one of 4, 8, 16, 32 or 64. Every other combination is delegated
 * to vpx_convolve8_c.
 */
void vpx_convolve8_lsx(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                       ptrdiff_t dst_stride, const InterpKernel *filter,
                       int x0_q4, int32_t x_step_q4, int y0_q4,
                       int32_t y_step_q4, int32_t w, int32_t h) {
  const int16_t *const filter_x = filter[x0_q4];
  const int16_t *const filter_y = filter[y0_q4];
  int8_t filt_hor[8], filt_ver[8];
  int cnt;
  int x_is_2tap, y_is_2tap;

  assert(x_step_q4 == 16);
  assert(y_step_q4 == 16);
  /* Rejects kernels whose second 32-bit word is 0x800000. NOTE(review): on a
   * little-endian target that is tap[2] == 0 and tap[3] == 128, a value that
   * would not survive the narrowing to int8_t below - confirm intent. */
  assert(((const int32_t *)filter_x)[1] != 0x800000);
  assert(((const int32_t *)filter_y)[1] != 0x800000);

  /* The LSX kernels consume the taps as signed bytes. */
  for (cnt = 0; cnt < 8; ++cnt) {
    filt_hor[cnt] = (int8_t)filter_x[cnt];
    filt_ver[cnt] = (int8_t)filter_y[cnt];
  }

  x_is_2tap = vpx_get_filter_taps(filter_x) == 2;
  y_is_2tap = vpx_get_filter_taps(filter_y) == 2;

  if (x_is_2tap && y_is_2tap) {
    /* The 2-tap kernels read a single tap pair from offset 0 of the pointer
     * they receive, so they are handed taps [3] and [4]. */
    switch (w) {
      case 4:
        common_hv_2ht_2vt_4w_lsx(src, (int32_t)src_stride, dst,
                                 (int32_t)dst_stride, &filt_hor[3],
                                 &filt_ver[3], (int32_t)h);
        break;
      case 8:
        common_hv_2ht_2vt_8w_lsx(src, (int32_t)src_stride, dst,
                                 (int32_t)dst_stride, &filt_hor[3],
                                 &filt_ver[3], (int32_t)h);
        break;
      case 16:
        common_hv_2ht_2vt_16w_lsx(src, (int32_t)src_stride, dst,
                                  (int32_t)dst_stride, &filt_hor[3],
                                  &filt_ver[3], (int32_t)h);
        break;
      case 32:
        common_hv_2ht_2vt_32w_lsx(src, (int32_t)src_stride, dst,
                                  (int32_t)dst_stride, &filt_hor[3],
                                  &filt_ver[3], (int32_t)h);
        break;
      case 64:
        common_hv_2ht_2vt_64w_lsx(src, (int32_t)src_stride, dst,
                                  (int32_t)dst_stride, &filt_hor[3],
                                  &filt_ver[3], (int32_t)h);
        break;
      default:
        vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4,
                        x_step_q4, y0_q4, y_step_q4, w, h);
        break;
    }
  } else if (x_is_2tap || y_is_2tap) {
    /* Mixed tap counts have no LSX kernel here. */
    vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4,
                    y0_q4, y_step_q4, w, h);
  } else {
    switch (w) {
      case 4:
        common_hv_8ht_8vt_4w_lsx(src, (int32_t)src_stride, dst,
                                 (int32_t)dst_stride, filt_hor, filt_ver,
                                 (int32_t)h);
        break;
      case 8:
        common_hv_8ht_8vt_8w_lsx(src, (int32_t)src_stride, dst,
                                 (int32_t)dst_stride, filt_hor, filt_ver,
                                 (int32_t)h);
        break;
      case 16:
        common_hv_8ht_8vt_16w_lsx(src, (int32_t)src_stride, dst,
                                  (int32_t)dst_stride, filt_hor, filt_ver,
                                  (int32_t)h);
        break;
      case 32:
        common_hv_8ht_8vt_32w_lsx(src, (int32_t)src_stride, dst,
                                  (int32_t)dst_stride, filt_hor, filt_ver,
                                  (int32_t)h);
        break;
      case 64:
        common_hv_8ht_8vt_64w_lsx(src, (int32_t)src_stride, dst,
                                  (int32_t)dst_stride, filt_hor, filt_ver,
                                  (int32_t)h);
        break;
      default:
        vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4,
                        x_step_q4, y0_q4, y_step_q4, w, h);
        break;
    }
  }
}