/*
 *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/loongarch/loopfilter_lsx.h"

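/* LSX (LoongArch 128-bit SIMD) versions of the VP9 8-wide loop filters.
 * Each function follows the same scheme as the generic C loop filters in
 * vpx_dsp: build the filter mask, the high-edge-variance (hev) mask and the
 * flat mask, apply the 4-tap filter wherever the mask allows, and replace the
 * result with the 8-tap (filter8) output only where flat is set. */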
void vpx_lpf_horizontal_8_lsx(uint8_t *dst, int32_t stride,
                              const uint8_t *b_limit_ptr,
                              const uint8_t *limit_ptr,
                              const uint8_t *thresh_ptr) {
  __m128i mask, hev, flat, thresh, b_limit, limit;
  __m128i p3, p2, p1, p0, q3, q2, q1, q0;
  __m128i p2_out, p1_out, p0_out, q0_out, q1_out;
  __m128i p2_filter8, p1_filter8, p0_filter8;
  __m128i q0_filter8, q1_filter8, q2_filter8;
  __m128i p3_l, p2_l, p1_l, p0_l, q3_l, q2_l, q1_l, q0_l;

  int32_t stride2 = stride << 1;
  int32_t stride3 = stride2 + stride;
  int32_t stride4 = stride2 << 1;

  /* load vector elements */
  DUP4_ARG2(__lsx_vldx, dst, -stride4, dst, -stride3, dst, -stride2, dst,
            -stride, p3, p2, p1, p0);
  q0 = __lsx_vld(dst, 0);
  DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2);
  q3 = __lsx_vldx(dst, stride3);

  thresh = __lsx_vldrepl_b(thresh_ptr, 0);
  b_limit = __lsx_vldrepl_b(b_limit_ptr, 0);
  limit = __lsx_vldrepl_b(limit_ptr, 0);

  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
               mask, flat);
  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);

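  /* The low 64 bits of flat cover the 8 pixels of this edge; duplicate them
   * into the high half so the whole-register test below and the 128-bit
   * bit-selects in the else branch see the same mask in both halves. */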
  flat = __lsx_vilvl_d(flat, flat);

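  /* flat == 0 for every pixel: only the 4-tap filter output needs to be
   * written back. */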
  if (__lsx_bz_v(flat)) {
    __lsx_vstelm_d(p1_out, dst - stride2, 0, 0);
    __lsx_vstelm_d(p0_out, dst - stride, 0, 0);
    __lsx_vstelm_d(q0_out, dst, 0, 0);
    __lsx_vstelm_d(q1_out, dst + stride, 0, 0);
  } else {
    DUP4_ARG2(__lsx_vsllwil_hu_bu, p3, 0, p2, 0, p1, 0, p0, 0, p3_l, p2_l, p1_l,
              p0_l);
    DUP4_ARG2(__lsx_vsllwil_hu_bu, q0, 0, q1, 0, q2, 0, q3, 0, q0_l, q1_l, q2_l,
              q3_l);
    VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filter8,
                p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8);

    DUP2_ARG2(__lsx_vpickev_b, p1_filter8, p2_filter8, q0_filter8, p0_filter8,
              p1_filter8, q0_filter8);
    q2_filter8 = __lsx_vpickev_b(q2_filter8, q1_filter8);

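    /* The filter8 results are now narrowed to bytes and packed pairwise
     * (p2|p1, p0|q0, q1|q2).  Pack the filter4/original rows the same way so
     * that three bit-selects produce all six output rows, each written with
     * two 64-bit element stores below. */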
    p2 = __lsx_vilvl_d(p1_out, p2);
    p0_out = __lsx_vilvl_d(q0_out, p0_out);
    q1_out = __lsx_vilvl_d(q2, q1_out);

    DUP2_ARG3(__lsx_vbitsel_v, p2, p1_filter8, flat, p0_out, q0_filter8, flat,
              p2_out, p1_out);
    p0_out = __lsx_vbitsel_v(q1_out, q2_filter8, flat);
    dst -= stride3;

    __lsx_vstelm_d(p2_out, dst, 0, 0);
    __lsx_vstelm_d(p2_out, dst + stride, 0, 1);
    __lsx_vstelm_d(p1_out, dst + stride2, 0, 0);
    __lsx_vstelm_d(p1_out, dst + stride3, 0, 1);

    dst += stride4;
    __lsx_vstelm_d(p0_out, dst, 0, 0);
    dst += stride;
    __lsx_vstelm_d(p0_out, dst, 0, 1);
  }
}

void vpx_lpf_horizontal_8_dual_lsx(
    uint8_t *dst, int32_t stride, const uint8_t *b_limit0,
    const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *b_limit1,
    const uint8_t *limit1, const uint8_t *thresh1) {
  __m128i p3, p2, p1, p0, q3, q2, q1, q0;
  __m128i p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
  __m128i flat, mask, hev, thresh, b_limit, limit;
  __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
  __m128i p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h;
  __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l;
  __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l;
  __m128i p2_filt8_h, p1_filt8_h, p0_filt8_h;
  __m128i q0_filt8_h, q1_filt8_h, q2_filt8_h;

  int32_t stride2 = stride << 1;
  int32_t stride3 = stride2 + stride;
  int32_t stride4 = stride2 << 1;

  DUP4_ARG2(__lsx_vldx, dst, -stride4, dst, -stride3, dst, -stride2, dst,
            -stride, p3, p2, p1, p0);
  q0 = __lsx_vld(dst, 0);
  DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2);
  q3 = __lsx_vldx(dst, stride3);

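  /* Each 64-bit half of the 16-pixel registers belongs to a different edge, so
   * pack the edge-0 and edge-1 thresholds into the low and high halves of one
   * vector (p2_out is only used as a scratch register here). */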
  thresh = __lsx_vldrepl_b(thresh0, 0);
  p2_out = __lsx_vldrepl_b(thresh1, 0);
  thresh = __lsx_vilvl_d(p2_out, thresh);

  b_limit = __lsx_vldrepl_b(b_limit0, 0);
  p2_out = __lsx_vldrepl_b(b_limit1, 0);
  b_limit = __lsx_vilvl_d(p2_out, b_limit);

  limit = __lsx_vldrepl_b(limit0, 0);
  p2_out = __lsx_vldrepl_b(limit1, 0);
  limit = __lsx_vilvl_d(p2_out, limit);

  /* mask and hev */
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
               mask, flat);
  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);

  if (__lsx_bz_v(flat)) {
    __lsx_vst(p1_out, dst - stride2, 0);
    __lsx_vst(p0_out, dst - stride, 0);
    __lsx_vst(q0_out, dst, 0);
    __lsx_vst(q1_out, dst + stride, 0);
  } else {
    DUP4_ARG2(__lsx_vsllwil_hu_bu, p3, 0, p2, 0, p1, 0, p0, 0, p3_l, p2_l, p1_l,
              p0_l);
    DUP4_ARG2(__lsx_vsllwil_hu_bu, q0, 0, q1, 0, q2, 0, q3, 0, q0_l, q1_l, q2_l,
              q3_l);
    VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
                p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);

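    /* Repeat the 8-tap filter on the high 8 bytes (the second edge), widened
     * to 16 bits with vexth. */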
    DUP4_ARG1(__lsx_vexth_hu_bu, p3, p2, p1, p0, p3_h, p2_h, p1_h, p0_h);
    DUP4_ARG1(__lsx_vexth_hu_bu, q0, q1, q2, q3, q0_h, q1_h, q2_h, q3_h);
    VP9_FILTER8(p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h, p2_filt8_h,
                p1_filt8_h, p0_filt8_h, q0_filt8_h, q1_filt8_h, q2_filt8_h);

    /* convert 16 bit output data into 8 bit */
    DUP4_ARG2(__lsx_vpickev_b, p2_filt8_h, p2_filt8_l, p1_filt8_h, p1_filt8_l,
              p0_filt8_h, p0_filt8_l, q0_filt8_h, q0_filt8_l, p2_filt8_l,
              p1_filt8_l, p0_filt8_l, q0_filt8_l);
    DUP2_ARG2(__lsx_vpickev_b, q1_filt8_h, q1_filt8_l, q2_filt8_h, q2_filt8_l,
              q1_filt8_l, q2_filt8_l);

    /* store pixel values */
    p2_out = __lsx_vbitsel_v(p2, p2_filt8_l, flat);
    p1_out = __lsx_vbitsel_v(p1_out, p1_filt8_l, flat);
    p0_out = __lsx_vbitsel_v(p0_out, p0_filt8_l, flat);
    q0_out = __lsx_vbitsel_v(q0_out, q0_filt8_l, flat);
    q1_out = __lsx_vbitsel_v(q1_out, q1_filt8_l, flat);
    q2_out = __lsx_vbitsel_v(q2, q2_filt8_l, flat);

    __lsx_vst(p2_out, dst - stride3, 0);
    __lsx_vst(p1_out, dst - stride2, 0);
    __lsx_vst(p0_out, dst - stride, 0);
    __lsx_vst(q0_out, dst, 0);
    __lsx_vst(q1_out, dst + stride, 0);
    __lsx_vst(q2_out, dst + stride2, 0);
  }
}

void vpx_lpf_vertical_8_lsx(uint8_t *dst, int32_t stride,
                            const uint8_t *b_limit_ptr,
                            const uint8_t *limit_ptr,
                            const uint8_t *thresh_ptr) {
  __m128i p3, p2, p1, p0, q3, q2, q1, q0;
  __m128i p1_out, p0_out, q0_out, q1_out;
  __m128i flat, mask, hev, thresh, b_limit, limit;
  __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
  __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l;
  __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l;
  __m128i zero = __lsx_vldi(0);

  int32_t stride2 = stride << 1;
  int32_t stride3 = stride2 + stride;
  int32_t stride4 = stride2 << 1;
  uint8_t *dst_tmp = dst - 4;

  /* load vector elements */
  p3 = __lsx_vld(dst_tmp, 0);
  DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, p2, p1);
  p0 = __lsx_vldx(dst_tmp, stride3);
  dst_tmp += stride4;
  q0 = __lsx_vld(dst_tmp, 0);
  DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, q1, q2);
  q3 = __lsx_vldx(dst_tmp, stride3);

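  /* transpose the 8x8 block so that each of p3..q3 holds one column of the
   * edge across the 8 rows */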
  LSX_TRANSPOSE8x8_B(p3, p2, p1, p0, q0, q1, q2, q3, p3, p2, p1, p0, q0, q1, q2,
                     q3);

  thresh = __lsx_vldrepl_b(thresh_ptr, 0);
  b_limit = __lsx_vldrepl_b(b_limit_ptr, 0);
  limit = __lsx_vldrepl_b(limit_ptr, 0);

  /* mask and hev */
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
               mask, flat);
  /* flat4 */
  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
  /* filter4 */
  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);

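  /* Only the low 8 bytes of flat correspond to this 8-row edge; clear the high
   * half so stale lanes cannot defeat the whole-register zero test below. */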
  flat = __lsx_vilvl_d(zero, flat);

  /* if flat is zero for all pixels, then no need to calculate other filter */
  if (__lsx_bz_v(flat)) {
    /* Store 4 pixels p1 - q1 */
    DUP2_ARG2(__lsx_vilvl_b, p0_out, p1_out, q1_out, q0_out, p0, p1);
    p2 = __lsx_vilvl_h(p1, p0);
    p3 = __lsx_vilvh_h(p1, p0);

    dst -= 2;
    __lsx_vstelm_w(p2, dst, 0, 0);
    __lsx_vstelm_w(p2, dst + stride, 0, 1);
    __lsx_vstelm_w(p2, dst + stride2, 0, 2);
    __lsx_vstelm_w(p2, dst + stride3, 0, 3);
    dst += stride4;
    __lsx_vstelm_w(p3, dst, 0, 0);
    __lsx_vstelm_w(p3, dst + stride, 0, 1);
    __lsx_vstelm_w(p3, dst + stride2, 0, 2);
    __lsx_vstelm_w(p3, dst + stride3, 0, 3);
  } else {
    DUP4_ARG2(__lsx_vilvl_b, zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l,
              p1_l, p0_l);
    DUP4_ARG2(__lsx_vilvl_b, zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l,
              q2_l, q3_l);
    VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
                p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
    /* convert 16 bit output data into 8 bit */
    DUP4_ARG2(__lsx_vpickev_b, p2_filt8_l, p2_filt8_l, p1_filt8_l, p1_filt8_l,
              p0_filt8_l, p0_filt8_l, q0_filt8_l, q0_filt8_l, p2_filt8_l,
              p1_filt8_l, p0_filt8_l, q0_filt8_l);
    DUP2_ARG2(__lsx_vpickev_b, q1_filt8_l, q1_filt8_l, q2_filt8_l, q2_filt8_l,
              q1_filt8_l, q2_filt8_l);
    /* store pixel values */
    p2 = __lsx_vbitsel_v(p2, p2_filt8_l, flat);
    p1 = __lsx_vbitsel_v(p1_out, p1_filt8_l, flat);
    p0 = __lsx_vbitsel_v(p0_out, p0_filt8_l, flat);
    q0 = __lsx_vbitsel_v(q0_out, q0_filt8_l, flat);
    q1 = __lsx_vbitsel_v(q1_out, q1_filt8_l, flat);
    q2 = __lsx_vbitsel_v(q2, q2_filt8_l, flat);

    /* Store 6 pixels p2 - q2 */
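    /* Rebuild the output rows: interleaving p2/p1 with p0/q0 gives the 4-byte
     * groups stored as words, and interleaving q1 with q2 gives the trailing
     * 2-byte groups stored as halfwords at offset 4. */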
    DUP2_ARG2(__lsx_vilvl_b, p1, p2, q0, p0, p3, q3);
    p1 = __lsx_vilvl_h(q3, p3);
    p2 = __lsx_vilvh_h(q3, p3);
    p3 = __lsx_vilvl_b(q2, q1);
    dst -= 3;
    __lsx_vstelm_w(p1, dst, 0, 0);
    __lsx_vstelm_h(p3, dst, 4, 0);
    dst += stride;
    __lsx_vstelm_w(p1, dst, 0, 1);
    __lsx_vstelm_h(p3, dst, 4, 1);
    dst += stride;
    __lsx_vstelm_w(p1, dst, 0, 2);
    __lsx_vstelm_h(p3, dst, 4, 2);
    dst += stride;
    __lsx_vstelm_w(p1, dst, 0, 3);
    __lsx_vstelm_h(p3, dst, 4, 3);
    dst += stride;
    __lsx_vstelm_w(p2, dst, 0, 0);
    __lsx_vstelm_h(p3, dst, 4, 4);
    dst += stride;
    __lsx_vstelm_w(p2, dst, 0, 1);
    __lsx_vstelm_h(p3, dst, 4, 5);
    dst += stride;
    __lsx_vstelm_w(p2, dst, 0, 2);
    __lsx_vstelm_h(p3, dst, 4, 6);
    dst += stride;
    __lsx_vstelm_w(p2, dst, 0, 3);
    __lsx_vstelm_h(p3, dst, 4, 7);
  }
}

void vpx_lpf_vertical_8_dual_lsx(uint8_t *dst, int32_t stride,
                                 const uint8_t *b_limit0, const uint8_t *limit0,
                                 const uint8_t *thresh0,
                                 const uint8_t *b_limit1, const uint8_t *limit1,
                                 const uint8_t *thresh1) {
  uint8_t *dst_tmp = dst - 4;
  __m128i p3, p2, p1, p0, q3, q2, q1, q0;
  __m128i p1_out, p0_out, q0_out, q1_out;
  __m128i flat, mask, hev, thresh, b_limit, limit;
  __m128i row4, row5, row6, row7, row12, row13, row14, row15;
  __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
  __m128i p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h;
  __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l;
  __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l;
  __m128i p2_filt8_h, p1_filt8_h, p0_filt8_h;
  __m128i q0_filt8_h, q1_filt8_h, q2_filt8_h;
  int32_t stride2 = stride << 1;
  int32_t stride3 = stride2 + stride;
  int32_t stride4 = stride2 << 1;

  p0 = __lsx_vld(dst_tmp, 0);
  DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, p1, p2);
  p3 = __lsx_vldx(dst_tmp, stride3);
  dst_tmp += stride4;
  row4 = __lsx_vld(dst_tmp, 0);
  DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, row5, row6);
  row7 = __lsx_vldx(dst_tmp, stride3);
  dst_tmp += stride4;

  q3 = __lsx_vld(dst_tmp, 0);
  DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, q2, q1);
  q0 = __lsx_vldx(dst_tmp, stride3);
  dst_tmp += stride4;
  row12 = __lsx_vld(dst_tmp, 0);
  DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, row13, row14);
  row15 = __lsx_vldx(dst_tmp, stride3);

  /* transpose 16x8 matrix into 8x16 */
  LSX_TRANSPOSE16x8_B(p0, p1, p2, p3, row4, row5, row6, row7, q3, q2, q1, q0,
                      row12, row13, row14, row15, p3, p2, p1, p0, q0, q1, q2,
                      q3);

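  /* As in the horizontal dual filter, the two edges carry independent
   * thresholds: pack the edge-0 values into the low 64 bits and the edge-1
   * values into the high 64 bits (p1_out is only a scratch register here). */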
  thresh = __lsx_vldrepl_b(thresh0, 0);
  p1_out = __lsx_vldrepl_b(thresh1, 0);
  thresh = __lsx_vilvl_d(p1_out, thresh);

  b_limit = __lsx_vldrepl_b(b_limit0, 0);
  p1_out = __lsx_vldrepl_b(b_limit1, 0);
  b_limit = __lsx_vilvl_d(p1_out, b_limit);

  limit = __lsx_vldrepl_b(limit0, 0);
  p1_out = __lsx_vldrepl_b(limit1, 0);
  limit = __lsx_vilvl_d(p1_out, limit);

  /* mask and hev */
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
               mask, flat);
  /* flat4 */
  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
  /* filter4 */
  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
  /* if flat is zero for all pixels, then no need to calculate other filter */
  if (__lsx_bz_v(flat)) {
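    /* Re-transpose the four filter4 output columns (p1, p0, q0, q1) into
     * 4-byte rows and store one word per row, two pixels to the left of the
     * edge. */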
    DUP2_ARG2(__lsx_vilvl_b, p0_out, p1_out, q1_out, q0_out, p0, p1);
    p2 = __lsx_vilvl_h(p1, p0);
    p3 = __lsx_vilvh_h(p1, p0);
    DUP2_ARG2(__lsx_vilvh_b, p0_out, p1_out, q1_out, q0_out, p0, p1);
    q2 = __lsx_vilvl_h(p1, p0);
    q3 = __lsx_vilvh_h(p1, p0);
    dst -= 2;
    __lsx_vstelm_w(p2, dst, 0, 0);
    __lsx_vstelm_w(p2, dst + stride, 0, 1);
    __lsx_vstelm_w(p2, dst + stride2, 0, 2);
    __lsx_vstelm_w(p2, dst + stride3, 0, 3);
    dst += stride4;
    __lsx_vstelm_w(p3, dst, 0, 0);
    __lsx_vstelm_w(p3, dst + stride, 0, 1);
    __lsx_vstelm_w(p3, dst + stride2, 0, 2);
    __lsx_vstelm_w(p3, dst + stride3, 0, 3);
    dst += stride4;
    __lsx_vstelm_w(q2, dst, 0, 0);
    __lsx_vstelm_w(q2, dst + stride, 0, 1);
    __lsx_vstelm_w(q2, dst + stride2, 0, 2);
    __lsx_vstelm_w(q2, dst + stride3, 0, 3);
    dst += stride4;
    __lsx_vstelm_w(q3, dst, 0, 0);
    __lsx_vstelm_w(q3, dst + stride, 0, 1);
    __lsx_vstelm_w(q3, dst + stride2, 0, 2);
    __lsx_vstelm_w(q3, dst + stride3, 0, 3);
  } else {
    DUP4_ARG2(__lsx_vsllwil_hu_bu, p3, 0, p2, 0, p1, 0, p0, 0, p3_l, p2_l, p1_l,
              p0_l);
    DUP4_ARG2(__lsx_vsllwil_hu_bu, q0, 0, q1, 0, q2, 0, q3, 0, q0_l, q1_l, q2_l,
              q3_l);
    VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
                p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
    DUP4_ARG1(__lsx_vexth_hu_bu, p3, p2, p1, p0, p3_h, p2_h, p1_h, p0_h);
    DUP4_ARG1(__lsx_vexth_hu_bu, q0, q1, q2, q3, q0_h, q1_h, q2_h, q3_h);

    /* filter8 */
    VP9_FILTER8(p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h, p2_filt8_h,
                p1_filt8_h, p0_filt8_h, q0_filt8_h, q1_filt8_h, q2_filt8_h);

    /* convert 16 bit output data into 8 bit */
    DUP4_ARG2(__lsx_vpickev_b, p2_filt8_h, p2_filt8_l, p1_filt8_h, p1_filt8_l,
              p0_filt8_h, p0_filt8_l, q0_filt8_h, q0_filt8_l, p2_filt8_l,
              p1_filt8_l, p0_filt8_l, q0_filt8_l);
    DUP2_ARG2(__lsx_vpickev_b, q1_filt8_h, q1_filt8_l, q2_filt8_h, q2_filt8_l,
              q1_filt8_l, q2_filt8_l);

    /* store pixel values */
    p2 = __lsx_vbitsel_v(p2, p2_filt8_l, flat);
    p1 = __lsx_vbitsel_v(p1_out, p1_filt8_l, flat);
    p0 = __lsx_vbitsel_v(p0_out, p0_filt8_l, flat);
    q0 = __lsx_vbitsel_v(q0_out, q0_filt8_l, flat);
    q1 = __lsx_vbitsel_v(q1_out, q1_filt8_l, flat);
    q2 = __lsx_vbitsel_v(q2, q2_filt8_l, flat);

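    /* Re-transpose the six output columns: interleave p2/p1 with p0/q0 to get
     * the 4-byte groups (stored as words) and q1 with q2 to get the 2-byte
     * groups (stored as halfwords at offset 4), then write the 16 rows. */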
    DUP2_ARG2(__lsx_vilvl_b, p1, p2, q0, p0, p3, q3);
    p2_filt8_l = __lsx_vilvl_h(q3, p3);
    p2_filt8_h = __lsx_vilvh_h(q3, p3);
    DUP2_ARG2(__lsx_vilvh_b, p1, p2, q0, p0, p3, q3);
    p0_filt8_l = __lsx_vilvl_h(q3, p3);
    p0_filt8_h = __lsx_vilvh_h(q3, p3);
    q1_filt8_l = __lsx_vilvl_b(q2, q1);
    q1_filt8_h = __lsx_vilvh_b(q2, q1);

    dst -= 3;
    __lsx_vstelm_w(p2_filt8_l, dst, 0, 0);
    __lsx_vstelm_h(q1_filt8_l, dst, 4, 0);
    dst += stride;
    __lsx_vstelm_w(p2_filt8_l, dst, 0, 1);
    __lsx_vstelm_h(q1_filt8_l, dst, 4, 1);
    dst += stride;
    __lsx_vstelm_w(p2_filt8_l, dst, 0, 2);
    __lsx_vstelm_h(q1_filt8_l, dst, 4, 2);
    dst += stride;
    __lsx_vstelm_w(p2_filt8_l, dst, 0, 3);
    __lsx_vstelm_h(q1_filt8_l, dst, 4, 3);
    dst += stride;
    __lsx_vstelm_w(p2_filt8_h, dst, 0, 0);
    __lsx_vstelm_h(q1_filt8_l, dst, 4, 4);
    dst += stride;
    __lsx_vstelm_w(p2_filt8_h, dst, 0, 1);
    __lsx_vstelm_h(q1_filt8_l, dst, 4, 5);
    dst += stride;
    __lsx_vstelm_w(p2_filt8_h, dst, 0, 2);
    __lsx_vstelm_h(q1_filt8_l, dst, 4, 6);
    dst += stride;
    __lsx_vstelm_w(p2_filt8_h, dst, 0, 3);
    __lsx_vstelm_h(q1_filt8_l, dst, 4, 7);
    dst += stride;
    __lsx_vstelm_w(p0_filt8_l, dst, 0, 0);
    __lsx_vstelm_h(q1_filt8_h, dst, 4, 0);
    dst += stride;
    __lsx_vstelm_w(p0_filt8_l, dst, 0, 1);
    __lsx_vstelm_h(q1_filt8_h, dst, 4, 1);
    dst += stride;
    __lsx_vstelm_w(p0_filt8_l, dst, 0, 2);
    __lsx_vstelm_h(q1_filt8_h, dst, 4, 2);
    dst += stride;
    __lsx_vstelm_w(p0_filt8_l, dst, 0, 3);
    __lsx_vstelm_h(q1_filt8_h, dst, 4, 3);
    dst += stride;
    __lsx_vstelm_w(p0_filt8_h, dst, 0, 0);
    __lsx_vstelm_h(q1_filt8_h, dst, 4, 4);
    dst += stride;
    __lsx_vstelm_w(p0_filt8_h, dst, 0, 1);
    __lsx_vstelm_h(q1_filt8_h, dst, 4, 5);
    dst += stride;
    __lsx_vstelm_w(p0_filt8_h, dst, 0, 2);
    __lsx_vstelm_h(q1_filt8_h, dst, 4, 6);
    dst += stride;
    __lsx_vstelm_w(p0_filt8_h, dst, 0, 3);
    __lsx_vstelm_h(q1_filt8_h, dst, 4, 7);
  }
}
459