/*
 *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/loongarch/loopfilter_lsx.h"

void vpx_lpf_horizontal_8_lsx(uint8_t *dst, int32_t stride,
                              const uint8_t *b_limit_ptr,
                              const uint8_t *limit_ptr,
                              const uint8_t *thresh_ptr) {
  __m128i mask, hev, flat, thresh, b_limit, limit;
  __m128i p3, p2, p1, p0, q3, q2, q1, q0;
  __m128i p2_out, p1_out, p0_out, q0_out, q1_out;
  __m128i p2_filter8, p1_filter8, p0_filter8;
  __m128i q0_filter8, q1_filter8, q2_filter8;
  __m128i p3_l, p2_l, p1_l, p0_l, q3_l, q2_l, q1_l, q0_l;

  int32_t stride2 = stride << 1;
  int32_t stride3 = stride2 + stride;
  int32_t stride4 = stride2 << 1;

  /* load vector elements */
  DUP4_ARG2(__lsx_vldx, dst, -stride4, dst, -stride3, dst, -stride2, dst,
            -stride, p3, p2, p1, p0);
  q0 = __lsx_vld(dst, 0);
  DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2);
  q3 = __lsx_vldx(dst, stride3);

  thresh = __lsx_vldrepl_b(thresh_ptr, 0);
  b_limit = __lsx_vldrepl_b(b_limit_ptr, 0);
  limit = __lsx_vldrepl_b(limit_ptr, 0);

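  /* mask and hev */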
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
               mask, flat);
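  /* flat4 */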
  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
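  /* filter4 */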
  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);

  flat = __lsx_vilvl_d(flat, flat);

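  /* if flat is zero for all pixels, then no need to calculate other filter */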
  if (__lsx_bz_v(flat)) {
    __lsx_vstelm_d(p1_out, dst - stride2, 0, 0);
    __lsx_vstelm_d(p0_out, dst - stride, 0, 0);
    __lsx_vstelm_d(q0_out, dst, 0, 0);
    __lsx_vstelm_d(q1_out, dst + stride, 0, 0);
  } else {
    DUP4_ARG2(__lsx_vsllwil_hu_bu, p3, 0, p2, 0, p1, 0, p0, 0, p3_l, p2_l, p1_l,
              p0_l);
    DUP4_ARG2(__lsx_vsllwil_hu_bu, q0, 0, q1, 0, q2, 0, q3, 0, q0_l, q1_l, q2_l,
              q3_l);
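    /* filter8 */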
    VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filter8,
                p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8);

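    /* convert 16 bit output data into 8 bit */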
    DUP2_ARG2(__lsx_vpickev_b, p1_filter8, p2_filter8, q0_filter8, p0_filter8,
              p1_filter8, q0_filter8);
    q2_filter8 = __lsx_vpickev_b(q2_filter8, q1_filter8);

    p2 = __lsx_vilvl_d(p1_out, p2);
    p0_out = __lsx_vilvl_d(q0_out, p0_out);
    q1_out = __lsx_vilvl_d(q2, q1_out);

    DUP2_ARG3(__lsx_vbitsel_v, p2, p1_filter8, flat, p0_out, q0_filter8, flat,
              p2_out, p1_out);
    p0_out = __lsx_vbitsel_v(q1_out, q2_filter8, flat);
    dst -= stride3;

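    /* store pixel values */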
    __lsx_vstelm_d(p2_out, dst, 0, 0);
    __lsx_vstelm_d(p2_out, dst + stride, 0, 1);
    __lsx_vstelm_d(p1_out, dst + stride2, 0, 0);
    __lsx_vstelm_d(p1_out, dst + stride3, 0, 1);

    dst += stride4;
    __lsx_vstelm_d(p0_out, dst, 0, 0);
    dst += stride;
    __lsx_vstelm_d(p0_out, dst, 0, 1);
  }
}

void vpx_lpf_horizontal_8_dual_lsx(
    uint8_t *dst, int32_t stride, const uint8_t *b_limit0,
    const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *b_limit1,
    const uint8_t *limit1, const uint8_t *thresh1) {
  __m128i p3, p2, p1, p0, q3, q2, q1, q0;
  __m128i p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
  __m128i flat, mask, hev, thresh, b_limit, limit;
  __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
  __m128i p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h;
  __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l;
  __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l;
  __m128i p2_filt8_h, p1_filt8_h, p0_filt8_h;
  __m128i q0_filt8_h, q1_filt8_h, q2_filt8_h;

  int32_t stride2 = stride << 1;
  int32_t stride3 = stride2 + stride;
  int32_t stride4 = stride2 << 1;

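  /* load vector elements */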
  DUP4_ARG2(__lsx_vldx, dst, -stride4, dst, -stride3, dst, -stride2, dst,
            -stride, p3, p2, p1, p0);
  q0 = __lsx_vld(dst, 0);
  DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2);
  q3 = __lsx_vldx(dst, stride3);

  thresh = __lsx_vldrepl_b(thresh0, 0);
  p2_out = __lsx_vldrepl_b(thresh1, 0);
  thresh = __lsx_vilvl_d(p2_out, thresh);

  b_limit = __lsx_vldrepl_b(b_limit0, 0);
  p2_out = __lsx_vldrepl_b(b_limit1, 0);
  b_limit = __lsx_vilvl_d(p2_out, b_limit);

  limit = __lsx_vldrepl_b(limit0, 0);
  p2_out = __lsx_vldrepl_b(limit1, 0);
  limit = __lsx_vilvl_d(p2_out, limit);

  /* mask and hev */
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
               mask, flat);
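  /* flat4 */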
  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
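  /* filter4 */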
  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);

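  /* if flat is zero for all pixels, then no need to calculate other filter */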
  if (__lsx_bz_v(flat)) {
    __lsx_vst(p1_out, dst - stride2, 0);
    __lsx_vst(p0_out, dst - stride, 0);
    __lsx_vst(q0_out, dst, 0);
    __lsx_vst(q1_out, dst + stride, 0);
  } else {
    DUP4_ARG2(__lsx_vsllwil_hu_bu, p3, 0, p2, 0, p1, 0, p0, 0, p3_l, p2_l, p1_l,
              p0_l);
    DUP4_ARG2(__lsx_vsllwil_hu_bu, q0, 0, q1, 0, q2, 0, q3, 0, q0_l, q1_l, q2_l,
              q3_l);
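    /* filter8 */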
    VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
                p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);

    DUP4_ARG1(__lsx_vexth_hu_bu, p3, p2, p1, p0, p3_h, p2_h, p1_h, p0_h);
    DUP4_ARG1(__lsx_vexth_hu_bu, q0, q1, q2, q3, q0_h, q1_h, q2_h, q3_h);
    VP9_FILTER8(p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h, p2_filt8_h,
                p1_filt8_h, p0_filt8_h, q0_filt8_h, q1_filt8_h, q2_filt8_h);

    /* convert 16 bit output data into 8 bit */
    DUP4_ARG2(__lsx_vpickev_b, p2_filt8_h, p2_filt8_l, p1_filt8_h, p1_filt8_l,
              p0_filt8_h, p0_filt8_l, q0_filt8_h, q0_filt8_l, p2_filt8_l,
              p1_filt8_l, p0_filt8_l, q0_filt8_l);
    DUP2_ARG2(__lsx_vpickev_b, q1_filt8_h, q1_filt8_l, q2_filt8_h, q2_filt8_l,
              q1_filt8_l, q2_filt8_l);

    /* store pixel values */
    p2_out = __lsx_vbitsel_v(p2, p2_filt8_l, flat);
    p1_out = __lsx_vbitsel_v(p1_out, p1_filt8_l, flat);
    p0_out = __lsx_vbitsel_v(p0_out, p0_filt8_l, flat);
    q0_out = __lsx_vbitsel_v(q0_out, q0_filt8_l, flat);
    q1_out = __lsx_vbitsel_v(q1_out, q1_filt8_l, flat);
    q2_out = __lsx_vbitsel_v(q2, q2_filt8_l, flat);

    __lsx_vst(p2_out, dst - stride3, 0);
    __lsx_vst(p1_out, dst - stride2, 0);
    __lsx_vst(p0_out, dst - stride, 0);
    __lsx_vst(q0_out, dst, 0);
    __lsx_vst(q1_out, dst + stride, 0);
    __lsx_vst(q2_out, dst + stride2, 0);
  }
}

void vpx_lpf_vertical_8_lsx(uint8_t *dst, int32_t stride,
                            const uint8_t *b_limit_ptr,
                            const uint8_t *limit_ptr,
                            const uint8_t *thresh_ptr) {
  __m128i p3, p2, p1, p0, q3, q2, q1, q0;
  __m128i p1_out, p0_out, q0_out, q1_out;
  __m128i flat, mask, hev, thresh, b_limit, limit;
  __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
  __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l;
  __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l;
  __m128i zero = __lsx_vldi(0);

  int32_t stride2 = stride << 1;
  int32_t stride3 = stride2 + stride;
  int32_t stride4 = stride2 << 1;
  uint8_t *dst_tmp = dst - 4;

  /* load vector elements */
  p3 = __lsx_vld(dst_tmp, 0);
  DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, p2, p1);
  p0 = __lsx_vldx(dst_tmp, stride3);
  dst_tmp += stride4;
  q0 = __lsx_vld(dst_tmp, 0);
  DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, q1, q2);
  q3 = __lsx_vldx(dst_tmp, stride3);

  LSX_TRANSPOSE8x8_B(p3, p2, p1, p0, q0, q1, q2, q3, p3, p2, p1, p0, q0, q1, q2,
                     q3);

  thresh = __lsx_vldrepl_b(thresh_ptr, 0);
  b_limit = __lsx_vldrepl_b(b_limit_ptr, 0);
  limit = __lsx_vldrepl_b(limit_ptr, 0);

  /* mask and hev */
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
               mask, flat);
  /* flat4 */
  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
  /* filter4 */
  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);

  flat = __lsx_vilvl_d(zero, flat);

  /* if flat is zero for all pixels, then no need to calculate other filter */
  if (__lsx_bz_v(flat)) {
    /* Store 4 pixels p1 - q1 */
    DUP2_ARG2(__lsx_vilvl_b, p0_out, p1_out, q1_out, q0_out, p0, p1);
    p2 = __lsx_vilvl_h(p1, p0);
    p3 = __lsx_vilvh_h(p1, p0);

    dst -= 2;
    __lsx_vstelm_w(p2, dst, 0, 0);
    __lsx_vstelm_w(p2, dst + stride, 0, 1);
    __lsx_vstelm_w(p2, dst + stride2, 0, 2);
    __lsx_vstelm_w(p2, dst + stride3, 0, 3);
    dst += stride4;
    __lsx_vstelm_w(p3, dst, 0, 0);
    __lsx_vstelm_w(p3, dst + stride, 0, 1);
    __lsx_vstelm_w(p3, dst + stride2, 0, 2);
    __lsx_vstelm_w(p3, dst + stride3, 0, 3);
  } else {
    DUP4_ARG2(__lsx_vilvl_b, zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l,
              p1_l, p0_l);
    DUP4_ARG2(__lsx_vilvl_b, zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l,
              q2_l, q3_l);
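    /* filter8 */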
    VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
                p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
    /* convert 16 bit output data into 8 bit */
    DUP4_ARG2(__lsx_vpickev_b, p2_filt8_l, p2_filt8_l, p1_filt8_l, p1_filt8_l,
              p0_filt8_l, p0_filt8_l, q0_filt8_l, q0_filt8_l, p2_filt8_l,
              p1_filt8_l, p0_filt8_l, q0_filt8_l);
    DUP2_ARG2(__lsx_vpickev_b, q1_filt8_l, q1_filt8_l, q2_filt8_l, q2_filt8_l,
              q1_filt8_l, q2_filt8_l);
    /* store pixel values */
    p2 = __lsx_vbitsel_v(p2, p2_filt8_l, flat);
    p1 = __lsx_vbitsel_v(p1_out, p1_filt8_l, flat);
    p0 = __lsx_vbitsel_v(p0_out, p0_filt8_l, flat);
    q0 = __lsx_vbitsel_v(q0_out, q0_filt8_l, flat);
    q1 = __lsx_vbitsel_v(q1_out, q1_filt8_l, flat);
    q2 = __lsx_vbitsel_v(q2, q2_filt8_l, flat);

    /* Store 6 pixels p2 - q2 */
    DUP2_ARG2(__lsx_vilvl_b, p1, p2, q0, p0, p3, q3);
    p1 = __lsx_vilvl_h(q3, p3);
    p2 = __lsx_vilvh_h(q3, p3);
    p3 = __lsx_vilvl_b(q2, q1);
    dst -= 3;
    __lsx_vstelm_w(p1, dst, 0, 0);
    __lsx_vstelm_h(p3, dst, 4, 0);
    dst += stride;
    __lsx_vstelm_w(p1, dst, 0, 1);
    __lsx_vstelm_h(p3, dst, 4, 1);
    dst += stride;
    __lsx_vstelm_w(p1, dst, 0, 2);
    __lsx_vstelm_h(p3, dst, 4, 2);
    dst += stride;
    __lsx_vstelm_w(p1, dst, 0, 3);
    __lsx_vstelm_h(p3, dst, 4, 3);
    dst += stride;
    __lsx_vstelm_w(p2, dst, 0, 0);
    __lsx_vstelm_h(p3, dst, 4, 4);
    dst += stride;
    __lsx_vstelm_w(p2, dst, 0, 1);
    __lsx_vstelm_h(p3, dst, 4, 5);
    dst += stride;
    __lsx_vstelm_w(p2, dst, 0, 2);
    __lsx_vstelm_h(p3, dst, 4, 6);
    dst += stride;
    __lsx_vstelm_w(p2, dst, 0, 3);
    __lsx_vstelm_h(p3, dst, 4, 7);
  }
}

void vpx_lpf_vertical_8_dual_lsx(uint8_t *dst, int32_t stride,
                                 const uint8_t *b_limit0, const uint8_t *limit0,
                                 const uint8_t *thresh0,
                                 const uint8_t *b_limit1, const uint8_t *limit1,
                                 const uint8_t *thresh1) {
  uint8_t *dst_tmp = dst - 4;
  __m128i p3, p2, p1, p0, q3, q2, q1, q0;
  __m128i p1_out, p0_out, q0_out, q1_out;
  __m128i flat, mask, hev, thresh, b_limit, limit;
  __m128i row4, row5, row6, row7, row12, row13, row14, row15;
  __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
  __m128i p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h;
  __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l;
  __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l;
  __m128i p2_filt8_h, p1_filt8_h, p0_filt8_h;
  __m128i q0_filt8_h, q1_filt8_h, q2_filt8_h;
  int32_t stride2 = stride << 1;
  int32_t stride3 = stride2 + stride;
  int32_t stride4 = stride2 << 1;

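  /* load vector elements */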
  p0 = __lsx_vld(dst_tmp, 0);
  DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, p1, p2);
  p3 = __lsx_vldx(dst_tmp, stride3);
  dst_tmp += stride4;
  row4 = __lsx_vld(dst_tmp, 0);
  DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, row5, row6);
  row7 = __lsx_vldx(dst_tmp, stride3);
  dst_tmp += stride4;

  q3 = __lsx_vld(dst_tmp, 0);
  DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, q2, q1);
  q0 = __lsx_vldx(dst_tmp, stride3);
  dst_tmp += stride4;
  row12 = __lsx_vld(dst_tmp, 0);
  DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, row13, row14);
  row15 = __lsx_vldx(dst_tmp, stride3);

  /* transpose 16x8 matrix into 8x16 */
  LSX_TRANSPOSE16x8_B(p0, p1, p2, p3, row4, row5, row6, row7, q3, q2, q1, q0,
                      row12, row13, row14, row15, p3, p2, p1, p0, q0, q1, q2,
                      q3);

  thresh = __lsx_vldrepl_b(thresh0, 0);
  p1_out = __lsx_vldrepl_b(thresh1, 0);
  thresh = __lsx_vilvl_d(p1_out, thresh);

  b_limit = __lsx_vldrepl_b(b_limit0, 0);
  p1_out = __lsx_vldrepl_b(b_limit1, 0);
  b_limit = __lsx_vilvl_d(p1_out, b_limit);

  limit = __lsx_vldrepl_b(limit0, 0);
  p1_out = __lsx_vldrepl_b(limit1, 0);
  limit = __lsx_vilvl_d(p1_out, limit);

  /* mask and hev */
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
               mask, flat);
  /* flat4 */
  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
  /* filter4 */
  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
  /* if flat is zero for all pixels, then no need to calculate other filter */
  if (__lsx_bz_v(flat)) {
    DUP2_ARG2(__lsx_vilvl_b, p0_out, p1_out, q1_out, q0_out, p0, p1);
    p2 = __lsx_vilvl_h(p1, p0);
    p3 = __lsx_vilvh_h(p1, p0);
    DUP2_ARG2(__lsx_vilvh_b, p0_out, p1_out, q1_out, q0_out, p0, p1);
    q2 = __lsx_vilvl_h(p1, p0);
    q3 = __lsx_vilvh_h(p1, p0);
    dst -= 2;
    __lsx_vstelm_w(p2, dst, 0, 0);
    __lsx_vstelm_w(p2, dst + stride, 0, 1);
    __lsx_vstelm_w(p2, dst + stride2, 0, 2);
    __lsx_vstelm_w(p2, dst + stride3, 0, 3);
    dst += stride4;
    __lsx_vstelm_w(p3, dst, 0, 0);
    __lsx_vstelm_w(p3, dst + stride, 0, 1);
    __lsx_vstelm_w(p3, dst + stride2, 0, 2);
    __lsx_vstelm_w(p3, dst + stride3, 0, 3);
    dst += stride4;
    __lsx_vstelm_w(q2, dst, 0, 0);
    __lsx_vstelm_w(q2, dst + stride, 0, 1);
    __lsx_vstelm_w(q2, dst + stride2, 0, 2);
    __lsx_vstelm_w(q2, dst + stride3, 0, 3);
    dst += stride4;
    __lsx_vstelm_w(q3, dst, 0, 0);
    __lsx_vstelm_w(q3, dst + stride, 0, 1);
    __lsx_vstelm_w(q3, dst + stride2, 0, 2);
    __lsx_vstelm_w(q3, dst + stride3, 0, 3);
  } else {
    DUP4_ARG2(__lsx_vsllwil_hu_bu, p3, 0, p2, 0, p1, 0, p0, 0, p3_l, p2_l, p1_l,
              p0_l);
    DUP4_ARG2(__lsx_vsllwil_hu_bu, q0, 0, q1, 0, q2, 0, q3, 0, q0_l, q1_l, q2_l,
              q3_l);
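    /* filter8 */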
    VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
                p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
    DUP4_ARG1(__lsx_vexth_hu_bu, p3, p2, p1, p0, p3_h, p2_h, p1_h, p0_h);
    DUP4_ARG1(__lsx_vexth_hu_bu, q0, q1, q2, q3, q0_h, q1_h, q2_h, q3_h);

    /* filter8 */
    VP9_FILTER8(p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h, p2_filt8_h,
                p1_filt8_h, p0_filt8_h, q0_filt8_h, q1_filt8_h, q2_filt8_h);

    /* convert 16 bit output data into 8 bit */
    DUP4_ARG2(__lsx_vpickev_b, p2_filt8_h, p2_filt8_l, p1_filt8_h, p1_filt8_l,
              p0_filt8_h, p0_filt8_l, q0_filt8_h, q0_filt8_l, p2_filt8_l,
              p1_filt8_l, p0_filt8_l, q0_filt8_l);
    DUP2_ARG2(__lsx_vpickev_b, q1_filt8_h, q1_filt8_l, q2_filt8_h, q2_filt8_l,
              q1_filt8_l, q2_filt8_l);

    /* store pixel values */
    p2 = __lsx_vbitsel_v(p2, p2_filt8_l, flat);
    p1 = __lsx_vbitsel_v(p1_out, p1_filt8_l, flat);
    p0 = __lsx_vbitsel_v(p0_out, p0_filt8_l, flat);
    q0 = __lsx_vbitsel_v(q0_out, q0_filt8_l, flat);
    q1 = __lsx_vbitsel_v(q1_out, q1_filt8_l, flat);
    q2 = __lsx_vbitsel_v(q2, q2_filt8_l, flat);

    DUP2_ARG2(__lsx_vilvl_b, p1, p2, q0, p0, p3, q3);
    p2_filt8_l = __lsx_vilvl_h(q3, p3);
    p2_filt8_h = __lsx_vilvh_h(q3, p3);
    DUP2_ARG2(__lsx_vilvh_b, p1, p2, q0, p0, p3, q3);
    p0_filt8_l = __lsx_vilvl_h(q3, p3);
    p0_filt8_h = __lsx_vilvh_h(q3, p3);
    q1_filt8_l = __lsx_vilvl_b(q2, q1);
    q1_filt8_h = __lsx_vilvh_b(q2, q1);

    dst -= 3;
    __lsx_vstelm_w(p2_filt8_l, dst, 0, 0);
    __lsx_vstelm_h(q1_filt8_l, dst, 4, 0);
    dst += stride;
    __lsx_vstelm_w(p2_filt8_l, dst, 0, 1);
    __lsx_vstelm_h(q1_filt8_l, dst, 4, 1);
    dst += stride;
    __lsx_vstelm_w(p2_filt8_l, dst, 0, 2);
    __lsx_vstelm_h(q1_filt8_l, dst, 4, 2);
    dst += stride;
    __lsx_vstelm_w(p2_filt8_l, dst, 0, 3);
    __lsx_vstelm_h(q1_filt8_l, dst, 4, 3);
    dst += stride;
    __lsx_vstelm_w(p2_filt8_h, dst, 0, 0);
    __lsx_vstelm_h(q1_filt8_l, dst, 4, 4);
    dst += stride;
    __lsx_vstelm_w(p2_filt8_h, dst, 0, 1);
    __lsx_vstelm_h(q1_filt8_l, dst, 4, 5);
    dst += stride;
    __lsx_vstelm_w(p2_filt8_h, dst, 0, 2);
    __lsx_vstelm_h(q1_filt8_l, dst, 4, 6);
    dst += stride;
    __lsx_vstelm_w(p2_filt8_h, dst, 0, 3);
    __lsx_vstelm_h(q1_filt8_l, dst, 4, 7);
    dst += stride;
    __lsx_vstelm_w(p0_filt8_l, dst, 0, 0);
    __lsx_vstelm_h(q1_filt8_h, dst, 4, 0);
    dst += stride;
    __lsx_vstelm_w(p0_filt8_l, dst, 0, 1);
    __lsx_vstelm_h(q1_filt8_h, dst, 4, 1);
    dst += stride;
    __lsx_vstelm_w(p0_filt8_l, dst, 0, 2);
    __lsx_vstelm_h(q1_filt8_h, dst, 4, 2);
    dst += stride;
    __lsx_vstelm_w(p0_filt8_l, dst, 0, 3);
    __lsx_vstelm_h(q1_filt8_h, dst, 4, 3);
    dst += stride;
    __lsx_vstelm_w(p0_filt8_h, dst, 0, 0);
    __lsx_vstelm_h(q1_filt8_h, dst, 4, 4);
    dst += stride;
    __lsx_vstelm_w(p0_filt8_h, dst, 0, 1);
    __lsx_vstelm_h(q1_filt8_h, dst, 4, 5);
    dst += stride;
    __lsx_vstelm_w(p0_filt8_h, dst, 0, 2);
    __lsx_vstelm_h(q1_filt8_h, dst, 4, 6);
    dst += stride;
    __lsx_vstelm_w(p0_filt8_h, dst, 0, 3);
    __lsx_vstelm_h(q1_filt8_h, dst, 4, 7);
  }
}
459