/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/loopfilter_msa.h"

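/* 8-pixel VP9 loop filter applied across a horizontal block edge. Rows
 * p3..q3 around the edge are loaded, the filter/hev/flat masks are built,
 * and each pixel receives either the filter8 result or the filter4
 * fallback, selected per pixel by the flat mask. */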
void vpx_lpf_horizontal_8_msa(uint8_t *src, int32_t pitch,
                              const uint8_t *b_limit_ptr,
                              const uint8_t *limit_ptr,
                              const uint8_t *thresh_ptr) {
  uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
  v16u8 mask, hev, flat, thresh, b_limit, limit;
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
  v8i16 p2_filter8, p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8;
  v8u16 p3_r, p2_r, p1_r, p0_r, q3_r, q2_r, q1_r, q0_r;
  v16i8 zero = { 0 };

  /* load vector elements */
  LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);

  thresh = (v16u8)__msa_fill_b(*thresh_ptr);
  b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
  limit = (v16u8)__msa_fill_b(*limit_ptr);

  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
               mask, flat);
  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);

  flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat);

  if (__msa_test_bz_v(flat)) {
    p1_d = __msa_copy_u_d((v2i64)p1_out, 0);
    p0_d = __msa_copy_u_d((v2i64)p0_out, 0);
    q0_d = __msa_copy_u_d((v2i64)q0_out, 0);
    q1_d = __msa_copy_u_d((v2i64)q1_out, 0);
    SD4(p1_d, p0_d, q0_d, q1_d, (src - 2 * pitch), pitch);
  } else {
    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
               q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
    VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filter8,
                p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8);

    /* convert 16 bit output data into 8 bit */
    PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8, zero,
                q0_filter8, p2_filter8, p1_filter8, p0_filter8, q0_filter8);
    PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8, q2_filter8);

    /* store pixel values */
    p2_out = __msa_bmnz_v(p2, (v16u8)p2_filter8, flat);
    p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filter8, flat);
    p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filter8, flat);
    q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filter8, flat);
    q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filter8, flat);
    q2_out = __msa_bmnz_v(q2, (v16u8)q2_filter8, flat);

    p2_d = __msa_copy_u_d((v2i64)p2_out, 0);
    p1_d = __msa_copy_u_d((v2i64)p1_out, 0);
    p0_d = __msa_copy_u_d((v2i64)p0_out, 0);
    q0_d = __msa_copy_u_d((v2i64)q0_out, 0);
    q1_d = __msa_copy_u_d((v2i64)q1_out, 0);
    q2_d = __msa_copy_u_d((v2i64)q2_out, 0);

    src -= 3 * pitch;

    SD4(p2_d, p1_d, p0_d, q0_d, src, pitch);
    src += (4 * pitch);
    SD(q1_d, src);
    src += pitch;
    SD(q2_d, src);
  }
}

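/* Horizontal 8-wide loop filter for two adjacent 8-pixel segments (16
 * pixels) in one pass; b_limit0/limit0/thresh0 control the first segment
 * (lower 8 lanes) and b_limit1/limit1/thresh1 the second (upper 8 lanes). */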
void vpx_lpf_horizontal_8_dual_msa(
    uint8_t *src, int32_t pitch, const uint8_t *b_limit0, const uint8_t *limit0,
    const uint8_t *thresh0, const uint8_t *b_limit1, const uint8_t *limit1,
    const uint8_t *thresh1) {
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
  v16u8 flat, mask, hev, tmp, thresh, b_limit, limit;
  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
  v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
  v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l;
  v16u8 zero = { 0 };

  /* load vector elements */
  LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);

  thresh = (v16u8)__msa_fill_b(*thresh0);
  tmp = (v16u8)__msa_fill_b(*thresh1);
  thresh = (v16u8)__msa_ilvr_d((v2i64)tmp, (v2i64)thresh);

  b_limit = (v16u8)__msa_fill_b(*b_limit0);
  tmp = (v16u8)__msa_fill_b(*b_limit1);
  b_limit = (v16u8)__msa_ilvr_d((v2i64)tmp, (v2i64)b_limit);

  limit = (v16u8)__msa_fill_b(*limit0);
  tmp = (v16u8)__msa_fill_b(*limit1);
  limit = (v16u8)__msa_ilvr_d((v2i64)tmp, (v2i64)limit);

  /* mask and hev */
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
               mask, flat);
  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);

  if (__msa_test_bz_v(flat)) {
    ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);
  } else {
    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
               q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
    VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
                p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);

    ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l);
    ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l);
    VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
                p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);

    /* convert 16 bit output data into 8 bit */
    PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
                p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
                p0_filt8_r, q0_filt8_r);
    PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
                q2_filt8_r);

    /* store pixel values */
    p2_out = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat);
    p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat);
    p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat);
    q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat);
    q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat);
    q2_out = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat);

    src -= 3 * pitch;

    ST_UB4(p2_out, p1_out, p0_out, q0_out, src, pitch);
    src += (4 * pitch);
    ST_UB2(q1_out, q2_out, src, pitch);
    src += (2 * pitch);
  }
}

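/* 8-pixel VP9 loop filter applied across a vertical block edge. The 8x8
 * block straddling the edge is transposed so each vector holds one pixel
 * position across the edge, filtered as in the horizontal case, and the
 * results are written back column-wise with the 4x4/2x4 store macros. */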
void vpx_lpf_vertical_8_msa(uint8_t *src, int32_t pitch,
                            const uint8_t *b_limit_ptr,
                            const uint8_t *limit_ptr,
                            const uint8_t *thresh_ptr) {
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 p1_out, p0_out, q0_out, q1_out;
  v16u8 flat, mask, hev, thresh, b_limit, limit;
  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
  v16u8 zero = { 0 };
  v8i16 vec0, vec1, vec2, vec3, vec4;

  /* load vector elements */
  LD_UB8(src - 4, pitch, p3, p2, p1, p0, q0, q1, q2, q3);

  TRANSPOSE8x8_UB_UB(p3, p2, p1, p0, q0, q1, q2, q3, p3, p2, p1, p0, q0, q1, q2,
                     q3);

  thresh = (v16u8)__msa_fill_b(*thresh_ptr);
  b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
  limit = (v16u8)__msa_fill_b(*limit_ptr);

  /* mask and hev */
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
               mask, flat);
  /* flat4 */
  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
  /* filter4 */
  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);

  flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat);

  if (__msa_test_bz_v(flat)) {
    /* Store 4 pixels p1 - q1 */
    ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec2, vec3);

    src -= 2;
    ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch);
    src += 4 * pitch;
    ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
  } else {
    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
               q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
    VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
                p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
    /* convert 16 bit output data into 8 bit */
    PCKEV_B4_SH(p2_filt8_r, p2_filt8_r, p1_filt8_r, p1_filt8_r, p0_filt8_r,
                p0_filt8_r, q0_filt8_r, q0_filt8_r, p2_filt8_r, p1_filt8_r,
                p0_filt8_r, q0_filt8_r);
    PCKEV_B2_SH(q1_filt8_r, q1_filt8_r, q2_filt8_r, q2_filt8_r, q1_filt8_r,
                q2_filt8_r);

    /* store pixel values */
    p2 = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat);
    p1 = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat);
    p0 = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat);
    q0 = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat);
    q1 = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat);
    q2 = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat);

    /* Store 6 pixels p2 - q2 */
    ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec2, vec3);
    vec4 = (v8i16)__msa_ilvr_b((v16i8)q2, (v16i8)q1);

    src -= 3;
    ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch);
    ST2x4_UB(vec4, 0, src + 4, pitch);
    src += (4 * pitch);
    ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
    ST2x4_UB(vec4, 4, src + 4, pitch);
  }
}

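/* Vertical 8-wide loop filter for two vertically adjacent 8-row segments:
 * a 16x8 tile is transposed to 8x16, filtered with b_limit0/limit0/thresh0
 * applied to the first eight rows (lower lanes) and b_limit1/limit1/thresh1
 * to the second eight (upper lanes), then stored back transposed. */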
void vpx_lpf_vertical_8_dual_msa(uint8_t *src, int32_t pitch,
                                 const uint8_t *b_limit0, const uint8_t *limit0,
                                 const uint8_t *thresh0,
                                 const uint8_t *b_limit1, const uint8_t *limit1,
                                 const uint8_t *thresh1) {
  uint8_t *temp_src;
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 p1_out, p0_out, q0_out, q1_out;
  v16u8 flat, mask, hev, thresh, b_limit, limit;
  v16u8 row4, row5, row6, row7, row12, row13, row14, row15;
  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
  v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
  v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l;
  v16u8 zero = { 0 };
  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;

  temp_src = src - 4;

  LD_UB8(temp_src, pitch, p0, p1, p2, p3, row4, row5, row6, row7);
  temp_src += (8 * pitch);
  LD_UB8(temp_src, pitch, q3, q2, q1, q0, row12, row13, row14, row15);

  /* transpose 16x8 matrix into 8x16 */
  TRANSPOSE16x8_UB_UB(p0, p1, p2, p3, row4, row5, row6, row7, q3, q2, q1, q0,
                      row12, row13, row14, row15, p3, p2, p1, p0, q0, q1, q2,
                      q3);

  thresh = (v16u8)__msa_fill_b(*thresh0);
  vec0 = (v8i16)__msa_fill_b(*thresh1);
  thresh = (v16u8)__msa_ilvr_d((v2i64)vec0, (v2i64)thresh);

  b_limit = (v16u8)__msa_fill_b(*b_limit0);
  vec0 = (v8i16)__msa_fill_b(*b_limit1);
  b_limit = (v16u8)__msa_ilvr_d((v2i64)vec0, (v2i64)b_limit);

  limit = (v16u8)__msa_fill_b(*limit0);
  vec0 = (v8i16)__msa_fill_b(*limit1);
  limit = (v16u8)__msa_ilvr_d((v2i64)vec0, (v2i64)limit);

  /* mask and hev */
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
               mask, flat);
  /* flat4 */
  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
  /* filter4 */
  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);

  if (__msa_test_bz_v(flat)) {
    ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec2, vec3);
    ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec4, vec5);

    src -= 2;
    ST4x8_UB(vec2, vec3, src, pitch);
    src += 8 * pitch;
    ST4x8_UB(vec4, vec5, src, pitch);
  } else {
    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
               q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
    VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
                p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);

    ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l);
    ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l);

    /* filter8 */
    VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
                p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);

    /* convert 16 bit output data into 8 bit */
    PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
                p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
                p0_filt8_r, q0_filt8_r);
    PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
                q2_filt8_r);

    /* store pixel values */
    p2 = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat);
    p1 = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat);
    p0 = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat);
    q0 = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat);
    q1 = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat);
    q2 = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat);

    ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec3, vec4);
    ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec6, vec7);
    ILVRL_B2_SH(q2, q1, vec2, vec5);

    src -= 3;
    ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
    ST2x4_UB(vec2, 0, src + 4, pitch);
    src += (4 * pitch);
    ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src, pitch);
    ST2x4_UB(vec2, 4, src + 4, pitch);
    src += (4 * pitch);
    ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src, pitch);
    ST2x4_UB(vec5, 0, src + 4, pitch);
    src += (4 * pitch);
    ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src, pitch);
    ST2x4_UB(vec5, 4, src + 4, pitch);
  }
}