/*
 * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/vpx_convolve_msa.h"

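/* 8-tap horizontal filter of a 4x4 block; the filtered result is averaged
   with the existing destination pixels before being stored. */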
static void common_hz_8t_and_aver_dst_4x4_msa(const uint8_t *src,
                                              int32_t src_stride, uint8_t *dst,
                                              int32_t dst_stride,
                                              int8_t *filter) {
  uint32_t tp0, tp1, tp2, tp3;
  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
  v16u8 dst0 = { 0 }, res;
  v16u8 mask0, mask1, mask2, mask3;
  v8i16 filt, res0, res1;

  mask0 = LD_UB(&mc_filt_mask_arr[16]);
  src -= 3;

  /* rearranging filter */
  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  LD_SB4(src, src_stride, src0, src1, src2, src3);
  XORI_B4_128_SB(src0, src1, src2, src3);
  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
                             filt0, filt1, filt2, filt3, res0, res1);
  LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
  SRARI_H2_SH(res0, res1, FILTER_BITS);
  SAT_SH2_SH(res0, res1, 7);
  res = PCKEV_XORI128_UB(res0, res1);
  res = (v16u8)__msa_aver_u_b(res, dst0);
  ST4x4_UB(res, res, 0, 1, 2, 3, dst, dst_stride);
}

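/* 8-tap horizontal filter of a 4x8 block, averaged with the destination. */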
static void common_hz_8t_and_aver_dst_4x8_msa(const uint8_t *src,
                                              int32_t src_stride, uint8_t *dst,
                                              int32_t dst_stride,
                                              int8_t *filter) {
  uint32_t tp0, tp1, tp2, tp3;
  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
  v16u8 mask0, mask1, mask2, mask3, res0, res1, res2, res3;
  v16u8 dst0 = { 0 }, dst1 = { 0 };
  v8i16 filt, vec0, vec1, vec2, vec3;

  mask0 = LD_UB(&mc_filt_mask_arr[16]);
  src -= 3;

  /* rearranging filter */
  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  LD_SB4(src, src_stride, src0, src1, src2, src3);
  XORI_B4_128_SB(src0, src1, src2, src3);
  src += (4 * src_stride);
  LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
  LW4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3);
  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);
  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
                             filt0, filt1, filt2, filt3, vec0, vec1);
  LD_SB4(src, src_stride, src0, src1, src2, src3);
  XORI_B4_128_SB(src0, src1, src2, src3);
  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
                             filt0, filt1, filt2, filt3, vec2, vec3);
  SRARI_H4_SH(vec0, vec1, vec2, vec3, FILTER_BITS);
  SAT_SH4_SH(vec0, vec1, vec2, vec3, 7);
  PCKEV_B4_UB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, res0, res1, res2,
              res3);
  ILVR_D2_UB(res1, res0, res3, res2, res0, res2);
  XORI_B2_128_UB(res0, res2);
  AVER_UB2_UB(res0, dst0, res2, dst1, res0, res2);
  ST4x8_UB(res0, res2, dst, dst_stride);
}

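/* Dispatch the 4-wide 8-tap averaging filter on block height. */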
static void common_hz_8t_and_aver_dst_4w_msa(const uint8_t *src,
                                             int32_t src_stride, uint8_t *dst,
                                             int32_t dst_stride, int8_t *filter,
                                             int32_t height) {
  if (4 == height) {
    common_hz_8t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride, filter);
  } else if (8 == height) {
    common_hz_8t_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride, filter);
  }
}

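/* 8-tap horizontal filter of an 8-wide block, four rows per loop iteration,
   averaged with the destination. */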
static void common_hz_8t_and_aver_dst_8w_msa(const uint8_t *src,
                                             int32_t src_stride, uint8_t *dst,
                                             int32_t dst_stride, int8_t *filter,
                                             int32_t height) {
  int32_t loop_cnt;
  int64_t tp0, tp1, tp2, tp3;
  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
  v16u8 mask0, mask1, mask2, mask3, dst0 = { 0 }, dst1 = { 0 };
  v8i16 filt, out0, out1, out2, out3;

  mask0 = LD_UB(&mc_filt_mask_arr[0]);
  src -= 3;

  /* rearranging filter */
  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (4 * src_stride);
    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1,
                               out2, out3);
    LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, dst0);
    INSERT_D2_UB(tp2, tp3, dst1);
    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst,
                            dst_stride);
    dst += (4 * dst_stride);
  }
}

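/* 8-tap horizontal filter of a 16-wide block, two rows per loop iteration,
   averaged with the destination. */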
static void common_hz_8t_and_aver_dst_16w_msa(const uint8_t *src,
                                              int32_t src_stride, uint8_t *dst,
                                              int32_t dst_stride,
                                              int8_t *filter, int32_t height) {
  int32_t loop_cnt;
  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
  v16u8 mask0, mask1, mask2, mask3, dst0, dst1;
  v8i16 filt, out0, out1, out2, out3;
  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;

  mask0 = LD_UB(&mc_filt_mask_arr[0]);
  src -= 3;

  /* rearranging filter */
  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  for (loop_cnt = height >> 1; loop_cnt--;) {
    LD_SB2(src, src_stride, src0, src2);
    LD_SB2(src + 8, src_stride, src1, src3);
    src += (2 * src_stride);

    XORI_B4_128_SB(src0, src1, src2, src3);
    VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8, vec12);
    VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9, vec13);
    VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, vec10,
               vec14);
    VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, vec11,
               vec15);
    DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
                vec2, vec3);
    DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, vec8,
                vec9, vec10, vec11);
    DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, vec0, vec1,
                 vec2, vec3);
    DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3, vec8,
                 vec9, vec10, vec11);
    ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0, out1,
                out2, out3);
    LD_UB2(dst, dst_stride, dst0, dst1);
    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    PCKEV_XORI128_AVG_ST_UB(out1, out0, dst0, dst);
    dst += dst_stride;
    PCKEV_XORI128_AVG_ST_UB(out3, out2, dst1, dst);
    dst += dst_stride;
  }
}

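/* 8-tap horizontal filter of a 32-wide block, one row per loop iteration,
   averaged with the destination. */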
static void common_hz_8t_and_aver_dst_32w_msa(const uint8_t *src,
                                              int32_t src_stride, uint8_t *dst,
                                              int32_t dst_stride,
                                              int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
  v16u8 dst1, dst2, mask0, mask1, mask2, mask3;
  v8i16 filt, out0, out1, out2, out3;
  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;

  mask0 = LD_UB(&mc_filt_mask_arr[0]);
  src -= 3;

  /* rearranging filter */
  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  for (loop_cnt = height; loop_cnt--;) {
    src0 = LD_SB(src);
    src2 = LD_SB(src + 16);
    src3 = LD_SB(src + 24);
    src1 = __msa_sldi_b(src2, src0, 8);
    src += src_stride;

    XORI_B4_128_SB(src0, src1, src2, src3);
    VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8, vec12);
    VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9, vec13);
    VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, vec10,
               vec14);
    VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, vec11,
               vec15);
    DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
                vec2, vec3);
    DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, vec8,
                vec9, vec10, vec11);
    DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, vec0, vec1,
                 vec2, vec3);
    DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3, vec8,
                 vec9, vec10, vec11);
    ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0, out1,
                out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    LD_UB2(dst, 16, dst1, dst2);
    PCKEV_XORI128_AVG_ST_UB(out1, out0, dst1, dst);
    PCKEV_XORI128_AVG_ST_UB(out3, out2, dst2, dst + 16);
    dst += dst_stride;
  }
}

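/* 8-tap horizontal filter of a 64-wide block, processed as two 32-byte
   halves per row, averaged with the destination. */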
static void common_hz_8t_and_aver_dst_64w_msa(const uint8_t *src,
                                              int32_t src_stride, uint8_t *dst,
                                              int32_t dst_stride,
                                              int8_t *filter, int32_t height) {
  uint32_t loop_cnt, cnt;
  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
  v16u8 dst1, dst2, mask0, mask1, mask2, mask3;
  v8i16 filt, out0, out1, out2, out3;
  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;

  mask0 = LD_UB(&mc_filt_mask_arr[0]);
  src -= 3;

  /* rearranging filter */
  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  for (loop_cnt = height; loop_cnt--;) {
    for (cnt = 0; cnt < 2; ++cnt) {
      src0 = LD_SB(&src[cnt << 5]);
      src2 = LD_SB(&src[16 + (cnt << 5)]);
      src3 = LD_SB(&src[24 + (cnt << 5)]);
      src1 = __msa_sldi_b(src2, src0, 8);

      XORI_B4_128_SB(src0, src1, src2, src3);
      VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8,
                 vec12);
      VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9,
                 vec13);
      VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, vec10,
                 vec14);
      VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, vec11,
                 vec15);
      DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0,
                  vec1, vec2, vec3);
      DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, vec8,
                  vec9, vec10, vec11);
      DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, vec0,
                   vec1, vec2, vec3);
      DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3, vec8,
                   vec9, vec10, vec11);
      ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0, out1,
                  out2, out3);
      SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
      SAT_SH4_SH(out0, out1, out2, out3, 7);
      LD_UB2(&dst[cnt << 5], 16, dst1, dst2);
      PCKEV_XORI128_AVG_ST_UB(out1, out0, dst1, &dst[cnt << 5]);
      PCKEV_XORI128_AVG_ST_UB(out3, out2, dst2, &dst[16 + (cnt << 5)]);
    }

    src += src_stride;
    dst += dst_stride;
  }
}

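/* 2-tap (bilinear) horizontal filter of a 4x4 block, averaged with the
   destination. */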
static void common_hz_2t_and_aver_dst_4x4_msa(const uint8_t *src,
                                              int32_t src_stride, uint8_t *dst,
                                              int32_t dst_stride,
                                              int8_t *filter) {
  uint32_t tp0, tp1, tp2, tp3;
  v16i8 src0, src1, src2, src3, mask;
  v16u8 filt0, dst0 = { 0 }, vec0, vec1, res;
  v8u16 vec2, vec3, filt;

  mask = LD_SB(&mc_filt_mask_arr[16]);

  /* rearranging filter */
  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  LD_SB4(src, src_stride, src0, src1, src2, src3);
  LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
  VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
  DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3);
  SRARI_H2_UH(vec2, vec3, FILTER_BITS);
  res = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
  res = (v16u8)__msa_aver_u_b(res, dst0);
  ST4x4_UB(res, res, 0, 1, 2, 3, dst, dst_stride);
}

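/* 2-tap horizontal filter of a 4x8 block, averaged with the destination. */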
static void common_hz_2t_and_aver_dst_4x8_msa(const uint8_t *src,
                                              int32_t src_stride, uint8_t *dst,
                                              int32_t dst_stride,
                                              int8_t *filter) {
  uint32_t tp0, tp1, tp2, tp3;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
  v16u8 filt0, vec0, vec1, vec2, vec3, res0, res1, res2, res3;
  v16u8 dst0 = { 0 }, dst1 = { 0 };
  v8u16 vec4, vec5, vec6, vec7, filt;

  mask = LD_SB(&mc_filt_mask_arr[16]);

  /* rearranging filter */
  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
  LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
  LW4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3);
  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);
  VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
  VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5,
              vec6, vec7);
  SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS);
  PCKEV_B4_UB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2,
              res3);
  ILVR_D2_UB(res1, res0, res3, res2, res0, res2);
  AVER_UB2_UB(res0, dst0, res2, dst1, res0, res2);
  ST4x8_UB(res0, res2, dst, dst_stride);
}

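/* Dispatch the 4-wide 2-tap averaging filter on block height. */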
static void common_hz_2t_and_aver_dst_4w_msa(const uint8_t *src,
                                             int32_t src_stride, uint8_t *dst,
                                             int32_t dst_stride, int8_t *filter,
                                             int32_t height) {
  if (4 == height) {
    common_hz_2t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride, filter);
  } else if (8 == height) {
    common_hz_2t_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride, filter);
  }
}

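/* 2-tap horizontal filter of an 8x4 block, averaged with the destination. */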
static void common_hz_2t_and_aver_dst_8x4_msa(const uint8_t *src,
                                              int32_t src_stride, uint8_t *dst,
                                              int32_t dst_stride,
                                              int8_t *filter) {
  int64_t tp0, tp1, tp2, tp3;
  v16i8 src0, src1, src2, src3, mask;
  v16u8 filt0, dst0 = { 0 }, dst1 = { 0 };
  v8u16 vec0, vec1, vec2, vec3, filt;

  mask = LD_SB(&mc_filt_mask_arr[0]);

  /* rearranging filter */
  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  LD_SB4(src, src_stride, src0, src1, src2, src3);
  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
              vec2, vec3);
  SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
  LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
  INSERT_D2_UB(tp0, tp1, dst0);
  INSERT_D2_UB(tp2, tp3, dst1);
  PCKEV_AVG_ST8x4_UB(vec0, vec1, vec2, vec3, dst0, dst1, dst, dst_stride);
}

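/* 2-tap horizontal filter of an 8-wide block of 8 or 16 rows, averaged with
   the destination. */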
static void common_hz_2t_and_aver_dst_8x8mult_msa(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int8_t *filter, int32_t height) {
  int64_t tp0, tp1, tp2, tp3;
  v16i8 src0, src1, src2, src3, mask;
  v16u8 filt0, dst0 = { 0 }, dst1 = { 0 };
  v8u16 vec0, vec1, vec2, vec3, filt;

  mask = LD_SB(&mc_filt_mask_arr[0]);

  /* rearranging filter */
  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  LD_SB4(src, src_stride, src0, src1, src2, src3);
  src += (4 * src_stride);
  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
              vec2, vec3);
  SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
  LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
  INSERT_D2_UB(tp0, tp1, dst0);
  INSERT_D2_UB(tp2, tp3, dst1);
  LD_SB4(src, src_stride, src0, src1, src2, src3);
  src += (4 * src_stride);
  PCKEV_AVG_ST8x4_UB(vec0, vec1, vec2, vec3, dst0, dst1, dst, dst_stride);
  dst += (4 * dst_stride);

  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
              vec2, vec3);
  SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
  LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
  INSERT_D2_UB(tp0, tp1, dst0);
  INSERT_D2_UB(tp2, tp3, dst1);
  PCKEV_AVG_ST8x4_UB(vec0, vec1, vec2, vec3, dst0, dst1, dst, dst_stride);
  dst += (4 * dst_stride);

  if (16 == height) {
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);

    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
                vec2, vec3);
    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
    LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, dst0);
    INSERT_D2_UB(tp2, tp3, dst1);
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    PCKEV_AVG_ST8x4_UB(vec0, vec1, vec2, vec3, dst0, dst1, dst, dst_stride);
    dst += (4 * dst_stride);

    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
                vec2, vec3);
    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
    LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, dst0);
    INSERT_D2_UB(tp2, tp3, dst1);
    PCKEV_AVG_ST8x4_UB(vec0, vec1, vec2, vec3, dst0, dst1, dst, dst_stride);
  }
}

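/* Dispatch the 8-wide 2-tap averaging filter on block height. */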
static void common_hz_2t_and_aver_dst_8w_msa(const uint8_t *src,
                                             int32_t src_stride, uint8_t *dst,
                                             int32_t dst_stride, int8_t *filter,
                                             int32_t height) {
  if (4 == height) {
    common_hz_2t_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride, filter);
  } else {
    common_hz_2t_and_aver_dst_8x8mult_msa(src, src_stride, dst, dst_stride,
                                          filter, height);
  }
}

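/* 2-tap horizontal filter of a 16-wide block, four rows at a time, averaged
   with the destination. */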
static void common_hz_2t_and_aver_dst_16w_msa(const uint8_t *src,
                                              int32_t src_stride, uint8_t *dst,
                                              int32_t dst_stride,
                                              int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
  v16u8 filt0, dst0, dst1, dst2, dst3;
  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8u16 res0, res1, res2, res3, res4, res5, res6, res7, filt;

  mask = LD_SB(&mc_filt_mask_arr[0]);

  /* rearranging filter */
  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  LD_SB4(src, src_stride, src0, src2, src4, src6);
  LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
  src += (4 * src_stride);

  VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
  VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
  VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
  VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, res1,
              res2, res3);
  DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4, res5,
              res6, res7);
  SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS);
  SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS);
  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
  PCKEV_AVG_ST_UB(res1, res0, dst0, dst);
  dst += dst_stride;
  PCKEV_AVG_ST_UB(res3, res2, dst1, dst);
  dst += dst_stride;
  PCKEV_AVG_ST_UB(res5, res4, dst2, dst);
  dst += dst_stride;
  PCKEV_AVG_ST_UB(res7, res6, dst3, dst);
  dst += dst_stride;

  for (loop_cnt = (height >> 2) - 1; loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src2, src4, src6);
    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
    src += (4 * src_stride);

    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, res1,
                res2, res3);
    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4, res5,
                res6, res7);
    SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS);
    SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS);
    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
    PCKEV_AVG_ST_UB(res1, res0, dst0, dst);
    dst += dst_stride;
    PCKEV_AVG_ST_UB(res3, res2, dst1, dst);
    dst += dst_stride;
    PCKEV_AVG_ST_UB(res5, res4, dst2, dst);
    dst += dst_stride;
    PCKEV_AVG_ST_UB(res7, res6, dst3, dst);
    dst += dst_stride;
  }
}

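/* 2-tap horizontal filter of a 32-wide block, two rows per loop iteration,
   averaged with the destination. */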
static void common_hz_2t_and_aver_dst_32w_msa(const uint8_t *src,
                                              int32_t src_stride, uint8_t *dst,
                                              int32_t dst_stride,
                                              int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
  v16u8 filt0, dst0, dst1, dst2, dst3;
  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8u16 res0, res1, res2, res3, res4, res5, res6, res7, filt;

  mask = LD_SB(&mc_filt_mask_arr[0]);

  /* rearranging filter */
  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  for (loop_cnt = (height >> 1); loop_cnt--;) {
    src0 = LD_SB(src);
    src2 = LD_SB(src + 16);
    src3 = LD_SB(src + 24);
    src1 = __msa_sldi_b(src2, src0, 8);
    src += src_stride;
    src4 = LD_SB(src);
    src6 = LD_SB(src + 16);
    src7 = LD_SB(src + 24);
    src5 = __msa_sldi_b(src6, src4, 8);
    src += src_stride;

    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, res1,
                res2, res3);
    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4, res5,
                res6, res7);
    SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS);
    SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS);
    LD_UB2(dst, 16, dst0, dst1);
    PCKEV_AVG_ST_UB(res1, res0, dst0, dst);
    PCKEV_AVG_ST_UB(res3, res2, dst1, (dst + 16));
    dst += dst_stride;
    LD_UB2(dst, 16, dst2, dst3);
    PCKEV_AVG_ST_UB(res5, res4, dst2, dst);
    PCKEV_AVG_ST_UB(res7, res6, dst3, (dst + 16));
    dst += dst_stride;
  }
}

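/* 2-tap horizontal filter of a 64-wide block, one row per loop iteration,
   averaged with the destination. */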
static void common_hz_2t_and_aver_dst_64w_msa(const uint8_t *src,
                                              int32_t src_stride, uint8_t *dst,
                                              int32_t dst_stride,
                                              int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
  v16u8 filt0, dst0, dst1, dst2, dst3;
  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;

  mask = LD_SB(&mc_filt_mask_arr[0]);

  /* rearranging filter */
  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  for (loop_cnt = height; loop_cnt--;) {
    LD_SB4(src, 16, src0, src2, src4, src6);
    src7 = LD_SB(src + 56);
    SLDI_B3_SB(src2, src4, src6, src0, src2, src4, src1, src3, src5, 8);
    src += src_stride;

    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
                out2, out3);
    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
                out6, out7);
    SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
    SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
    LD_UB4(dst, 16, dst0, dst1, dst2, dst3);
    PCKEV_AVG_ST_UB(out1, out0, dst0, dst);
    PCKEV_AVG_ST_UB(out3, out2, dst1, dst + 16);
    PCKEV_AVG_ST_UB(out5, out4, dst2, dst + 32);
    PCKEV_AVG_ST_UB(out7, out6, dst3, dst + 48);
    dst += dst_stride;
  }
}

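/* Horizontal convolution with destination averaging. Copies the horizontal
   filter taps, selects the 2-tap or 8-tap MSA path based on the filter, and
   dispatches on block width; unsupported widths fall back to the C version. */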
void vpx_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride,
                                 uint8_t *dst, ptrdiff_t dst_stride,
                                 const InterpKernel *filter, int x0_q4,
                                 int x_step_q4, int y0_q4, int y_step_q4, int w,
                                 int h) {
  const int16_t *const filter_x = filter[x0_q4];
  int8_t cnt, filt_hor[8];

  assert(x_step_q4 == 16);
  assert(((const int32_t *)filter_x)[1] != 0x800000);

  for (cnt = 0; cnt < 8; ++cnt) {
    filt_hor[cnt] = filter_x[cnt];
  }

  if (vpx_get_filter_taps(filter_x) == 2) {
    switch (w) {
      case 4:
        common_hz_2t_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst,
                                         (int32_t)dst_stride, &filt_hor[3], h);
        break;
      case 8:
        common_hz_2t_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst,
                                         (int32_t)dst_stride, &filt_hor[3], h);
        break;
      case 16:
        common_hz_2t_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst,
                                          (int32_t)dst_stride, &filt_hor[3], h);
        break;
      case 32:
        common_hz_2t_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst,
                                          (int32_t)dst_stride, &filt_hor[3], h);
        break;
      case 64:
        common_hz_2t_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst,
                                          (int32_t)dst_stride, &filt_hor[3], h);
        break;
      default:
        vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter,
                                  x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);
        break;
    }
  } else {
    switch (w) {
      case 4:
        common_hz_8t_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst,
                                         (int32_t)dst_stride, filt_hor, h);
        break;
      case 8:
        common_hz_8t_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst,
                                         (int32_t)dst_stride, filt_hor, h);
        break;
      case 16:
        common_hz_8t_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst,
                                          (int32_t)dst_stride, filt_hor, h);
        break;
      case 32:
        common_hz_8t_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst,
                                          (int32_t)dst_stride, filt_hor, h);
        break;
      case 64:
        common_hz_8t_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst,
                                          (int32_t)dst_stride, filt_hor, h);
        break;
      default:
        vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter,
                                  x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);
        break;
    }
  }
}