1 /*
2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include "./vp8_rtcd.h"
12 #include "vpx_ports/mem.h"
13 #include "vp8/common/filter.h"
14 #include "vp8/common/mips/msa/vp8_macros_msa.h"
15
/*
 * Signed 8-bit six-tap filter coefficients, declared with 16-byte alignment.
 * Every row holds six taps followed by two zero bytes, giving a row width of
 * eight.
 * NOTE(review): presumably row i serves sub-pixel offset i + 1, with no row
 * stored for the whole-pixel case -- confirm against the code indexing it.
 */
DECLARE_ALIGNED(16, static const int8_t, vp8_subpel_filters_msa[7][8]) = {
  { 0, -6, 123, 12, -1, 0, 0, 0 },
  { 2, -11, 108, 36, -8, 1, 0, 0 }, /* New 1/4 pel 6 tap filter */
  { 0, -9, 93, 50, -6, 0, 0, 0 },
  { 3, -16, 77, 77, -16, 3, 0, 0 }, /* New 1/2 pel 6 tap filter */
  { 0, -6, 50, 93, -9, 0, 0, 0 },
  { 1, -8, 36, 108, -11, 2, 0, 0 }, /* New 1/4 pel 6 tap filter */
  { 0, -1, 12, 123, -6, 0, 0, 0 },
};
25
/*
 * Byte-shuffle index patterns, stored as three consecutive 16-entry groups.
 * Each group lists overlapping index pairs (i, i + 1).  The filter functions
 * load one group as a vector and derive the masks for the following tap
 * pairs by adding 2 and 4 to it.
 */
static const uint8_t vp8_mc_filt_mask_arr[16 * 3] = {
  /* 8 width cases: pairs starting at indices 0..7 */
  0, 1, 1, 2, 2, 3, 3, 4,
  4, 5, 5, 6, 6, 7, 7, 8,
  /* 4 width cases: pairs starting at 0..3, then at 16..19 */
  0, 1, 1, 2, 2, 3, 3, 4,
  16, 17, 17, 18, 18, 19, 19, 20,
  /* 4 width cases: same layout shifted by 8 (8..11, then 24..27) */
  8, 9, 9, 10, 10, 11, 11, 12,
  24, 25, 25, 26, 26, 27, 27, 28
};
34
/*
 * Horizontal 6-tap filter over the vector pair (src0, src1).
 * The pair is shuffled once per mask, the three shuffled vectors are combined
 * with the three filter vectors through DPADD_SH3_SH, and the sums are
 * rounded by VP8_FILTER_SHIFT and saturated with __msa_sat_s_h(.., 7).
 * The macro is a statement expression whose value is the v8i16 result.
 */
#define HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2, filt_h0, filt_h1, \
                        filt_h2)                                           \
  ({                                                                       \
    v16i8 _hz6_shuf0_m, _hz6_shuf1_m, _hz6_shuf2_m;                        \
    v8i16 _hz6_sum_m;                                                      \
                                                                           \
    VSHF_B3_SB(src0, src1, src0, src1, src0, src1, mask0, mask1, mask2,    \
               _hz6_shuf0_m, _hz6_shuf1_m, _hz6_shuf2_m);                  \
    _hz6_sum_m = DPADD_SH3_SH(_hz6_shuf0_m, _hz6_shuf1_m, _hz6_shuf2_m,    \
                              filt_h0, filt_h1, filt_h2);                  \
                                                                           \
    _hz6_sum_m = __msa_srari_h(_hz6_sum_m, VP8_FILTER_SHIFT);              \
    _hz6_sum_m = __msa_sat_s_h(_hz6_sum_m, 7);                             \
                                                                           \
    _hz6_sum_m;                                                            \
  })
51
/*
 * Horizontal 6-tap sums for four 4-wide rows held in src0..src3.
 * The row pairs (src0, src1) and (src2, src3) are shuffled with each of the
 * three masks; out0/out1 receive the dot product with filt0 and then
 * accumulate the products with filt1 and filt2.  No rounding or saturation
 * is applied here.
 */
#define HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,   \
                                   mask2, filt0, filt1, filt2, out0, out1) \
  {                                                                        \
    v16i8 _hz6w4_a0_m, _hz6w4_a1_m;                                        \
    v16i8 _hz6w4_b0_m, _hz6w4_b1_m;                                        \
    v16i8 _hz6w4_c0_m, _hz6w4_c1_m;                                        \
                                                                           \
    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, _hz6w4_a0_m,          \
               _hz6w4_a1_m);                                               \
    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, _hz6w4_b0_m,          \
               _hz6w4_b1_m);                                               \
    VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, _hz6w4_c0_m,          \
               _hz6w4_c1_m);                                               \
                                                                           \
    DOTP_SB2_SH(_hz6w4_a0_m, _hz6w4_a1_m, filt0, filt0, out0, out1);       \
    DPADD_SB2_SH(_hz6w4_b0_m, _hz6w4_b1_m, filt1, filt1, out0, out1);      \
    DPADD_SB2_SH(_hz6w4_c0_m, _hz6w4_c1_m, filt2, filt2, out0, out1);      \
  }
71
/*
 * Horizontal 6-tap sums for four vectors src0..src3, one output per input.
 * Each source vector is shuffled against itself with mask0, mask1 and mask2
 * in turn; out0..out3 receive the dot product with filt0 and then accumulate
 * the products with filt1 and filt2.  No rounding or saturation is applied
 * here.
 */
#define HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,   \
                                   mask2, filt0, filt1, filt2, out0, out1, \
                                   out2, out3)                             \
  {                                                                        \
    v16i8 _hz6w8_s0_m, _hz6w8_s1_m, _hz6w8_s2_m, _hz6w8_s3_m;              \
                                                                           \
    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, _hz6w8_s0_m,          \
               _hz6w8_s1_m);                                               \
    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, _hz6w8_s2_m,          \
               _hz6w8_s3_m);                                               \
    DOTP_SB4_SH(_hz6w8_s0_m, _hz6w8_s1_m, _hz6w8_s2_m, _hz6w8_s3_m, filt0, \
                filt0, filt0, filt0, out0, out1, out2, out3);              \
                                                                           \
    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, _hz6w8_s0_m,          \
               _hz6w8_s1_m);                                               \
    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, _hz6w8_s2_m,          \
               _hz6w8_s3_m);                                               \
    DPADD_SB4_SH(_hz6w8_s0_m, _hz6w8_s1_m, _hz6w8_s2_m, _hz6w8_s3_m,       \
                 filt1, filt1, filt1, filt1, out0, out1, out2, out3);      \
                                                                           \
    VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, _hz6w8_s0_m,          \
               _hz6w8_s1_m);                                               \
    VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, _hz6w8_s2_m,          \
               _hz6w8_s3_m);                                               \
    DPADD_SB4_SH(_hz6w8_s0_m, _hz6w8_s1_m, _hz6w8_s2_m, _hz6w8_s3_m,       \
                 filt2, filt2, filt2, filt2, out0, out1, out2, out3);      \
  }
102
/*
 * 4-tap accumulation: __msa_dotp_s_h of (vec0, filt0), followed by
 * __msa_dpadd_s_h adding the (vec1, filt1) products.  All operands are cast
 * to v16i8; the macro is a statement expression yielding the v8i16 sums.
 */
#define FILT_4TAP_DPADD_S_H(vec0, vec1, filt0, filt1)                      \
  ({                                                                       \
    v8i16 _dpadd4_acc_m = __msa_dotp_s_h((v16i8)vec0, (v16i8)filt0);       \
                                                                           \
    _dpadd4_acc_m =                                                        \
        __msa_dpadd_s_h(_dpadd4_acc_m, (v16i8)vec1, (v16i8)filt1);         \
                                                                           \
    _dpadd4_acc_m;                                                         \
  })
113
/*
 * Horizontal 4-tap filter over the vector pair (src0, src1).
 * The pair is shuffled with mask0 and with mask1, the two shuffled vectors
 * are combined with filt_h0/filt_h1 through FILT_4TAP_DPADD_S_H, and the sums
 * are rounded by VP8_FILTER_SHIFT and saturated with __msa_sat_s_h(.., 7).
 * The macro is a statement expression whose value is the v8i16 result.
 */
#define HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_h0, filt_h1)        \
  ({                                                                       \
    v16i8 _hz4_shuf0_m, _hz4_shuf1_m;                                      \
    v8i16 _hz4_sum_m;                                                      \
                                                                           \
    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, _hz4_shuf0_m,         \
               _hz4_shuf1_m);                                              \
    _hz4_sum_m =                                                           \
        FILT_4TAP_DPADD_S_H(_hz4_shuf0_m, _hz4_shuf1_m, filt_h0, filt_h1); \
                                                                           \
    _hz4_sum_m = __msa_srari_h(_hz4_sum_m, VP8_FILTER_SHIFT);              \
    _hz4_sum_m = __msa_sat_s_h(_hz4_sum_m, 7);                             \
                                                                           \
    _hz4_sum_m;                                                            \
  })
129
/*
 * Horizontal 4-tap sums for four 4-wide rows held in src0..src3.
 * The row pairs (src0, src1) and (src2, src3) are shuffled with mask0 and
 * mask1; out0/out1 receive the dot product with filt0 and then accumulate the
 * products with filt1.  No rounding or saturation is applied here.
 */
#define HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,   \
                                   filt0, filt1, out0, out1)               \
  {                                                                        \
    v16i8 _hz4w4_a0_m, _hz4w4_a1_m;                                        \
    v16i8 _hz4w4_b0_m, _hz4w4_b1_m;                                        \
                                                                           \
    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, _hz4w4_a0_m,          \
               _hz4w4_a1_m);                                               \
    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, _hz4w4_b0_m,          \
               _hz4w4_b1_m);                                               \
                                                                           \
    DOTP_SB2_SH(_hz4w4_a0_m, _hz4w4_a1_m, filt0, filt0, out0, out1);       \
    DPADD_SB2_SH(_hz4w4_b0_m, _hz4w4_b1_m, filt1, filt1, out0, out1);      \
  }
145
/*
 * Horizontal 4-tap sums for four vectors src0..src3, one output per input.
 * Each source vector is shuffled against itself with mask0 and with mask1;
 * out0..out3 receive the dot product with filt0 and then accumulate the
 * products with filt1.  No rounding or saturation is applied here.
 */
#define HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,   \
                                   filt0, filt1, out0, out1, out2, out3)   \
  {                                                                        \
    v16i8 _hz4w8_a0_m, _hz4w8_a1_m, _hz4w8_a2_m, _hz4w8_a3_m;              \
    v16i8 _hz4w8_b0_m, _hz4w8_b1_m, _hz4w8_b2_m, _hz4w8_b3_m;              \
                                                                           \
    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, _hz4w8_a0_m,          \
               _hz4w8_a1_m);                                               \
    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, _hz4w8_a2_m,          \
               _hz4w8_a3_m);                                               \
    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, _hz4w8_b0_m,          \
               _hz4w8_b1_m);                                               \
    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, _hz4w8_b2_m,          \
               _hz4w8_b3_m);                                               \
                                                                           \
    DOTP_SB4_SH(_hz4w8_a0_m, _hz4w8_a1_m, _hz4w8_a2_m, _hz4w8_a3_m, filt0, \
                filt0, filt0, filt0, out0, out1, out2, out3);              \
    DPADD_SB4_SH(_hz4w8_b0_m, _hz4w8_b1_m, _hz4w8_b2_m, _hz4w8_b3_m,       \
                 filt1, filt1, filt1, filt1, out0, out1, out2, out3);      \
  }
167
/*
 * Horizontal 6-tap filtering of a 4x4 block.
 * Four source rows are loaded starting two bytes to the left of src,
 * filtered with the taps at filter, rounded by VP8_FILTER_SHIFT, saturated
 * and stored as a 4x4 block at dst.
 */
static void common_hz_6t_4x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                 uint8_t *RESTRICT dst, int32_t dst_stride,
                                 const int8_t *filter) {
  v16i8 row0, row1, row2, row3;
  v16i8 coef0, coef1, coef2;
  v16u8 shuf0, shuf1, shuf2, pixels;
  v8i16 coefs, sum0, sum1;

  /* Three filter vectors taken from halfword lanes 0, 1 and 2 of the taps. */
  coefs = LD_SH(filter);
  SPLATI_H3_SB(coefs, 0, 1, 2, coef0, coef1, coef2);

  /* Second 16-entry group of the mask table, plus copies advanced by 2, 4. */
  shuf0 = LD_UB(&vp8_mc_filt_mask_arr[16]);
  shuf1 = shuf0 + 2;
  shuf2 = shuf0 + 4;

  src -= 2;
  LD_SB4(src, src_stride, row0, row1, row2, row3);
  XORI_B4_128_SB(row0, row1, row2, row3);

  HORIZ_6TAP_4WID_4VECS_FILT(row0, row1, row2, row3, shuf0, shuf1, shuf2, coef0,
                             coef1, coef2, sum0, sum1);
  SRARI_H2_SH(sum0, sum1, VP8_FILTER_SHIFT);
  SAT_SH2_SH(sum0, sum1, 7);

  pixels = PCKEV_XORI128_UB(sum0, sum1);
  ST4x4_UB(pixels, pixels, 0, 1, 2, 3, dst, dst_stride);
}
193
/*
 * Horizontal 6-tap filtering of a 4-wide, 8-row block.
 * Two groups of four source rows are loaded starting two bytes to the left
 * of src and filtered with the taps at filter; both groups are rounded and
 * saturated together and stored as two 4x4 blocks at dst.
 */
static void common_hz_6t_4x8_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                 uint8_t *RESTRICT dst, int32_t dst_stride,
                                 const int8_t *filter) {
  v16i8 row0, row1, row2, row3;
  v16i8 coef0, coef1, coef2;
  v16u8 shuf0, shuf1, shuf2, pixels;
  v8i16 coefs, top0, top1, bot0, bot1;

  coefs = LD_SH(filter);
  SPLATI_H3_SB(coefs, 0, 1, 2, coef0, coef1, coef2);

  shuf0 = LD_UB(&vp8_mc_filt_mask_arr[16]);
  shuf1 = shuf0 + 2;
  shuf2 = shuf0 + 4;

  src -= 2;

  /* First group of four rows. */
  LD_SB4(src, src_stride, row0, row1, row2, row3);
  src += (4 * src_stride);
  XORI_B4_128_SB(row0, row1, row2, row3);
  HORIZ_6TAP_4WID_4VECS_FILT(row0, row1, row2, row3, shuf0, shuf1, shuf2, coef0,
                             coef1, coef2, top0, top1);

  /* Second group of four rows; the row registers are reused. */
  LD_SB4(src, src_stride, row0, row1, row2, row3);
  XORI_B4_128_SB(row0, row1, row2, row3);
  HORIZ_6TAP_4WID_4VECS_FILT(row0, row1, row2, row3, shuf0, shuf1, shuf2, coef0,
                             coef1, coef2, bot0, bot1);

  SRARI_H4_SH(top0, top1, bot0, bot1, VP8_FILTER_SHIFT);
  SAT_SH4_SH(top0, top1, bot0, bot1, 7);

  pixels = PCKEV_XORI128_UB(top0, top1);
  ST4x4_UB(pixels, pixels, 0, 1, 2, 3, dst, dst_stride);
  dst += (4 * dst_stride);

  pixels = PCKEV_XORI128_UB(bot0, bot1);
  ST4x4_UB(pixels, pixels, 0, 1, 2, 3, dst, dst_stride);
}
227
/*
 * Horizontal 6-tap filtering of a 4-wide block: dispatches on height to the
 * 4x4 or 4x8 routine.  Any other height leaves dst untouched.
 */
static void common_hz_6t_4w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                uint8_t *RESTRICT dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height) {
  switch (height) {
    case 4:
      common_hz_6t_4x4_msa(src, src_stride, dst, dst_stride, filter);
      break;
    case 8:
      common_hz_6t_4x8_msa(src, src_stride, dst, dst_stride, filter);
      break;
    default: break;
  }
}
237
/*
 * Horizontal 6-tap filtering of an 8-pixel-wide block.
 *
 * src, src_stride: source pixels; reads start two bytes to the left of src.
 * dst, dst_stride: destination for the filtered pixels.
 * filter:          filter taps, loaded with LD_SH and split into three
 *                  filter vectors (halfword lanes 0, 1 and 2).
 * height:          number of rows; rows are produced four at a time, so
 *                  (height >> 2) groups of four rows are written.
 *
 * Every row group goes through the same loop and no group is filtered
 * unconditionally ahead of it, so a height below 4 writes nothing rather
 * than wrapping an unsigned "(height >> 2) - 1" counter.
 */
static void common_hz_6t_8w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                uint8_t *RESTRICT dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, filt0, filt1, filt2;
  v16u8 mask0, mask1, mask2, tmp0, tmp1;
  v8i16 filt, out0, out1, out2, out3;

  /* First 16-entry group of the mask table. */
  mask0 = LD_UB(&vp8_mc_filt_mask_arr[0]);
  src -= 2;

  filt = LD_SH(filter);
  SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2);

  /* Masks for the second and third tap pairs. */
  mask1 = mask0 + 2;
  mask2 = mask0 + 4;

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (4 * src_stride);
    HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               filt0, filt1, filt2, out0, out1, out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, VP8_FILTER_SHIFT);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    tmp0 = PCKEV_XORI128_UB(out0, out1);
    tmp1 = PCKEV_XORI128_UB(out2, out3);
    ST8x4_UB(tmp0, tmp1, dst, dst_stride);
    dst += (4 * dst_stride);
  }
}
281
/*
 * Horizontal 6-tap filtering of a 16-pixel-wide block.
 * Each loop pass handles four rows.  Every row is loaded as two vectors
 * (at src and src + 8, both offset two bytes to the left), each vector is
 * filtered separately, and the two results are packed into one 16-byte
 * output row.  The loop runs (height >> 2) times.
 */
static void common_hz_6t_16w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                 uint8_t *RESTRICT dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height) {
  uint32_t groups;
  v16i8 r0l, r0r, r1l, r1r, r2l, r2r, r3l, r3r;
  v16i8 coef0, coef1, coef2;
  v16u8 shuf0, shuf1, shuf2;
  v16u8 line0, line1, line2, line3;
  v8i16 coefs, s0l, s0r, s1l, s1r, s2l, s2r, s3l, s3r;

  coefs = LD_SH(filter);
  SPLATI_H3_SB(coefs, 0, 1, 2, coef0, coef1, coef2);

  shuf0 = LD_UB(&vp8_mc_filt_mask_arr[0]);
  shuf1 = shuf0 + 2;
  shuf2 = shuf0 + 4;

  src -= 2;

  groups = (height >> 2);
  while (groups--) {
    /* Left and right vectors of four consecutive rows. */
    LD_SB4(src, src_stride, r0l, r1l, r2l, r3l);
    LD_SB4(src + 8, src_stride, r0r, r1r, r2r, r3r);
    src += (4 * src_stride);
    XORI_B8_128_SB(r0l, r0r, r1l, r1r, r2l, r2r, r3l, r3r);

    HORIZ_6TAP_8WID_4VECS_FILT(r0l, r0r, r1l, r1r, shuf0, shuf1, shuf2, coef0,
                               coef1, coef2, s0l, s0r, s1l, s1r);
    HORIZ_6TAP_8WID_4VECS_FILT(r2l, r2r, r3l, r3r, shuf0, shuf1, shuf2, coef0,
                               coef1, coef2, s2l, s2r, s3l, s3r);

    SRARI_H4_SH(s0l, s0r, s1l, s1r, VP8_FILTER_SHIFT);
    SRARI_H4_SH(s2l, s2r, s3l, s3r, VP8_FILTER_SHIFT);
    SAT_SH4_SH(s0l, s0r, s1l, s1r, 7);
    SAT_SH4_SH(s2l, s2r, s3l, s3r, 7);

    line0 = PCKEV_XORI128_UB(s0l, s0r);
    line1 = PCKEV_XORI128_UB(s1l, s1r);
    line2 = PCKEV_XORI128_UB(s2l, s2r);
    line3 = PCKEV_XORI128_UB(s3l, s3r);
    ST_UB4(line0, line1, line2, line3, dst, dst_stride);
    dst += (4 * dst_stride);
  }
}
327
/*
 * Vertical 6-tap filtering of a 4-pixel-wide block.
 * Reading starts two rows above src.  Five rows are loaded up front; each
 * loop pass loads four more, produces four output rows and carries the
 * newest interleaved data forward.  The loop runs (height >> 2) times.
 */
static void common_vt_6t_4w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                uint8_t *RESTRICT dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height) {
  uint32_t groups;
  v16i8 r0, r1, r2, r3, r4, r5, r6, r7, r8;
  v16i8 ilv10, ilv21, ilv32, ilv43, ilv54, ilv65, ilv76, ilv87;
  v16i8 grp2110, grp4332, grp6554, grp8776;
  v16i8 coef0, coef1, coef2;
  v16u8 pixels;
  v8i16 coefs, sum10, sum32;

  coefs = LD_SH(filter);
  SPLATI_H3_SB(coefs, 0, 1, 2, coef0, coef1, coef2);

  src -= (2 * src_stride);
  LD_SB5(src, src_stride, r0, r1, r2, r3, r4);
  src += (5 * src_stride);

  /* Interleave neighbouring rows, then pair two interleaves per vector. */
  ILVR_B4_SB(r1, r0, r2, r1, r3, r2, r4, r3, ilv10, ilv21, ilv32, ilv43);
  ILVR_D2_SB(ilv21, ilv10, ilv43, ilv32, grp2110, grp4332);
  XORI_B2_128_SB(grp2110, grp4332);

  groups = (height >> 2);
  while (groups--) {
    LD_SB4(src, src_stride, r5, r6, r7, r8);
    src += (4 * src_stride);

    ILVR_B4_SB(r5, r4, r6, r5, r7, r6, r8, r7, ilv54, ilv65, ilv76, ilv87);
    ILVR_D2_SB(ilv65, ilv54, ilv87, ilv76, grp6554, grp8776);
    XORI_B2_128_SB(grp6554, grp8776);

    sum10 = DPADD_SH3_SH(grp2110, grp4332, grp6554, coef0, coef1, coef2);
    sum32 = DPADD_SH3_SH(grp4332, grp6554, grp8776, coef0, coef1, coef2);
    SRARI_H2_SH(sum10, sum32, VP8_FILTER_SHIFT);
    SAT_SH2_SH(sum10, sum32, 7);

    pixels = PCKEV_XORI128_UB(sum10, sum32);
    ST4x4_UB(pixels, pixels, 0, 1, 2, 3, dst, dst_stride);
    dst += (4 * dst_stride);

    /* Carry the last row and the two newest groups into the next pass. */
    r4 = r8;
    grp2110 = grp6554;
    grp4332 = grp8776;
  }
}
372
/*
 * Vertical 6-tap filtering of an 8-pixel-wide block.
 * Reading starts two rows above src.  Five rows are loaded up front; each
 * loop pass loads four more, produces four output rows and carries the
 * newest interleaved row pairs forward.  The loop runs (height >> 2) times.
 */
static void common_vt_6t_8w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                uint8_t *RESTRICT dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height) {
  uint32_t groups;
  v16i8 r0, r1, r2, r3, r4, r5, r6, r7, r8;
  v16i8 ilv10, ilv21, ilv32, ilv43, ilv54, ilv65, ilv76, ilv87;
  v16i8 coef0, coef1, coef2;
  v16u8 upper, lower;
  v8i16 coefs, sum0, sum1, sum2, sum3;

  coefs = LD_SH(filter);
  SPLATI_H3_SB(coefs, 0, 1, 2, coef0, coef1, coef2);

  src -= (2 * src_stride);
  LD_SB5(src, src_stride, r0, r1, r2, r3, r4);
  src += (5 * src_stride);

  XORI_B5_128_SB(r0, r1, r2, r3, r4);
  ILVR_B4_SB(r1, r0, r2, r1, r3, r2, r4, r3, ilv10, ilv21, ilv32, ilv43);

  groups = (height >> 2);
  while (groups--) {
    LD_SB4(src, src_stride, r5, r6, r7, r8);
    src += (4 * src_stride);
    XORI_B4_128_SB(r5, r6, r7, r8);

    ILVR_B4_SB(r5, r4, r6, r5, r7, r6, r8, r7, ilv54, ilv65, ilv76, ilv87);

    sum0 = DPADD_SH3_SH(ilv10, ilv32, ilv54, coef0, coef1, coef2);
    sum1 = DPADD_SH3_SH(ilv21, ilv43, ilv65, coef0, coef1, coef2);
    sum2 = DPADD_SH3_SH(ilv32, ilv54, ilv76, coef0, coef1, coef2);
    sum3 = DPADD_SH3_SH(ilv43, ilv65, ilv87, coef0, coef1, coef2);
    SRARI_H4_SH(sum0, sum1, sum2, sum3, VP8_FILTER_SHIFT);
    SAT_SH4_SH(sum0, sum1, sum2, sum3, 7);

    upper = PCKEV_XORI128_UB(sum0, sum1);
    lower = PCKEV_XORI128_UB(sum2, sum3);
    ST8x4_UB(upper, lower, dst, dst_stride);
    dst += (4 * dst_stride);

    /* Carry the last row and the four newest row pairs forward. */
    r4 = r8;
    ilv10 = ilv54;
    ilv21 = ilv65;
    ilv32 = ilv76;
    ilv43 = ilv87;
  }
}
420
/*
 * Vertical 6-tap filtering of a 16-pixel-wide block.
 * Reading starts two rows above src.  Neighbouring rows are interleaved with
 * both ILVR (the "_r" vectors) and ILVL (the "_l" vectors); the two sets are
 * filtered separately and packed into full 16-byte output rows.  Each loop
 * pass produces four rows and the loop runs (height >> 2) times.
 */
static void common_vt_6t_16w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                 uint8_t *RESTRICT dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height) {
  uint32_t groups;
  v16i8 r0, r1, r2, r3, r4, r5, r6, r7, r8;
  v16i8 ilv10_r, ilv21_r, ilv32_r, ilv43_r, ilv54_r, ilv65_r, ilv76_r, ilv87_r;
  v16i8 ilv10_l, ilv21_l, ilv32_l, ilv43_l, ilv54_l, ilv65_l, ilv76_l, ilv87_l;
  v16i8 coef0, coef1, coef2;
  v16u8 line0, line1, line2, line3;
  v8i16 coefs;
  v8i16 sum0_r, sum1_r, sum2_r, sum3_r;
  v8i16 sum0_l, sum1_l, sum2_l, sum3_l;

  coefs = LD_SH(filter);
  SPLATI_H3_SB(coefs, 0, 1, 2, coef0, coef1, coef2);

  src -= (2 * src_stride);
  LD_SB5(src, src_stride, r0, r1, r2, r3, r4);
  src += (5 * src_stride);

  XORI_B5_128_SB(r0, r1, r2, r3, r4);
  ILVR_B4_SB(r1, r0, r2, r1, r3, r2, r4, r3, ilv10_r, ilv21_r, ilv32_r,
             ilv43_r);
  ILVL_B4_SB(r1, r0, r2, r1, r3, r2, r4, r3, ilv10_l, ilv21_l, ilv32_l,
             ilv43_l);

  groups = (height >> 2);
  while (groups--) {
    LD_SB4(src, src_stride, r5, r6, r7, r8);
    src += (4 * src_stride);
    XORI_B4_128_SB(r5, r6, r7, r8);

    ILVR_B4_SB(r5, r4, r6, r5, r7, r6, r8, r7, ilv54_r, ilv65_r, ilv76_r,
               ilv87_r);
    ILVL_B4_SB(r5, r4, r6, r5, r7, r6, r8, r7, ilv54_l, ilv65_l, ilv76_l,
               ilv87_l);

    sum0_r = DPADD_SH3_SH(ilv10_r, ilv32_r, ilv54_r, coef0, coef1, coef2);
    sum1_r = DPADD_SH3_SH(ilv21_r, ilv43_r, ilv65_r, coef0, coef1, coef2);
    sum2_r = DPADD_SH3_SH(ilv32_r, ilv54_r, ilv76_r, coef0, coef1, coef2);
    sum3_r = DPADD_SH3_SH(ilv43_r, ilv65_r, ilv87_r, coef0, coef1, coef2);
    sum0_l = DPADD_SH3_SH(ilv10_l, ilv32_l, ilv54_l, coef0, coef1, coef2);
    sum1_l = DPADD_SH3_SH(ilv21_l, ilv43_l, ilv65_l, coef0, coef1, coef2);
    sum2_l = DPADD_SH3_SH(ilv32_l, ilv54_l, ilv76_l, coef0, coef1, coef2);
    sum3_l = DPADD_SH3_SH(ilv43_l, ilv65_l, ilv87_l, coef0, coef1, coef2);

    SRARI_H4_SH(sum0_r, sum1_r, sum2_r, sum3_r, VP8_FILTER_SHIFT);
    SRARI_H4_SH(sum0_l, sum1_l, sum2_l, sum3_l, VP8_FILTER_SHIFT);
    SAT_SH4_SH(sum0_r, sum1_r, sum2_r, sum3_r, 7);
    SAT_SH4_SH(sum0_l, sum1_l, sum2_l, sum3_l, 7);

    PCKEV_B4_UB(sum0_l, sum0_r, sum1_l, sum1_r, sum2_l, sum2_r, sum3_l, sum3_r,
                line0, line1, line2, line3);
    XORI_B4_128_UB(line0, line1, line2, line3);
    ST_UB4(line0, line1, line2, line3, dst, dst_stride);
    dst += (4 * dst_stride);

    /* Carry the last row and the newest row pairs into the next pass. */
    r4 = r8;
    ilv10_r = ilv54_r;
    ilv21_r = ilv65_r;
    ilv32_r = ilv76_r;
    ilv43_r = ilv87_r;
    ilv10_l = ilv54_l;
    ilv21_l = ilv65_l;
    ilv32_l = ilv76_l;
    ilv43_l = ilv87_l;
  }
}
484
/*
 * Combined horizontal and vertical 6-tap filtering of a 4-pixel-wide block.
 *
 * src, src_stride: source pixels; reads start two bytes to the left of and
 *                  two rows above src.
 * dst, dst_stride: destination for the filtered pixels.
 * filter_horiz:    taps for the horizontal pass (HORIZ_6TAP_FILT).
 * filter_vert:     taps for the vertical pass (DPADD_SH3_SH).
 * height:          number of rows; four rows are produced per loop pass and
 *                  the loop runs (height >> 2) times.
 *
 * Both passes round with VP8_FILTER_SHIFT and then saturate.
 */
static void common_hv_6ht_6vt_4w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                     uint8_t *RESTRICT dst, int32_t dst_stride,
                                     const int8_t *filter_horiz,
                                     const int8_t *filter_vert,
                                     int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
  v16i8 filt_hz0, filt_hz1, filt_hz2;
  v16u8 mask0, mask1, mask2, out;
  v8i16 tmp0, tmp1;
  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
  v8i16 hz_out7, filt, filt_vt0, filt_vt1, filt_vt2, out0, out1, out2, out3;

  /* Second 16-entry group of the mask table. */
  mask0 = LD_UB(&vp8_mc_filt_mask_arr[16]);
  src -= (2 + 2 * src_stride);

  filt = LD_SH(filter_horiz);
  SPLATI_H3_SB(filt, 0, 1, 2, filt_hz0, filt_hz1, filt_hz2);
  filt = LD_SH(filter_vert);
  SPLATI_H3_SH(filt, 0, 1, 2, filt_vt0, filt_vt1, filt_vt2);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;

  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
  src += (5 * src_stride);

  XORI_B5_128_SB(src0, src1, src2, src3, src4);
  /* Horizontal pass over the first five rows, two rows per call. */
  hz_out0 = HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2, filt_hz0, filt_hz1,
                            filt_hz2);
  hz_out2 = HORIZ_6TAP_FILT(src2, src3, mask0, mask1, mask2, filt_hz0, filt_hz1,
                            filt_hz2);
  /* Built from the two neighbouring results by an 8-byte slide. */
  hz_out1 = (v8i16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
  hz_out3 = HORIZ_6TAP_FILT(src3, src4, mask0, mask1, mask2, filt_hz0, filt_hz1,
                            filt_hz2);
  ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB2(src, src_stride, src5, src6);
    src += (2 * src_stride);

    XORI_B2_128_SB(src5, src6);
    hz_out5 = HORIZ_6TAP_FILT(src5, src6, mask0, mask1, mask2, filt_hz0,
                              filt_hz1, filt_hz2);
    hz_out4 = (v8i16)__msa_sldi_b((v16i8)hz_out5, (v16i8)hz_out3, 8);

    LD_SB2(src, src_stride, src7, src8);
    src += (2 * src_stride);

    XORI_B2_128_SB(src7, src8);
    hz_out7 = HORIZ_6TAP_FILT(src7, src8, mask0, mask1, mask2, filt_hz0,
                              filt_hz1, filt_hz2);
    hz_out6 = (v8i16)__msa_sldi_b((v16i8)hz_out7, (v16i8)hz_out5, 8);

    /* Vertical pass over the interleaved horizontal results. */
    out2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4);
    tmp0 = DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);

    out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
    tmp1 = DPADD_SH3_SH(out1, out2, out3, filt_vt0, filt_vt1, filt_vt2);

    /* Same rounding shift as the horizontal pass. */
    SRARI_H2_SH(tmp0, tmp1, VP8_FILTER_SHIFT);
    SAT_SH2_SH(tmp0, tmp1, 7);
    out = PCKEV_XORI128_UB(tmp0, tmp1);
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
    dst += (4 * dst_stride);

    /* Carry the newest horizontal results into the next pass. */
    hz_out3 = hz_out7;
    out0 = out2;
    out1 = out3;
  }
}
556
/*
 * Combined horizontal and vertical 6-tap filtering of an 8-pixel-wide block.
 *
 * src, src_stride: source pixels; reads start two bytes to the left of and
 *                  two rows above src.
 * dst, dst_stride: destination for the filtered pixels.
 * filter_horiz:    taps for the horizontal pass (HORIZ_6TAP_FILT).
 * filter_vert:     taps for the vertical pass (DPADD_SH3_SH).
 * height:          number of rows; four rows are produced per loop pass and
 *                  the loop runs (height >> 2) times.
 *
 * Both passes round with VP8_FILTER_SHIFT and then saturate.
 */
static void common_hv_6ht_6vt_8w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                     uint8_t *RESTRICT dst, int32_t dst_stride,
                                     const int8_t *filter_horiz,
                                     const int8_t *filter_vert,
                                     int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
  v16i8 filt_hz0, filt_hz1, filt_hz2;
  v16u8 mask0, mask1, mask2, vec0, vec1;
  v8i16 filt, filt_vt0, filt_vt1, filt_vt2;
  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
  v8i16 hz_out7, hz_out8, out0, out1, out2, out3, out4, out5, out6, out7;
  v8i16 tmp0, tmp1, tmp2, tmp3;

  /* First 16-entry group of the mask table. */
  mask0 = LD_UB(&vp8_mc_filt_mask_arr[0]);
  src -= (2 + 2 * src_stride);

  filt = LD_SH(filter_horiz);
  SPLATI_H3_SB(filt, 0, 1, 2, filt_hz0, filt_hz1, filt_hz2);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;

  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
  src += (5 * src_stride);

  XORI_B5_128_SB(src0, src1, src2, src3, src4);
  /* Horizontal pass over the first five rows, one row per call. */
  hz_out0 = HORIZ_6TAP_FILT(src0, src0, mask0, mask1, mask2, filt_hz0, filt_hz1,
                            filt_hz2);
  hz_out1 = HORIZ_6TAP_FILT(src1, src1, mask0, mask1, mask2, filt_hz0, filt_hz1,
                            filt_hz2);
  hz_out2 = HORIZ_6TAP_FILT(src2, src2, mask0, mask1, mask2, filt_hz0, filt_hz1,
                            filt_hz2);
  hz_out3 = HORIZ_6TAP_FILT(src3, src3, mask0, mask1, mask2, filt_hz0, filt_hz1,
                            filt_hz2);
  hz_out4 = HORIZ_6TAP_FILT(src4, src4, mask0, mask1, mask2, filt_hz0, filt_hz1,
                            filt_hz2);

  filt = LD_SH(filter_vert);
  SPLATI_H3_SH(filt, 0, 1, 2, filt_vt0, filt_vt1, filt_vt2);

  /* Interleave neighbouring horizontal results for the vertical pass. */
  ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
  ILVEV_B2_SH(hz_out1, hz_out2, hz_out3, hz_out4, out3, out4);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src5, src6, src7, src8);
    src += (4 * src_stride);

    XORI_B4_128_SB(src5, src6, src7, src8);
    hz_out5 = HORIZ_6TAP_FILT(src5, src5, mask0, mask1, mask2, filt_hz0,
                              filt_hz1, filt_hz2);
    out2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4);
    tmp0 = DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);

    hz_out6 = HORIZ_6TAP_FILT(src6, src6, mask0, mask1, mask2, filt_hz0,
                              filt_hz1, filt_hz2);
    out5 = (v8i16)__msa_ilvev_b((v16i8)hz_out6, (v16i8)hz_out5);
    tmp1 = DPADD_SH3_SH(out3, out4, out5, filt_vt0, filt_vt1, filt_vt2);

    hz_out7 = HORIZ_6TAP_FILT(src7, src7, mask0, mask1, mask2, filt_hz0,
                              filt_hz1, filt_hz2);
    out7 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
    tmp2 = DPADD_SH3_SH(out1, out2, out7, filt_vt0, filt_vt1, filt_vt2);

    hz_out8 = HORIZ_6TAP_FILT(src8, src8, mask0, mask1, mask2, filt_hz0,
                              filt_hz1, filt_hz2);
    out6 = (v8i16)__msa_ilvev_b((v16i8)hz_out8, (v16i8)hz_out7);
    tmp3 = DPADD_SH3_SH(out4, out5, out6, filt_vt0, filt_vt1, filt_vt2);

    /* Same rounding shift as the horizontal pass. */
    SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, VP8_FILTER_SHIFT);
    SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
    vec0 = PCKEV_XORI128_UB(tmp0, tmp1);
    vec1 = PCKEV_XORI128_UB(tmp2, tmp3);
    ST8x4_UB(vec0, vec1, dst, dst_stride);
    dst += (4 * dst_stride);

    /* Carry the newest horizontal results into the next pass. */
    hz_out4 = hz_out8;
    out0 = out2;
    out1 = out7;
    out3 = out5;
    out4 = out6;
  }
}
640
/*
 * Combined horizontal and vertical 6-tap filtering of a 16-pixel-wide block,
 * done as two calls to the 8-wide routine: one at column 0, one at column 8.
 */
static void common_hv_6ht_6vt_16w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                      uint8_t *RESTRICT dst, int32_t dst_stride,
                                      const int8_t *filter_horiz,
                                      const int8_t *filter_vert,
                                      int32_t height) {
  int32_t col;

  for (col = 0; col < 16; col += 8) {
    common_hv_6ht_6vt_8w_msa(src + col, src_stride, dst + col, dst_stride,
                             filter_horiz, filter_vert, height);
  }
}
654
/*
 * Horizontal 4-tap filtering of a 4x4 block.
 * Four source rows are loaded starting one byte to the left of src,
 * filtered with the taps at filter, rounded by VP8_FILTER_SHIFT, saturated
 * and stored as a 4x4 block at dst.
 */
static void common_hz_4t_4x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                 uint8_t *RESTRICT dst, int32_t dst_stride,
                                 const int8_t *filter) {
  v16i8 row0, row1, row2, row3;
  v16i8 coef0, coef1, shuf0, shuf1;
  v16u8 pixels;
  v8i16 coefs, sum0, sum1;

  /* Two filter vectors taken from halfword lanes 0 and 1 of the taps. */
  coefs = LD_SH(filter);
  SPLATI_H2_SB(coefs, 0, 1, coef0, coef1);

  /* Second 16-entry group of the mask table, plus a copy advanced by 2. */
  shuf0 = LD_SB(&vp8_mc_filt_mask_arr[16]);
  shuf1 = shuf0 + 2;

  src -= 1;
  LD_SB4(src, src_stride, row0, row1, row2, row3);
  XORI_B4_128_SB(row0, row1, row2, row3);

  HORIZ_4TAP_4WID_4VECS_FILT(row0, row1, row2, row3, shuf0, shuf1, coef0, coef1,
                             sum0, sum1);
  SRARI_H2_SH(sum0, sum1, VP8_FILTER_SHIFT);
  SAT_SH2_SH(sum0, sum1, 7);

  pixels = PCKEV_XORI128_UB(sum0, sum1);
  ST4x4_UB(pixels, pixels, 0, 1, 2, 3, dst, dst_stride);
}
679
/*
 * Horizontal 4-tap filtering of a 4-wide, 8-row block.
 * Two groups of four source rows are loaded starting one byte to the left
 * of src and filtered with the taps at filter; both groups are rounded and
 * saturated together and stored as two 4x4 blocks at dst.
 */
static void common_hz_4t_4x8_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                 uint8_t *RESTRICT dst, int32_t dst_stride,
                                 const int8_t *filter) {
  v16i8 row0, row1, row2, row3;
  v16i8 coef0, coef1, shuf0, shuf1;
  v16u8 pixels;
  v8i16 coefs, top0, top1, bot0, bot1;

  coefs = LD_SH(filter);
  SPLATI_H2_SB(coefs, 0, 1, coef0, coef1);

  shuf0 = LD_SB(&vp8_mc_filt_mask_arr[16]);
  shuf1 = shuf0 + 2;

  src -= 1;

  /* First group of four rows. */
  LD_SB4(src, src_stride, row0, row1, row2, row3);
  src += (4 * src_stride);
  XORI_B4_128_SB(row0, row1, row2, row3);
  HORIZ_4TAP_4WID_4VECS_FILT(row0, row1, row2, row3, shuf0, shuf1, coef0, coef1,
                             top0, top1);

  /* Second group of four rows; the row registers are reused. */
  LD_SB4(src, src_stride, row0, row1, row2, row3);
  XORI_B4_128_SB(row0, row1, row2, row3);
  HORIZ_4TAP_4WID_4VECS_FILT(row0, row1, row2, row3, shuf0, shuf1, coef0, coef1,
                             bot0, bot1);

  SRARI_H4_SH(top0, top1, bot0, bot1, VP8_FILTER_SHIFT);
  SAT_SH4_SH(top0, top1, bot0, bot1, 7);

  pixels = PCKEV_XORI128_UB(top0, top1);
  ST4x4_UB(pixels, pixels, 0, 1, 2, 3, dst, dst_stride);
  dst += (4 * dst_stride);

  pixels = PCKEV_XORI128_UB(bot0, bot1);
  ST4x4_UB(pixels, pixels, 0, 1, 2, 3, dst, dst_stride);
}
713
/*
 * Horizontal 4-tap filtering of a 4-wide block: dispatches on height to the
 * 4x4 or 4x8 routine.  Any other height leaves dst untouched.
 */
static void common_hz_4t_4w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                uint8_t *RESTRICT dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height) {
  switch (height) {
    case 4:
      common_hz_4t_4x4_msa(src, src_stride, dst, dst_stride, filter);
      break;
    case 8:
      common_hz_4t_4x8_msa(src, src_stride, dst, dst_stride, filter);
      break;
    default: break;
  }
}
723
/*
 * Horizontal 4-tap filtering of an 8-pixel-wide block.
 * Rows are loaded starting one byte to the left of src and processed four at
 * a time; the loop runs (height >> 2) times.
 */
static void common_hz_4t_8w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                uint8_t *RESTRICT dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height) {
  uint32_t groups;
  v16i8 row0, row1, row2, row3;
  v16i8 coef0, coef1, shuf0, shuf1;
  v16u8 upper, lower;
  v8i16 coefs, sum0, sum1, sum2, sum3;

  coefs = LD_SH(filter);
  SPLATI_H2_SB(coefs, 0, 1, coef0, coef1);

  /* First 16-entry group of the mask table, plus a copy advanced by 2. */
  shuf0 = LD_SB(&vp8_mc_filt_mask_arr[0]);
  shuf1 = shuf0 + 2;

  src -= 1;

  groups = (height >> 2);
  while (groups--) {
    LD_SB4(src, src_stride, row0, row1, row2, row3);
    src += (4 * src_stride);
    XORI_B4_128_SB(row0, row1, row2, row3);

    HORIZ_4TAP_8WID_4VECS_FILT(row0, row1, row2, row3, shuf0, shuf1, coef0,
                               coef1, sum0, sum1, sum2, sum3);
    SRARI_H4_SH(sum0, sum1, sum2, sum3, VP8_FILTER_SHIFT);
    SAT_SH4_SH(sum0, sum1, sum2, sum3, 7);

    upper = PCKEV_XORI128_UB(sum0, sum1);
    lower = PCKEV_XORI128_UB(sum2, sum3);
    ST8x4_UB(upper, lower, dst, dst_stride);
    dst += (4 * dst_stride);
  }
}
755
/*
 * Horizontal 4-tap filtering of a 16-pixel-wide block.
 * Each loop pass handles four rows.  Every row is loaded as two vectors
 * (at src and src + 8, both offset one byte to the left), each vector is
 * filtered separately, and the two results are packed into one 16-byte
 * output row.  The loop runs (height >> 2) times.
 */
static void common_hz_4t_16w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                 uint8_t *RESTRICT dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height) {
  uint32_t groups;
  v16i8 r0l, r0r, r1l, r1r, r2l, r2r, r3l, r3r;
  v16i8 coef0, coef1, shuf0, shuf1;
  v16u8 line0, line1, line2, line3;
  v8i16 coefs, s0l, s0r, s1l, s1r, s2l, s2r, s3l, s3r;

  coefs = LD_SH(filter);
  SPLATI_H2_SB(coefs, 0, 1, coef0, coef1);

  shuf0 = LD_SB(&vp8_mc_filt_mask_arr[0]);
  shuf1 = shuf0 + 2;

  src -= 1;

  groups = (height >> 2);
  while (groups--) {
    /* Left and right vectors of four consecutive rows. */
    LD_SB4(src, src_stride, r0l, r1l, r2l, r3l);
    LD_SB4(src + 8, src_stride, r0r, r1r, r2r, r3r);
    src += (4 * src_stride);
    XORI_B8_128_SB(r0l, r0r, r1l, r1r, r2l, r2r, r3l, r3r);

    HORIZ_4TAP_8WID_4VECS_FILT(r0l, r0r, r1l, r1r, shuf0, shuf1, coef0, coef1,
                               s0l, s0r, s1l, s1r);
    HORIZ_4TAP_8WID_4VECS_FILT(r2l, r2r, r3l, r3r, shuf0, shuf1, coef0, coef1,
                               s2l, s2r, s3l, s3r);

    SRARI_H4_SH(s0l, s0r, s1l, s1r, VP8_FILTER_SHIFT);
    SRARI_H4_SH(s2l, s2r, s3l, s3r, VP8_FILTER_SHIFT);
    SAT_SH4_SH(s0l, s0r, s1l, s1r, 7);
    SAT_SH4_SH(s2l, s2r, s3l, s3r, 7);

    line0 = PCKEV_XORI128_UB(s0l, s0r);
    line1 = PCKEV_XORI128_UB(s1l, s1r);
    line2 = PCKEV_XORI128_UB(s2l, s2r);
    line3 = PCKEV_XORI128_UB(s3l, s3r);
    ST_UB4(line0, line1, line2, line3, dst, dst_stride);
    dst += (4 * dst_stride);
  }
}
801
/*
 * Vertical 4-tap filter for 4-pixel-wide blocks.
 *
 * src / src_stride : source pixels and row pitch; reads start one row above
 *                    src
 * dst / dst_stride : destination pixels and row pitch
 * filter           : coefficient bytes; the first two 16-bit lanes are
 *                    broadcast as the two tap pairs
 * height           : number of rows; four rows are produced per pass
 *                    (height >> 2 passes)
 *
 * Three rows are primed before the loop; every pass consumes four more rows
 * and keeps the last one as the seam for the next pass.
 */
static void common_vt_4t_4w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                uint8_t *RESTRICT dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height) {
  uint32_t pass;
  const uint32_t num_passes = (height >> 2);
  v16i8 row_a, row_b, row_c, row_d, row_e, row_f, row_g;
  v16i8 pair_ba, pair_cb, pair_dc, pair_ed, pair_fe, pair_gf;
  v16i8 quad_lo, quad_hi, tap01, tap23;
  v8i16 taps, sum_top, sum_bot;
  v16u8 packed;

  taps = LD_SH(filter);
  SPLATI_H2_SB(taps, 0, 1, tap01, tap23);

  /* Prime the window with the row above src and the next two rows. */
  src -= src_stride;
  LD_SB3(src, src_stride, row_a, row_b, row_c);
  src += (3 * src_stride);

  ILVR_B2_SB(row_b, row_a, row_c, row_b, pair_ba, pair_cb);
  quad_lo = (v16i8)__msa_xori_b(
      (v16u8)__msa_ilvr_d((v2i64)pair_cb, (v2i64)pair_ba), 128);

  for (pass = 0; pass < num_passes; ++pass) {
    LD_SB4(src, src_stride, row_d, row_e, row_f, row_g);
    src += (4 * src_stride);

    ILVR_B4_SB(row_d, row_c, row_e, row_d, row_f, row_e, row_g, row_f,
               pair_dc, pair_ed, pair_fe, pair_gf);

    /* First two output rows. */
    quad_hi = (v16i8)__msa_xori_b(
        (v16u8)__msa_ilvr_d((v2i64)pair_ed, (v2i64)pair_dc), 128);
    sum_top = FILT_4TAP_DPADD_S_H(quad_lo, quad_hi, tap01, tap23);

    /* Last two output rows; quad_lo is also the seed for the next pass. */
    quad_lo = (v16i8)__msa_xori_b(
        (v16u8)__msa_ilvr_d((v2i64)pair_gf, (v2i64)pair_fe), 128);
    sum_bot = FILT_4TAP_DPADD_S_H(quad_hi, quad_lo, tap01, tap23);

    SRARI_H2_SH(sum_top, sum_bot, VP8_FILTER_SHIFT);
    SAT_SH2_SH(sum_top, sum_bot, 7);
    packed = PCKEV_XORI128_UB(sum_top, sum_bot);
    ST4x4_UB(packed, packed, 0, 1, 2, 3, dst, dst_stride);
    dst += (4 * dst_stride);

    /* The last row read is the first row interleaved on the next pass. */
    row_c = row_g;
  }
}
846
/*
 * Vertical 4-tap filter for 8-pixel-wide blocks.
 *
 * src / src_stride : source pixels and row pitch; reads start one row above
 *                    src
 * dst / dst_stride : destination pixels and row pitch
 * filter           : coefficient bytes; the first two 16-bit lanes are
 *                    broadcast as the two tap pairs
 * height           : number of rows; four rows are produced per pass
 *                    (height >> 2 passes)
 */
static void common_vt_4t_8w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                uint8_t *RESTRICT dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height) {
  uint32_t passes_left = (height >> 2);
  v16i8 row_a, row_b, seam, new0, new1, new2, new3;
  v16i8 pair_ab, pair_bc, pair_s0, pair_01, pair_12, pair_23;
  v16i8 tap01, tap23;
  v8i16 taps, sum0, sum1, sum2, sum3;
  v16u8 px01, px23;

  taps = LD_SH(filter);
  SPLATI_H2_SB(taps, 0, 1, tap01, tap23);

  /* Prime the window with the row above src and the next two rows. */
  src -= src_stride;
  LD_SB3(src, src_stride, row_a, row_b, seam);
  src += (3 * src_stride);

  XORI_B3_128_SB(row_a, row_b, seam);
  ILVR_B2_SB(row_b, row_a, seam, row_b, pair_ab, pair_bc);

  while (passes_left--) {
    LD_SB4(src, src_stride, new0, new1, new2, new3);
    src += (4 * src_stride);

    XORI_B4_128_SB(new0, new1, new2, new3);
    ILVR_B4_SB(new0, seam, new1, new0, new2, new1, new3, new2, pair_s0,
               pair_01, pair_12, pair_23);

    sum0 = FILT_4TAP_DPADD_S_H(pair_ab, pair_s0, tap01, tap23);
    sum1 = FILT_4TAP_DPADD_S_H(pair_bc, pair_01, tap01, tap23);
    sum2 = FILT_4TAP_DPADD_S_H(pair_s0, pair_12, tap01, tap23);
    sum3 = FILT_4TAP_DPADD_S_H(pair_01, pair_23, tap01, tap23);

    SRARI_H4_SH(sum0, sum1, sum2, sum3, VP8_FILTER_SHIFT);
    SAT_SH4_SH(sum0, sum1, sum2, sum3, 7);
    px01 = PCKEV_XORI128_UB(sum0, sum1);
    px23 = PCKEV_XORI128_UB(sum2, sum3);
    ST8x4_UB(px01, px23, dst, dst_stride);
    dst += (4 * dst_stride);

    /* Slide the window down by four rows. */
    pair_ab = pair_12;
    pair_bc = pair_23;
    seam = new3;
  }
}
890
/*
 * Vertical 4-tap filter for 16-pixel-wide blocks.
 *
 * src / src_stride : source pixels and row pitch; reads start one row above
 *                    src
 * dst / dst_stride : destination pixels and row pitch
 * filter           : coefficient bytes; the first two 16-bit lanes are
 *                    broadcast as the two tap pairs
 * height           : number of rows; four rows are produced per pass
 *                    (height >> 2 passes)
 *
 * Row pairs are interleaved twice, once with the right-interleave macros
 * ("lo_" vectors) and once with the left-interleave macros ("hi_" vectors);
 * the two result sets are packed together into 16-byte rows.
 */
static void common_vt_4t_16w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                 uint8_t *RESTRICT dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height) {
  uint32_t passes_left = (height >> 2);
  v16i8 row_a, row_b, seam, new0, new1, new2, new3;
  v16i8 lo_ab, lo_bc, lo_s0, lo_01, lo_12, lo_23;
  v16i8 hi_ab, hi_bc, hi_s0, hi_01, hi_12, hi_23;
  v16i8 tap01, tap23;
  v8i16 taps, lo_sum0, lo_sum1, lo_sum2, lo_sum3;
  v8i16 hi_sum0, hi_sum1, hi_sum2, hi_sum3;
  v16u8 px0, px1, px2, px3;

  taps = LD_SH(filter);
  SPLATI_H2_SB(taps, 0, 1, tap01, tap23);

  /* Prime the window with the row above src and the next two rows. */
  src -= src_stride;
  LD_SB3(src, src_stride, row_a, row_b, seam);
  src += (3 * src_stride);

  XORI_B3_128_SB(row_a, row_b, seam);
  ILVR_B2_SB(row_b, row_a, seam, row_b, lo_ab, lo_bc);
  ILVL_B2_SB(row_b, row_a, seam, row_b, hi_ab, hi_bc);

  while (passes_left--) {
    LD_SB4(src, src_stride, new0, new1, new2, new3);
    src += (4 * src_stride);

    XORI_B4_128_SB(new0, new1, new2, new3);
    ILVR_B4_SB(new0, seam, new1, new0, new2, new1, new3, new2, lo_s0, lo_01,
               lo_12, lo_23);
    ILVL_B4_SB(new0, seam, new1, new0, new2, new1, new3, new2, hi_s0, hi_01,
               hi_12, hi_23);

    lo_sum0 = FILT_4TAP_DPADD_S_H(lo_ab, lo_s0, tap01, tap23);
    lo_sum1 = FILT_4TAP_DPADD_S_H(lo_bc, lo_01, tap01, tap23);
    lo_sum2 = FILT_4TAP_DPADD_S_H(lo_s0, lo_12, tap01, tap23);
    lo_sum3 = FILT_4TAP_DPADD_S_H(lo_01, lo_23, tap01, tap23);
    hi_sum0 = FILT_4TAP_DPADD_S_H(hi_ab, hi_s0, tap01, tap23);
    hi_sum1 = FILT_4TAP_DPADD_S_H(hi_bc, hi_01, tap01, tap23);
    hi_sum2 = FILT_4TAP_DPADD_S_H(hi_s0, hi_12, tap01, tap23);
    hi_sum3 = FILT_4TAP_DPADD_S_H(hi_01, hi_23, tap01, tap23);

    SRARI_H4_SH(lo_sum0, lo_sum1, lo_sum2, lo_sum3, VP8_FILTER_SHIFT);
    SRARI_H4_SH(hi_sum0, hi_sum1, hi_sum2, hi_sum3, VP8_FILTER_SHIFT);
    SAT_SH4_SH(lo_sum0, lo_sum1, lo_sum2, lo_sum3, 7);
    SAT_SH4_SH(hi_sum0, hi_sum1, hi_sum2, hi_sum3, 7);

    PCKEV_B4_UB(hi_sum0, lo_sum0, hi_sum1, lo_sum1, hi_sum2, lo_sum2, hi_sum3,
                lo_sum3, px0, px1, px2, px3);
    XORI_B4_128_UB(px0, px1, px2, px3);
    ST_UB4(px0, px1, px2, px3, dst, dst_stride);
    dst += (4 * dst_stride);

    /* Slide the window down by four rows. */
    lo_ab = lo_12;
    lo_bc = lo_23;
    hi_ab = hi_12;
    hi_bc = hi_23;
    seam = new3;
  }
}
947
/*
 * Two-pass (horizontal 4-tap, then vertical 4-tap) filter for
 * 4-pixel-wide blocks.
 *
 * src / src_stride : source pixels and row pitch; reads start one column
 *                    left of and one row above src
 * dst / dst_stride : destination pixels and row pitch
 * filter_horiz     : horizontal coefficient bytes (two tap pairs are used)
 * filter_vert      : vertical coefficient bytes (two tap pairs are used)
 * height           : number of rows; four rows are produced per pass
 *                    (height >> 2 passes)
 *
 * The "4 width" shuffle mask is used, and every HORIZ_4TAP_FILT call is
 * handed two consecutive source rows.
 */
static void common_hv_4ht_4vt_4w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                     uint8_t *RESTRICT dst, int32_t dst_stride,
                                     const int8_t *filter_horiz,
                                     const int8_t *filter_vert,
                                     int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, filt_hz0, filt_hz1;
  v16u8 mask0, mask1, out;
  v8i16 filt, filt_vt0, filt_vt1, tmp0, tmp1, vec0, vec1, vec2;
  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5;

  mask0 = LD_UB(&vp8_mc_filt_mask_arr[16]);
  /* Back up one column (horizontal taps) and one row (vertical taps). */
  src -= (1 + 1 * src_stride);

  filt = LD_SH(filter_horiz);
  SPLATI_H2_SB(filt, 0, 1, filt_hz0, filt_hz1);

  mask1 = mask0 + 2;

  /* Prime the vertical window with the first three rows. */
  LD_SB3(src, src_stride, src0, src1, src2);
  src += (3 * src_stride);

  XORI_B3_128_SB(src0, src1, src2);
  hz_out0 = HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_hz0, filt_hz1);
  hz_out1 = HORIZ_4TAP_FILT(src1, src2, mask0, mask1, filt_hz0, filt_hz1);
  vec0 = (v8i16)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);

  filt = LD_SH(filter_vert);
  SPLATI_H2_SH(filt, 0, 1, filt_vt0, filt_vt1);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src3, src4, src5, src6);
    src += (4 * src_stride);

    XORI_B2_128_SB(src3, src4);
    hz_out3 = HORIZ_4TAP_FILT(src3, src4, mask0, mask1, filt_hz0, filt_hz1);
    /* Stitch the upper half of the carried result onto the new one. */
    hz_out2 = (v8i16)__msa_sldi_b((v16i8)hz_out3, (v16i8)hz_out1, 8);
    vec1 = (v8i16)__msa_ilvev_b((v16i8)hz_out3, (v16i8)hz_out2);
    tmp0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1);

    XORI_B2_128_SB(src5, src6);
    hz_out5 = HORIZ_4TAP_FILT(src5, src6, mask0, mask1, filt_hz0, filt_hz1);
    hz_out4 = (v8i16)__msa_sldi_b((v16i8)hz_out5, (v16i8)hz_out3, 8);
    vec2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4);
    tmp1 = FILT_4TAP_DPADD_S_H(vec1, vec2, filt_vt0, filt_vt1);

    /* Round with the shared filter shift, matching the horizontal pass and
     * the single-pass kernels, rather than a bare literal. */
    SRARI_H2_SH(tmp0, tmp1, VP8_FILTER_SHIFT);
    SAT_SH2_SH(tmp0, tmp1, 7);
    out = PCKEV_XORI128_UB(tmp0, tmp1);
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
    dst += (4 * dst_stride);

    /* Carry the newest horizontal result and row pairing into the next pass. */
    hz_out1 = hz_out5;
    vec0 = vec2;
  }
}
1004
/*
 * Two-pass (horizontal 4-tap, then vertical 4-tap) filter for
 * 8-pixel-wide blocks.
 *
 * src / src_stride : source pixels and row pitch; reads start one column
 *                    left of and one row above src
 * dst / dst_stride : destination pixels and row pitch
 * filter_horiz     : horizontal coefficient bytes (two tap pairs are used)
 * filter_vert      : vertical coefficient bytes (two tap pairs are used)
 * height           : number of rows; four rows are produced per pass
 *                    (height >> 2 passes)
 *
 * Each source row is filtered horizontally on its own; consecutive
 * horizontal results are byte-interleaved in pairs to feed the vertical taps.
 */
static void common_hv_4ht_4vt_8w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                     uint8_t *RESTRICT dst, int32_t dst_stride,
                                     const int8_t *filter_horiz,
                                     const int8_t *filter_vert,
                                     int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, filt_hz0, filt_hz1;
  v16u8 mask0, mask1, out0, out1;
  v8i16 filt, filt_vt0, filt_vt1, tmp0, tmp1, tmp2, tmp3;
  v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
  v8i16 vec0, vec1, vec2, vec3, vec4;

  mask0 = LD_UB(&vp8_mc_filt_mask_arr[0]);
  /* Back up one column (horizontal taps) and one row (vertical taps). */
  src -= (1 + 1 * src_stride);

  filt = LD_SH(filter_horiz);
  SPLATI_H2_SB(filt, 0, 1, filt_hz0, filt_hz1);

  mask1 = mask0 + 2;

  /* Prime the vertical window with the first three rows. */
  LD_SB3(src, src_stride, src0, src1, src2);
  src += (3 * src_stride);

  XORI_B3_128_SB(src0, src1, src2);
  hz_out0 = HORIZ_4TAP_FILT(src0, src0, mask0, mask1, filt_hz0, filt_hz1);
  hz_out1 = HORIZ_4TAP_FILT(src1, src1, mask0, mask1, filt_hz0, filt_hz1);
  hz_out2 = HORIZ_4TAP_FILT(src2, src2, mask0, mask1, filt_hz0, filt_hz1);
  ILVEV_B2_SH(hz_out0, hz_out1, hz_out1, hz_out2, vec0, vec2);

  filt = LD_SH(filter_vert);
  SPLATI_H2_SH(filt, 0, 1, filt_vt0, filt_vt1);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src3, src4, src5, src6);
    src += (4 * src_stride);

    XORI_B4_128_SB(src3, src4, src5, src6);

    /* Output row 0 of this pass. */
    hz_out3 = HORIZ_4TAP_FILT(src3, src3, mask0, mask1, filt_hz0, filt_hz1);
    vec1 = (v8i16)__msa_ilvev_b((v16i8)hz_out3, (v16i8)hz_out2);
    tmp0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1);

    /* Output row 1. */
    hz_out0 = HORIZ_4TAP_FILT(src4, src4, mask0, mask1, filt_hz0, filt_hz1);
    vec3 = (v8i16)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out3);
    tmp1 = FILT_4TAP_DPADD_S_H(vec2, vec3, filt_vt0, filt_vt1);

    /* Output row 2. */
    hz_out1 = HORIZ_4TAP_FILT(src5, src5, mask0, mask1, filt_hz0, filt_hz1);
    vec4 = (v8i16)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
    tmp2 = FILT_4TAP_DPADD_S_H(vec1, vec4, filt_vt0, filt_vt1);

    /* Output row 3. */
    hz_out2 = HORIZ_4TAP_FILT(src6, src6, mask0, mask1, filt_hz0, filt_hz1);
    ILVEV_B2_SH(hz_out3, hz_out0, hz_out1, hz_out2, vec0, vec1);
    tmp3 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1);

    /* Round with the shared filter shift, matching the horizontal pass and
     * the single-pass kernels, rather than a bare literal. */
    SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, VP8_FILTER_SHIFT);
    SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
    out0 = PCKEV_XORI128_UB(tmp0, tmp1);
    out1 = PCKEV_XORI128_UB(tmp2, tmp3);
    ST8x4_UB(out0, out1, dst, dst_stride);
    dst += (4 * dst_stride);

    /* The two newest row pairings seed the next pass; hz_out2 also carries. */
    vec0 = vec4;
    vec2 = vec1;
  }
}
1069
/*
 * Two-pass (horizontal 4-tap, vertical 4-tap) filter for 16-pixel-wide
 * blocks, done as two 8-wide column strips: columns 0-7 first, then 8-15.
 * All arguments are forwarded unchanged to the 8-wide kernel apart from the
 * column offset applied to src and dst.
 */
static void common_hv_4ht_4vt_16w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                      uint8_t *RESTRICT dst, int32_t dst_stride,
                                      const int8_t *filter_horiz,
                                      const int8_t *filter_vert,
                                      int32_t height) {
  int32_t col;

  for (col = 0; col < 16; col += 8) {
    common_hv_4ht_4vt_8w_msa(src + col, src_stride, dst + col, dst_stride,
                             filter_horiz, filter_vert, height);
  }
}
1083
/*
 * Two-pass (horizontal 6-tap, then vertical 4-tap) filter for
 * 4-pixel-wide blocks.
 *
 * src / src_stride : source pixels and row pitch; reads start two columns
 *                    left of and one row above src
 * dst / dst_stride : destination pixels and row pitch
 * filter_horiz     : horizontal coefficient bytes (three tap pairs are used)
 * filter_vert      : vertical coefficient bytes (two tap pairs are used)
 * height           : number of rows; four rows are produced per pass
 *                    (height >> 2 passes)
 *
 * The "4 width" shuffle mask is used, and every HORIZ_6TAP_FILT call is
 * handed two consecutive source rows.
 */
static void common_hv_6ht_4vt_4w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                     uint8_t *RESTRICT dst, int32_t dst_stride,
                                     const int8_t *filter_horiz,
                                     const int8_t *filter_vert,
                                     int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6;
  v16i8 filt_hz0, filt_hz1, filt_hz2;
  v16u8 res0, res1, mask0, mask1, mask2;
  v8i16 filt, filt_vt0, filt_vt1, tmp0, tmp1, vec0, vec1, vec2;
  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5;

  mask0 = LD_UB(&vp8_mc_filt_mask_arr[16]);
  /* Back up two columns (6 horizontal taps) and one row (4 vertical taps). */
  src -= (2 + 1 * src_stride);

  filt = LD_SH(filter_horiz);
  SPLATI_H3_SB(filt, 0, 1, 2, filt_hz0, filt_hz1, filt_hz2);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;

  /* Prime the vertical window with the first three rows. */
  LD_SB3(src, src_stride, src0, src1, src2);
  src += (3 * src_stride);

  XORI_B3_128_SB(src0, src1, src2);
  hz_out0 = HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2, filt_hz0, filt_hz1,
                            filt_hz2);
  hz_out1 = HORIZ_6TAP_FILT(src1, src2, mask0, mask1, mask2, filt_hz0, filt_hz1,
                            filt_hz2);
  vec0 = (v8i16)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);

  filt = LD_SH(filter_vert);
  SPLATI_H2_SH(filt, 0, 1, filt_vt0, filt_vt1);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src3, src4, src5, src6);
    src += (4 * src_stride);

    XORI_B4_128_SB(src3, src4, src5, src6);
    hz_out3 = HORIZ_6TAP_FILT(src3, src4, mask0, mask1, mask2, filt_hz0,
                              filt_hz1, filt_hz2);
    /* Stitch the upper half of the carried result onto the new one. */
    hz_out2 = (v8i16)__msa_sldi_b((v16i8)hz_out3, (v16i8)hz_out1, 8);
    vec1 = (v8i16)__msa_ilvev_b((v16i8)hz_out3, (v16i8)hz_out2);
    tmp0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1);

    hz_out5 = HORIZ_6TAP_FILT(src5, src6, mask0, mask1, mask2, filt_hz0,
                              filt_hz1, filt_hz2);
    hz_out4 = (v8i16)__msa_sldi_b((v16i8)hz_out5, (v16i8)hz_out3, 8);
    vec2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4);
    tmp1 = FILT_4TAP_DPADD_S_H(vec1, vec2, filt_vt0, filt_vt1);

    /* Round with the shared filter shift, matching the horizontal pass and
     * the single-pass kernels, rather than a bare literal. */
    SRARI_H2_SH(tmp0, tmp1, VP8_FILTER_SHIFT);
    SAT_SH2_SH(tmp0, tmp1, 7);
    PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1);
    XORI_B2_128_UB(res0, res1);
    ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
    dst += (4 * dst_stride);

    /* Carry the newest horizontal result and row pairing into the next pass. */
    hz_out1 = hz_out5;
    vec0 = vec2;
  }
}
1146
/*
 * Two-pass (horizontal 6-tap, then vertical 4-tap) filter for
 * 8-pixel-wide blocks.
 *
 * src / src_stride : source pixels and row pitch; reads start two columns
 *                    left of and one row above src
 * dst / dst_stride : destination pixels and row pitch
 * filter_horiz     : horizontal coefficient bytes (three tap pairs are used)
 * filter_vert      : vertical coefficient bytes (two tap pairs are used)
 * height           : number of rows; four rows are produced per pass
 *                    (height >> 2 passes)
 *
 * Each source row is filtered horizontally on its own; consecutive
 * horizontal results are byte-interleaved in pairs to feed the vertical taps.
 */
static void common_hv_6ht_4vt_8w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                     uint8_t *RESTRICT dst, int32_t dst_stride,
                                     const int8_t *filter_horiz,
                                     const int8_t *filter_vert,
                                     int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6;
  v16i8 filt_hz0, filt_hz1, filt_hz2, mask0, mask1, mask2;
  v8i16 filt, filt_vt0, filt_vt1, hz_out0, hz_out1, hz_out2, hz_out3;
  v8i16 tmp0, tmp1, tmp2, tmp3, vec0, vec1, vec2, vec3;
  v16u8 out0, out1;

  mask0 = LD_SB(&vp8_mc_filt_mask_arr[0]);
  /* Back up two columns (6 horizontal taps) and one row (4 vertical taps). */
  src -= (2 + src_stride);

  filt = LD_SH(filter_horiz);
  SPLATI_H3_SB(filt, 0, 1, 2, filt_hz0, filt_hz1, filt_hz2);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;

  /* Prime the vertical window with the first three rows. */
  LD_SB3(src, src_stride, src0, src1, src2);
  src += (3 * src_stride);

  XORI_B3_128_SB(src0, src1, src2);
  hz_out0 = HORIZ_6TAP_FILT(src0, src0, mask0, mask1, mask2, filt_hz0, filt_hz1,
                            filt_hz2);
  hz_out1 = HORIZ_6TAP_FILT(src1, src1, mask0, mask1, mask2, filt_hz0, filt_hz1,
                            filt_hz2);
  hz_out2 = HORIZ_6TAP_FILT(src2, src2, mask0, mask1, mask2, filt_hz0, filt_hz1,
                            filt_hz2);
  ILVEV_B2_SH(hz_out0, hz_out1, hz_out1, hz_out2, vec0, vec2);

  filt = LD_SH(filter_vert);
  SPLATI_H2_SH(filt, 0, 1, filt_vt0, filt_vt1);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src3, src4, src5, src6);
    src += (4 * src_stride);

    XORI_B4_128_SB(src3, src4, src5, src6);

    /* Output row 0 of this pass. */
    hz_out3 = HORIZ_6TAP_FILT(src3, src3, mask0, mask1, mask2, filt_hz0,
                              filt_hz1, filt_hz2);
    vec1 = (v8i16)__msa_ilvev_b((v16i8)hz_out3, (v16i8)hz_out2);
    tmp0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1);

    /* Output row 1. */
    hz_out0 = HORIZ_6TAP_FILT(src4, src4, mask0, mask1, mask2, filt_hz0,
                              filt_hz1, filt_hz2);
    vec3 = (v8i16)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out3);
    tmp1 = FILT_4TAP_DPADD_S_H(vec2, vec3, filt_vt0, filt_vt1);

    /* Output row 2; vec0 is overwritten here and is already the value the
     * next pass needs. */
    hz_out1 = HORIZ_6TAP_FILT(src5, src5, mask0, mask1, mask2, filt_hz0,
                              filt_hz1, filt_hz2);
    vec0 = (v8i16)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
    tmp2 = FILT_4TAP_DPADD_S_H(vec1, vec0, filt_vt0, filt_vt1);

    /* Output row 3; vec2 likewise ends up holding the next pass's seed. */
    hz_out2 = HORIZ_6TAP_FILT(src6, src6, mask0, mask1, mask2, filt_hz0,
                              filt_hz1, filt_hz2);
    ILVEV_B2_SH(hz_out3, hz_out0, hz_out1, hz_out2, vec1, vec2);
    tmp3 = FILT_4TAP_DPADD_S_H(vec1, vec2, filt_vt0, filt_vt1);

    /* Round with the shared filter shift, matching the horizontal pass and
     * the single-pass kernels, rather than a bare literal. */
    SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, VP8_FILTER_SHIFT);
    SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
    out0 = PCKEV_XORI128_UB(tmp0, tmp1);
    out1 = PCKEV_XORI128_UB(tmp2, tmp3);
    ST8x4_UB(out0, out1, dst, dst_stride);
    dst += (4 * dst_stride);
  }
}
1217
/*
 * Two-pass (horizontal 6-tap, vertical 4-tap) filter for 16-pixel-wide
 * blocks, done as two 8-wide column strips: columns 0-7 first, then 8-15.
 * All arguments are forwarded unchanged to the 8-wide kernel apart from the
 * column offset applied to src and dst.
 */
static void common_hv_6ht_4vt_16w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                      uint8_t *RESTRICT dst, int32_t dst_stride,
                                      const int8_t *filter_horiz,
                                      const int8_t *filter_vert,
                                      int32_t height) {
  int32_t col;

  for (col = 0; col < 16; col += 8) {
    common_hv_6ht_4vt_8w_msa(src + col, src_stride, dst + col, dst_stride,
                             filter_horiz, filter_vert, height);
  }
}
1231
/*
 * Two-pass (horizontal 4-tap, then vertical 6-tap) filter for
 * 4-pixel-wide blocks.
 *
 * src / src_stride : source pixels and row pitch; reads start one column
 *                    left of and two rows above src
 * dst / dst_stride : destination pixels and row pitch
 * filter_horiz     : horizontal coefficient bytes (two tap pairs are used)
 * filter_vert      : vertical coefficient bytes (three tap pairs are used)
 * height           : number of rows; four rows are produced per pass
 *                    (height >> 2 passes)
 *
 * The "4 width" shuffle mask is used, and every HORIZ_4TAP_FILT call is
 * handed two consecutive source rows.
 */
static void common_hv_4ht_6vt_4w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                     uint8_t *RESTRICT dst, int32_t dst_stride,
                                     const int8_t *filter_horiz,
                                     const int8_t *filter_vert,
                                     int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
  v16i8 filt_hz0, filt_hz1, mask0, mask1;
  v16u8 out;
  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
  v8i16 hz_out7, tmp0, tmp1, out0, out1, out2, out3;
  v8i16 filt, filt_vt0, filt_vt1, filt_vt2;

  mask0 = LD_SB(&vp8_mc_filt_mask_arr[16]);

  /* Back up one column (4 horizontal taps) and two rows (6 vertical taps). */
  src -= (1 + 2 * src_stride);

  filt = LD_SH(filter_horiz);
  SPLATI_H2_SB(filt, 0, 1, filt_hz0, filt_hz1);

  mask1 = mask0 + 2;

  /* Prime the vertical window with the first five rows. */
  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
  src += (5 * src_stride);

  XORI_B5_128_SB(src0, src1, src2, src3, src4);
  hz_out0 = HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_hz0, filt_hz1);
  hz_out2 = HORIZ_4TAP_FILT(src2, src3, mask0, mask1, filt_hz0, filt_hz1);
  hz_out3 = HORIZ_4TAP_FILT(src3, src4, mask0, mask1, filt_hz0, filt_hz1);
  /* Build the in-between pairing from the halves of its two neighbours. */
  hz_out1 = (v8i16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
  ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);

  filt = LD_SH(filter_vert);
  SPLATI_H3_SH(filt, 0, 1, 2, filt_vt0, filt_vt1, filt_vt2);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src5, src6, src7, src8);
    XORI_B4_128_SB(src5, src6, src7, src8);
    src += (4 * src_stride);

    hz_out5 = HORIZ_4TAP_FILT(src5, src6, mask0, mask1, filt_hz0, filt_hz1);
    hz_out4 = (v8i16)__msa_sldi_b((v16i8)hz_out5, (v16i8)hz_out3, 8);
    out2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4);
    tmp0 = DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);

    hz_out7 = HORIZ_4TAP_FILT(src7, src8, mask0, mask1, filt_hz0, filt_hz1);
    hz_out6 = (v8i16)__msa_sldi_b((v16i8)hz_out7, (v16i8)hz_out5, 8);
    out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
    tmp1 = DPADD_SH3_SH(out1, out2, out3, filt_vt0, filt_vt1, filt_vt2);

    /* Round with the shared filter shift, matching the horizontal pass and
     * the single-pass kernels, rather than a bare literal. */
    SRARI_H2_SH(tmp0, tmp1, VP8_FILTER_SHIFT);
    SAT_SH2_SH(tmp0, tmp1, 7);
    out = PCKEV_XORI128_UB(tmp0, tmp1);
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
    dst += (4 * dst_stride);

    /* Slide the vertical window down by four rows. */
    hz_out3 = hz_out7;
    out0 = out2;
    out1 = out3;
  }
}
1293
/*
 * Two-pass (horizontal 4-tap, then vertical 6-tap) filter for
 * 8-pixel-wide blocks.
 *
 * src / src_stride : source pixels and row pitch; reads start one column
 *                    left of and two rows above src
 * dst / dst_stride : destination pixels and row pitch
 * filter_horiz     : horizontal coefficient bytes (two tap pairs are used)
 * filter_vert      : vertical coefficient bytes (three tap pairs are used)
 * height           : number of rows; four rows are produced per pass
 *                    (height >> 2 passes)
 *
 * Each source row is filtered horizontally on its own; consecutive
 * horizontal results are byte-interleaved in pairs to feed the vertical taps.
 */
static void common_hv_4ht_6vt_8w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                     uint8_t *RESTRICT dst, int32_t dst_stride,
                                     const int8_t *filter_horiz,
                                     const int8_t *filter_vert,
                                     int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
  v16i8 filt_hz0, filt_hz1, mask0, mask1;
  v8i16 filt, filt_vt0, filt_vt1, filt_vt2, tmp0, tmp1, tmp2, tmp3;
  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
  v8i16 hz_out7, hz_out8, out0, out1, out2, out3, out4, out5, out6, out7;
  v16u8 vec0, vec1;

  mask0 = LD_SB(&vp8_mc_filt_mask_arr[0]);
  /* Back up one column (4 horizontal taps) and two rows (6 vertical taps). */
  src -= (1 + 2 * src_stride);

  filt = LD_SH(filter_horiz);
  SPLATI_H2_SB(filt, 0, 1, filt_hz0, filt_hz1);

  mask1 = mask0 + 2;

  /* Prime the vertical window with the first five rows. */
  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
  src += (5 * src_stride);

  XORI_B5_128_SB(src0, src1, src2, src3, src4);
  hz_out0 = HORIZ_4TAP_FILT(src0, src0, mask0, mask1, filt_hz0, filt_hz1);
  hz_out1 = HORIZ_4TAP_FILT(src1, src1, mask0, mask1, filt_hz0, filt_hz1);
  hz_out2 = HORIZ_4TAP_FILT(src2, src2, mask0, mask1, filt_hz0, filt_hz1);
  hz_out3 = HORIZ_4TAP_FILT(src3, src3, mask0, mask1, filt_hz0, filt_hz1);
  hz_out4 = HORIZ_4TAP_FILT(src4, src4, mask0, mask1, filt_hz0, filt_hz1);
  /* out0/out1 pair rows for even output rows, out3/out4 for odd ones. */
  ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
  ILVEV_B2_SH(hz_out1, hz_out2, hz_out3, hz_out4, out3, out4);

  filt = LD_SH(filter_vert);
  SPLATI_H3_SH(filt, 0, 1, 2, filt_vt0, filt_vt1, filt_vt2);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src5, src6, src7, src8);
    src += (4 * src_stride);

    XORI_B4_128_SB(src5, src6, src7, src8);

    /* Output row 0 of this pass. */
    hz_out5 = HORIZ_4TAP_FILT(src5, src5, mask0, mask1, filt_hz0, filt_hz1);
    out2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4);
    tmp0 = DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);

    /* Output row 1. */
    hz_out6 = HORIZ_4TAP_FILT(src6, src6, mask0, mask1, filt_hz0, filt_hz1);
    out5 = (v8i16)__msa_ilvev_b((v16i8)hz_out6, (v16i8)hz_out5);
    tmp1 = DPADD_SH3_SH(out3, out4, out5, filt_vt0, filt_vt1, filt_vt2);

    /* Output row 2. */
    hz_out7 = HORIZ_4TAP_FILT(src7, src7, mask0, mask1, filt_hz0, filt_hz1);
    out6 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
    tmp2 = DPADD_SH3_SH(out1, out2, out6, filt_vt0, filt_vt1, filt_vt2);

    /* Output row 3. */
    hz_out8 = HORIZ_4TAP_FILT(src8, src8, mask0, mask1, filt_hz0, filt_hz1);
    out7 = (v8i16)__msa_ilvev_b((v16i8)hz_out8, (v16i8)hz_out7);
    tmp3 = DPADD_SH3_SH(out4, out5, out7, filt_vt0, filt_vt1, filt_vt2);

    /* Round with the shared filter shift, matching the horizontal pass and
     * the single-pass kernels, rather than a bare literal. */
    SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, VP8_FILTER_SHIFT);
    SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
    vec0 = PCKEV_XORI128_UB(tmp0, tmp1);
    vec1 = PCKEV_XORI128_UB(tmp2, tmp3);
    ST8x4_UB(vec0, vec1, dst, dst_stride);
    dst += (4 * dst_stride);

    /* Slide the vertical window down by four rows. */
    hz_out4 = hz_out8;
    out0 = out2;
    out1 = out6;
    out3 = out5;
    out4 = out7;
  }
}
1366
/*
 * Two-pass (horizontal 4-tap, vertical 6-tap) filter for 16-pixel-wide
 * blocks, done as two 8-wide column strips: columns 0-7 first, then 8-15.
 * All arguments are forwarded unchanged to the 8-wide kernel apart from the
 * column offset applied to src and dst.
 */
static void common_hv_4ht_6vt_16w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                      uint8_t *RESTRICT dst, int32_t dst_stride,
                                      const int8_t *filter_horiz,
                                      const int8_t *filter_vert,
                                      int32_t height) {
  int32_t col;

  for (col = 0; col < 16; col += 8) {
    common_hv_4ht_6vt_8w_msa(src + col, src_stride, dst + col, dst_stride,
                             filter_horiz, filter_vert, height);
  }
}
1380
/*
 * Sub-pixel prediction of a 4x4 block.
 *
 * src / src_stride : reference pixels and row pitch
 * xoffset, yoffset : sub-pixel position on each axis; 0 means no filtering
 *                    on that axis, 1..7 select row (offset - 1) of
 *                    vp8_subpel_filters_msa
 * dst / dst_stride : destination pixels and row pitch
 *
 * Even offsets (2, 4, 6) use the 6-tap kernels. Odd offsets (1, 3, 5, 7)
 * select table rows whose first and last taps are zero, so they use the
 * 4-tap kernels with the filter pointer advanced past the leading zero.
 * With both offsets zero the block is copied unchanged.
 */
void vp8_sixtap_predict4x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
                               int32_t xoffset, int32_t yoffset,
                               uint8_t *RESTRICT dst, int32_t dst_stride) {
  /* Offset 0 has no table row. Clamp the index so that no pointer to the
   * row before the table is ever formed; the pointer is unused in that
   * case. */
  const int8_t *h_filter = vp8_subpel_filters_msa[xoffset ? xoffset - 1 : 0];
  const int8_t *v_filter = vp8_subpel_filters_msa[yoffset ? yoffset - 1 : 0];

  if (yoffset) {
    if (xoffset) {
      /* Filtering on both axes. */
      switch (xoffset) {
        case 2:
        case 4:
        case 6:
          switch (yoffset) {
            case 2:
            case 4:
            case 6:
              common_hv_6ht_6vt_4w_msa(src, src_stride, dst, dst_stride,
                                       h_filter, v_filter, 4);
              break;

            case 1:
            case 3:
            case 5:
            case 7:
              common_hv_6ht_4vt_4w_msa(src, src_stride, dst, dst_stride,
                                       h_filter, v_filter + 1, 4);
              break;

            default: break;
          }
          break;

        case 1:
        case 3:
        case 5:
        case 7:
          switch (yoffset) {
            case 2:
            case 4:
            case 6:
              common_hv_4ht_6vt_4w_msa(src, src_stride, dst, dst_stride,
                                       h_filter + 1, v_filter, 4);
              break;

            case 1:
            case 3:
            case 5:
            case 7:
              common_hv_4ht_4vt_4w_msa(src, src_stride, dst, dst_stride,
                                       h_filter + 1, v_filter + 1, 4);
              break;

            default: break;
          }
          break;

        default: break;
      }
    } else {
      /* Vertical filtering only. */
      switch (yoffset) {
        case 2:
        case 4:
        case 6:
          common_vt_6t_4w_msa(src, src_stride, dst, dst_stride, v_filter, 4);
          break;

        case 1:
        case 3:
        case 5:
        case 7:
          common_vt_4t_4w_msa(src, src_stride, dst, dst_stride, v_filter + 1,
                              4);
          break;

        default: break;
      }
    }
  } else {
    /* Horizontal filtering only, or a plain copy. */
    switch (xoffset) {
      case 0: {
        uint32_t tp0, tp1, tp2, tp3;

        /* Copy four rows, one 32-bit word per row. */
        LW4(src, src_stride, tp0, tp1, tp2, tp3);
        SW4(tp0, tp1, tp2, tp3, dst, dst_stride);
        break;
      }
      case 2:
      case 4:
      case 6:
        common_hz_6t_4w_msa(src, src_stride, dst, dst_stride, h_filter, 4);
        break;

      case 1:
      case 3:
      case 5:
      case 7:
        common_hz_4t_4w_msa(src, src_stride, dst, dst_stride, h_filter + 1, 4);
        break;

      default: break;
    }
  }
}
1474
/*
 * Sub-pixel prediction of an 8-wide, 4-high block.
 *
 * src / src_stride : reference pixels and row pitch
 * xoffset, yoffset : sub-pixel position on each axis; 0 means no filtering
 *                    on that axis, 1..7 select row (offset - 1) of
 *                    vp8_subpel_filters_msa
 * dst / dst_stride : destination pixels and row pitch
 *
 * Even offsets (2, 4, 6) use the 6-tap kernels. Odd offsets (1, 3, 5, 7)
 * select table rows whose first and last taps are zero, so they use the
 * 4-tap kernels with the filter pointer advanced past the leading zero.
 * With both offsets zero the block is copied via vp8_copy_mem8x4.
 */
void vp8_sixtap_predict8x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
                               int32_t xoffset, int32_t yoffset,
                               uint8_t *RESTRICT dst, int32_t dst_stride) {
  /* Offset 0 has no table row. Clamp the index so that no pointer to the
   * row before the table is ever formed; the pointer is unused in that
   * case. */
  const int8_t *h_filter = vp8_subpel_filters_msa[xoffset ? xoffset - 1 : 0];
  const int8_t *v_filter = vp8_subpel_filters_msa[yoffset ? yoffset - 1 : 0];

  if (yoffset) {
    if (xoffset) {
      /* Filtering on both axes. */
      switch (xoffset) {
        case 2:
        case 4:
        case 6:
          switch (yoffset) {
            case 2:
            case 4:
            case 6:
              common_hv_6ht_6vt_8w_msa(src, src_stride, dst, dst_stride,
                                       h_filter, v_filter, 4);
              break;

            case 1:
            case 3:
            case 5:
            case 7:
              common_hv_6ht_4vt_8w_msa(src, src_stride, dst, dst_stride,
                                       h_filter, v_filter + 1, 4);
              break;

            default: break;
          }
          break;

        case 1:
        case 3:
        case 5:
        case 7:
          switch (yoffset) {
            case 2:
            case 4:
            case 6:
              common_hv_4ht_6vt_8w_msa(src, src_stride, dst, dst_stride,
                                       h_filter + 1, v_filter, 4);
              break;

            case 1:
            case 3:
            case 5:
            case 7:
              common_hv_4ht_4vt_8w_msa(src, src_stride, dst, dst_stride,
                                       h_filter + 1, v_filter + 1, 4);
              break;

            default: break;
          }
          break;

        default: break;
      }
    } else {
      /* Vertical filtering only. */
      switch (yoffset) {
        case 2:
        case 4:
        case 6:
          common_vt_6t_8w_msa(src, src_stride, dst, dst_stride, v_filter, 4);
          break;

        case 1:
        case 3:
        case 5:
        case 7:
          common_vt_4t_8w_msa(src, src_stride, dst, dst_stride, v_filter + 1,
                              4);
          break;

        default: break;
      }
    }
  } else {
    /* Horizontal filtering only, or a plain copy. */
    switch (xoffset) {
      case 0: vp8_copy_mem8x4(src, src_stride, dst, dst_stride); break;
      case 2:
      case 4:
      case 6:
        common_hz_6t_8w_msa(src, src_stride, dst, dst_stride, h_filter, 4);
        break;

      case 1:
      case 3:
      case 5:
      case 7:
        common_hz_4t_8w_msa(src, src_stride, dst, dst_stride, h_filter + 1, 4);
        break;

      default: break;
    }
  }
}
1562
/*
 * Sub-pixel prediction of an 8x8 block.
 *
 * src / src_stride : reference pixels and row pitch
 * xoffset, yoffset : sub-pixel position on each axis; 0 means no filtering
 *                    on that axis, 1..7 select row (offset - 1) of
 *                    vp8_subpel_filters_msa
 * dst / dst_stride : destination pixels and row pitch
 *
 * Even offsets (2, 4, 6) use the 6-tap kernels. Odd offsets (1, 3, 5, 7)
 * select table rows whose first and last taps are zero, so they use the
 * 4-tap kernels with the filter pointer advanced past the leading zero.
 * With both offsets zero the block is copied via vp8_copy_mem8x8.
 */
void vp8_sixtap_predict8x8_msa(uint8_t *RESTRICT src, int32_t src_stride,
                               int32_t xoffset, int32_t yoffset,
                               uint8_t *RESTRICT dst, int32_t dst_stride) {
  /* Offset 0 has no table row. Clamp the index so that no pointer to the
   * row before the table is ever formed; the pointer is unused in that
   * case. */
  const int8_t *h_filter = vp8_subpel_filters_msa[xoffset ? xoffset - 1 : 0];
  const int8_t *v_filter = vp8_subpel_filters_msa[yoffset ? yoffset - 1 : 0];

  if (yoffset) {
    if (xoffset) {
      /* Filtering on both axes. */
      switch (xoffset) {
        case 2:
        case 4:
        case 6:
          switch (yoffset) {
            case 2:
            case 4:
            case 6:
              common_hv_6ht_6vt_8w_msa(src, src_stride, dst, dst_stride,
                                       h_filter, v_filter, 8);
              break;

            case 1:
            case 3:
            case 5:
            case 7:
              common_hv_6ht_4vt_8w_msa(src, src_stride, dst, dst_stride,
                                       h_filter, v_filter + 1, 8);
              break;

            default: break;
          }
          break;

        case 1:
        case 3:
        case 5:
        case 7:
          switch (yoffset) {
            case 2:
            case 4:
            case 6:
              common_hv_4ht_6vt_8w_msa(src, src_stride, dst, dst_stride,
                                       h_filter + 1, v_filter, 8);
              break;

            case 1:
            case 3:
            case 5:
            case 7:
              common_hv_4ht_4vt_8w_msa(src, src_stride, dst, dst_stride,
                                       h_filter + 1, v_filter + 1, 8);
              break;

            default: break;
          }
          break;

        default: break;
      }
    } else {
      /* Vertical filtering only. */
      switch (yoffset) {
        case 2:
        case 4:
        case 6:
          common_vt_6t_8w_msa(src, src_stride, dst, dst_stride, v_filter, 8);
          break;

        case 1:
        case 3:
        case 5:
        case 7:
          common_vt_4t_8w_msa(src, src_stride, dst, dst_stride, v_filter + 1,
                              8);
          break;

        default: break;
      }
    }
  } else {
    /* Horizontal filtering only, or a plain copy. */
    switch (xoffset) {
      case 0: vp8_copy_mem8x8(src, src_stride, dst, dst_stride); break;
      case 2:
      case 4:
      case 6:
        common_hz_6t_8w_msa(src, src_stride, dst, dst_stride, h_filter, 8);
        break;

      case 1:
      case 3:
      case 5:
      case 7:
        common_hz_4t_8w_msa(src, src_stride, dst, dst_stride, h_filter + 1, 8);
        break;

      default: break;
    }
  }
}
1650
/*
 * Sub-pixel prediction of a 16x16 block.
 *
 * src / src_stride : reference pixels and row pitch
 * xoffset, yoffset : sub-pixel position on each axis; 0 means no filtering
 *                    on that axis, 1..7 select row (offset - 1) of
 *                    vp8_subpel_filters_msa
 * dst / dst_stride : destination pixels and row pitch
 *
 * Even offsets (2, 4, 6) use the 6-tap kernels. Odd offsets (1, 3, 5, 7)
 * select table rows whose first and last taps are zero, so they use the
 * 4-tap kernels with the filter pointer advanced past the leading zero.
 * With both offsets zero the block is copied via vp8_copy_mem16x16.
 */
void vp8_sixtap_predict16x16_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                 int32_t xoffset, int32_t yoffset,
                                 uint8_t *RESTRICT dst, int32_t dst_stride) {
  /* Offset 0 has no table row. Clamp the index so that no pointer to the
   * row before the table is ever formed; the pointer is unused in that
   * case. */
  const int8_t *h_filter = vp8_subpel_filters_msa[xoffset ? xoffset - 1 : 0];
  const int8_t *v_filter = vp8_subpel_filters_msa[yoffset ? yoffset - 1 : 0];

  if (yoffset) {
    if (xoffset) {
      /* Filtering on both axes. */
      switch (xoffset) {
        case 2:
        case 4:
        case 6:
          switch (yoffset) {
            case 2:
            case 4:
            case 6:
              common_hv_6ht_6vt_16w_msa(src, src_stride, dst, dst_stride,
                                        h_filter, v_filter, 16);
              break;

            case 1:
            case 3:
            case 5:
            case 7:
              common_hv_6ht_4vt_16w_msa(src, src_stride, dst, dst_stride,
                                        h_filter, v_filter + 1, 16);
              break;

            default: break;
          }
          break;

        case 1:
        case 3:
        case 5:
        case 7:
          switch (yoffset) {
            case 2:
            case 4:
            case 6:
              common_hv_4ht_6vt_16w_msa(src, src_stride, dst, dst_stride,
                                        h_filter + 1, v_filter, 16);
              break;

            case 1:
            case 3:
            case 5:
            case 7:
              common_hv_4ht_4vt_16w_msa(src, src_stride, dst, dst_stride,
                                        h_filter + 1, v_filter + 1, 16);
              break;

            default: break;
          }
          break;

        default: break;
      }
    } else {
      /* Vertical filtering only. */
      switch (yoffset) {
        case 2:
        case 4:
        case 6:
          common_vt_6t_16w_msa(src, src_stride, dst, dst_stride, v_filter, 16);
          break;

        case 1:
        case 3:
        case 5:
        case 7:
          common_vt_4t_16w_msa(src, src_stride, dst, dst_stride, v_filter + 1,
                               16);
          break;

        default: break;
      }
    }
  } else {
    /* Horizontal filtering only, or a plain copy. */
    switch (xoffset) {
      case 0: vp8_copy_mem16x16(src, src_stride, dst, dst_stride); break;
      case 2:
      case 4:
      case 6:
        common_hz_6t_16w_msa(src, src_stride, dst, dst_stride, h_filter, 16);
        break;

      case 1:
      case 3:
      case 5:
      case 7:
        common_hz_4t_16w_msa(src, src_stride, dst, dst_stride, h_filter + 1,
                             16);
        break;

      default: break;
    }
  }
}
1739