xref: /aosp_15_r20/external/libvpx/vp8/common/mips/msa/sixtap_filter_msa.c (revision fb1b10ab9aebc7c7068eedab379b749d7e3900be)
1 /*
2  *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include "./vp8_rtcd.h"
12 #include "vpx_ports/mem.h"
13 #include "vp8/common/filter.h"
14 #include "vp8/common/mips/msa/vp8_macros_msa.h"
15 
16 DECLARE_ALIGNED(16, static const int8_t, vp8_subpel_filters_msa[7][8]) = {
17   { 0, -6, 123, 12, -1, 0, 0, 0 },
18   { 2, -11, 108, 36, -8, 1, 0, 0 }, /* New 1/4 pel 6 tap filter */
19   { 0, -9, 93, 50, -6, 0, 0, 0 },
20   { 3, -16, 77, 77, -16, 3, 0, 0 }, /* New 1/2 pel 6 tap filter */
21   { 0, -6, 50, 93, -9, 0, 0, 0 },
22   { 1, -8, 36, 108, -11, 2, 0, 0 }, /* New 1/4 pel 6 tap filter */
23   { 0, -1, 12, 123, -6, 0, 0, 0 },
24 };
25 
/*
 * Byte-shuffle index patterns, 16 indices per pattern.  Each pattern lists
 * overlapping pairs of neighbouring bytes (n, n + 1).  The filter routines
 * load one pattern and derive the patterns for the later taps by adding 2
 * and 4 to every index.
 */
static const uint8_t vp8_mc_filt_mask_arr[3 * 16] = {
  /* offset 0: loaded by the 8-pixel-wide paths */
  0, 1, 1, 2, 2, 3, 3, 4,
  4, 5, 5, 6, 6, 7, 7, 8,
  /* offset 16: loaded by the 4-pixel-wide paths; the upper eight indices
   * are >= 16 (presumably selecting from the second shuffle operand) */
  0, 1, 1, 2, 2, 3, 3, 4,
  16, 17, 17, 18, 18, 19, 19, 20,
  /* offset 32: second 4-pixel-wide pattern, starting at byte 8 */
  8, 9, 9, 10, 10, 11, 11, 12,
  24, 25, 25, 26, 26, 27, 27, 28
};
34 
/*
 * HORIZ_6TAP_FILT - horizontal 6-tap filter over one pair of source vectors.
 *
 * A GNU statement expression that evaluates to a v8i16.  It:
 *   1. builds three shuffled byte vectors from (src0, src1), one per mask
 *      (mask0, mask1, mask2), via VSHF_B3_SB;
 *   2. combines them with filt_h0, filt_h1, filt_h2 through DPADD_SH3_SH;
 *   3. applies a rounding arithmetic right shift by VP8_FILTER_SHIFT;
 *   4. saturates each halfword with __msa_sat_s_h(..., 7), i.e. to the
 *      signed 8-bit range.
 *
 * Arguments are expanded textually; src0/src1 appear three times each, so
 * pass plain variables rather than expressions with side effects.
 */
#define HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2, filt_h0, filt_h1, \
                        filt_h2)                                           \
  ({                                                                       \
    v16i8 _6tap_vec0_m, _6tap_vec1_m, _6tap_vec2_m;                        \
    v8i16 _6tap_out_m;                                                     \
                                                                           \
    VSHF_B3_SB(src0, src1, src0, src1, src0, src1, mask0, mask1, mask2,    \
               _6tap_vec0_m, _6tap_vec1_m, _6tap_vec2_m);                  \
    _6tap_out_m = DPADD_SH3_SH(_6tap_vec0_m, _6tap_vec1_m, _6tap_vec2_m,   \
                               filt_h0, filt_h1, filt_h2);                 \
                                                                           \
    _6tap_out_m = __msa_srari_h(_6tap_out_m, VP8_FILTER_SHIFT);            \
    _6tap_out_m = __msa_sat_s_h(_6tap_out_m, 7);                           \
                                                                           \
    _6tap_out_m;                                                           \
  })
51 
/*
 * HORIZ_6TAP_4WID_4VECS_FILT - horizontal 6-tap accumulation for four source
 * vectors, used by the 4-pixel-wide paths.
 *
 * Statement macro (a braced block, no value).  The sources are shuffled in
 * pairs, (src0, src1) and (src2, src3), once per mask.  The mask0 shuffles
 * are multiplied with filt0 by DOTP_SB2_SH to initialise out0/out1; the
 * mask1 and mask2 shuffles are then accumulated with filt1 and filt2 by
 * DPADD_SB2_SH.
 *
 * out0 and out1 are overwritten.  No rounding or saturation is done here;
 * the callers in this file apply SRARI/SAT themselves afterwards.
 */
#define HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,   \
                                   mask2, filt0, filt1, filt2, out0, out1) \
  {                                                                        \
    v16i8 _6tap_4wid_vec0_m, _6tap_4wid_vec1_m, _6tap_4wid_vec2_m,         \
        _6tap_4wid_vec3_m, _6tap_4wid_vec4_m, _6tap_4wid_vec5_m;           \
                                                                           \
    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, _6tap_4wid_vec0_m,    \
               _6tap_4wid_vec1_m);                                         \
    DOTP_SB2_SH(_6tap_4wid_vec0_m, _6tap_4wid_vec1_m, filt0, filt0, out0,  \
                out1);                                                     \
    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, _6tap_4wid_vec2_m,    \
               _6tap_4wid_vec3_m);                                         \
    DPADD_SB2_SH(_6tap_4wid_vec2_m, _6tap_4wid_vec3_m, filt1, filt1, out0, \
                 out1);                                                    \
    VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, _6tap_4wid_vec4_m,    \
               _6tap_4wid_vec5_m);                                         \
    DPADD_SB2_SH(_6tap_4wid_vec4_m, _6tap_4wid_vec5_m, filt2, filt2, out0, \
                 out1);                                                    \
  }
71 
/*
 * HORIZ_6TAP_8WID_4VECS_FILT - horizontal 6-tap accumulation for four source
 * vectors, used by the 8- and 16-pixel-wide paths.
 *
 * Statement macro (a braced block, no value).  Unlike the 4-wide variant,
 * every source vector is shuffled with itself, so each of src0..src3 yields
 * its own output (out0..out3).  The mask0 shuffles are multiplied with filt0
 * by DOTP_SB4_SH; the mask1 and mask2 shuffles are then accumulated with
 * filt1 and filt2 by DPADD_SB4_SH.  The vec0..vec3 temporaries are reused
 * for the mask1 shuffles once the mask0 products have been taken.
 *
 * out0..out3 are overwritten.  No rounding or saturation is done here; the
 * callers in this file apply SRARI/SAT themselves afterwards.
 */
#define HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,    \
                                   mask2, filt0, filt1, filt2, out0, out1,  \
                                   out2, out3)                              \
  {                                                                         \
    v16i8 _6tap_8wid_vec0_m, _6tap_8wid_vec1_m, _6tap_8wid_vec2_m,          \
        _6tap_8wid_vec3_m, _6tap_8wid_vec4_m, _6tap_8wid_vec5_m,            \
        _6tap_8wid_vec6_m, _6tap_8wid_vec7_m;                               \
                                                                            \
    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, _6tap_8wid_vec0_m,     \
               _6tap_8wid_vec1_m);                                          \
    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, _6tap_8wid_vec2_m,     \
               _6tap_8wid_vec3_m);                                          \
    DOTP_SB4_SH(_6tap_8wid_vec0_m, _6tap_8wid_vec1_m, _6tap_8wid_vec2_m,    \
                _6tap_8wid_vec3_m, filt0, filt0, filt0, filt0, out0, out1,  \
                out2, out3);                                                \
    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, _6tap_8wid_vec0_m,     \
               _6tap_8wid_vec1_m);                                          \
    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, _6tap_8wid_vec2_m,     \
               _6tap_8wid_vec3_m);                                          \
    VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, _6tap_8wid_vec4_m,     \
               _6tap_8wid_vec5_m);                                          \
    VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, _6tap_8wid_vec6_m,     \
               _6tap_8wid_vec7_m);                                          \
    DPADD_SB4_SH(_6tap_8wid_vec0_m, _6tap_8wid_vec1_m, _6tap_8wid_vec2_m,   \
                 _6tap_8wid_vec3_m, filt1, filt1, filt1, filt1, out0, out1, \
                 out2, out3);                                               \
    DPADD_SB4_SH(_6tap_8wid_vec4_m, _6tap_8wid_vec5_m, _6tap_8wid_vec6_m,   \
                 _6tap_8wid_vec7_m, filt2, filt2, filt2, filt2, out0, out1, \
                 out2, out3);                                               \
  }
102 
/*
 * FILT_4TAP_DPADD_S_H - two-stage signed byte dot product.
 *
 * A GNU statement expression that evaluates to a v8i16: the signed dot
 * product of vec0 with filt0 (__msa_dotp_s_h), plus the signed dot product
 * of vec1 with filt1 accumulated on top (__msa_dpadd_s_h).  All four
 * arguments are cast to v16i8.  No rounding or saturation is applied.
 */
#define FILT_4TAP_DPADD_S_H(vec0, vec1, filt0, filt1)                 \
  ({                                                                  \
    v8i16 _4tap_dpadd_tmp0;                                           \
                                                                      \
    _4tap_dpadd_tmp0 = __msa_dotp_s_h((v16i8)vec0, (v16i8)filt0);     \
    _4tap_dpadd_tmp0 =                                                \
        __msa_dpadd_s_h(_4tap_dpadd_tmp0, (v16i8)vec1, (v16i8)filt1); \
                                                                      \
    _4tap_dpadd_tmp0;                                                 \
  })
113 
/*
 * HORIZ_4TAP_FILT - horizontal 4-tap filter over one pair of source vectors.
 *
 * A GNU statement expression that evaluates to a v8i16.  (src0, src1) is
 * shuffled once with mask0 and once with mask1 (VSHF_B2_SB).  The two
 * results are combined with filt_h0/filt_h1 by FILT_4TAP_DPADD_S_H, shifted
 * right with rounding by VP8_FILTER_SHIFT, and saturated with
 * __msa_sat_s_h(..., 7), i.e. to the signed 8-bit range.
 *
 * Arguments are expanded textually; src0/src1 appear twice each.
 */
#define HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_h0, filt_h1)        \
  ({                                                                       \
    v16i8 _4tap_vec0_m, _4tap_vec1_m;                                      \
    v8i16 _4tap_out_m;                                                     \
                                                                           \
    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, _4tap_vec0_m,         \
               _4tap_vec1_m);                                              \
    _4tap_out_m =                                                          \
        FILT_4TAP_DPADD_S_H(_4tap_vec0_m, _4tap_vec1_m, filt_h0, filt_h1); \
                                                                           \
    _4tap_out_m = __msa_srari_h(_4tap_out_m, VP8_FILTER_SHIFT);            \
    _4tap_out_m = __msa_sat_s_h(_4tap_out_m, 7);                           \
                                                                           \
    _4tap_out_m;                                                           \
  })
129 
/*
 * HORIZ_4TAP_4WID_4VECS_FILT - horizontal 4-tap accumulation for four source
 * vectors, 4-pixel-wide layout.
 *
 * Statement macro (a braced block, no value).  The sources are shuffled in
 * pairs, (src0, src1) and (src2, src3).  The mask0 shuffles are multiplied
 * with filt0 by DOTP_SB2_SH to initialise out0/out1, then the mask1 shuffles
 * are accumulated with filt1 by DPADD_SB2_SH.
 *
 * out0 and out1 are overwritten.  No rounding or saturation is done inside
 * this macro.
 */
#define HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,   \
                                   filt0, filt1, out0, out1)               \
  {                                                                        \
    v16i8 _4tap_4wid_vec0_m, _4tap_4wid_vec1_m, _4tap_4wid_vec2_m,         \
        _4tap_4wid_vec3_m;                                                 \
                                                                           \
    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, _4tap_4wid_vec0_m,    \
               _4tap_4wid_vec1_m);                                         \
    DOTP_SB2_SH(_4tap_4wid_vec0_m, _4tap_4wid_vec1_m, filt0, filt0, out0,  \
                out1);                                                     \
    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, _4tap_4wid_vec2_m,    \
               _4tap_4wid_vec3_m);                                         \
    DPADD_SB2_SH(_4tap_4wid_vec2_m, _4tap_4wid_vec3_m, filt1, filt1, out0, \
                 out1);                                                    \
  }
145 
/*
 * HORIZ_4TAP_8WID_4VECS_FILT - horizontal 4-tap accumulation for four source
 * vectors, 8-pixel-wide layout.
 *
 * Statement macro (a braced block, no value).  Every source vector is
 * shuffled with itself, so each of src0..src3 yields its own output
 * (out0..out3).  The mask0 shuffles are multiplied with filt0 by
 * DOTP_SB4_SH; the four temporaries are then reused for the mask1 shuffles,
 * which are accumulated with filt1 by DPADD_SB4_SH.
 *
 * out0..out3 are overwritten.  No rounding or saturation is done inside this
 * macro.
 */
#define HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,    \
                                   filt0, filt1, out0, out1, out2, out3)    \
  {                                                                         \
    v16i8 _4tap_8wid_vec0_m, _4tap_8wid_vec1_m, _4tap_8wid_vec2_m,          \
        _4tap_8wid_vec3_m;                                                  \
                                                                            \
    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, _4tap_8wid_vec0_m,     \
               _4tap_8wid_vec1_m);                                          \
    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, _4tap_8wid_vec2_m,     \
               _4tap_8wid_vec3_m);                                          \
    DOTP_SB4_SH(_4tap_8wid_vec0_m, _4tap_8wid_vec1_m, _4tap_8wid_vec2_m,    \
                _4tap_8wid_vec3_m, filt0, filt0, filt0, filt0, out0, out1,  \
                out2, out3);                                                \
    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, _4tap_8wid_vec0_m,     \
               _4tap_8wid_vec1_m);                                          \
    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, _4tap_8wid_vec2_m,     \
               _4tap_8wid_vec3_m);                                          \
    DPADD_SB4_SH(_4tap_8wid_vec0_m, _4tap_8wid_vec1_m, _4tap_8wid_vec2_m,   \
                 _4tap_8wid_vec3_m, filt1, filt1, filt1, filt1, out0, out1, \
                 out2, out3);                                               \
  }
167 
/*
 * Horizontal 6-tap filter for a single 4x4 block.
 *
 * src/src_stride: source pixels and row pitch; reading starts two bytes to
 *                 the left of src, and four full 16-byte rows are loaded.
 * dst/dst_stride: destination and row pitch; a 4x4 block is stored.
 * filter:         tap table row; loaded as one vector and its first three
 *                 halfwords (tap pairs) are broadcast.
 */
static void common_hz_6t_4x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                 uint8_t *RESTRICT dst, int32_t dst_stride,
                                 const int8_t *filter) {
  /* 4-wide shuffle pattern; the patterns for the later tap pairs are the
   * same indices advanced by 2 and by 4. */
  const v16u8 shuf0 = LD_UB(&vp8_mc_filt_mask_arr[16]);
  const v16u8 shuf1 = shuf0 + 2;
  const v16u8 shuf2 = shuf0 + 4;
  const v8i16 coeffs = LD_SH(filter);
  v16i8 taps01, taps23, taps45;
  v16i8 row0, row1, row2, row3;
  v8i16 sum01, sum23;
  v16u8 pixels;

  SPLATI_H3_SB(coeffs, 0, 1, 2, taps01, taps23, taps45);

  /* Step back so the filter window covers two pixels left of each output. */
  src -= 2;
  LD_SB4(src, src_stride, row0, row1, row2, row3);
  XORI_B4_128_SB(row0, row1, row2, row3);

  HORIZ_6TAP_4WID_4VECS_FILT(row0, row1, row2, row3, shuf0, shuf1, shuf2,
                             taps01, taps23, taps45, sum01, sum23);

  /* Round, clamp, then pack back to bytes and store the four rows. */
  SRARI_H2_SH(sum01, sum23, VP8_FILTER_SHIFT);
  SAT_SH2_SH(sum01, sum23, 7);
  pixels = PCKEV_XORI128_UB(sum01, sum23);
  ST4x4_UB(pixels, pixels, 0, 1, 2, 3, dst, dst_stride);
}
193 
/*
 * Horizontal 6-tap filter for a single 4-wide, 8-row block.
 *
 * Both groups of four source rows are loaded and filtered first; rounding,
 * clamping and the two 4x4 stores follow.
 *
 * src/src_stride: source pixels and row pitch; reading starts two bytes to
 *                 the left of src.
 * dst/dst_stride: destination and row pitch; eight rows of four pixels are
 *                 stored.
 * filter:         tap table row; its first three halfwords are broadcast.
 */
static void common_hz_6t_4x8_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                 uint8_t *RESTRICT dst, int32_t dst_stride,
                                 const int8_t *filter) {
  const v16u8 shuf0 = LD_UB(&vp8_mc_filt_mask_arr[16]);
  const v16u8 shuf1 = shuf0 + 2;
  const v16u8 shuf2 = shuf0 + 4;
  const v8i16 coeffs = LD_SH(filter);
  v16i8 taps01, taps23, taps45;
  v16i8 row0, row1, row2, row3;
  v8i16 top_lo, top_hi, bot_lo, bot_hi;
  v16u8 pixels;

  SPLATI_H3_SB(coeffs, 0, 1, 2, taps01, taps23, taps45);
  src -= 2;

  /* Rows 0..3. */
  LD_SB4(src, src_stride, row0, row1, row2, row3);
  XORI_B4_128_SB(row0, row1, row2, row3);
  src += (4 * src_stride);
  HORIZ_6TAP_4WID_4VECS_FILT(row0, row1, row2, row3, shuf0, shuf1, shuf2,
                             taps01, taps23, taps45, top_lo, top_hi);

  /* Rows 4..7, reusing the same row registers. */
  LD_SB4(src, src_stride, row0, row1, row2, row3);
  XORI_B4_128_SB(row0, row1, row2, row3);
  HORIZ_6TAP_4WID_4VECS_FILT(row0, row1, row2, row3, shuf0, shuf1, shuf2,
                             taps01, taps23, taps45, bot_lo, bot_hi);

  SRARI_H4_SH(top_lo, top_hi, bot_lo, bot_hi, VP8_FILTER_SHIFT);
  SAT_SH4_SH(top_lo, top_hi, bot_lo, bot_hi, 7);

  pixels = PCKEV_XORI128_UB(top_lo, top_hi);
  ST4x4_UB(pixels, pixels, 0, 1, 2, 3, dst, dst_stride);
  dst += (4 * dst_stride);

  pixels = PCKEV_XORI128_UB(bot_lo, bot_hi);
  ST4x4_UB(pixels, pixels, 0, 1, 2, 3, dst, dst_stride);
}
227 
/*
 * Horizontal 6-tap filter for 4-pixel-wide blocks: dispatches on height to
 * the fixed-size 4x4 or 4x8 kernel.  Any other height stores nothing.
 */
static void common_hz_6t_4w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                uint8_t *RESTRICT dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height) {
  switch (height) {
    case 4:
      common_hz_6t_4x4_msa(src, src_stride, dst, dst_stride, filter);
      break;
    case 8:
      common_hz_6t_4x8_msa(src, src_stride, dst, dst_stride, filter);
      break;
    default:
      /* No kernel for this height; dst is left as it was. */
      break;
  }
}
237 
/*
 * Horizontal 6-tap filter for 8-pixel-wide blocks.
 *
 * Rows are processed four at a time, so height >> 2 groups are produced and
 * a height below 4 stores nothing.  (An earlier form peeled the first group
 * out of the loop and started the unsigned counter at (height >> 2) - 1,
 * which wrapped around for height < 4.)
 *
 * src/src_stride: source pixels and row pitch; reading starts two bytes to
 *                 the left of src and each row is loaded as a full vector.
 * dst/dst_stride: destination and row pitch; eight pixels per row are
 *                 stored.
 * filter:         tap table row; its first three halfwords are broadcast.
 * height:         number of output rows, expected to be a multiple of 4.
 */
static void common_hz_6t_8w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                uint8_t *RESTRICT dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, filt0, filt1, filt2;
  v16u8 mask0, mask1, mask2, tmp0, tmp1;
  v8i16 filt, out0, out1, out2, out3;

  mask0 = LD_UB(&vp8_mc_filt_mask_arr[0]);
  src -= 2;

  filt = LD_SH(filter);
  SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2);

  /* Shuffle patterns for the second and third tap pairs. */
  mask1 = mask0 + 2;
  mask2 = mask0 + 4;

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (4 * src_stride);
    HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               filt0, filt1, filt2, out0, out1, out2, out3);
    /* Round, clamp, pack two rows per vector and store 8x4 pixels. */
    SRARI_H4_SH(out0, out1, out2, out3, VP8_FILTER_SHIFT);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    tmp0 = PCKEV_XORI128_UB(out0, out1);
    tmp1 = PCKEV_XORI128_UB(out2, out3);
    ST8x4_UB(tmp0, tmp1, dst, dst_stride);
    dst += (4 * dst_stride);
  }
}
281 
/*
 * Horizontal 6-tap filter for 16-pixel-wide blocks.
 *
 * Each pass of the loop handles four rows.  Every row is read as two
 * vectors, one at src and one at src + 8, which are filtered separately and
 * packed into one 16-byte store per row.
 *
 * src/src_stride: source pixels and row pitch; reading starts two bytes to
 *                 the left of src.
 * dst/dst_stride: destination and row pitch; one full vector per row is
 *                 stored.
 * filter:         tap table row; its first three halfwords are broadcast.
 * height:         number of output rows; height >> 2 groups of four rows
 *                 are produced.
 */
static void common_hz_6t_16w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                 uint8_t *RESTRICT dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height) {
  const v16u8 shuf0 = LD_UB(&vp8_mc_filt_mask_arr[0]);
  const v16u8 shuf1 = shuf0 + 2;
  const v16u8 shuf2 = shuf0 + 4;
  const v8i16 coeffs = LD_SH(filter);
  v16i8 taps01, taps23, taps45;
  v16i8 row0_a, row0_b, row1_a, row1_b, row2_a, row2_b, row3_a, row3_b;
  v8i16 sum0_a, sum0_b, sum1_a, sum1_b, sum2_a, sum2_b, sum3_a, sum3_b;
  v16u8 pixels;
  uint32_t groups_left = (height >> 2);

  SPLATI_H3_SB(coeffs, 0, 1, 2, taps01, taps23, taps45);
  src -= 2;

  while (groups_left--) {
    /* "_a" vectors start at src, "_b" vectors eight bytes further on. */
    LD_SB4(src, src_stride, row0_a, row1_a, row2_a, row3_a);
    LD_SB4(src + 8, src_stride, row0_b, row1_b, row2_b, row3_b);
    XORI_B8_128_SB(row0_a, row0_b, row1_a, row1_b, row2_a, row2_b, row3_a,
                   row3_b);
    src += (4 * src_stride);

    HORIZ_6TAP_8WID_4VECS_FILT(row0_a, row0_b, row1_a, row1_b, shuf0, shuf1,
                               shuf2, taps01, taps23, taps45, sum0_a, sum0_b,
                               sum1_a, sum1_b);
    HORIZ_6TAP_8WID_4VECS_FILT(row2_a, row2_b, row3_a, row3_b, shuf0, shuf1,
                               shuf2, taps01, taps23, taps45, sum2_a, sum2_b,
                               sum3_a, sum3_b);

    SRARI_H4_SH(sum0_a, sum0_b, sum1_a, sum1_b, VP8_FILTER_SHIFT);
    SRARI_H4_SH(sum2_a, sum2_b, sum3_a, sum3_b, VP8_FILTER_SHIFT);
    SAT_SH4_SH(sum0_a, sum0_b, sum1_a, sum1_b, 7);
    SAT_SH4_SH(sum2_a, sum2_b, sum3_a, sum3_b, 7);

    /* One packed 16-byte row per store. */
    pixels = PCKEV_XORI128_UB(sum0_a, sum0_b);
    ST_UB(pixels, dst);
    dst += dst_stride;

    pixels = PCKEV_XORI128_UB(sum1_a, sum1_b);
    ST_UB(pixels, dst);
    dst += dst_stride;

    pixels = PCKEV_XORI128_UB(sum2_a, sum2_b);
    ST_UB(pixels, dst);
    dst += dst_stride;

    pixels = PCKEV_XORI128_UB(sum3_a, sum3_b);
    ST_UB(pixels, dst);
    dst += dst_stride;
  }
}
327 
/*
 * Vertical 6-tap filter for 4-pixel-wide blocks.
 *
 * Five source rows are loaded up front, starting two rows above src; each
 * loop pass then loads four more rows and stores a 4x4 block.  The loop
 * runs height >> 2 times, so height is expected to be a multiple of 4.
 *
 * src/src_stride: source pixels and row pitch.
 * dst/dst_stride: destination and row pitch.
 * filter:         tap table row; its first three halfwords are broadcast.
 * height:         number of output rows.
 */
static void common_vt_6t_4w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                uint8_t *RESTRICT dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
  v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
  v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2;
  v16u8 out;
  v8i16 filt, out10, out32;

  /* The filter window reaches two rows above the first output row. */
  src -= (2 * src_stride);

  filt = LD_SH(filter);
  SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2);

  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
  src += (5 * src_stride);

  /* Pair up vertically adjacent rows (srcBA_r combines rows B and A), then
   * merge two such pairs into one vector (e.g. src2110 from src21_r and
   * src10_r).  NOTE(review): exact lane layout depends on the ILVR_* helper
   * macros, which are defined outside this file. */
  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
             src32_r, src43_r);
  ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
  XORI_B2_128_SB(src2110, src4332);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src5, src6, src7, src8);
    src += (4 * src_stride);

    /* src4 is the last row carried over from the previous pass. */
    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
               src76_r, src87_r);
    ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
    XORI_B2_128_SB(src6554, src8776);
    /* Each accumulation consumes three row-pair vectors, one per tap pair. */
    out10 = DPADD_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
    out32 = DPADD_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
    SRARI_H2_SH(out10, out32, VP8_FILTER_SHIFT);
    SAT_SH2_SH(out10, out32, 7);
    out = PCKEV_XORI128_UB(out10, out32);
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
    dst += (4 * dst_stride);

    /* Slide the window down four rows for the next pass. */
    src2110 = src6554;
    src4332 = src8776;
    src4 = src8;
  }
}
372 
/*
 * Vertical 6-tap filter for 8-pixel-wide blocks.
 *
 * Five source rows are loaded up front, starting two rows above src; each
 * loop pass then loads four more rows and stores an 8x4 block.  The loop
 * runs height >> 2 times, so height is expected to be a multiple of 4.
 *
 * src/src_stride: source pixels and row pitch.
 * dst/dst_stride: destination and row pitch.
 * filter:         tap table row; its first three halfwords are broadcast.
 * height:         number of output rows.
 */
static void common_vt_6t_8w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                uint8_t *RESTRICT dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src7, src8, src9, src10;
  v16i8 src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r;
  v16i8 src109_r, filt0, filt1, filt2;
  v16u8 tmp0, tmp1;
  v8i16 filt, out0_r, out1_r, out2_r, out3_r;

  /* The filter window reaches two rows above the first output row. */
  src -= (2 * src_stride);

  filt = LD_SH(filter);
  SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2);

  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
  src += (5 * src_stride);

  XORI_B5_128_SB(src0, src1, src2, src3, src4);
  /* Pair up vertically adjacent rows: srcBA_r combines rows B and A. */
  ILVR_B4_SB(src1, src0, src3, src2, src2, src1, src4, src3, src10_r, src32_r,
             src21_r, src43_r);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    /* The four new rows are named src7..src10; src4 is the last row kept
     * from the previous pass, so src76_r pairs src7 with src4. */
    LD_SB4(src, src_stride, src7, src8, src9, src10);
    XORI_B4_128_SB(src7, src8, src9, src10);
    src += (4 * src_stride);

    ILVR_B4_SB(src7, src4, src8, src7, src9, src8, src10, src9, src76_r,
               src87_r, src98_r, src109_r);
    /* One output row per accumulation, each using three row pairs. */
    out0_r = DPADD_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2);
    out1_r = DPADD_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2);
    out2_r = DPADD_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2);
    out3_r = DPADD_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2);
    SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, VP8_FILTER_SHIFT);
    SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
    tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
    tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
    ST8x4_UB(tmp0, tmp1, dst, dst_stride);
    dst += (4 * dst_stride);

    /* Slide the window down four rows for the next pass. */
    src10_r = src76_r;
    src32_r = src98_r;
    src21_r = src87_r;
    src43_r = src109_r;
    src4 = src10;
  }
}
420 
/*
 * Vertical 6-tap filter for 16-pixel-wide blocks.
 *
 * Same scheme as the narrower vertical filters, but every row pair is
 * interleaved twice (ILVR_* into the "_r" vectors, ILVL_* into the "_l"
 * vectors) so that all sixteen columns are covered.  Five rows are loaded
 * up front, starting two rows above src; each loop pass loads four more and
 * stores four full 16-byte rows.  The loop runs height >> 2 times.
 *
 * src/src_stride: source pixels and row pitch.
 * dst/dst_stride: destination and row pitch.
 * filter:         tap table row; its first three halfwords are broadcast.
 * height:         number of output rows, expected to be a multiple of 4.
 */
static void common_vt_6t_16w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                 uint8_t *RESTRICT dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
  v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
  v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
  v16i8 src65_l, src87_l, filt0, filt1, filt2;
  v16u8 tmp0, tmp1, tmp2, tmp3;
  v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l, filt;

  /* The filter window reaches two rows above the first output row. */
  src -= (2 * src_stride);

  filt = LD_SH(filter);
  SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2);

  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
  src += (5 * src_stride);

  XORI_B5_128_SB(src0, src1, src2, src3, src4);
  /* Pair up vertically adjacent rows, once per vector half. */
  ILVR_B4_SB(src1, src0, src3, src2, src4, src3, src2, src1, src10_r, src32_r,
             src43_r, src21_r);
  ILVL_B4_SB(src1, src0, src3, src2, src4, src3, src2, src1, src10_l, src32_l,
             src43_l, src21_l);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src5, src6, src7, src8);
    src += (4 * src_stride);

    XORI_B4_128_SB(src5, src6, src7, src8);
    /* src4 is the last row carried over from the previous pass. */
    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
               src76_r, src87_r);
    ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l, src65_l,
               src76_l, src87_l);
    /* Four output rows, each computed separately for the _r and _l halves. */
    out0_r = DPADD_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
    out1_r = DPADD_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
    out2_r = DPADD_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
    out3_r = DPADD_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
    out0_l = DPADD_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
    out1_l = DPADD_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
    out2_l = DPADD_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
    out3_l = DPADD_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
    SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, VP8_FILTER_SHIFT);
    SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, VP8_FILTER_SHIFT);
    SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
    SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
    /* Re-join the two halves of every row into one byte vector. */
    PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l, out3_r,
                tmp0, tmp1, tmp2, tmp3);
    XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
    ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
    dst += (4 * dst_stride);

    /* Slide the window down four rows for the next pass. */
    src10_r = src54_r;
    src32_r = src76_r;
    src21_r = src65_r;
    src43_r = src87_r;
    src10_l = src54_l;
    src32_l = src76_l;
    src21_l = src65_l;
    src43_l = src87_l;
    src4 = src8;
  }
}
484 
/*
 * Two-dimensional (horizontal then vertical) 6-tap filter for 4-pixel-wide
 * blocks.
 *
 * Source rows are first filtered horizontally with HORIZ_6TAP_FILT, two rows
 * per call.  The results are byte-interleaved into row-pair vectors and fed
 * to the vertical filter.  Five rows are loaded up front, starting two rows
 * above and two bytes left of src; each loop pass loads four more rows and
 * stores a 4x4 block.  The loop runs height >> 2 times.
 *
 * src/src_stride: source pixels and row pitch.
 * dst/dst_stride: destination and row pitch.
 * filter_horiz:   tap table row for the horizontal pass.
 * filter_vert:    tap table row for the vertical pass.
 * height:         number of output rows, expected to be a multiple of 4.
 */
static void common_hv_6ht_6vt_4w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                     uint8_t *RESTRICT dst, int32_t dst_stride,
                                     const int8_t *filter_horiz,
                                     const int8_t *filter_vert,
                                     int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
  v16i8 filt_hz0, filt_hz1, filt_hz2;
  v16u8 mask0, mask1, mask2, out;
  v8i16 tmp0, tmp1;
  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
  v8i16 hz_out7, filt, filt_vt0, filt_vt1, filt_vt2, out0, out1, out2, out3;

  mask0 = LD_UB(&vp8_mc_filt_mask_arr[16]);
  /* Back up two columns and two rows to cover the full filter window. */
  src -= (2 + 2 * src_stride);

  filt = LD_SH(filter_horiz);
  SPLATI_H3_SB(filt, 0, 1, 2, filt_hz0, filt_hz1, filt_hz2);
  filt = LD_SH(filter_vert);
  SPLATI_H3_SH(filt, 0, 1, 2, filt_vt0, filt_vt1, filt_vt2);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;

  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
  src += (5 * src_stride);

  XORI_B5_128_SB(src0, src1, src2, src3, src4);
  /* Each horizontal result covers the two rows passed in.  The sldi_b by 8
   * bytes stitches the upper half of one result to the lower half of the
   * next, giving the in-between row pair. */
  hz_out0 = HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2, filt_hz0, filt_hz1,
                            filt_hz2);
  hz_out2 = HORIZ_6TAP_FILT(src2, src3, mask0, mask1, mask2, filt_hz0, filt_hz1,
                            filt_hz2);
  hz_out1 = (v8i16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
  hz_out3 = HORIZ_6TAP_FILT(src3, src4, mask0, mask1, mask2, filt_hz0, filt_hz1,
                            filt_hz2);
  ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB2(src, src_stride, src5, src6);
    src += (2 * src_stride);

    XORI_B2_128_SB(src5, src6);
    hz_out5 = HORIZ_6TAP_FILT(src5, src6, mask0, mask1, mask2, filt_hz0,
                              filt_hz1, filt_hz2);
    hz_out4 = (v8i16)__msa_sldi_b((v16i8)hz_out5, (v16i8)hz_out3, 8);

    LD_SB2(src, src_stride, src7, src8);
    src += (2 * src_stride);

    XORI_B2_128_SB(src7, src8);
    hz_out7 = HORIZ_6TAP_FILT(src7, src8, mask0, mask1, mask2, filt_hz0,
                              filt_hz1, filt_hz2);
    hz_out6 = (v8i16)__msa_sldi_b((v16i8)hz_out7, (v16i8)hz_out5, 8);

    /* Vertical pass: three row-pair vectors per accumulation. */
    out2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4);
    tmp0 = DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);

    out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
    tmp1 = DPADD_SH3_SH(out1, out2, out3, filt_vt0, filt_vt1, filt_vt2);

    /* Same rounding shift as every other filter stage in this file. */
    SRARI_H2_SH(tmp0, tmp1, VP8_FILTER_SHIFT);
    SAT_SH2_SH(tmp0, tmp1, 7);
    out = PCKEV_XORI128_UB(tmp0, tmp1);
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
    dst += (4 * dst_stride);

    /* Carry the newest horizontal result and row pairs into the next pass. */
    hz_out3 = hz_out7;
    out0 = out2;
    out1 = out3;
  }
}
556 
/*
 * Two-dimensional (horizontal then vertical) 6-tap filter for 8-pixel-wide
 * blocks.
 *
 * Every source row is filtered horizontally on its own (HORIZ_6TAP_FILT with
 * the same vector passed twice).  Consecutive horizontal results are
 * byte-interleaved into row-pair vectors and fed to the vertical filter.
 * Five rows are loaded up front, starting two rows above and two bytes left
 * of src; each loop pass loads four more rows and stores an 8x4 block.  The
 * loop runs height >> 2 times.
 *
 * src/src_stride: source pixels and row pitch.
 * dst/dst_stride: destination and row pitch.
 * filter_horiz:   tap table row for the horizontal pass.
 * filter_vert:    tap table row for the vertical pass.
 * height:         number of output rows, expected to be a multiple of 4.
 */
static void common_hv_6ht_6vt_8w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                     uint8_t *RESTRICT dst, int32_t dst_stride,
                                     const int8_t *filter_horiz,
                                     const int8_t *filter_vert,
                                     int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
  v16i8 filt_hz0, filt_hz1, filt_hz2;
  v16u8 mask0, mask1, mask2, vec0, vec1;
  v8i16 filt, filt_vt0, filt_vt1, filt_vt2;
  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
  v8i16 hz_out7, hz_out8, out0, out1, out2, out3, out4, out5, out6, out7;
  v8i16 tmp0, tmp1, tmp2, tmp3;

  mask0 = LD_UB(&vp8_mc_filt_mask_arr[0]);
  /* Back up two columns and two rows to cover the full filter window. */
  src -= (2 + 2 * src_stride);

  filt = LD_SH(filter_horiz);
  SPLATI_H3_SB(filt, 0, 1, 2, filt_hz0, filt_hz1, filt_hz2);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;

  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
  src += (5 * src_stride);

  XORI_B5_128_SB(src0, src1, src2, src3, src4);
  hz_out0 = HORIZ_6TAP_FILT(src0, src0, mask0, mask1, mask2, filt_hz0, filt_hz1,
                            filt_hz2);
  hz_out1 = HORIZ_6TAP_FILT(src1, src1, mask0, mask1, mask2, filt_hz0, filt_hz1,
                            filt_hz2);
  hz_out2 = HORIZ_6TAP_FILT(src2, src2, mask0, mask1, mask2, filt_hz0, filt_hz1,
                            filt_hz2);
  hz_out3 = HORIZ_6TAP_FILT(src3, src3, mask0, mask1, mask2, filt_hz0, filt_hz1,
                            filt_hz2);
  hz_out4 = HORIZ_6TAP_FILT(src4, src4, mask0, mask1, mask2, filt_hz0, filt_hz1,
                            filt_hz2);

  filt = LD_SH(filter_vert);
  SPLATI_H3_SH(filt, 0, 1, 2, filt_vt0, filt_vt1, filt_vt2);

  /* out0/out1 hold the pairs starting on even rows (0-1, 2-3), out3/out4 the
   * pairs starting on odd rows (1-2, 3-4). */
  ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
  ILVEV_B2_SH(hz_out1, hz_out2, hz_out3, hz_out4, out3, out4);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src5, src6, src7, src8);
    src += (4 * src_stride);

    XORI_B4_128_SB(src5, src6, src7, src8);
    /* Each new row is filtered horizontally, paired with the previous row,
     * and one vertical result is accumulated straight away. */
    hz_out5 = HORIZ_6TAP_FILT(src5, src5, mask0, mask1, mask2, filt_hz0,
                              filt_hz1, filt_hz2);
    out2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4);
    tmp0 = DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);

    hz_out6 = HORIZ_6TAP_FILT(src6, src6, mask0, mask1, mask2, filt_hz0,
                              filt_hz1, filt_hz2);
    out5 = (v8i16)__msa_ilvev_b((v16i8)hz_out6, (v16i8)hz_out5);
    tmp1 = DPADD_SH3_SH(out3, out4, out5, filt_vt0, filt_vt1, filt_vt2);

    hz_out7 = HORIZ_6TAP_FILT(src7, src7, mask0, mask1, mask2, filt_hz0,
                              filt_hz1, filt_hz2);
    out7 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
    tmp2 = DPADD_SH3_SH(out1, out2, out7, filt_vt0, filt_vt1, filt_vt2);

    hz_out8 = HORIZ_6TAP_FILT(src8, src8, mask0, mask1, mask2, filt_hz0,
                              filt_hz1, filt_hz2);
    out6 = (v8i16)__msa_ilvev_b((v16i8)hz_out8, (v16i8)hz_out7);
    tmp3 = DPADD_SH3_SH(out4, out5, out6, filt_vt0, filt_vt1, filt_vt2);

    /* Same rounding shift as every other filter stage in this file. */
    SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, VP8_FILTER_SHIFT);
    SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
    vec0 = PCKEV_XORI128_UB(tmp0, tmp1);
    vec1 = PCKEV_XORI128_UB(tmp2, tmp3);
    ST8x4_UB(vec0, vec1, dst, dst_stride);
    dst += (4 * dst_stride);

    /* Carry the newest horizontal row and the last two even/odd row pairs
     * into the next pass. */
    hz_out4 = hz_out8;
    out0 = out2;
    out1 = out7;
    out3 = out5;
    out4 = out6;
  }
}
640 
/*
 * 16-column variant of common_hv_6ht_6vt_8w_msa(): the 8-column routine is
 * invoked twice, first at src/dst and then 8 bytes further along each row,
 * with the same strides, filters and height.
 */
static void common_hv_6ht_6vt_16w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                      uint8_t *RESTRICT dst, int32_t dst_stride,
                                      const int8_t *filter_horiz,
                                      const int8_t *filter_vert,
                                      int32_t height) {
  int32_t col_offset;

  for (col_offset = 0; col_offset < 16; col_offset += 8) {
    common_hv_6ht_6vt_8w_msa(src + col_offset, src_stride, dst + col_offset,
                             dst_stride, filter_horiz, filter_vert, height);
  }
}
654 
/*
 * Horizontal filter for one group of four rows, using the first two 16-bit
 * lanes of `filter` as coefficient pairs.  Reading starts one byte to the
 * left of `src`; the four result rows are written with ST4x4_UB at `dst`.
 */
static void common_hz_4t_4x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                 uint8_t *RESTRICT dst, int32_t dst_stride,
                                 const int8_t *filter) {
  v16i8 row0, row1, row2, row3;
  v16i8 coef_a, coef_b;
  v16i8 shuf_a, shuf_b;
  v8i16 taps, acc0, acc1;
  v16u8 packed;

  /* Coefficient pairs: 16-bit lanes 0 and 1 of the filter, splatted. */
  taps = LD_SH(filter);
  SPLATI_H2_SB(taps, 0, 1, coef_a, coef_b);

  /* First "4 width cases" row of the shuffle table; the second pattern is
   * the same set of indices, each raised by two. */
  shuf_a = LD_SB(&vp8_mc_filt_mask_arr[16]);
  shuf_b = shuf_a + 2;

  src -= 1;
  LD_SB4(src, src_stride, row0, row1, row2, row3);
  XORI_B4_128_SB(row0, row1, row2, row3);
  HORIZ_4TAP_4WID_4VECS_FILT(row0, row1, row2, row3, shuf_a, shuf_b, coef_a,
                             coef_b, acc0, acc1);

  /* Round by VP8_FILTER_SHIFT, saturate, pack to bytes and store. */
  SRARI_H2_SH(acc0, acc1, VP8_FILTER_SHIFT);
  SAT_SH2_SH(acc0, acc1, 7);
  packed = PCKEV_XORI128_UB(acc0, acc1);
  ST4x4_UB(packed, packed, 0, 1, 2, 3, dst, dst_stride);
}
679 
/*
 * Horizontal filter for eight rows, handled as two groups of four.  Uses the
 * first two 16-bit lanes of `filter` as coefficient pairs and starts reading
 * one byte to the left of `src`.  Both groups are filtered before either is
 * stored.
 */
static void common_hz_4t_4x8_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                 uint8_t *RESTRICT dst, int32_t dst_stride,
                                 const int8_t *filter) {
  v16i8 row0, row1, row2, row3;
  v16i8 coef_a, coef_b;
  v16i8 shuf_a, shuf_b;
  v8i16 taps, top0, top1, bot0, bot1;
  v16u8 packed;

  taps = LD_SH(filter);
  SPLATI_H2_SB(taps, 0, 1, coef_a, coef_b);

  /* First "4 width cases" row of the shuffle table, plus a copy with every
   * index raised by two. */
  shuf_a = LD_SB(&vp8_mc_filt_mask_arr[16]);
  shuf_b = shuf_a + 2;

  src -= 1;

  /* First four rows. */
  LD_SB4(src, src_stride, row0, row1, row2, row3);
  src += (4 * src_stride);
  XORI_B4_128_SB(row0, row1, row2, row3);
  HORIZ_4TAP_4WID_4VECS_FILT(row0, row1, row2, row3, shuf_a, shuf_b, coef_a,
                             coef_b, top0, top1);

  /* Next four rows; the row registers are reused. */
  LD_SB4(src, src_stride, row0, row1, row2, row3);
  XORI_B4_128_SB(row0, row1, row2, row3);
  HORIZ_4TAP_4WID_4VECS_FILT(row0, row1, row2, row3, shuf_a, shuf_b, coef_a,
                             coef_b, bot0, bot1);

  /* Round and saturate all four accumulators together. */
  SRARI_H4_SH(top0, top1, bot0, bot1, VP8_FILTER_SHIFT);
  SAT_SH4_SH(top0, top1, bot0, bot1, 7);

  packed = PCKEV_XORI128_UB(top0, top1);
  ST4x4_UB(packed, packed, 0, 1, 2, 3, dst, dst_stride);
  dst += (4 * dst_stride);

  packed = PCKEV_XORI128_UB(bot0, bot1);
  ST4x4_UB(packed, packed, 0, 1, 2, 3, dst, dst_stride);
}
713 
/*
 * Height dispatcher for the horizontal filter: a height of 4 goes to
 * common_hz_4t_4x4_msa() and a height of 8 to common_hz_4t_4x8_msa().
 * Any other height does nothing.
 */
static void common_hz_4t_4w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                uint8_t *RESTRICT dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height) {
  switch (height) {
    case 4:
      common_hz_4t_4x4_msa(src, src_stride, dst, dst_stride, filter);
      break;
    case 8:
      common_hz_4t_4x8_msa(src, src_stride, dst, dst_stride, filter);
      break;
    default:
      /* Unsupported height: dst is left untouched. */
      break;
  }
}
723 
/*
 * Horizontal filter over `height >> 2` groups of four rows, stored with
 * ST8x4_UB.  Coefficient pairs are the first two 16-bit lanes of `filter`;
 * reading starts one byte to the left of `src`.  Rows left over when height
 * is not a multiple of four are not processed.
 */
static void common_hz_4t_8w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                uint8_t *RESTRICT dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height) {
  uint32_t groups_left;
  v16i8 row0, row1, row2, row3;
  v16i8 coef_a, coef_b;
  v16i8 shuf_a, shuf_b;
  v8i16 taps, acc0, acc1, acc2, acc3;
  v16u8 pack01, pack23;

  taps = LD_SH(filter);
  SPLATI_H2_SB(taps, 0, 1, coef_a, coef_b);

  /* "8 width cases" row of the shuffle table, plus a copy with every index
   * raised by two. */
  shuf_a = LD_SB(&vp8_mc_filt_mask_arr[0]);
  shuf_b = shuf_a + 2;

  src -= 1;

  groups_left = (uint32_t)(height >> 2);
  while (groups_left-- != 0) {
    LD_SB4(src, src_stride, row0, row1, row2, row3);
    src += (4 * src_stride);

    XORI_B4_128_SB(row0, row1, row2, row3);
    HORIZ_4TAP_8WID_4VECS_FILT(row0, row1, row2, row3, shuf_a, shuf_b, coef_a,
                               coef_b, acc0, acc1, acc2, acc3);

    /* Round, saturate, pack two accumulators per output vector, store. */
    SRARI_H4_SH(acc0, acc1, acc2, acc3, VP8_FILTER_SHIFT);
    SAT_SH4_SH(acc0, acc1, acc2, acc3, 7);
    pack01 = PCKEV_XORI128_UB(acc0, acc1);
    pack23 = PCKEV_XORI128_UB(acc2, acc3);
    ST8x4_UB(pack01, pack23, dst, dst_stride);
    dst += (4 * dst_stride);
  }
}
755 
/*
 * Horizontal filter producing one full 16-byte vector per row, over
 * `height >> 2` groups of four rows.  Each row is read as two vectors, one at
 * `src` and one at `src + 8` (after stepping one byte left); the two filtered
 * results are packed into a single output vector.
 */
static void common_hz_4t_16w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                 uint8_t *RESTRICT dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height) {
  uint32_t groups_left;
  v16i8 r0_a, r1_a, r2_a, r3_a; /* loaded from src      */
  v16i8 r0_b, r1_b, r2_b, r3_b; /* loaded from src + 8  */
  v16i8 coef_a, coef_b;
  v16i8 shuf_a, shuf_b;
  v8i16 taps;
  v8i16 acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
  v16u8 line0, line1, line2, line3;

  taps = LD_SH(filter);
  SPLATI_H2_SB(taps, 0, 1, coef_a, coef_b);

  shuf_a = LD_SB(&vp8_mc_filt_mask_arr[0]);
  shuf_b = shuf_a + 2;

  src -= 1;

  for (groups_left = (uint32_t)(height >> 2); groups_left != 0;
       --groups_left) {
    LD_SB4(src, src_stride, r0_a, r1_a, r2_a, r3_a);
    LD_SB4(src + 8, src_stride, r0_b, r1_b, r2_b, r3_b);
    src += (4 * src_stride);

    XORI_B8_128_SB(r0_a, r0_b, r1_a, r1_b, r2_a, r2_b, r3_a, r3_b);

    /* Rows 0-1, then rows 2-3; each row contributes its two vectors. */
    HORIZ_4TAP_8WID_4VECS_FILT(r0_a, r0_b, r1_a, r1_b, shuf_a, shuf_b, coef_a,
                               coef_b, acc0, acc1, acc2, acc3);
    HORIZ_4TAP_8WID_4VECS_FILT(r2_a, r2_b, r3_a, r3_b, shuf_a, shuf_b, coef_a,
                               coef_b, acc4, acc5, acc6, acc7);

    SRARI_H4_SH(acc0, acc1, acc2, acc3, VP8_FILTER_SHIFT);
    SRARI_H4_SH(acc4, acc5, acc6, acc7, VP8_FILTER_SHIFT);
    SAT_SH4_SH(acc0, acc1, acc2, acc3, 7);
    SAT_SH4_SH(acc4, acc5, acc6, acc7, 7);

    /* One packed 16-byte vector per output row. */
    line0 = PCKEV_XORI128_UB(acc0, acc1);
    line1 = PCKEV_XORI128_UB(acc2, acc3);
    line2 = PCKEV_XORI128_UB(acc4, acc5);
    line3 = PCKEV_XORI128_UB(acc6, acc7);
    ST_UB4(line0, line1, line2, line3, dst, dst_stride);
    dst += (4 * dst_stride);
  }
}
801 
/*
 * Vertical filter stored with ST4x4_UB, over `height >> 2` groups of four
 * rows.  Reading starts one row above `src`.  Adjacent rows are byte-
 * interleaved (ILVR) and two such interleaves are joined into one vector, so
 * each FILT_4TAP_DPADD_S_H call consumes two joined vectors.
 */
static void common_vt_4t_4w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                uint8_t *RESTRICT dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height) {
  uint32_t groups_left;
  v16i8 up0, up1, tail; /* tail: most recently loaded row, carried over */
  v16i8 n0, n1, n2;
  v16i8 pair10, pair21, pair32, pair43, pair54, pair65;
  v16i8 quad_lo, quad_hi;
  v16i8 coef_a, coef_b;
  v8i16 taps, sum_top, sum_bot;
  v16u8 packed;

  taps = LD_SH(filter);
  SPLATI_H2_SB(taps, 0, 1, coef_a, coef_b);

  /* Prime the pipeline with the three rows starting one row above src. */
  src -= src_stride;
  LD_SB3(src, src_stride, up0, up1, tail);
  src += (3 * src_stride);

  ILVR_B2_SB(up1, up0, tail, up1, pair10, pair21);
  quad_lo = (v16i8)__msa_xori_b(
      (v16u8)__msa_ilvr_d((v2i64)pair21, (v2i64)pair10), 128);

  groups_left = (uint32_t)(height >> 2);
  while (groups_left-- != 0) {
    /* Three new rows feed the first pair of output rows. */
    LD_SB3(src, src_stride, n0, n1, n2);
    src += (3 * src_stride);
    ILVR_B2_SB(n0, tail, n1, n0, pair32, pair43);
    quad_hi = (v16i8)__msa_xori_b(
        (v16u8)__msa_ilvr_d((v2i64)pair43, (v2i64)pair32), 128);
    sum_top = FILT_4TAP_DPADD_S_H(quad_lo, quad_hi, coef_a, coef_b);

    /* One more row completes the group; quad_lo is rebuilt here and is
     * also the starting state for the next iteration. */
    tail = LD_SB(src);
    src += src_stride;
    ILVR_B2_SB(n2, n1, tail, n2, pair54, pair65);
    quad_lo = (v16i8)__msa_xori_b(
        (v16u8)__msa_ilvr_d((v2i64)pair65, (v2i64)pair54), 128);
    sum_bot = FILT_4TAP_DPADD_S_H(quad_hi, quad_lo, coef_a, coef_b);

    SRARI_H2_SH(sum_top, sum_bot, VP8_FILTER_SHIFT);
    SAT_SH2_SH(sum_top, sum_bot, 7);
    packed = PCKEV_XORI128_UB(sum_top, sum_bot);
    ST4x4_UB(packed, packed, 0, 1, 2, 3, dst, dst_stride);
    dst += (4 * dst_stride);
  }
}
846 
/*
 * Vertical filter stored with ST8x4_UB, over `height >> 2` groups of four
 * rows.  Reading starts one row above `src`.  Each output row combines two
 * ILVR interleaves of adjacent rows; the last two interleaves and the last
 * loaded row are carried into the next iteration.
 */
static void common_vt_4t_8w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                uint8_t *RESTRICT dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height) {
  uint32_t groups_left;
  v16i8 up0, up1, tail; /* tail: last row of the previous group */
  v16i8 n0, n1, n2, n3;
  v16i8 pair_p0, pair_p1;                   /* carried interleaves */
  v16i8 pair_n0, pair_n1, pair_n2, pair_n3; /* built this iteration */
  v16i8 coef_a, coef_b;
  v8i16 taps, acc0, acc1, acc2, acc3;
  v16u8 pack01, pack23;

  taps = LD_SH(filter);
  SPLATI_H2_SB(taps, 0, 1, coef_a, coef_b);

  src -= src_stride;
  LD_SB3(src, src_stride, up0, up1, tail);
  src += (3 * src_stride);

  XORI_B3_128_SB(up0, up1, tail);
  ILVR_B2_SB(up1, up0, tail, up1, pair_p0, pair_p1);

  for (groups_left = (uint32_t)(height >> 2); groups_left != 0;
       --groups_left) {
    LD_SB4(src, src_stride, n0, n1, n2, n3);
    src += (4 * src_stride);

    XORI_B4_128_SB(n0, n1, n2, n3);
    ILVR_B4_SB(n0, tail, n1, n0, n2, n1, n3, n2, pair_n0, pair_n1, pair_n2,
               pair_n3);

    acc0 = FILT_4TAP_DPADD_S_H(pair_p0, pair_n0, coef_a, coef_b);
    acc1 = FILT_4TAP_DPADD_S_H(pair_p1, pair_n1, coef_a, coef_b);
    acc2 = FILT_4TAP_DPADD_S_H(pair_n0, pair_n2, coef_a, coef_b);
    acc3 = FILT_4TAP_DPADD_S_H(pair_n1, pair_n3, coef_a, coef_b);

    SRARI_H4_SH(acc0, acc1, acc2, acc3, VP8_FILTER_SHIFT);
    SAT_SH4_SH(acc0, acc1, acc2, acc3, 7);
    pack01 = PCKEV_XORI128_UB(acc0, acc1);
    pack23 = PCKEV_XORI128_UB(acc2, acc3);
    ST8x4_UB(pack01, pack23, dst, dst_stride);
    dst += (4 * dst_stride);

    /* Slide the window down by four rows. */
    tail = n3;
    pair_p0 = pair_n2;
    pair_p1 = pair_n3;
  }
}
890 
/*
 * Vertical filter writing one full vector per row (ST_UB4), over
 * `height >> 2` groups of four rows.  Reading starts one row above `src`.
 * Adjacent rows are interleaved twice, once with ILVR (rt_*) and once with
 * ILVL (lf_*); both halves are filtered separately and packed back together.
 */
static void common_vt_4t_16w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                 uint8_t *RESTRICT dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height) {
  uint32_t groups_left;
  v16i8 up0, up1, tail; /* tail: last row of the previous group */
  v16i8 n0, n1, n2, n3;
  v16i8 rt_p0, rt_p1, rt_n0, rt_n1, rt_n2, rt_n3; /* ILVR interleaves */
  v16i8 lf_p0, lf_p1, lf_n0, lf_n1, lf_n2, lf_n3; /* ILVL interleaves */
  v16i8 coef_a, coef_b;
  v8i16 taps;
  v8i16 rt_acc0, rt_acc1, rt_acc2, rt_acc3;
  v8i16 lf_acc0, lf_acc1, lf_acc2, lf_acc3;
  v16u8 line0, line1, line2, line3;

  taps = LD_SH(filter);
  SPLATI_H2_SB(taps, 0, 1, coef_a, coef_b);

  src -= src_stride;
  LD_SB3(src, src_stride, up0, up1, tail);
  src += (3 * src_stride);

  XORI_B3_128_SB(up0, up1, tail);
  ILVR_B2_SB(up1, up0, tail, up1, rt_p0, rt_p1);
  ILVL_B2_SB(up1, up0, tail, up1, lf_p0, lf_p1);

  groups_left = (uint32_t)(height >> 2);
  while (groups_left-- != 0) {
    LD_SB4(src, src_stride, n0, n1, n2, n3);
    src += (4 * src_stride);

    XORI_B4_128_SB(n0, n1, n2, n3);
    ILVR_B4_SB(n0, tail, n1, n0, n2, n1, n3, n2, rt_n0, rt_n1, rt_n2, rt_n3);
    ILVL_B4_SB(n0, tail, n1, n0, n2, n1, n3, n2, lf_n0, lf_n1, lf_n2, lf_n3);

    rt_acc0 = FILT_4TAP_DPADD_S_H(rt_p0, rt_n0, coef_a, coef_b);
    rt_acc1 = FILT_4TAP_DPADD_S_H(rt_p1, rt_n1, coef_a, coef_b);
    rt_acc2 = FILT_4TAP_DPADD_S_H(rt_n0, rt_n2, coef_a, coef_b);
    rt_acc3 = FILT_4TAP_DPADD_S_H(rt_n1, rt_n3, coef_a, coef_b);

    lf_acc0 = FILT_4TAP_DPADD_S_H(lf_p0, lf_n0, coef_a, coef_b);
    lf_acc1 = FILT_4TAP_DPADD_S_H(lf_p1, lf_n1, coef_a, coef_b);
    lf_acc2 = FILT_4TAP_DPADD_S_H(lf_n0, lf_n2, coef_a, coef_b);
    lf_acc3 = FILT_4TAP_DPADD_S_H(lf_n1, lf_n3, coef_a, coef_b);

    SRARI_H4_SH(rt_acc0, rt_acc1, rt_acc2, rt_acc3, VP8_FILTER_SHIFT);
    SRARI_H4_SH(lf_acc0, lf_acc1, lf_acc2, lf_acc3, VP8_FILTER_SHIFT);
    SAT_SH4_SH(rt_acc0, rt_acc1, rt_acc2, rt_acc3, 7);
    SAT_SH4_SH(lf_acc0, lf_acc1, lf_acc2, lf_acc3, 7);

    /* Rejoin the two halves of every row, then XOR each byte with 128. */
    PCKEV_B4_UB(lf_acc0, rt_acc0, lf_acc1, rt_acc1, lf_acc2, rt_acc2, lf_acc3,
                rt_acc3, line0, line1, line2, line3);
    XORI_B4_128_UB(line0, line1, line2, line3);
    ST_UB4(line0, line1, line2, line3, dst, dst_stride);
    dst += (4 * dst_stride);

    /* Slide the window down by four rows. */
    tail = n3;
    rt_p0 = rt_n2;
    rt_p1 = rt_n3;
    lf_p0 = lf_n2;
    lf_p1 = lf_n3;
  }
}
947 
/*
 * Two-pass filter stored with ST4x4_UB, over `height >> 2` groups of four
 * rows.  Rows are passed to HORIZ_4TAP_FILT two at a time (coefficients from
 * lanes 0-1 of filter_horiz); the horizontal results are then combined with
 * FILT_4TAP_DPADD_S_H (coefficients from lanes 0-1 of filter_vert).
 * Reading starts one row above and one byte to the left of `src`.
 */
static void common_hv_4ht_4vt_4w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                     uint8_t *RESTRICT dst, int32_t dst_stride,
                                     const int8_t *filter_horiz,
                                     const int8_t *filter_vert,
                                     int32_t height) {
  uint32_t groups_left;
  v16i8 in0, in1, in2, in3, in4, in5, in6;
  v16i8 hcoef_a, hcoef_b;
  v16u8 shuf_a, shuf_b, packed;
  v8i16 h_taps, v_taps, vcoef_a, vcoef_b;
  v8i16 hz_first, hz_carry, hz_mid0, hz_new0, hz_mid1, hz_new1;
  v8i16 ev_prev, ev_mid, ev_new;
  v8i16 vsum0, vsum1;

  h_taps = LD_SH(filter_horiz);
  SPLATI_H2_SB(h_taps, 0, 1, hcoef_a, hcoef_b);

  v_taps = LD_SH(filter_vert);
  SPLATI_H2_SH(v_taps, 0, 1, vcoef_a, vcoef_b);

  /* First "4 width cases" row of the shuffle table, plus a copy with every
   * index raised by two. */
  shuf_a = LD_UB(&vp8_mc_filt_mask_arr[16]);
  shuf_b = shuf_a + 2;

  src -= src_stride + 1;
  LD_SB3(src, src_stride, in0, in1, in2);
  src += (3 * src_stride);

  /* Prime the vertical pass with the three leading rows. */
  XORI_B3_128_SB(in0, in1, in2);
  hz_first = HORIZ_4TAP_FILT(in0, in1, shuf_a, shuf_b, hcoef_a, hcoef_b);
  hz_carry = HORIZ_4TAP_FILT(in1, in2, shuf_a, shuf_b, hcoef_a, hcoef_b);
  ev_prev = (v8i16)__msa_ilvev_b((v16i8)hz_carry, (v16i8)hz_first);

  for (groups_left = (uint32_t)(height >> 2); groups_left != 0;
       --groups_left) {
    LD_SB4(src, src_stride, in3, in4, in5, in6);
    src += (4 * src_stride);

    /* First two new rows. */
    XORI_B2_128_SB(in3, in4);
    hz_new0 = HORIZ_4TAP_FILT(in3, in4, shuf_a, shuf_b, hcoef_a, hcoef_b);
    hz_mid0 = (v8i16)__msa_sldi_b((v16i8)hz_new0, (v16i8)hz_carry, 8);
    ev_mid = (v8i16)__msa_ilvev_b((v16i8)hz_new0, (v16i8)hz_mid0);
    vsum0 = FILT_4TAP_DPADD_S_H(ev_prev, ev_mid, vcoef_a, vcoef_b);

    /* Last two new rows. */
    XORI_B2_128_SB(in5, in6);
    hz_new1 = HORIZ_4TAP_FILT(in5, in6, shuf_a, shuf_b, hcoef_a, hcoef_b);
    hz_mid1 = (v8i16)__msa_sldi_b((v16i8)hz_new1, (v16i8)hz_new0, 8);
    ev_new = (v8i16)__msa_ilvev_b((v16i8)hz_new1, (v16i8)hz_mid1);
    vsum1 = FILT_4TAP_DPADD_S_H(ev_mid, ev_new, vcoef_a, vcoef_b);

    /* NOTE(review): the hz/vt-only routines in this file pass
     * VP8_FILTER_SHIFT as the rounding shift; presumably the same value as
     * the literal 7 used here - confirm. */
    SRARI_H2_SH(vsum0, vsum1, 7);
    SAT_SH2_SH(vsum0, vsum1, 7);
    packed = PCKEV_XORI128_UB(vsum0, vsum1);
    ST4x4_UB(packed, packed, 0, 1, 2, 3, dst, dst_stride);
    dst += (4 * dst_stride);

    /* State carried into the next group. */
    hz_carry = hz_new1;
    ev_prev = ev_new;
  }
}
1004 
/*
 * Two-pass filter stored with ST8x4_UB, over `height >> 2` groups of four
 * rows.  Every row goes through HORIZ_4TAP_FILT on its own (coefficients
 * from lanes 0-1 of filter_horiz); pairs of horizontal results are
 * interleaved with ilvev_b and combined by FILT_4TAP_DPADD_S_H (coefficients
 * from lanes 0-1 of filter_vert).  Reading starts one row above and one byte
 * to the left of `src`.
 *
 * The four hz_* registers are recycled inside the loop, so the statement
 * order in the loop body must be preserved.
 */
static void common_hv_4ht_4vt_8w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                     uint8_t *RESTRICT dst, int32_t dst_stride,
                                     const int8_t *filter_horiz,
                                     const int8_t *filter_vert,
                                     int32_t height) {
  uint32_t groups_left;
  v16i8 in0, in1, in2, in3, in4, in5, in6;
  v16i8 hcoef_a, hcoef_b;
  v16u8 shuf_a, shuf_b, pack01, pack23;
  v8i16 h_taps, v_taps, vcoef_a, vcoef_b;
  v8i16 hz_a, hz_b, hz_c, hz_d;
  v8i16 ev0, ev1, ev2, ev3, ev4;
  v8i16 vsum0, vsum1, vsum2, vsum3;

  h_taps = LD_SH(filter_horiz);
  SPLATI_H2_SB(h_taps, 0, 1, hcoef_a, hcoef_b);

  v_taps = LD_SH(filter_vert);
  SPLATI_H2_SH(v_taps, 0, 1, vcoef_a, vcoef_b);

  shuf_a = LD_UB(&vp8_mc_filt_mask_arr[0]);
  shuf_b = shuf_a + 2;

  src -= src_stride + 1;
  LD_SB3(src, src_stride, in0, in1, in2);
  src += (3 * src_stride);

  /* Prime the vertical pass with the three leading rows. */
  XORI_B3_128_SB(in0, in1, in2);
  hz_a = HORIZ_4TAP_FILT(in0, in0, shuf_a, shuf_b, hcoef_a, hcoef_b);
  hz_b = HORIZ_4TAP_FILT(in1, in1, shuf_a, shuf_b, hcoef_a, hcoef_b);
  hz_c = HORIZ_4TAP_FILT(in2, in2, shuf_a, shuf_b, hcoef_a, hcoef_b);
  ILVEV_B2_SH(hz_a, hz_b, hz_b, hz_c, ev0, ev2);

  groups_left = (uint32_t)(height >> 2);
  while (groups_left-- != 0) {
    LD_SB4(src, src_stride, in3, in4, in5, in6);
    src += (4 * src_stride);
    XORI_B4_128_SB(in3, in4, in5, in6);

    /* Output row 0. */
    hz_d = HORIZ_4TAP_FILT(in3, in3, shuf_a, shuf_b, hcoef_a, hcoef_b);
    ev1 = (v8i16)__msa_ilvev_b((v16i8)hz_d, (v16i8)hz_c);
    vsum0 = FILT_4TAP_DPADD_S_H(ev0, ev1, vcoef_a, vcoef_b);

    /* Output row 1. */
    hz_a = HORIZ_4TAP_FILT(in4, in4, shuf_a, shuf_b, hcoef_a, hcoef_b);
    ev3 = (v8i16)__msa_ilvev_b((v16i8)hz_a, (v16i8)hz_d);
    vsum1 = FILT_4TAP_DPADD_S_H(ev2, ev3, vcoef_a, vcoef_b);

    /* Output row 2. */
    hz_b = HORIZ_4TAP_FILT(in5, in5, shuf_a, shuf_b, hcoef_a, hcoef_b);
    ev4 = (v8i16)__msa_ilvev_b((v16i8)hz_b, (v16i8)hz_a);
    vsum2 = FILT_4TAP_DPADD_S_H(ev1, ev4, vcoef_a, vcoef_b);

    /* Output row 3; ev0/ev1 are rebuilt from the newest horizontal rows. */
    hz_c = HORIZ_4TAP_FILT(in6, in6, shuf_a, shuf_b, hcoef_a, hcoef_b);
    ILVEV_B2_SH(hz_d, hz_a, hz_b, hz_c, ev0, ev1);
    vsum3 = FILT_4TAP_DPADD_S_H(ev0, ev1, vcoef_a, vcoef_b);

    /* NOTE(review): the hz/vt-only routines in this file pass
     * VP8_FILTER_SHIFT as the rounding shift; presumably the same value as
     * the literal 7 used here - confirm. */
    SRARI_H4_SH(vsum0, vsum1, vsum2, vsum3, 7);
    SAT_SH4_SH(vsum0, vsum1, vsum2, vsum3, 7);
    pack01 = PCKEV_XORI128_UB(vsum0, vsum1);
    pack23 = PCKEV_XORI128_UB(vsum2, vsum3);
    ST8x4_UB(pack01, pack23, dst, dst_stride);
    dst += (4 * dst_stride);

    /* State carried into the next group (hz_c is carried as well). */
    ev0 = ev4;
    ev2 = ev1;
  }
}
1069 
/*
 * 16-column variant of common_hv_4ht_4vt_8w_msa(): one call at src/dst and a
 * second call 8 bytes further along each row, with the same strides, filters
 * and height.
 */
static void common_hv_4ht_4vt_16w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                      uint8_t *RESTRICT dst, int32_t dst_stride,
                                      const int8_t *filter_horiz,
                                      const int8_t *filter_vert,
                                      int32_t height) {
  common_hv_4ht_4vt_8w_msa(src, src_stride, dst, dst_stride, filter_horiz,
                           filter_vert, height);
  common_hv_4ht_4vt_8w_msa(src + 8, src_stride, dst + 8, dst_stride,
                           filter_horiz, filter_vert, height);
}
1083 
/*
 * Two-pass filter stored with ST4x4_UB, over `height >> 2` groups of four
 * rows.  Rows are passed to HORIZ_6TAP_FILT two at a time (coefficients from
 * lanes 0-2 of filter_horiz); the horizontal results are then combined with
 * FILT_4TAP_DPADD_S_H (coefficients from lanes 0-1 of filter_vert).
 * Reading starts one row above and two bytes to the left of `src`.
 */
static void common_hv_6ht_4vt_4w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                     uint8_t *RESTRICT dst, int32_t dst_stride,
                                     const int8_t *filter_horiz,
                                     const int8_t *filter_vert,
                                     int32_t height) {
  uint32_t groups_left;
  v16i8 in0, in1, in2, in3, in4, in5, in6;
  v16i8 hcoef_a, hcoef_b, hcoef_c;
  v16u8 shuf_a, shuf_b, shuf_c;
  v16u8 bytes_top, bytes_bot;
  v8i16 h_taps, v_taps, vcoef_a, vcoef_b;
  v8i16 hz_first, hz_carry, hz_mid0, hz_new0, hz_mid1, hz_new1;
  v8i16 ev_prev, ev_mid, ev_new;
  v8i16 vsum0, vsum1;

  h_taps = LD_SH(filter_horiz);
  SPLATI_H3_SB(h_taps, 0, 1, 2, hcoef_a, hcoef_b, hcoef_c);

  v_taps = LD_SH(filter_vert);
  SPLATI_H2_SH(v_taps, 0, 1, vcoef_a, vcoef_b);

  /* First "4 width cases" row of the shuffle table, plus copies with every
   * index raised by two and by four. */
  shuf_a = LD_UB(&vp8_mc_filt_mask_arr[16]);
  shuf_b = shuf_a + 2;
  shuf_c = shuf_a + 4;

  src -= src_stride + 2;
  LD_SB3(src, src_stride, in0, in1, in2);
  src += (3 * src_stride);

  /* Prime the vertical pass with the three leading rows. */
  XORI_B3_128_SB(in0, in1, in2);
  hz_first = HORIZ_6TAP_FILT(in0, in1, shuf_a, shuf_b, shuf_c, hcoef_a,
                             hcoef_b, hcoef_c);
  hz_carry = HORIZ_6TAP_FILT(in1, in2, shuf_a, shuf_b, shuf_c, hcoef_a,
                             hcoef_b, hcoef_c);
  ev_prev = (v8i16)__msa_ilvev_b((v16i8)hz_carry, (v16i8)hz_first);

  for (groups_left = (uint32_t)(height >> 2); groups_left != 0;
       --groups_left) {
    LD_SB4(src, src_stride, in3, in4, in5, in6);
    src += (4 * src_stride);
    XORI_B4_128_SB(in3, in4, in5, in6);

    /* First two new rows. */
    hz_new0 = HORIZ_6TAP_FILT(in3, in4, shuf_a, shuf_b, shuf_c, hcoef_a,
                              hcoef_b, hcoef_c);
    hz_mid0 = (v8i16)__msa_sldi_b((v16i8)hz_new0, (v16i8)hz_carry, 8);
    ev_mid = (v8i16)__msa_ilvev_b((v16i8)hz_new0, (v16i8)hz_mid0);
    vsum0 = FILT_4TAP_DPADD_S_H(ev_prev, ev_mid, vcoef_a, vcoef_b);

    /* Last two new rows. */
    hz_new1 = HORIZ_6TAP_FILT(in5, in6, shuf_a, shuf_b, shuf_c, hcoef_a,
                              hcoef_b, hcoef_c);
    hz_mid1 = (v8i16)__msa_sldi_b((v16i8)hz_new1, (v16i8)hz_new0, 8);
    ev_new = (v8i16)__msa_ilvev_b((v16i8)hz_new1, (v16i8)hz_mid1);
    vsum1 = FILT_4TAP_DPADD_S_H(ev_mid, ev_new, vcoef_a, vcoef_b);

    /* NOTE(review): the hz/vt-only routines in this file pass
     * VP8_FILTER_SHIFT as the rounding shift; presumably the same value as
     * the literal 7 used here - confirm. */
    SRARI_H2_SH(vsum0, vsum1, 7);
    SAT_SH2_SH(vsum0, vsum1, 7);

    /* Each accumulator is packed against itself; words 0 and 1 of each
     * packed vector are the ones stored. */
    PCKEV_B2_UB(vsum0, vsum0, vsum1, vsum1, bytes_top, bytes_bot);
    XORI_B2_128_UB(bytes_top, bytes_bot);
    ST4x4_UB(bytes_top, bytes_bot, 0, 1, 0, 1, dst, dst_stride);
    dst += (4 * dst_stride);

    /* State carried into the next group. */
    hz_carry = hz_new1;
    ev_prev = ev_new;
  }
}
1146 
/*
 * Two-pass filter stored with ST8x4_UB, over `height >> 2` groups of four
 * rows.  Every row goes through HORIZ_6TAP_FILT on its own (coefficients
 * from lanes 0-2 of filter_horiz); pairs of horizontal results are
 * interleaved with ilvev_b and combined by FILT_4TAP_DPADD_S_H (coefficients
 * from lanes 0-1 of filter_vert).  Reading starts one row above and two
 * bytes to the left of `src`.
 *
 * hz_* and ev* registers are recycled inside the loop, so the statement order
 * in the loop body must be preserved; ev0, ev2 and hz_c hold the state that
 * is carried from one group to the next.
 */
static void common_hv_6ht_4vt_8w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                     uint8_t *RESTRICT dst, int32_t dst_stride,
                                     const int8_t *filter_horiz,
                                     const int8_t *filter_vert,
                                     int32_t height) {
  uint32_t groups_left;
  v16i8 in0, in1, in2, in3, in4, in5, in6;
  v16i8 hcoef_a, hcoef_b, hcoef_c;
  v16i8 shuf_a, shuf_b, shuf_c;
  v8i16 h_taps, v_taps, vcoef_a, vcoef_b;
  v8i16 hz_a, hz_b, hz_c, hz_d;
  v8i16 ev0, ev1, ev2, ev3;
  v8i16 vsum0, vsum1, vsum2, vsum3;
  v16u8 pack01, pack23;

  h_taps = LD_SH(filter_horiz);
  SPLATI_H3_SB(h_taps, 0, 1, 2, hcoef_a, hcoef_b, hcoef_c);

  v_taps = LD_SH(filter_vert);
  SPLATI_H2_SH(v_taps, 0, 1, vcoef_a, vcoef_b);

  shuf_a = LD_SB(&vp8_mc_filt_mask_arr[0]);
  shuf_b = shuf_a + 2;
  shuf_c = shuf_a + 4;

  src -= src_stride + 2;
  LD_SB3(src, src_stride, in0, in1, in2);
  src += (3 * src_stride);

  /* Prime the vertical pass with the three leading rows. */
  XORI_B3_128_SB(in0, in1, in2);
  hz_a = HORIZ_6TAP_FILT(in0, in0, shuf_a, shuf_b, shuf_c, hcoef_a, hcoef_b,
                         hcoef_c);
  hz_b = HORIZ_6TAP_FILT(in1, in1, shuf_a, shuf_b, shuf_c, hcoef_a, hcoef_b,
                         hcoef_c);
  hz_c = HORIZ_6TAP_FILT(in2, in2, shuf_a, shuf_b, shuf_c, hcoef_a, hcoef_b,
                         hcoef_c);
  ILVEV_B2_SH(hz_a, hz_b, hz_b, hz_c, ev0, ev2);

  groups_left = (uint32_t)(height >> 2);
  while (groups_left-- != 0) {
    LD_SB4(src, src_stride, in3, in4, in5, in6);
    src += (4 * src_stride);
    XORI_B4_128_SB(in3, in4, in5, in6);

    /* Output row 0. */
    hz_d = HORIZ_6TAP_FILT(in3, in3, shuf_a, shuf_b, shuf_c, hcoef_a, hcoef_b,
                           hcoef_c);
    ev1 = (v8i16)__msa_ilvev_b((v16i8)hz_d, (v16i8)hz_c);
    vsum0 = FILT_4TAP_DPADD_S_H(ev0, ev1, vcoef_a, vcoef_b);

    /* Output row 1. */
    hz_a = HORIZ_6TAP_FILT(in4, in4, shuf_a, shuf_b, shuf_c, hcoef_a, hcoef_b,
                           hcoef_c);
    ev3 = (v8i16)__msa_ilvev_b((v16i8)hz_a, (v16i8)hz_d);
    vsum1 = FILT_4TAP_DPADD_S_H(ev2, ev3, vcoef_a, vcoef_b);

    /* Output row 2; ev0 is overwritten here and stays live for the next
     * group. */
    hz_b = HORIZ_6TAP_FILT(in5, in5, shuf_a, shuf_b, shuf_c, hcoef_a, hcoef_b,
                           hcoef_c);
    ev0 = (v8i16)__msa_ilvev_b((v16i8)hz_b, (v16i8)hz_a);
    vsum2 = FILT_4TAP_DPADD_S_H(ev1, ev0, vcoef_a, vcoef_b);

    /* Output row 3; ev2 is overwritten here and stays live for the next
     * group. */
    hz_c = HORIZ_6TAP_FILT(in6, in6, shuf_a, shuf_b, shuf_c, hcoef_a, hcoef_b,
                           hcoef_c);
    ILVEV_B2_SH(hz_d, hz_a, hz_b, hz_c, ev1, ev2);
    vsum3 = FILT_4TAP_DPADD_S_H(ev1, ev2, vcoef_a, vcoef_b);

    /* NOTE(review): the hz/vt-only routines in this file pass
     * VP8_FILTER_SHIFT as the rounding shift; presumably the same value as
     * the literal 7 used here - confirm. */
    SRARI_H4_SH(vsum0, vsum1, vsum2, vsum3, 7);
    SAT_SH4_SH(vsum0, vsum1, vsum2, vsum3, 7);
    pack01 = PCKEV_XORI128_UB(vsum0, vsum1);
    pack23 = PCKEV_XORI128_UB(vsum2, vsum3);
    ST8x4_UB(pack01, pack23, dst, dst_stride);
    dst += (4 * dst_stride);
  }
}
1217 
/*
 * 16-column variant of common_hv_6ht_4vt_8w_msa(): the 8-column routine runs
 * twice, with src and dst both advanced by 8 bytes between the two calls.
 */
static void common_hv_6ht_4vt_16w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                      uint8_t *RESTRICT dst, int32_t dst_stride,
                                      const int8_t *filter_horiz,
                                      const int8_t *filter_vert,
                                      int32_t height) {
  int32_t halves_left = 2;

  while (halves_left-- != 0) {
    common_hv_6ht_4vt_8w_msa(src, src_stride, dst, dst_stride, filter_horiz,
                             filter_vert, height);
    src += 8;
    dst += 8;
  }
}
1231 
/*
 * Two-pass filter stored with ST4x4_UB, over `height >> 2` groups of four
 * rows.  Rows are passed to HORIZ_4TAP_FILT two at a time (coefficients from
 * lanes 0-1 of filter_horiz); the horizontal results are then combined with
 * DPADD_SH3_SH (coefficients from lanes 0-2 of filter_vert).
 * Reading starts two rows above and one byte to the left of `src`.
 */
static void common_hv_4ht_6vt_4w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                     uint8_t *RESTRICT dst, int32_t dst_stride,
                                     const int8_t *filter_horiz,
                                     const int8_t *filter_vert,
                                     int32_t height) {
  uint32_t groups_left;
  v16i8 in0, in1, in2, in3, in4, in5, in6, in7, in8;
  v16i8 hcoef_a, hcoef_b;
  v16i8 shuf_a, shuf_b;
  v16u8 packed;
  v8i16 h_taps, v_taps, vcoef_a, vcoef_b, vcoef_c;
  v8i16 hz_p0, hz_p1, hz_p2, hz_carry;
  v8i16 hz_mid0, hz_new0, hz_mid1, hz_new1;
  v8i16 ev_a, ev_b, ev_c, ev_d;
  v8i16 vsum0, vsum1;

  h_taps = LD_SH(filter_horiz);
  SPLATI_H2_SB(h_taps, 0, 1, hcoef_a, hcoef_b);

  v_taps = LD_SH(filter_vert);
  SPLATI_H3_SH(v_taps, 0, 1, 2, vcoef_a, vcoef_b, vcoef_c);

  /* First "4 width cases" row of the shuffle table, plus a copy with every
   * index raised by two. */
  shuf_a = LD_SB(&vp8_mc_filt_mask_arr[16]);
  shuf_b = shuf_a + 2;

  src -= 2 * src_stride + 1;
  LD_SB5(src, src_stride, in0, in1, in2, in3, in4);
  src += (5 * src_stride);

  /* Prime the vertical pass with the five leading rows. */
  XORI_B5_128_SB(in0, in1, in2, in3, in4);
  hz_p0 = HORIZ_4TAP_FILT(in0, in1, shuf_a, shuf_b, hcoef_a, hcoef_b);
  hz_p2 = HORIZ_4TAP_FILT(in2, in3, shuf_a, shuf_b, hcoef_a, hcoef_b);
  hz_carry = HORIZ_4TAP_FILT(in3, in4, shuf_a, shuf_b, hcoef_a, hcoef_b);
  hz_p1 = (v8i16)__msa_sldi_b((v16i8)hz_p2, (v16i8)hz_p0, 8);
  ILVEV_B2_SH(hz_p0, hz_p1, hz_p2, hz_carry, ev_a, ev_b);

  for (groups_left = (uint32_t)(height >> 2); groups_left != 0;
       --groups_left) {
    LD_SB4(src, src_stride, in5, in6, in7, in8);
    src += (4 * src_stride);
    XORI_B4_128_SB(in5, in6, in7, in8);

    /* First two new rows. */
    hz_new0 = HORIZ_4TAP_FILT(in5, in6, shuf_a, shuf_b, hcoef_a, hcoef_b);
    hz_mid0 = (v8i16)__msa_sldi_b((v16i8)hz_new0, (v16i8)hz_carry, 8);
    ev_c = (v8i16)__msa_ilvev_b((v16i8)hz_new0, (v16i8)hz_mid0);
    vsum0 = DPADD_SH3_SH(ev_a, ev_b, ev_c, vcoef_a, vcoef_b, vcoef_c);

    /* Last two new rows. */
    hz_new1 = HORIZ_4TAP_FILT(in7, in8, shuf_a, shuf_b, hcoef_a, hcoef_b);
    hz_mid1 = (v8i16)__msa_sldi_b((v16i8)hz_new1, (v16i8)hz_new0, 8);
    ev_d = (v8i16)__msa_ilvev_b((v16i8)hz_new1, (v16i8)hz_mid1);
    vsum1 = DPADD_SH3_SH(ev_b, ev_c, ev_d, vcoef_a, vcoef_b, vcoef_c);

    /* NOTE(review): the hz/vt-only routines in this file pass
     * VP8_FILTER_SHIFT as the rounding shift; presumably the same value as
     * the literal 7 used here - confirm. */
    SRARI_H2_SH(vsum0, vsum1, 7);
    SAT_SH2_SH(vsum0, vsum1, 7);
    packed = PCKEV_XORI128_UB(vsum0, vsum1);
    ST4x4_UB(packed, packed, 0, 1, 2, 3, dst, dst_stride);
    dst += (4 * dst_stride);

    /* State carried into the next group. */
    hz_carry = hz_new1;
    ev_a = ev_c;
    ev_b = ev_d;
  }
}
1293 
/*
 * 2-D sub-pixel filter for an 8-pixel-wide block: a 4-tap horizontal pass
 * (HORIZ_4TAP_FILT) feeding a 6-tap vertical pass (three splatted tap pairs
 * combined by DPADD_SH3_SH).
 *
 * src, src_stride  source block and its row pitch. Reading starts one column
 *                  to the left of and two rows above src, and covers
 *                  5 + 4 * (height >> 2) rows in total.
 * dst, dst_stride  destination block and its row pitch.
 * filter_horiz     horizontal taps; halfword lanes 0 and 1 of the loaded
 *                  vector are broadcast.
 * filter_vert      vertical taps; halfword lanes 0, 1 and 2 of the loaded
 *                  vector are broadcast.
 * height           number of rows to produce. Rows are emitted four at a
 *                  time (height >> 2 iterations), so a remainder below four
 *                  is not written.
 */
static void common_hv_4ht_6vt_8w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                     uint8_t *RESTRICT dst, int32_t dst_stride,
                                     const int8_t *filter_horiz,
                                     const int8_t *filter_vert,
                                     int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
  v16i8 filt_hz0, filt_hz1, mask0, mask1;
  v8i16 filt, filt_vt0, filt_vt1, filt_vt2, tmp0, tmp1, tmp2, tmp3;
  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
  v8i16 hz_out7, hz_out8, out0, out1, out2, out3, out4, out5, out6, out7;
  v16u8 vec0, vec1;

  /* First 16 entries of the shuffle table: the "8 width" pattern. */
  mask0 = LD_SB(&vp8_mc_filt_mask_arr[0]);
  /* Step back one column and two rows so the taps have their context. */
  src -= (1 + 2 * src_stride);

  filt = LD_SH(filter_horiz);
  SPLATI_H2_SB(filt, 0, 1, filt_hz0, filt_hz1);

  /* Second shuffle pattern: the same pairs, shifted by two byte positions. */
  mask1 = mask0 + 2;

  /* Prime the vertical filter with the five rows that precede the first
   * row loaded inside the loop. */
  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
  src += (5 * src_stride);

  /* XOR every byte with 128 - presumably to move the pixels into signed
   * range for the signed dot products (see vp8_macros_msa.h). */
  XORI_B5_128_SB(src0, src1, src2, src3, src4);
  hz_out0 = HORIZ_4TAP_FILT(src0, src0, mask0, mask1, filt_hz0, filt_hz1);
  hz_out1 = HORIZ_4TAP_FILT(src1, src1, mask0, mask1, filt_hz0, filt_hz1);
  hz_out2 = HORIZ_4TAP_FILT(src2, src2, mask0, mask1, filt_hz0, filt_hz1);
  hz_out3 = HORIZ_4TAP_FILT(src3, src3, mask0, mask1, filt_hz0, filt_hz1);
  hz_out4 = HORIZ_4TAP_FILT(src4, src4, mask0, mask1, filt_hz0, filt_hz1);
  /* Interleave the even bytes of consecutive rows into row pairs:
   * out0 = rows (0,1), out1 = rows (2,3) feed the even output rows;
   * out3 = rows (1,2), out4 = rows (3,4) feed the odd output rows. */
  ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
  ILVEV_B2_SH(hz_out1, hz_out2, hz_out3, hz_out4, out3, out4);

  filt = LD_SH(filter_vert);
  SPLATI_H3_SH(filt, 0, 1, 2, filt_vt0, filt_vt1, filt_vt2);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    /* Four new source rows yield four output rows per iteration. */
    LD_SB4(src, src_stride, src5, src6, src7, src8);
    src += (4 * src_stride);

    XORI_B4_128_SB(src5, src6, src7, src8);

    /* Output row 0: row pairs (0,1) (2,3) (4,5). */
    hz_out5 = HORIZ_4TAP_FILT(src5, src5, mask0, mask1, filt_hz0, filt_hz1);
    out2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4);
    tmp0 = DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);

    /* Output row 1: row pairs (1,2) (3,4) (5,6). */
    hz_out6 = HORIZ_4TAP_FILT(src6, src6, mask0, mask1, filt_hz0, filt_hz1);
    out5 = (v8i16)__msa_ilvev_b((v16i8)hz_out6, (v16i8)hz_out5);
    tmp1 = DPADD_SH3_SH(out3, out4, out5, filt_vt0, filt_vt1, filt_vt2);

    /* Output row 2: row pairs (2,3) (4,5) (6,7). */
    hz_out7 = HORIZ_4TAP_FILT(src7, src7, mask0, mask1, filt_hz0, filt_hz1);
    out6 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
    tmp2 = DPADD_SH3_SH(out1, out2, out6, filt_vt0, filt_vt1, filt_vt2);

    /* Output row 3: row pairs (3,4) (5,6) (7,8). */
    hz_out8 = HORIZ_4TAP_FILT(src8, src8, mask0, mask1, filt_hz0, filt_hz1);
    out7 = (v8i16)__msa_ilvev_b((v16i8)hz_out8, (v16i8)hz_out7);
    tmp3 = DPADD_SH3_SH(out4, out5, out7, filt_vt0, filt_vt1, filt_vt2);

    /* Scale the sums down by 7 bits and saturate (macro semantics per
     * vp8_macros_msa.h), then pack to bytes, undo the XOR-128 bias and
     * store an 8x4 tile. */
    SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
    SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
    vec0 = PCKEV_XORI128_UB(tmp0, tmp1);
    vec1 = PCKEV_XORI128_UB(tmp2, tmp3);
    ST8x4_UB(vec0, vec1, dst, dst_stride);
    dst += (4 * dst_stride);

    /* Slide the window down four rows: the last horizontal result and the
     * two newest row pairs of each parity become the history for the next
     * iteration. */
    hz_out4 = hz_out8;
    out0 = out2;
    out1 = out6;
    out3 = out5;
    out4 = out7;
  }
}
1366 
/*
 * 16-pixel-wide variant of the 4-tap-horizontal / 6-tap-vertical filter.
 * The block is processed as two independent 8-wide halves, at column
 * offsets 0 and 8, each handled by common_hv_4ht_6vt_8w_msa with the same
 * strides, filters and height.
 */
static void common_hv_4ht_6vt_16w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                      uint8_t *RESTRICT dst, int32_t dst_stride,
                                      const int8_t *filter_horiz,
                                      const int8_t *filter_vert,
                                      int32_t height) {
  int32_t col;

  for (col = 0; col < 16; col += 8) {
    common_hv_4ht_6vt_8w_msa(src + col, src_stride, dst + col, dst_stride,
                             filter_horiz, filter_vert, height);
  }
}
1380 
/*
 * Sub-pixel prediction of a 4x4 block.
 *
 * xoffset and yoffset select the horizontal and vertical filter: offset k
 * (1..7) uses row k - 1 of vp8_subpel_filters_msa, and 0 means no filtering
 * in that direction. Even offsets go to the 6-tap routines. Odd offsets map
 * to rows whose first and last taps are zero, so they go to the 4-tap
 * routines with the filter pointer advanced past the leading zero. With both
 * offsets zero the block is copied unchanged. Offsets outside 0..7 match no
 * case and leave dst untouched.
 */
void vp8_sixtap_predict4x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
                               int32_t xoffset, int32_t yoffset,
                               uint8_t *RESTRICT dst, int32_t dst_stride) {
  /* A zero offset never reads its filter, so the lookup is pinned to row 0
   * in that case instead of subscripting the table with -1, which would be
   * an out-of-bounds access even without a dereference. */
  const int8_t *h_filter = vp8_subpel_filters_msa[xoffset ? xoffset - 1 : 0];
  const int8_t *v_filter = vp8_subpel_filters_msa[yoffset ? yoffset - 1 : 0];

  if (yoffset) {
    if (xoffset) {
      /* Both directions filtered: choose the tap count per direction. */
      switch (xoffset) {
        case 2:
        case 4:
        case 6:
          switch (yoffset) {
            case 2:
            case 4:
            case 6:
              common_hv_6ht_6vt_4w_msa(src, src_stride, dst, dst_stride,
                                       h_filter, v_filter, 4);
              break;

            case 1:
            case 3:
            case 5:
            case 7:
              common_hv_6ht_4vt_4w_msa(src, src_stride, dst, dst_stride,
                                       h_filter, v_filter + 1, 4);
              break;
          }
          break;

        case 1:
        case 3:
        case 5:
        case 7:
          switch (yoffset) {
            case 2:
            case 4:
            case 6:
              common_hv_4ht_6vt_4w_msa(src, src_stride, dst, dst_stride,
                                       h_filter + 1, v_filter, 4);
              break;

            case 1:
            case 3:
            case 5:
            case 7:
              common_hv_4ht_4vt_4w_msa(src, src_stride, dst, dst_stride,
                                       h_filter + 1, v_filter + 1, 4);
              break;
          }
          break;
      }
    } else {
      /* Vertical filtering only. */
      switch (yoffset) {
        case 2:
        case 4:
        case 6:
          common_vt_6t_4w_msa(src, src_stride, dst, dst_stride, v_filter, 4);
          break;

        case 1:
        case 3:
        case 5:
        case 7:
          common_vt_4t_4w_msa(src, src_stride, dst, dst_stride, v_filter + 1,
                              4);
          break;
      }
    }
  } else {
    /* Horizontal filtering only, or a plain copy when xoffset is 0 too. */
    switch (xoffset) {
      case 0: {
        uint32_t tp0, tp1, tp2, tp3;

        /* Four 32-bit words, one per row, move the whole 4x4 block. */
        LW4(src, src_stride, tp0, tp1, tp2, tp3);
        SW4(tp0, tp1, tp2, tp3, dst, dst_stride);
        break;
      }
      case 2:
      case 4:
      case 6:
        common_hz_6t_4w_msa(src, src_stride, dst, dst_stride, h_filter, 4);
        break;

      case 1:
      case 3:
      case 5:
      case 7:
        common_hz_4t_4w_msa(src, src_stride, dst, dst_stride, h_filter + 1, 4);
        break;
    }
  }
}
1474 
/*
 * Sub-pixel prediction of an 8-wide, 4-high block.
 *
 * xoffset and yoffset select the horizontal and vertical filter: offset k
 * (1..7) uses row k - 1 of vp8_subpel_filters_msa, and 0 means no filtering
 * in that direction. Even offsets go to the 6-tap routines. Odd offsets map
 * to rows whose first and last taps are zero, so they go to the 4-tap
 * routines with the filter pointer advanced past the leading zero. With both
 * offsets zero the block is copied by vp8_copy_mem8x4. Offsets outside 0..7
 * match no case and leave dst untouched.
 */
void vp8_sixtap_predict8x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
                               int32_t xoffset, int32_t yoffset,
                               uint8_t *RESTRICT dst, int32_t dst_stride) {
  /* A zero offset never reads its filter, so the lookup is pinned to row 0
   * in that case instead of subscripting the table with -1, which would be
   * an out-of-bounds access even without a dereference. */
  const int8_t *h_filter = vp8_subpel_filters_msa[xoffset ? xoffset - 1 : 0];
  const int8_t *v_filter = vp8_subpel_filters_msa[yoffset ? yoffset - 1 : 0];

  if (yoffset) {
    if (xoffset) {
      /* Both directions filtered: choose the tap count per direction. */
      switch (xoffset) {
        case 2:
        case 4:
        case 6:
          switch (yoffset) {
            case 2:
            case 4:
            case 6:
              common_hv_6ht_6vt_8w_msa(src, src_stride, dst, dst_stride,
                                       h_filter, v_filter, 4);
              break;

            case 1:
            case 3:
            case 5:
            case 7:
              common_hv_6ht_4vt_8w_msa(src, src_stride, dst, dst_stride,
                                       h_filter, v_filter + 1, 4);
              break;
          }
          break;

        case 1:
        case 3:
        case 5:
        case 7:
          switch (yoffset) {
            case 2:
            case 4:
            case 6:
              common_hv_4ht_6vt_8w_msa(src, src_stride, dst, dst_stride,
                                       h_filter + 1, v_filter, 4);
              break;

            case 1:
            case 3:
            case 5:
            case 7:
              common_hv_4ht_4vt_8w_msa(src, src_stride, dst, dst_stride,
                                       h_filter + 1, v_filter + 1, 4);
              break;
          }
          break;
      }
    } else {
      /* Vertical filtering only. */
      switch (yoffset) {
        case 2:
        case 4:
        case 6:
          common_vt_6t_8w_msa(src, src_stride, dst, dst_stride, v_filter, 4);
          break;

        case 1:
        case 3:
        case 5:
        case 7:
          common_vt_4t_8w_msa(src, src_stride, dst, dst_stride, v_filter + 1,
                              4);
          break;
      }
    }
  } else {
    /* Horizontal filtering only, or a plain copy when xoffset is 0 too. */
    switch (xoffset) {
      case 0: vp8_copy_mem8x4(src, src_stride, dst, dst_stride); break;
      case 2:
      case 4:
      case 6:
        common_hz_6t_8w_msa(src, src_stride, dst, dst_stride, h_filter, 4);
        break;

      case 1:
      case 3:
      case 5:
      case 7:
        common_hz_4t_8w_msa(src, src_stride, dst, dst_stride, h_filter + 1, 4);
        break;
    }
  }
}
1562 
/*
 * Sub-pixel prediction of an 8x8 block.
 *
 * xoffset and yoffset select the horizontal and vertical filter: offset k
 * (1..7) uses row k - 1 of vp8_subpel_filters_msa, and 0 means no filtering
 * in that direction. Even offsets go to the 6-tap routines. Odd offsets map
 * to rows whose first and last taps are zero, so they go to the 4-tap
 * routines with the filter pointer advanced past the leading zero. With both
 * offsets zero the block is copied by vp8_copy_mem8x8. Offsets outside 0..7
 * match no case and leave dst untouched.
 */
void vp8_sixtap_predict8x8_msa(uint8_t *RESTRICT src, int32_t src_stride,
                               int32_t xoffset, int32_t yoffset,
                               uint8_t *RESTRICT dst, int32_t dst_stride) {
  /* A zero offset never reads its filter, so the lookup is pinned to row 0
   * in that case instead of subscripting the table with -1, which would be
   * an out-of-bounds access even without a dereference. */
  const int8_t *h_filter = vp8_subpel_filters_msa[xoffset ? xoffset - 1 : 0];
  const int8_t *v_filter = vp8_subpel_filters_msa[yoffset ? yoffset - 1 : 0];

  if (yoffset) {
    if (xoffset) {
      /* Both directions filtered: choose the tap count per direction. */
      switch (xoffset) {
        case 2:
        case 4:
        case 6:
          switch (yoffset) {
            case 2:
            case 4:
            case 6:
              common_hv_6ht_6vt_8w_msa(src, src_stride, dst, dst_stride,
                                       h_filter, v_filter, 8);
              break;

            case 1:
            case 3:
            case 5:
            case 7:
              common_hv_6ht_4vt_8w_msa(src, src_stride, dst, dst_stride,
                                       h_filter, v_filter + 1, 8);
              break;
          }
          break;

        case 1:
        case 3:
        case 5:
        case 7:
          switch (yoffset) {
            case 2:
            case 4:
            case 6:
              common_hv_4ht_6vt_8w_msa(src, src_stride, dst, dst_stride,
                                       h_filter + 1, v_filter, 8);
              break;

            case 1:
            case 3:
            case 5:
            case 7:
              common_hv_4ht_4vt_8w_msa(src, src_stride, dst, dst_stride,
                                       h_filter + 1, v_filter + 1, 8);
              break;
          }
          break;
      }
    } else {
      /* Vertical filtering only. */
      switch (yoffset) {
        case 2:
        case 4:
        case 6:
          common_vt_6t_8w_msa(src, src_stride, dst, dst_stride, v_filter, 8);
          break;

        case 1:
        case 3:
        case 5:
        case 7:
          common_vt_4t_8w_msa(src, src_stride, dst, dst_stride, v_filter + 1,
                              8);
          break;
      }
    }
  } else {
    /* Horizontal filtering only, or a plain copy when xoffset is 0 too. */
    switch (xoffset) {
      case 0: vp8_copy_mem8x8(src, src_stride, dst, dst_stride); break;
      case 2:
      case 4:
      case 6:
        common_hz_6t_8w_msa(src, src_stride, dst, dst_stride, h_filter, 8);
        break;

      case 1:
      case 3:
      case 5:
      case 7:
        common_hz_4t_8w_msa(src, src_stride, dst, dst_stride, h_filter + 1, 8);
        break;
    }
  }
}
1650 
/*
 * Sub-pixel prediction of a 16x16 block.
 *
 * xoffset and yoffset select the horizontal and vertical filter: offset k
 * (1..7) uses row k - 1 of vp8_subpel_filters_msa, and 0 means no filtering
 * in that direction. Even offsets go to the 6-tap routines. Odd offsets map
 * to rows whose first and last taps are zero, so they go to the 4-tap
 * routines with the filter pointer advanced past the leading zero. With both
 * offsets zero the block is copied by vp8_copy_mem16x16. Offsets outside
 * 0..7 match no case and leave dst untouched.
 */
void vp8_sixtap_predict16x16_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                 int32_t xoffset, int32_t yoffset,
                                 uint8_t *RESTRICT dst, int32_t dst_stride) {
  /* A zero offset never reads its filter, so the lookup is pinned to row 0
   * in that case instead of subscripting the table with -1, which would be
   * an out-of-bounds access even without a dereference. */
  const int8_t *h_filter = vp8_subpel_filters_msa[xoffset ? xoffset - 1 : 0];
  const int8_t *v_filter = vp8_subpel_filters_msa[yoffset ? yoffset - 1 : 0];

  if (yoffset) {
    if (xoffset) {
      /* Both directions filtered: choose the tap count per direction. */
      switch (xoffset) {
        case 2:
        case 4:
        case 6:
          switch (yoffset) {
            case 2:
            case 4:
            case 6:
              common_hv_6ht_6vt_16w_msa(src, src_stride, dst, dst_stride,
                                        h_filter, v_filter, 16);
              break;

            case 1:
            case 3:
            case 5:
            case 7:
              common_hv_6ht_4vt_16w_msa(src, src_stride, dst, dst_stride,
                                        h_filter, v_filter + 1, 16);
              break;
          }
          break;

        case 1:
        case 3:
        case 5:
        case 7:
          switch (yoffset) {
            case 2:
            case 4:
            case 6:
              common_hv_4ht_6vt_16w_msa(src, src_stride, dst, dst_stride,
                                        h_filter + 1, v_filter, 16);
              break;

            case 1:
            case 3:
            case 5:
            case 7:
              common_hv_4ht_4vt_16w_msa(src, src_stride, dst, dst_stride,
                                        h_filter + 1, v_filter + 1, 16);
              break;
          }
          break;
      }
    } else {
      /* Vertical filtering only. */
      switch (yoffset) {
        case 2:
        case 4:
        case 6:
          common_vt_6t_16w_msa(src, src_stride, dst, dst_stride, v_filter, 16);
          break;

        case 1:
        case 3:
        case 5:
        case 7:
          common_vt_4t_16w_msa(src, src_stride, dst, dst_stride, v_filter + 1,
                               16);
          break;
      }
    }
  } else {
    /* Horizontal filtering only, or a plain copy when xoffset is 0 too. */
    switch (xoffset) {
      case 0: vp8_copy_mem16x16(src, src_stride, dst, dst_stride); break;
      case 2:
      case 4:
      case 6:
        common_hz_6t_16w_msa(src, src_stride, dst, dst_stride, h_filter, 16);
        break;

      case 1:
      case 3:
      case 5:
      case 7:
        common_hz_4t_16w_msa(src, src_stride, dst, dst_stride, h_filter + 1,
                             16);
        break;
    }
  }
}
1739