/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/macros_msa.h"

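// Accumulate the squared src/ref byte differences of one 16-byte vector
// pair into the v4i32 accumulator 'var': interleave src with ref, take
// horizontal byte differences, then dot-product-accumulate.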
#define CALC_MSE_B(src, ref, var)                                   \
  {                                                                 \
    v16u8 src_l0_m, src_l1_m;                                       \
    v8i16 res_l0_m, res_l1_m;                                       \
                                                                    \
    ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m);                      \
    HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m);            \
    DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \
  }

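// As CALC_MSE_B, but also accumulate the signed differences into the
// v8i16 accumulator 'sub' so the caller can derive the block sum.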
#define CALC_MSE_AVG_B(src, ref, var, sub)                          \
  {                                                                 \
    v16u8 src_l0_m, src_l1_m;                                       \
    v8i16 res_l0_m, res_l1_m;                                       \
                                                                    \
    ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m);                      \
    HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m);            \
    DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \
                                                                    \
    sub += res_l0_m + res_l1_m;                                     \
  }

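// variance = sse - (sum * sum) / (width * height), with 'shift' equal to
// log2(width * height). Blocks of 512 or more pixels use the 64-bit form
// because sum * sum can exceed 32 bits there.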
#define VARIANCE_WxH(sse, diff, shift) \
  (sse) - (((uint32_t)(diff) * (diff)) >> (shift))

#define VARIANCE_LARGE_WxH(sse, diff, shift) \
  (sse) - (((int64_t)(diff) * (diff)) >> (shift))

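// SSE plus sum of differences for 4-pixel-wide blocks: four rows are
// loaded as 32-bit words and packed into a single vector per iteration.
// 'height' must be a multiple of 4.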
static uint32_t sse_diff_4width_msa(const uint8_t *src_ptr, int32_t src_stride,
                                    const uint8_t *ref_ptr, int32_t ref_stride,
                                    int32_t height, int32_t *diff) {
  uint32_t src0, src1, src2, src3;
  uint32_t ref0, ref1, ref2, ref3;
  int32_t ht_cnt;
  v16u8 src = { 0 };
  v16u8 ref = { 0 };
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LW4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);

    INSERT_W4_UB(src0, src1, src2, src3, src);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    CALC_MSE_AVG_B(src, ref, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

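// 8-pixel-wide variant: pairs of 8-byte rows are packed into full
// 16-byte vectors before accumulation.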
static uint32_t sse_diff_8width_msa(const uint8_t *src_ptr, int32_t src_stride,
                                    const uint8_t *ref_ptr, int32_t ref_stride,
                                    int32_t height, int32_t *diff) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);

    PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1,
                ref0, ref1);
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

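// 16-pixel-wide variant: one full vector per row, four rows per loop
// iteration.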
static uint32_t sse_diff_16width_msa(const uint8_t *src_ptr, int32_t src_stride,
                                     const uint8_t *ref_ptr, int32_t ref_stride,
                                     int32_t height, int32_t *diff) {
  int32_t ht_cnt;
  v16u8 src, ref;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref = LD_UB(ref_ptr);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src, ref, var, avg);

    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref = LD_UB(ref_ptr);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src, ref, var, avg);

    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref = LD_UB(ref_ptr);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src, ref, var, avg);

    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref = LD_UB(ref_ptr);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src, ref, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

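// 32-pixel-wide variant: two vectors per row, four rows per loop
// iteration.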
static uint32_t sse_diff_32width_msa(const uint8_t *src_ptr, int32_t src_stride,
                                     const uint8_t *ref_ptr, int32_t ref_stride,
                                     int32_t height, int32_t *diff) {
  int32_t ht_cnt;
  v16u8 src0, src1, ref0, ref1;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);

    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);

    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);

    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

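// 32x64 blocks: two 16-bit sum accumulators keep the per-lane difference
// sums within int16 range over the 64 rows.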
static uint32_t sse_diff_32x64_msa(const uint8_t *src_ptr, int32_t src_stride,
                                   const uint8_t *ref_ptr, int32_t ref_stride,
                                   int32_t *diff) {
  int32_t ht_cnt;
  v16u8 src0, src1, ref0, ref1;
  v8i16 avg0 = { 0 };
  v8i16 avg1 = { 0 };
  v4i32 vec, var = { 0 };

  for (ht_cnt = 16; ht_cnt--;) {
    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);

    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);

    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);

    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);
  }

  vec = __msa_hadd_s_w(avg0, avg0);
  vec += __msa_hadd_s_w(avg1, avg1);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

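// 64x32 blocks: the four vectors per row are split across two sum
// accumulators to avoid int16 overflow.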
static uint32_t sse_diff_64x32_msa(const uint8_t *src_ptr, int32_t src_stride,
                                   const uint8_t *ref_ptr, int32_t ref_stride,
                                   int32_t *diff) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v8i16 avg0 = { 0 };
  v8i16 avg1 = { 0 };
  v4i32 vec, var = { 0 };

  for (ht_cnt = 16; ht_cnt--;) {
    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
    src_ptr += src_stride;
    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src2, ref2, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);
    CALC_MSE_AVG_B(src3, ref3, var, avg1);

    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
    src_ptr += src_stride;
    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src2, ref2, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);
    CALC_MSE_AVG_B(src3, ref3, var, avg1);
  }

  vec = __msa_hadd_s_w(avg0, avg0);
  vec += __msa_hadd_s_w(avg1, avg1);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

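// 64x64 blocks: one 16-bit sum accumulator per 16-byte column group.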
static uint32_t sse_diff_64x64_msa(const uint8_t *src_ptr, int32_t src_stride,
                                   const uint8_t *ref_ptr, int32_t ref_stride,
                                   int32_t *diff) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v8i16 avg0 = { 0 };
  v8i16 avg1 = { 0 };
  v8i16 avg2 = { 0 };
  v8i16 avg3 = { 0 };
  v4i32 vec, var = { 0 };

  for (ht_cnt = 32; ht_cnt--;) {
    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
    src_ptr += src_stride;
    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
    ref_ptr += ref_stride;

    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);
    CALC_MSE_AVG_B(src2, ref2, var, avg2);
    CALC_MSE_AVG_B(src3, ref3, var, avg3);
    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
    src_ptr += src_stride;
    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);
    CALC_MSE_AVG_B(src2, ref2, var, avg2);
    CALC_MSE_AVG_B(src3, ref3, var, avg3);
  }

  vec = __msa_hadd_s_w(avg0, avg0);
  vec += __msa_hadd_s_w(avg1, avg1);
  vec += __msa_hadd_s_w(avg2, avg2);
  vec += __msa_hadd_s_w(avg3, avg3);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

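// Sum of squares of 256 contiguous int16 values (a 16x16 block),
// accumulated in 64-bit lanes and reduced at the end.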
static uint32_t get_mb_ss_msa(const int16_t *src) {
  uint32_t sum, cnt;
  v8i16 src0, src1, src2, src3;
  v4i32 src0_l, src1_l, src2_l, src3_l;
  v4i32 src0_r, src1_r, src2_r, src3_r;
  v2i64 sq_src_l = { 0 };
  v2i64 sq_src_r = { 0 };

  for (cnt = 8; cnt--;) {
    LD_SH4(src, 8, src0, src1, src2, src3);
    src += 4 * 8;

    UNPCK_SH_SW(src0, src0_l, src0_r);
    UNPCK_SH_SW(src1, src1_l, src1_r);
    UNPCK_SH_SW(src2, src2_l, src2_r);
    UNPCK_SH_SW(src3, src3_l, src3_r);

    DPADD_SD2_SD(src0_l, src0_r, sq_src_l, sq_src_r);
    DPADD_SD2_SD(src1_l, src1_r, sq_src_l, sq_src_r);
    DPADD_SD2_SD(src2_l, src2_r, sq_src_l, sq_src_r);
    DPADD_SD2_SD(src3_l, src3_r, sq_src_l, sq_src_r);
  }

  sq_src_l += __msa_splati_d(sq_src_l, 1);
  sq_src_r += __msa_splati_d(sq_src_r, 1);

  sum = __msa_copy_s_d(sq_src_l, 0);
  sum += __msa_copy_s_d(sq_src_r, 0);

  return sum;
}

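// SSE-only helpers follow; they mirror the sse_diff_* loops above but
// skip the difference-sum accumulator.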
static uint32_t sse_4width_msa(const uint8_t *src_ptr, int32_t src_stride,
                               const uint8_t *ref_ptr, int32_t ref_stride,
                               int32_t height) {
  int32_t ht_cnt;
  uint32_t src0, src1, src2, src3;
  uint32_t ref0, ref1, ref2, ref3;
  v16u8 src = { 0 };
  v16u8 ref = { 0 };
  v4i32 var = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LW4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);

    INSERT_W4_UB(src0, src1, src2, src3, src);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    CALC_MSE_B(src, ref, var);
  }

  return HADD_SW_S32(var);
}

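// 8-pixel-wide SSE: pairs of rows packed into full vectors.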
static uint32_t sse_8width_msa(const uint8_t *src_ptr, int32_t src_stride,
                               const uint8_t *ref_ptr, int32_t ref_stride,
                               int32_t height) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v4i32 var = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);

    PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1,
                ref0, ref1);
    CALC_MSE_B(src0, ref0, var);
    CALC_MSE_B(src1, ref1, var);
  }

  return HADD_SW_S32(var);
}

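// 16-pixel-wide SSE: one vector per row, four rows per loop iteration.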
static uint32_t sse_16width_msa(const uint8_t *src_ptr, int32_t src_stride,
                                const uint8_t *ref_ptr, int32_t ref_stride,
                                int32_t height) {
  int32_t ht_cnt;
  v16u8 src, ref;
  v4i32 var = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref = LD_UB(ref_ptr);
    ref_ptr += ref_stride;
    CALC_MSE_B(src, ref, var);

    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref = LD_UB(ref_ptr);
    ref_ptr += ref_stride;
    CALC_MSE_B(src, ref, var);

    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref = LD_UB(ref_ptr);
    ref_ptr += ref_stride;
    CALC_MSE_B(src, ref, var);

    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref = LD_UB(ref_ptr);
    ref_ptr += ref_stride;
    CALC_MSE_B(src, ref, var);
  }

  return HADD_SW_S32(var);
}

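// 32-pixel-wide SSE: two vectors per row, four rows per loop iteration.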
static uint32_t sse_32width_msa(const uint8_t *src_ptr, int32_t src_stride,
                                const uint8_t *ref_ptr, int32_t ref_stride,
                                int32_t height) {
  int32_t ht_cnt;
  v16u8 src0, src1, ref0, ref1;
  v4i32 var = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    CALC_MSE_B(src0, ref0, var);
    CALC_MSE_B(src1, ref1, var);

    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    CALC_MSE_B(src0, ref0, var);
    CALC_MSE_B(src1, ref1, var);

    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    CALC_MSE_B(src0, ref0, var);
    CALC_MSE_B(src1, ref1, var);

    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    CALC_MSE_B(src0, ref0, var);
    CALC_MSE_B(src1, ref1, var);
  }

  return HADD_SW_S32(var);
}

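// 64-pixel-wide SSE: four vectors per row, two rows per loop iteration.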
static uint32_t sse_64width_msa(const uint8_t *src_ptr, int32_t src_stride,
                                const uint8_t *ref_ptr, int32_t ref_stride,
                                int32_t height) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v4i32 var = { 0 };

  for (ht_cnt = height >> 1; ht_cnt--;) {
    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
    src_ptr += src_stride;
    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
    ref_ptr += ref_stride;
    CALC_MSE_B(src0, ref0, var);
    CALC_MSE_B(src2, ref2, var);
    CALC_MSE_B(src1, ref1, var);
    CALC_MSE_B(src3, ref3, var);

    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
    src_ptr += src_stride;
    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
    ref_ptr += ref_stride;
    CALC_MSE_B(src0, ref0, var);
    CALC_MSE_B(src2, ref2, var);
    CALC_MSE_B(src1, ref1, var);
    CALC_MSE_B(src3, ref3, var);
  }

  return HADD_SW_S32(var);
}

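// SSE of a single 4x4 block; no difference sum is needed.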
uint32_t vpx_get4x4sse_cs_msa(const uint8_t *src_ptr, int32_t src_stride,
                              const uint8_t *ref_ptr, int32_t ref_stride) {
  uint32_t src0, src1, src2, src3;
  uint32_t ref0, ref1, ref2, ref3;
  v16i8 src = { 0 };
  v16i8 ref = { 0 };
  v4i32 err0 = { 0 };

  LW4(src_ptr, src_stride, src0, src1, src2, src3);
  LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
  INSERT_W4_SB(src0, src1, src2, src3, src);
  INSERT_W4_SB(ref0, ref1, ref2, ref3, ref);
  CALC_MSE_B(src, ref, err0);

  return HADD_SW_S32(err0);
}

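// Per-block-size variance formulas; the shift is log2(wd * ht).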
#define VARIANCE_4Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 4);
#define VARIANCE_4Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 5);
#define VARIANCE_8Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 5);
#define VARIANCE_8Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 6);
#define VARIANCE_8Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 7);
#define VARIANCE_16Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 7);
#define VARIANCE_16Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 8);

#define VARIANCE_16Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9);
#define VARIANCE_32Wx16H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9);
#define VARIANCE_32Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 10);
#define VARIANCE_32Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11);
#define VARIANCE_64Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11);
#define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12);

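// Emits vpx_variance<wd>x<ht>_msa() for each block size that shares the
// generic sse_diff_<wd>width_msa() helper.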
#define VPX_VARIANCE_WDXHT_MSA(wd, ht)                                         \
  uint32_t vpx_variance##wd##x##ht##_msa(                                      \
      const uint8_t *src, int32_t src_stride, const uint8_t *ref,              \
      int32_t ref_stride, uint32_t *sse) {                                     \
    int32_t diff;                                                              \
                                                                               \
    *sse =                                                                     \
        sse_diff_##wd##width_msa(src, src_stride, ref, ref_stride, ht, &diff); \
                                                                               \
    return VARIANCE_##wd##Wx##ht##H(*sse, diff);                               \
  }

VPX_VARIANCE_WDXHT_MSA(4, 4)
VPX_VARIANCE_WDXHT_MSA(4, 8)

VPX_VARIANCE_WDXHT_MSA(8, 4)
VPX_VARIANCE_WDXHT_MSA(8, 8)
VPX_VARIANCE_WDXHT_MSA(8, 16)

VPX_VARIANCE_WDXHT_MSA(16, 8)
VPX_VARIANCE_WDXHT_MSA(16, 16)
VPX_VARIANCE_WDXHT_MSA(16, 32)

VPX_VARIANCE_WDXHT_MSA(32, 16)
VPX_VARIANCE_WDXHT_MSA(32, 32)

uint32_t vpx_variance32x64_msa(const uint8_t *src, int32_t src_stride,
                               const uint8_t *ref, int32_t ref_stride,
                               uint32_t *sse) {
  int32_t diff;

  *sse = sse_diff_32x64_msa(src, src_stride, ref, ref_stride, &diff);

  return VARIANCE_32Wx64H(*sse, diff);
}

uint32_t vpx_variance64x32_msa(const uint8_t *src, int32_t src_stride,
                               const uint8_t *ref, int32_t ref_stride,
                               uint32_t *sse) {
  int32_t diff;

  *sse = sse_diff_64x32_msa(src, src_stride, ref, ref_stride, &diff);

  return VARIANCE_64Wx32H(*sse, diff);
}

uint32_t vpx_variance64x64_msa(const uint8_t *src, int32_t src_stride,
                               const uint8_t *ref, int32_t ref_stride,
                               uint32_t *sse) {
  int32_t diff;

  *sse = sse_diff_64x64_msa(src, src_stride, ref, ref_stride, &diff);

  return VARIANCE_64Wx64H(*sse, diff);
}

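// The vpx_mse* entry points store and return the raw sum of squared
// errors; no mean is subtracted.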
uint32_t vpx_mse8x8_msa(const uint8_t *src, int32_t src_stride,
                        const uint8_t *ref, int32_t ref_stride, uint32_t *sse) {
  *sse = sse_8width_msa(src, src_stride, ref, ref_stride, 8);

  return *sse;
}

uint32_t vpx_mse8x16_msa(const uint8_t *src, int32_t src_stride,
                         const uint8_t *ref, int32_t ref_stride,
                         uint32_t *sse) {
  *sse = sse_8width_msa(src, src_stride, ref, ref_stride, 16);

  return *sse;
}

uint32_t vpx_mse16x8_msa(const uint8_t *src, int32_t src_stride,
                         const uint8_t *ref, int32_t ref_stride,
                         uint32_t *sse) {
  *sse = sse_16width_msa(src, src_stride, ref, ref_stride, 8);

  return *sse;
}

uint32_t vpx_mse16x16_msa(const uint8_t *src, int32_t src_stride,
                          const uint8_t *ref, int32_t ref_stride,
                          uint32_t *sse) {
  *sse = sse_16width_msa(src, src_stride, ref, ref_stride, 16);

  return *sse;
}

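// Exported helpers that return both the SSE and the signed
// pixel-difference sum for a block.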
void vpx_get8x8var_msa(const uint8_t *src, int32_t src_stride,
                       const uint8_t *ref, int32_t ref_stride, uint32_t *sse,
                       int32_t *sum) {
  *sse = sse_diff_8width_msa(src, src_stride, ref, ref_stride, 8, sum);
}

void vpx_get16x16var_msa(const uint8_t *src, int32_t src_stride,
                         const uint8_t *ref, int32_t ref_stride, uint32_t *sse,
                         int32_t *sum) {
  *sse = sse_diff_16width_msa(src, src_stride, ref, ref_stride, 16, sum);
}

uint32_t vpx_get_mb_ss_msa(const int16_t *src) { return get_mb_ss_msa(src); }
623