/*
 * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/macros_msa.h"

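/* Interleaves 16 bytes of src with 16 bytes of ref, takes the per-pixel
 * signed differences and dot-product accumulates their squares into the
 * 4x32-bit accumulator 'var'. */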
#define CALC_MSE_B(src, ref, var) \
  { \
    v16u8 src_l0_m, src_l1_m; \
    v8i16 res_l0_m, res_l1_m; \
    \
    ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m); \
    HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m); \
    DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \
  }

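/* Same as CALC_MSE_B, but additionally accumulates the raw signed
 * differences into the 8x16-bit accumulator 'sub' so the block sum of
 * differences can be derived alongside the SSE. */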
#define CALC_MSE_AVG_B(src, ref, var, sub) \
  { \
    v16u8 src_l0_m, src_l1_m; \
    v8i16 res_l0_m, res_l1_m; \
    \
    ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m); \
    HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m); \
    DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \
    \
    sub += res_l0_m + res_l1_m; \
  }

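/* variance = SSE - (sum * sum) / (W * H), with shift == log2(W * H).
 * The _LARGE_ variant widens the squared sum to 64 bits so it cannot
 * overflow for the bigger block sizes. */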
#define VARIANCE_WxH(sse, diff, shift) \
  (sse) - (((uint32_t)(diff) * (diff)) >> (shift))

#define VARIANCE_LARGE_WxH(sse, diff, shift) \
  (sse) - (((int64_t)(diff) * (diff)) >> (shift))

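/* SSE plus sum-of-differences for a 4-pixel-wide block: four 32-bit rows of
 * src and ref are packed into one vector per loop iteration. Returns the
 * SSE and writes the sum of differences to *diff. */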
static uint32_t sse_diff_4width_msa(const uint8_t *src_ptr, int32_t src_stride,
                                    const uint8_t *ref_ptr, int32_t ref_stride,
                                    int32_t height, int32_t *diff) {
  uint32_t src0, src1, src2, src3;
  uint32_t ref0, ref1, ref2, ref3;
  int32_t ht_cnt;
  v16u8 src = { 0 };
  v16u8 ref = { 0 };
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LW4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);

    INSERT_W4_UB(src0, src1, src2, src3, src);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    CALC_MSE_AVG_B(src, ref, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

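/* 8-pixel-wide variant: pairs of 8-byte rows are packed into full 16-byte
 * vectors before accumulation. */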
static uint32_t sse_diff_8width_msa(const uint8_t *src_ptr, int32_t src_stride,
                                    const uint8_t *ref_ptr, int32_t ref_stride,
                                    int32_t height, int32_t *diff) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);

    PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1,
                ref0, ref1);
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

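/* 16-pixel-wide variant: one vector per row, four rows per loop
 * iteration. */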
static uint32_t sse_diff_16width_msa(const uint8_t *src_ptr, int32_t src_stride,
                                     const uint8_t *ref_ptr, int32_t ref_stride,
                                     int32_t height, int32_t *diff) {
  int32_t ht_cnt;
  v16u8 src, ref;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref = LD_UB(ref_ptr);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src, ref, var, avg);

    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref = LD_UB(ref_ptr);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src, ref, var, avg);

    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref = LD_UB(ref_ptr);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src, ref, var, avg);

    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref = LD_UB(ref_ptr);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src, ref, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

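/* 32-pixel-wide variant, used for the 32x16 and 32x32 blocks: two vectors
 * per row with a single 8x16-bit sum-of-differences accumulator. */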
static uint32_t sse_diff_32width_msa(const uint8_t *src_ptr, int32_t src_stride,
                                     const uint8_t *ref_ptr, int32_t ref_stride,
                                     int32_t height, int32_t *diff) {
  int32_t ht_cnt;
  v16u8 src0, src1, ref0, ref1;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);

    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);

    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);

    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

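/* The 32x64, 64x32 and 64x64 helpers below split the sum-of-differences
 * accumulation across two or four v8i16 accumulators so the per-lane 16-bit
 * totals stay within range for these larger blocks. */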
static uint32_t sse_diff_32x64_msa(const uint8_t *src_ptr, int32_t src_stride,
                                   const uint8_t *ref_ptr, int32_t ref_stride,
                                   int32_t *diff) {
  int32_t ht_cnt;
  v16u8 src0, src1, ref0, ref1;
  v8i16 avg0 = { 0 };
  v8i16 avg1 = { 0 };
  v4i32 vec, var = { 0 };

  for (ht_cnt = 16; ht_cnt--;) {
    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);

    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);

    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);

    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);
  }

  vec = __msa_hadd_s_w(avg0, avg0);
  vec += __msa_hadd_s_w(avg1, avg1);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sse_diff_64x32_msa(const uint8_t *src_ptr, int32_t src_stride,
                                   const uint8_t *ref_ptr, int32_t ref_stride,
                                   int32_t *diff) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v8i16 avg0 = { 0 };
  v8i16 avg1 = { 0 };
  v4i32 vec, var = { 0 };

  for (ht_cnt = 16; ht_cnt--;) {
    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
    src_ptr += src_stride;
    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src2, ref2, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);
    CALC_MSE_AVG_B(src3, ref3, var, avg1);

    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
    src_ptr += src_stride;
    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src2, ref2, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);
    CALC_MSE_AVG_B(src3, ref3, var, avg1);
  }

  vec = __msa_hadd_s_w(avg0, avg0);
  vec += __msa_hadd_s_w(avg1, avg1);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sse_diff_64x64_msa(const uint8_t *src_ptr, int32_t src_stride,
                                   const uint8_t *ref_ptr, int32_t ref_stride,
                                   int32_t *diff) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v8i16 avg0 = { 0 };
  v8i16 avg1 = { 0 };
  v8i16 avg2 = { 0 };
  v8i16 avg3 = { 0 };
  v4i32 vec, var = { 0 };

  for (ht_cnt = 32; ht_cnt--;) {
    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
    src_ptr += src_stride;
    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
    ref_ptr += ref_stride;

    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);
    CALC_MSE_AVG_B(src2, ref2, var, avg2);
    CALC_MSE_AVG_B(src3, ref3, var, avg3);
    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
    src_ptr += src_stride;
    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);
    CALC_MSE_AVG_B(src2, ref2, var, avg2);
    CALC_MSE_AVG_B(src3, ref3, var, avg3);
  }

  vec = __msa_hadd_s_w(avg0, avg0);
  vec += __msa_hadd_s_w(avg1, avg1);
  vec += __msa_hadd_s_w(avg2, avg2);
  vec += __msa_hadd_s_w(avg3, avg3);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

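/* Sum of squares of 256 contiguous 16-bit values (one 16x16 block),
 * accumulated in 64-bit lanes. */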
static uint32_t get_mb_ss_msa(const int16_t *src) {
  uint32_t sum, cnt;
  v8i16 src0, src1, src2, src3;
  v4i32 src0_l, src1_l, src2_l, src3_l;
  v4i32 src0_r, src1_r, src2_r, src3_r;
  v2i64 sq_src_l = { 0 };
  v2i64 sq_src_r = { 0 };

  for (cnt = 8; cnt--;) {
    LD_SH4(src, 8, src0, src1, src2, src3);
    src += 4 * 8;

    UNPCK_SH_SW(src0, src0_l, src0_r);
    UNPCK_SH_SW(src1, src1_l, src1_r);
    UNPCK_SH_SW(src2, src2_l, src2_r);
    UNPCK_SH_SW(src3, src3_l, src3_r);

    DPADD_SD2_SD(src0_l, src0_r, sq_src_l, sq_src_r);
    DPADD_SD2_SD(src1_l, src1_r, sq_src_l, sq_src_r);
    DPADD_SD2_SD(src2_l, src2_r, sq_src_l, sq_src_r);
    DPADD_SD2_SD(src3_l, src3_r, sq_src_l, sq_src_r);
  }

  sq_src_l += __msa_splati_d(sq_src_l, 1);
  sq_src_r += __msa_splati_d(sq_src_r, 1);

  sum = __msa_copy_s_d(sq_src_l, 0);
  sum += __msa_copy_s_d(sq_src_r, 0);

  return sum;
}

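/* The sse_*width_msa helpers below mirror the sse_diff_* versions but skip
 * the sum-of-differences accumulation; they serve the MSE entry points
 * where only the SSE is needed. */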
static uint32_t sse_4width_msa(const uint8_t *src_ptr, int32_t src_stride,
                               const uint8_t *ref_ptr, int32_t ref_stride,
                               int32_t height) {
  int32_t ht_cnt;
  uint32_t src0, src1, src2, src3;
  uint32_t ref0, ref1, ref2, ref3;
  v16u8 src = { 0 };
  v16u8 ref = { 0 };
  v4i32 var = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LW4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);

    INSERT_W4_UB(src0, src1, src2, src3, src);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    CALC_MSE_B(src, ref, var);
  }

  return HADD_SW_S32(var);
}

static uint32_t sse_8width_msa(const uint8_t *src_ptr, int32_t src_stride,
                               const uint8_t *ref_ptr, int32_t ref_stride,
                               int32_t height) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v4i32 var = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);

    PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1,
                ref0, ref1);
    CALC_MSE_B(src0, ref0, var);
    CALC_MSE_B(src1, ref1, var);
  }

  return HADD_SW_S32(var);
}

static uint32_t sse_16width_msa(const uint8_t *src_ptr, int32_t src_stride,
                                const uint8_t *ref_ptr, int32_t ref_stride,
                                int32_t height) {
  int32_t ht_cnt;
  v16u8 src, ref;
  v4i32 var = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref = LD_UB(ref_ptr);
    ref_ptr += ref_stride;
    CALC_MSE_B(src, ref, var);

    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref = LD_UB(ref_ptr);
    ref_ptr += ref_stride;
    CALC_MSE_B(src, ref, var);

    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref = LD_UB(ref_ptr);
    ref_ptr += ref_stride;
    CALC_MSE_B(src, ref, var);

    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref = LD_UB(ref_ptr);
    ref_ptr += ref_stride;
    CALC_MSE_B(src, ref, var);
  }

  return HADD_SW_S32(var);
}

static uint32_t sse_32width_msa(const uint8_t *src_ptr, int32_t src_stride,
                                const uint8_t *ref_ptr, int32_t ref_stride,
                                int32_t height) {
  int32_t ht_cnt;
  v16u8 src0, src1, ref0, ref1;
  v4i32 var = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    CALC_MSE_B(src0, ref0, var);
    CALC_MSE_B(src1, ref1, var);

    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    CALC_MSE_B(src0, ref0, var);
    CALC_MSE_B(src1, ref1, var);

    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    CALC_MSE_B(src0, ref0, var);
    CALC_MSE_B(src1, ref1, var);

    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    CALC_MSE_B(src0, ref0, var);
    CALC_MSE_B(src1, ref1, var);
  }

  return HADD_SW_S32(var);
}

static uint32_t sse_64width_msa(const uint8_t *src_ptr, int32_t src_stride,
                                const uint8_t *ref_ptr, int32_t ref_stride,
                                int32_t height) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v4i32 var = { 0 };

  for (ht_cnt = height >> 1; ht_cnt--;) {
    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
    src_ptr += src_stride;
    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
    ref_ptr += ref_stride;
    CALC_MSE_B(src0, ref0, var);
    CALC_MSE_B(src2, ref2, var);
    CALC_MSE_B(src1, ref1, var);
    CALC_MSE_B(src3, ref3, var);

    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
    src_ptr += src_stride;
    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
    ref_ptr += ref_stride;
    CALC_MSE_B(src0, ref0, var);
    CALC_MSE_B(src2, ref2, var);
    CALC_MSE_B(src1, ref1, var);
    CALC_MSE_B(src3, ref3, var);
  }

  return HADD_SW_S32(var);
}

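/* Sum of squared differences for a single 4x4 block. */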
uint32_t vpx_get4x4sse_cs_msa(const uint8_t *src_ptr, int32_t src_stride,
                              const uint8_t *ref_ptr, int32_t ref_stride) {
  uint32_t src0, src1, src2, src3;
  uint32_t ref0, ref1, ref2, ref3;
  v16i8 src = { 0 };
  v16i8 ref = { 0 };
  v4i32 err0 = { 0 };

  LW4(src_ptr, src_stride, src0, src1, src2, src3);
  LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
  INSERT_W4_SB(src0, src1, src2, src3, src);
  INSERT_W4_SB(ref0, ref1, ref2, ref3, ref);
  CALC_MSE_B(src, ref, err0);

  return HADD_SW_S32(err0);
}

#define VARIANCE_4Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 4);
#define VARIANCE_4Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 5);
#define VARIANCE_8Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 5);
#define VARIANCE_8Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 6);
#define VARIANCE_8Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 7);
#define VARIANCE_16Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 7);
#define VARIANCE_16Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 8);

#define VARIANCE_16Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9);
#define VARIANCE_32Wx16H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9);
#define VARIANCE_32Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 10);
#define VARIANCE_32Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11);
#define VARIANCE_64Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11);
#define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12);

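/* Generates vpx_variance<wd>x<ht>_msa(): stores the SSE through *sse and
 * returns SSE - (sum * sum) / (wd * ht) via the matching VARIANCE_* macro
 * above. */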
#define VPX_VARIANCE_WDXHT_MSA(wd, ht) \
  uint32_t vpx_variance##wd##x##ht##_msa( \
      const uint8_t *src, int32_t src_stride, const uint8_t *ref, \
      int32_t ref_stride, uint32_t *sse) { \
    int32_t diff; \
    \
    *sse = \
        sse_diff_##wd##width_msa(src, src_stride, ref, ref_stride, ht, &diff); \
    \
    return VARIANCE_##wd##Wx##ht##H(*sse, diff); \
  }

VPX_VARIANCE_WDXHT_MSA(4, 4)
VPX_VARIANCE_WDXHT_MSA(4, 8)

VPX_VARIANCE_WDXHT_MSA(8, 4)
VPX_VARIANCE_WDXHT_MSA(8, 8)
VPX_VARIANCE_WDXHT_MSA(8, 16)

VPX_VARIANCE_WDXHT_MSA(16, 8)
VPX_VARIANCE_WDXHT_MSA(16, 16)
VPX_VARIANCE_WDXHT_MSA(16, 32)

VPX_VARIANCE_WDXHT_MSA(32, 16)
VPX_VARIANCE_WDXHT_MSA(32, 32)

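/* 32x64, 64x32 and 64x64 use the dedicated split-accumulator helpers
 * instead of the generic macro above. */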
uint32_t vpx_variance32x64_msa(const uint8_t *src, int32_t src_stride,
                               const uint8_t *ref, int32_t ref_stride,
                               uint32_t *sse) {
  int32_t diff;

  *sse = sse_diff_32x64_msa(src, src_stride, ref, ref_stride, &diff);

  return VARIANCE_32Wx64H(*sse, diff);
}

uint32_t vpx_variance64x32_msa(const uint8_t *src, int32_t src_stride,
                               const uint8_t *ref, int32_t ref_stride,
                               uint32_t *sse) {
  int32_t diff;

  *sse = sse_diff_64x32_msa(src, src_stride, ref, ref_stride, &diff);

  return VARIANCE_64Wx32H(*sse, diff);
}

uint32_t vpx_variance64x64_msa(const uint8_t *src, int32_t src_stride,
                               const uint8_t *ref, int32_t ref_stride,
                               uint32_t *sse) {
  int32_t diff;

  *sse = sse_diff_64x64_msa(src, src_stride, ref, ref_stride, &diff);

  return VARIANCE_64Wx64H(*sse, diff);
}

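/* MSE entry points: only the SSE is required, so the SSE-only helpers are
 * used and the result is both stored through *sse and returned. */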
uint32_t vpx_mse8x8_msa(const uint8_t *src, int32_t src_stride,
                        const uint8_t *ref, int32_t ref_stride, uint32_t *sse) {
  *sse = sse_8width_msa(src, src_stride, ref, ref_stride, 8);

  return *sse;
}

uint32_t vpx_mse8x16_msa(const uint8_t *src, int32_t src_stride,
                         const uint8_t *ref, int32_t ref_stride,
                         uint32_t *sse) {
  *sse = sse_8width_msa(src, src_stride, ref, ref_stride, 16);

  return *sse;
}

uint32_t vpx_mse16x8_msa(const uint8_t *src, int32_t src_stride,
                         const uint8_t *ref, int32_t ref_stride,
                         uint32_t *sse) {
  *sse = sse_16width_msa(src, src_stride, ref, ref_stride, 8);

  return *sse;
}

uint32_t vpx_mse16x16_msa(const uint8_t *src, int32_t src_stride,
                          const uint8_t *ref, int32_t ref_stride,
                          uint32_t *sse) {
  *sse = sse_16width_msa(src, src_stride, ref, ref_stride, 16);

  return *sse;
}

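/* These return both the SSE and the sum of differences for the caller. */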
void vpx_get8x8var_msa(const uint8_t *src, int32_t src_stride,
                       const uint8_t *ref, int32_t ref_stride, uint32_t *sse,
                       int32_t *sum) {
  *sse = sse_diff_8width_msa(src, src_stride, ref, ref_stride, 8, sum);
}

void vpx_get16x16var_msa(const uint8_t *src, int32_t src_stride,
                         const uint8_t *ref, int32_t ref_stride, uint32_t *sse,
                         int32_t *sum) {
  *sse = sse_diff_16width_msa(src, src_stride, ref, ref_stride, 16, sum);
}

uint32_t vpx_get_mb_ss_msa(const int16_t *src) { return get_mb_ss_msa(src); }