/*
 *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "./vpx_dsp_rtcd.h"
#include "vpx_ports/mem.h"
#include "vpx_dsp/loongarch/variance_lsx.h"
#include "vpx_dsp/variance.h"

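/* Two-tap bilinear filter coefficients, indexed by the 3-bit sub-pixel
 * offset (0..7). Each pair sums to 128, i.e. 1 << FILTER_BITS, so the
 * filtered result is normalized by a rounding shift of FILTER_BITS. */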
static const uint8_t bilinear_filters_lsx[8][2] = {
  { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
  { 64, 64 }, { 48, 80 },  { 32, 96 }, { 16, 112 },
};

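/* variance = sse - sum^2 / (width * height); the shift argument is
 * log2(width * height), e.g. 6 for 8x8 and 12 for 64x64. The large-block
 * variant widens sum^2 to 64 bits before the shift to avoid overflow. */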
#define VARIANCE_WxH(sse, diff, shift) \
  (sse) - (((uint32_t)(diff) * (diff)) >> (shift))

#define VARIANCE_LARGE_WxH(sse, diff, shift) \
  (sse) - (((int64_t)(diff) * (diff)) >> (shift))

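/* Averages src with the second predictor sec_pred (rounding average), then
 * accumulates the sum of squared errors and the signed sum of differences
 * against ref over a 64x64 block. Returns the SSE and writes the difference
 * sum to *diff; each of the 32 loop iterations covers two rows. */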
static uint32_t avg_sse_diff_64x64_lsx(const uint8_t *src_ptr,
                                       int32_t src_stride,
                                       const uint8_t *ref_ptr,
                                       int32_t ref_stride,
                                       const uint8_t *sec_pred, int32_t *diff) {
  int32_t res, ht_cnt = 32;
  __m128i src0, src1, src2, src3, ref0, ref1, ref2, ref3;
  __m128i pred0, pred1, pred2, pred3, vec, vec_tmp;
  __m128i avg0, avg1, avg2, avg3;
  __m128i var = __lsx_vldi(0);

  avg0 = var;
  avg1 = var;
  avg2 = var;
  avg3 = var;

  for (; ht_cnt--;) {
    DUP4_ARG2(__lsx_vld, sec_pred, 0, sec_pred, 16, sec_pred, 32, sec_pred, 48,
              pred0, pred1, pred2, pred3);
    sec_pred += 64;
    DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48,
              src0, src1, src2, src3);
    src_ptr += src_stride;
    DUP4_ARG2(__lsx_vld, ref_ptr, 0, ref_ptr, 16, ref_ptr, 32, ref_ptr, 48,
              ref0, ref1, ref2, ref3);
    ref_ptr += ref_stride;

    DUP4_ARG2(__lsx_vavgr_bu, src0, pred0, src1, pred1, src2, pred2, src3,
              pred3, src0, src1, src2, src3);
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);
    CALC_MSE_AVG_B(src2, ref2, var, avg2);
    CALC_MSE_AVG_B(src3, ref3, var, avg3);

    DUP4_ARG2(__lsx_vld, sec_pred, 0, sec_pred, 16, sec_pred, 32, sec_pred, 48,
              pred0, pred1, pred2, pred3);
    sec_pred += 64;
    DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48,
              src0, src1, src2, src3);
    src_ptr += src_stride;
    DUP4_ARG2(__lsx_vld, ref_ptr, 0, ref_ptr, 16, ref_ptr, 32, ref_ptr, 48,
              ref0, ref1, ref2, ref3);
    ref_ptr += ref_stride;

    DUP4_ARG2(__lsx_vavgr_bu, src0, pred0, src1, pred1, src2, pred2, src3,
              pred3, src0, src1, src2, src3);
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);
    CALC_MSE_AVG_B(src2, ref2, var, avg2);
    CALC_MSE_AVG_B(src3, ref3, var, avg3);
  }
  vec = __lsx_vhaddw_w_h(avg0, avg0);
  vec_tmp = __lsx_vhaddw_w_h(avg1, avg1);
  vec = __lsx_vadd_w(vec, vec_tmp);
  vec_tmp = __lsx_vhaddw_w_h(avg2, avg2);
  vec = __lsx_vadd_w(vec, vec_tmp);
  vec_tmp = __lsx_vhaddw_w_h(avg3, avg3);
  vec = __lsx_vadd_w(vec, vec_tmp);
  HADD_SW_S32(vec, *diff);
  HADD_SW_S32(var, res);

  return res;
}

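/* Horizontal-only sub-pixel path for 8-wide blocks: each row is filtered with
 * the 2-tap bilinear kernel in filt0 and rounded by FILTER_BITS, then compared
 * against dst. Four rows are processed per iteration; the SSE is returned and
 * the signed sum of differences is written to *diff. */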
static uint32_t sub_pixel_sse_diff_8width_h_lsx(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
  uint32_t loop_cnt = (height >> 2);
  int32_t res;
  __m128i src0, src1, src2, src3, ref0, ref1, ref2, ref3;
  __m128i vec0, vec1, vec2, vec3, filt0, out, vec;
  __m128i mask = { 0x0403030202010100, 0x0807070606050504 };
  __m128i avg = __lsx_vldi(0);
  __m128i var = avg;
  int32_t src_stride2 = src_stride << 1;
  int32_t src_stride3 = src_stride2 + src_stride;
  int32_t src_stride4 = src_stride2 << 1;
  int32_t dst_stride2 = dst_stride << 1;
  int32_t dst_stride3 = dst_stride2 + dst_stride;
  int32_t dst_stride4 = dst_stride2 << 1;

  filt0 = __lsx_vldrepl_h(filter, 0);
  for (; loop_cnt--;) {
    src0 = __lsx_vld(src, 0);
    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
    src3 = __lsx_vldx(src, src_stride3);
    src += src_stride4;
    ref0 = __lsx_vld(dst, 0);
    DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, ref1, ref2);
    ref3 = __lsx_vldx(dst, dst_stride3);
    dst += dst_stride4;

    DUP2_ARG2(__lsx_vpickev_d, ref1, ref0, ref3, ref2, ref0, ref1);
    DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1);
    DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3);
    DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3,
              filt0, vec0, vec1, vec2, vec3);
    DUP4_ARG3(__lsx_vssrarni_bu_h, vec0, vec0, FILTER_BITS, vec1, vec1,
              FILTER_BITS, vec2, vec2, FILTER_BITS, vec3, vec3, FILTER_BITS,
              src0, src1, src2, src3);
    out = __lsx_vpackev_d(src1, src0);
    CALC_MSE_AVG_B(out, ref0, var, avg);
    out = __lsx_vpackev_d(src3, src2);
    CALC_MSE_AVG_B(out, ref1, var, avg);
  }
  vec = __lsx_vhaddw_w_h(avg, avg);
  HADD_SW_S32(vec, *diff);
  HADD_SW_S32(var, res);
  return res;
}

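/* Horizontal-only path for 16-wide blocks; each row is loaded as two 8-byte
 * halves so a shuffle/dot-product pair produces all 16 filtered pixels. */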
static uint32_t sub_pixel_sse_diff_16width_h_lsx(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
  uint32_t loop_cnt = (height >> 2);
  int32_t res;
  __m128i src0, src1, src2, src3, src4, src5, src6, src7;
  __m128i dst0, dst1, dst2, dst3, filt0;
  __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  __m128i out0, out1, out2, out3, out4, out5, out6, out7;
  __m128i vec, var = __lsx_vldi(0);
  __m128i avg = var;
  __m128i mask = { 0x0403030202010100, 0x0807070606050504 };
  int32_t dst_stride2 = dst_stride << 1;
  int32_t dst_stride3 = dst_stride2 + dst_stride;
  int32_t dst_stride4 = dst_stride2 << 1;

  filt0 = __lsx_vldrepl_h(filter, 0);

  for (; loop_cnt--;) {
    DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1);
    src += src_stride;
    DUP2_ARG2(__lsx_vld, src, 0, src, 8, src2, src3);
    src += src_stride;
    DUP2_ARG2(__lsx_vld, src, 0, src, 8, src4, src5);
    src += src_stride;
    DUP2_ARG2(__lsx_vld, src, 0, src, 8, src6, src7);
    src += src_stride;

    dst0 = __lsx_vld(dst, 0);
    DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, dst1, dst2);
    dst3 = __lsx_vldx(dst, dst_stride3);
    dst += dst_stride4;

    DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1);
    DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3);
    DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, vec4, vec5);
    DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask, src7, src7, mask, vec6, vec7);

    DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3,
              filt0, out0, out1, out2, out3);
    DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7,
              filt0, out4, out5, out6, out7);
    DUP4_ARG3(__lsx_vssrarni_bu_h, out1, out0, FILTER_BITS, out3, out2,
              FILTER_BITS, out5, out4, FILTER_BITS, out7, out6, FILTER_BITS,
              src0, src1, src2, src3);
    CALC_MSE_AVG_B(src0, dst0, var, avg);
    CALC_MSE_AVG_B(src1, dst1, var, avg);
    CALC_MSE_AVG_B(src2, dst2, var, avg);
    CALC_MSE_AVG_B(src3, dst3, var, avg);
  }
  vec = __lsx_vhaddw_w_h(avg, avg);
  HADD_SW_S32(vec, *diff);
  HADD_SW_S32(var, res);
  return res;
}

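/* 32-wide horizontal path: two adjacent 16-wide columns; the partial SSE
 * values and difference sums are combined. */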
static uint32_t sub_pixel_sse_diff_32width_h_lsx(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
  uint32_t sse = 0;
  int32_t diff0[2];

  sse += sub_pixel_sse_diff_16width_h_lsx(src, src_stride, dst, dst_stride,
                                          filter, height, &diff0[0]);
  src += 16;
  dst += 16;

  sse += sub_pixel_sse_diff_16width_h_lsx(src, src_stride, dst, dst_stride,
                                          filter, height, &diff0[1]);

  *diff = diff0[0] + diff0[1];

  return sse;
}

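/* Vertical-only sub-pixel path for 8-wide blocks: the 2-tap bilinear filter is
 * applied between vertically adjacent rows, so one extra row is read ahead of
 * the four rows produced per iteration. */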
static uint32_t sub_pixel_sse_diff_8width_v_lsx(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
  uint32_t loop_cnt = (height >> 2);
  int32_t res;
  __m128i ref0, ref1, ref2, ref3, src0, src1, src2, src3, src4;
  __m128i vec, vec0, vec1, vec2, vec3, tmp0, tmp1, tmp2, tmp3, filt0;
  __m128i avg = __lsx_vldi(0);
  __m128i var = avg;
  int32_t src_stride2 = src_stride << 1;
  int32_t src_stride3 = src_stride2 + src_stride;
  int32_t src_stride4 = src_stride2 << 1;
  int32_t dst_stride2 = dst_stride << 1;
  int32_t dst_stride3 = dst_stride2 + dst_stride;
  int32_t dst_stride4 = dst_stride2 << 1;

  filt0 = __lsx_vldrepl_h(filter, 0);
  src0 = __lsx_vld(src, 0);
  src += src_stride;

  for (; loop_cnt--;) {
    src1 = __lsx_vld(src, 0);
    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3);
    src4 = __lsx_vldx(src, src_stride3);
    src += src_stride4;
    ref0 = __lsx_vld(dst, 0);
    DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, ref1, ref2);
    ref3 = __lsx_vldx(dst, dst_stride3);
    dst += dst_stride4;

    DUP2_ARG2(__lsx_vpickev_d, ref1, ref0, ref3, ref2, ref0, ref1);
    DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3,
              vec0, vec1, vec2, vec3);
    DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3,
              filt0, tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2,
              FILTER_BITS, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);

    src0 = src4;
  }
  vec = __lsx_vhaddw_w_h(avg, avg);
  HADD_SW_S32(vec, *diff);
  HADD_SW_S32(var, res);
  return res;
}

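/* Vertical-only path for 16-wide blocks; each row is interleaved with the next
 * one and the low and high halves are filtered separately. */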
static uint32_t sub_pixel_sse_diff_16width_v_lsx(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
  uint32_t loop_cnt = (height >> 2);
  int32_t res;
  __m128i ref0, ref1, ref2, ref3, src0, src1, src2, src3, src4;
  __m128i out0, out1, out2, out3, tmp0, tmp1, filt0, vec;
  __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  __m128i var = __lsx_vldi(0);
  __m128i avg = var;
  int32_t src_stride2 = src_stride << 1;
  int32_t src_stride3 = src_stride2 + src_stride;
  int32_t src_stride4 = src_stride2 << 1;
  int32_t dst_stride2 = dst_stride << 1;
  int32_t dst_stride3 = dst_stride2 + dst_stride;
  int32_t dst_stride4 = dst_stride2 << 1;

  filt0 = __lsx_vldrepl_h(filter, 0);

  src0 = __lsx_vld(src, 0);
  src += src_stride;

  for (; loop_cnt--;) {
    src1 = __lsx_vld(src, 0);
    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3);
    src4 = __lsx_vldx(src, src_stride3);
    src += src_stride4;
    ref0 = __lsx_vld(dst, 0);
    DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, ref1, ref2);
    ref3 = __lsx_vldx(dst, dst_stride3);
    dst += dst_stride4;

    DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, vec0, vec2);
    DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, vec1, vec3);
    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1);
    out0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);

    DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, vec4, vec6);
    DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, vec5, vec7);
    DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1);
    out1 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);

    DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1);
    out2 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
    DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1);
    out3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);

    src0 = src4;

    CALC_MSE_AVG_B(out0, ref0, var, avg);
    CALC_MSE_AVG_B(out1, ref1, var, avg);
    CALC_MSE_AVG_B(out2, ref2, var, avg);
    CALC_MSE_AVG_B(out3, ref3, var, avg);
  }
  vec = __lsx_vhaddw_w_h(avg, avg);
  HADD_SW_S32(vec, *diff);
  HADD_SW_S32(var, res);
  return res;
}

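/* 32-wide vertical path: two 16-wide columns. */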
static uint32_t sub_pixel_sse_diff_32width_v_lsx(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
  uint32_t sse = 0;
  int32_t diff0[2];

  sse += sub_pixel_sse_diff_16width_v_lsx(src, src_stride, dst, dst_stride,
                                          filter, height, &diff0[0]);
  src += 16;
  dst += 16;

  sse += sub_pixel_sse_diff_16width_v_lsx(src, src_stride, dst, dst_stride,
                                          filter, height, &diff0[1]);

  *diff = diff0[0] + diff0[1];

  return sse;
}

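/* Combined path for 8-wide blocks: rows are first filtered horizontally
 * (HORIZ_2TAP_FILT_UH), then the vertical 2-tap filter is applied to pairs of
 * consecutive horizontal outputs before the comparison against dst. */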
static uint32_t sub_pixel_sse_diff_8width_hv_lsx(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
    int32_t height, int32_t *diff) {
  uint32_t loop_cnt = (height >> 2);
  int32_t res;
  __m128i ref0, ref1, ref2, ref3, src0, src1, src2, src3, src4, out0, out1;
  __m128i hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3, vec, vec0, filt_hz, filt_vt;
  __m128i mask = { 0x0403030202010100, 0x0807070606050504 };
  __m128i avg = __lsx_vldi(0);
  __m128i var = avg;

  filt_hz = __lsx_vldrepl_h(filter_horiz, 0);
  filt_vt = __lsx_vldrepl_h(filter_vert, 0);

  src0 = __lsx_vld(src, 0);
  src += src_stride;
  HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS, hz_out0);

  for (; loop_cnt--;) {
    DUP2_ARG2(__lsx_vld, src, 0, dst, 0, src1, ref0);
    src += src_stride;
    dst += dst_stride;
    DUP2_ARG2(__lsx_vld, src, 0, dst, 0, src2, ref1);
    src += src_stride;
    dst += dst_stride;
    DUP2_ARG2(__lsx_vld, src, 0, dst, 0, src3, ref2);
    src += src_stride;
    dst += dst_stride;
    DUP2_ARG2(__lsx_vld, src, 0, dst, 0, src4, ref3);
    src += src_stride;
    dst += dst_stride;

    DUP2_ARG2(__lsx_vpickev_d, ref1, ref0, ref3, ref2, ref0, ref1);
    HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS, hz_out1);
    vec0 = __lsx_vpackev_b(hz_out1, hz_out0);
    tmp0 = __lsx_vdp2_h_bu(vec0, filt_vt);
    HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS, hz_out0);
    vec0 = __lsx_vpackev_b(hz_out0, hz_out1);
    tmp1 = __lsx_vdp2_h_bu(vec0, filt_vt);

    HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS, hz_out1);
    vec0 = __lsx_vpackev_b(hz_out1, hz_out0);
    tmp2 = __lsx_vdp2_h_bu(vec0, filt_vt);
    HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS, hz_out0);
    vec0 = __lsx_vpackev_b(hz_out0, hz_out1);
    tmp3 = __lsx_vdp2_h_bu(vec0, filt_vt);
    DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2,
              FILTER_BITS, out0, out1);
    CALC_MSE_AVG_B(out0, ref0, var, avg);
    CALC_MSE_AVG_B(out1, ref1, var, avg);
  }
  vec = __lsx_vhaddw_w_h(avg, avg);
  HADD_SW_S32(vec, *diff);
  HADD_SW_S32(var, res);
  return res;
}

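/* Combined horizontal-then-vertical path for 16-wide blocks. */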
static uint32_t sub_pixel_sse_diff_16width_hv_lsx(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
    int32_t height, int32_t *diff) {
  uint32_t loop_cnt = (height >> 2);
  int32_t res;
  __m128i src0, src1, src2, src3, src4, src5, src6, src7;
  __m128i ref0, ref1, ref2, ref3, filt_hz, filt_vt, vec0, vec1;
  __m128i hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1, vec;
  __m128i var = __lsx_vldi(0);
  __m128i avg = var;
  __m128i mask = { 0x0403030202010100, 0x0807070606050504 };
  int32_t dst_stride2 = dst_stride << 1;
  int32_t dst_stride3 = dst_stride2 + dst_stride;
  int32_t dst_stride4 = dst_stride2 << 1;

  filt_hz = __lsx_vldrepl_h(filter_horiz, 0);
  filt_vt = __lsx_vldrepl_h(filter_vert, 0);

  DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1);
  src += src_stride;

  HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS, hz_out0);
  HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS, hz_out2);

  for (; loop_cnt--;) {
    DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1);
    src += src_stride;
    DUP2_ARG2(__lsx_vld, src, 0, src, 8, src2, src3);
    src += src_stride;
    DUP2_ARG2(__lsx_vld, src, 0, src, 8, src4, src5);
    src += src_stride;
    DUP2_ARG2(__lsx_vld, src, 0, src, 8, src6, src7);
    src += src_stride;

    ref0 = __lsx_vld(dst, 0);
    DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, ref1, ref2);
    ref3 = __lsx_vldx(dst, dst_stride3);
    dst += dst_stride4;

    HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS, hz_out1);
    HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS, hz_out3);
    DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1);
    src0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);

    HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS, hz_out0);
    HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS, hz_out2);
    DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1);
    src1 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);

    HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS, hz_out1);
    HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS, hz_out3);
    DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1);
    src2 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);

    HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS, hz_out0);
    HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS, hz_out2);
    DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1);
    src3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);

    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);
    CALC_MSE_AVG_B(src2, ref2, var, avg);
    CALC_MSE_AVG_B(src3, ref3, var, avg);
  }
  vec = __lsx_vhaddw_w_h(avg, avg);
  HADD_SW_S32(vec, *diff);
  HADD_SW_S32(var, res);

  return res;
}

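/* 32-wide combined path: two 16-wide columns. */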
static uint32_t sub_pixel_sse_diff_32width_hv_lsx(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
    int32_t height, int32_t *diff) {
  uint32_t sse = 0;
  int32_t diff0[2];

  sse += sub_pixel_sse_diff_16width_hv_lsx(src, src_stride, dst, dst_stride,
                                           filter_horiz, filter_vert, height,
                                           &diff0[0]);
  src += 16;
  dst += 16;

  sse += sub_pixel_sse_diff_16width_hv_lsx(src, src_stride, dst, dst_stride,
                                           filter_horiz, filter_vert, height,
                                           &diff0[1]);

  *diff = diff0[0] + diff0[1];

  return sse;
}

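/* Horizontal sub-pixel filtering of a 16-wide column of a wider block; the
 * filtered result is averaged with the second predictor (sec_pred, row stride
 * given by width) before the SSE/diff accumulation against dst. */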
static uint32_t subpel_avg_ssediff_16w_h_lsx(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
    int32_t height, int32_t *diff, int32_t width) {
  uint32_t loop_cnt = (height >> 2);
  int32_t res;
  __m128i src0, src1, src2, src3, src4, src5, src6, src7;
  __m128i dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3;
  __m128i pred0, pred1, pred2, pred3, filt0, vec;
  __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  __m128i out0, out1, out2, out3, out4, out5, out6, out7;
  __m128i mask = { 0x403030202010100, 0x807070606050504 };
  __m128i avg = __lsx_vldi(0);
  __m128i var = avg;

  filt0 = __lsx_vldrepl_h(filter, 0);

  for (; loop_cnt--;) {
    DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1);
    src += src_stride;
    DUP2_ARG2(__lsx_vld, src, 0, src, 8, src2, src3);
    src += src_stride;
    DUP2_ARG2(__lsx_vld, src, 0, src, 8, src4, src5);
    src += src_stride;
    DUP2_ARG2(__lsx_vld, src, 0, src, 8, src6, src7);
    src += src_stride;

    dst0 = __lsx_vld(dst, 0);
    dst += dst_stride;
    dst1 = __lsx_vld(dst, 0);
    dst += dst_stride;
    dst2 = __lsx_vld(dst, 0);
    dst += dst_stride;
    dst3 = __lsx_vld(dst, 0);
    dst += dst_stride;

    pred0 = __lsx_vld(sec_pred, 0);
    sec_pred += width;
    pred1 = __lsx_vld(sec_pred, 0);
    sec_pred += width;
    pred2 = __lsx_vld(sec_pred, 0);
    sec_pred += width;
    pred3 = __lsx_vld(sec_pred, 0);
    sec_pred += width;

    DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1);
    DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3);
    DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, vec4, vec5);
    DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask, src7, src7, mask, vec6, vec7);

    DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3,
              filt0, out0, out1, out2, out3);
    DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7,
              filt0, out4, out5, out6, out7);
    DUP4_ARG3(__lsx_vssrarni_bu_h, out1, out0, FILTER_BITS, out3, out2,
              FILTER_BITS, out5, out4, FILTER_BITS, out7, out6, FILTER_BITS,
              tmp0, tmp1, tmp2, tmp3);
    DUP4_ARG2(__lsx_vavgr_bu, tmp0, pred0, tmp1, pred1, tmp2, pred2, tmp3,
              pred3, tmp0, tmp1, tmp2, tmp3);

    CALC_MSE_AVG_B(tmp0, dst0, var, avg);
    CALC_MSE_AVG_B(tmp1, dst1, var, avg);
    CALC_MSE_AVG_B(tmp2, dst2, var, avg);
    CALC_MSE_AVG_B(tmp3, dst3, var, avg);
  }
  vec = __lsx_vhaddw_w_h(avg, avg);
  HADD_SW_S32(vec, *diff);
  HADD_SW_S32(var, res);

  return res;
}

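/* Vertical counterpart of the averaged 16-wide column helper above. */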
static uint32_t subpel_avg_ssediff_16w_v_lsx(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
    int32_t height, int32_t *diff, int32_t width) {
  uint32_t loop_cnt = (height >> 2);
  int32_t res;
  __m128i ref0, ref1, ref2, ref3, pred0, pred1, pred2, pred3;
  __m128i src0, src1, src2, src3, src4, out0, out1, out2, out3;
  __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  __m128i tmp0, tmp1, vec, filt0;
  __m128i avg = __lsx_vldi(0);
  __m128i var = avg;

  filt0 = __lsx_vldrepl_h(filter, 0);

  src0 = __lsx_vld(src, 0);
  src += src_stride;

  for (; loop_cnt--;) {
    src1 = __lsx_vld(src, 0);
    src += src_stride;
    src2 = __lsx_vld(src, 0);
    src += src_stride;
    src3 = __lsx_vld(src, 0);
    src += src_stride;
    src4 = __lsx_vld(src, 0);
    src += src_stride;

    pred0 = __lsx_vld(sec_pred, 0);
    sec_pred += width;
    pred1 = __lsx_vld(sec_pred, 0);
    sec_pred += width;
    pred2 = __lsx_vld(sec_pred, 0);
    sec_pred += width;
    pred3 = __lsx_vld(sec_pred, 0);
    sec_pred += width;

    DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, vec0, vec2);
    DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, vec1, vec3);
    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1);
    out0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);

    DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, vec4, vec6);
    DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, vec5, vec7);
    DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1);
    out1 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);

    DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1);
    out2 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);

    DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1);
    out3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);

    src0 = src4;
    ref0 = __lsx_vld(dst, 0);
    dst += dst_stride;
    ref1 = __lsx_vld(dst, 0);
    dst += dst_stride;
    ref2 = __lsx_vld(dst, 0);
    dst += dst_stride;
    ref3 = __lsx_vld(dst, 0);
    dst += dst_stride;

    DUP4_ARG2(__lsx_vavgr_bu, out0, pred0, out1, pred1, out2, pred2, out3,
              pred3, out0, out1, out2, out3);

    CALC_MSE_AVG_B(out0, ref0, var, avg);
    CALC_MSE_AVG_B(out1, ref1, var, avg);
    CALC_MSE_AVG_B(out2, ref2, var, avg);
    CALC_MSE_AVG_B(out3, ref3, var, avg);
  }
  vec = __lsx_vhaddw_w_h(avg, avg);
  HADD_SW_S32(vec, *diff);
  HADD_SW_S32(var, res);
  return res;
}

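/* Horizontal-then-vertical counterpart of the averaged 16-wide column
 * helpers above. */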
static uint32_t subpel_avg_ssediff_16w_hv_lsx(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
    const uint8_t *filter_vert, int32_t height, int32_t *diff, int32_t width) {
  uint32_t loop_cnt = (height >> 2);
  int32_t res;
  __m128i src0, src1, src2, src3, src4, src5, src6, src7;
  __m128i ref0, ref1, ref2, ref3, pred0, pred1, pred2, pred3;
  __m128i hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1;
  __m128i out0, out1, out2, out3, filt_hz, filt_vt, vec, vec0, vec1;
  __m128i mask = { 0x403030202010100, 0x807070606050504 };
  __m128i avg = __lsx_vldi(0);
  __m128i var = avg;

  filt_hz = __lsx_vldrepl_h(filter_horiz, 0);
  filt_vt = __lsx_vldrepl_h(filter_vert, 0);

  DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1);
  src += src_stride;

  HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS, hz_out0);
  HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS, hz_out2);

  for (; loop_cnt--;) {
    DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1);
    src += src_stride;
    DUP2_ARG2(__lsx_vld, src, 0, src, 8, src2, src3);
    src += src_stride;
    DUP2_ARG2(__lsx_vld, src, 0, src, 8, src4, src5);
    src += src_stride;
    DUP2_ARG2(__lsx_vld, src, 0, src, 8, src6, src7);
    src += src_stride;

    pred0 = __lsx_vld(sec_pred, 0);
    sec_pred += width;
    pred1 = __lsx_vld(sec_pred, 0);
    sec_pred += width;
    pred2 = __lsx_vld(sec_pred, 0);
    sec_pred += width;
    pred3 = __lsx_vld(sec_pred, 0);
    sec_pred += width;

    HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS, hz_out1);
    HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS, hz_out3);
    DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1);
    out0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);

    HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS, hz_out0);
    HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS, hz_out2);
    DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1);
    out1 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);

    HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS, hz_out1);
    HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS, hz_out3);
    DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1);
    out2 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);

    HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS, hz_out0);
    HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS, hz_out2);
    DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1);
    out3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);

    ref0 = __lsx_vld(dst, 0);
    dst += dst_stride;
    ref1 = __lsx_vld(dst, 0);
    dst += dst_stride;
    ref2 = __lsx_vld(dst, 0);
    dst += dst_stride;
    ref3 = __lsx_vld(dst, 0);
    dst += dst_stride;

    DUP4_ARG2(__lsx_vavgr_bu, out0, pred0, out1, pred1, out2, pred2, out3,
              pred3, out0, out1, out2, out3);

    CALC_MSE_AVG_B(out0, ref0, var, avg);
    CALC_MSE_AVG_B(out1, ref1, var, avg);
    CALC_MSE_AVG_B(out2, ref2, var, avg);
    CALC_MSE_AVG_B(out3, ref3, var, avg);
  }
  vec = __lsx_vhaddw_w_h(avg, avg);
  HADD_SW_S32(vec, *diff);
  HADD_SW_S32(var, res);
  return res;
}

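/* The 64-wide averaged wrappers below process the block as four 16-wide
 * columns and sum the partial SSE and difference results. This one uses the
 * horizontal filter. */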
static uint32_t sub_pixel_avg_sse_diff_64width_h_lsx(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
    int32_t height, int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[4];

  for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
    sse +=
        subpel_avg_ssediff_16w_h_lsx(src, src_stride, dst, dst_stride, sec_pred,
                                     filter, height, &diff0[loop_cnt], 64);
    src += 16;
    dst += 16;
    sec_pred += 16;
  }

  *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];

  return sse;
}

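/* Vertical-filter counterpart of the wrapper above. */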
static uint32_t sub_pixel_avg_sse_diff_64width_v_lsx(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
    int32_t height, int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[4];

  for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
    sse +=
        subpel_avg_ssediff_16w_v_lsx(src, src_stride, dst, dst_stride, sec_pred,
                                     filter, height, &diff0[loop_cnt], 64);
    src += 16;
    dst += 16;
    sec_pred += 16;
  }

  *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];

  return sse;
}

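/* Horizontal-then-vertical counterpart of the wrappers above. */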
static uint32_t sub_pixel_avg_sse_diff_64width_hv_lsx(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
    const uint8_t *filter_vert, int32_t height, int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[4];

  for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
    sse += subpel_avg_ssediff_16w_hv_lsx(src, src_stride, dst, dst_stride,
                                         sec_pred, filter_horiz, filter_vert,
                                         height, &diff0[loop_cnt], 64);
    src += 16;
    dst += 16;
    sec_pred += 16;
  }

  *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];

  return sse;
}

#define VARIANCE_8Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 6)
#define VARIANCE_16Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 8)
#define VARIANCE_32Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 10)
#define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12)

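/* Generates vpx_sub_pixel_variance<wd>x<ht>_lsx(): picks the horizontal,
 * vertical, combined or integer-pel path from the x/y offsets, then converts
 * the SSE/diff pair into a variance with the matching VARIANCE_<wd>Wx<ht>H
 * macro; the integer-pel case falls back to vpx_variance<wd>x<ht>_lsx(). */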
#define VPX_SUB_PIXEL_VARIANCE_WDXHT_LSX(wd, ht)                              \
  uint32_t vpx_sub_pixel_variance##wd##x##ht##_lsx(                           \
      const uint8_t *src, int32_t src_stride, int32_t x_offset,               \
      int32_t y_offset, const uint8_t *ref, int32_t ref_stride,               \
      uint32_t *sse) {                                                        \
    int32_t diff;                                                             \
    uint32_t var;                                                             \
    const uint8_t *h_filter = bilinear_filters_lsx[x_offset];                 \
    const uint8_t *v_filter = bilinear_filters_lsx[y_offset];                 \
                                                                              \
    if (y_offset) {                                                           \
      if (x_offset) {                                                         \
        *sse = sub_pixel_sse_diff_##wd##width_hv_lsx(                         \
            src, src_stride, ref, ref_stride, h_filter, v_filter, ht, &diff); \
      } else {                                                                \
        *sse = sub_pixel_sse_diff_##wd##width_v_lsx(                          \
            src, src_stride, ref, ref_stride, v_filter, ht, &diff);           \
      }                                                                       \
                                                                              \
      var = VARIANCE_##wd##Wx##ht##H(*sse, diff);                             \
    } else {                                                                  \
      if (x_offset) {                                                         \
        *sse = sub_pixel_sse_diff_##wd##width_h_lsx(                          \
            src, src_stride, ref, ref_stride, h_filter, ht, &diff);           \
                                                                              \
        var = VARIANCE_##wd##Wx##ht##H(*sse, diff);                           \
      } else {                                                                \
        var = vpx_variance##wd##x##ht##_lsx(src, src_stride, ref, ref_stride, \
                                            sse);                             \
      }                                                                       \
    }                                                                         \
                                                                              \
    return var;                                                               \
  }

VPX_SUB_PIXEL_VARIANCE_WDXHT_LSX(8, 8)
VPX_SUB_PIXEL_VARIANCE_WDXHT_LSX(16, 16)
VPX_SUB_PIXEL_VARIANCE_WDXHT_LSX(32, 32)

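/* Generates vpx_sub_pixel_avg_variance64x<ht>_lsx(), the compound-prediction
 * variant: the (filtered or direct) prediction is averaged with sec_pred
 * before the SSE/diff accumulation, falling back to avg_sse_diff_64x<ht>_lsx
 * when both offsets are zero. */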
#define VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_LSX(ht)                           \
  uint32_t vpx_sub_pixel_avg_variance64x##ht##_lsx(                           \
      const uint8_t *src_ptr, int32_t src_stride, int32_t x_offset,           \
      int32_t y_offset, const uint8_t *ref_ptr, int32_t ref_stride,           \
      uint32_t *sse, const uint8_t *sec_pred) {                               \
    int32_t diff;                                                             \
    const uint8_t *h_filter = bilinear_filters_lsx[x_offset];                 \
    const uint8_t *v_filter = bilinear_filters_lsx[y_offset];                 \
                                                                              \
    if (y_offset) {                                                           \
      if (x_offset) {                                                         \
        *sse = sub_pixel_avg_sse_diff_64width_hv_lsx(                         \
            src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter,     \
            v_filter, ht, &diff);                                             \
      } else {                                                                \
        *sse = sub_pixel_avg_sse_diff_64width_v_lsx(                          \
            src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, v_filter, ht, \
            &diff);                                                           \
      }                                                                       \
    } else {                                                                  \
      if (x_offset) {                                                         \
        *sse = sub_pixel_avg_sse_diff_64width_h_lsx(                          \
            src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, ht, \
            &diff);                                                           \
      } else {                                                                \
        *sse = avg_sse_diff_64x##ht##_lsx(src_ptr, src_stride, ref_ptr,       \
                                          ref_stride, sec_pred, &diff);       \
      }                                                                       \
    }                                                                         \
                                                                              \
    return VARIANCE_64Wx##ht##H(*sse, diff);                                  \
  }

VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_LSX(64)
875