xref: /aosp_15_r20/external/libvpx/vpx_dsp/loongarch/subtract_lsx.c (revision fb1b10ab9aebc7c7068eedab379b749d7e3900be)
/*
 *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10*fb1b10abSAndroid Build Coastguard Worker 
11*fb1b10abSAndroid Build Coastguard Worker #include "./vpx_dsp_rtcd.h"
12*fb1b10abSAndroid Build Coastguard Worker #include "vpx_util/loongson_intrinsics.h"
13*fb1b10abSAndroid Build Coastguard Worker 
sub_blk_4x4_lsx(const uint8_t * src_ptr,int32_t src_stride,const uint8_t * pred_ptr,int32_t pred_stride,int16_t * diff_ptr,int32_t diff_stride)14*fb1b10abSAndroid Build Coastguard Worker static void sub_blk_4x4_lsx(const uint8_t *src_ptr, int32_t src_stride,
15*fb1b10abSAndroid Build Coastguard Worker                             const uint8_t *pred_ptr, int32_t pred_stride,
16*fb1b10abSAndroid Build Coastguard Worker                             int16_t *diff_ptr, int32_t diff_stride) {
17*fb1b10abSAndroid Build Coastguard Worker   __m128i src0, src1, src2, src3;
18*fb1b10abSAndroid Build Coastguard Worker   __m128i pred0, pred1, pred2, pred3;
19*fb1b10abSAndroid Build Coastguard Worker   __m128i diff0, diff1;
20*fb1b10abSAndroid Build Coastguard Worker   __m128i reg0, reg1;
21*fb1b10abSAndroid Build Coastguard Worker   int32_t src_stride2 = src_stride << 1;
22*fb1b10abSAndroid Build Coastguard Worker   int32_t pred_stride2 = pred_stride << 1;
23*fb1b10abSAndroid Build Coastguard Worker   int32_t diff_stride2 = diff_stride << 1;
24*fb1b10abSAndroid Build Coastguard Worker   int32_t src_stride3 = src_stride2 + src_stride;
25*fb1b10abSAndroid Build Coastguard Worker   int32_t pred_stride3 = pred_stride2 + pred_stride;
26*fb1b10abSAndroid Build Coastguard Worker   int32_t diff_stride3 = diff_stride2 + diff_stride;
27*fb1b10abSAndroid Build Coastguard Worker 
28*fb1b10abSAndroid Build Coastguard Worker   DUP4_ARG2(__lsx_vldrepl_w, src_ptr, 0, src_ptr + src_stride, 0,
29*fb1b10abSAndroid Build Coastguard Worker             src_ptr + src_stride2, 0, src_ptr + src_stride3, 0, src0, src1,
30*fb1b10abSAndroid Build Coastguard Worker             src2, src3);
31*fb1b10abSAndroid Build Coastguard Worker   DUP4_ARG2(__lsx_vldrepl_w, pred_ptr, 0, pred_ptr + pred_stride, 0,
32*fb1b10abSAndroid Build Coastguard Worker             pred_ptr + pred_stride2, 0, pred_ptr + pred_stride3, 0, pred0,
33*fb1b10abSAndroid Build Coastguard Worker             pred1, pred2, pred3);
34*fb1b10abSAndroid Build Coastguard Worker   DUP4_ARG2(__lsx_vilvl_w, src1, src0, src3, src2, pred1, pred0, pred3, pred2,
35*fb1b10abSAndroid Build Coastguard Worker             src0, src2, pred0, pred2);
36*fb1b10abSAndroid Build Coastguard Worker   DUP2_ARG2(__lsx_vilvl_d, src2, src0, pred2, pred0, src0, pred0);
37*fb1b10abSAndroid Build Coastguard Worker   reg0 = __lsx_vilvl_b(src0, pred0);
38*fb1b10abSAndroid Build Coastguard Worker   reg1 = __lsx_vilvh_b(src0, pred0);
39*fb1b10abSAndroid Build Coastguard Worker   DUP2_ARG2(__lsx_vhsubw_hu_bu, reg0, reg0, reg1, reg1, diff0, diff1);
40*fb1b10abSAndroid Build Coastguard Worker   __lsx_vstelm_d(diff0, diff_ptr, 0, 0);
41*fb1b10abSAndroid Build Coastguard Worker   __lsx_vstelm_d(diff0, diff_ptr + diff_stride, 0, 1);
42*fb1b10abSAndroid Build Coastguard Worker   __lsx_vstelm_d(diff1, diff_ptr + diff_stride2, 0, 0);
43*fb1b10abSAndroid Build Coastguard Worker   __lsx_vstelm_d(diff1, diff_ptr + diff_stride3, 0, 1);
44*fb1b10abSAndroid Build Coastguard Worker }
45*fb1b10abSAndroid Build Coastguard Worker 
sub_blk_8x8_lsx(const uint8_t * src_ptr,int32_t src_stride,const uint8_t * pred_ptr,int32_t pred_stride,int16_t * diff_ptr,int32_t diff_stride)46*fb1b10abSAndroid Build Coastguard Worker static void sub_blk_8x8_lsx(const uint8_t *src_ptr, int32_t src_stride,
47*fb1b10abSAndroid Build Coastguard Worker                             const uint8_t *pred_ptr, int32_t pred_stride,
48*fb1b10abSAndroid Build Coastguard Worker                             int16_t *diff_ptr, int32_t diff_stride) {
49*fb1b10abSAndroid Build Coastguard Worker   __m128i src0, src1, src2, src3, src4, src5, src6, src7;
50*fb1b10abSAndroid Build Coastguard Worker   __m128i pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
51*fb1b10abSAndroid Build Coastguard Worker   __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
52*fb1b10abSAndroid Build Coastguard Worker   int32_t src_stride2 = src_stride << 1;
53*fb1b10abSAndroid Build Coastguard Worker   int32_t pred_stride2 = pred_stride << 1;
54*fb1b10abSAndroid Build Coastguard Worker   int32_t dst_stride = diff_stride << 1;
55*fb1b10abSAndroid Build Coastguard Worker   int32_t src_stride3 = src_stride2 + src_stride;
56*fb1b10abSAndroid Build Coastguard Worker   int32_t pred_stride3 = pred_stride2 + pred_stride;
57*fb1b10abSAndroid Build Coastguard Worker   int32_t dst_stride2 = dst_stride << 1;
58*fb1b10abSAndroid Build Coastguard Worker   int32_t src_stride4 = src_stride2 << 1;
59*fb1b10abSAndroid Build Coastguard Worker   int32_t pred_stride4 = pred_stride2 << 1;
60*fb1b10abSAndroid Build Coastguard Worker   int32_t dst_stride3 = dst_stride + dst_stride2;
61*fb1b10abSAndroid Build Coastguard Worker 
62*fb1b10abSAndroid Build Coastguard Worker   DUP4_ARG2(__lsx_vldrepl_d, src_ptr, 0, src_ptr + src_stride, 0,
63*fb1b10abSAndroid Build Coastguard Worker             src_ptr + src_stride2, 0, src_ptr + src_stride3, 0, src0, src1,
64*fb1b10abSAndroid Build Coastguard Worker             src2, src3);
65*fb1b10abSAndroid Build Coastguard Worker   DUP4_ARG2(__lsx_vldrepl_d, pred_ptr, 0, pred_ptr + pred_stride, 0,
66*fb1b10abSAndroid Build Coastguard Worker             pred_ptr + pred_stride2, 0, pred_ptr + pred_stride3, 0, pred0,
67*fb1b10abSAndroid Build Coastguard Worker             pred1, pred2, pred3);
68*fb1b10abSAndroid Build Coastguard Worker   src_ptr += src_stride4;
69*fb1b10abSAndroid Build Coastguard Worker   pred_ptr += pred_stride4;
70*fb1b10abSAndroid Build Coastguard Worker 
71*fb1b10abSAndroid Build Coastguard Worker   DUP4_ARG2(__lsx_vldrepl_d, src_ptr, 0, src_ptr + src_stride, 0,
72*fb1b10abSAndroid Build Coastguard Worker             src_ptr + src_stride2, 0, src_ptr + src_stride3, 0, src4, src5,
73*fb1b10abSAndroid Build Coastguard Worker             src6, src7);
74*fb1b10abSAndroid Build Coastguard Worker   DUP4_ARG2(__lsx_vldrepl_d, pred_ptr, 0, pred_ptr + pred_stride, 0,
75*fb1b10abSAndroid Build Coastguard Worker             pred_ptr + pred_stride2, 0, pred_ptr + pred_stride3, 0, pred4,
76*fb1b10abSAndroid Build Coastguard Worker             pred5, pred6, pred7);
77*fb1b10abSAndroid Build Coastguard Worker 
78*fb1b10abSAndroid Build Coastguard Worker   DUP4_ARG2(__lsx_vilvl_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3,
79*fb1b10abSAndroid Build Coastguard Worker             reg0, reg1, reg2, reg3);
80*fb1b10abSAndroid Build Coastguard Worker   DUP4_ARG2(__lsx_vilvl_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7,
81*fb1b10abSAndroid Build Coastguard Worker             reg4, reg5, reg6, reg7);
82*fb1b10abSAndroid Build Coastguard Worker   DUP4_ARG2(__lsx_vhsubw_hu_bu, reg0, reg0, reg1, reg1, reg2, reg2, reg3, reg3,
83*fb1b10abSAndroid Build Coastguard Worker             src0, src1, src2, src3);
84*fb1b10abSAndroid Build Coastguard Worker   DUP4_ARG2(__lsx_vhsubw_hu_bu, reg4, reg4, reg5, reg5, reg6, reg6, reg7, reg7,
85*fb1b10abSAndroid Build Coastguard Worker             src4, src5, src6, src7);
86*fb1b10abSAndroid Build Coastguard Worker   __lsx_vst(src0, diff_ptr, 0);
87*fb1b10abSAndroid Build Coastguard Worker   __lsx_vstx(src1, diff_ptr, dst_stride);
88*fb1b10abSAndroid Build Coastguard Worker   __lsx_vstx(src2, diff_ptr, dst_stride2);
89*fb1b10abSAndroid Build Coastguard Worker   __lsx_vstx(src3, diff_ptr, dst_stride3);
90*fb1b10abSAndroid Build Coastguard Worker   diff_ptr += dst_stride2;
91*fb1b10abSAndroid Build Coastguard Worker   __lsx_vst(src4, diff_ptr, 0);
92*fb1b10abSAndroid Build Coastguard Worker   __lsx_vstx(src5, diff_ptr, dst_stride);
93*fb1b10abSAndroid Build Coastguard Worker   __lsx_vstx(src6, diff_ptr, dst_stride2);
94*fb1b10abSAndroid Build Coastguard Worker   __lsx_vstx(src7, diff_ptr, dst_stride3);
95*fb1b10abSAndroid Build Coastguard Worker }
96*fb1b10abSAndroid Build Coastguard Worker 
sub_blk_16x16_lsx(const uint8_t * src,int32_t src_stride,const uint8_t * pred,int32_t pred_stride,int16_t * diff,int32_t diff_stride)97*fb1b10abSAndroid Build Coastguard Worker static void sub_blk_16x16_lsx(const uint8_t *src, int32_t src_stride,
98*fb1b10abSAndroid Build Coastguard Worker                               const uint8_t *pred, int32_t pred_stride,
99*fb1b10abSAndroid Build Coastguard Worker                               int16_t *diff, int32_t diff_stride) {
100*fb1b10abSAndroid Build Coastguard Worker   __m128i src0, src1, src2, src3, src4, src5, src6, src7;
101*fb1b10abSAndroid Build Coastguard Worker   __m128i pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
102*fb1b10abSAndroid Build Coastguard Worker   __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
103*fb1b10abSAndroid Build Coastguard Worker   __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
104*fb1b10abSAndroid Build Coastguard Worker   int32_t src_stride2 = src_stride << 1;
105*fb1b10abSAndroid Build Coastguard Worker   int32_t pred_stride2 = pred_stride << 1;
106*fb1b10abSAndroid Build Coastguard Worker   int32_t dst_stride = diff_stride << 1;
107*fb1b10abSAndroid Build Coastguard Worker   int32_t src_stride3 = src_stride2 + src_stride;
108*fb1b10abSAndroid Build Coastguard Worker   int32_t pred_stride3 = pred_stride2 + pred_stride;
109*fb1b10abSAndroid Build Coastguard Worker   int32_t dst_stride2 = dst_stride << 1;
110*fb1b10abSAndroid Build Coastguard Worker   int32_t src_stride4 = src_stride2 << 1;
111*fb1b10abSAndroid Build Coastguard Worker   int32_t pred_stride4 = pred_stride2 << 1;
112*fb1b10abSAndroid Build Coastguard Worker   int32_t dst_stride3 = dst_stride + dst_stride2;
113*fb1b10abSAndroid Build Coastguard Worker   int16_t *diff_tmp = diff + 8;
114*fb1b10abSAndroid Build Coastguard Worker 
115*fb1b10abSAndroid Build Coastguard Worker   DUP2_ARG2(__lsx_vld, src, 0, pred, 0, src0, pred0);
116*fb1b10abSAndroid Build Coastguard Worker   DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
117*fb1b10abSAndroid Build Coastguard Worker             src, src_stride4, src1, src2, src3, src4);
118*fb1b10abSAndroid Build Coastguard Worker   DUP4_ARG2(__lsx_vldx, pred, pred_stride, pred, pred_stride2, pred,
119*fb1b10abSAndroid Build Coastguard Worker             pred_stride3, pred, pred_stride4, pred1, pred2, pred3, pred4);
120*fb1b10abSAndroid Build Coastguard Worker   src += src_stride4;
121*fb1b10abSAndroid Build Coastguard Worker   pred += pred_stride4;
122*fb1b10abSAndroid Build Coastguard Worker   DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
123*fb1b10abSAndroid Build Coastguard Worker             pred, pred_stride, src5, src6, src7, pred5);
124*fb1b10abSAndroid Build Coastguard Worker   DUP2_ARG2(__lsx_vldx, pred, pred_stride2, pred, pred_stride3, pred6, pred7);
125*fb1b10abSAndroid Build Coastguard Worker   src += src_stride4;
126*fb1b10abSAndroid Build Coastguard Worker   pred += pred_stride4;
127*fb1b10abSAndroid Build Coastguard Worker   DUP4_ARG2(__lsx_vilvl_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3,
128*fb1b10abSAndroid Build Coastguard Worker             reg0, reg2, reg4, reg6);
129*fb1b10abSAndroid Build Coastguard Worker   DUP4_ARG2(__lsx_vilvh_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3,
130*fb1b10abSAndroid Build Coastguard Worker             reg1, reg3, reg5, reg7);
131*fb1b10abSAndroid Build Coastguard Worker   DUP4_ARG2(__lsx_vilvl_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7,
132*fb1b10abSAndroid Build Coastguard Worker             tmp0, tmp2, tmp4, tmp6);
133*fb1b10abSAndroid Build Coastguard Worker   DUP4_ARG2(__lsx_vilvh_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7,
134*fb1b10abSAndroid Build Coastguard Worker             tmp1, tmp3, tmp5, tmp7);
135*fb1b10abSAndroid Build Coastguard Worker   DUP4_ARG2(__lsx_vhsubw_hu_bu, reg0, reg0, reg1, reg1, reg2, reg2, reg3, reg3,
136*fb1b10abSAndroid Build Coastguard Worker             src0, src1, src2, src3);
137*fb1b10abSAndroid Build Coastguard Worker   DUP4_ARG2(__lsx_vhsubw_hu_bu, reg4, reg4, reg5, reg5, reg6, reg6, reg7, reg7,
138*fb1b10abSAndroid Build Coastguard Worker             src4, src5, src6, src7);
139*fb1b10abSAndroid Build Coastguard Worker   DUP4_ARG2(__lsx_vhsubw_hu_bu, tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3, tmp3,
140*fb1b10abSAndroid Build Coastguard Worker             pred0, pred1, pred2, pred3);
141*fb1b10abSAndroid Build Coastguard Worker   DUP4_ARG2(__lsx_vhsubw_hu_bu, tmp4, tmp4, tmp5, tmp5, tmp6, tmp6, tmp7, tmp7,
142*fb1b10abSAndroid Build Coastguard Worker             pred4, pred5, pred6, pred7);
143*fb1b10abSAndroid Build Coastguard Worker   __lsx_vst(src0, diff, 0);
144*fb1b10abSAndroid Build Coastguard Worker   __lsx_vstx(src2, diff, dst_stride);
145*fb1b10abSAndroid Build Coastguard Worker   __lsx_vstx(src4, diff, dst_stride2);
146*fb1b10abSAndroid Build Coastguard Worker   __lsx_vstx(src6, diff, dst_stride3);
147*fb1b10abSAndroid Build Coastguard Worker   __lsx_vst(src1, diff_tmp, 0);
148*fb1b10abSAndroid Build Coastguard Worker   __lsx_vstx(src3, diff_tmp, dst_stride);
149*fb1b10abSAndroid Build Coastguard Worker   __lsx_vstx(src5, diff_tmp, dst_stride2);
150*fb1b10abSAndroid Build Coastguard Worker   __lsx_vstx(src7, diff_tmp, dst_stride3);
151*fb1b10abSAndroid Build Coastguard Worker   diff += dst_stride2;
152*fb1b10abSAndroid Build Coastguard Worker   diff_tmp += dst_stride2;
153*fb1b10abSAndroid Build Coastguard Worker   __lsx_vst(pred0, diff, 0);
154*fb1b10abSAndroid Build Coastguard Worker   __lsx_vstx(pred2, diff, dst_stride);
155*fb1b10abSAndroid Build Coastguard Worker   __lsx_vstx(pred4, diff, dst_stride2);
156*fb1b10abSAndroid Build Coastguard Worker   __lsx_vstx(pred6, diff, dst_stride3);
157*fb1b10abSAndroid Build Coastguard Worker   __lsx_vst(pred1, diff_tmp, 0);
158*fb1b10abSAndroid Build Coastguard Worker   __lsx_vstx(pred3, diff_tmp, dst_stride);
159*fb1b10abSAndroid Build Coastguard Worker   __lsx_vstx(pred5, diff_tmp, dst_stride2);
160*fb1b10abSAndroid Build Coastguard Worker   __lsx_vstx(pred7, diff_tmp, dst_stride3);
161*fb1b10abSAndroid Build Coastguard Worker   diff += dst_stride2;
162*fb1b10abSAndroid Build Coastguard Worker   diff_tmp += dst_stride2;
163*fb1b10abSAndroid Build Coastguard Worker   DUP2_ARG2(__lsx_vld, src, 0, pred, 0, src0, pred0);
164*fb1b10abSAndroid Build Coastguard Worker   DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
165*fb1b10abSAndroid Build Coastguard Worker             src, src_stride4, src1, src2, src3, src4);
166*fb1b10abSAndroid Build Coastguard Worker   DUP4_ARG2(__lsx_vldx, pred, pred_stride, pred, pred_stride2, pred,
167*fb1b10abSAndroid Build Coastguard Worker             pred_stride3, pred, pred_stride4, pred1, pred2, pred3, pred4);
168*fb1b10abSAndroid Build Coastguard Worker   src += src_stride4;
169*fb1b10abSAndroid Build Coastguard Worker   pred += pred_stride4;
170*fb1b10abSAndroid Build Coastguard Worker   DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
171*fb1b10abSAndroid Build Coastguard Worker             pred, pred_stride, src5, src6, src7, pred5);
172*fb1b10abSAndroid Build Coastguard Worker   DUP2_ARG2(__lsx_vldx, pred, pred_stride2, pred, pred_stride3, pred6, pred7);
173*fb1b10abSAndroid Build Coastguard Worker   DUP4_ARG2(__lsx_vilvl_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3,
174*fb1b10abSAndroid Build Coastguard Worker             reg0, reg2, reg4, reg6);
175*fb1b10abSAndroid Build Coastguard Worker   DUP4_ARG2(__lsx_vilvh_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3,
176*fb1b10abSAndroid Build Coastguard Worker             reg1, reg3, reg5, reg7);
177*fb1b10abSAndroid Build Coastguard Worker   DUP4_ARG2(__lsx_vilvl_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7,
178*fb1b10abSAndroid Build Coastguard Worker             tmp0, tmp2, tmp4, tmp6);
179*fb1b10abSAndroid Build Coastguard Worker   DUP4_ARG2(__lsx_vilvh_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7,
180*fb1b10abSAndroid Build Coastguard Worker             tmp1, tmp3, tmp5, tmp7);
181*fb1b10abSAndroid Build Coastguard Worker   DUP4_ARG2(__lsx_vhsubw_hu_bu, reg0, reg0, reg1, reg1, reg2, reg2, reg3, reg3,
182*fb1b10abSAndroid Build Coastguard Worker             src0, src1, src2, src3);
183*fb1b10abSAndroid Build Coastguard Worker   DUP4_ARG2(__lsx_vhsubw_hu_bu, reg4, reg4, reg5, reg5, reg6, reg6, reg7, reg7,
184*fb1b10abSAndroid Build Coastguard Worker             src4, src5, src6, src7);
185*fb1b10abSAndroid Build Coastguard Worker   DUP4_ARG2(__lsx_vhsubw_hu_bu, tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3, tmp3,
186*fb1b10abSAndroid Build Coastguard Worker             pred0, pred1, pred2, pred3);
187*fb1b10abSAndroid Build Coastguard Worker   DUP4_ARG2(__lsx_vhsubw_hu_bu, tmp4, tmp4, tmp5, tmp5, tmp6, tmp6, tmp7, tmp7,
188*fb1b10abSAndroid Build Coastguard Worker             pred4, pred5, pred6, pred7);
189*fb1b10abSAndroid Build Coastguard Worker   __lsx_vst(src0, diff, 0);
190*fb1b10abSAndroid Build Coastguard Worker   __lsx_vstx(src2, diff, dst_stride);
191*fb1b10abSAndroid Build Coastguard Worker   __lsx_vstx(src4, diff, dst_stride2);
192*fb1b10abSAndroid Build Coastguard Worker   __lsx_vstx(src6, diff, dst_stride3);
193*fb1b10abSAndroid Build Coastguard Worker   __lsx_vst(src1, diff_tmp, 0);
194*fb1b10abSAndroid Build Coastguard Worker   __lsx_vstx(src3, diff_tmp, dst_stride);
195*fb1b10abSAndroid Build Coastguard Worker   __lsx_vstx(src5, diff_tmp, dst_stride2);
196*fb1b10abSAndroid Build Coastguard Worker   __lsx_vstx(src7, diff_tmp, dst_stride3);
197*fb1b10abSAndroid Build Coastguard Worker   diff += dst_stride2;
198*fb1b10abSAndroid Build Coastguard Worker   diff_tmp += dst_stride2;
199*fb1b10abSAndroid Build Coastguard Worker   __lsx_vst(pred0, diff, 0);
200*fb1b10abSAndroid Build Coastguard Worker   __lsx_vstx(pred2, diff, dst_stride);
201*fb1b10abSAndroid Build Coastguard Worker   __lsx_vstx(pred4, diff, dst_stride2);
202*fb1b10abSAndroid Build Coastguard Worker   __lsx_vstx(pred6, diff, dst_stride3);
203*fb1b10abSAndroid Build Coastguard Worker   __lsx_vst(pred1, diff_tmp, 0);
204*fb1b10abSAndroid Build Coastguard Worker   __lsx_vstx(pred3, diff_tmp, dst_stride);
205*fb1b10abSAndroid Build Coastguard Worker   __lsx_vstx(pred5, diff_tmp, dst_stride2);
206*fb1b10abSAndroid Build Coastguard Worker   __lsx_vstx(pred7, diff_tmp, dst_stride3);
207*fb1b10abSAndroid Build Coastguard Worker }
208*fb1b10abSAndroid Build Coastguard Worker 
sub_blk_32x32_lsx(const uint8_t * src,int32_t src_stride,const uint8_t * pred,int32_t pred_stride,int16_t * diff,int32_t diff_stride)209*fb1b10abSAndroid Build Coastguard Worker static void sub_blk_32x32_lsx(const uint8_t *src, int32_t src_stride,
210*fb1b10abSAndroid Build Coastguard Worker                               const uint8_t *pred, int32_t pred_stride,
211*fb1b10abSAndroid Build Coastguard Worker                               int16_t *diff, int32_t diff_stride) {
212*fb1b10abSAndroid Build Coastguard Worker   __m128i src0, src1, src2, src3, src4, src5, src6, src7;
213*fb1b10abSAndroid Build Coastguard Worker   __m128i pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
214*fb1b10abSAndroid Build Coastguard Worker   __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
215*fb1b10abSAndroid Build Coastguard Worker   __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
216*fb1b10abSAndroid Build Coastguard Worker   uint32_t loop_cnt;
217*fb1b10abSAndroid Build Coastguard Worker   int32_t src_stride2 = src_stride << 1;
218*fb1b10abSAndroid Build Coastguard Worker   int32_t pred_stride2 = pred_stride << 1;
219*fb1b10abSAndroid Build Coastguard Worker   int32_t src_stride3 = src_stride2 + src_stride;
220*fb1b10abSAndroid Build Coastguard Worker   int32_t pred_stride3 = pred_stride2 + pred_stride;
221*fb1b10abSAndroid Build Coastguard Worker   int32_t src_stride4 = src_stride2 << 1;
222*fb1b10abSAndroid Build Coastguard Worker   int32_t pred_stride4 = pred_stride2 << 1;
223*fb1b10abSAndroid Build Coastguard Worker 
224*fb1b10abSAndroid Build Coastguard Worker   for (loop_cnt = 8; loop_cnt--;) {
225*fb1b10abSAndroid Build Coastguard Worker     const uint8_t *src_tmp = src + 16;
226*fb1b10abSAndroid Build Coastguard Worker     const uint8_t *pred_tmp = pred + 16;
227*fb1b10abSAndroid Build Coastguard Worker     DUP4_ARG2(__lsx_vld, src, 0, src_tmp, 0, pred, 0, pred_tmp, 0, src0, src1,
228*fb1b10abSAndroid Build Coastguard Worker               pred0, pred1);
229*fb1b10abSAndroid Build Coastguard Worker     DUP4_ARG2(__lsx_vldx, src, src_stride, src_tmp, src_stride, src,
230*fb1b10abSAndroid Build Coastguard Worker               src_stride2, src_tmp, src_stride2, src2, src3, src4, src5);
231*fb1b10abSAndroid Build Coastguard Worker     DUP4_ARG2(__lsx_vldx, src, src_stride3, src_tmp, src_stride3, pred,
232*fb1b10abSAndroid Build Coastguard Worker               pred_stride, pred_tmp, pred_stride, src6, src7, pred2, pred3);
233*fb1b10abSAndroid Build Coastguard Worker     DUP4_ARG2(__lsx_vldx, pred, pred_stride2, pred_tmp, pred_stride2, pred,
234*fb1b10abSAndroid Build Coastguard Worker               pred_stride3, pred_tmp, pred_stride3, pred4, pred5, pred6, pred7);
235*fb1b10abSAndroid Build Coastguard Worker     DUP4_ARG2(__lsx_vilvl_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3,
236*fb1b10abSAndroid Build Coastguard Worker               reg0, reg2, reg4, reg6);
237*fb1b10abSAndroid Build Coastguard Worker     DUP4_ARG2(__lsx_vilvh_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3,
238*fb1b10abSAndroid Build Coastguard Worker               reg1, reg3, reg5, reg7);
239*fb1b10abSAndroid Build Coastguard Worker     DUP4_ARG2(__lsx_vilvl_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7,
240*fb1b10abSAndroid Build Coastguard Worker               tmp0, tmp2, tmp4, tmp6);
241*fb1b10abSAndroid Build Coastguard Worker     DUP4_ARG2(__lsx_vilvh_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7,
242*fb1b10abSAndroid Build Coastguard Worker               tmp1, tmp3, tmp5, tmp7);
243*fb1b10abSAndroid Build Coastguard Worker     DUP4_ARG2(__lsx_vhsubw_hu_bu, reg0, reg0, reg1, reg1, reg2, reg2, reg3,
244*fb1b10abSAndroid Build Coastguard Worker               reg3, src0, src1, src2, src3);
245*fb1b10abSAndroid Build Coastguard Worker     DUP4_ARG2(__lsx_vhsubw_hu_bu, reg4, reg4, reg5, reg5, reg6, reg6, reg7,
246*fb1b10abSAndroid Build Coastguard Worker               reg7, src4, src5, src6, src7);
247*fb1b10abSAndroid Build Coastguard Worker     DUP4_ARG2(__lsx_vhsubw_hu_bu, tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3,
248*fb1b10abSAndroid Build Coastguard Worker               tmp3, pred0, pred1, pred2, pred3);
249*fb1b10abSAndroid Build Coastguard Worker     DUP4_ARG2(__lsx_vhsubw_hu_bu, tmp4, tmp4, tmp5, tmp5, tmp6, tmp6, tmp7,
250*fb1b10abSAndroid Build Coastguard Worker               tmp7, pred4, pred5, pred6, pred7);
251*fb1b10abSAndroid Build Coastguard Worker     src += src_stride4;
252*fb1b10abSAndroid Build Coastguard Worker     pred += pred_stride4;
253*fb1b10abSAndroid Build Coastguard Worker     __lsx_vst(src0, diff, 0);
254*fb1b10abSAndroid Build Coastguard Worker     __lsx_vst(src1, diff, 16);
255*fb1b10abSAndroid Build Coastguard Worker     __lsx_vst(src2, diff, 32);
256*fb1b10abSAndroid Build Coastguard Worker     __lsx_vst(src3, diff, 48);
257*fb1b10abSAndroid Build Coastguard Worker     diff += diff_stride;
258*fb1b10abSAndroid Build Coastguard Worker     __lsx_vst(src4, diff, 0);
259*fb1b10abSAndroid Build Coastguard Worker     __lsx_vst(src5, diff, 16);
260*fb1b10abSAndroid Build Coastguard Worker     __lsx_vst(src6, diff, 32);
261*fb1b10abSAndroid Build Coastguard Worker     __lsx_vst(src7, diff, 48);
262*fb1b10abSAndroid Build Coastguard Worker     diff += diff_stride;
263*fb1b10abSAndroid Build Coastguard Worker     __lsx_vst(pred0, diff, 0);
264*fb1b10abSAndroid Build Coastguard Worker     __lsx_vst(pred1, diff, 16);
265*fb1b10abSAndroid Build Coastguard Worker     __lsx_vst(pred2, diff, 32);
266*fb1b10abSAndroid Build Coastguard Worker     __lsx_vst(pred3, diff, 48);
267*fb1b10abSAndroid Build Coastguard Worker     diff += diff_stride;
268*fb1b10abSAndroid Build Coastguard Worker     __lsx_vst(pred4, diff, 0);
269*fb1b10abSAndroid Build Coastguard Worker     __lsx_vst(pred5, diff, 16);
270*fb1b10abSAndroid Build Coastguard Worker     __lsx_vst(pred6, diff, 32);
271*fb1b10abSAndroid Build Coastguard Worker     __lsx_vst(pred7, diff, 48);
272*fb1b10abSAndroid Build Coastguard Worker     diff += diff_stride;
273*fb1b10abSAndroid Build Coastguard Worker   }
274*fb1b10abSAndroid Build Coastguard Worker }
275*fb1b10abSAndroid Build Coastguard Worker 
sub_blk_64x64_lsx(const uint8_t * src,int32_t src_stride,const uint8_t * pred,int32_t pred_stride,int16_t * diff,int32_t diff_stride)276*fb1b10abSAndroid Build Coastguard Worker static void sub_blk_64x64_lsx(const uint8_t *src, int32_t src_stride,
277*fb1b10abSAndroid Build Coastguard Worker                               const uint8_t *pred, int32_t pred_stride,
278*fb1b10abSAndroid Build Coastguard Worker                               int16_t *diff, int32_t diff_stride) {
279*fb1b10abSAndroid Build Coastguard Worker   __m128i src0, src1, src2, src3, src4, src5, src6, src7;
280*fb1b10abSAndroid Build Coastguard Worker   __m128i pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
281*fb1b10abSAndroid Build Coastguard Worker   __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
282*fb1b10abSAndroid Build Coastguard Worker   __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
283*fb1b10abSAndroid Build Coastguard Worker   uint32_t loop_cnt;
284*fb1b10abSAndroid Build Coastguard Worker 
285*fb1b10abSAndroid Build Coastguard Worker   for (loop_cnt = 32; loop_cnt--;) {
286*fb1b10abSAndroid Build Coastguard Worker     DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2,
287*fb1b10abSAndroid Build Coastguard Worker               src3);
288*fb1b10abSAndroid Build Coastguard Worker     DUP4_ARG2(__lsx_vld, pred, 0, pred, 16, pred, 32, pred, 48, pred0, pred1,
289*fb1b10abSAndroid Build Coastguard Worker               pred2, pred3);
290*fb1b10abSAndroid Build Coastguard Worker     src += src_stride;
291*fb1b10abSAndroid Build Coastguard Worker     pred += pred_stride;
292*fb1b10abSAndroid Build Coastguard Worker     DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src4, src5, src6,
293*fb1b10abSAndroid Build Coastguard Worker               src7);
294*fb1b10abSAndroid Build Coastguard Worker     DUP4_ARG2(__lsx_vld, pred, 0, pred, 16, pred, 32, pred, 48, pred4, pred5,
295*fb1b10abSAndroid Build Coastguard Worker               pred6, pred7);
296*fb1b10abSAndroid Build Coastguard Worker     src += src_stride;
297*fb1b10abSAndroid Build Coastguard Worker     pred += pred_stride;
298*fb1b10abSAndroid Build Coastguard Worker 
299*fb1b10abSAndroid Build Coastguard Worker     DUP4_ARG2(__lsx_vilvl_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3,
300*fb1b10abSAndroid Build Coastguard Worker               reg0, reg2, reg4, reg6);
301*fb1b10abSAndroid Build Coastguard Worker     DUP4_ARG2(__lsx_vilvh_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3,
302*fb1b10abSAndroid Build Coastguard Worker               reg1, reg3, reg5, reg7);
303*fb1b10abSAndroid Build Coastguard Worker     DUP4_ARG2(__lsx_vilvl_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7,
304*fb1b10abSAndroid Build Coastguard Worker               tmp0, tmp2, tmp4, tmp6);
305*fb1b10abSAndroid Build Coastguard Worker     DUP4_ARG2(__lsx_vilvh_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7,
306*fb1b10abSAndroid Build Coastguard Worker               tmp1, tmp3, tmp5, tmp7);
307*fb1b10abSAndroid Build Coastguard Worker     DUP4_ARG2(__lsx_vhsubw_hu_bu, reg0, reg0, reg1, reg1, reg2, reg2, reg3,
308*fb1b10abSAndroid Build Coastguard Worker               reg3, src0, src1, src2, src3);
309*fb1b10abSAndroid Build Coastguard Worker     DUP4_ARG2(__lsx_vhsubw_hu_bu, reg4, reg4, reg5, reg5, reg6, reg6, reg7,
310*fb1b10abSAndroid Build Coastguard Worker               reg7, src4, src5, src6, src7);
311*fb1b10abSAndroid Build Coastguard Worker     DUP4_ARG2(__lsx_vhsubw_hu_bu, tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3,
312*fb1b10abSAndroid Build Coastguard Worker               tmp3, pred0, pred1, pred2, pred3);
313*fb1b10abSAndroid Build Coastguard Worker     DUP4_ARG2(__lsx_vhsubw_hu_bu, tmp4, tmp4, tmp5, tmp5, tmp6, tmp6, tmp7,
314*fb1b10abSAndroid Build Coastguard Worker               tmp7, pred4, pred5, pred6, pred7);
315*fb1b10abSAndroid Build Coastguard Worker     __lsx_vst(src0, diff, 0);
316*fb1b10abSAndroid Build Coastguard Worker     __lsx_vst(src1, diff, 16);
317*fb1b10abSAndroid Build Coastguard Worker     __lsx_vst(src2, diff, 32);
318*fb1b10abSAndroid Build Coastguard Worker     __lsx_vst(src3, diff, 48);
319*fb1b10abSAndroid Build Coastguard Worker     __lsx_vst(src4, diff, 64);
320*fb1b10abSAndroid Build Coastguard Worker     __lsx_vst(src5, diff, 80);
321*fb1b10abSAndroid Build Coastguard Worker     __lsx_vst(src6, diff, 96);
322*fb1b10abSAndroid Build Coastguard Worker     __lsx_vst(src7, diff, 112);
323*fb1b10abSAndroid Build Coastguard Worker     diff += diff_stride;
324*fb1b10abSAndroid Build Coastguard Worker     __lsx_vst(pred0, diff, 0);
325*fb1b10abSAndroid Build Coastguard Worker     __lsx_vst(pred1, diff, 16);
326*fb1b10abSAndroid Build Coastguard Worker     __lsx_vst(pred2, diff, 32);
327*fb1b10abSAndroid Build Coastguard Worker     __lsx_vst(pred3, diff, 48);
328*fb1b10abSAndroid Build Coastguard Worker     __lsx_vst(pred4, diff, 64);
329*fb1b10abSAndroid Build Coastguard Worker     __lsx_vst(pred5, diff, 80);
330*fb1b10abSAndroid Build Coastguard Worker     __lsx_vst(pred6, diff, 96);
331*fb1b10abSAndroid Build Coastguard Worker     __lsx_vst(pred7, diff, 112);
332*fb1b10abSAndroid Build Coastguard Worker     diff += diff_stride;
333*fb1b10abSAndroid Build Coastguard Worker   }
334*fb1b10abSAndroid Build Coastguard Worker }
335*fb1b10abSAndroid Build Coastguard Worker 
/*
 * LSX entry point for block subtraction: dispatches to a size-specific
 * kernel for square blocks of 4, 8, 16, 32 or 64 pixels and defers to
 * vpx_subtract_block_c() for every other shape.
 *
 * rows, cols: block dimensions.
 * diff_ptr / diff_stride: destination buffer and its row pitch.
 * src_ptr / src_stride: source pixels and their row pitch.
 * pred_ptr / pred_stride: prediction pixels and their row pitch.
 *
 * Note the argument order differs between this function (diff first) and
 * the static kernels (src, pred, diff). The kernels take int32_t strides,
 * so the ptrdiff_t strides are implicitly narrowed on the fast paths.
 */
void vpx_subtract_block_lsx(int32_t rows, int32_t cols, int16_t *diff_ptr,
                            ptrdiff_t diff_stride, const uint8_t *src_ptr,
                            ptrdiff_t src_stride, const uint8_t *pred_ptr,
                            ptrdiff_t pred_stride) {
  /* Only square blocks have vector kernels. */
  if (rows != cols) {
    vpx_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr, src_stride,
                         pred_ptr, pred_stride);
    return;
  }

  switch (rows) {
    case 4:
      sub_blk_4x4_lsx(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
                      diff_stride);
      break;
    case 8:
      sub_blk_8x8_lsx(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
                      diff_stride);
      break;
    case 16:
      sub_blk_16x16_lsx(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
                        diff_stride);
      break;
    case 32:
      sub_blk_32x32_lsx(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
                        diff_stride);
      break;
    case 64:
      sub_blk_64x64_lsx(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
                        diff_stride);
      break;
    default:
      /* Square but not a supported size: use the portable C version. */
      vpx_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr,
                           src_stride, pred_ptr, pred_stride);
      break;
  }
}
372