/*
 *  Copyright 2022 The LibYuv Project Authors. All rights reserved.
 *
 *  Copyright (c) 2022 Loongson Technology Corporation Limited
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"

#if !defined(LIBYUV_DISABLE_LSX) && defined(__loongarch_sx)
#include "libyuv/loongson_intrinsics.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// Fill YUV -> RGB conversion constants into vectors
#define YUVTORGB_SETUP(yuvconst, vr, ub, vg, ug, yg, yb) \
  {                                                      \
    ub = __lsx_vreplgr2vr_h(yuvconst->kUVToB[0]);        \
    vr = __lsx_vreplgr2vr_h(yuvconst->kUVToR[1]);        \
    ug = __lsx_vreplgr2vr_h(yuvconst->kUVToG[0]);        \
    vg = __lsx_vreplgr2vr_h(yuvconst->kUVToG[1]);        \
    yg = __lsx_vreplgr2vr_h(yuvconst->kYToRgb[0]);       \
    yb = __lsx_vreplgr2vr_w(yuvconst->kYBiasToRgb[0]);   \
  }

// Load 16 YUV422 pixel data
#define READYUV422_D(psrc_y, psrc_u, psrc_v, out_y, uv_l, uv_h) \
  {                                                             \
    __m128i temp0, temp1;                                       \
                                                                \
    DUP2_ARG2(__lsx_vld, psrc_y, 0, psrc_u, 0, out_y, temp0);   \
    temp1 = __lsx_vld(psrc_v, 0);                               \
    temp0 = __lsx_vsub_b(temp0, const_80);                      \
    temp1 = __lsx_vsub_b(temp1, const_80);                      \
    temp0 = __lsx_vsllwil_h_b(temp0, 0);                        \
    temp1 = __lsx_vsllwil_h_b(temp1, 0);                        \
    uv_l = __lsx_vilvl_h(temp0, temp1);                         \
    uv_h = __lsx_vilvh_h(temp0, temp1);                         \
  }

// Load 8 YUV422 pixel data
#define READYUV422(psrc_y, psrc_u, psrc_v, out_y, uv) \
  {                                                   \
    __m128i temp0, temp1;                             \
                                                      \
    out_y = __lsx_vld(psrc_y, 0);                     \
    temp0 = __lsx_vldrepl_d(psrc_u, 0);               \
    temp1 = __lsx_vldrepl_d(psrc_v, 0);               \
    uv = __lsx_vilvl_b(temp0, temp1);                 \
    uv = __lsx_vsub_b(uv, const_80);                  \
    uv = __lsx_vsllwil_h_b(uv, 0);                    \
  }

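// In both loaders, the 0x80 chroma bias is subtracted while the samples
// are still bytes, then __lsx_vsllwil_h_b with a shift of 0 sign-extends
// the low half to 16 bits, yielding signed (U - 128) and (V - 128)
// halfwords for the converters below.
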
// Convert 16 pixels of YUV420 to RGB.
#define YUVTORGB_D(in_y, in_uvl, in_uvh, ubvr, ugvg, yg, yb, b_l, b_h, g_l, \
                   g_h, r_l, r_h)                                           \
  {                                                                         \
    __m128i u_l, u_h, v_l, v_h;                                             \
    __m128i yl_ev, yl_od, yh_ev, yh_od;                                     \
    __m128i temp0, temp1, temp2, temp3;                                     \
                                                                            \
    temp0 = __lsx_vilvl_b(in_y, in_y);                                      \
    temp1 = __lsx_vilvh_b(in_y, in_y);                                      \
    yl_ev = __lsx_vmulwev_w_hu_h(temp0, yg);                                \
    yl_od = __lsx_vmulwod_w_hu_h(temp0, yg);                                \
    yh_ev = __lsx_vmulwev_w_hu_h(temp1, yg);                                \
    yh_od = __lsx_vmulwod_w_hu_h(temp1, yg);                                \
    DUP4_ARG2(__lsx_vsrai_w, yl_ev, 16, yl_od, 16, yh_ev, 16, yh_od, 16,    \
              yl_ev, yl_od, yh_ev, yh_od);                                  \
    yl_ev = __lsx_vadd_w(yl_ev, yb);                                        \
    yl_od = __lsx_vadd_w(yl_od, yb);                                        \
    yh_ev = __lsx_vadd_w(yh_ev, yb);                                        \
    yh_od = __lsx_vadd_w(yh_od, yb);                                        \
    v_l = __lsx_vmulwev_w_h(in_uvl, ubvr);                                  \
    u_l = __lsx_vmulwod_w_h(in_uvl, ubvr);                                  \
    v_h = __lsx_vmulwev_w_h(in_uvh, ubvr);                                  \
    u_h = __lsx_vmulwod_w_h(in_uvh, ubvr);                                  \
    temp0 = __lsx_vadd_w(yl_ev, u_l);                                       \
    temp1 = __lsx_vadd_w(yl_od, u_l);                                       \
    temp2 = __lsx_vadd_w(yh_ev, u_h);                                       \
    temp3 = __lsx_vadd_w(yh_od, u_h);                                       \
    DUP4_ARG2(__lsx_vsrai_w, temp0, 6, temp1, 6, temp2, 6, temp3, 6, temp0, \
              temp1, temp2, temp3);                                         \
    DUP4_ARG1(__lsx_vclip255_w, temp0, temp1, temp2, temp3, temp0, temp1,   \
              temp2, temp3);                                                \
    b_l = __lsx_vpackev_h(temp1, temp0);                                    \
    b_h = __lsx_vpackev_h(temp3, temp2);                                    \
    temp0 = __lsx_vadd_w(yl_ev, v_l);                                       \
    temp1 = __lsx_vadd_w(yl_od, v_l);                                       \
    temp2 = __lsx_vadd_w(yh_ev, v_h);                                       \
    temp3 = __lsx_vadd_w(yh_od, v_h);                                       \
    DUP4_ARG2(__lsx_vsrai_w, temp0, 6, temp1, 6, temp2, 6, temp3, 6, temp0, \
              temp1, temp2, temp3);                                         \
    DUP4_ARG1(__lsx_vclip255_w, temp0, temp1, temp2, temp3, temp0, temp1,   \
              temp2, temp3);                                                \
    r_l = __lsx_vpackev_h(temp1, temp0);                                    \
    r_h = __lsx_vpackev_h(temp3, temp2);                                    \
    DUP2_ARG2(__lsx_vdp2_w_h, in_uvl, ugvg, in_uvh, ugvg, u_l, u_h);        \
    temp0 = __lsx_vsub_w(yl_ev, u_l);                                       \
    temp1 = __lsx_vsub_w(yl_od, u_l);                                       \
    temp2 = __lsx_vsub_w(yh_ev, u_h);                                       \
    temp3 = __lsx_vsub_w(yh_od, u_h);                                       \
    DUP4_ARG2(__lsx_vsrai_w, temp0, 6, temp1, 6, temp2, 6, temp3, 6, temp0, \
              temp1, temp2, temp3);                                         \
    DUP4_ARG1(__lsx_vclip255_w, temp0, temp1, temp2, temp3, temp0, temp1,   \
              temp2, temp3);                                                \
    g_l = __lsx_vpackev_h(temp1, temp0);                                    \
    g_h = __lsx_vpackev_h(temp3, temp2);                                    \
  }

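// In scalar terms, YUVTORGB_D above and YUVTORGB below compute, per pixel
// (a fixed-point sketch; the coefficients come from yuvconstants):
//   y1 = ((Y * 0x0101 * yg) >> 16) + yb
//   B  = clip255((y1 + ub * u) >> 6)
//   G  = clip255((y1 - (ug * u + vg * v)) >> 6)
//   R  = clip255((y1 + vr * v) >> 6)
// where u and v are the bias-corrected (U - 128) and (V - 128) samples.
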
// Convert 8 pixels of YUV420 to RGB.
#define YUVTORGB(in_y, in_vu, vrub, vgug, yg, yb, out_b, out_g, out_r) \
  {                                                                    \
    __m128i y_ev, y_od, u_l, v_l;                                      \
    __m128i tmp0, tmp1, tmp2, tmp3;                                    \
                                                                       \
    tmp0 = __lsx_vilvl_b(in_y, in_y);                                  \
    y_ev = __lsx_vmulwev_w_hu_h(tmp0, yg);                             \
    y_od = __lsx_vmulwod_w_hu_h(tmp0, yg);                             \
    y_ev = __lsx_vsrai_w(y_ev, 16);                                    \
    y_od = __lsx_vsrai_w(y_od, 16);                                    \
    y_ev = __lsx_vadd_w(y_ev, yb);                                     \
    y_od = __lsx_vadd_w(y_od, yb);                                     \
    in_vu = __lsx_vilvl_b(zero, in_vu);                                \
    in_vu = __lsx_vsub_h(in_vu, const_80);                             \
    u_l = __lsx_vmulwev_w_h(in_vu, vrub);                              \
    v_l = __lsx_vmulwod_w_h(in_vu, vrub);                              \
    tmp0 = __lsx_vadd_w(y_ev, u_l);                                    \
    tmp1 = __lsx_vadd_w(y_od, u_l);                                    \
    tmp2 = __lsx_vadd_w(y_ev, v_l);                                    \
    tmp3 = __lsx_vadd_w(y_od, v_l);                                    \
    tmp0 = __lsx_vsrai_w(tmp0, 6);                                     \
    tmp1 = __lsx_vsrai_w(tmp1, 6);                                     \
    tmp2 = __lsx_vsrai_w(tmp2, 6);                                     \
    tmp3 = __lsx_vsrai_w(tmp3, 6);                                     \
    tmp0 = __lsx_vclip255_w(tmp0);                                     \
    tmp1 = __lsx_vclip255_w(tmp1);                                     \
    tmp2 = __lsx_vclip255_w(tmp2);                                     \
    tmp3 = __lsx_vclip255_w(tmp3);                                     \
    out_b = __lsx_vpackev_h(tmp1, tmp0);                               \
    out_r = __lsx_vpackev_h(tmp3, tmp2);                               \
    tmp0 = __lsx_vdp2_w_h(in_vu, vgug);                                \
    tmp1 = __lsx_vsub_w(y_ev, tmp0);                                   \
    tmp2 = __lsx_vsub_w(y_od, tmp0);                                   \
    tmp1 = __lsx_vsrai_w(tmp1, 6);                                     \
    tmp2 = __lsx_vsrai_w(tmp2, 6);                                     \
    tmp1 = __lsx_vclip255_w(tmp1);                                     \
    tmp2 = __lsx_vclip255_w(tmp2);                                     \
    out_g = __lsx_vpackev_h(tmp2, tmp1);                               \
  }

// Convert 8 pixels of I444 to RGB.
#define I444TORGB(in_yy, in_u, in_v, ub, vr, ugvg, yg, yb, out_b, out_g, \
                  out_r)                                                 \
  {                                                                      \
    __m128i y_ev, y_od, u_ev, v_ev, u_od, v_od;                          \
    __m128i tmp0, tmp1, tmp2, tmp3;                                      \
                                                                         \
    y_ev = __lsx_vmulwev_w_hu_h(in_yy, yg);                              \
    y_od = __lsx_vmulwod_w_hu_h(in_yy, yg);                              \
    y_ev = __lsx_vsrai_w(y_ev, 16);                                      \
    y_od = __lsx_vsrai_w(y_od, 16);                                      \
    y_ev = __lsx_vadd_w(y_ev, yb);                                       \
    y_od = __lsx_vadd_w(y_od, yb);                                       \
    in_u = __lsx_vsub_h(in_u, const_80);                                 \
    in_v = __lsx_vsub_h(in_v, const_80);                                 \
    u_ev = __lsx_vmulwev_w_h(in_u, ub);                                  \
    u_od = __lsx_vmulwod_w_h(in_u, ub);                                  \
    v_ev = __lsx_vmulwev_w_h(in_v, vr);                                  \
    v_od = __lsx_vmulwod_w_h(in_v, vr);                                  \
    tmp0 = __lsx_vadd_w(y_ev, u_ev);                                     \
    tmp1 = __lsx_vadd_w(y_od, u_od);                                     \
    tmp2 = __lsx_vadd_w(y_ev, v_ev);                                     \
    tmp3 = __lsx_vadd_w(y_od, v_od);                                     \
    tmp0 = __lsx_vsrai_w(tmp0, 6);                                       \
    tmp1 = __lsx_vsrai_w(tmp1, 6);                                       \
    tmp2 = __lsx_vsrai_w(tmp2, 6);                                       \
    tmp3 = __lsx_vsrai_w(tmp3, 6);                                       \
    tmp0 = __lsx_vclip255_w(tmp0);                                       \
    tmp1 = __lsx_vclip255_w(tmp1);                                       \
    tmp2 = __lsx_vclip255_w(tmp2);                                       \
    tmp3 = __lsx_vclip255_w(tmp3);                                       \
    out_b = __lsx_vpackev_h(tmp1, tmp0);                                 \
    out_r = __lsx_vpackev_h(tmp3, tmp2);                                 \
    u_ev = __lsx_vpackev_h(in_u, in_v);                                  \
    u_od = __lsx_vpackod_h(in_u, in_v);                                  \
    v_ev = __lsx_vdp2_w_h(u_ev, ugvg);                                   \
    v_od = __lsx_vdp2_w_h(u_od, ugvg);                                   \
    tmp0 = __lsx_vsub_w(y_ev, v_ev);                                     \
    tmp1 = __lsx_vsub_w(y_od, v_od);                                     \
    tmp0 = __lsx_vsrai_w(tmp0, 6);                                       \
    tmp1 = __lsx_vsrai_w(tmp1, 6);                                       \
    tmp0 = __lsx_vclip255_w(tmp0);                                       \
    tmp1 = __lsx_vclip255_w(tmp1);                                       \
    out_g = __lsx_vpackev_h(tmp1, tmp0);                                 \
  }

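// ARGB output is little-endian, so each pixel is stored as the byte
// sequence B, G, R, A. In the store macros below, __lsx_vpackev_b pairs
// B with G and R with A, and the halfword interleaves then merge those
// pairs into complete 32-bit pixels.
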
// Pack and Store 16 ARGB values.
#define STOREARGB_D(a_l, a_h, r_l, r_h, g_l, g_h, b_l, b_h, pdst_argb) \
  {                                                                    \
    __m128i temp0, temp1, temp2, temp3;                                \
    temp0 = __lsx_vpackev_b(g_l, b_l);                                 \
    temp1 = __lsx_vpackev_b(a_l, r_l);                                 \
    temp2 = __lsx_vpackev_b(g_h, b_h);                                 \
    temp3 = __lsx_vpackev_b(a_h, r_h);                                 \
    r_l = __lsx_vilvl_h(temp1, temp0);                                 \
    r_h = __lsx_vilvh_h(temp1, temp0);                                 \
    g_l = __lsx_vilvl_h(temp3, temp2);                                 \
    g_h = __lsx_vilvh_h(temp3, temp2);                                 \
    __lsx_vst(r_l, pdst_argb, 0);                                      \
    __lsx_vst(r_h, pdst_argb, 16);                                     \
    __lsx_vst(g_l, pdst_argb, 32);                                     \
    __lsx_vst(g_h, pdst_argb, 48);                                     \
    pdst_argb += 64;                                                   \
  }

// Pack and Store 8 ARGB values.
#define STOREARGB(in_a, in_r, in_g, in_b, pdst_argb) \
  {                                                  \
    __m128i temp0, temp1;                            \
    __m128i dst0, dst1;                              \
                                                     \
    temp0 = __lsx_vpackev_b(in_g, in_b);             \
    temp1 = __lsx_vpackev_b(in_a, in_r);             \
    dst0 = __lsx_vilvl_h(temp1, temp0);              \
    dst1 = __lsx_vilvh_h(temp1, temp0);              \
    __lsx_vst(dst0, pdst_argb, 0);                   \
    __lsx_vst(dst1, pdst_argb, 16);                  \
    pdst_argb += 32;                                 \
  }

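// Average each 2x2 block of ARGB and convert it to one U and one V
// sample. In scalar terms (a sketch, with B, G, R the 2x2 averages):
//   U = (0x8080 + 112 * B - 74 * G - 38 * R) >> 8
//   V = (0x8080 + 112 * R - 94 * G - 18 * B) >> 8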
#define RGBTOUV(_tmpb, _tmpg, _tmpr, _nexb, _nexg, _nexr, _dst0) \
  {                                                              \
    __m128i _tmp0, _tmp1, _tmp2, _tmp3;                          \
    __m128i _reg0, _reg1;                                        \
    _tmp0 = __lsx_vaddwev_h_bu(_tmpb, _nexb);                    \
    _tmp1 = __lsx_vaddwod_h_bu(_tmpb, _nexb);                    \
    _tmp2 = __lsx_vaddwev_h_bu(_tmpg, _nexg);                    \
    _tmp3 = __lsx_vaddwod_h_bu(_tmpg, _nexg);                    \
    _reg0 = __lsx_vaddwev_h_bu(_tmpr, _nexr);                    \
    _reg1 = __lsx_vaddwod_h_bu(_tmpr, _nexr);                    \
    _tmpb = __lsx_vavgr_hu(_tmp0, _tmp1);                        \
    _tmpg = __lsx_vavgr_hu(_tmp2, _tmp3);                        \
    _tmpr = __lsx_vavgr_hu(_reg0, _reg1);                        \
    _reg0 = __lsx_vmadd_h(const_8080, const_112, _tmpb);         \
    _reg1 = __lsx_vmadd_h(const_8080, const_112, _tmpr);         \
    _reg0 = __lsx_vmsub_h(_reg0, const_74, _tmpg);               \
    _reg1 = __lsx_vmsub_h(_reg1, const_94, _tmpg);               \
    _reg0 = __lsx_vmsub_h(_reg0, const_38, _tmpr);               \
    _reg1 = __lsx_vmsub_h(_reg1, const_18, _tmpb);               \
    _dst0 = __lsx_vpickod_b(_reg1, _reg0);                       \
  }

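// Note: the row functions below consume the row in fixed-size vector
// chunks, so callers are expected to pass widths that are a multiple of
// the chunk size; any leftover pixels are normally handled by the
// generic ANY wrappers libyuv builds around these kernels.
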
void MirrorRow_LSX(const uint8_t* src, uint8_t* dst, int width) {
  int x;
  int len = width / 32;
  __m128i src0, src1;
  __m128i shuffler = {0x08090A0B0C0D0E0F, 0x0001020304050607};
  src += width - 32;
  for (x = 0; x < len; x++) {
    DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1);
    DUP2_ARG3(__lsx_vshuf_b, src0, src0, shuffler, src1, src1, shuffler, src0,
              src1);
    __lsx_vst(src1, dst, 0);
    __lsx_vst(src0, dst, 16);
    dst += 32;
    src -= 32;
  }
}

void MirrorUVRow_LSX(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
  int x;
  int len = width / 8;
  __m128i src, dst;
  __m128i shuffler = {0x0004000500060007, 0x0000000100020003};

  src_uv += (width - 8) << 1;
  for (x = 0; x < len; x++) {
    src = __lsx_vld(src_uv, 0);
    dst = __lsx_vshuf_h(shuffler, src, src);
    __lsx_vst(dst, dst_uv, 0);
    src_uv -= 16;
    dst_uv += 16;
  }
}

void ARGBMirrorRow_LSX(const uint8_t* src, uint8_t* dst, int width) {
  int x;
  int len = width / 8;
  __m128i src0, src1;
  __m128i shuffler = {0x0B0A09080F0E0D0C, 0x0302010007060504};

  src += (width * 4) - 32;
  for (x = 0; x < len; x++) {
    DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1);
    DUP2_ARG3(__lsx_vshuf_b, src0, src0, shuffler, src1, src1, shuffler, src0,
              src1);
    __lsx_vst(src1, dst, 0);
    __lsx_vst(src0, dst, 16);
    dst += 32;
    src -= 32;
  }
}

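// In memory, YUY2 interleaves pixels as Y0,U,Y1,V while UYVY uses
// U,Y0,V,Y1; the two packers below differ only in which operand of the
// byte interleave supplies the luma.
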
void I422ToYUY2Row_LSX(const uint8_t* src_y,
                       const uint8_t* src_u,
                       const uint8_t* src_v,
                       uint8_t* dst_yuy2,
                       int width) {
  int x;
  int len = width / 16;
  __m128i src_u0, src_v0, src_y0, vec_uv0;
  __m128i vec_yuy2_0, vec_yuy2_1;

  for (x = 0; x < len; x++) {
    DUP2_ARG2(__lsx_vld, src_u, 0, src_v, 0, src_u0, src_v0);
    src_y0 = __lsx_vld(src_y, 0);
    vec_uv0 = __lsx_vilvl_b(src_v0, src_u0);
    vec_yuy2_0 = __lsx_vilvl_b(vec_uv0, src_y0);
    vec_yuy2_1 = __lsx_vilvh_b(vec_uv0, src_y0);
    __lsx_vst(vec_yuy2_0, dst_yuy2, 0);
    __lsx_vst(vec_yuy2_1, dst_yuy2, 16);
    src_u += 8;
    src_v += 8;
    src_y += 16;
    dst_yuy2 += 32;
  }
}

void I422ToUYVYRow_LSX(const uint8_t* src_y,
                       const uint8_t* src_u,
                       const uint8_t* src_v,
                       uint8_t* dst_uyvy,
                       int width) {
  int x;
  int len = width / 16;
  __m128i src_u0, src_v0, src_y0, vec_uv0;
  __m128i vec_uyvy0, vec_uyvy1;

  for (x = 0; x < len; x++) {
    DUP2_ARG2(__lsx_vld, src_u, 0, src_v, 0, src_u0, src_v0);
    src_y0 = __lsx_vld(src_y, 0);
    vec_uv0 = __lsx_vilvl_b(src_v0, src_u0);
    vec_uyvy0 = __lsx_vilvl_b(src_y0, vec_uv0);
    vec_uyvy1 = __lsx_vilvh_b(src_y0, vec_uv0);
    __lsx_vst(vec_uyvy0, dst_uyvy, 0);
    __lsx_vst(vec_uyvy1, dst_uyvy, 16);
    src_u += 8;
    src_v += 8;
    src_y += 16;
    dst_uyvy += 32;
  }
}

void I422ToARGBRow_LSX(const uint8_t* src_y,
                       const uint8_t* src_u,
                       const uint8_t* src_v,
                       uint8_t* dst_argb,
                       const struct YuvConstants* yuvconstants,
                       int width) {
  int x;
  int len = width / 16;
  __m128i vec_yb, vec_yg, vec_ub, vec_ug, vec_vr, vec_vg;
  __m128i vec_ubvr, vec_ugvg;
  __m128i alpha = __lsx_vldi(0xFF);
  __m128i const_80 = __lsx_vldi(0x80);

  YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb);
  vec_ubvr = __lsx_vilvl_h(vec_ub, vec_vr);
  vec_ugvg = __lsx_vilvl_h(vec_ug, vec_vg);

  for (x = 0; x < len; x++) {
    __m128i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h;

    READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h);
    YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l,
               g_h, r_l, r_h);
    STOREARGB_D(alpha, alpha, r_l, r_h, g_l, g_h, b_l, b_h, dst_argb);
    src_y += 16;
    src_u += 8;
    src_v += 8;
  }
}

void I422ToRGBARow_LSX(const uint8_t* src_y,
                       const uint8_t* src_u,
                       const uint8_t* src_v,
                       uint8_t* dst_argb,
                       const struct YuvConstants* yuvconstants,
                       int width) {
  int x;
  int len = width / 16;
  __m128i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg;
  __m128i vec_ubvr, vec_ugvg;
  __m128i alpha = __lsx_vldi(0xFF);
  __m128i const_80 = __lsx_vldi(0x80);

  YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb);
  vec_ubvr = __lsx_vilvl_h(vec_ub, vec_vr);
  vec_ugvg = __lsx_vilvl_h(vec_ug, vec_vg);

  for (x = 0; x < len; x++) {
    __m128i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h;

    READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h);
    YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l,
               g_h, r_l, r_h);
    STOREARGB_D(r_l, r_h, g_l, g_h, b_l, b_h, alpha, alpha, dst_argb);
    src_y += 16;
    src_u += 8;
    src_v += 8;
  }
}

void I422AlphaToARGBRow_LSX(const uint8_t* src_y,
                            const uint8_t* src_u,
                            const uint8_t* src_v,
                            const uint8_t* src_a,
                            uint8_t* dst_argb,
                            const struct YuvConstants* yuvconstants,
                            int width) {
  int x;
  int len = width / 16;
  int res = width & 15;
  __m128i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg;
  __m128i vec_ubvr, vec_ugvg;
  __m128i zero = __lsx_vldi(0);
  __m128i const_80 = __lsx_vldi(0x80);

  YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb);
  vec_ubvr = __lsx_vilvl_h(vec_ub, vec_vr);
  vec_ugvg = __lsx_vilvl_h(vec_ug, vec_vg);

  for (x = 0; x < len; x++) {
    __m128i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h, a_l, a_h;

    y = __lsx_vld(src_a, 0);
    a_l = __lsx_vilvl_b(zero, y);
    a_h = __lsx_vilvh_b(zero, y);
    READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h);
    YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l,
               g_h, r_l, r_h);
    STOREARGB_D(a_l, a_h, r_l, r_h, g_l, g_h, b_l, b_h, dst_argb);
    src_y += 16;
    src_u += 8;
    src_v += 8;
    src_a += 16;
  }
  if (res) {
    __m128i y, uv, r, g, b, a;
    a = __lsx_vld(src_a, 0);
    a = __lsx_vsllwil_hu_bu(a, 0);
    READYUV422(src_y, src_u, src_v, y, uv);
    YUVTORGB(y, uv, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b, g, r);
    STOREARGB(a, r, g, b, dst_argb);
  }
}

void I422ToRGB24Row_LSX(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int32_t width) {
  int x;
  int len = width / 16;
  __m128i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg;
  __m128i vec_ubvr, vec_ugvg;
  __m128i const_80 = __lsx_vldi(0x80);
  __m128i shuffler0 = {0x0504120302100100, 0x0A18090816070614};
  __m128i shuffler1 = {0x1E0F0E1C0D0C1A0B, 0x1E0F0E1C0D0C1A0B};

  YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb);
  vec_ubvr = __lsx_vilvl_h(vec_ub, vec_vr);
  vec_ugvg = __lsx_vilvl_h(vec_ug, vec_vg);

  for (x = 0; x < len; x++) {
    __m128i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h;
    __m128i temp0, temp1, temp2, temp3;

    READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h);
    YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l,
               g_h, r_l, r_h);
    temp0 = __lsx_vpackev_b(g_l, b_l);
    temp1 = __lsx_vpackev_b(g_h, b_h);
    DUP4_ARG3(__lsx_vshuf_b, r_l, temp0, shuffler1, r_h, temp1, shuffler1, r_l,
              temp0, shuffler0, r_h, temp1, shuffler0, temp2, temp3, temp0,
              temp1);

    b_l = __lsx_vilvl_d(temp1, temp2);
    b_h = __lsx_vilvh_d(temp3, temp1);
    __lsx_vst(temp0, dst_argb, 0);
    __lsx_vst(b_l, dst_argb, 16);
    __lsx_vst(b_h, dst_argb, 32);
    dst_argb += 48;
    src_y += 16;
    src_u += 8;
    src_v += 8;
  }
}

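// RGB565 packs each pixel as (R >> 3) << 11 | (G >> 2) << 5 | (B >> 3):
// 5 bits of red, 6 of green and 5 of blue in one 16-bit value.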
// TODO(fbarchard): Consider AND instead of shift to isolate 5 upper bits of R.
void I422ToRGB565Row_LSX(const uint8_t* src_y,
                         const uint8_t* src_u,
                         const uint8_t* src_v,
                         uint8_t* dst_rgb565,
                         const struct YuvConstants* yuvconstants,
                         int width) {
  int x;
  int len = width / 16;
  __m128i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg;
  __m128i vec_ubvr, vec_ugvg;
  __m128i const_80 = __lsx_vldi(0x80);

  YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb);
  vec_ubvr = __lsx_vilvl_h(vec_ub, vec_vr);
  vec_ugvg = __lsx_vilvl_h(vec_ug, vec_vg);

  for (x = 0; x < len; x++) {
    __m128i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h;

    READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h);
    YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l,
               g_h, r_l, r_h);
    b_l = __lsx_vsrli_h(b_l, 3);
    b_h = __lsx_vsrli_h(b_h, 3);
    g_l = __lsx_vsrli_h(g_l, 2);
    g_h = __lsx_vsrli_h(g_h, 2);
    r_l = __lsx_vsrli_h(r_l, 3);
    r_h = __lsx_vsrli_h(r_h, 3);
    r_l = __lsx_vslli_h(r_l, 11);
    r_h = __lsx_vslli_h(r_h, 11);
    g_l = __lsx_vslli_h(g_l, 5);
    g_h = __lsx_vslli_h(g_h, 5);
    r_l = __lsx_vor_v(r_l, g_l);
    r_l = __lsx_vor_v(r_l, b_l);
    r_h = __lsx_vor_v(r_h, g_h);
    r_h = __lsx_vor_v(r_h, b_h);
    __lsx_vst(r_l, dst_rgb565, 0);
    __lsx_vst(r_h, dst_rgb565, 16);
    dst_rgb565 += 32;
    src_y += 16;
    src_u += 8;
    src_v += 8;
  }
}

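// ARGB4444 keeps the upper 4 bits of each channel, laid out in 16 bits
// as A:4 R:4 G:4 B:4; alpha is forced to 0xF here.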
// TODO(fbarchard): Consider AND instead of shift to isolate 4 upper bits of G.
void I422ToARGB4444Row_LSX(const uint8_t* src_y,
                           const uint8_t* src_u,
                           const uint8_t* src_v,
                           uint8_t* dst_argb4444,
                           const struct YuvConstants* yuvconstants,
                           int width) {
  int x;
  int len = width / 16;
  __m128i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg;
  __m128i vec_ubvr, vec_ugvg;
  __m128i const_80 = __lsx_vldi(0x80);
  __m128i alpha = (__m128i)v2u64{0xF000F000F000F000, 0xF000F000F000F000};
  __m128i mask = {0x00F000F000F000F0, 0x00F000F000F000F0};

  YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb);
  vec_ubvr = __lsx_vilvl_h(vec_ub, vec_vr);
  vec_ugvg = __lsx_vilvl_h(vec_ug, vec_vg);

  for (x = 0; x < len; x++) {
    __m128i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h;

    READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h);
    YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l,
               g_h, r_l, r_h);
    b_l = __lsx_vsrli_h(b_l, 4);
    b_h = __lsx_vsrli_h(b_h, 4);
    r_l = __lsx_vsrli_h(r_l, 4);
    r_h = __lsx_vsrli_h(r_h, 4);
    g_l = __lsx_vand_v(g_l, mask);
    g_h = __lsx_vand_v(g_h, mask);
    r_l = __lsx_vslli_h(r_l, 8);
    r_h = __lsx_vslli_h(r_h, 8);
    r_l = __lsx_vor_v(r_l, alpha);
    r_h = __lsx_vor_v(r_h, alpha);
    r_l = __lsx_vor_v(r_l, g_l);
    r_h = __lsx_vor_v(r_h, g_h);
    r_l = __lsx_vor_v(r_l, b_l);
    r_h = __lsx_vor_v(r_h, b_h);
    __lsx_vst(r_l, dst_argb4444, 0);
    __lsx_vst(r_h, dst_argb4444, 16);
    dst_argb4444 += 32;
    src_y += 16;
    src_u += 8;
    src_v += 8;
  }
}

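// ARGB1555 packs A:1 R:5 G:5 B:5 into 16 bits; alpha is forced to 1
// (the 0x8000 bit).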
void I422ToARGB1555Row_LSX(const uint8_t* src_y,
                           const uint8_t* src_u,
                           const uint8_t* src_v,
                           uint8_t* dst_argb1555,
                           const struct YuvConstants* yuvconstants,
                           int width) {
  int x;
  int len = width / 16;
  __m128i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg;
  __m128i vec_ubvr, vec_ugvg;
  __m128i const_80 = __lsx_vldi(0x80);
  __m128i alpha = (__m128i)v2u64{0x8000800080008000, 0x8000800080008000};

  YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb);
  vec_ubvr = __lsx_vilvl_h(vec_ub, vec_vr);
  vec_ugvg = __lsx_vilvl_h(vec_ug, vec_vg);

  for (x = 0; x < len; x++) {
    __m128i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h;

    READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h);
    YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l,
               g_h, r_l, r_h);
    b_l = __lsx_vsrli_h(b_l, 3);
    b_h = __lsx_vsrli_h(b_h, 3);
    g_l = __lsx_vsrli_h(g_l, 3);
    g_h = __lsx_vsrli_h(g_h, 3);
    g_l = __lsx_vslli_h(g_l, 5);
    g_h = __lsx_vslli_h(g_h, 5);
    r_l = __lsx_vsrli_h(r_l, 3);
    r_h = __lsx_vsrli_h(r_h, 3);
    r_l = __lsx_vslli_h(r_l, 10);
    r_h = __lsx_vslli_h(r_h, 10);
    r_l = __lsx_vor_v(r_l, alpha);
    r_h = __lsx_vor_v(r_h, alpha);
    r_l = __lsx_vor_v(r_l, g_l);
    r_h = __lsx_vor_v(r_h, g_h);
    r_l = __lsx_vor_v(r_l, b_l);
    r_h = __lsx_vor_v(r_h, b_h);
    __lsx_vst(r_l, dst_argb1555, 0);
    __lsx_vst(r_h, dst_argb1555, 16);
    dst_argb1555 += 32;
    src_y += 16;
    src_u += 8;
    src_v += 8;
  }
}

void YUY2ToYRow_LSX(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
  int x;
  int len = width / 16;
  __m128i src0, src1, dst0;

  for (x = 0; x < len; x++) {
    DUP2_ARG2(__lsx_vld, src_yuy2, 0, src_yuy2, 16, src0, src1);
    dst0 = __lsx_vpickev_b(src1, src0);
    __lsx_vst(dst0, dst_y, 0);
    src_yuy2 += 32;
    dst_y += 16;
  }
}

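// The *ToUVRow extractors below average the chroma of two adjacent rows
// (src_stride apart) to produce vertically subsampled 4:2:0 chroma; the
// *ToUV422Row variants read a single row.
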
void YUY2ToUVRow_LSX(const uint8_t* src_yuy2,
                     int src_stride_yuy2,
                     uint8_t* dst_u,
                     uint8_t* dst_v,
                     int width) {
  const uint8_t* src_yuy2_next = src_yuy2 + src_stride_yuy2;
  int x;
  int len = width / 16;
  __m128i src0, src1, src2, src3;
  __m128i tmp0, dst0, dst1;

  for (x = 0; x < len; x++) {
    DUP4_ARG2(__lsx_vld, src_yuy2, 0, src_yuy2, 16, src_yuy2_next, 0,
              src_yuy2_next, 16, src0, src1, src2, src3);
    src0 = __lsx_vpickod_b(src1, src0);
    src1 = __lsx_vpickod_b(src3, src2);
    tmp0 = __lsx_vavgr_bu(src1, src0);
    dst0 = __lsx_vpickev_b(tmp0, tmp0);
    dst1 = __lsx_vpickod_b(tmp0, tmp0);
    __lsx_vstelm_d(dst0, dst_u, 0, 0);
    __lsx_vstelm_d(dst1, dst_v, 0, 0);
    src_yuy2 += 32;
    src_yuy2_next += 32;
    dst_u += 8;
    dst_v += 8;
  }
}

void YUY2ToUV422Row_LSX(const uint8_t* src_yuy2,
                        uint8_t* dst_u,
                        uint8_t* dst_v,
                        int width) {
  int x;
  int len = width / 16;
  __m128i src0, src1, tmp0, dst0, dst1;

  for (x = 0; x < len; x++) {
    DUP2_ARG2(__lsx_vld, src_yuy2, 0, src_yuy2, 16, src0, src1);
    tmp0 = __lsx_vpickod_b(src1, src0);
    dst0 = __lsx_vpickev_b(tmp0, tmp0);
    dst1 = __lsx_vpickod_b(tmp0, tmp0);
    __lsx_vstelm_d(dst0, dst_u, 0, 0);
    __lsx_vstelm_d(dst1, dst_v, 0, 0);
    src_yuy2 += 32;
    dst_u += 8;
    dst_v += 8;
  }
}

void UYVYToYRow_LSX(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
  int x;
  int len = width / 16;
  __m128i src0, src1, dst0;

  for (x = 0; x < len; x++) {
    DUP2_ARG2(__lsx_vld, src_uyvy, 0, src_uyvy, 16, src0, src1);
    dst0 = __lsx_vpickod_b(src1, src0);
    __lsx_vst(dst0, dst_y, 0);
    src_uyvy += 32;
    dst_y += 16;
  }
}

void UYVYToUVRow_LSX(const uint8_t* src_uyvy,
                     int src_stride_uyvy,
                     uint8_t* dst_u,
                     uint8_t* dst_v,
                     int width) {
  const uint8_t* src_uyvy_next = src_uyvy + src_stride_uyvy;
  int x;
  int len = width / 16;
  __m128i src0, src1, src2, src3, tmp0, dst0, dst1;

  for (x = 0; x < len; x++) {
    DUP4_ARG2(__lsx_vld, src_uyvy, 0, src_uyvy, 16, src_uyvy_next, 0,
              src_uyvy_next, 16, src0, src1, src2, src3);
    src0 = __lsx_vpickev_b(src1, src0);
    src1 = __lsx_vpickev_b(src3, src2);
    tmp0 = __lsx_vavgr_bu(src1, src0);
    dst0 = __lsx_vpickev_b(tmp0, tmp0);
    dst1 = __lsx_vpickod_b(tmp0, tmp0);
    __lsx_vstelm_d(dst0, dst_u, 0, 0);
    __lsx_vstelm_d(dst1, dst_v, 0, 0);
    src_uyvy += 32;
    src_uyvy_next += 32;
    dst_u += 8;
    dst_v += 8;
  }
}

void UYVYToUV422Row_LSX(const uint8_t* src_uyvy,
                        uint8_t* dst_u,
                        uint8_t* dst_v,
                        int width) {
  int x;
  int len = width / 16;
  __m128i src0, src1, tmp0, dst0, dst1;

  for (x = 0; x < len; x++) {
    DUP2_ARG2(__lsx_vld, src_uyvy, 0, src_uyvy, 16, src0, src1);
    tmp0 = __lsx_vpickev_b(src1, src0);
    dst0 = __lsx_vpickev_b(tmp0, tmp0);
    dst1 = __lsx_vpickod_b(tmp0, tmp0);
    __lsx_vstelm_d(dst0, dst_u, 0, 0);
    __lsx_vstelm_d(dst1, dst_v, 0, 0);
    src_uyvy += 32;
    dst_u += 8;
    dst_v += 8;
  }
}

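// Chroma conversion with the fixed-point weights used throughout libyuv
// (112, 74, 38 for U and 112, 94, 18 for V), biased by 0x8080 so the
// high byte of each 16-bit lane is the final sample:
//   U = (0x8080 + 112 * B - 74 * G - 38 * R) >> 8
//   V = (0x8080 + 112 * R - 94 * G - 18 * B) >> 8
// where B, G, R are 2x2 averages. The vector constants below hold half
// of each weight because the averaged inputs are still doubled.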
void ARGBToUVRow_LSX(const uint8_t* src_argb0,
                     int src_stride_argb,
                     uint8_t* dst_u,
                     uint8_t* dst_v,
                     int width) {
  int x;
  int len = width / 16;
  const uint8_t* src_argb1 = src_argb0 + src_stride_argb;

  __m128i src0, src1, src2, src3, src4, src5, src6, src7;
  __m128i vec0, vec1, vec2, vec3;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, dst0, dst1;
  __m128i const_0x70 = {0x0038003800380038, 0x0038003800380038};
  __m128i const_0x4A = {0x0025002500250025, 0x0025002500250025};
  __m128i const_0x26 = {0x0013001300130013, 0x0013001300130013};
  __m128i const_0x5E = {0x002f002f002f002f, 0x002f002f002f002f};
  __m128i const_0x12 = {0x0009000900090009, 0x0009000900090009};
  __m128i const_0x8080 = (__m128i)v2u64{0x8080808080808080, 0x8080808080808080};
  for (x = 0; x < len; x++) {
    DUP4_ARG2(__lsx_vld, src_argb0, 0, src_argb0, 16, src_argb0, 32, src_argb0,
              48, src0, src1, src2, src3);
    DUP4_ARG2(__lsx_vld, src_argb1, 0, src_argb1, 16, src_argb1, 32, src_argb1,
              48, src4, src5, src6, src7);
    vec0 = __lsx_vaddwev_h_bu(src0, src4);
    vec1 = __lsx_vaddwev_h_bu(src1, src5);
    vec2 = __lsx_vaddwev_h_bu(src2, src6);
    vec3 = __lsx_vaddwev_h_bu(src3, src7);
    tmp0 = __lsx_vpickev_h(vec1, vec0);
    tmp1 = __lsx_vpickev_h(vec3, vec2);
    tmp2 = __lsx_vpickod_h(vec1, vec0);
    tmp3 = __lsx_vpickod_h(vec3, vec2);
    vec0 = __lsx_vaddwod_h_bu(src0, src4);
    vec1 = __lsx_vaddwod_h_bu(src1, src5);
    vec2 = __lsx_vaddwod_h_bu(src2, src6);
    vec3 = __lsx_vaddwod_h_bu(src3, src7);
    tmp4 = __lsx_vpickev_h(vec1, vec0);
    tmp5 = __lsx_vpickev_h(vec3, vec2);
    vec0 = __lsx_vpickev_h(tmp1, tmp0);
    vec1 = __lsx_vpickod_h(tmp1, tmp0);
    src0 = __lsx_vavgr_h(vec0, vec1);
    vec0 = __lsx_vpickev_h(tmp3, tmp2);
    vec1 = __lsx_vpickod_h(tmp3, tmp2);
    src1 = __lsx_vavgr_h(vec0, vec1);
    vec0 = __lsx_vpickev_h(tmp5, tmp4);
    vec1 = __lsx_vpickod_h(tmp5, tmp4);
    src2 = __lsx_vavgr_h(vec0, vec1);
    dst0 = __lsx_vmadd_h(const_0x8080, src0, const_0x70);
    dst0 = __lsx_vmsub_h(dst0, src2, const_0x4A);
    dst0 = __lsx_vmsub_h(dst0, src1, const_0x26);
    dst1 = __lsx_vmadd_h(const_0x8080, src1, const_0x70);
    dst1 = __lsx_vmsub_h(dst1, src2, const_0x5E);
    dst1 = __lsx_vmsub_h(dst1, src0, const_0x12);
    dst0 = __lsx_vsrai_h(dst0, 8);
    dst1 = __lsx_vsrai_h(dst1, 8);
    dst0 = __lsx_vpickev_b(dst1, dst0);
    __lsx_vstelm_d(dst0, dst_u, 0, 0);
    __lsx_vstelm_d(dst0, dst_v, 0, 1);
    src_argb0 += 64;
    src_argb1 += 64;
    dst_u += 8;
    dst_v += 8;
  }
}

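// In the RGB24/RAW writers below, each __lsx_vst writes 16 bytes of
// which only the first 12 hold packed pixel data; the stores are placed
// 12 bytes apart so the 4 stale bytes are overwritten by the next store.
// The final group of 16 pixels is peeled out of the loop.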
void ARGBToRGB24Row_LSX(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
  int x;
  int len = (width / 16) - 1;
  __m128i src0, src1, src2, src3;
  __m128i tmp0, tmp1, tmp2, tmp3;
  __m128i shuf = {0x0908060504020100, 0x000000000E0D0C0A};
  for (x = 0; x < len; x++) {
    DUP4_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src_argb, 32, src_argb, 48,
              src0, src1, src2, src3);
    tmp0 = __lsx_vshuf_b(src0, src0, shuf);
    tmp1 = __lsx_vshuf_b(src1, src1, shuf);
    tmp2 = __lsx_vshuf_b(src2, src2, shuf);
    tmp3 = __lsx_vshuf_b(src3, src3, shuf);
    __lsx_vst(tmp0, dst_rgb, 0);
    __lsx_vst(tmp1, dst_rgb, 12);
    __lsx_vst(tmp2, dst_rgb, 24);
    __lsx_vst(tmp3, dst_rgb, 36);
    dst_rgb += 48;
    src_argb += 64;
  }
  DUP4_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src_argb, 32, src_argb, 48,
            src0, src1, src2, src3);
  tmp0 = __lsx_vshuf_b(src0, src0, shuf);
  tmp1 = __lsx_vshuf_b(src1, src1, shuf);
  tmp2 = __lsx_vshuf_b(src2, src2, shuf);
  tmp3 = __lsx_vshuf_b(src3, src3, shuf);
  __lsx_vst(tmp0, dst_rgb, 0);
  __lsx_vst(tmp1, dst_rgb, 12);
  __lsx_vst(tmp2, dst_rgb, 24);
  dst_rgb += 36;
  __lsx_vst(tmp3, dst_rgb, 0);
}

void ARGBToRAWRow_LSX(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
  int x;
  int len = (width / 16) - 1;
  __m128i src0, src1, src2, src3;
  __m128i tmp0, tmp1, tmp2, tmp3;
  __m128i shuf = {0x090A040506000102, 0x000000000C0D0E08};
  for (x = 0; x < len; x++) {
    DUP4_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src_argb, 32, src_argb, 48,
              src0, src1, src2, src3);
    tmp0 = __lsx_vshuf_b(src0, src0, shuf);
    tmp1 = __lsx_vshuf_b(src1, src1, shuf);
    tmp2 = __lsx_vshuf_b(src2, src2, shuf);
    tmp3 = __lsx_vshuf_b(src3, src3, shuf);
    __lsx_vst(tmp0, dst_rgb, 0);
    __lsx_vst(tmp1, dst_rgb, 12);
    __lsx_vst(tmp2, dst_rgb, 24);
    __lsx_vst(tmp3, dst_rgb, 36);
    dst_rgb += 48;
    src_argb += 64;
  }
  DUP4_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src_argb, 32, src_argb, 48,
            src0, src1, src2, src3);
  tmp0 = __lsx_vshuf_b(src0, src0, shuf);
  tmp1 = __lsx_vshuf_b(src1, src1, shuf);
  tmp2 = __lsx_vshuf_b(src2, src2, shuf);
  tmp3 = __lsx_vshuf_b(src3, src3, shuf);
  __lsx_vst(tmp0, dst_rgb, 0);
  __lsx_vst(tmp1, dst_rgb, 12);
  __lsx_vst(tmp2, dst_rgb, 24);
  dst_rgb += 36;
  __lsx_vst(tmp3, dst_rgb, 0);
}

void ARGBToRGB565Row_LSX(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
  int x;
  int len = width / 8;
  __m128i zero = __lsx_vldi(0);
  __m128i src0, src1, tmp0, tmp1, dst0;
  __m128i shift = {0x0300030003000300, 0x0300030003000300};

  for (x = 0; x < len; x++) {
    DUP2_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src0, src1);
    tmp0 = __lsx_vpickev_b(src1, src0);
    tmp1 = __lsx_vpickod_b(src1, src0);
    tmp0 = __lsx_vsrli_b(tmp0, 3);
    tmp1 = __lsx_vpackev_b(zero, tmp1);
    tmp1 = __lsx_vsrli_h(tmp1, 2);
    tmp0 = __lsx_vsll_b(tmp0, shift);
    tmp1 = __lsx_vslli_h(tmp1, 5);
    dst0 = __lsx_vor_v(tmp0, tmp1);
    __lsx_vst(dst0, dst_rgb, 0);
    dst_rgb += 16;
    src_argb += 32;
  }
}

void ARGBToARGB1555Row_LSX(const uint8_t* src_argb,
                           uint8_t* dst_rgb,
                           int width) {
  int x;
  int len = width / 8;
  __m128i zero = __lsx_vldi(0);
  __m128i src0, src1, tmp0, tmp1, tmp2, tmp3, dst0;
  __m128i shift1 = {0x0703070307030703, 0x0703070307030703};
  __m128i shift2 = {0x0200020002000200, 0x0200020002000200};

  for (x = 0; x < len; x++) {
    DUP2_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src0, src1);
    tmp0 = __lsx_vpickev_b(src1, src0);
    tmp1 = __lsx_vpickod_b(src1, src0);
    tmp0 = __lsx_vsrli_b(tmp0, 3);
    tmp1 = __lsx_vsrl_b(tmp1, shift1);
    tmp0 = __lsx_vsll_b(tmp0, shift2);
    tmp2 = __lsx_vpackev_b(zero, tmp1);
    tmp3 = __lsx_vpackod_b(zero, tmp1);
    tmp2 = __lsx_vslli_h(tmp2, 5);
    tmp3 = __lsx_vslli_h(tmp3, 15);
    dst0 = __lsx_vor_v(tmp0, tmp2);
    dst0 = __lsx_vor_v(dst0, tmp3);
    __lsx_vst(dst0, dst_rgb, 0);
    dst_rgb += 16;
    src_argb += 32;
  }
}

void ARGBToARGB4444Row_LSX(const uint8_t* src_argb,
                           uint8_t* dst_rgb,
                           int width) {
  int x;
  int len = width / 8;
  __m128i src0, src1, tmp0, tmp1, dst0;

  for (x = 0; x < len; x++) {
    DUP2_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src0, src1);
    tmp0 = __lsx_vpickev_b(src1, src0);
    tmp1 = __lsx_vpickod_b(src1, src0);
    tmp1 = __lsx_vandi_b(tmp1, 0xF0);
    tmp0 = __lsx_vsrli_b(tmp0, 4);
    dst0 = __lsx_vor_v(tmp1, tmp0);
    __lsx_vst(dst0, dst_rgb, 0);
    dst_rgb += 16;
    src_argb += 32;
  }
}

void ARGBToUV444Row_LSX(const uint8_t* src_argb,
                        uint8_t* dst_u,
                        uint8_t* dst_v,
                        int32_t width) {
  int x;
  int len = width / 16;
  __m128i src0, src1, src2, src3;
  __m128i tmp0, tmp1, tmp2, tmp3;
  __m128i reg0, reg1, reg2, reg3, dst0, dst1;
  __m128i const_112 = __lsx_vldi(112);
  __m128i const_74 = __lsx_vldi(74);
  __m128i const_38 = __lsx_vldi(38);
  __m128i const_94 = __lsx_vldi(94);
  __m128i const_18 = __lsx_vldi(18);
  __m128i const_0x8080 = (__m128i)v2u64{0x8080808080808080, 0x8080808080808080};
  for (x = 0; x < len; x++) {
    DUP4_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src_argb, 32, src_argb, 48,
              src0, src1, src2, src3);
    tmp0 = __lsx_vpickev_h(src1, src0);
    tmp1 = __lsx_vpickod_h(src1, src0);
    tmp2 = __lsx_vpickev_h(src3, src2);
    tmp3 = __lsx_vpickod_h(src3, src2);
    reg0 = __lsx_vmaddwev_h_bu(const_0x8080, tmp0, const_112);
    reg1 = __lsx_vmaddwev_h_bu(const_0x8080, tmp2, const_112);
    reg2 = __lsx_vmulwod_h_bu(tmp0, const_74);
    reg3 = __lsx_vmulwod_h_bu(tmp2, const_74);
    reg2 = __lsx_vmaddwev_h_bu(reg2, tmp1, const_38);
    reg3 = __lsx_vmaddwev_h_bu(reg3, tmp3, const_38);
    reg0 = __lsx_vsub_h(reg0, reg2);
    reg1 = __lsx_vsub_h(reg1, reg3);
    reg0 = __lsx_vsrai_h(reg0, 8);
    reg1 = __lsx_vsrai_h(reg1, 8);
    dst0 = __lsx_vpickev_b(reg1, reg0);

    reg0 = __lsx_vmaddwev_h_bu(const_0x8080, tmp1, const_112);
    reg1 = __lsx_vmaddwev_h_bu(const_0x8080, tmp3, const_112);
    reg2 = __lsx_vmulwev_h_bu(tmp0, const_18);
    reg3 = __lsx_vmulwev_h_bu(tmp2, const_18);
    reg2 = __lsx_vmaddwod_h_bu(reg2, tmp0, const_94);
    reg3 = __lsx_vmaddwod_h_bu(reg3, tmp2, const_94);
    reg0 = __lsx_vsub_h(reg0, reg2);
    reg1 = __lsx_vsub_h(reg1, reg3);
    reg0 = __lsx_vsrai_h(reg0, 8);
    reg1 = __lsx_vsrai_h(reg1, 8);
    dst1 = __lsx_vpickev_b(reg1, reg0);

    __lsx_vst(dst0, dst_u, 0);
    __lsx_vst(dst1, dst_v, 0);
    dst_u += 16;
    dst_v += 16;
    src_argb += 64;
  }
}

void ARGBMultiplyRow_LSX(const uint8_t* src_argb0,
                         const uint8_t* src_argb1,
                         uint8_t* dst_argb,
                         int width) {
  int x;
  int len = width / 4;
  __m128i zero = __lsx_vldi(0);
  __m128i src0, src1, dst0, dst1;
  __m128i tmp0, tmp1, tmp2, tmp3;

  for (x = 0; x < len; x++) {
    DUP2_ARG2(__lsx_vld, src_argb0, 0, src_argb1, 0, src0, src1);
    tmp0 = __lsx_vilvl_b(src0, src0);
    tmp1 = __lsx_vilvh_b(src0, src0);
    tmp2 = __lsx_vilvl_b(zero, src1);
    tmp3 = __lsx_vilvh_b(zero, src1);
    dst0 = __lsx_vmuh_hu(tmp0, tmp2);
    dst1 = __lsx_vmuh_hu(tmp1, tmp3);
    dst0 = __lsx_vpickev_b(dst1, dst0);
    __lsx_vst(dst0, dst_argb, 0);
    src_argb0 += 16;
    src_argb1 += 16;
    dst_argb += 16;
  }
}

void ARGBAddRow_LSX(const uint8_t* src_argb0,
                    const uint8_t* src_argb1,
                    uint8_t* dst_argb,
                    int width) {
  int x;
  int len = width / 4;
  __m128i src0, src1, dst0;

  for (x = 0; x < len; x++) {
    DUP2_ARG2(__lsx_vld, src_argb0, 0, src_argb1, 0, src0, src1);
    dst0 = __lsx_vsadd_bu(src0, src1);
    __lsx_vst(dst0, dst_argb, 0);
    src_argb0 += 16;
    src_argb1 += 16;
    dst_argb += 16;
  }
}

void ARGBSubtractRow_LSX(const uint8_t* src_argb0,
                         const uint8_t* src_argb1,
                         uint8_t* dst_argb,
                         int width) {
  int x;
  int len = width / 4;
  __m128i src0, src1, dst0;

  for (x = 0; x < len; x++) {
    DUP2_ARG2(__lsx_vld, src_argb0, 0, src_argb1, 0, src0, src1);
    dst0 = __lsx_vssub_bu(src0, src1);
    __lsx_vst(dst0, dst_argb, 0);
    src_argb0 += 16;
    src_argb1 += 16;
    dst_argb += 16;
  }
}

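// The pack steps below duplicate each channel byte into both halves of a
// 16-bit lane, i.e. c becomes c * 0x101. Multiplying two such lanes and
// keeping the top half, (c * 0x101 * a * 0x101) >> 24, approximates the
// premultiply c * a / 255.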
void ARGBAttenuateRow_LSX(const uint8_t* src_argb,
                          uint8_t* dst_argb,
                          int width) {
  int x;
  int len = width / 8;
  __m128i src0, src1, tmp0, tmp1;
  __m128i reg0, reg1, reg2, reg3, reg4, reg5;
  __m128i b, g, r, a, dst0, dst1;
  __m128i control = {0x0005000100040000, 0x0007000300060002};

  for (x = 0; x < len; x++) {
    DUP2_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src0, src1);
    tmp0 = __lsx_vpickev_b(src1, src0);
    tmp1 = __lsx_vpickod_b(src1, src0);
    b = __lsx_vpackev_b(tmp0, tmp0);
    r = __lsx_vpackod_b(tmp0, tmp0);
    g = __lsx_vpackev_b(tmp1, tmp1);
    a = __lsx_vpackod_b(tmp1, tmp1);
    reg0 = __lsx_vmulwev_w_hu(b, a);
    reg1 = __lsx_vmulwod_w_hu(b, a);
    reg2 = __lsx_vmulwev_w_hu(r, a);
    reg3 = __lsx_vmulwod_w_hu(r, a);
    reg4 = __lsx_vmulwev_w_hu(g, a);
    reg5 = __lsx_vmulwod_w_hu(g, a);
    reg0 = __lsx_vssrani_h_w(reg1, reg0, 24);
    reg2 = __lsx_vssrani_h_w(reg3, reg2, 24);
    reg4 = __lsx_vssrani_h_w(reg5, reg4, 24);
    reg0 = __lsx_vshuf_h(control, reg0, reg0);
    reg2 = __lsx_vshuf_h(control, reg2, reg2);
    reg4 = __lsx_vshuf_h(control, reg4, reg4);
    tmp0 = __lsx_vpackev_b(reg4, reg0);
    tmp1 = __lsx_vpackev_b(a, reg2);
    dst0 = __lsx_vilvl_h(tmp1, tmp0);
    dst1 = __lsx_vilvh_h(tmp1, tmp0);
    __lsx_vst(dst0, dst_argb, 0);
    __lsx_vst(dst1, dst_argb, 16);
    dst_argb += 32;
    src_argb += 32;
  }
}

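// Adds the four dither bytes of dither4 (repeating every four pixels) to
// B, G and R before the 5/6/5 truncation; __lsx_vclip255_h keeps the
// biased values from overflowing 8 bits.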
void ARGBToRGB565DitherRow_LSX(const uint8_t* src_argb,
                               uint8_t* dst_rgb,
                               uint32_t dither4,
                               int width) {
  int x;
  int len = width / 8;
  __m128i src0, src1, tmp0, tmp1, dst0;
  __m128i b, g, r;
  __m128i zero = __lsx_vldi(0);
  __m128i vec_dither = __lsx_vldrepl_w(&dither4, 0);

  vec_dither = __lsx_vilvl_b(zero, vec_dither);
  for (x = 0; x < len; x++) {
    DUP2_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src0, src1);
    tmp0 = __lsx_vpickev_b(src1, src0);
    tmp1 = __lsx_vpickod_b(src1, src0);
    b = __lsx_vpackev_b(zero, tmp0);
    r = __lsx_vpackod_b(zero, tmp0);
    g = __lsx_vpackev_b(zero, tmp1);
    b = __lsx_vadd_h(b, vec_dither);
    g = __lsx_vadd_h(g, vec_dither);
    r = __lsx_vadd_h(r, vec_dither);
    DUP2_ARG1(__lsx_vclip255_h, b, g, b, g);
    r = __lsx_vclip255_h(r);
    b = __lsx_vsrai_h(b, 3);
    g = __lsx_vsrai_h(g, 2);
    r = __lsx_vsrai_h(r, 3);
    g = __lsx_vslli_h(g, 5);
    r = __lsx_vslli_h(r, 11);
    dst0 = __lsx_vor_v(b, g);
    dst0 = __lsx_vor_v(dst0, r);
    __lsx_vst(dst0, dst_rgb, 0);
    src_argb += 32;
    dst_rgb += 16;
  }
}

void ARGBShuffleRow_LSX(const uint8_t* src_argb,
                        uint8_t* dst_argb,
                        const uint8_t* shuffler,
                        int width) {
  int x;
  int len = width / 8;
  __m128i src0, src1, dst0, dst1;
  __m128i shuf = {0x0404040400000000, 0x0C0C0C0C08080808};
  __m128i temp = __lsx_vldrepl_w(shuffler, 0);

  shuf = __lsx_vadd_b(shuf, temp);
  for (x = 0; x < len; x++) {
    DUP2_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src0, src1);
    dst0 = __lsx_vshuf_b(src0, src0, shuf);
    dst1 = __lsx_vshuf_b(src1, src1, shuf);
    __lsx_vst(dst0, dst_argb, 0);
    __lsx_vst(dst1, dst_argb, 16);
    src_argb += 32;
    dst_argb += 32;
  }
}

void ARGBShadeRow_LSX(const uint8_t* src_argb,
                      uint8_t* dst_argb,
                      int width,
                      uint32_t value) {
  int x;
  int len = width / 4;
  __m128i src0, dst0, tmp0, tmp1;
  __m128i vec_value = __lsx_vreplgr2vr_w(value);

  vec_value = __lsx_vilvl_b(vec_value, vec_value);
  for (x = 0; x < len; x++) {
    src0 = __lsx_vld(src_argb, 0);
    tmp0 = __lsx_vilvl_b(src0, src0);
    tmp1 = __lsx_vilvh_b(src0, src0);
    tmp0 = __lsx_vmuh_hu(tmp0, vec_value);
    tmp1 = __lsx_vmuh_hu(tmp1, vec_value);
    dst0 = __lsx_vpickod_b(tmp1, tmp0);
    __lsx_vst(dst0, dst_argb, 0);
    src_argb += 16;
    dst_argb += 16;
  }
}

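// Luma is computed as (77 * R + 150 * G + 29 * B + 128) >> 8 and
// replicated into the B, G and R lanes; alpha passes through unchanged.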
ARGBGrayRow_LSX(const uint8_t * src_argb,uint8_t * dst_argb,int width)1219 void ARGBGrayRow_LSX(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
1220   int x;
1221   int len = width / 8;
1222   __m128i src0, src1, tmp0, tmp1;
1223   __m128i reg0, reg1, reg2, dst0, dst1;
1224   __m128i const_128 = __lsx_vldi(0x480);
1225   __m128i const_150 = __lsx_vldi(0x96);
1226   __m128i const_br = {0x4D1D4D1D4D1D4D1D, 0x4D1D4D1D4D1D4D1D};
1227 
1228   for (x = 0; x < len; x++) {
1229     DUP2_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src0, src1);
1230     tmp0 = __lsx_vpickev_b(src1, src0);
1231     tmp1 = __lsx_vpickod_b(src1, src0);
1232     reg0 = __lsx_vdp2_h_bu(tmp0, const_br);
1233     reg1 = __lsx_vmaddwev_h_bu(const_128, tmp1, const_150);
1234     reg2 = __lsx_vadd_h(reg0, reg1);
1235     tmp0 = __lsx_vpackod_b(reg2, reg2);
1236     tmp1 = __lsx_vpackod_b(tmp1, reg2);
1237     dst0 = __lsx_vilvl_h(tmp1, tmp0);
1238     dst1 = __lsx_vilvh_h(tmp1, tmp0);
1239     __lsx_vst(dst0, dst_argb, 0);
1240     __lsx_vst(dst1, dst_argb, 16);
1241     src_argb += 32;
1242     dst_argb += 32;
1243   }
1244 }
1245 
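// ARGBGrayRow_LSX above computes luma with the fixed-point weights in
// const_br (0x1D = 29 for B, 0x4D = 77 for R) and const_150 for G:
// y = (29*B + 150*G + 77*R + 128) >> 8, then broadcasts y to B, G and R
// while passing alpha through. Scalar sketch (hypothetical helper):
static inline uint8_t GrayByte_Sketch(uint8_t b, uint8_t g, uint8_t r) {
  return (uint8_t)((29 * b + 150 * g + 77 * r + 128) >> 8);
}
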
1246 void ARGBSepiaRow_LSX(uint8_t* dst_argb, int width) {
1247   int x;
1248   int len = width / 8;
1249   __m128i src0, src1, tmp0, tmp1;
1250   __m128i reg0, reg1, spb, spg, spr;
1251   __m128i dst0, dst1;
1252   __m128i spb_g = __lsx_vldi(68);
1253   __m128i spg_g = __lsx_vldi(88);
1254   __m128i spr_g = __lsx_vldi(98);
1255   __m128i spb_br = {0x2311231123112311, 0x2311231123112311};
1256   __m128i spg_br = {0x2D162D162D162D16, 0x2D162D162D162D16};
1257   __m128i spr_br = {0x3218321832183218, 0x3218321832183218};
1258   __m128i shuff = {0x1706150413021100, 0x1F0E1D0C1B0A1908};
1259 
1260   for (x = 0; x < len; x++) {
1261     DUP2_ARG2(__lsx_vld, dst_argb, 0, dst_argb, 16, src0, src1);
1262     tmp0 = __lsx_vpickev_b(src1, src0);
1263     tmp1 = __lsx_vpickod_b(src1, src0);
1264     DUP2_ARG2(__lsx_vdp2_h_bu, tmp0, spb_br, tmp0, spg_br, spb, spg);
1265     spr = __lsx_vdp2_h_bu(tmp0, spr_br);
1266     spb = __lsx_vmaddwev_h_bu(spb, tmp1, spb_g);
1267     spg = __lsx_vmaddwev_h_bu(spg, tmp1, spg_g);
1268     spr = __lsx_vmaddwev_h_bu(spr, tmp1, spr_g);
1269     spb = __lsx_vsrli_h(spb, 7);
1270     spg = __lsx_vsrli_h(spg, 7);
1271     spr = __lsx_vsrli_h(spr, 7);
1272     spg = __lsx_vsat_hu(spg, 7);
1273     spr = __lsx_vsat_hu(spr, 7);
1274     reg0 = __lsx_vpackev_b(spg, spb);
1275     reg1 = __lsx_vshuf_b(tmp1, spr, shuff);
1276     dst0 = __lsx_vilvl_h(reg1, reg0);
1277     dst1 = __lsx_vilvh_h(reg1, reg0);
1278     __lsx_vst(dst0, dst_argb, 0);
1279     __lsx_vst(dst1, dst_argb, 16);
1280     dst_argb += 32;
1281   }
1282 }
1283 
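// ARGBSepiaRow_LSX above applies the sepia weights
//   b' = (17*B + 68*G + 35*R) >> 7
//   g' = (22*B + 88*G + 45*R) >> 7   (saturated to 255)
//   r' = (24*B + 98*G + 50*R) >> 7   (saturated to 255)
// while alpha passes through unchanged. The blue result never exceeds 255
// (its weights sum to 120), so only the green and red sums need saturation.
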
1284 void ARGB4444ToARGBRow_LSX(const uint8_t* src_argb4444,
1285                            uint8_t* dst_argb,
1286                            int width) {
1287   int x;
1288   int len = width / 16;
1289   __m128i src0, src1;
1290   __m128i tmp0, tmp1, tmp2, tmp3;
1291   __m128i reg0, reg1, reg2, reg3;
1292   __m128i dst0, dst1, dst2, dst3;
1293 
1294   for (x = 0; x < len; x++) {
1295     src0 = __lsx_vld(src_argb4444, 0);
1296     src1 = __lsx_vld(src_argb4444, 16);
1297     tmp0 = __lsx_vandi_b(src0, 0x0F);
1298     tmp1 = __lsx_vandi_b(src0, 0xF0);
1299     tmp2 = __lsx_vandi_b(src1, 0x0F);
1300     tmp3 = __lsx_vandi_b(src1, 0xF0);
1301     reg0 = __lsx_vslli_b(tmp0, 4);
1302     reg2 = __lsx_vslli_b(tmp2, 4);
1303     reg1 = __lsx_vsrli_b(tmp1, 4);
1304     reg3 = __lsx_vsrli_b(tmp3, 4);
1305     DUP4_ARG2(__lsx_vor_v, tmp0, reg0, tmp1, reg1, tmp2, reg2, tmp3, reg3, tmp0,
1306               tmp1, tmp2, tmp3);
1307     dst0 = __lsx_vilvl_b(tmp1, tmp0);
1308     dst2 = __lsx_vilvl_b(tmp3, tmp2);
1309     dst1 = __lsx_vilvh_b(tmp1, tmp0);
1310     dst3 = __lsx_vilvh_b(tmp3, tmp2);
1311     __lsx_vst(dst0, dst_argb, 0);
1312     __lsx_vst(dst1, dst_argb, 16);
1313     __lsx_vst(dst2, dst_argb, 32);
1314     __lsx_vst(dst3, dst_argb, 48);
1315     dst_argb += 64;
1316     src_argb4444 += 32;
1317   }
1318 }
1319 
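// ARGB4444ToARGBRow_LSX above expands each 4-bit channel to 8 bits by
// replicating the nibble into both halves of the byte, v8 = (v4 << 4) | v4,
// so 0x0 maps to 0x00 and 0xF maps to 0xFF exactly.
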
1320 void ARGB1555ToARGBRow_LSX(const uint8_t* src_argb1555,
1321                            uint8_t* dst_argb,
1322                            int width) {
1323   int x;
1324   int len = width / 16;
1325   __m128i src0, src1;
1326   __m128i tmp0, tmp1, tmpb, tmpg, tmpr, tmpa;
1327   __m128i reg0, reg1, reg2;
1328   __m128i dst0, dst1, dst2, dst3;
1329 
1330   for (x = 0; x < len; x++) {
1331     src0 = __lsx_vld(src_argb1555, 0);
1332     src1 = __lsx_vld(src_argb1555, 16);
1333     tmp0 = __lsx_vpickev_b(src1, src0);
1334     tmp1 = __lsx_vpickod_b(src1, src0);
1335     tmpb = __lsx_vandi_b(tmp0, 0x1F);
1336     tmpg = __lsx_vsrli_b(tmp0, 5);
1337     reg0 = __lsx_vandi_b(tmp1, 0x03);
1338     reg0 = __lsx_vslli_b(reg0, 3);
1339     tmpg = __lsx_vor_v(tmpg, reg0);
1340     reg1 = __lsx_vandi_b(tmp1, 0x7C);
1341     tmpr = __lsx_vsrli_b(reg1, 2);
1342     tmpa = __lsx_vsrli_b(tmp1, 7);
1343     tmpa = __lsx_vneg_b(tmpa);
1344     reg0 = __lsx_vslli_b(tmpb, 3);
1345     reg1 = __lsx_vslli_b(tmpg, 3);
1346     reg2 = __lsx_vslli_b(tmpr, 3);
1347     tmpb = __lsx_vsrli_b(tmpb, 2);
1348     tmpg = __lsx_vsrli_b(tmpg, 2);
1349     tmpr = __lsx_vsrli_b(tmpr, 2);
1350     tmpb = __lsx_vor_v(reg0, tmpb);
1351     tmpg = __lsx_vor_v(reg1, tmpg);
1352     tmpr = __lsx_vor_v(reg2, tmpr);
1353     DUP2_ARG2(__lsx_vilvl_b, tmpg, tmpb, tmpa, tmpr, reg0, reg1);
1354     dst0 = __lsx_vilvl_h(reg1, reg0);
1355     dst1 = __lsx_vilvh_h(reg1, reg0);
1356     DUP2_ARG2(__lsx_vilvh_b, tmpg, tmpb, tmpa, tmpr, reg0, reg1);
1357     dst2 = __lsx_vilvl_h(reg1, reg0);
1358     dst3 = __lsx_vilvh_h(reg1, reg0);
1359     __lsx_vst(dst0, dst_argb, 0);
1360     __lsx_vst(dst1, dst_argb, 16);
1361     __lsx_vst(dst2, dst_argb, 32);
1362     __lsx_vst(dst3, dst_argb, 48);
1363     dst_argb += 64;
1364     src_argb1555 += 32;
1365   }
1366 }
1367 
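// ARGB1555ToARGBRow_LSX above widens the 5-bit channels with
// v8 = (v5 << 3) | (v5 >> 2), which maps 0 to 0 and 31 to 255, and turns the
// single alpha bit into 0x00 or 0xFF by negating a 0/1 byte (__lsx_vneg_b).
// Scalar sketch of one pixel (hypothetical helper):
static inline void ARGB1555Pixel_Sketch(uint16_t p,
                                        uint8_t* b8,
                                        uint8_t* g8,
                                        uint8_t* r8,
                                        uint8_t* a8) {
  uint8_t b = p & 0x1F;
  uint8_t g = (p >> 5) & 0x1F;
  uint8_t r = (p >> 10) & 0x1F;
  *b8 = (uint8_t)((b << 3) | (b >> 2));
  *g8 = (uint8_t)((g << 3) | (g >> 2));
  *r8 = (uint8_t)((r << 3) | (r >> 2));
  *a8 = (p & 0x8000) ? 0xFF : 0x00;
}
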
1368 void RGB565ToARGBRow_LSX(const uint8_t* src_rgb565,
1369                          uint8_t* dst_argb,
1370                          int width) {
1371   int x;
1372   int len = width / 16;
1373   __m128i src0, src1;
1374   __m128i tmp0, tmp1, tmpb, tmpg, tmpr;
1375   __m128i reg0, reg1, dst0, dst1, dst2, dst3;
1376   __m128i alpha = __lsx_vldi(0xFF);
1377 
1378   for (x = 0; x < len; x++) {
1379     src0 = __lsx_vld(src_rgb565, 0);
1380     src1 = __lsx_vld(src_rgb565, 16);
1381     tmp0 = __lsx_vpickev_b(src1, src0);
1382     tmp1 = __lsx_vpickod_b(src1, src0);
1383     tmpb = __lsx_vandi_b(tmp0, 0x1F);
1384     tmpr = __lsx_vandi_b(tmp1, 0xF8);
1385     reg1 = __lsx_vandi_b(tmp1, 0x07);
1386     reg0 = __lsx_vsrli_b(tmp0, 5);
1387     reg1 = __lsx_vslli_b(reg1, 3);
1388     tmpg = __lsx_vor_v(reg1, reg0);
1389     reg0 = __lsx_vslli_b(tmpb, 3);
1390     reg1 = __lsx_vsrli_b(tmpb, 2);
1391     tmpb = __lsx_vor_v(reg1, reg0);
1392     reg0 = __lsx_vslli_b(tmpg, 2);
1393     reg1 = __lsx_vsrli_b(tmpg, 4);
1394     tmpg = __lsx_vor_v(reg1, reg0);
1395     reg0 = __lsx_vsrli_b(tmpr, 5);
1396     tmpr = __lsx_vor_v(tmpr, reg0);
1397     DUP2_ARG2(__lsx_vilvl_b, tmpg, tmpb, alpha, tmpr, reg0, reg1);
1398     dst0 = __lsx_vilvl_h(reg1, reg0);
1399     dst1 = __lsx_vilvh_h(reg1, reg0);
1400     DUP2_ARG2(__lsx_vilvh_b, tmpg, tmpb, alpha, tmpr, reg0, reg1);
1401     dst2 = __lsx_vilvl_h(reg1, reg0);
1402     dst3 = __lsx_vilvh_h(reg1, reg0);
1403     __lsx_vst(dst0, dst_argb, 0);
1404     __lsx_vst(dst1, dst_argb, 16);
1405     __lsx_vst(dst2, dst_argb, 32);
1406     __lsx_vst(dst3, dst_argb, 48);
1407     dst_argb += 64;
1408     src_rgb565 += 32;
1409   }
1410 }
1411 
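// RGB565ToARGBRow_LSX above uses the analogous expansions:
// v8 = (v5 << 3) | (v5 >> 2) for blue and red, v8 = (v6 << 2) | (v6 >> 4)
// for the 6-bit green channel, and stores a constant 0xFF alpha.
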
1412 void RGB24ToARGBRow_LSX(const uint8_t* src_rgb24,
1413                         uint8_t* dst_argb,
1414                         int width) {
1415   int x;
1416   int len = width / 16;
1417   __m128i src0, src1, src2;
1418   __m128i tmp0, tmp1, tmp2;
1419   __m128i dst0, dst1, dst2, dst3;
1420   __m128i alpha = __lsx_vldi(0xFF);
1421   __m128i shuf0 = {0x131211100F0E0D0C, 0x1B1A191817161514};
1422   __m128i shuf1 = {0x1F1E1D1C1B1A1918, 0x0706050403020100};
1423   __m128i shuf2 = {0x0B0A090807060504, 0x131211100F0E0D0C};
1424   __m128i shuf3 = {0x1005040310020100, 0x100B0A0910080706};
1425 
1426   for (x = 0; x < len; x++) {
1427     src0 = __lsx_vld(src_rgb24, 0);
1428     src1 = __lsx_vld(src_rgb24, 16);
1429     src2 = __lsx_vld(src_rgb24, 32);
1430     DUP2_ARG3(__lsx_vshuf_b, src1, src0, shuf0, src1, src2, shuf1, tmp0, tmp1);
1431     tmp2 = __lsx_vshuf_b(src1, src2, shuf2);
1432     DUP4_ARG3(__lsx_vshuf_b, alpha, src0, shuf3, alpha, tmp0, shuf3, alpha,
1433               tmp1, shuf3, alpha, tmp2, shuf3, dst0, dst1, dst2, dst3);
1434     __lsx_vst(dst0, dst_argb, 0);
1435     __lsx_vst(dst1, dst_argb, 16);
1436     __lsx_vst(dst2, dst_argb, 32);
1437     __lsx_vst(dst3, dst_argb, 48);
1438     dst_argb += 64;
1439     src_rgb24 += 48;
1440   }
1441 }
1442 
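// In __lsx_vshuf_b(a, b, shuf), control bytes 0..15 select from b and 16..31
// select from a, so the 0x10 entries in shuf3 above pull bytes from the
// all-0xFF alpha vector. RGB24ToARGBRow_LSX first regroups the three 16-byte
// loads into four 12-byte RGB runs, then shuf3 turns each run into
// B,G,R,0xFF quads.
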
1443 void RAWToARGBRow_LSX(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
1444   int x;
1445   int len = width / 16;
1446   __m128i src0, src1, src2;
1447   __m128i tmp0, tmp1, tmp2;
1448   __m128i dst0, dst1, dst2, dst3;
1449   __m128i alpha = __lsx_vldi(0xFF);
1450   __m128i shuf0 = {0x131211100F0E0D0C, 0x1B1A191817161514};
1451   __m128i shuf1 = {0x1F1E1D1C1B1A1918, 0x0706050403020100};
1452   __m128i shuf2 = {0x0B0A090807060504, 0x131211100F0E0D0C};
1453   __m128i shuf3 = {0x1003040510000102, 0x10090A0B10060708};
1454 
1455   for (x = 0; x < len; x++) {
1456     src0 = __lsx_vld(src_raw, 0);
1457     src1 = __lsx_vld(src_raw, 16);
1458     src2 = __lsx_vld(src_raw, 32);
1459     DUP2_ARG3(__lsx_vshuf_b, src1, src0, shuf0, src1, src2, shuf1, tmp0, tmp1);
1460     tmp2 = __lsx_vshuf_b(src1, src2, shuf2);
1461     DUP4_ARG3(__lsx_vshuf_b, alpha, src0, shuf3, alpha, tmp0, shuf3, alpha,
1462               tmp1, shuf3, alpha, tmp2, shuf3, dst0, dst1, dst2, dst3);
1463     __lsx_vst(dst0, dst_argb, 0);
1464     __lsx_vst(dst1, dst_argb, 16);
1465     __lsx_vst(dst2, dst_argb, 32);
1466     __lsx_vst(dst3, dst_argb, 48);
1467     dst_argb += 64;
1468     src_raw += 48;
1469   }
1470 }
1471 
1472 void ARGB1555ToYRow_LSX(const uint8_t* src_argb1555,
1473                         uint8_t* dst_y,
1474                         int width) {
1475   int x;
1476   int len = width / 16;
1477   __m128i src0, src1;
1478   __m128i tmp0, tmp1, tmpb, tmpg, tmpr;
1479   __m128i reg0, reg1, reg2, dst0;
1480   __m128i const_66 = __lsx_vldi(66);
1481   __m128i const_129 = __lsx_vldi(129);
1482   __m128i const_25 = __lsx_vldi(25);
1483   __m128i const_1080 = {0x1080108010801080, 0x1080108010801080};
1484 
1485   for (x = 0; x < len; x++) {
1486     src0 = __lsx_vld(src_argb1555, 0);
1487     src1 = __lsx_vld(src_argb1555, 16);
1488     tmp0 = __lsx_vpickev_b(src1, src0);
1489     tmp1 = __lsx_vpickod_b(src1, src0);
1490     tmpb = __lsx_vandi_b(tmp0, 0x1F);
1491     tmpg = __lsx_vsrli_b(tmp0, 5);
1492     reg0 = __lsx_vandi_b(tmp1, 0x03);
1493     reg0 = __lsx_vslli_b(reg0, 3);
1494     tmpg = __lsx_vor_v(tmpg, reg0);
1495     reg1 = __lsx_vandi_b(tmp1, 0x7C);
1496     tmpr = __lsx_vsrli_b(reg1, 2);
1497     reg0 = __lsx_vslli_b(tmpb, 3);
1498     reg1 = __lsx_vslli_b(tmpg, 3);
1499     reg2 = __lsx_vslli_b(tmpr, 3);
1500     tmpb = __lsx_vsrli_b(tmpb, 2);
1501     tmpg = __lsx_vsrli_b(tmpg, 2);
1502     tmpr = __lsx_vsrli_b(tmpr, 2);
1503     tmpb = __lsx_vor_v(reg0, tmpb);
1504     tmpg = __lsx_vor_v(reg1, tmpg);
1505     tmpr = __lsx_vor_v(reg2, tmpr);
1506     reg0 = __lsx_vmaddwev_h_bu(const_1080, tmpb, const_25);
1507     reg1 = __lsx_vmaddwod_h_bu(const_1080, tmpb, const_25);
1508     reg0 = __lsx_vmaddwev_h_bu(reg0, tmpg, const_129);
1509     reg1 = __lsx_vmaddwod_h_bu(reg1, tmpg, const_129);
1510     reg0 = __lsx_vmaddwev_h_bu(reg0, tmpr, const_66);
1511     reg1 = __lsx_vmaddwod_h_bu(reg1, tmpr, const_66);
1512     dst0 = __lsx_vpackod_b(reg1, reg0);
1513     __lsx_vst(dst0, dst_y, 0);
1514     dst_y += 16;
1515     src_argb1555 += 32;
1516   }
1517 }
1518 
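// After widening to 8-bit B/G/R, ARGB1555ToYRow_LSX above evaluates the
// BT.601 luma y = (25*B + 129*G + 66*R + 0x1080) >> 8; the 0x1080 bias folds
// in the +16 luma offset (16 << 8) plus 0x80 for rounding. Scalar sketch
// (hypothetical helper):
static inline uint8_t RGBToY_Sketch(uint8_t b, uint8_t g, uint8_t r) {
  return (uint8_t)((25 * b + 129 * g + 66 * r + 0x1080) >> 8);
}
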
1519 void ARGB1555ToUVRow_LSX(const uint8_t* src_argb1555,
1520                          int src_stride_argb1555,
1521                          uint8_t* dst_u,
1522                          uint8_t* dst_v,
1523                          int width) {
1524   int x;
1525   int len = width / 16;
1526   const uint8_t* next_argb1555 = src_argb1555 + src_stride_argb1555;
1527   __m128i src0, src1, src2, src3;
1528   __m128i tmp0, tmp1, tmp2, tmp3;
1529   __m128i tmpb, tmpg, tmpr, nexb, nexg, nexr;
1530   __m128i reg0, reg1, reg2, reg3, dst0;
1531   __m128i const_112 = __lsx_vldi(0x438);
1532   __m128i const_74 = __lsx_vldi(0x425);
1533   __m128i const_38 = __lsx_vldi(0x413);
1534   __m128i const_94 = __lsx_vldi(0x42F);
1535   __m128i const_18 = __lsx_vldi(0x409);
1536   __m128i const_8080 = (__m128i)v2u64{0x8080808080808080, 0x8080808080808080};
1537 
1538   for (x = 0; x < len; x++) {
1539     DUP4_ARG2(__lsx_vld, src_argb1555, 0, src_argb1555, 16, next_argb1555, 0,
1540               next_argb1555, 16, src0, src1, src2, src3);
1541     DUP2_ARG2(__lsx_vpickev_b, src1, src0, src3, src2, tmp0, tmp2);
1542     DUP2_ARG2(__lsx_vpickod_b, src1, src0, src3, src2, tmp1, tmp3);
1543     tmpb = __lsx_vandi_b(tmp0, 0x1F);
1544     nexb = __lsx_vandi_b(tmp2, 0x1F);
1545     tmpg = __lsx_vsrli_b(tmp0, 5);
1546     nexg = __lsx_vsrli_b(tmp2, 5);
1547     reg0 = __lsx_vandi_b(tmp1, 0x03);
1548     reg2 = __lsx_vandi_b(tmp3, 0x03);
1549     reg0 = __lsx_vslli_b(reg0, 3);
1550     reg2 = __lsx_vslli_b(reg2, 3);
1551     tmpg = __lsx_vor_v(tmpg, reg0);
1552     nexg = __lsx_vor_v(nexg, reg2);
1553     reg1 = __lsx_vandi_b(tmp1, 0x7C);
1554     reg3 = __lsx_vandi_b(tmp3, 0x7C);
1555     tmpr = __lsx_vsrli_b(reg1, 2);
1556     nexr = __lsx_vsrli_b(reg3, 2);
1557     reg0 = __lsx_vslli_b(tmpb, 3);
1558     reg1 = __lsx_vslli_b(tmpg, 3);
1559     reg2 = __lsx_vslli_b(tmpr, 3);
1560     tmpb = __lsx_vsrli_b(tmpb, 2);
1561     tmpg = __lsx_vsrli_b(tmpg, 2);
1562     tmpr = __lsx_vsrli_b(tmpr, 2);
1563     tmpb = __lsx_vor_v(reg0, tmpb);
1564     tmpg = __lsx_vor_v(reg1, tmpg);
1565     tmpr = __lsx_vor_v(reg2, tmpr);
1566     reg0 = __lsx_vslli_b(nexb, 3);
1567     reg1 = __lsx_vslli_b(nexg, 3);
1568     reg2 = __lsx_vslli_b(nexr, 3);
1569     nexb = __lsx_vsrli_b(nexb, 2);
1570     nexg = __lsx_vsrli_b(nexg, 2);
1571     nexr = __lsx_vsrli_b(nexr, 2);
1572     nexb = __lsx_vor_v(reg0, nexb);
1573     nexg = __lsx_vor_v(reg1, nexg);
1574     nexr = __lsx_vor_v(reg2, nexr);
1575     RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, dst0);
1576     __lsx_vstelm_d(dst0, dst_u, 0, 0);
1577     __lsx_vstelm_d(dst0, dst_v, 0, 1);
1578     dst_u += 8;
1579     dst_v += 8;
1580     src_argb1555 += 32;
1581     next_argb1555 += 32;
1582   }
1583 }
1584 
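// The RGBTOUV step above (macro defined earlier in this file) averages each
// 2x2 block of reconstructed pixels and applies the fixed-point BT.601
// chroma transform, roughly
//   u = (112*B - 74*G - 38*R + 0x8080) >> 8
//   v = (112*R - 94*G - 18*B + 0x8080) >> 8
// where the 0x8080 constant supplies the +128 chroma bias plus rounding.
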
1585 void RGB565ToYRow_LSX(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
1586   int x;
1587   int len = width / 16;
1588   __m128i src0, src1;
1589   __m128i tmp0, tmp1, tmpb, tmpg, tmpr;
1590   __m128i reg0, reg1, dst0;
1591   __m128i const_66 = __lsx_vldi(66);
1592   __m128i const_129 = __lsx_vldi(129);
1593   __m128i const_25 = __lsx_vldi(25);
1594   __m128i const_1080 = {0x1080108010801080, 0x1080108010801080};
1595 
1596   for (x = 0; x < len; x++) {
1597     src0 = __lsx_vld(src_rgb565, 0);
1598     src1 = __lsx_vld(src_rgb565, 16);
1599     tmp0 = __lsx_vpickev_b(src1, src0);
1600     tmp1 = __lsx_vpickod_b(src1, src0);
1601     tmpb = __lsx_vandi_b(tmp0, 0x1F);
1602     tmpr = __lsx_vandi_b(tmp1, 0xF8);
1603     reg1 = __lsx_vandi_b(tmp1, 0x07);
1604     reg0 = __lsx_vsrli_b(tmp0, 5);
1605     reg1 = __lsx_vslli_b(reg1, 3);
1606     tmpg = __lsx_vor_v(reg1, reg0);
1607     reg0 = __lsx_vslli_b(tmpb, 3);
1608     reg1 = __lsx_vsrli_b(tmpb, 2);
1609     tmpb = __lsx_vor_v(reg1, reg0);
1610     reg0 = __lsx_vslli_b(tmpg, 2);
1611     reg1 = __lsx_vsrli_b(tmpg, 4);
1612     tmpg = __lsx_vor_v(reg1, reg0);
1613     reg0 = __lsx_vsrli_b(tmpr, 5);
1614     tmpr = __lsx_vor_v(tmpr, reg0);
1615     reg0 = __lsx_vmaddwev_h_bu(const_1080, tmpb, const_25);
1616     reg1 = __lsx_vmaddwod_h_bu(const_1080, tmpb, const_25);
1617     reg0 = __lsx_vmaddwev_h_bu(reg0, tmpg, const_129);
1618     reg1 = __lsx_vmaddwod_h_bu(reg1, tmpg, const_129);
1619     reg0 = __lsx_vmaddwev_h_bu(reg0, tmpr, const_66);
1620     reg1 = __lsx_vmaddwod_h_bu(reg1, tmpr, const_66);
1621     dst0 = __lsx_vpackod_b(reg1, reg0);
1622     __lsx_vst(dst0, dst_y, 0);
1623     dst_y += 16;
1624     src_rgb565 += 32;
1625   }
1626 }
1627 
1628 void RGB565ToUVRow_LSX(const uint8_t* src_rgb565,
1629                        int src_stride_rgb565,
1630                        uint8_t* dst_u,
1631                        uint8_t* dst_v,
1632                        int width) {
1633   int x;
1634   int len = width / 16;
1635   const uint8_t* next_rgb565 = src_rgb565 + src_stride_rgb565;
1636   __m128i src0, src1, src2, src3;
1637   __m128i tmp0, tmp1, tmp2, tmp3;
1638   __m128i tmpb, tmpg, tmpr, nexb, nexg, nexr;
1639   __m128i reg0, reg1, reg2, reg3, dst0;
1640   __m128i const_112 = __lsx_vldi(0x438);
1641   __m128i const_74 = __lsx_vldi(0x425);
1642   __m128i const_38 = __lsx_vldi(0x413);
1643   __m128i const_94 = __lsx_vldi(0x42F);
1644   __m128i const_18 = __lsx_vldi(0x409);
1645   __m128i const_8080 = (__m128i)v2u64{0x8080808080808080, 0x8080808080808080};
1646 
1647   for (x = 0; x < len; x++) {
1648     DUP4_ARG2(__lsx_vld, src_rgb565, 0, src_rgb565, 16, next_rgb565, 0,
1649               next_rgb565, 16, src0, src1, src2, src3);
1650     DUP2_ARG2(__lsx_vpickev_b, src1, src0, src3, src2, tmp0, tmp2);
1651     DUP2_ARG2(__lsx_vpickod_b, src1, src0, src3, src2, tmp1, tmp3);
1652     tmpb = __lsx_vandi_b(tmp0, 0x1F);
1653     tmpr = __lsx_vandi_b(tmp1, 0xF8);
1654     nexb = __lsx_vandi_b(tmp2, 0x1F);
1655     nexr = __lsx_vandi_b(tmp3, 0xF8);
1656     reg1 = __lsx_vandi_b(tmp1, 0x07);
1657     reg3 = __lsx_vandi_b(tmp3, 0x07);
1658     reg0 = __lsx_vsrli_b(tmp0, 5);
1659     reg1 = __lsx_vslli_b(reg1, 3);
1660     reg2 = __lsx_vsrli_b(tmp2, 5);
1661     reg3 = __lsx_vslli_b(reg3, 3);
1662     tmpg = __lsx_vor_v(reg1, reg0);
1663     nexg = __lsx_vor_v(reg2, reg3);
1664     reg0 = __lsx_vslli_b(tmpb, 3);
1665     reg1 = __lsx_vsrli_b(tmpb, 2);
1666     reg2 = __lsx_vslli_b(nexb, 3);
1667     reg3 = __lsx_vsrli_b(nexb, 2);
1668     tmpb = __lsx_vor_v(reg1, reg0);
1669     nexb = __lsx_vor_v(reg2, reg3);
1670     reg0 = __lsx_vslli_b(tmpg, 2);
1671     reg1 = __lsx_vsrli_b(tmpg, 4);
1672     reg2 = __lsx_vslli_b(nexg, 2);
1673     reg3 = __lsx_vsrli_b(nexg, 4);
1674     tmpg = __lsx_vor_v(reg1, reg0);
1675     nexg = __lsx_vor_v(reg2, reg3);
1676     reg0 = __lsx_vsrli_b(tmpr, 5);
1677     reg2 = __lsx_vsrli_b(nexr, 5);
1678     tmpr = __lsx_vor_v(tmpr, reg0);
1679     nexr = __lsx_vor_v(nexr, reg2);
1680     RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, dst0);
1681     __lsx_vstelm_d(dst0, dst_u, 0, 0);
1682     __lsx_vstelm_d(dst0, dst_v, 0, 1);
1683     dst_u += 8;
1684     dst_v += 8;
1685     src_rgb565 += 32;
1686     next_rgb565 += 32;
1687   }
1688 }
1689 
1690 void RGB24ToUVRow_LSX(const uint8_t* src_rgb24,
1691                       int src_stride_rgb24,
1692                       uint8_t* dst_u,
1693                       uint8_t* dst_v,
1694                       int width) {
1695   int x;
1696   const uint8_t* next_rgb24 = src_rgb24 + src_stride_rgb24;
1697   int len = width / 16;
1698   __m128i src0, src1, src2;
1699   __m128i nex0, nex1, nex2, dst0;
1700   __m128i tmpb, tmpg, tmpr, nexb, nexg, nexr;
1701   __m128i const_112 = __lsx_vldi(0x438);
1702   __m128i const_74 = __lsx_vldi(0x425);
1703   __m128i const_38 = __lsx_vldi(0x413);
1704   __m128i const_94 = __lsx_vldi(0x42F);
1705   __m128i const_18 = __lsx_vldi(0x409);
1706   __m128i const_8080 = (__m128i)v2u64{0x8080808080808080, 0x8080808080808080};
1707   __m128i shuff0_b = {0x15120F0C09060300, 0x00000000001E1B18};
1708   __m128i shuff1_b = {0x0706050403020100, 0x1D1A1714110A0908};
1709   __m128i shuff0_g = {0x1613100D0A070401, 0x00000000001F1C19};
1710   __m128i shuff1_g = {0x0706050403020100, 0x1E1B1815120A0908};
1711   __m128i shuff0_r = {0x1714110E0B080502, 0x0000000000001D1A};
1712   __m128i shuff1_r = {0x0706050403020100, 0x1F1C191613100908};
1713 
1714   for (x = 0; x < len; x++) {
1715     src0 = __lsx_vld(src_rgb24, 0);
1716     src1 = __lsx_vld(src_rgb24, 16);
1717     src2 = __lsx_vld(src_rgb24, 32);
1718     nex0 = __lsx_vld(next_rgb24, 0);
1719     nex1 = __lsx_vld(next_rgb24, 16);
1720     nex2 = __lsx_vld(next_rgb24, 32);
1721     DUP2_ARG3(__lsx_vshuf_b, src1, src0, shuff0_b, nex1, nex0, shuff0_b, tmpb,
1722               nexb);
1723     DUP2_ARG3(__lsx_vshuf_b, src1, src0, shuff0_g, nex1, nex0, shuff0_g, tmpg,
1724               nexg);
1725     DUP2_ARG3(__lsx_vshuf_b, src1, src0, shuff0_r, nex1, nex0, shuff0_r, tmpr,
1726               nexr);
1727     DUP2_ARG3(__lsx_vshuf_b, src2, tmpb, shuff1_b, nex2, nexb, shuff1_b, tmpb,
1728               nexb);
1729     DUP2_ARG3(__lsx_vshuf_b, src2, tmpg, shuff1_g, nex2, nexg, shuff1_g, tmpg,
1730               nexg);
1731     DUP2_ARG3(__lsx_vshuf_b, src2, tmpr, shuff1_r, nex2, nexr, shuff1_r, tmpr,
1732               nexr);
1733     RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, dst0);
1734     __lsx_vstelm_d(dst0, dst_u, 0, 0);
1735     __lsx_vstelm_d(dst0, dst_v, 0, 1);
1736     dst_u += 8;
1737     dst_v += 8;
1738     src_rgb24 += 48;
1739     next_rgb24 += 48;
1740   }
1741 }
1742 
1743 void RAWToUVRow_LSX(const uint8_t* src_raw,
1744                     int src_stride_raw,
1745                     uint8_t* dst_u,
1746                     uint8_t* dst_v,
1747                     int width) {
1748   int x;
1749   const uint8_t* next_raw = src_raw + src_stride_raw;
1750   int len = width / 16;
1751   __m128i src0, src1, src2;
1752   __m128i nex0, nex1, nex2, dst0;
1753   __m128i tmpb, tmpg, tmpr, nexb, nexg, nexr;
1754   __m128i const_112 = __lsx_vldi(0x438);
1755   __m128i const_74 = __lsx_vldi(0x425);
1756   __m128i const_38 = __lsx_vldi(0x413);
1757   __m128i const_94 = __lsx_vldi(0x42F);
1758   __m128i const_18 = __lsx_vldi(0x409);
1759   __m128i const_8080 = (__m128i)v2u64{0x8080808080808080, 0x8080808080808080};
1760   __m128i shuff0_r = {0x15120F0C09060300, 0x00000000001E1B18};
1761   __m128i shuff1_r = {0x0706050403020100, 0x1D1A1714110A0908};
1762   __m128i shuff0_g = {0x1613100D0A070401, 0x00000000001F1C19};
1763   __m128i shuff1_g = {0x0706050403020100, 0x1E1B1815120A0908};
1764   __m128i shuff0_b = {0x1714110E0B080502, 0x0000000000001D1A};
1765   __m128i shuff1_b = {0x0706050403020100, 0x1F1C191613100908};
1766 
1767   for (x = 0; x < len; x++) {
1768     src0 = __lsx_vld(src_raw, 0);
1769     src1 = __lsx_vld(src_raw, 16);
1770     src2 = __lsx_vld(src_raw, 32);
1771     nex0 = __lsx_vld(next_raw, 0);
1772     nex1 = __lsx_vld(next_raw, 16);
1773     nex2 = __lsx_vld(next_raw, 32);
1774     DUP2_ARG3(__lsx_vshuf_b, src1, src0, shuff0_b, nex1, nex0, shuff0_b, tmpb,
1775               nexb);
1776     DUP2_ARG3(__lsx_vshuf_b, src1, src0, shuff0_g, nex1, nex0, shuff0_g, tmpg,
1777               nexg);
1778     DUP2_ARG3(__lsx_vshuf_b, src1, src0, shuff0_r, nex1, nex0, shuff0_r, tmpr,
1779               nexr);
1780     DUP2_ARG3(__lsx_vshuf_b, src2, tmpb, shuff1_b, nex2, nexb, shuff1_b, tmpb,
1781               nexb);
1782     DUP2_ARG3(__lsx_vshuf_b, src2, tmpg, shuff1_g, nex2, nexg, shuff1_g, tmpg,
1783               nexg);
1784     DUP2_ARG3(__lsx_vshuf_b, src2, tmpr, shuff1_r, nex2, nexr, shuff1_r, tmpr,
1785               nexr);
1786     RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, dst0);
1787     __lsx_vstelm_d(dst0, dst_u, 0, 0);
1788     __lsx_vstelm_d(dst0, dst_v, 0, 1);
1789     dst_u += 8;
1790     dst_v += 8;
1791     src_raw += 48;
1792     next_raw += 48;
1793   }
1794 }
1795 
1796 void NV12ToARGBRow_LSX(const uint8_t* src_y,
1797                        const uint8_t* src_uv,
1798                        uint8_t* dst_argb,
1799                        const struct YuvConstants* yuvconstants,
1800                        int width) {
1801   int x;
1802   int len = width / 8;
1803   __m128i vec_y, vec_vu;
1804   __m128i vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb;
1805   __m128i vec_vrub, vec_vgug;
1806   __m128i out_b, out_g, out_r;
1807   __m128i const_80 = __lsx_vldi(0x480);
1808   __m128i alpha = __lsx_vldi(0xFF);
1809   __m128i zero = __lsx_vldi(0);
1810 
1811   YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb);
1812   vec_vrub = __lsx_vilvl_h(vec_vr, vec_ub);
1813   vec_vgug = __lsx_vilvl_h(vec_vg, vec_ug);
1814 
1815   for (x = 0; x < len; x++) {
1816     vec_y = __lsx_vld(src_y, 0);
1817     vec_vu = __lsx_vld(src_uv, 0);
1818     YUVTORGB(vec_y, vec_vu, vec_vrub, vec_vgug, vec_yg, vec_yb, out_b, out_g,
1819              out_r);
1820     STOREARGB(alpha, out_r, out_g, out_b, dst_argb);
1821     src_y += 8;
1822     src_uv += 8;
1823   }
1824 }
1825 
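// YUVTORGB above (defined earlier in this file) is the usual libyuv
// fixed-point stage: Y is scaled by kYToRgb and biased with kYBiasToRgb,
// U/V are re-centered by subtracting 0x80, scaled by the kUVTo{B,G,R}
// coefficients, and the sums are shifted down and clamped to 0..255. The
// setup packs vr/ub and vg/ug into paired halfwords so the interleaved NV12
// U/V bytes can feed widening multiply-add instructions directly.
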
1826 void NV12ToRGB565Row_LSX(const uint8_t* src_y,
1827                          const uint8_t* src_uv,
1828                          uint8_t* dst_rgb565,
1829                          const struct YuvConstants* yuvconstants,
1830                          int width) {
1831   int x;
1832   int len = width / 8;
1833   __m128i vec_y, vec_vu;
1834   __m128i vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb;
1835   __m128i vec_vrub, vec_vgug;
1836   __m128i out_b, out_g, out_r;
1837   __m128i const_80 = __lsx_vldi(0x480);
1838   __m128i zero = __lsx_vldi(0);
1839 
1840   YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb);
1841   vec_vrub = __lsx_vilvl_h(vec_vr, vec_ub);
1842   vec_vgug = __lsx_vilvl_h(vec_vg, vec_ug);
1843 
1844   for (x = 0; x < len; x++) {
1845     vec_y = __lsx_vld(src_y, 0);
1846     vec_vu = __lsx_vld(src_uv, 0);
1847     YUVTORGB(vec_y, vec_vu, vec_vrub, vec_vgug, vec_yg, vec_yb, out_b, out_g,
1848              out_r);
1849     out_b = __lsx_vsrli_h(out_b, 3);
1850     out_g = __lsx_vsrli_h(out_g, 2);
1851     out_r = __lsx_vsrli_h(out_r, 3);
1852     out_g = __lsx_vslli_h(out_g, 5);
1853     out_r = __lsx_vslli_h(out_r, 11);
1854     out_r = __lsx_vor_v(out_r, out_g);
1855     out_r = __lsx_vor_v(out_r, out_b);
1856     __lsx_vst(out_r, dst_rgb565, 0);
1857     src_y += 8;
1858     src_uv += 8;
1859     dst_rgb565 += 16;
1860   }
1861 }
1862 
1863 void NV21ToARGBRow_LSX(const uint8_t* src_y,
1864                        const uint8_t* src_vu,
1865                        uint8_t* dst_argb,
1866                        const struct YuvConstants* yuvconstants,
1867                        int width) {
1868   int x;
1869   int len = width / 8;
1870   __m128i vec_y, vec_uv;
1871   __m128i vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb;
1872   __m128i vec_ubvr, vec_ugvg;
1873   __m128i out_b, out_g, out_r;
1874   __m128i const_80 = __lsx_vldi(0x480);
1875   __m128i alpha = __lsx_vldi(0xFF);
1876   __m128i zero = __lsx_vldi(0);
1877 
1878   YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb);
1879   vec_ubvr = __lsx_vilvl_h(vec_ub, vec_vr);
1880   vec_ugvg = __lsx_vilvl_h(vec_ug, vec_vg);
1881 
1882   for (x = 0; x < len; x++) {
1883     vec_y = __lsx_vld(src_y, 0);
1884     vec_uv = __lsx_vld(src_vu, 0);
1885     YUVTORGB(vec_y, vec_uv, vec_ubvr, vec_ugvg, vec_yg, vec_yb, out_r, out_g,
1886              out_b);
1887     STOREARGB(alpha, out_r, out_g, out_b, dst_argb);
1888     src_y += 8;
1889     src_vu += 8;
1890   }
1891 }
1892 
1893 void SobelRow_LSX(const uint8_t* src_sobelx,
1894                   const uint8_t* src_sobely,
1895                   uint8_t* dst_argb,
1896                   int width) {
1897   int x;
1898   int len = width / 16;
1899   __m128i src0, src1, tmp0;
1900   __m128i out0, out1, out2, out3;
1901   __m128i alpha = __lsx_vldi(0xFF);
1902   __m128i shuff0 = {0x1001010110000000, 0x1003030310020202};
1903   __m128i shuff1 = __lsx_vaddi_bu(shuff0, 0x04);
1904   __m128i shuff2 = __lsx_vaddi_bu(shuff1, 0x04);
1905   __m128i shuff3 = __lsx_vaddi_bu(shuff2, 0x04);
1906 
1907   for (x = 0; x < len; x++) {
1908     src0 = __lsx_vld(src_sobelx, 0);
1909     src1 = __lsx_vld(src_sobely, 0);
1910     tmp0 = __lsx_vsadd_bu(src0, src1);
1911     DUP4_ARG3(__lsx_vshuf_b, alpha, tmp0, shuff0, alpha, tmp0, shuff1, alpha,
1912               tmp0, shuff2, alpha, tmp0, shuff3, out0, out1, out2, out3);
1913     __lsx_vst(out0, dst_argb, 0);
1914     __lsx_vst(out1, dst_argb, 16);
1915     __lsx_vst(out2, dst_argb, 32);
1916     __lsx_vst(out3, dst_argb, 48);
1917     src_sobelx += 16;
1918     src_sobely += 16;
1919     dst_argb += 64;
1920   }
1921 }
1922 
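// SobelRow_LSX above saturating-adds the two gradient planes,
// s = min(sobelx + sobely, 255), then expands each result byte into a gray
// ARGB pixel (B = G = R = s, A = 0xFF); the shuff0..shuff3 controls use
// index 0x10 to reach into the all-0xFF alpha vector.
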
1923 void SobelToPlaneRow_LSX(const uint8_t* src_sobelx,
1924                          const uint8_t* src_sobely,
1925                          uint8_t* dst_y,
1926                          int width) {
1927   int x;
1928   int len = width / 32;
1929   __m128i src0, src1, src2, src3, dst0, dst1;
1930 
1931   for (x = 0; x < len; x++) {
1932     DUP2_ARG2(__lsx_vld, src_sobelx, 0, src_sobelx, 16, src0, src1);
1933     DUP2_ARG2(__lsx_vld, src_sobely, 0, src_sobely, 16, src2, src3);
1934     dst0 = __lsx_vsadd_bu(src0, src2);
1935     dst1 = __lsx_vsadd_bu(src1, src3);
1936     __lsx_vst(dst0, dst_y, 0);
1937     __lsx_vst(dst1, dst_y, 16);
1938     src_sobelx += 32;
1939     src_sobely += 32;
1940     dst_y += 32;
1941   }
1942 }
1943 
1944 void SobelXYRow_LSX(const uint8_t* src_sobelx,
1945                     const uint8_t* src_sobely,
1946                     uint8_t* dst_argb,
1947                     int width) {
1948   int x;
1949   int len = width / 16;
1950   __m128i src_r, src_b, src_g;
1951   __m128i tmp0, tmp1, tmp2, tmp3;
1952   __m128i dst0, dst1, dst2, dst3;
1953   __m128i alpha = __lsx_vldi(0xFF);
1954 
1955   for (x = 0; x < len; x++) {
1956     src_r = __lsx_vld(src_sobelx, 0);
1957     src_b = __lsx_vld(src_sobely, 0);
1958     src_g = __lsx_vsadd_bu(src_r, src_b);
1959     tmp0 = __lsx_vilvl_b(src_g, src_b);
1960     tmp1 = __lsx_vilvh_b(src_g, src_b);
1961     tmp2 = __lsx_vilvl_b(alpha, src_r);
1962     tmp3 = __lsx_vilvh_b(alpha, src_r);
1963     dst0 = __lsx_vilvl_h(tmp2, tmp0);
1964     dst1 = __lsx_vilvh_h(tmp2, tmp0);
1965     dst2 = __lsx_vilvl_h(tmp3, tmp1);
1966     dst3 = __lsx_vilvh_h(tmp3, tmp1);
1967     __lsx_vst(dst0, dst_argb, 0);
1968     __lsx_vst(dst1, dst_argb, 16);
1969     __lsx_vst(dst2, dst_argb, 32);
1970     __lsx_vst(dst3, dst_argb, 48);
1971     src_sobelx += 16;
1972     src_sobely += 16;
1973     dst_argb += 64;
1974   }
1975 }
1976 
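// SobelXYRow_LSX above packs both gradients into one ARGB pixel per source
// byte: B = sobely, G = the saturated sum of the two gradients, R = sobelx,
// A = 0xFF.
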
1977 void BGRAToUVRow_LSX(const uint8_t* src_bgra,
1978                      int src_stride_bgra,
1979                      uint8_t* dst_u,
1980                      uint8_t* dst_v,
1981                      int width) {
1982   int x;
1983   const uint8_t* next_bgra = src_bgra + src_stride_bgra;
1984   int len = width / 16;
1985   __m128i src0, src1, src2, src3;
1986   __m128i nex0, nex1, nex2, nex3;
1987   __m128i tmp0, tmp1, tmp2, tmp3, dst0;
1988   __m128i tmpb, tmpg, tmpr, nexb, nexg, nexr;
1989   __m128i const_112 = __lsx_vldi(0x438);
1990   __m128i const_74 = __lsx_vldi(0x425);
1991   __m128i const_38 = __lsx_vldi(0x413);
1992   __m128i const_94 = __lsx_vldi(0x42F);
1993   __m128i const_18 = __lsx_vldi(0x409);
1994   __m128i const_8080 = (__m128i)v2u64{0x8080808080808080, 0x8080808080808080};
1995 
1996   for (x = 0; x < len; x++) {
1997     DUP4_ARG2(__lsx_vld, src_bgra, 0, src_bgra, 16, src_bgra, 32, src_bgra, 48,
1998               src0, src1, src2, src3);
1999     DUP4_ARG2(__lsx_vld, next_bgra, 0, next_bgra, 16, next_bgra, 32, next_bgra,
2000               48, nex0, nex1, nex2, nex3);
2001     tmp0 = __lsx_vpickod_b(src1, src0);
2002     tmp1 = __lsx_vpickev_b(src1, src0);
2003     tmp2 = __lsx_vpickod_b(src3, src2);
2004     tmp3 = __lsx_vpickev_b(src3, src2);
2005     tmpb = __lsx_vpickod_b(tmp2, tmp0);
2006     tmpr = __lsx_vpickev_b(tmp2, tmp0);
2007     tmpg = __lsx_vpickod_b(tmp3, tmp1);
2008     tmp0 = __lsx_vpickod_b(nex1, nex0);
2009     tmp1 = __lsx_vpickev_b(nex1, nex0);
2010     tmp2 = __lsx_vpickod_b(nex3, nex2);
2011     tmp3 = __lsx_vpickev_b(nex3, nex2);
2012     nexb = __lsx_vpickod_b(tmp2, tmp0);
2013     nexr = __lsx_vpickev_b(tmp2, tmp0);
2014     nexg = __lsx_vpickod_b(tmp3, tmp1);
2015     RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, dst0);
2016     __lsx_vstelm_d(dst0, dst_u, 0, 0);
2017     __lsx_vstelm_d(dst0, dst_v, 0, 1);
2018     dst_u += 8;
2019     dst_v += 8;
2020     src_bgra += 64;
2021     next_bgra += 64;
2022   }
2023 }
2024 
2025 void ABGRToUVRow_LSX(const uint8_t* src_abgr,
2026                      int src_stride_abgr,
2027                      uint8_t* dst_u,
2028                      uint8_t* dst_v,
2029                      int width) {
2030   int x;
2031   const uint8_t* next_abgr = src_abgr + src_stride_abgr;
2032   int len = width / 16;
2033   __m128i src0, src1, src2, src3;
2034   __m128i nex0, nex1, nex2, nex3;
2035   __m128i tmp0, tmp1, tmp2, tmp3, dst0;
2036   __m128i tmpb, tmpg, tmpr, nexb, nexg, nexr;
2037   __m128i const_112 = __lsx_vldi(0x438);
2038   __m128i const_74 = __lsx_vldi(0x425);
2039   __m128i const_38 = __lsx_vldi(0x413);
2040   __m128i const_94 = __lsx_vldi(0x42F);
2041   __m128i const_18 = __lsx_vldi(0x409);
2042   __m128i const_8080 = (__m128i)v2u64{0x8080808080808080, 0x8080808080808080};
2043 
2044   for (x = 0; x < len; x++) {
2045     DUP4_ARG2(__lsx_vld, src_abgr, 0, src_abgr, 16, src_abgr, 32, src_abgr, 48,
2046               src0, src1, src2, src3);
2047     DUP4_ARG2(__lsx_vld, next_abgr, 0, next_abgr, 16, next_abgr, 32, next_abgr,
2048               48, nex0, nex1, nex2, nex3);
2049     tmp0 = __lsx_vpickev_b(src1, src0);
2050     tmp1 = __lsx_vpickod_b(src1, src0);
2051     tmp2 = __lsx_vpickev_b(src3, src2);
2052     tmp3 = __lsx_vpickod_b(src3, src2);
2053     tmpb = __lsx_vpickod_b(tmp2, tmp0);
2054     tmpr = __lsx_vpickev_b(tmp2, tmp0);
2055     tmpg = __lsx_vpickev_b(tmp3, tmp1);
2056     tmp0 = __lsx_vpickev_b(nex1, nex0);
2057     tmp1 = __lsx_vpickod_b(nex1, nex0);
2058     tmp2 = __lsx_vpickev_b(nex3, nex2);
2059     tmp3 = __lsx_vpickod_b(nex3, nex2);
2060     nexb = __lsx_vpickod_b(tmp2, tmp0);
2061     nexr = __lsx_vpickev_b(tmp2, tmp0);
2062     nexg = __lsx_vpickev_b(tmp3, tmp1);
2063     RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, dst0);
2064     __lsx_vstelm_d(dst0, dst_u, 0, 0);
2065     __lsx_vstelm_d(dst0, dst_v, 0, 1);
2066     dst_u += 8;
2067     dst_v += 8;
2068     src_abgr += 64;
2069     next_abgr += 64;
2070   }
2071 }
2072 
2073 void RGBAToUVRow_LSX(const uint8_t* src_rgba,
2074                      int src_stride_rgba,
2075                      uint8_t* dst_u,
2076                      uint8_t* dst_v,
2077                      int width) {
2078   int x;
2079   const uint8_t* next_rgba = src_rgba + src_stride_rgba;
2080   int len = width / 16;
2081   __m128i src0, src1, src2, src3;
2082   __m128i nex0, nex1, nex2, nex3;
2083   __m128i tmp0, tmp1, tmp2, tmp3, dst0;
2084   __m128i tmpb, tmpg, tmpr, nexb, nexg, nexr;
2085   __m128i const_112 = __lsx_vldi(0x438);
2086   __m128i const_74 = __lsx_vldi(0x425);
2087   __m128i const_38 = __lsx_vldi(0x413);
2088   __m128i const_94 = __lsx_vldi(0x42F);
2089   __m128i const_18 = __lsx_vldi(0x409);
2090   __m128i const_8080 = (__m128i)v2u64{0x8080808080808080, 0x8080808080808080};
2091 
2092   for (x = 0; x < len; x++) {
2093     DUP4_ARG2(__lsx_vld, src_rgba, 0, src_rgba, 16, src_rgba, 32, src_rgba, 48,
2094               src0, src1, src2, src3);
2095     DUP4_ARG2(__lsx_vld, next_rgba, 0, next_rgba, 16, next_rgba, 32, next_rgba,
2096               48, nex0, nex1, nex2, nex3);
2097     tmp0 = __lsx_vpickod_b(src1, src0);
2098     tmp1 = __lsx_vpickev_b(src1, src0);
2099     tmp2 = __lsx_vpickod_b(src3, src2);
2100     tmp3 = __lsx_vpickev_b(src3, src2);
2101     tmpr = __lsx_vpickod_b(tmp2, tmp0);
2102     tmpb = __lsx_vpickev_b(tmp2, tmp0);
2103     tmpg = __lsx_vpickod_b(tmp3, tmp1);
2104     tmp0 = __lsx_vpickod_b(nex1, nex0);
2105     tmp1 = __lsx_vpickev_b(nex1, nex0);
2106     tmp2 = __lsx_vpickod_b(nex3, nex2);
2107     tmp3 = __lsx_vpickev_b(nex3, nex2);
2108     nexr = __lsx_vpickod_b(tmp2, tmp0);
2109     nexb = __lsx_vpickev_b(tmp2, tmp0);
2110     nexg = __lsx_vpickod_b(tmp3, tmp1);
2111     RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, dst0);
2112     __lsx_vstelm_d(dst0, dst_u, 0, 0);
2113     __lsx_vstelm_d(dst0, dst_v, 0, 1);
2114     dst_u += 8;
2115     dst_v += 8;
2116     src_rgba += 64;
2117     next_rgba += 64;
2118   }
2119 }
2120 
2121 void ARGBToUVJRow_LSX(const uint8_t* src_argb,
2122                       int src_stride_argb,
2123                       uint8_t* dst_u,
2124                       uint8_t* dst_v,
2125                       int width) {
2126   int x;
2127   const uint8_t* next_argb = src_argb + src_stride_argb;
2128   int len = width / 16;
2129   __m128i src0, src1, src2, src3;
2130   __m128i nex0, nex1, nex2, nex3;
2131   __m128i tmp0, tmp1, tmp2, tmp3;
2132   __m128i reg0, reg1, dst0;
2133   __m128i tmpb, tmpg, tmpr, nexb, nexg, nexr;
2134   __m128i const_63 = __lsx_vldi(0x43F);
2135   __m128i const_42 = __lsx_vldi(0x42A);
2136   __m128i const_21 = __lsx_vldi(0x415);
2137   __m128i const_53 = __lsx_vldi(0x435);
2138   __m128i const_10 = __lsx_vldi(0x40A);
2139   __m128i const_8080 = (__m128i)v2u64{0x8080808080808080, 0x8080808080808080};
2140 
2141   for (x = 0; x < len; x++) {
2142     DUP4_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src_argb, 32, src_argb, 48,
2143               src0, src1, src2, src3);
2144     DUP4_ARG2(__lsx_vld, next_argb, 0, next_argb, 16, next_argb, 32, next_argb,
2145               48, nex0, nex1, nex2, nex3);
2146     tmp0 = __lsx_vpickev_b(src1, src0);
2147     tmp1 = __lsx_vpickod_b(src1, src0);
2148     tmp2 = __lsx_vpickev_b(src3, src2);
2149     tmp3 = __lsx_vpickod_b(src3, src2);
2150     tmpr = __lsx_vpickod_b(tmp2, tmp0);
2151     tmpb = __lsx_vpickev_b(tmp2, tmp0);
2152     tmpg = __lsx_vpickev_b(tmp3, tmp1);
2153     tmp0 = __lsx_vpickev_b(nex1, nex0);
2154     tmp1 = __lsx_vpickod_b(nex1, nex0);
2155     tmp2 = __lsx_vpickev_b(nex3, nex2);
2156     tmp3 = __lsx_vpickod_b(nex3, nex2);
2157     nexr = __lsx_vpickod_b(tmp2, tmp0);
2158     nexb = __lsx_vpickev_b(tmp2, tmp0);
2159     nexg = __lsx_vpickev_b(tmp3, tmp1);
2160     tmp0 = __lsx_vaddwev_h_bu(tmpb, nexb);
2161     tmp1 = __lsx_vaddwod_h_bu(tmpb, nexb);
2162     tmp2 = __lsx_vaddwev_h_bu(tmpg, nexg);
2163     tmp3 = __lsx_vaddwod_h_bu(tmpg, nexg);
2164     reg0 = __lsx_vaddwev_h_bu(tmpr, nexr);
2165     reg1 = __lsx_vaddwod_h_bu(tmpr, nexr);
2166     tmpb = __lsx_vavgr_hu(tmp0, tmp1);
2167     tmpg = __lsx_vavgr_hu(tmp2, tmp3);
2168     tmpr = __lsx_vavgr_hu(reg0, reg1);
2169     reg0 = __lsx_vmadd_h(const_8080, const_63, tmpb);
2170     reg1 = __lsx_vmadd_h(const_8080, const_63, tmpr);
2171     reg0 = __lsx_vmsub_h(reg0, const_42, tmpg);
2172     reg1 = __lsx_vmsub_h(reg1, const_53, tmpg);
2173     reg0 = __lsx_vmsub_h(reg0, const_21, tmpr);
2174     reg1 = __lsx_vmsub_h(reg1, const_10, tmpb);
2175     dst0 = __lsx_vpickod_b(reg1, reg0);
2176     __lsx_vstelm_d(dst0, dst_u, 0, 0);
2177     __lsx_vstelm_d(dst0, dst_v, 0, 1);
2178     dst_u += 8;
2179     dst_v += 8;
2180     src_argb += 64;
2181     next_argb += 64;
2182   }
2183 }
2184 
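// ARGBToUVJRow_LSX above is the full-range (JPEG) chroma variant. The 2x2
// block averages (built with vavgr) go through
//   u = (63*B - 42*G - 21*R + 0x8080) >> 8
//   v = (63*R - 53*G - 10*B + 0x8080) >> 8
// implemented with halfword vmadd/vmsub rather than the byte dot products
// used by the BT.601 rows above.
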
2185 void I444ToARGBRow_LSX(const uint8_t* src_y,
2186                        const uint8_t* src_u,
2187                        const uint8_t* src_v,
2188                        uint8_t* dst_argb,
2189                        const struct YuvConstants* yuvconstants,
2190                        int width) {
2191   int x;
2192   int len = width / 16;
2193   __m128i vec_y, vec_u, vec_v, out_b, out_g, out_r;
2194   __m128i vec_yl, vec_yh, vec_ul, vec_vl, vec_uh, vec_vh;
2195   __m128i vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb, vec_ugvg;
2196   __m128i const_80 = __lsx_vldi(0x480);
2197   __m128i alpha = __lsx_vldi(0xFF);
2198   __m128i zero = __lsx_vldi(0);
2199 
2200   YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb);
2201   vec_ugvg = __lsx_vilvl_h(vec_ug, vec_vg);
2202 
2203   for (x = 0; x < len; x++) {
2204     vec_y = __lsx_vld(src_y, 0);
2205     vec_u = __lsx_vld(src_u, 0);
2206     vec_v = __lsx_vld(src_v, 0);
2207     vec_yl = __lsx_vilvl_b(vec_y, vec_y);
2208     vec_ul = __lsx_vilvl_b(zero, vec_u);
2209     vec_vl = __lsx_vilvl_b(zero, vec_v);
2210     I444TORGB(vec_yl, vec_ul, vec_vl, vec_ub, vec_vr, vec_ugvg, vec_yg, vec_yb,
2211               out_b, out_g, out_r);
2212     STOREARGB(alpha, out_r, out_g, out_b, dst_argb);
2213     vec_yh = __lsx_vilvh_b(vec_y, vec_y);
2214     vec_uh = __lsx_vilvh_b(zero, vec_u);
2215     vec_vh = __lsx_vilvh_b(zero, vec_v);
2216     I444TORGB(vec_yh, vec_uh, vec_vh, vec_ub, vec_vr, vec_ugvg, vec_yg, vec_yb,
2217               out_b, out_g, out_r);
2218     STOREARGB(alpha, out_r, out_g, out_b, dst_argb);
2219     src_y += 16;
2220     src_u += 16;
2221     src_v += 16;
2222   }
2223 }
2224 
2225 void I400ToARGBRow_LSX(const uint8_t* src_y,
2226                        uint8_t* dst_argb,
2227                        const struct YuvConstants* yuvconstants,
2228                        int width) {
2229   int x;
2230   int len = width / 16;
2231   __m128i vec_y, vec_yl, vec_yh, out0;
2232   __m128i y_ev, y_od, dst0, dst1, dst2, dst3;
2233   __m128i temp0, temp1;
2234   __m128i alpha = __lsx_vldi(0xFF);
2235   __m128i vec_yg = __lsx_vreplgr2vr_h(yuvconstants->kYToRgb[0]);
2236   __m128i vec_yb = __lsx_vreplgr2vr_w(yuvconstants->kYBiasToRgb[0]);
2237 
2238   for (x = 0; x < len; x++) {
2239     vec_y = __lsx_vld(src_y, 0);
2240     vec_yl = __lsx_vilvl_b(vec_y, vec_y);
2241     y_ev = __lsx_vmulwev_w_hu_h(vec_yl, vec_yg);
2242     y_od = __lsx_vmulwod_w_hu_h(vec_yl, vec_yg);
2243     y_ev = __lsx_vsrai_w(y_ev, 16);
2244     y_od = __lsx_vsrai_w(y_od, 16);
2245     y_ev = __lsx_vadd_w(y_ev, vec_yb);
2246     y_od = __lsx_vadd_w(y_od, vec_yb);
2247     y_ev = __lsx_vsrai_w(y_ev, 6);
2248     y_od = __lsx_vsrai_w(y_od, 6);
2249     y_ev = __lsx_vclip255_w(y_ev);
2250     y_od = __lsx_vclip255_w(y_od);
2251     out0 = __lsx_vpackev_h(y_od, y_ev);
2252     temp0 = __lsx_vpackev_b(out0, out0);
2253     temp1 = __lsx_vpackev_b(alpha, out0);
2254     dst0 = __lsx_vilvl_h(temp1, temp0);
2255     dst1 = __lsx_vilvh_h(temp1, temp0);
2256     vec_yh = __lsx_vilvh_b(vec_y, vec_y);
2257     y_ev = __lsx_vmulwev_w_hu_h(vec_yh, vec_yg);
2258     y_od = __lsx_vmulwod_w_hu_h(vec_yh, vec_yg);
2259     y_ev = __lsx_vsrai_w(y_ev, 16);
2260     y_od = __lsx_vsrai_w(y_od, 16);
2261     y_ev = __lsx_vadd_w(y_ev, vec_yb);
2262     y_od = __lsx_vadd_w(y_od, vec_yb);
2263     y_ev = __lsx_vsrai_w(y_ev, 6);
2264     y_od = __lsx_vsrai_w(y_od, 6);
2265     y_ev = __lsx_vclip255_w(y_ev);
2266     y_od = __lsx_vclip255_w(y_od);
2267     out0 = __lsx_vpackev_h(y_od, y_ev);
2268     temp0 = __lsx_vpackev_b(out0, out0);
2269     temp1 = __lsx_vpackev_b(alpha, out0);
2270     dst2 = __lsx_vilvl_h(temp1, temp0);
2271     dst3 = __lsx_vilvh_h(temp1, temp0);
2272     __lsx_vst(dst0, dst_argb, 0);
2273     __lsx_vst(dst1, dst_argb, 16);
2274     __lsx_vst(dst2, dst_argb, 32);
2275     __lsx_vst(dst3, dst_argb, 48);
2276     dst_argb += 64;
2277     src_y += 16;
2278   }
2279 }
2280 
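// I400ToARGBRow_LSX above runs the luma-only path: each Y byte is widened to
// y * 0x0101, multiplied by kYToRgb, shifted down 16, biased by kYBiasToRgb,
// shifted down 6 and clamped, then replicated into B, G and R with 0xFF
// alpha. Scalar sketch (hypothetical helper; yg and yb stand for
// kYToRgb[0] and kYBiasToRgb[0]):
static inline uint8_t I400ToGray_Sketch(uint8_t y, int16_t yg, int32_t yb) {
  int32_t v = (int32_t)(((int64_t)y * 0x0101 * yg) >> 16) + yb;
  v >>= 6;
  if (v < 0) v = 0;
  if (v > 255) v = 255;
  return (uint8_t)v;
}
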
2281 void J400ToARGBRow_LSX(const uint8_t* src_y, uint8_t* dst_argb, int width) {
2282   int x;
2283   int len = width / 16;
2284   __m128i vec_y, dst0, dst1, dst2, dst3;
2285   __m128i tmp0, tmp1, tmp2, tmp3;
2286   __m128i alpha = __lsx_vldi(0xFF);
2287 
2288   for (x = 0; x < len; x++) {
2289     vec_y = __lsx_vld(src_y, 0);
2290     tmp0 = __lsx_vilvl_b(vec_y, vec_y);
2291     tmp1 = __lsx_vilvh_b(vec_y, vec_y);
2292     tmp2 = __lsx_vilvl_b(alpha, vec_y);
2293     tmp3 = __lsx_vilvh_b(alpha, vec_y);
2294     dst0 = __lsx_vilvl_h(tmp2, tmp0);
2295     dst1 = __lsx_vilvh_h(tmp2, tmp0);
2296     dst2 = __lsx_vilvl_h(tmp3, tmp1);
2297     dst3 = __lsx_vilvh_h(tmp3, tmp1);
2298     __lsx_vst(dst0, dst_argb, 0);
2299     __lsx_vst(dst1, dst_argb, 16);
2300     __lsx_vst(dst2, dst_argb, 32);
2301     __lsx_vst(dst3, dst_argb, 48);
2302     dst_argb += 64;
2303     src_y += 16;
2304   }
2305 }
2306 
2307 void YUY2ToARGBRow_LSX(const uint8_t* src_yuy2,
2308                        uint8_t* dst_argb,
2309                        const struct YuvConstants* yuvconstants,
2310                        int width) {
2311   int x;
2312   int len = width / 8;
2313   __m128i src0, vec_y, vec_vu;
2314   __m128i vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb;
2315   __m128i vec_vrub, vec_vgug;
2316   __m128i out_b, out_g, out_r;
2317   __m128i const_80 = __lsx_vldi(0x480);
2318   __m128i zero = __lsx_vldi(0);
2319   __m128i alpha = __lsx_vldi(0xFF);
2320 
2321   YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb);
2322   vec_vrub = __lsx_vilvl_h(vec_vr, vec_ub);
2323   vec_vgug = __lsx_vilvl_h(vec_vg, vec_ug);
2324 
2325   for (x = 0; x < len; x++) {
2326     src0 = __lsx_vld(src_yuy2, 0);
2327     vec_y = __lsx_vpickev_b(src0, src0);
2328     vec_vu = __lsx_vpickod_b(src0, src0);
2329     YUVTORGB(vec_y, vec_vu, vec_vrub, vec_vgug, vec_yg, vec_yb, out_b, out_g,
2330              out_r);
2331     STOREARGB(alpha, out_r, out_g, out_b, dst_argb);
2332     src_yuy2 += 16;
2333   }
2334 }
2335 
2336 void UYVYToARGBRow_LSX(const uint8_t* src_uyvy,
2337                        uint8_t* dst_argb,
2338                        const struct YuvConstants* yuvconstants,
2339                        int width) {
2340   int x;
2341   int len = width / 8;
2342   __m128i src0, vec_y, vec_vu;
2343   __m128i vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb;
2344   __m128i vec_vrub, vec_vgug;
2345   __m128i out_b, out_g, out_r;
2346   __m128i const_80 = __lsx_vldi(0x480);
2347   __m128i zero = __lsx_vldi(0);
2348   __m128i alpha = __lsx_vldi(0xFF);
2349 
2350   YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb);
2351   vec_vrub = __lsx_vilvl_h(vec_vr, vec_ub);
2352   vec_vgug = __lsx_vilvl_h(vec_vg, vec_ug);
2353 
2354   for (x = 0; x < len; x++) {
2355     src0 = __lsx_vld(src_uyvy, 0);
2356     vec_y = __lsx_vpickod_b(src0, src0);
2357     vec_vu = __lsx_vpickev_b(src0, src0);
2358     YUVTORGB(vec_y, vec_vu, vec_vrub, vec_vgug, vec_yg, vec_yb, out_b, out_g,
2359              out_r);
2360     STOREARGB(alpha, out_r, out_g, out_b, dst_argb);
2361     src_uyvy += 16;
2362   }
2363 }
2364 
2365 void InterpolateRow_LSX(uint8_t* dst_ptr,
2366                         const uint8_t* src_ptr,
2367                         ptrdiff_t src_stride,
2368                         int width,
2369                         int32_t source_y_fraction) {
2370   int x;
2371   int y1_fraction = source_y_fraction;
2372   int y0_fraction = 256 - y1_fraction;
2373   const uint8_t* nex_ptr = src_ptr + src_stride;
2374   uint16_t y_fractions;
2375   int len = width / 32;
2376   __m128i src0, src1, nex0, nex1;
2377   __m128i dst0, dst1, y_frac;
2378   __m128i tmp0, tmp1, tmp2, tmp3;
2379   __m128i const_128 = __lsx_vldi(0x480);
2380 
2381   if (y1_fraction == 0) {
2382     for (x = 0; x < len; x++) {
2383       DUP2_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src0, src1);
2384       __lsx_vst(src0, dst_ptr, 0);
2385       __lsx_vst(src1, dst_ptr, 16);
2386       src_ptr += 32;
2387       dst_ptr += 32;
2388     }
2389     return;
2390   }
2391 
2392   if (y1_fraction == 128) {
2393     for (x = 0; x < len; x++) {
2394       DUP2_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src0, src1);
2395       DUP2_ARG2(__lsx_vld, nex_ptr, 0, nex_ptr, 16, nex0, nex1);
2396       dst0 = __lsx_vavgr_bu(src0, nex0);
2397       dst1 = __lsx_vavgr_bu(src1, nex1);
2398       __lsx_vst(dst0, dst_ptr, 0);
2399       __lsx_vst(dst1, dst_ptr, 16);
2400       src_ptr += 32;
2401       nex_ptr += 32;
2402       dst_ptr += 32;
2403     }
2404     return;
2405   }
2406 
2407   y_fractions = (uint16_t)(y0_fraction + (y1_fraction << 8));
2408   y_frac = __lsx_vreplgr2vr_h(y_fractions);
2409 
2410   for (x = 0; x < len; x++) {
2411     DUP2_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src0, src1);
2412     DUP2_ARG2(__lsx_vld, nex_ptr, 0, nex_ptr, 16, nex0, nex1);
2413     tmp0 = __lsx_vilvl_b(nex0, src0);
2414     tmp1 = __lsx_vilvh_b(nex0, src0);
2415     tmp2 = __lsx_vilvl_b(nex1, src1);
2416     tmp3 = __lsx_vilvh_b(nex1, src1);
2417     tmp0 = __lsx_vdp2add_h_bu(const_128, tmp0, y_frac);
2418     tmp1 = __lsx_vdp2add_h_bu(const_128, tmp1, y_frac);
2419     tmp2 = __lsx_vdp2add_h_bu(const_128, tmp2, y_frac);
2420     tmp3 = __lsx_vdp2add_h_bu(const_128, tmp3, y_frac);
2421     dst0 = __lsx_vsrlni_b_h(tmp1, tmp0, 8);
2422     dst1 = __lsx_vsrlni_b_h(tmp3, tmp2, 8);
2423     __lsx_vst(dst0, dst_ptr, 0);
2424     __lsx_vst(dst1, dst_ptr, 16);
2425     src_ptr += 32;
2426     nex_ptr += 32;
2427     dst_ptr += 32;
2428   }
2429 }
2430 
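// InterpolateRow_LSX above blends two rows with 8-bit fixed-point weights:
// dst = (src * (256 - f) + next * f + 128) >> 8, with fast paths for f == 0
// (plain copy) and f == 128 (rounded average). The two weights are packed
// into one halfword so a single vdp2add multiply-accumulate handles both
// taps. Scalar sketch (hypothetical helper):
static inline uint8_t InterpolateByte_Sketch(uint8_t s, uint8_t t, int f) {
  return (uint8_t)((s * (256 - f) + t * f + 128) >> 8);
}
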
2431 void ARGBSetRow_LSX(uint8_t* dst_argb, uint32_t v32, int width) {
2432   int x;
2433   int len = width / 4;
2434   __m128i dst0 = __lsx_vreplgr2vr_w(v32);
2435 
2436   for (x = 0; x < len; x++) {
2437     __lsx_vst(dst0, dst_argb, 0);
2438     dst_argb += 16;
2439   }
2440 }
2441 
2442 void RAWToRGB24Row_LSX(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
2443   int x;
2444   int len = width / 16;
2445   __m128i src0, src1, src2;
2446   __m128i dst0, dst1, dst2;
2447   __m128i shuf0 = {0x0708030405000102, 0x110C0D0E090A0B06};
2448   __m128i shuf1 = {0x1516171213140F10, 0x1F1E1B1C1D18191A};
2449   __m128i shuf2 = {0x090405060102031E, 0x0D0E0F0A0B0C0708};
2450 
2451   for (x = 0; x < len; x++) {
2452     DUP2_ARG2(__lsx_vld, src_raw, 0, src_raw, 16, src0, src1);
2453     src2 = __lsx_vld(src_raw, 32);
2454     DUP2_ARG3(__lsx_vshuf_b, src1, src0, shuf0, src1, src0, shuf1, dst0, dst1);
2455     dst2 = __lsx_vshuf_b(src1, src2, shuf2);
2456     dst1 = __lsx_vinsgr2vr_b(dst1, src_raw[32], 0x0E);
2457     __lsx_vst(dst0, dst_rgb24, 0);
2458     __lsx_vst(dst1, dst_rgb24, 16);
2459     __lsx_vst(dst2, dst_rgb24, 32);
2460     dst_rgb24 += 48;
2461     src_raw += 48;
2462   }
2463 }
2464 
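// RAWToRGB24Row_LSX above swaps R and B within each 3-byte pixel via three
// vshuf_b controls spanning the 16-byte register boundaries; the single byte
// the second control cannot reach (src_raw[32], which lands at output byte
// 30) is patched in afterwards with __lsx_vinsgr2vr_b.
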
2465 void MergeUVRow_LSX(const uint8_t* src_u,
2466                     const uint8_t* src_v,
2467                     uint8_t* dst_uv,
2468                     int width) {
2469   int x;
2470   int len = width / 16;
2471   __m128i src0, src1, dst0, dst1;
2472 
2473   for (x = 0; x < len; x++) {
2474     DUP2_ARG2(__lsx_vld, src_u, 0, src_v, 0, src0, src1);
2475     dst0 = __lsx_vilvl_b(src1, src0);
2476     dst1 = __lsx_vilvh_b(src1, src0);
2477     __lsx_vst(dst0, dst_uv, 0);
2478     __lsx_vst(dst1, dst_uv, 16);
2479     src_u += 16;
2480     src_v += 16;
2481     dst_uv += 32;
2482   }
2483 }
2484 
2485 void ARGBExtractAlphaRow_LSX(const uint8_t* src_argb,
2486                              uint8_t* dst_a,
2487                              int width) {
2488   int x;
2489   int len = width / 16;
2490   __m128i src0, src1, src2, src3, tmp0, tmp1, dst0;
2491 
2492   for (x = 0; x < len; x++) {
2493     DUP4_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src_argb, 32, src_argb, 48,
2494               src0, src1, src2, src3);
2495     tmp0 = __lsx_vpickod_b(src1, src0);
2496     tmp1 = __lsx_vpickod_b(src3, src2);
2497     dst0 = __lsx_vpickod_b(tmp1, tmp0);
2498     __lsx_vst(dst0, dst_a, 0);
2499     src_argb += 64;
2500     dst_a += 16;
2501   }
2502 }
2503 
2504 void ARGBBlendRow_LSX(const uint8_t* src_argb,
2505                       const uint8_t* src_argb1,
2506                       uint8_t* dst_argb,
2507                       int width) {
2508   int x;
2509   int len = width / 8;
2510   __m128i src0, src1, src2, src3;
2511   __m128i tmp0, tmp1, dst0, dst1;
2512   __m128i reg0, reg1, reg2, reg3;
2513   __m128i a0, a1, a2, a3;
2514   __m128i const_256 = __lsx_vldi(0x500);
2515   __m128i zero = __lsx_vldi(0);
2516   __m128i alpha = __lsx_vldi(0xFF);
2517   __m128i control = (__m128i)v2u64{0xFF000000FF000000, 0xFF000000FF000000};
2518 
2519   for (x = 0; x < len; x++) {
2520     DUP4_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src_argb1, 0, src_argb1, 16,
2521               src0, src1, src2, src3);
2522     tmp0 = __lsx_vshuf4i_b(src0, 0xFF);
2523     tmp1 = __lsx_vshuf4i_b(src1, 0xFF);
2524     a0 = __lsx_vilvl_b(zero, tmp0);
2525     a1 = __lsx_vilvh_b(zero, tmp0);
2526     a2 = __lsx_vilvl_b(zero, tmp1);
2527     a3 = __lsx_vilvh_b(zero, tmp1);
2528     reg0 = __lsx_vilvl_b(zero, src2);
2529     reg1 = __lsx_vilvh_b(zero, src2);
2530     reg2 = __lsx_vilvl_b(zero, src3);
2531     reg3 = __lsx_vilvh_b(zero, src3);
2532     DUP4_ARG2(__lsx_vsub_h, const_256, a0, const_256, a1, const_256, a2,
2533               const_256, a3, a0, a1, a2, a3);
2534     DUP4_ARG2(__lsx_vmul_h, a0, reg0, a1, reg1, a2, reg2, a3, reg3, reg0, reg1,
2535               reg2, reg3);
2536     DUP2_ARG3(__lsx_vsrani_b_h, reg1, reg0, 8, reg3, reg2, 8, dst0, dst1);
2537     dst0 = __lsx_vsadd_bu(dst0, src0);
2538     dst1 = __lsx_vsadd_bu(dst1, src1);
2539     dst0 = __lsx_vbitsel_v(dst0, alpha, control);
2540     dst1 = __lsx_vbitsel_v(dst1, alpha, control);
2541     __lsx_vst(dst0, dst_argb, 0);
2542     __lsx_vst(dst1, dst_argb, 16);
2543     src_argb += 32;
2544     src_argb1 += 32;
2545     dst_argb += 32;
2546   }
2547 }
2548 
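// ARGBBlendRow_LSX above computes a standard "over" blend with 8.8
// fixed-point weights: dst = src0 + ((256 - a0) * src1 >> 8), saturated,
// then forces the destination alpha bytes to 0xFF through the vbitsel mask.
// Scalar sketch of one channel (hypothetical helper):
static inline uint8_t BlendChannel_Sketch(uint8_t fg, uint8_t bg, uint8_t a) {
  uint32_t v = fg + (((256 - a) * bg) >> 8);
  return (uint8_t)(v > 255 ? 255 : v);
}
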
2549 void ARGBQuantizeRow_LSX(uint8_t* dst_argb,
2550                          int scale,
2551                          int interval_size,
2552                          int interval_offset,
2553                          int width) {
2554   int x;
2555   int len = width / 16;
2556   __m128i src0, src1, src2, src3, dst0, dst1, dst2, dst3;
2557   __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2558   __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
2559   __m128i vec_size = __lsx_vreplgr2vr_b(interval_size);
2560   __m128i vec_offset = __lsx_vreplgr2vr_b(interval_offset);
2561   __m128i vec_scale = __lsx_vreplgr2vr_w(scale);
2562   __m128i zero = __lsx_vldi(0);
2563   __m128i control = (__m128i)v2u64{0xFF000000FF000000, 0xFF000000FF000000};
2564 
2565   for (x = 0; x < len; x++) {
2566     DUP4_ARG2(__lsx_vld, dst_argb, 0, dst_argb, 16, dst_argb, 32, dst_argb, 48,
2567               src0, src1, src2, src3);
2568     reg0 = __lsx_vilvl_b(zero, src0);
2569     reg1 = __lsx_vilvh_b(zero, src0);
2570     reg2 = __lsx_vilvl_b(zero, src1);
2571     reg3 = __lsx_vilvh_b(zero, src1);
2572     reg4 = __lsx_vilvl_b(zero, src2);
2573     reg5 = __lsx_vilvh_b(zero, src2);
2574     reg6 = __lsx_vilvl_b(zero, src3);
2575     reg7 = __lsx_vilvh_b(zero, src3);
2576     tmp0 = __lsx_vilvl_h(zero, reg0);
2577     tmp1 = __lsx_vilvh_h(zero, reg0);
2578     tmp2 = __lsx_vilvl_h(zero, reg1);
2579     tmp3 = __lsx_vilvh_h(zero, reg1);
2580     tmp4 = __lsx_vilvl_h(zero, reg2);
2581     tmp5 = __lsx_vilvh_h(zero, reg2);
2582     tmp6 = __lsx_vilvl_h(zero, reg3);
2583     tmp7 = __lsx_vilvh_h(zero, reg3);
2584     DUP4_ARG2(__lsx_vmul_w, tmp0, vec_scale, tmp1, vec_scale, tmp2, vec_scale,
2585               tmp3, vec_scale, tmp0, tmp1, tmp2, tmp3);
2586     DUP4_ARG2(__lsx_vmul_w, tmp4, vec_scale, tmp5, vec_scale, tmp6, vec_scale,
2587               tmp7, vec_scale, tmp4, tmp5, tmp6, tmp7);
2588     DUP4_ARG3(__lsx_vsrani_h_w, tmp1, tmp0, 16, tmp3, tmp2, 16, tmp5, tmp4, 16,
2589               tmp7, tmp6, 16, reg0, reg1, reg2, reg3);
2590     dst0 = __lsx_vpickev_b(reg1, reg0);
2591     dst1 = __lsx_vpickev_b(reg3, reg2);
2592     tmp0 = __lsx_vilvl_h(zero, reg4);
2593     tmp1 = __lsx_vilvh_h(zero, reg4);
2594     tmp2 = __lsx_vilvl_h(zero, reg5);
2595     tmp3 = __lsx_vilvh_h(zero, reg5);
2596     tmp4 = __lsx_vilvl_h(zero, reg6);
2597     tmp5 = __lsx_vilvh_h(zero, reg6);
2598     tmp6 = __lsx_vilvl_h(zero, reg7);
2599     tmp7 = __lsx_vilvh_h(zero, reg7);
2600     DUP4_ARG2(__lsx_vmul_w, tmp0, vec_scale, tmp1, vec_scale, tmp2, vec_scale,
2601               tmp3, vec_scale, tmp0, tmp1, tmp2, tmp3);
2602     DUP4_ARG2(__lsx_vmul_w, tmp4, vec_scale, tmp5, vec_scale, tmp6, vec_scale,
2603               tmp7, vec_scale, tmp4, tmp5, tmp6, tmp7);
2604     DUP4_ARG3(__lsx_vsrani_h_w, tmp1, tmp0, 16, tmp3, tmp2, 16, tmp5, tmp4, 16,
2605               tmp7, tmp6, 16, reg0, reg1, reg2, reg3);
2606     dst2 = __lsx_vpickev_b(reg1, reg0);
2607     dst3 = __lsx_vpickev_b(reg3, reg2);
2608     DUP4_ARG2(__lsx_vmul_b, dst0, vec_size, dst1, vec_size, dst2, vec_size,
2609               dst3, vec_size, dst0, dst1, dst2, dst3);
2610     DUP4_ARG2(__lsx_vadd_b, dst0, vec_offset, dst1, vec_offset, dst2,
2611               vec_offset, dst3, vec_offset, dst0, dst1, dst2, dst3);
2612     DUP4_ARG3(__lsx_vbitsel_v, dst0, src0, control, dst1, src1, control, dst2,
2613               src2, control, dst3, src3, control, dst0, dst1, dst2, dst3);
2614     __lsx_vst(dst0, dst_argb, 0);
2615     __lsx_vst(dst1, dst_argb, 16);
2616     __lsx_vst(dst2, dst_argb, 32);
2617     __lsx_vst(dst3, dst_argb, 48);
2618     dst_argb += 64;
2619   }
2620 }
2621 
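// A scalar model of ARGBQuantizeRow_LSX above; an illustrative sketch only
// (the helper name is ours, not upstream libyuv). Each color channel is
// mapped to an interval index by a 16.16 fixed-point multiply, then back to
// a representative value; the alpha byte is passed through unchanged by the
// vbitsel_v with the control mask.
static inline uint8_t ARGBQuantizeChannel_Reference(uint8_t v,
                                                    int scale,
                                                    int interval_size,
                                                    int interval_offset) {
  int idx = ((uint32_t)v * (uint32_t)scale) >> 16;  // vmul_w + vsrani_h_w 16
  return (uint8_t)(idx * interval_size + interval_offset);  // vmul_b + vadd_b
}
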
void ARGBColorMatrixRow_LSX(const uint8_t* src_argb,
                            uint8_t* dst_argb,
                            const int8_t* matrix_argb,
                            int width) {
  int x;
  int len = width / 8;
  __m128i src0, src1, tmp0, tmp1, dst0, dst1;
  __m128i tmp_b, tmp_g, tmp_r, tmp_a;
  __m128i reg_b, reg_g, reg_r, reg_a;
  __m128i matrix_b = __lsx_vldrepl_w(matrix_argb, 0);
  __m128i matrix_g = __lsx_vldrepl_w(matrix_argb, 4);
  __m128i matrix_r = __lsx_vldrepl_w(matrix_argb, 8);
  __m128i matrix_a = __lsx_vldrepl_w(matrix_argb, 12);

  for (x = 0; x < len; x++) {
    DUP2_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src0, src1);
    DUP4_ARG2(__lsx_vdp2_h_bu_b, src0, matrix_b, src0, matrix_g, src0, matrix_r,
              src0, matrix_a, tmp_b, tmp_g, tmp_r, tmp_a);
    DUP4_ARG2(__lsx_vdp2_h_bu_b, src1, matrix_b, src1, matrix_g, src1, matrix_r,
              src1, matrix_a, reg_b, reg_g, reg_r, reg_a);
    DUP4_ARG2(__lsx_vhaddw_w_h, tmp_b, tmp_b, tmp_g, tmp_g, tmp_r, tmp_r, tmp_a,
              tmp_a, tmp_b, tmp_g, tmp_r, tmp_a);
    DUP4_ARG2(__lsx_vhaddw_w_h, reg_b, reg_b, reg_g, reg_g, reg_r, reg_r, reg_a,
              reg_a, reg_b, reg_g, reg_r, reg_a);
    DUP4_ARG2(__lsx_vsrai_w, tmp_b, 6, tmp_g, 6, tmp_r, 6, tmp_a, 6, tmp_b,
              tmp_g, tmp_r, tmp_a);
    DUP4_ARG2(__lsx_vsrai_w, reg_b, 6, reg_g, 6, reg_r, 6, reg_a, 6, reg_b,
              reg_g, reg_r, reg_a);
    DUP4_ARG1(__lsx_vclip255_w, tmp_b, tmp_g, tmp_r, tmp_a, tmp_b, tmp_g, tmp_r,
              tmp_a);
    DUP4_ARG1(__lsx_vclip255_w, reg_b, reg_g, reg_r, reg_a, reg_b, reg_g, reg_r,
              reg_a);
    DUP4_ARG2(__lsx_vpickev_h, reg_b, tmp_b, reg_g, tmp_g, reg_r, tmp_r, reg_a,
              tmp_a, tmp_b, tmp_g, tmp_r, tmp_a);
    tmp0 = __lsx_vpackev_b(tmp_g, tmp_b);
    tmp1 = __lsx_vpackev_b(tmp_a, tmp_r);
    dst0 = __lsx_vilvl_h(tmp1, tmp0);
    dst1 = __lsx_vilvh_h(tmp1, tmp0);
    __lsx_vst(dst0, dst_argb, 0);
    __lsx_vst(dst1, dst_argb, 16);
    src_argb += 32;
    dst_argb += 32;
  }
}

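// A scalar model of ARGBColorMatrixRow_LSX above; an illustrative sketch
// only (the helper name is ours, not upstream libyuv). Each output channel
// is a signed 4-way dot product of the BGRA pixel with one 4-byte row of
// matrix_argb, scaled down by 64 and clamped to [0, 255].
static inline uint8_t ColorMatrixChannel_Reference(const uint8_t bgra[4],
                                                   const int8_t row[4]) {
  int sum = bgra[0] * row[0] + bgra[1] * row[1] +  // vdp2_h_bu_b pairs, then
            bgra[2] * row[2] + bgra[3] * row[3];   // vhaddw_w_h combines them
  sum >>= 6;                                       // vsrai_w
  if (sum < 0) sum = 0;                            // vclip255_w
  if (sum > 255) sum = 255;
  return (uint8_t)sum;
}
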
void SplitUVRow_LSX(const uint8_t* src_uv,
                    uint8_t* dst_u,
                    uint8_t* dst_v,
                    int width) {
  int x;
  int len = width / 32;
  __m128i src0, src1, src2, src3;
  __m128i dst0, dst1, dst2, dst3;

  for (x = 0; x < len; x++) {
    DUP4_ARG2(__lsx_vld, src_uv, 0, src_uv, 16, src_uv, 32, src_uv, 48, src0,
              src1, src2, src3);
    DUP2_ARG2(__lsx_vpickev_b, src1, src0, src3, src2, dst0, dst1);
    DUP2_ARG2(__lsx_vpickod_b, src1, src0, src3, src2, dst2, dst3);
    __lsx_vst(dst0, dst_u, 0);
    __lsx_vst(dst1, dst_u, 16);
    __lsx_vst(dst2, dst_v, 0);
    __lsx_vst(dst3, dst_v, 16);
    src_uv += 64;
    dst_u += 32;
    dst_v += 32;
  }
}

void SetRow_LSX(uint8_t* dst, uint8_t v8, int width) {
  int x;
  int len = width / 16;
  __m128i dst0 = __lsx_vreplgr2vr_b(v8);

  for (x = 0; x < len; x++) {
    __lsx_vst(dst0, dst, 0);
    dst += 16;
  }
}

void MirrorSplitUVRow_LSX(const uint8_t* src_uv,
                          uint8_t* dst_u,
                          uint8_t* dst_v,
                          int width) {
  int x;
  int len = width / 32;
  __m128i src0, src1, src2, src3;
  __m128i dst0, dst1, dst2, dst3;
  __m128i shuff0 = {0x10121416181A1C1E, 0x00020406080A0C0E};
  __m128i shuff1 = {0x11131517191B1D1F, 0x01030507090B0D0F};

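  // Note (our reading of the tables above): interleaved UV stores U at even
  // byte offsets and V at odd offsets. Across each pair of input vectors,
  // shuff0 gathers the even (U) bytes and shuff1 the odd (V) bytes, both in
  // reversed order, so each vshuf_b below performs the split and the mirror
  // in a single step.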
  src_uv += (width << 1);
  for (x = 0; x < len; x++) {
    src_uv -= 64;
    DUP4_ARG2(__lsx_vld, src_uv, 0, src_uv, 16, src_uv, 32, src_uv, 48, src2,
              src3, src0, src1);
    DUP4_ARG3(__lsx_vshuf_b, src1, src0, shuff1, src3, src2, shuff1, src1, src0,
              shuff0, src3, src2, shuff0, dst0, dst1, dst2, dst3);
    __lsx_vst(dst0, dst_v, 0);
    __lsx_vst(dst1, dst_v, 16);
    __lsx_vst(dst2, dst_u, 0);
    __lsx_vst(dst3, dst_u, 16);
    dst_u += 32;
    dst_v += 32;
  }
}

void HalfFloatRow_LSX(const uint16_t* src,
                      uint16_t* dst,
                      float scale,
                      int width) {
  int x;
  int len = width / 32;
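  // 1.9259299444e-34f is 2^-112. Multiplying by it drops the float exponent
  // by the float-to-half bias difference (127 - 15), so for in-range values
  // the float bit pattern shifted right by 13 (the mantissa-width difference,
  // 23 - 10) is exactly the IEEE half-float encoding that vsrli_w and
  // vpickev_h extract below.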
  float mult = 1.9259299444e-34f * scale;
  __m128i src0, src1, src2, src3, dst0, dst1, dst2, dst3;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  __m128 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
  __m128 vec_mult = (__m128)__lsx_vldrepl_w(&mult, 0);
  __m128i zero = __lsx_vldi(0);

  for (x = 0; x < len; x++) {
    DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2,
              src3);
    DUP4_ARG2(__lsx_vilvl_h, zero, src0, zero, src1, zero, src2, zero, src3,
              tmp0, tmp2, tmp4, tmp6);
    DUP4_ARG2(__lsx_vilvh_h, zero, src0, zero, src1, zero, src2, zero, src3,
              tmp1, tmp3, tmp5, tmp7);
    DUP4_ARG1(__lsx_vffint_s_wu, tmp0, tmp2, tmp4, tmp6, reg0, reg2, reg4,
              reg6);
    DUP4_ARG1(__lsx_vffint_s_wu, tmp1, tmp3, tmp5, tmp7, reg1, reg3, reg5,
              reg7);
    DUP4_ARG2(__lsx_vfmul_s, reg0, vec_mult, reg1, vec_mult, reg2, vec_mult,
              reg3, vec_mult, reg0, reg1, reg2, reg3);
    DUP4_ARG2(__lsx_vfmul_s, reg4, vec_mult, reg5, vec_mult, reg6, vec_mult,
              reg7, vec_mult, reg4, reg5, reg6, reg7);
    DUP4_ARG2(__lsx_vsrli_w, (v4u32)reg0, 13, (v4u32)reg1, 13, (v4u32)reg2, 13,
              (v4u32)reg3, 13, tmp0, tmp1, tmp2, tmp3);
    DUP4_ARG2(__lsx_vsrli_w, (v4u32)reg4, 13, (v4u32)reg5, 13, (v4u32)reg6, 13,
              (v4u32)reg7, 13, tmp4, tmp5, tmp6, tmp7);
    DUP4_ARG2(__lsx_vpickev_h, tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6,
              dst0, dst1, dst2, dst3);
    __lsx_vst(dst0, dst, 0);
    __lsx_vst(dst1, dst, 16);
    __lsx_vst(dst2, dst, 32);
    __lsx_vst(dst3, dst, 48);
    src += 32;
    dst += 32;
  }
}

struct RgbConstants {
  uint8_t kRGBToY[4];
  uint16_t kAddY;
  uint16_t pad;
};

// RGB to JPEG coefficients
// B * 0.1140 coefficient = 29
// G * 0.5870 coefficient = 150
// R * 0.2990 coefficient = 77
// Add 0.5 = 0x80
static const struct RgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0},
                                                        128,
                                                        0};

static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128, 0};

// RGB to BT.601 coefficients
// B * 0.1016 coefficient = 25
// G * 0.5078 coefficient = 129
// R * 0.2578 coefficient = 66
// Add 16.5 = 0x1080

static const struct RgbConstants kRgb24I601Constants = {{25, 129, 66, 0},
                                                        0x1080,
                                                        0};

static const struct RgbConstants kRawI601Constants = {{66, 129, 25, 0},
                                                      0x1080,
                                                      0};

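// A scalar model of the Y kernels below; an illustrative sketch only (the
// helper name is ours, not upstream libyuv). Y is an 8.8 fixed-point dot
// product of the three color channels with kRGBToY plus the kAddY
// rounding/bias term; the high byte of the 16-bit sum is the result.
static inline uint8_t RGBToY_Reference(uint8_t c0,
                                       uint8_t c1,
                                       uint8_t c2,
                                       const struct RgbConstants* k) {
  // For ARGB input c0/c1/c2 are B/G/R; ABGR reuses the same kernel with the
  // mirrored kRaw* constants instead.
  uint16_t y = (uint16_t)(c0 * k->kRGBToY[0] + c1 * k->kRGBToY[1] +
                          c2 * k->kRGBToY[2] + k->kAddY);
  return (uint8_t)(y >> 8);  // vpickod.b keeps the high byte of each sum
}
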
// ARGB expects the first 3 values to contain RGB and the 4th value to be
// ignored.
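// The loop below splits 16 BGRA pixels into a B/R byte-pair vector
// (vpickev.b) and a G/A byte-pair vector (vpickod.b); vmaddwev/vmaddwod then
// widen and multiply the even (B, G) and odd (R) byte lanes, accumulating
// 16-bit Y sums pre-seeded with the kAddY bias in $vr12/$vr13.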
static void ARGBToYMatrixRow_LSX(const uint8_t* src_argb,
                                 uint8_t* dst_y,
                                 int width,
                                 const struct RgbConstants* rgbconstants) {
  asm volatile(
      "vldrepl.b      $vr0,  %3,    0             \n\t"  // load rgbconstants
      "vldrepl.b      $vr1,  %3,    1             \n\t"  // load rgbconstants
      "vldrepl.b      $vr2,  %3,    2             \n\t"  // load rgbconstants
      "vldrepl.h      $vr3,  %3,    4             \n\t"  // load rgbconstants
      "1:                                         \n\t"
      "vld            $vr4,  %0,    0             \n\t"
      "vld            $vr5,  %0,    16            \n\t"
      "vld            $vr6,  %0,    32            \n\t"
      "vld            $vr7,  %0,    48            \n\t"  // load 16 pixels of
                                                         // ARGB
      "vor.v          $vr12, $vr3,  $vr3          \n\t"
      "vor.v          $vr13, $vr3,  $vr3          \n\t"
      "addi.d         %2,    %2,    -16           \n\t"  // 16 processed per
                                                         // loop.
      "vpickev.b      $vr8,  $vr5,  $vr4          \n\t"  // BR
      "vpickev.b      $vr10, $vr7,  $vr6          \n\t"
      "vpickod.b      $vr9,  $vr5,  $vr4          \n\t"  // GA
      "vpickod.b      $vr11, $vr7,  $vr6          \n\t"
      "vmaddwev.h.bu  $vr12, $vr8,  $vr0          \n\t"  // B
      "vmaddwev.h.bu  $vr13, $vr10, $vr0          \n\t"
      "vmaddwev.h.bu  $vr12, $vr9,  $vr1          \n\t"  // G
      "vmaddwev.h.bu  $vr13, $vr11, $vr1          \n\t"
      "vmaddwod.h.bu  $vr12, $vr8,  $vr2          \n\t"  // R
      "vmaddwod.h.bu  $vr13, $vr10, $vr2          \n\t"
      "addi.d         %0,    %0,    64            \n\t"
      "vpickod.b      $vr10, $vr13, $vr12         \n\t"
      "vst            $vr10, %1,    0             \n\t"
      "addi.d         %1,    %1,    16            \n\t"
      "bnez           %2,    1b                   \n\t"
      : "+&r"(src_argb),  // %0
        "+&r"(dst_y),     // %1
        "+&r"(width)      // %2
      : "r"(rgbconstants)
      : "memory");
}

void ARGBToYRow_LSX(const uint8_t* src_argb, uint8_t* dst_y, int width) {
  ARGBToYMatrixRow_LSX(src_argb, dst_y, width, &kRgb24I601Constants);
}

void ARGBToYJRow_LSX(const uint8_t* src_argb, uint8_t* dst_yj, int width) {
  ARGBToYMatrixRow_LSX(src_argb, dst_yj, width, &kRgb24JPEGConstants);
}

void ABGRToYRow_LSX(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
  ARGBToYMatrixRow_LSX(src_abgr, dst_y, width, &kRawI601Constants);
}

void ABGRToYJRow_LSX(const uint8_t* src_abgr, uint8_t* dst_yj, int width) {
  ARGBToYMatrixRow_LSX(src_abgr, dst_yj, width, &kRawJPEGConstants);
}

// RGBA expects the first value to be A (ignored), followed by 3 values
// containing RGB.
// Same code as ARGB, except the picked byte lanes hold different channels
// (A/G and B/R).
static void RGBAToYMatrixRow_LSX(const uint8_t* src_rgba,
                                 uint8_t* dst_y,
                                 int width,
                                 const struct RgbConstants* rgbconstants) {
  asm volatile(
      "vldrepl.b      $vr0,  %3,    0             \n\t"  // load rgbconstants
      "vldrepl.b      $vr1,  %3,    1             \n\t"  // load rgbconstants
      "vldrepl.b      $vr2,  %3,    2             \n\t"  // load rgbconstants
      "vldrepl.h      $vr3,  %3,    4             \n\t"  // load rgbconstants
      "1:                                         \n\t"
      "vld            $vr4,  %0,    0             \n\t"
      "vld            $vr5,  %0,    16            \n\t"
      "vld            $vr6,  %0,    32            \n\t"
      "vld            $vr7,  %0,    48            \n\t"  // load 16 pixels of
                                                         // RGBA
      "vor.v          $vr12, $vr3,  $vr3          \n\t"
      "vor.v          $vr13, $vr3,  $vr3          \n\t"
      "addi.d         %2,    %2,    -16           \n\t"  // 16 processed per
                                                         // loop.
      "vpickev.b      $vr8,  $vr5,  $vr4          \n\t"  // AG
      "vpickev.b      $vr10, $vr7,  $vr6          \n\t"
      "vpickod.b      $vr9,  $vr5,  $vr4          \n\t"  // BR
      "vpickod.b      $vr11, $vr7,  $vr6          \n\t"
      "vmaddwev.h.bu  $vr12, $vr9,  $vr0          \n\t"  // B
      "vmaddwev.h.bu  $vr13, $vr11, $vr0          \n\t"
      "vmaddwod.h.bu  $vr12, $vr8,  $vr1          \n\t"  // G
      "vmaddwod.h.bu  $vr13, $vr10, $vr1          \n\t"
      "vmaddwod.h.bu  $vr12, $vr9,  $vr2          \n\t"  // R
      "vmaddwod.h.bu  $vr13, $vr11, $vr2          \n\t"
      "addi.d         %0,    %0,    64            \n\t"
      "vpickod.b      $vr10, $vr13, $vr12         \n\t"
      "vst            $vr10, %1,    0             \n\t"
      "addi.d         %1,    %1,    16            \n\t"
      "bnez           %2,    1b                   \n\t"
      : "+&r"(src_rgba),  // %0
        "+&r"(dst_y),     // %1
        "+&r"(width)      // %2
      : "r"(rgbconstants)
      : "memory");
}

void RGBAToYRow_LSX(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
  RGBAToYMatrixRow_LSX(src_rgba, dst_y, width, &kRgb24I601Constants);
}

void RGBAToYJRow_LSX(const uint8_t* src_rgba, uint8_t* dst_yj, int width) {
  RGBAToYMatrixRow_LSX(src_rgba, dst_yj, width, &kRgb24JPEGConstants);
}

void BGRAToYRow_LSX(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
  RGBAToYMatrixRow_LSX(src_bgra, dst_y, width, &kRawI601Constants);
}

static void RGBToYMatrixRow_LSX(const uint8_t* src_rgba,
                                uint8_t* dst_y,
                                int width,
                                const struct RgbConstants* rgbconstants) {
  int8_t shuff[64] = {0,  2,  3,  5,  6,  8,  9,  11, 12, 14, 15, 17, 18,
                      20, 21, 23, 24, 26, 27, 29, 30, 0,  1,  3,  4,  6,
                      7,  9,  10, 12, 13, 15, 1,  0,  4,  0,  7,  0,  10,
                      0,  13, 0,  16, 0,  19, 0,  22, 0,  25, 0,  28, 0,
                      31, 0,  2,  0,  5,  0,  8,  0,  11, 0,  14, 0};
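  // Note (our reading of the tables above): shuff[0..31] gathers the B and R
  // bytes of each packed RGB24 pixel into even/odd byte pairs, and
  // shuff[32..63] places each G byte in an even lane, so the vshuf.b steps
  // below let the same vmaddwev/vmaddwod kernel as the ARGB path consume
  // 24-bit input.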
  asm volatile(
      "vldrepl.b      $vr0,  %3,    0             \n\t"  // load rgbconstants
      "vldrepl.b      $vr1,  %3,    1             \n\t"  // load rgbconstants
      "vldrepl.b      $vr2,  %3,    2             \n\t"  // load rgbconstants
      "vldrepl.h      $vr3,  %3,    4             \n\t"  // load rgbconstants
      "vld            $vr4,  %4,    0             \n\t"  // load shuff
      "vld            $vr5,  %4,    16            \n\t"
      "vld            $vr6,  %4,    32            \n\t"
      "vld            $vr7,  %4,    48            \n\t"
      "1:                                         \n\t"
      "vld            $vr8,  %0,    0             \n\t"
      "vld            $vr9,  %0,    16            \n\t"
      "vld            $vr10, %0,    32            \n\t"  // load 16 pixels of
                                                         // RGB
      "vor.v          $vr12, $vr3,  $vr3          \n\t"
      "vor.v          $vr13, $vr3,  $vr3          \n\t"
      "addi.d         %2,    %2,    -16           \n\t"  // 16 processed per
                                                         // loop.
      "vshuf.b        $vr14, $vr9,  $vr8,  $vr4   \n\t"
      "vshuf.b        $vr15, $vr9,  $vr10, $vr5   \n\t"
      "vshuf.b        $vr16, $vr9,  $vr8,  $vr6   \n\t"
      "vshuf.b        $vr17, $vr9,  $vr10, $vr7   \n\t"
      "vmaddwev.h.bu  $vr12, $vr16, $vr1          \n\t"  // G
      "vmaddwev.h.bu  $vr13, $vr17, $vr1          \n\t"
      "vmaddwev.h.bu  $vr12, $vr14, $vr0          \n\t"  // B
      "vmaddwev.h.bu  $vr13, $vr15, $vr0          \n\t"
      "vmaddwod.h.bu  $vr12, $vr14, $vr2          \n\t"  // R
      "vmaddwod.h.bu  $vr13, $vr15, $vr2          \n\t"
      "addi.d         %0,    %0,    48            \n\t"
      "vpickod.b      $vr10, $vr13, $vr12         \n\t"
      "vst            $vr10, %1,    0             \n\t"
      "addi.d         %1,    %1,    16            \n\t"
      "bnez           %2,    1b                   \n\t"
      : "+&r"(src_rgba),    // %0
        "+&r"(dst_y),       // %1
        "+&r"(width)        // %2
      : "r"(rgbconstants),  // %3
        "r"(shuff)          // %4
      : "memory");
}

void RGB24ToYJRow_LSX(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {
  RGBToYMatrixRow_LSX(src_rgb24, dst_yj, width, &kRgb24JPEGConstants);
}

void RAWToYJRow_LSX(const uint8_t* src_raw, uint8_t* dst_yj, int width) {
  RGBToYMatrixRow_LSX(src_raw, dst_yj, width, &kRawJPEGConstants);
}

void RGB24ToYRow_LSX(const uint8_t* src_rgb24, uint8_t* dst_y, int width) {
  RGBToYMatrixRow_LSX(src_rgb24, dst_y, width, &kRgb24I601Constants);
}

void RAWToYRow_LSX(const uint8_t* src_raw, uint8_t* dst_y, int width) {
  RGBToYMatrixRow_LSX(src_raw, dst_y, width, &kRawI601Constants);
}

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif

#endif  // !defined(LIBYUV_DISABLE_LSX) && defined(__loongarch_sx)