1 /*
2 * Copyright 2022 The LibYuv Project Authors. All rights reserved.
3 *
4 * Copyright (c) 2022 Loongson Technology Corporation Limited
5 *
6 * Use of this source code is governed by a BSD-style license
7 * that can be found in the LICENSE file in the root of the source
8 * tree. An additional intellectual property rights grant can be found
9 * in the file PATENTS. All contributing project authors may
10 * be found in the AUTHORS file in the root of the source tree.
11 */
12
13 #include "libyuv/row.h"
14
15 #if !defined(LIBYUV_DISABLE_LSX) && defined(__loongarch_sx)
16 #include "libyuv/loongson_intrinsics.h"
17
18 #ifdef __cplusplus
19 namespace libyuv {
20 extern "C" {
21 #endif
22
23 // Fill YUV -> RGB conversion constants into vectors
24 #define YUVTORGB_SETUP(yuvconst, vr, ub, vg, ug, yg, yb) \
25 { \
26 ub = __lsx_vreplgr2vr_h(yuvconst->kUVToB[0]); \
27 vr = __lsx_vreplgr2vr_h(yuvconst->kUVToR[1]); \
28 ug = __lsx_vreplgr2vr_h(yuvconst->kUVToG[0]); \
29 vg = __lsx_vreplgr2vr_h(yuvconst->kUVToG[1]); \
30 yg = __lsx_vreplgr2vr_h(yuvconst->kYToRgb[0]); \
31 yb = __lsx_vreplgr2vr_w(yuvconst->kYBiasToRgb[0]); \
32 }
33
34 // Load YUV422 data for 16 pixels (16 Y, 8 U, 8 V)
35 #define READYUV422_D(psrc_y, psrc_u, psrc_v, out_y, uv_l, uv_h) \
36 { \
37 __m128i temp0, temp1; \
38 \
39 DUP2_ARG2(__lsx_vld, psrc_y, 0, psrc_u, 0, out_y, temp0); \
40 temp1 = __lsx_vld(psrc_v, 0); \
41 temp0 = __lsx_vsub_b(temp0, const_80); \
42 temp1 = __lsx_vsub_b(temp1, const_80); \
43 temp0 = __lsx_vsllwil_h_b(temp0, 0); \
44 temp1 = __lsx_vsllwil_h_b(temp1, 0); \
45 uv_l = __lsx_vilvl_h(temp0, temp1); \
46 uv_h = __lsx_vilvh_h(temp0, temp1); \
47 }
48
49 // Load YUV422 data for 8 pixels (8 Y, 4 U, 4 V)
50 #define READYUV422(psrc_y, psrc_u, psrc_v, out_y, uv) \
51 { \
52 __m128i temp0, temp1; \
53 \
54 out_y = __lsx_vld(psrc_y, 0); \
55 temp0 = __lsx_vldrepl_d(psrc_u, 0); \
56 temp1 = __lsx_vldrepl_d(psrc_v, 0); \
57 uv = __lsx_vilvl_b(temp0, temp1); \
58 uv = __lsx_vsub_b(uv, const_80); \
59 uv = __lsx_vsllwil_h_b(uv, 0); \
60 }
61
62 // Convert 16 pixels of YUV420 to RGB.
63 #define YUVTORGB_D(in_y, in_uvl, in_uvh, ubvr, ugvg, yg, yb, b_l, b_h, g_l, \
64 g_h, r_l, r_h) \
65 { \
66 __m128i u_l, u_h, v_l, v_h; \
67 __m128i yl_ev, yl_od, yh_ev, yh_od; \
68 __m128i temp0, temp1, temp2, temp3; \
69 \
70 temp0 = __lsx_vilvl_b(in_y, in_y); \
71 temp1 = __lsx_vilvh_b(in_y, in_y); \
72 yl_ev = __lsx_vmulwev_w_hu_h(temp0, yg); \
73 yl_od = __lsx_vmulwod_w_hu_h(temp0, yg); \
74 yh_ev = __lsx_vmulwev_w_hu_h(temp1, yg); \
75 yh_od = __lsx_vmulwod_w_hu_h(temp1, yg); \
76 DUP4_ARG2(__lsx_vsrai_w, yl_ev, 16, yl_od, 16, yh_ev, 16, yh_od, 16, \
77 yl_ev, yl_od, yh_ev, yh_od); \
78 yl_ev = __lsx_vadd_w(yl_ev, yb); \
79 yl_od = __lsx_vadd_w(yl_od, yb); \
80 yh_ev = __lsx_vadd_w(yh_ev, yb); \
81 yh_od = __lsx_vadd_w(yh_od, yb); \
82 v_l = __lsx_vmulwev_w_h(in_uvl, ubvr); \
83 u_l = __lsx_vmulwod_w_h(in_uvl, ubvr); \
84 v_h = __lsx_vmulwev_w_h(in_uvh, ubvr); \
85 u_h = __lsx_vmulwod_w_h(in_uvh, ubvr); \
86 temp0 = __lsx_vadd_w(yl_ev, u_l); \
87 temp1 = __lsx_vadd_w(yl_od, u_l); \
88 temp2 = __lsx_vadd_w(yh_ev, u_h); \
89 temp3 = __lsx_vadd_w(yh_od, u_h); \
90 DUP4_ARG2(__lsx_vsrai_w, temp0, 6, temp1, 6, temp2, 6, temp3, 6, temp0, \
91 temp1, temp2, temp3); \
92 DUP4_ARG1(__lsx_vclip255_w, temp0, temp1, temp2, temp3, temp0, temp1, \
93 temp2, temp3); \
94 b_l = __lsx_vpackev_h(temp1, temp0); \
95 b_h = __lsx_vpackev_h(temp3, temp2); \
96 temp0 = __lsx_vadd_w(yl_ev, v_l); \
97 temp1 = __lsx_vadd_w(yl_od, v_l); \
98 temp2 = __lsx_vadd_w(yh_ev, v_h); \
99 temp3 = __lsx_vadd_w(yh_od, v_h); \
100 DUP4_ARG2(__lsx_vsrai_w, temp0, 6, temp1, 6, temp2, 6, temp3, 6, temp0, \
101 temp1, temp2, temp3); \
102 DUP4_ARG1(__lsx_vclip255_w, temp0, temp1, temp2, temp3, temp0, temp1, \
103 temp2, temp3); \
104 r_l = __lsx_vpackev_h(temp1, temp0); \
105 r_h = __lsx_vpackev_h(temp3, temp2); \
106 DUP2_ARG2(__lsx_vdp2_w_h, in_uvl, ugvg, in_uvh, ugvg, u_l, u_h); \
107 temp0 = __lsx_vsub_w(yl_ev, u_l); \
108 temp1 = __lsx_vsub_w(yl_od, u_l); \
109 temp2 = __lsx_vsub_w(yh_ev, u_h); \
110 temp3 = __lsx_vsub_w(yh_od, u_h); \
111 DUP4_ARG2(__lsx_vsrai_w, temp0, 6, temp1, 6, temp2, 6, temp3, 6, temp0, \
112 temp1, temp2, temp3); \
113 DUP4_ARG1(__lsx_vclip255_w, temp0, temp1, temp2, temp3, temp0, temp1, \
114 temp2, temp3); \
115 g_l = __lsx_vpackev_h(temp1, temp0); \
116 g_h = __lsx_vpackev_h(temp3, temp2); \
117 }
118
119 // Convert 8 pixels of YUV420 to RGB.
120 #define YUVTORGB(in_y, in_vu, vrub, vgug, yg, yb, out_b, out_g, out_r) \
121 { \
122 __m128i y_ev, y_od, u_l, v_l; \
123 __m128i tmp0, tmp1, tmp2, tmp3; \
124 \
125 tmp0 = __lsx_vilvl_b(in_y, in_y); \
126 y_ev = __lsx_vmulwev_w_hu_h(tmp0, yg); \
127 y_od = __lsx_vmulwod_w_hu_h(tmp0, yg); \
128 y_ev = __lsx_vsrai_w(y_ev, 16); \
129 y_od = __lsx_vsrai_w(y_od, 16); \
130 y_ev = __lsx_vadd_w(y_ev, yb); \
131 y_od = __lsx_vadd_w(y_od, yb); \
132 in_vu = __lsx_vilvl_b(zero, in_vu); \
133 in_vu = __lsx_vsub_h(in_vu, const_80); \
134 u_l = __lsx_vmulwev_w_h(in_vu, vrub); \
135 v_l = __lsx_vmulwod_w_h(in_vu, vrub); \
136 tmp0 = __lsx_vadd_w(y_ev, u_l); \
137 tmp1 = __lsx_vadd_w(y_od, u_l); \
138 tmp2 = __lsx_vadd_w(y_ev, v_l); \
139 tmp3 = __lsx_vadd_w(y_od, v_l); \
140 tmp0 = __lsx_vsrai_w(tmp0, 6); \
141 tmp1 = __lsx_vsrai_w(tmp1, 6); \
142 tmp2 = __lsx_vsrai_w(tmp2, 6); \
143 tmp3 = __lsx_vsrai_w(tmp3, 6); \
144 tmp0 = __lsx_vclip255_w(tmp0); \
145 tmp1 = __lsx_vclip255_w(tmp1); \
146 tmp2 = __lsx_vclip255_w(tmp2); \
147 tmp3 = __lsx_vclip255_w(tmp3); \
148 out_b = __lsx_vpackev_h(tmp1, tmp0); \
149 out_r = __lsx_vpackev_h(tmp3, tmp2); \
150 tmp0 = __lsx_vdp2_w_h(in_vu, vgug); \
151 tmp1 = __lsx_vsub_w(y_ev, tmp0); \
152 tmp2 = __lsx_vsub_w(y_od, tmp0); \
153 tmp1 = __lsx_vsrai_w(tmp1, 6); \
154 tmp2 = __lsx_vsrai_w(tmp2, 6); \
155 tmp1 = __lsx_vclip255_w(tmp1); \
156 tmp2 = __lsx_vclip255_w(tmp2); \
157 out_g = __lsx_vpackev_h(tmp2, tmp1); \
158 }
159
160 // Convert 8 pixels of I444 to RGB.
161 #define I444TORGB(in_yy, in_u, in_v, ub, vr, ugvg, yg, yb, out_b, out_g, \
162 out_r) \
163 { \
164 __m128i y_ev, y_od, u_ev, v_ev, u_od, v_od; \
165 __m128i tmp0, tmp1, tmp2, tmp3; \
166 \
167 y_ev = __lsx_vmulwev_w_hu_h(in_yy, yg); \
168 y_od = __lsx_vmulwod_w_hu_h(in_yy, yg); \
169 y_ev = __lsx_vsrai_w(y_ev, 16); \
170 y_od = __lsx_vsrai_w(y_od, 16); \
171 y_ev = __lsx_vadd_w(y_ev, yb); \
172 y_od = __lsx_vadd_w(y_od, yb); \
173 in_u = __lsx_vsub_h(in_u, const_80); \
174 in_v = __lsx_vsub_h(in_v, const_80); \
175 u_ev = __lsx_vmulwev_w_h(in_u, ub); \
176 u_od = __lsx_vmulwod_w_h(in_u, ub); \
177 v_ev = __lsx_vmulwev_w_h(in_v, vr); \
178 v_od = __lsx_vmulwod_w_h(in_v, vr); \
179 tmp0 = __lsx_vadd_w(y_ev, u_ev); \
180 tmp1 = __lsx_vadd_w(y_od, u_od); \
181 tmp2 = __lsx_vadd_w(y_ev, v_ev); \
182 tmp3 = __lsx_vadd_w(y_od, v_od); \
183 tmp0 = __lsx_vsrai_w(tmp0, 6); \
184 tmp1 = __lsx_vsrai_w(tmp1, 6); \
185 tmp2 = __lsx_vsrai_w(tmp2, 6); \
186 tmp3 = __lsx_vsrai_w(tmp3, 6); \
187 tmp0 = __lsx_vclip255_w(tmp0); \
188 tmp1 = __lsx_vclip255_w(tmp1); \
189 tmp2 = __lsx_vclip255_w(tmp2); \
190 tmp3 = __lsx_vclip255_w(tmp3); \
191 out_b = __lsx_vpackev_h(tmp1, tmp0); \
192 out_r = __lsx_vpackev_h(tmp3, tmp2); \
193 u_ev = __lsx_vpackev_h(in_u, in_v); \
194 u_od = __lsx_vpackod_h(in_u, in_v); \
195 v_ev = __lsx_vdp2_w_h(u_ev, ugvg); \
196 v_od = __lsx_vdp2_w_h(u_od, ugvg); \
197 tmp0 = __lsx_vsub_w(y_ev, v_ev); \
198 tmp1 = __lsx_vsub_w(y_od, v_od); \
199 tmp0 = __lsx_vsrai_w(tmp0, 6); \
200 tmp1 = __lsx_vsrai_w(tmp1, 6); \
201 tmp0 = __lsx_vclip255_w(tmp0); \
202 tmp1 = __lsx_vclip255_w(tmp1); \
203 out_g = __lsx_vpackev_h(tmp1, tmp0); \
204 }
205
206 // Pack and Store 16 ARGB values.
207 #define STOREARGB_D(a_l, a_h, r_l, r_h, g_l, g_h, b_l, b_h, pdst_argb) \
208 { \
209 __m128i temp0, temp1, temp2, temp3; \
210 temp0 = __lsx_vpackev_b(g_l, b_l); \
211 temp1 = __lsx_vpackev_b(a_l, r_l); \
212 temp2 = __lsx_vpackev_b(g_h, b_h); \
213 temp3 = __lsx_vpackev_b(a_h, r_h); \
214 r_l = __lsx_vilvl_h(temp1, temp0); \
215 r_h = __lsx_vilvh_h(temp1, temp0); \
216 g_l = __lsx_vilvl_h(temp3, temp2); \
217 g_h = __lsx_vilvh_h(temp3, temp2); \
218 __lsx_vst(r_l, pdst_argb, 0); \
219 __lsx_vst(r_h, pdst_argb, 16); \
220 __lsx_vst(g_l, pdst_argb, 32); \
221 __lsx_vst(g_h, pdst_argb, 48); \
222 pdst_argb += 64; \
223 }
224
225 // Pack and Store 8 ARGB values.
226 #define STOREARGB(in_a, in_r, in_g, in_b, pdst_argb) \
227 { \
228 __m128i temp0, temp1; \
229 __m128i dst0, dst1; \
230 \
231 temp0 = __lsx_vpackev_b(in_g, in_b); \
232 temp1 = __lsx_vpackev_b(in_a, in_r); \
233 dst0 = __lsx_vilvl_h(temp1, temp0); \
234 dst1 = __lsx_vilvh_h(temp1, temp0); \
235 __lsx_vst(dst0, pdst_argb, 0); \
236 __lsx_vst(dst1, pdst_argb, 16); \
237 pdst_argb += 32; \
238 }
239
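// Average a 2x2 block of B/G/R from two rows and convert to 8 U and 8 V
// bytes (U in the low half of _dst0, V in the high half). Relies on
// const_112/74/38/94/18 and const_8080 being defined by the caller.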
240 #define RGBTOUV(_tmpb, _tmpg, _tmpr, _nexb, _nexg, _nexr, _dst0) \
241 { \
242 __m128i _tmp0, _tmp1, _tmp2, _tmp3; \
243 __m128i _reg0, _reg1; \
244 _tmp0 = __lsx_vaddwev_h_bu(_tmpb, _nexb); \
245 _tmp1 = __lsx_vaddwod_h_bu(_tmpb, _nexb); \
246 _tmp2 = __lsx_vaddwev_h_bu(_tmpg, _nexg); \
247 _tmp3 = __lsx_vaddwod_h_bu(_tmpg, _nexg); \
248 _reg0 = __lsx_vaddwev_h_bu(_tmpr, _nexr); \
249 _reg1 = __lsx_vaddwod_h_bu(_tmpr, _nexr); \
250 _tmpb = __lsx_vavgr_hu(_tmp0, _tmp1); \
251 _tmpg = __lsx_vavgr_hu(_tmp2, _tmp3); \
252 _tmpr = __lsx_vavgr_hu(_reg0, _reg1); \
253 _reg0 = __lsx_vmadd_h(const_8080, const_112, _tmpb); \
254 _reg1 = __lsx_vmadd_h(const_8080, const_112, _tmpr); \
255 _reg0 = __lsx_vmsub_h(_reg0, const_74, _tmpg); \
256 _reg1 = __lsx_vmsub_h(_reg1, const_94, _tmpg); \
257 _reg0 = __lsx_vmsub_h(_reg0, const_38, _tmpr); \
258 _reg1 = __lsx_vmsub_h(_reg1, const_18, _tmpb); \
259 _dst0 = __lsx_vpickod_b(_reg1, _reg0); \
260 }
261
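// Mirror a row of bytes: 32 bytes per loop, reversing each 16-byte vector
// and swapping the store order.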
262 void MirrorRow_LSX(const uint8_t* src, uint8_t* dst, int width) {
263 int x;
264 int len = width / 32;
265 __m128i src0, src1;
266 __m128i shuffler = {0x08090A0B0C0D0E0F, 0x0001020304050607};
267 src += width - 32;
268 for (x = 0; x < len; x++) {
269 DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1);
270 DUP2_ARG3(__lsx_vshuf_b, src0, src0, shuffler, src1, src1, shuffler, src0,
271 src1);
272 __lsx_vst(src1, dst, 0);
273 __lsx_vst(src0, dst, 16);
274 dst += 32;
275 src -= 32;
276 }
277 }
278
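// Mirror a row of interleaved UV: 8 U/V pairs (16 bytes) per loop, keeping
// each pair intact.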
279 void MirrorUVRow_LSX(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
280 int x;
281 int len = width / 8;
282 __m128i src, dst;
283 __m128i shuffler = {0x0004000500060007, 0x0000000100020003};
284
285 src_uv += (width - 8) << 1;
286 for (x = 0; x < len; x++) {
287 src = __lsx_vld(src_uv, 0);
288 dst = __lsx_vshuf_h(shuffler, src, src);
289 __lsx_vst(dst, dst_uv, 0);
290 src_uv -= 16;
291 dst_uv += 16;
292 }
293 }
294
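// Mirror a row of ARGB: 8 pixels (32 bytes) per loop, preserving the byte
// order within each pixel.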
295 void ARGBMirrorRow_LSX(const uint8_t* src, uint8_t* dst, int width) {
296 int x;
297 int len = width / 8;
298 __m128i src0, src1;
299 __m128i shuffler = {0x0B0A09080F0E0D0C, 0x0302010007060504};
300
301 src += (width * 4) - 32;
302 for (x = 0; x < len; x++) {
303 DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1);
304 DUP2_ARG3(__lsx_vshuf_b, src0, src0, shuffler, src1, src1, shuffler, src0,
305 src1);
306 __lsx_vst(src1, dst, 0);
307 __lsx_vst(src0, dst, 16);
308 dst += 32;
309 src -= 32;
310 }
311 }
312
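// Interleave I422 planes into YUY2 (Y0 U0 Y1 V0), 16 pixels per loop.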
313 void I422ToYUY2Row_LSX(const uint8_t* src_y,
314 const uint8_t* src_u,
315 const uint8_t* src_v,
316 uint8_t* dst_yuy2,
317 int width) {
318 int x;
319 int len = width / 16;
320 __m128i src_u0, src_v0, src_y0, vec_uv0;
321 __m128i vec_yuy2_0, vec_yuy2_1;
322
323 for (x = 0; x < len; x++) {
324 DUP2_ARG2(__lsx_vld, src_u, 0, src_v, 0, src_u0, src_v0);
325 src_y0 = __lsx_vld(src_y, 0);
326 vec_uv0 = __lsx_vilvl_b(src_v0, src_u0);
327 vec_yuy2_0 = __lsx_vilvl_b(vec_uv0, src_y0);
328 vec_yuy2_1 = __lsx_vilvh_b(vec_uv0, src_y0);
329 __lsx_vst(vec_yuy2_0, dst_yuy2, 0);
330 __lsx_vst(vec_yuy2_1, dst_yuy2, 16);
331 src_u += 8;
332 src_v += 8;
333 src_y += 16;
334 dst_yuy2 += 32;
335 }
336 }
337
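// Interleave I422 planes into UYVY (U0 Y0 V0 Y1), 16 pixels per loop.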
338 void I422ToUYVYRow_LSX(const uint8_t* src_y,
339 const uint8_t* src_u,
340 const uint8_t* src_v,
341 uint8_t* dst_uyvy,
342 int width) {
343 int x;
344 int len = width / 16;
345 __m128i src_u0, src_v0, src_y0, vec_uv0;
346 __m128i vec_uyvy0, vec_uyvy1;
347
348 for (x = 0; x < len; x++) {
349 DUP2_ARG2(__lsx_vld, src_u, 0, src_v, 0, src_u0, src_v0);
350 src_y0 = __lsx_vld(src_y, 0);
351 vec_uv0 = __lsx_vilvl_b(src_v0, src_u0);
352 vec_uyvy0 = __lsx_vilvl_b(src_y0, vec_uv0);
353 vec_uyvy1 = __lsx_vilvh_b(src_y0, vec_uv0);
354 __lsx_vst(vec_uyvy0, dst_uyvy, 0);
355 __lsx_vst(vec_uyvy1, dst_uyvy, 16);
356 src_u += 8;
357 src_v += 8;
358 src_y += 16;
359 dst_uyvy += 32;
360 }
361 }
362
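// Convert 16 I422 pixels per loop to ARGB with alpha forced to 0xFF.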
363 void I422ToARGBRow_LSX(const uint8_t* src_y,
364 const uint8_t* src_u,
365 const uint8_t* src_v,
366 uint8_t* dst_argb,
367 const struct YuvConstants* yuvconstants,
368 int width) {
369 int x;
370 int len = width / 16;
371 __m128i vec_yb, vec_yg, vec_ub, vec_ug, vec_vr, vec_vg;
372 __m128i vec_ubvr, vec_ugvg;
373 __m128i alpha = __lsx_vldi(0xFF);
374 __m128i const_80 = __lsx_vldi(0x80);
375
376 YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb);
377 vec_ubvr = __lsx_vilvl_h(vec_ub, vec_vr);
378 vec_ugvg = __lsx_vilvl_h(vec_ug, vec_vg);
379
380 for (x = 0; x < len; x++) {
381 __m128i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h;
382
383 READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h);
384 YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l,
385 g_h, r_l, r_h);
386 STOREARGB_D(alpha, alpha, r_l, r_h, g_l, g_h, b_l, b_h, dst_argb);
387 src_y += 16;
388 src_u += 8;
389 src_v += 8;
390 }
391 }
392
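// Convert 16 I422 pixels per loop to RGBA; same math as I422ToARGBRow_LSX
// with the channels stored in RGBA order.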
393 void I422ToRGBARow_LSX(const uint8_t* src_y,
394 const uint8_t* src_u,
395 const uint8_t* src_v,
396 uint8_t* dst_argb,
397 const struct YuvConstants* yuvconstants,
398 int width) {
399 int x;
400 int len = width / 16;
401 __m128i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg;
402 __m128i vec_ubvr, vec_ugvg;
403 __m128i alpha = __lsx_vldi(0xFF);
404 __m128i const_80 = __lsx_vldi(0x80);
405
406 YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb);
407 vec_ubvr = __lsx_vilvl_h(vec_ub, vec_vr);
408 vec_ugvg = __lsx_vilvl_h(vec_ug, vec_vg);
409
410 for (x = 0; x < len; x++) {
411 __m128i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h;
412
413 READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h);
414 YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l,
415 g_h, r_l, r_h);
416 STOREARGB_D(r_l, r_h, g_l, g_h, b_l, b_h, alpha, alpha, dst_argb);
417 src_y += 16;
418 src_u += 8;
419 src_v += 8;
420 }
421 }
422
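// Convert I422 plus an alpha plane to ARGB, 16 pixels per loop; a final
// 8-pixel pass handles any remaining tail.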
423 void I422AlphaToARGBRow_LSX(const uint8_t* src_y,
424 const uint8_t* src_u,
425 const uint8_t* src_v,
426 const uint8_t* src_a,
427 uint8_t* dst_argb,
428 const struct YuvConstants* yuvconstants,
429 int width) {
430 int x;
431 int len = width / 16;
432 int res = width & 15;
433 __m128i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg;
434 __m128i vec_ubvr, vec_ugvg;
435 __m128i zero = __lsx_vldi(0);
436 __m128i const_80 = __lsx_vldi(0x80);
437
438 YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb);
439 vec_ubvr = __lsx_vilvl_h(vec_ub, vec_vr);
440 vec_ugvg = __lsx_vilvl_h(vec_ug, vec_vg);
441
442 for (x = 0; x < len; x++) {
443 __m128i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h, a_l, a_h;
444
445 y = __lsx_vld(src_a, 0);
446 a_l = __lsx_vilvl_b(zero, y);
447 a_h = __lsx_vilvh_b(zero, y);
448 READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h);
449 YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l,
450 g_h, r_l, r_h);
451 STOREARGB_D(a_l, a_h, r_l, r_h, g_l, g_h, b_l, b_h, dst_argb);
452 src_y += 16;
453 src_u += 8;
454 src_v += 8;
455 src_a += 16;
456 }
457 if (res) {
458 __m128i y, uv, r, g, b, a;
459 a = __lsx_vld(src_a, 0);
460 a = __lsx_vsllwil_hu_bu(a, 0);
461 READYUV422(src_y, src_u, src_v, y, uv);
462 YUVTORGB(y, uv, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b, g, r);
463 STOREARGB(a, r, g, b, dst_argb);
464 }
465 }
466
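// Convert 16 I422 pixels per loop to packed RGB24 (B, G, R bytes, no alpha).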
467 void I422ToRGB24Row_LSX(const uint8_t* src_y,
468 const uint8_t* src_u,
469 const uint8_t* src_v,
470 uint8_t* dst_argb,
471 const struct YuvConstants* yuvconstants,
472 int32_t width) {
473 int x;
474 int len = width / 16;
475 __m128i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg;
476 __m128i vec_ubvr, vec_ugvg;
477 __m128i const_80 = __lsx_vldi(0x80);
478 __m128i shuffler0 = {0x0504120302100100, 0x0A18090816070614};
479 __m128i shuffler1 = {0x1E0F0E1C0D0C1A0B, 0x1E0F0E1C0D0C1A0B};
480
481 YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb);
482 vec_ubvr = __lsx_vilvl_h(vec_ub, vec_vr);
483 vec_ugvg = __lsx_vilvl_h(vec_ug, vec_vg);
484
485 for (x = 0; x < len; x++) {
486 __m128i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h;
487 __m128i temp0, temp1, temp2, temp3;
488
489 READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h);
490 YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l,
491 g_h, r_l, r_h);
492 temp0 = __lsx_vpackev_b(g_l, b_l);
493 temp1 = __lsx_vpackev_b(g_h, b_h);
494 DUP4_ARG3(__lsx_vshuf_b, r_l, temp0, shuffler1, r_h, temp1, shuffler1, r_l,
495 temp0, shuffler0, r_h, temp1, shuffler0, temp2, temp3, temp0,
496 temp1);
497
498 b_l = __lsx_vilvl_d(temp1, temp2);
499 b_h = __lsx_vilvh_d(temp3, temp1);
500 __lsx_vst(temp0, dst_argb, 0);
501 __lsx_vst(b_l, dst_argb, 16);
502 __lsx_vst(b_h, dst_argb, 32);
503 dst_argb += 48;
504 src_y += 16;
505 src_u += 8;
506 src_v += 8;
507 }
508 }
509
510 // TODO(fbarchard): Consider AND instead of shift to isolate 5 upper bits of R.
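// Convert 16 I422 pixels per loop to RGB565.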
511 void I422ToRGB565Row_LSX(const uint8_t* src_y,
512 const uint8_t* src_u,
513 const uint8_t* src_v,
514 uint8_t* dst_rgb565,
515 const struct YuvConstants* yuvconstants,
516 int width) {
517 int x;
518 int len = width / 16;
519 __m128i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg;
520 __m128i vec_ubvr, vec_ugvg;
521 __m128i const_80 = __lsx_vldi(0x80);
522
523 YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb);
524 vec_ubvr = __lsx_vilvl_h(vec_ub, vec_vr);
525 vec_ugvg = __lsx_vilvl_h(vec_ug, vec_vg);
526
527 for (x = 0; x < len; x++) {
528 __m128i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h;
529
530 READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h);
531 YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l,
532 g_h, r_l, r_h);
533 b_l = __lsx_vsrli_h(b_l, 3);
534 b_h = __lsx_vsrli_h(b_h, 3);
535 g_l = __lsx_vsrli_h(g_l, 2);
536 g_h = __lsx_vsrli_h(g_h, 2);
537 r_l = __lsx_vsrli_h(r_l, 3);
538 r_h = __lsx_vsrli_h(r_h, 3);
539 r_l = __lsx_vslli_h(r_l, 11);
540 r_h = __lsx_vslli_h(r_h, 11);
541 g_l = __lsx_vslli_h(g_l, 5);
542 g_h = __lsx_vslli_h(g_h, 5);
543 r_l = __lsx_vor_v(r_l, g_l);
544 r_l = __lsx_vor_v(r_l, b_l);
545 r_h = __lsx_vor_v(r_h, g_h);
546 r_h = __lsx_vor_v(r_h, b_h);
547 __lsx_vst(r_l, dst_rgb565, 0);
548 __lsx_vst(r_h, dst_rgb565, 16);
549 dst_rgb565 += 32;
550 src_y += 16;
551 src_u += 8;
552 src_v += 8;
553 }
554 }
555
556 // TODO(fbarchard): Consider AND instead of shift to isolate 4 upper bits of G.
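// Convert 16 I422 pixels per loop to ARGB4444 with alpha forced to 0xF.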
557 void I422ToARGB4444Row_LSX(const uint8_t* src_y,
558 const uint8_t* src_u,
559 const uint8_t* src_v,
560 uint8_t* dst_argb4444,
561 const struct YuvConstants* yuvconstants,
562 int width) {
563 int x;
564 int len = width / 16;
565 __m128i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg;
566 __m128i vec_ubvr, vec_ugvg;
567 __m128i const_80 = __lsx_vldi(0x80);
568 __m128i alpha = (__m128i)v2u64{0xF000F000F000F000, 0xF000F000F000F000};
569 __m128i mask = {0x00F000F000F000F0, 0x00F000F000F000F0};
570
571 YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb);
572 vec_ubvr = __lsx_vilvl_h(vec_ub, vec_vr);
573 vec_ugvg = __lsx_vilvl_h(vec_ug, vec_vg);
574
575 for (x = 0; x < len; x++) {
576 __m128i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h;
577
578 READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h);
579 YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l,
580 g_h, r_l, r_h);
581 b_l = __lsx_vsrli_h(b_l, 4);
582 b_h = __lsx_vsrli_h(b_h, 4);
583 r_l = __lsx_vsrli_h(r_l, 4);
584 r_h = __lsx_vsrli_h(r_h, 4);
585 g_l = __lsx_vand_v(g_l, mask);
586 g_h = __lsx_vand_v(g_h, mask);
587 r_l = __lsx_vslli_h(r_l, 8);
588 r_h = __lsx_vslli_h(r_h, 8);
589 r_l = __lsx_vor_v(r_l, alpha);
590 r_h = __lsx_vor_v(r_h, alpha);
591 r_l = __lsx_vor_v(r_l, g_l);
592 r_h = __lsx_vor_v(r_h, g_h);
593 r_l = __lsx_vor_v(r_l, b_l);
594 r_h = __lsx_vor_v(r_h, b_h);
595 __lsx_vst(r_l, dst_argb4444, 0);
596 __lsx_vst(r_h, dst_argb4444, 16);
597 dst_argb4444 += 32;
598 src_y += 16;
599 src_u += 8;
600 src_v += 8;
601 }
602 }
603
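// Convert 16 I422 pixels per loop to ARGB1555 with the alpha bit set.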
604 void I422ToARGB1555Row_LSX(const uint8_t* src_y,
605 const uint8_t* src_u,
606 const uint8_t* src_v,
607 uint8_t* dst_argb1555,
608 const struct YuvConstants* yuvconstants,
609 int width) {
610 int x;
611 int len = width / 16;
612 __m128i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg;
613 __m128i vec_ubvr, vec_ugvg;
614 __m128i const_80 = __lsx_vldi(0x80);
615 __m128i alpha = (__m128i)v2u64{0x8000800080008000, 0x8000800080008000};
616
617 YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb);
618 vec_ubvr = __lsx_vilvl_h(vec_ub, vec_vr);
619 vec_ugvg = __lsx_vilvl_h(vec_ug, vec_vg);
620
621 for (x = 0; x < len; x++) {
622 __m128i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h;
623
624 READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h);
625 YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l,
626 g_h, r_l, r_h);
627 b_l = __lsx_vsrli_h(b_l, 3);
628 b_h = __lsx_vsrli_h(b_h, 3);
629 g_l = __lsx_vsrli_h(g_l, 3);
630
631 g_h = __lsx_vsrli_h(g_h, 3);
632 g_l = __lsx_vslli_h(g_l, 5);
633 g_h = __lsx_vslli_h(g_h, 5);
634 r_l = __lsx_vsrli_h(r_l, 3);
635 r_h = __lsx_vsrli_h(r_h, 3);
636 r_l = __lsx_vslli_h(r_l, 10);
637 r_h = __lsx_vslli_h(r_h, 10);
638 r_l = __lsx_vor_v(r_l, alpha);
639 r_h = __lsx_vor_v(r_h, alpha);
640 r_l = __lsx_vor_v(r_l, g_l);
641 r_h = __lsx_vor_v(r_h, g_h);
642 r_l = __lsx_vor_v(r_l, b_l);
643 r_h = __lsx_vor_v(r_h, b_h);
644 __lsx_vst(r_l, dst_argb1555, 0);
645 __lsx_vst(r_h, dst_argb1555, 16);
646 dst_argb1555 += 32;
647 src_y += 16;
648 src_u += 8;
649 src_v += 8;
650 }
651 }
652
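// Extract the Y plane from YUY2, 16 pixels per loop.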
653 void YUY2ToYRow_LSX(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
654 int x;
655 int len = width / 16;
656 __m128i src0, src1, dst0;
657
658 for (x = 0; x < len; x++) {
659 DUP2_ARG2(__lsx_vld, src_yuy2, 0, src_yuy2, 16, src0, src1);
660 dst0 = __lsx_vpickev_b(src1, src0);
661 __lsx_vst(dst0, dst_y, 0);
662 src_yuy2 += 32;
663 dst_y += 16;
664 }
665 }
666
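// Extract U and V from two adjacent YUY2 rows, averaging them vertically;
// 16 pixels per loop yield 8 U and 8 V.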
667 void YUY2ToUVRow_LSX(const uint8_t* src_yuy2,
668 int src_stride_yuy2,
669 uint8_t* dst_u,
670 uint8_t* dst_v,
671 int width) {
672 const uint8_t* src_yuy2_next = src_yuy2 + src_stride_yuy2;
673 int x;
674 int len = width / 16;
675 __m128i src0, src1, src2, src3;
676 __m128i tmp0, dst0, dst1;
677
678 for (x = 0; x < len; x++) {
679 DUP4_ARG2(__lsx_vld, src_yuy2, 0, src_yuy2, 16, src_yuy2_next, 0,
680 src_yuy2_next, 16, src0, src1, src2, src3);
681 src0 = __lsx_vpickod_b(src1, src0);
682 src1 = __lsx_vpickod_b(src3, src2);
683 tmp0 = __lsx_vavgr_bu(src1, src0);
684 dst0 = __lsx_vpickev_b(tmp0, tmp0);
685 dst1 = __lsx_vpickod_b(tmp0, tmp0);
686 __lsx_vstelm_d(dst0, dst_u, 0, 0);
687 __lsx_vstelm_d(dst1, dst_v, 0, 0);
688 src_yuy2 += 32;
689 src_yuy2_next += 32;
690 dst_u += 8;
691 dst_v += 8;
692 }
693 }
694
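// Extract U and V from a single YUY2 row, 16 pixels per loop.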
695 void YUY2ToUV422Row_LSX(const uint8_t* src_yuy2,
696 uint8_t* dst_u,
697 uint8_t* dst_v,
698 int width) {
699 int x;
700 int len = width / 16;
701 __m128i src0, src1, tmp0, dst0, dst1;
702
703 for (x = 0; x < len; x++) {
704 DUP2_ARG2(__lsx_vld, src_yuy2, 0, src_yuy2, 16, src0, src1);
705 tmp0 = __lsx_vpickod_b(src1, src0);
706 dst0 = __lsx_vpickev_b(tmp0, tmp0);
707 dst1 = __lsx_vpickod_b(tmp0, tmp0);
708 __lsx_vstelm_d(dst0, dst_u, 0, 0);
709 __lsx_vstelm_d(dst1, dst_v, 0, 0);
710 src_yuy2 += 32;
711 dst_u += 8;
712 dst_v += 8;
713 }
714 }
715
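// Extract the Y plane from UYVY, 16 pixels per loop.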
716 void UYVYToYRow_LSX(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
717 int x;
718 int len = width / 16;
719 __m128i src0, src1, dst0;
720
721 for (x = 0; x < len; x++) {
722 DUP2_ARG2(__lsx_vld, src_uyvy, 0, src_uyvy, 16, src0, src1);
723 dst0 = __lsx_vpickod_b(src1, src0);
724 __lsx_vst(dst0, dst_y, 0);
725 src_uyvy += 32;
726 dst_y += 16;
727 }
728 }
729
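// Extract U and V from two adjacent UYVY rows, averaging them vertically;
// 16 pixels per loop yield 8 U and 8 V.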
730 void UYVYToUVRow_LSX(const uint8_t* src_uyvy,
731 int src_stride_uyvy,
732 uint8_t* dst_u,
733 uint8_t* dst_v,
734 int width) {
735 const uint8_t* src_uyvy_next = src_uyvy + src_stride_uyvy;
736 int x;
737 int len = width / 16;
738 __m128i src0, src1, src2, src3, tmp0, dst0, dst1;
739
740 for (x = 0; x < len; x++) {
741 DUP4_ARG2(__lsx_vld, src_uyvy, 0, src_uyvy, 16, src_uyvy_next, 0,
742 src_uyvy_next, 16, src0, src1, src2, src3);
743 src0 = __lsx_vpickev_b(src1, src0);
744 src1 = __lsx_vpickev_b(src3, src2);
745 tmp0 = __lsx_vavgr_bu(src1, src0);
746 dst0 = __lsx_vpickev_b(tmp0, tmp0);
747 dst1 = __lsx_vpickod_b(tmp0, tmp0);
748 __lsx_vstelm_d(dst0, dst_u, 0, 0);
749 __lsx_vstelm_d(dst1, dst_v, 0, 0);
750 src_uyvy += 32;
751 src_uyvy_next += 32;
752 dst_u += 8;
753 dst_v += 8;
754 }
755 }
756
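// Extract U and V from a single UYVY row, 16 pixels per loop.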
757 void UYVYToUV422Row_LSX(const uint8_t* src_uyvy,
758 uint8_t* dst_u,
759 uint8_t* dst_v,
760 int width) {
761 int x;
762 int len = width / 16;
763 __m128i src0, src1, tmp0, dst0, dst1;
764
765 for (x = 0; x < len; x++) {
766 DUP2_ARG2(__lsx_vld, src_uyvy, 0, src_uyvy, 16, src0, src1);
767 tmp0 = __lsx_vpickev_b(src1, src0);
768 dst0 = __lsx_vpickev_b(tmp0, tmp0);
769 dst1 = __lsx_vpickod_b(tmp0, tmp0);
770 __lsx_vstelm_d(dst0, dst_u, 0, 0);
771 __lsx_vstelm_d(dst1, dst_v, 0, 0);
772 src_uyvy += 32;
773 dst_u += 8;
774 dst_v += 8;
775 }
776 }
777
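// Compute subsampled U and V from two ARGB rows (2x2 box average),
// 16 pixels per loop.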
778 void ARGBToUVRow_LSX(const uint8_t* src_argb0,
779 int src_stride_argb,
780 uint8_t* dst_u,
781 uint8_t* dst_v,
782 int width) {
783 int x;
784 int len = width / 16;
785 const uint8_t* src_argb1 = src_argb0 + src_stride_argb;
786
787 __m128i src0, src1, src2, src3, src4, src5, src6, src7;
788 __m128i vec0, vec1, vec2, vec3;
789 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, dst0, dst1;
790 __m128i const_0x70 = {0x0038003800380038, 0x0038003800380038};
791 __m128i const_0x4A = {0x0025002500250025, 0x0025002500250025};
792 __m128i const_0x26 = {0x0013001300130013, 0x0013001300130013};
793 __m128i const_0x5E = {0x002f002f002f002f, 0x002f002f002f002f};
794 __m128i const_0x12 = {0x0009000900090009, 0x0009000900090009};
795 __m128i const_0x8080 = (__m128i)v2u64{0x8080808080808080, 0x8080808080808080};
796 for (x = 0; x < len; x++) {
797 DUP4_ARG2(__lsx_vld, src_argb0, 0, src_argb0, 16, src_argb0, 32, src_argb0,
798 48, src0, src1, src2, src3);
799 DUP4_ARG2(__lsx_vld, src_argb1, 0, src_argb1, 16, src_argb1, 32, src_argb1,
800 48, src4, src5, src6, src7);
801 vec0 = __lsx_vaddwev_h_bu(src0, src4);
802 vec1 = __lsx_vaddwev_h_bu(src1, src5);
803 vec2 = __lsx_vaddwev_h_bu(src2, src6);
804 vec3 = __lsx_vaddwev_h_bu(src3, src7);
805 tmp0 = __lsx_vpickev_h(vec1, vec0);
806 tmp1 = __lsx_vpickev_h(vec3, vec2);
807 tmp2 = __lsx_vpickod_h(vec1, vec0);
808 tmp3 = __lsx_vpickod_h(vec3, vec2);
809 vec0 = __lsx_vaddwod_h_bu(src0, src4);
810 vec1 = __lsx_vaddwod_h_bu(src1, src5);
811 vec2 = __lsx_vaddwod_h_bu(src2, src6);
812 vec3 = __lsx_vaddwod_h_bu(src3, src7);
813 tmp4 = __lsx_vpickev_h(vec1, vec0);
814 tmp5 = __lsx_vpickev_h(vec3, vec2);
815 vec0 = __lsx_vpickev_h(tmp1, tmp0);
816 vec1 = __lsx_vpickod_h(tmp1, tmp0);
817 src0 = __lsx_vavgr_h(vec0, vec1);
818 vec0 = __lsx_vpickev_h(tmp3, tmp2);
819 vec1 = __lsx_vpickod_h(tmp3, tmp2);
820 src1 = __lsx_vavgr_h(vec0, vec1);
821 vec0 = __lsx_vpickev_h(tmp5, tmp4);
822 vec1 = __lsx_vpickod_h(tmp5, tmp4);
823 src2 = __lsx_vavgr_h(vec0, vec1);
824 dst0 = __lsx_vmadd_h(const_0x8080, src0, const_0x70);
825 dst0 = __lsx_vmsub_h(dst0, src2, const_0x4A);
826 dst0 = __lsx_vmsub_h(dst0, src1, const_0x26);
827 dst1 = __lsx_vmadd_h(const_0x8080, src1, const_0x70);
828 dst1 = __lsx_vmsub_h(dst1, src2, const_0x5E);
829 dst1 = __lsx_vmsub_h(dst1, src0, const_0x12);
830 dst0 = __lsx_vsrai_h(dst0, 8);
831 dst1 = __lsx_vsrai_h(dst1, 8);
832 dst0 = __lsx_vpickev_b(dst1, dst0);
833 __lsx_vstelm_d(dst0, dst_u, 0, 0);
834 __lsx_vstelm_d(dst0, dst_v, 0, 1);
835 src_argb0 += 64;
836 src_argb1 += 64;
837 dst_u += 8;
838 dst_v += 8;
839 }
840 }
841
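// Drop the alpha channel to convert ARGB to RGB24, 16 pixels per loop;
// the final 16 pixels are written after the loop.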
842 void ARGBToRGB24Row_LSX(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
843 int x;
844 int len = (width / 16) - 1;
845 __m128i src0, src1, src2, src3;
846 __m128i tmp0, tmp1, tmp2, tmp3;
847 __m128i shuf = {0x0908060504020100, 0x000000000E0D0C0A};
848 for (x = 0; x < len; x++) {
849 DUP4_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src_argb, 32, src_argb, 48,
850 src0, src1, src2, src3);
851 tmp0 = __lsx_vshuf_b(src0, src0, shuf);
852 tmp1 = __lsx_vshuf_b(src1, src1, shuf);
853 tmp2 = __lsx_vshuf_b(src2, src2, shuf);
854 tmp3 = __lsx_vshuf_b(src3, src3, shuf);
855 __lsx_vst(tmp0, dst_rgb, 0);
856 __lsx_vst(tmp1, dst_rgb, 12);
857 __lsx_vst(tmp2, dst_rgb, 24);
858 __lsx_vst(tmp3, dst_rgb, 36);
859 dst_rgb += 48;
860 src_argb += 64;
861 }
862 DUP4_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src_argb, 32, src_argb, 48,
863 src0, src1, src2, src3);
864 tmp0 = __lsx_vshuf_b(src0, src0, shuf);
865 tmp1 = __lsx_vshuf_b(src1, src1, shuf);
866 tmp2 = __lsx_vshuf_b(src2, src2, shuf);
867 tmp3 = __lsx_vshuf_b(src3, src3, shuf);
868 __lsx_vst(tmp0, dst_rgb, 0);
869 __lsx_vst(tmp1, dst_rgb, 12);
870 __lsx_vst(tmp2, dst_rgb, 24);
871 dst_rgb += 36;
872 __lsx_vst(tmp3, dst_rgb, 0);
873 }
874
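// Convert ARGB to RAW (R, G, B byte order), 16 pixels per loop; the final
// 16 pixels are written after the loop.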
875 void ARGBToRAWRow_LSX(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
876 int x;
877 int len = (width / 16) - 1;
878 __m128i src0, src1, src2, src3;
879 __m128i tmp0, tmp1, tmp2, tmp3;
880 __m128i shuf = {0x090A040506000102, 0x000000000C0D0E08};
881 for (x = 0; x < len; x++) {
882 DUP4_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src_argb, 32, src_argb, 48,
883 src0, src1, src2, src3);
884 tmp0 = __lsx_vshuf_b(src0, src0, shuf);
885 tmp1 = __lsx_vshuf_b(src1, src1, shuf);
886 tmp2 = __lsx_vshuf_b(src2, src2, shuf);
887 tmp3 = __lsx_vshuf_b(src3, src3, shuf);
888 __lsx_vst(tmp0, dst_rgb, 0);
889 __lsx_vst(tmp1, dst_rgb, 12);
890 __lsx_vst(tmp2, dst_rgb, 24);
891 __lsx_vst(tmp3, dst_rgb, 36);
892 dst_rgb += 48;
893 src_argb += 64;
894 }
895 DUP4_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src_argb, 32, src_argb, 48,
896 src0, src1, src2, src3);
897 tmp0 = __lsx_vshuf_b(src0, src0, shuf);
898 tmp1 = __lsx_vshuf_b(src1, src1, shuf);
899 tmp2 = __lsx_vshuf_b(src2, src2, shuf);
900 tmp3 = __lsx_vshuf_b(src3, src3, shuf);
901 __lsx_vst(tmp0, dst_rgb, 0);
902 __lsx_vst(tmp1, dst_rgb, 12);
903 __lsx_vst(tmp2, dst_rgb, 24);
904 dst_rgb += 36;
905 __lsx_vst(tmp3, dst_rgb, 0);
906 }
907
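// Pack ARGB into RGB565, 8 pixels per loop.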
908 void ARGBToRGB565Row_LSX(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
909 int x;
910 int len = width / 8;
911 __m128i zero = __lsx_vldi(0);
912 __m128i src0, src1, tmp0, tmp1, dst0;
913 __m128i shift = {0x0300030003000300, 0x0300030003000300};
914
915 for (x = 0; x < len; x++) {
916 DUP2_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src0, src1);
917 tmp0 = __lsx_vpickev_b(src1, src0);
918 tmp1 = __lsx_vpickod_b(src1, src0);
919 tmp0 = __lsx_vsrli_b(tmp0, 3);
920 tmp1 = __lsx_vpackev_b(zero, tmp1);
921 tmp1 = __lsx_vsrli_h(tmp1, 2);
922 tmp0 = __lsx_vsll_b(tmp0, shift);
923 tmp1 = __lsx_vslli_h(tmp1, 5);
924 dst0 = __lsx_vor_v(tmp0, tmp1);
925 __lsx_vst(dst0, dst_rgb, 0);
926 dst_rgb += 16;
927 src_argb += 32;
928 }
929 }
930
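// Pack ARGB into ARGB1555, 8 pixels per loop.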
931 void ARGBToARGB1555Row_LSX(const uint8_t* src_argb,
932 uint8_t* dst_rgb,
933 int width) {
934 int x;
935 int len = width / 8;
936 __m128i zero = __lsx_vldi(0);
937 __m128i src0, src1, tmp0, tmp1, tmp2, tmp3, dst0;
938 __m128i shift1 = {0x0703070307030703, 0x0703070307030703};
939 __m128i shift2 = {0x0200020002000200, 0x0200020002000200};
940
941 for (x = 0; x < len; x++) {
942 DUP2_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src0, src1);
943 tmp0 = __lsx_vpickev_b(src1, src0);
944 tmp1 = __lsx_vpickod_b(src1, src0);
945 tmp0 = __lsx_vsrli_b(tmp0, 3);
946 tmp1 = __lsx_vsrl_b(tmp1, shift1);
947 tmp0 = __lsx_vsll_b(tmp0, shift2);
948 tmp2 = __lsx_vpackev_b(zero, tmp1);
949 tmp3 = __lsx_vpackod_b(zero, tmp1);
950 tmp2 = __lsx_vslli_h(tmp2, 5);
951 tmp3 = __lsx_vslli_h(tmp3, 15);
952 dst0 = __lsx_vor_v(tmp0, tmp2);
953 dst0 = __lsx_vor_v(dst0, tmp3);
954 __lsx_vst(dst0, dst_rgb, 0);
955 dst_rgb += 16;
956 src_argb += 32;
957 }
958 }
959
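// Pack ARGB into ARGB4444 by keeping the high nibble of each channel,
// 8 pixels per loop.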
960 void ARGBToARGB4444Row_LSX(const uint8_t* src_argb,
961 uint8_t* dst_rgb,
962 int width) {
963 int x;
964 int len = width / 8;
965 __m128i src0, src1, tmp0, tmp1, dst0;
966
967 for (x = 0; x < len; x++) {
968 DUP2_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src0, src1);
969 tmp0 = __lsx_vpickev_b(src1, src0);
970 tmp1 = __lsx_vpickod_b(src1, src0);
971 tmp1 = __lsx_vandi_b(tmp1, 0xF0);
972 tmp0 = __lsx_vsrli_b(tmp0, 4);
973 dst0 = __lsx_vor_v(tmp1, tmp0);
974 __lsx_vst(dst0, dst_rgb, 0);
975 dst_rgb += 16;
976 src_argb += 32;
977 }
978 }
979
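// Compute full-resolution U and V from ARGB (one U and V per pixel),
// 16 pixels per loop.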
980 void ARGBToUV444Row_LSX(const uint8_t* src_argb,
981 uint8_t* dst_u,
982 uint8_t* dst_v,
983 int32_t width) {
984 int x;
985 int len = width / 16;
986 __m128i src0, src1, src2, src3;
987 __m128i tmp0, tmp1, tmp2, tmp3;
988 __m128i reg0, reg1, reg2, reg3, dst0, dst1;
989 __m128i const_112 = __lsx_vldi(112);
990 __m128i const_74 = __lsx_vldi(74);
991 __m128i const_38 = __lsx_vldi(38);
992 __m128i const_94 = __lsx_vldi(94);
993 __m128i const_18 = __lsx_vldi(18);
994 __m128i const_0x8080 = (__m128i)v2u64{0x8080808080808080, 0x8080808080808080};
995 for (x = 0; x < len; x++) {
996 DUP4_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src_argb, 32, src_argb, 48,
997 src0, src1, src2, src3);
998 tmp0 = __lsx_vpickev_h(src1, src0);
999 tmp1 = __lsx_vpickod_h(src1, src0);
1000 tmp2 = __lsx_vpickev_h(src3, src2);
1001 tmp3 = __lsx_vpickod_h(src3, src2);
1002 reg0 = __lsx_vmaddwev_h_bu(const_0x8080, tmp0, const_112);
1003 reg1 = __lsx_vmaddwev_h_bu(const_0x8080, tmp2, const_112);
1004 reg2 = __lsx_vmulwod_h_bu(tmp0, const_74);
1005 reg3 = __lsx_vmulwod_h_bu(tmp2, const_74);
1006 reg2 = __lsx_vmaddwev_h_bu(reg2, tmp1, const_38);
1007 reg3 = __lsx_vmaddwev_h_bu(reg3, tmp3, const_38);
1008 reg0 = __lsx_vsub_h(reg0, reg2);
1009 reg1 = __lsx_vsub_h(reg1, reg3);
1010 reg0 = __lsx_vsrai_h(reg0, 8);
1011 reg1 = __lsx_vsrai_h(reg1, 8);
1012 dst0 = __lsx_vpickev_b(reg1, reg0);
1013
1014 reg0 = __lsx_vmaddwev_h_bu(const_0x8080, tmp1, const_112);
1015 reg1 = __lsx_vmaddwev_h_bu(const_0x8080, tmp3, const_112);
1016 reg2 = __lsx_vmulwev_h_bu(tmp0, const_18);
1017 reg3 = __lsx_vmulwev_h_bu(tmp2, const_18);
1018 reg2 = __lsx_vmaddwod_h_bu(reg2, tmp0, const_94);
1019 reg3 = __lsx_vmaddwod_h_bu(reg3, tmp2, const_94);
1020 reg0 = __lsx_vsub_h(reg0, reg2);
1021 reg1 = __lsx_vsub_h(reg1, reg3);
1022 reg0 = __lsx_vsrai_h(reg0, 8);
1023 reg1 = __lsx_vsrai_h(reg1, 8);
1024 dst1 = __lsx_vpickev_b(reg1, reg0);
1025
1026 __lsx_vst(dst0, dst_u, 0);
1027 __lsx_vst(dst1, dst_v, 0);
1028 dst_u += 16;
1029 dst_v += 16;
1030 src_argb += 64;
1031 }
1032 }
1033
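// Multiply two ARGB rows per channel, roughly (a * b) / 255 via a 16-bit
// high multiply; 4 pixels per loop.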
1034 void ARGBMultiplyRow_LSX(const uint8_t* src_argb0,
1035 const uint8_t* src_argb1,
1036 uint8_t* dst_argb,
1037 int width) {
1038 int x;
1039 int len = width / 4;
1040 __m128i zero = __lsx_vldi(0);
1041 __m128i src0, src1, dst0, dst1;
1042 __m128i tmp0, tmp1, tmp2, tmp3;
1043
1044 for (x = 0; x < len; x++) {
1045 DUP2_ARG2(__lsx_vld, src_argb0, 0, src_argb1, 0, src0, src1);
1046 tmp0 = __lsx_vilvl_b(src0, src0);
1047 tmp1 = __lsx_vilvh_b(src0, src0);
1048 tmp2 = __lsx_vilvl_b(zero, src1);
1049 tmp3 = __lsx_vilvh_b(zero, src1);
1050 dst0 = __lsx_vmuh_hu(tmp0, tmp2);
1051 dst1 = __lsx_vmuh_hu(tmp1, tmp3);
1052 dst0 = __lsx_vpickev_b(dst1, dst0);
1053 __lsx_vst(dst0, dst_argb, 0);
1054 src_argb0 += 16;
1055 src_argb1 += 16;
1056 dst_argb += 16;
1057 }
1058 }
1059
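// Add two ARGB rows with unsigned saturation, 4 pixels per loop.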
1060 void ARGBAddRow_LSX(const uint8_t* src_argb0,
1061 const uint8_t* src_argb1,
1062 uint8_t* dst_argb,
1063 int width) {
1064 int x;
1065 int len = width / 4;
1066 __m128i src0, src1, dst0;
1067
1068 for (x = 0; x < len; x++) {
1069 DUP2_ARG2(__lsx_vld, src_argb0, 0, src_argb1, 0, src0, src1);
1070 dst0 = __lsx_vsadd_bu(src0, src1);
1071 __lsx_vst(dst0, dst_argb, 0);
1072 src_argb0 += 16;
1073 src_argb1 += 16;
1074 dst_argb += 16;
1075 }
1076 }
1077
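// Subtract src_argb1 from src_argb0 with unsigned saturation, 4 pixels per loop.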
1078 void ARGBSubtractRow_LSX(const uint8_t* src_argb0,
1079 const uint8_t* src_argb1,
1080 uint8_t* dst_argb,
1081 int width) {
1082 int x;
1083 int len = width / 4;
1084 __m128i src0, src1, dst0;
1085
1086 for (x = 0; x < len; x++) {
1087 DUP2_ARG2(__lsx_vld, src_argb0, 0, src_argb1, 0, src0, src1);
1088 dst0 = __lsx_vssub_bu(src0, src1);
1089 __lsx_vst(dst0, dst_argb, 0);
1090 src_argb0 += 16;
1091 src_argb1 += 16;
1092 dst_argb += 16;
1093 }
1094 }
1095
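// Attenuate: multiply B, G and R by alpha (alpha itself is preserved),
// 8 pixels per loop.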
1096 void ARGBAttenuateRow_LSX(const uint8_t* src_argb,
1097 uint8_t* dst_argb,
1098 int width) {
1099 int x;
1100 int len = width / 8;
1101 __m128i src0, src1, tmp0, tmp1;
1102 __m128i reg0, reg1, reg2, reg3, reg4, reg5;
1103 __m128i b, g, r, a, dst0, dst1;
1104 __m128i control = {0x0005000100040000, 0x0007000300060002};
1105
1106 for (x = 0; x < len; x++) {
1107 DUP2_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src0, src1);
1108 tmp0 = __lsx_vpickev_b(src1, src0);
1109 tmp1 = __lsx_vpickod_b(src1, src0);
1110 b = __lsx_vpackev_b(tmp0, tmp0);
1111 r = __lsx_vpackod_b(tmp0, tmp0);
1112 g = __lsx_vpackev_b(tmp1, tmp1);
1113 a = __lsx_vpackod_b(tmp1, tmp1);
1114 reg0 = __lsx_vmulwev_w_hu(b, a);
1115 reg1 = __lsx_vmulwod_w_hu(b, a);
1116 reg2 = __lsx_vmulwev_w_hu(r, a);
1117 reg3 = __lsx_vmulwod_w_hu(r, a);
1118 reg4 = __lsx_vmulwev_w_hu(g, a);
1119 reg5 = __lsx_vmulwod_w_hu(g, a);
1120 reg0 = __lsx_vssrani_h_w(reg1, reg0, 24);
1121 reg2 = __lsx_vssrani_h_w(reg3, reg2, 24);
1122 reg4 = __lsx_vssrani_h_w(reg5, reg4, 24);
1123 reg0 = __lsx_vshuf_h(control, reg0, reg0);
1124 reg2 = __lsx_vshuf_h(control, reg2, reg2);
1125 reg4 = __lsx_vshuf_h(control, reg4, reg4);
1126 tmp0 = __lsx_vpackev_b(reg4, reg0);
1127 tmp1 = __lsx_vpackev_b(a, reg2);
1128 dst0 = __lsx_vilvl_h(tmp1, tmp0);
1129 dst1 = __lsx_vilvh_h(tmp1, tmp0);
1130 __lsx_vst(dst0, dst_argb, 0);
1131 __lsx_vst(dst1, dst_argb, 16);
1132 dst_argb += 32;
1133 src_argb += 32;
1134 }
1135 }
1136
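// Pack ARGB into RGB565, adding the 4-byte dither pattern to each channel
// (clamped to 255) before truncation; 8 pixels per loop.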
1137 void ARGBToRGB565DitherRow_LSX(const uint8_t* src_argb,
1138 uint8_t* dst_rgb,
1139 uint32_t dither4,
1140 int width) {
1141 int x;
1142 int len = width / 8;
1143 __m128i src0, src1, tmp0, tmp1, dst0;
1144 __m128i b, g, r;
1145 __m128i zero = __lsx_vldi(0);
1146 __m128i vec_dither = __lsx_vldrepl_w(&dither4, 0);
1147
1148 vec_dither = __lsx_vilvl_b(zero, vec_dither);
1149 for (x = 0; x < len; x++) {
1150 DUP2_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src0, src1);
1151 tmp0 = __lsx_vpickev_b(src1, src0);
1152 tmp1 = __lsx_vpickod_b(src1, src0);
1153 b = __lsx_vpackev_b(zero, tmp0);
1154 r = __lsx_vpackod_b(zero, tmp0);
1155 g = __lsx_vpackev_b(zero, tmp1);
1156 b = __lsx_vadd_h(b, vec_dither);
1157 g = __lsx_vadd_h(g, vec_dither);
1158 r = __lsx_vadd_h(r, vec_dither);
1159 DUP2_ARG1(__lsx_vclip255_h, b, g, b, g);
1160 r = __lsx_vclip255_h(r);
1161 b = __lsx_vsrai_h(b, 3);
1162 g = __lsx_vsrai_h(g, 2);
1163 r = __lsx_vsrai_h(r, 3);
1164 g = __lsx_vslli_h(g, 5);
1165 r = __lsx_vslli_h(r, 11);
1166 dst0 = __lsx_vor_v(b, g);
1167 dst0 = __lsx_vor_v(dst0, r);
1168 __lsx_vst(dst0, dst_rgb, 0);
1169 src_argb += 32;
1170 dst_rgb += 16;
1171 }
1172 }
1173
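// Reorder the four bytes of each ARGB pixel according to the 4-byte
// shuffler map, 8 pixels per loop.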
1174 void ARGBShuffleRow_LSX(const uint8_t* src_argb,
1175 uint8_t* dst_argb,
1176 const uint8_t* shuffler,
1177 int width) {
1178 int x;
1179 int len = width / 8;
1180 __m128i src0, src1, dst0, dst1;
1181 __m128i shuf = {0x0404040400000000, 0x0C0C0C0C08080808};
1182 __m128i temp = __lsx_vldrepl_w(shuffler, 0);
1183
1184 shuf = __lsx_vadd_b(shuf, temp);
1185 for (x = 0; x < len; x++) {
1186 DUP2_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src0, src1);
1187 dst0 = __lsx_vshuf_b(src0, src0, shuf);
1188 dst1 = __lsx_vshuf_b(src1, src1, shuf);
1189 __lsx_vst(dst0, dst_argb, 0);
1190 __lsx_vst(dst1, dst_argb, 16);
1191 src_argb += 32;
1192 dst_argb += 32;
1193 }
1194 }
1195
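// Scale each channel by the corresponding channel of 'value', 4 pixels per loop.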
1196 void ARGBShadeRow_LSX(const uint8_t* src_argb,
1197 uint8_t* dst_argb,
1198 int width,
1199 uint32_t value) {
1200 int x;
1201 int len = width / 4;
1202 __m128i src0, dst0, tmp0, tmp1;
1203 __m128i vec_value = __lsx_vreplgr2vr_w(value);
1204
1205 vec_value = __lsx_vilvl_b(vec_value, vec_value);
1206 for (x = 0; x < len; x++) {
1207 src0 = __lsx_vld(src_argb, 0);
1208 tmp0 = __lsx_vilvl_b(src0, src0);
1209 tmp1 = __lsx_vilvh_b(src0, src0);
1210 tmp0 = __lsx_vmuh_hu(tmp0, vec_value);
1211 tmp1 = __lsx_vmuh_hu(tmp1, vec_value);
1212 dst0 = __lsx_vpickod_b(tmp1, tmp0);
1213 __lsx_vst(dst0, dst_argb, 0);
1214 src_argb += 16;
1215 dst_argb += 16;
1216 }
1217 }
1218
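// Replace B, G and R with gray = (29 * B + 150 * G + 77 * R + 128) >> 8;
// alpha is preserved. 8 pixels per loop.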
1219 void ARGBGrayRow_LSX(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
1220 int x;
1221 int len = width / 8;
1222 __m128i src0, src1, tmp0, tmp1;
1223 __m128i reg0, reg1, reg2, dst0, dst1;
1224 __m128i const_128 = __lsx_vldi(0x480);
1225 __m128i const_150 = __lsx_vldi(0x96);
1226 __m128i const_br = {0x4D1D4D1D4D1D4D1D, 0x4D1D4D1D4D1D4D1D};
1227
1228 for (x = 0; x < len; x++) {
1229 DUP2_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src0, src1);
1230 tmp0 = __lsx_vpickev_b(src1, src0);
1231 tmp1 = __lsx_vpickod_b(src1, src0);
1232 reg0 = __lsx_vdp2_h_bu(tmp0, const_br);
1233 reg1 = __lsx_vmaddwev_h_bu(const_128, tmp1, const_150);
1234 reg2 = __lsx_vadd_h(reg0, reg1);
1235 tmp0 = __lsx_vpackod_b(reg2, reg2);
1236 tmp1 = __lsx_vpackod_b(tmp1, reg2);
1237 dst0 = __lsx_vilvl_h(tmp1, tmp0);
1238 dst1 = __lsx_vilvh_h(tmp1, tmp0);
1239 __lsx_vst(dst0, dst_argb, 0);
1240 __lsx_vst(dst1, dst_argb, 16);
1241 src_argb += 32;
1242 dst_argb += 32;
1243 }
1244 }
1245
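// Apply the sepia tone matrix to B, G and R in place (alpha preserved),
// 8 pixels per loop.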
1246 void ARGBSepiaRow_LSX(uint8_t* dst_argb, int width) {
1247 int x;
1248 int len = width / 8;
1249 __m128i src0, src1, tmp0, tmp1;
1250 __m128i reg0, reg1, spb, spg, spr;
1251 __m128i dst0, dst1;
1252 __m128i spb_g = __lsx_vldi(68);
1253 __m128i spg_g = __lsx_vldi(88);
1254 __m128i spr_g = __lsx_vldi(98);
1255 __m128i spb_br = {0x2311231123112311, 0x2311231123112311};
1256 __m128i spg_br = {0x2D162D162D162D16, 0x2D162D162D162D16};
1257 __m128i spr_br = {0x3218321832183218, 0x3218321832183218};
1258 __m128i shuff = {0x1706150413021100, 0x1F0E1D0C1B0A1908};
1259
1260 for (x = 0; x < len; x++) {
1261 DUP2_ARG2(__lsx_vld, dst_argb, 0, dst_argb, 16, src0, src1);
1262 tmp0 = __lsx_vpickev_b(src1, src0);
1263 tmp1 = __lsx_vpickod_b(src1, src0);
1264 DUP2_ARG2(__lsx_vdp2_h_bu, tmp0, spb_br, tmp0, spg_br, spb, spg);
1265 spr = __lsx_vdp2_h_bu(tmp0, spr_br);
1266 spb = __lsx_vmaddwev_h_bu(spb, tmp1, spb_g);
1267 spg = __lsx_vmaddwev_h_bu(spg, tmp1, spg_g);
1268 spr = __lsx_vmaddwev_h_bu(spr, tmp1, spr_g);
1269 spb = __lsx_vsrli_h(spb, 7);
1270 spg = __lsx_vsrli_h(spg, 7);
1271 spr = __lsx_vsrli_h(spr, 7);
1272 spg = __lsx_vsat_hu(spg, 7);
1273 spr = __lsx_vsat_hu(spr, 7);
1274 reg0 = __lsx_vpackev_b(spg, spb);
1275 reg1 = __lsx_vshuf_b(tmp1, spr, shuff);
1276 dst0 = __lsx_vilvl_h(reg1, reg0);
1277 dst1 = __lsx_vilvh_h(reg1, reg0);
1278 __lsx_vst(dst0, dst_argb, 0);
1279 __lsx_vst(dst1, dst_argb, 16);
1280 dst_argb += 32;
1281 }
1282 }
1283
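// Expand ARGB4444 to ARGB8888 by replicating each 4-bit channel into both
// nibbles, 16 pixels per loop.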
1284 void ARGB4444ToARGBRow_LSX(const uint8_t* src_argb4444,
1285 uint8_t* dst_argb,
1286 int width) {
1287 int x;
1288 int len = width / 16;
1289 __m128i src0, src1;
1290 __m128i tmp0, tmp1, tmp2, tmp3;
1291 __m128i reg0, reg1, reg2, reg3;
1292 __m128i dst0, dst1, dst2, dst3;
1293
1294 for (x = 0; x < len; x++) {
1295 src0 = __lsx_vld(src_argb4444, 0);
1296 src1 = __lsx_vld(src_argb4444, 16);
1297 tmp0 = __lsx_vandi_b(src0, 0x0F);
1298 tmp1 = __lsx_vandi_b(src0, 0xF0);
1299 tmp2 = __lsx_vandi_b(src1, 0x0F);
1300 tmp3 = __lsx_vandi_b(src1, 0xF0);
1301 reg0 = __lsx_vslli_b(tmp0, 4);
1302 reg2 = __lsx_vslli_b(tmp2, 4);
1303 reg1 = __lsx_vsrli_b(tmp1, 4);
1304 reg3 = __lsx_vsrli_b(tmp3, 4);
1305 DUP4_ARG2(__lsx_vor_v, tmp0, reg0, tmp1, reg1, tmp2, reg2, tmp3, reg3, tmp0,
1306 tmp1, tmp2, tmp3);
1307 dst0 = __lsx_vilvl_b(tmp1, tmp0);
1308 dst2 = __lsx_vilvl_b(tmp3, tmp2);
1309 dst1 = __lsx_vilvh_b(tmp1, tmp0);
1310 dst3 = __lsx_vilvh_b(tmp3, tmp2);
1311 __lsx_vst(dst0, dst_argb, 0);
1312 __lsx_vst(dst1, dst_argb, 16);
1313 __lsx_vst(dst2, dst_argb, 32);
1314 __lsx_vst(dst3, dst_argb, 48);
1315 dst_argb += 64;
1316 src_argb4444 += 32;
1317 }
1318 }
1319
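// Expand ARGB1555 to ARGB8888: 5-bit channels are widened to 8 bits and the
// alpha bit becomes 0x00 or 0xFF; 16 pixels per loop.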
1320 void ARGB1555ToARGBRow_LSX(const uint8_t* src_argb1555,
1321 uint8_t* dst_argb,
1322 int width) {
1323 int x;
1324 int len = width / 16;
1325 __m128i src0, src1;
1326 __m128i tmp0, tmp1, tmpb, tmpg, tmpr, tmpa;
1327 __m128i reg0, reg1, reg2;
1328 __m128i dst0, dst1, dst2, dst3;
1329
1330 for (x = 0; x < len; x++) {
1331 src0 = __lsx_vld(src_argb1555, 0);
1332 src1 = __lsx_vld(src_argb1555, 16);
1333 tmp0 = __lsx_vpickev_b(src1, src0);
1334 tmp1 = __lsx_vpickod_b(src1, src0);
1335 tmpb = __lsx_vandi_b(tmp0, 0x1F);
1336 tmpg = __lsx_vsrli_b(tmp0, 5);
1337 reg0 = __lsx_vandi_b(tmp1, 0x03);
1338 reg0 = __lsx_vslli_b(reg0, 3);
1339 tmpg = __lsx_vor_v(tmpg, reg0);
1340 reg1 = __lsx_vandi_b(tmp1, 0x7C);
1341 tmpr = __lsx_vsrli_b(reg1, 2);
1342 tmpa = __lsx_vsrli_b(tmp1, 7);
1343 tmpa = __lsx_vneg_b(tmpa);
1344 reg0 = __lsx_vslli_b(tmpb, 3);
1345 reg1 = __lsx_vslli_b(tmpg, 3);
1346 reg2 = __lsx_vslli_b(tmpr, 3);
1347 tmpb = __lsx_vsrli_b(tmpb, 2);
1348 tmpg = __lsx_vsrli_b(tmpg, 2);
1349 tmpr = __lsx_vsrli_b(tmpr, 2);
1350 tmpb = __lsx_vor_v(reg0, tmpb);
1351 tmpg = __lsx_vor_v(reg1, tmpg);
1352 tmpr = __lsx_vor_v(reg2, tmpr);
1353 DUP2_ARG2(__lsx_vilvl_b, tmpg, tmpb, tmpa, tmpr, reg0, reg1);
1354 dst0 = __lsx_vilvl_h(reg1, reg0);
1355 dst1 = __lsx_vilvh_h(reg1, reg0);
1356 DUP2_ARG2(__lsx_vilvh_b, tmpg, tmpb, tmpa, tmpr, reg0, reg1);
1357 dst2 = __lsx_vilvl_h(reg1, reg0);
1358 dst3 = __lsx_vilvh_h(reg1, reg0);
1359 __lsx_vst(dst0, dst_argb, 0);
1360 __lsx_vst(dst1, dst_argb, 16);
1361 __lsx_vst(dst2, dst_argb, 32);
1362 __lsx_vst(dst3, dst_argb, 48);
1363 dst_argb += 64;
1364 src_argb1555 += 32;
1365 }
1366 }
1367
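// Expand RGB565 to ARGB8888 with alpha forced to 0xFF, 16 pixels per loop.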
1368 void RGB565ToARGBRow_LSX(const uint8_t* src_rgb565,
1369 uint8_t* dst_argb,
1370 int width) {
1371 int x;
1372 int len = width / 16;
1373 __m128i src0, src1;
1374 __m128i tmp0, tmp1, tmpb, tmpg, tmpr;
1375 __m128i reg0, reg1, dst0, dst1, dst2, dst3;
1376 __m128i alpha = __lsx_vldi(0xFF);
1377
1378 for (x = 0; x < len; x++) {
1379 src0 = __lsx_vld(src_rgb565, 0);
1380 src1 = __lsx_vld(src_rgb565, 16);
1381 tmp0 = __lsx_vpickev_b(src1, src0);
1382 tmp1 = __lsx_vpickod_b(src1, src0);
1383 tmpb = __lsx_vandi_b(tmp0, 0x1F);
1384 tmpr = __lsx_vandi_b(tmp1, 0xF8);
1385 reg1 = __lsx_vandi_b(tmp1, 0x07);
1386 reg0 = __lsx_vsrli_b(tmp0, 5);
1387 reg1 = __lsx_vslli_b(reg1, 3);
1388 tmpg = __lsx_vor_v(reg1, reg0);
1389 reg0 = __lsx_vslli_b(tmpb, 3);
1390 reg1 = __lsx_vsrli_b(tmpb, 2);
1391 tmpb = __lsx_vor_v(reg1, reg0);
1392 reg0 = __lsx_vslli_b(tmpg, 2);
1393 reg1 = __lsx_vsrli_b(tmpg, 4);
1394 tmpg = __lsx_vor_v(reg1, reg0);
1395 reg0 = __lsx_vsrli_b(tmpr, 5);
1396 tmpr = __lsx_vor_v(tmpr, reg0);
1397 DUP2_ARG2(__lsx_vilvl_b, tmpg, tmpb, alpha, tmpr, reg0, reg1);
1398 dst0 = __lsx_vilvl_h(reg1, reg0);
1399 dst1 = __lsx_vilvh_h(reg1, reg0);
1400 DUP2_ARG2(__lsx_vilvh_b, tmpg, tmpb, alpha, tmpr, reg0, reg1);
1401 dst2 = __lsx_vilvl_h(reg1, reg0);
1402 dst3 = __lsx_vilvh_h(reg1, reg0);
1403 __lsx_vst(dst0, dst_argb, 0);
1404 __lsx_vst(dst1, dst_argb, 16);
1405 __lsx_vst(dst2, dst_argb, 32);
1406 __lsx_vst(dst3, dst_argb, 48);
1407 dst_argb += 64;
1408 src_rgb565 += 32;
1409 }
1410 }
1411
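// Expand RGB24 to ARGB by inserting an alpha byte of 0xFF, 16 pixels per loop.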
1412 void RGB24ToARGBRow_LSX(const uint8_t* src_rgb24,
1413 uint8_t* dst_argb,
1414 int width) {
1415 int x;
1416 int len = width / 16;
1417 __m128i src0, src1, src2;
1418 __m128i tmp0, tmp1, tmp2;
1419 __m128i dst0, dst1, dst2, dst3;
1420 __m128i alpha = __lsx_vldi(0xFF);
1421 __m128i shuf0 = {0x131211100F0E0D0C, 0x1B1A191817161514};
1422 __m128i shuf1 = {0x1F1E1D1C1B1A1918, 0x0706050403020100};
1423 __m128i shuf2 = {0x0B0A090807060504, 0x131211100F0E0D0C};
1424 __m128i shuf3 = {0x1005040310020100, 0x100B0A0910080706};
1425
1426 for (x = 0; x < len; x++) {
1427 src0 = __lsx_vld(src_rgb24, 0);
1428 src1 = __lsx_vld(src_rgb24, 16);
1429 src2 = __lsx_vld(src_rgb24, 32);
1430 DUP2_ARG3(__lsx_vshuf_b, src1, src0, shuf0, src1, src2, shuf1, tmp0, tmp1);
1431 tmp2 = __lsx_vshuf_b(src1, src2, shuf2);
1432 DUP4_ARG3(__lsx_vshuf_b, alpha, src0, shuf3, alpha, tmp0, shuf3, alpha,
1433 tmp1, shuf3, alpha, tmp2, shuf3, dst0, dst1, dst2, dst3);
1434 __lsx_vst(dst0, dst_argb, 0);
1435 __lsx_vst(dst1, dst_argb, 16);
1436 __lsx_vst(dst2, dst_argb, 32);
1437 __lsx_vst(dst3, dst_argb, 48);
1438 dst_argb += 64;
1439 src_rgb24 += 48;
1440 }
1441 }
1442
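// Expand RAW (R, G, B byte order) to ARGB, swapping R and B and inserting
// alpha of 0xFF; 16 pixels per loop.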
1443 void RAWToARGBRow_LSX(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
1444 int x;
1445 int len = width / 16;
1446 __m128i src0, src1, src2;
1447 __m128i tmp0, tmp1, tmp2;
1448 __m128i dst0, dst1, dst2, dst3;
1449 __m128i alpha = __lsx_vldi(0xFF);
1450 __m128i shuf0 = {0x131211100F0E0D0C, 0x1B1A191817161514};
1451 __m128i shuf1 = {0x1F1E1D1C1B1A1918, 0x0706050403020100};
1452 __m128i shuf2 = {0x0B0A090807060504, 0x131211100F0E0D0C};
1453 __m128i shuf3 = {0x1003040510000102, 0x10090A0B10060708};
1454
1455 for (x = 0; x < len; x++) {
1456 src0 = __lsx_vld(src_raw, 0);
1457 src1 = __lsx_vld(src_raw, 16);
1458 src2 = __lsx_vld(src_raw, 32);
1459 DUP2_ARG3(__lsx_vshuf_b, src1, src0, shuf0, src1, src2, shuf1, tmp0, tmp1);
1460 tmp2 = __lsx_vshuf_b(src1, src2, shuf2);
1461 DUP4_ARG3(__lsx_vshuf_b, alpha, src0, shuf3, alpha, tmp0, shuf3, alpha,
1462 tmp1, shuf3, alpha, tmp2, shuf3, dst0, dst1, dst2, dst3);
1463 __lsx_vst(dst0, dst_argb, 0);
1464 __lsx_vst(dst1, dst_argb, 16);
1465 __lsx_vst(dst2, dst_argb, 32);
1466 __lsx_vst(dst3, dst_argb, 48);
1467 dst_argb += 64;
1468 src_raw += 48;
1469 }
1470 }
1471
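// Convert ARGB1555 directly to luma: Y = (66 * R + 129 * G + 25 * B + 0x1080) >> 8,
// 16 pixels per loop.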
1472 void ARGB1555ToYRow_LSX(const uint8_t* src_argb1555,
1473 uint8_t* dst_y,
1474 int width) {
1475 int x;
1476 int len = width / 16;
1477 __m128i src0, src1;
1478 __m128i tmp0, tmp1, tmpb, tmpg, tmpr;
1479 __m128i reg0, reg1, reg2, dst0;
1480 __m128i const_66 = __lsx_vldi(66);
1481 __m128i const_129 = __lsx_vldi(129);
1482 __m128i const_25 = __lsx_vldi(25);
1483 __m128i const_1080 = {0x1080108010801080, 0x1080108010801080};
1484
1485 for (x = 0; x < len; x++) {
1486 src0 = __lsx_vld(src_argb1555, 0);
1487 src1 = __lsx_vld(src_argb1555, 16);
1488 tmp0 = __lsx_vpickev_b(src1, src0);
1489 tmp1 = __lsx_vpickod_b(src1, src0);
1490 tmpb = __lsx_vandi_b(tmp0, 0x1F);
1491 tmpg = __lsx_vsrli_b(tmp0, 5);
1492 reg0 = __lsx_vandi_b(tmp1, 0x03);
1493 reg0 = __lsx_vslli_b(reg0, 3);
1494 tmpg = __lsx_vor_v(tmpg, reg0);
1495 reg1 = __lsx_vandi_b(tmp1, 0x7C);
1496 tmpr = __lsx_vsrli_b(reg1, 2);
1497 reg0 = __lsx_vslli_b(tmpb, 3);
1498 reg1 = __lsx_vslli_b(tmpg, 3);
1499 reg2 = __lsx_vslli_b(tmpr, 3);
1500 tmpb = __lsx_vsrli_b(tmpb, 2);
1501 tmpg = __lsx_vsrli_b(tmpg, 2);
1502 tmpr = __lsx_vsrli_b(tmpr, 2);
1503 tmpb = __lsx_vor_v(reg0, tmpb);
1504 tmpg = __lsx_vor_v(reg1, tmpg);
1505 tmpr = __lsx_vor_v(reg2, tmpr);
1506 reg0 = __lsx_vmaddwev_h_bu(const_1080, tmpb, const_25);
1507 reg1 = __lsx_vmaddwod_h_bu(const_1080, tmpb, const_25);
1508 reg0 = __lsx_vmaddwev_h_bu(reg0, tmpg, const_129);
1509 reg1 = __lsx_vmaddwod_h_bu(reg1, tmpg, const_129);
1510 reg0 = __lsx_vmaddwev_h_bu(reg0, tmpr, const_66);
1511 reg1 = __lsx_vmaddwod_h_bu(reg1, tmpr, const_66);
1512 dst0 = __lsx_vpackod_b(reg1, reg0);
1513 __lsx_vst(dst0, dst_y, 0);
1514 dst_y += 16;
1515 src_argb1555 += 32;
1516 }
1517 }
1518
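// Convert two ARGB1555 rows to subsampled U and V (2x2 box average),
// 16 pixels per loop.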
1519 void ARGB1555ToUVRow_LSX(const uint8_t* src_argb1555,
1520 int src_stride_argb1555,
1521 uint8_t* dst_u,
1522 uint8_t* dst_v,
1523 int width) {
1524 int x;
1525 int len = width / 16;
1526 const uint8_t* next_argb1555 = src_argb1555 + src_stride_argb1555;
1527 __m128i src0, src1, src2, src3;
1528 __m128i tmp0, tmp1, tmp2, tmp3;
1529 __m128i tmpb, tmpg, tmpr, nexb, nexg, nexr;
1530 __m128i reg0, reg1, reg2, reg3, dst0;
1531 __m128i const_112 = __lsx_vldi(0x438);
1532 __m128i const_74 = __lsx_vldi(0x425);
1533 __m128i const_38 = __lsx_vldi(0x413);
1534 __m128i const_94 = __lsx_vldi(0x42F);
1535 __m128i const_18 = __lsx_vldi(0x409);
1536 __m128i const_8080 = (__m128i)v2u64{0x8080808080808080, 0x8080808080808080};
1537
1538 for (x = 0; x < len; x++) {
1539 DUP4_ARG2(__lsx_vld, src_argb1555, 0, src_argb1555, 16, next_argb1555, 0,
1540 next_argb1555, 16, src0, src1, src2, src3);
1541 DUP2_ARG2(__lsx_vpickev_b, src1, src0, src3, src2, tmp0, tmp2);
1542 DUP2_ARG2(__lsx_vpickod_b, src1, src0, src3, src2, tmp1, tmp3);
1543 tmpb = __lsx_vandi_b(tmp0, 0x1F);
1544 nexb = __lsx_vandi_b(tmp2, 0x1F);
1545 tmpg = __lsx_vsrli_b(tmp0, 5);
1546 nexg = __lsx_vsrli_b(tmp2, 5);
1547 reg0 = __lsx_vandi_b(tmp1, 0x03);
1548 reg2 = __lsx_vandi_b(tmp3, 0x03);
1549 reg0 = __lsx_vslli_b(reg0, 3);
1550 reg2 = __lsx_vslli_b(reg2, 3);
1551 tmpg = __lsx_vor_v(tmpg, reg0);
1552 nexg = __lsx_vor_v(nexg, reg2);
1553 reg1 = __lsx_vandi_b(tmp1, 0x7C);
1554 reg3 = __lsx_vandi_b(tmp3, 0x7C);
1555 tmpr = __lsx_vsrli_b(reg1, 2);
1556 nexr = __lsx_vsrli_b(reg3, 2);
1557 reg0 = __lsx_vslli_b(tmpb, 3);
1558 reg1 = __lsx_vslli_b(tmpg, 3);
1559 reg2 = __lsx_vslli_b(tmpr, 3);
1560 tmpb = __lsx_vsrli_b(tmpb, 2);
1561 tmpg = __lsx_vsrli_b(tmpg, 2);
1562 tmpr = __lsx_vsrli_b(tmpr, 2);
1563 tmpb = __lsx_vor_v(reg0, tmpb);
1564 tmpg = __lsx_vor_v(reg1, tmpg);
1565 tmpr = __lsx_vor_v(reg2, tmpr);
1566 reg0 = __lsx_vslli_b(nexb, 3);
1567 reg1 = __lsx_vslli_b(nexg, 3);
1568 reg2 = __lsx_vslli_b(nexr, 3);
1569 nexb = __lsx_vsrli_b(nexb, 2);
1570 nexg = __lsx_vsrli_b(nexg, 2);
1571 nexr = __lsx_vsrli_b(nexr, 2);
1572 nexb = __lsx_vor_v(reg0, nexb);
1573 nexg = __lsx_vor_v(reg1, nexg);
1574 nexr = __lsx_vor_v(reg2, nexr);
1575 RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, dst0);
1576 __lsx_vstelm_d(dst0, dst_u, 0, 0);
1577 __lsx_vstelm_d(dst0, dst_v, 0, 1);
1578 dst_u += 8;
1579 dst_v += 8;
1580 src_argb1555 += 32;
1581 next_argb1555 += 32;
1582 }
1583 }
1584
void RGB565ToYRow_LSX(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
  int x;
  int len = width / 16;
  __m128i src0, src1;
  __m128i tmp0, tmp1, tmpb, tmpg, tmpr;
  __m128i reg0, reg1, dst0;
  __m128i const_66 = __lsx_vldi(66);
  __m128i const_129 = __lsx_vldi(129);
  __m128i const_25 = __lsx_vldi(25);
  __m128i const_1080 = {0x1080108010801080, 0x1080108010801080};

  for (x = 0; x < len; x++) {
    src0 = __lsx_vld(src_rgb565, 0);
    src1 = __lsx_vld(src_rgb565, 16);
    tmp0 = __lsx_vpickev_b(src1, src0);
    tmp1 = __lsx_vpickod_b(src1, src0);
    tmpb = __lsx_vandi_b(tmp0, 0x1F);
    tmpr = __lsx_vandi_b(tmp1, 0xF8);
    reg1 = __lsx_vandi_b(tmp1, 0x07);
    reg0 = __lsx_vsrli_b(tmp0, 5);
    reg1 = __lsx_vslli_b(reg1, 3);
    tmpg = __lsx_vor_v(reg1, reg0);
    reg0 = __lsx_vslli_b(tmpb, 3);
    reg1 = __lsx_vsrli_b(tmpb, 2);
    tmpb = __lsx_vor_v(reg1, reg0);
    reg0 = __lsx_vslli_b(tmpg, 2);
    reg1 = __lsx_vsrli_b(tmpg, 4);
    tmpg = __lsx_vor_v(reg1, reg0);
    reg0 = __lsx_vsrli_b(tmpr, 5);
    tmpr = __lsx_vor_v(tmpr, reg0);
    reg0 = __lsx_vmaddwev_h_bu(const_1080, tmpb, const_25);
    reg1 = __lsx_vmaddwod_h_bu(const_1080, tmpb, const_25);
    reg0 = __lsx_vmaddwev_h_bu(reg0, tmpg, const_129);
    reg1 = __lsx_vmaddwod_h_bu(reg1, tmpg, const_129);
    reg0 = __lsx_vmaddwev_h_bu(reg0, tmpr, const_66);
    reg1 = __lsx_vmaddwod_h_bu(reg1, tmpr, const_66);
    dst0 = __lsx_vpackod_b(reg1, reg0);
    __lsx_vst(dst0, dst_y, 0);
    dst_y += 16;
    src_rgb565 += 32;
  }
}

void RGB565ToUVRow_LSX(const uint8_t* src_rgb565,
                       int src_stride_rgb565,
                       uint8_t* dst_u,
                       uint8_t* dst_v,
                       int width) {
  int x;
  int len = width / 16;
  const uint8_t* next_rgb565 = src_rgb565 + src_stride_rgb565;
  __m128i src0, src1, src2, src3;
  __m128i tmp0, tmp1, tmp2, tmp3;
  __m128i tmpb, tmpg, tmpr, nexb, nexg, nexr;
  __m128i reg0, reg1, reg2, reg3, dst0;
  __m128i const_112 = __lsx_vldi(0x438);
  __m128i const_74 = __lsx_vldi(0x425);
  __m128i const_38 = __lsx_vldi(0x413);
  __m128i const_94 = __lsx_vldi(0x42F);
  __m128i const_18 = __lsx_vldi(0x409);
  __m128i const_8080 = (__m128i)v2u64{0x8080808080808080, 0x8080808080808080};

  for (x = 0; x < len; x++) {
    DUP4_ARG2(__lsx_vld, src_rgb565, 0, src_rgb565, 16, next_rgb565, 0,
              next_rgb565, 16, src0, src1, src2, src3);
    DUP2_ARG2(__lsx_vpickev_b, src1, src0, src3, src2, tmp0, tmp2);
    DUP2_ARG2(__lsx_vpickod_b, src1, src0, src3, src2, tmp1, tmp3);
    tmpb = __lsx_vandi_b(tmp0, 0x1F);
    tmpr = __lsx_vandi_b(tmp1, 0xF8);
    nexb = __lsx_vandi_b(tmp2, 0x1F);
    nexr = __lsx_vandi_b(tmp3, 0xF8);
    reg1 = __lsx_vandi_b(tmp1, 0x07);
    reg3 = __lsx_vandi_b(tmp3, 0x07);
    reg0 = __lsx_vsrli_b(tmp0, 5);
    reg1 = __lsx_vslli_b(reg1, 3);
    reg2 = __lsx_vsrli_b(tmp2, 5);
    reg3 = __lsx_vslli_b(reg3, 3);
    tmpg = __lsx_vor_v(reg1, reg0);
    nexg = __lsx_vor_v(reg2, reg3);
    reg0 = __lsx_vslli_b(tmpb, 3);
    reg1 = __lsx_vsrli_b(tmpb, 2);
    reg2 = __lsx_vslli_b(nexb, 3);
    reg3 = __lsx_vsrli_b(nexb, 2);
    tmpb = __lsx_vor_v(reg1, reg0);
    nexb = __lsx_vor_v(reg2, reg3);
    reg0 = __lsx_vslli_b(tmpg, 2);
    reg1 = __lsx_vsrli_b(tmpg, 4);
    reg2 = __lsx_vslli_b(nexg, 2);
    reg3 = __lsx_vsrli_b(nexg, 4);
    tmpg = __lsx_vor_v(reg1, reg0);
    nexg = __lsx_vor_v(reg2, reg3);
    reg0 = __lsx_vsrli_b(tmpr, 5);
    reg2 = __lsx_vsrli_b(nexr, 5);
    tmpr = __lsx_vor_v(tmpr, reg0);
    nexr = __lsx_vor_v(nexr, reg2);
    RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, dst0);
    __lsx_vstelm_d(dst0, dst_u, 0, 0);
    __lsx_vstelm_d(dst0, dst_v, 0, 1);
    dst_u += 8;
    dst_v += 8;
    src_rgb565 += 32;
    next_rgb565 += 32;
  }
}

void RGB24ToUVRow_LSX(const uint8_t* src_rgb24,
                      int src_stride_rgb24,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  int x;
  const uint8_t* next_rgb24 = src_rgb24 + src_stride_rgb24;
  int len = width / 16;
  __m128i src0, src1, src2;
  __m128i nex0, nex1, nex2, dst0;
  __m128i tmpb, tmpg, tmpr, nexb, nexg, nexr;
  __m128i const_112 = __lsx_vldi(0x438);
  __m128i const_74 = __lsx_vldi(0x425);
  __m128i const_38 = __lsx_vldi(0x413);
  __m128i const_94 = __lsx_vldi(0x42F);
  __m128i const_18 = __lsx_vldi(0x409);
  __m128i const_8080 = (__m128i)v2u64{0x8080808080808080, 0x8080808080808080};
  __m128i shuff0_b = {0x15120F0C09060300, 0x00000000001E1B18};
  __m128i shuff1_b = {0x0706050403020100, 0x1D1A1714110A0908};
  __m128i shuff0_g = {0x1613100D0A070401, 0x00000000001F1C19};
  __m128i shuff1_g = {0x0706050403020100, 0x1E1B1815120A0908};
  __m128i shuff0_r = {0x1714110E0B080502, 0x0000000000001D1A};
  __m128i shuff1_r = {0x0706050403020100, 0x1F1C191613100908};

  for (x = 0; x < len; x++) {
    src0 = __lsx_vld(src_rgb24, 0);
    src1 = __lsx_vld(src_rgb24, 16);
    src2 = __lsx_vld(src_rgb24, 32);
    nex0 = __lsx_vld(next_rgb24, 0);
    nex1 = __lsx_vld(next_rgb24, 16);
    nex2 = __lsx_vld(next_rgb24, 32);
    DUP2_ARG3(__lsx_vshuf_b, src1, src0, shuff0_b, nex1, nex0, shuff0_b, tmpb,
              nexb);
    DUP2_ARG3(__lsx_vshuf_b, src1, src0, shuff0_g, nex1, nex0, shuff0_g, tmpg,
              nexg);
    DUP2_ARG3(__lsx_vshuf_b, src1, src0, shuff0_r, nex1, nex0, shuff0_r, tmpr,
              nexr);
    DUP2_ARG3(__lsx_vshuf_b, src2, tmpb, shuff1_b, nex2, nexb, shuff1_b, tmpb,
              nexb);
    DUP2_ARG3(__lsx_vshuf_b, src2, tmpg, shuff1_g, nex2, nexg, shuff1_g, tmpg,
              nexg);
    DUP2_ARG3(__lsx_vshuf_b, src2, tmpr, shuff1_r, nex2, nexr, shuff1_r, tmpr,
              nexr);
    RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, dst0);
    __lsx_vstelm_d(dst0, dst_u, 0, 0);
    __lsx_vstelm_d(dst0, dst_v, 0, 1);
    dst_u += 8;
    dst_v += 8;
    src_rgb24 += 48;
    next_rgb24 += 48;
  }
}

void RAWToUVRow_LSX(const uint8_t* src_raw,
                    int src_stride_raw,
                    uint8_t* dst_u,
                    uint8_t* dst_v,
                    int width) {
  int x;
  const uint8_t* next_raw = src_raw + src_stride_raw;
  int len = width / 16;
  __m128i src0, src1, src2;
  __m128i nex0, nex1, nex2, dst0;
  __m128i tmpb, tmpg, tmpr, nexb, nexg, nexr;
  __m128i const_112 = __lsx_vldi(0x438);
  __m128i const_74 = __lsx_vldi(0x425);
  __m128i const_38 = __lsx_vldi(0x413);
  __m128i const_94 = __lsx_vldi(0x42F);
  __m128i const_18 = __lsx_vldi(0x409);
  __m128i const_8080 = (__m128i)v2u64{0x8080808080808080, 0x8080808080808080};
  __m128i shuff0_r = {0x15120F0C09060300, 0x00000000001E1B18};
  __m128i shuff1_r = {0x0706050403020100, 0x1D1A1714110A0908};
  __m128i shuff0_g = {0x1613100D0A070401, 0x00000000001F1C19};
  __m128i shuff1_g = {0x0706050403020100, 0x1E1B1815120A0908};
  __m128i shuff0_b = {0x1714110E0B080502, 0x0000000000001D1A};
  __m128i shuff1_b = {0x0706050403020100, 0x1F1C191613100908};

  for (x = 0; x < len; x++) {
    src0 = __lsx_vld(src_raw, 0);
    src1 = __lsx_vld(src_raw, 16);
    src2 = __lsx_vld(src_raw, 32);
    nex0 = __lsx_vld(next_raw, 0);
    nex1 = __lsx_vld(next_raw, 16);
    nex2 = __lsx_vld(next_raw, 32);
    DUP2_ARG3(__lsx_vshuf_b, src1, src0, shuff0_b, nex1, nex0, shuff0_b, tmpb,
              nexb);
    DUP2_ARG3(__lsx_vshuf_b, src1, src0, shuff0_g, nex1, nex0, shuff0_g, tmpg,
              nexg);
    DUP2_ARG3(__lsx_vshuf_b, src1, src0, shuff0_r, nex1, nex0, shuff0_r, tmpr,
              nexr);
    DUP2_ARG3(__lsx_vshuf_b, src2, tmpb, shuff1_b, nex2, nexb, shuff1_b, tmpb,
              nexb);
    DUP2_ARG3(__lsx_vshuf_b, src2, tmpg, shuff1_g, nex2, nexg, shuff1_g, tmpg,
              nexg);
    DUP2_ARG3(__lsx_vshuf_b, src2, tmpr, shuff1_r, nex2, nexr, shuff1_r, tmpr,
              nexr);
    RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, dst0);
    __lsx_vstelm_d(dst0, dst_u, 0, 0);
    __lsx_vstelm_d(dst0, dst_v, 0, 1);
    dst_u += 8;
    dst_v += 8;
    src_raw += 48;
    next_raw += 48;
  }
}

void NV12ToARGBRow_LSX(const uint8_t* src_y,
                       const uint8_t* src_uv,
                       uint8_t* dst_argb,
                       const struct YuvConstants* yuvconstants,
                       int width) {
  int x;
  int len = width / 8;
  __m128i vec_y, vec_vu;
  __m128i vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb;
  __m128i vec_vrub, vec_vgug;
  __m128i out_b, out_g, out_r;
  __m128i const_80 = __lsx_vldi(0x480);
  __m128i alpha = __lsx_vldi(0xFF);
  __m128i zero = __lsx_vldi(0);

  YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb);
  vec_vrub = __lsx_vilvl_h(vec_vr, vec_ub);
  vec_vgug = __lsx_vilvl_h(vec_vg, vec_ug);

  for (x = 0; x < len; x++) {
    vec_y = __lsx_vld(src_y, 0);
    vec_vu = __lsx_vld(src_uv, 0);
    YUVTORGB(vec_y, vec_vu, vec_vrub, vec_vgug, vec_yg, vec_yb, out_b, out_g,
             out_r);
    STOREARGB(alpha, out_r, out_g, out_b, dst_argb);
    src_y += 8;
    src_uv += 8;
  }
}

void NV12ToRGB565Row_LSX(const uint8_t* src_y,
                         const uint8_t* src_uv,
                         uint8_t* dst_rgb565,
                         const struct YuvConstants* yuvconstants,
                         int width) {
  int x;
  int len = width / 8;
  __m128i vec_y, vec_vu;
  __m128i vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb;
  __m128i vec_vrub, vec_vgug;
  __m128i out_b, out_g, out_r;
  __m128i const_80 = __lsx_vldi(0x480);
  __m128i zero = __lsx_vldi(0);

  YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb);
  vec_vrub = __lsx_vilvl_h(vec_vr, vec_ub);
  vec_vgug = __lsx_vilvl_h(vec_vg, vec_ug);

  for (x = 0; x < len; x++) {
    vec_y = __lsx_vld(src_y, 0);
    vec_vu = __lsx_vld(src_uv, 0);
    YUVTORGB(vec_y, vec_vu, vec_vrub, vec_vgug, vec_yg, vec_yb, out_b, out_g,
             out_r);
    out_b = __lsx_vsrli_h(out_b, 3);
    out_g = __lsx_vsrli_h(out_g, 2);
    out_r = __lsx_vsrli_h(out_r, 3);
    out_g = __lsx_vslli_h(out_g, 5);
    out_r = __lsx_vslli_h(out_r, 11);
    out_r = __lsx_vor_v(out_r, out_g);
    out_r = __lsx_vor_v(out_r, out_b);
    __lsx_vst(out_r, dst_rgb565, 0);
    src_y += 8;
    src_uv += 8;
    dst_rgb565 += 16;
  }
}

void NV21ToARGBRow_LSX(const uint8_t* src_y,
                       const uint8_t* src_vu,
                       uint8_t* dst_argb,
                       const struct YuvConstants* yuvconstants,
                       int width) {
  int x;
  int len = width / 8;
  __m128i vec_y, vec_uv;
  __m128i vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb;
  __m128i vec_ubvr, vec_ugvg;
  __m128i out_b, out_g, out_r;
  __m128i const_80 = __lsx_vldi(0x480);
  __m128i alpha = __lsx_vldi(0xFF);
  __m128i zero = __lsx_vldi(0);

  YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb);
  vec_ubvr = __lsx_vilvl_h(vec_ub, vec_vr);
  vec_ugvg = __lsx_vilvl_h(vec_ug, vec_vg);

  for (x = 0; x < len; x++) {
    vec_y = __lsx_vld(src_y, 0);
    vec_uv = __lsx_vld(src_vu, 0);
    YUVTORGB(vec_y, vec_uv, vec_ubvr, vec_ugvg, vec_yg, vec_yb, out_r, out_g,
             out_b);
    STOREARGB(alpha, out_r, out_g, out_b, dst_argb);
    src_y += 8;
    src_vu += 8;
  }
}

void SobelRow_LSX(const uint8_t* src_sobelx,
                  const uint8_t* src_sobely,
                  uint8_t* dst_argb,
                  int width) {
  int x;
  int len = width / 16;
  __m128i src0, src1, tmp0;
  __m128i out0, out1, out2, out3;
  __m128i alpha = __lsx_vldi(0xFF);
  __m128i shuff0 = {0x1001010110000000, 0x1003030310020202};
  __m128i shuff1 = __lsx_vaddi_bu(shuff0, 0x04);
  __m128i shuff2 = __lsx_vaddi_bu(shuff1, 0x04);
  __m128i shuff3 = __lsx_vaddi_bu(shuff2, 0x04);

  for (x = 0; x < len; x++) {
    src0 = __lsx_vld(src_sobelx, 0);
    src1 = __lsx_vld(src_sobely, 0);
    tmp0 = __lsx_vsadd_bu(src0, src1);
    DUP4_ARG3(__lsx_vshuf_b, alpha, tmp0, shuff0, alpha, tmp0, shuff1, alpha,
              tmp0, shuff2, alpha, tmp0, shuff3, out0, out1, out2, out3);
    __lsx_vst(out0, dst_argb, 0);
    __lsx_vst(out1, dst_argb, 16);
    __lsx_vst(out2, dst_argb, 32);
    __lsx_vst(out3, dst_argb, 48);
    src_sobelx += 16;
    src_sobely += 16;
    dst_argb += 64;
  }
}

void SobelToPlaneRow_LSX(const uint8_t* src_sobelx,
                         const uint8_t* src_sobely,
                         uint8_t* dst_y,
                         int width) {
  int x;
  int len = width / 32;
  __m128i src0, src1, src2, src3, dst0, dst1;

  for (x = 0; x < len; x++) {
    DUP2_ARG2(__lsx_vld, src_sobelx, 0, src_sobelx, 16, src0, src1);
    DUP2_ARG2(__lsx_vld, src_sobely, 0, src_sobely, 16, src2, src3);
    dst0 = __lsx_vsadd_bu(src0, src2);
    dst1 = __lsx_vsadd_bu(src1, src3);
    __lsx_vst(dst0, dst_y, 0);
    __lsx_vst(dst1, dst_y, 16);
    src_sobelx += 32;
    src_sobely += 32;
    dst_y += 32;
  }
}

void SobelXYRow_LSX(const uint8_t* src_sobelx,
                    const uint8_t* src_sobely,
                    uint8_t* dst_argb,
                    int width) {
  int x;
  int len = width / 16;
  __m128i src_r, src_b, src_g;
  __m128i tmp0, tmp1, tmp2, tmp3;
  __m128i dst0, dst1, dst2, dst3;
  __m128i alpha = __lsx_vldi(0xFF);

  for (x = 0; x < len; x++) {
    src_r = __lsx_vld(src_sobelx, 0);
    src_b = __lsx_vld(src_sobely, 0);
    src_g = __lsx_vsadd_bu(src_r, src_b);
    tmp0 = __lsx_vilvl_b(src_g, src_b);
    tmp1 = __lsx_vilvh_b(src_g, src_b);
    tmp2 = __lsx_vilvl_b(alpha, src_r);
    tmp3 = __lsx_vilvh_b(alpha, src_r);
    dst0 = __lsx_vilvl_h(tmp2, tmp0);
    dst1 = __lsx_vilvh_h(tmp2, tmp0);
    dst2 = __lsx_vilvl_h(tmp3, tmp1);
    dst3 = __lsx_vilvh_h(tmp3, tmp1);
    __lsx_vst(dst0, dst_argb, 0);
    __lsx_vst(dst1, dst_argb, 16);
    __lsx_vst(dst2, dst_argb, 32);
    __lsx_vst(dst3, dst_argb, 48);
    src_sobelx += 16;
    src_sobely += 16;
    dst_argb += 64;
  }
}

void BGRAToUVRow_LSX(const uint8_t* src_bgra,
                     int src_stride_bgra,
                     uint8_t* dst_u,
                     uint8_t* dst_v,
                     int width) {
  int x;
  const uint8_t* next_bgra = src_bgra + src_stride_bgra;
  int len = width / 16;
  __m128i src0, src1, src2, src3;
  __m128i nex0, nex1, nex2, nex3;
  __m128i tmp0, tmp1, tmp2, tmp3, dst0;
  __m128i tmpb, tmpg, tmpr, nexb, nexg, nexr;
  __m128i const_112 = __lsx_vldi(0x438);
  __m128i const_74 = __lsx_vldi(0x425);
  __m128i const_38 = __lsx_vldi(0x413);
  __m128i const_94 = __lsx_vldi(0x42F);
  __m128i const_18 = __lsx_vldi(0x409);
  __m128i const_8080 = (__m128i)v2u64{0x8080808080808080, 0x8080808080808080};

  for (x = 0; x < len; x++) {
    DUP4_ARG2(__lsx_vld, src_bgra, 0, src_bgra, 16, src_bgra, 32, src_bgra, 48,
              src0, src1, src2, src3);
    DUP4_ARG2(__lsx_vld, next_bgra, 0, next_bgra, 16, next_bgra, 32, next_bgra,
              48, nex0, nex1, nex2, nex3);
    tmp0 = __lsx_vpickod_b(src1, src0);
    tmp1 = __lsx_vpickev_b(src1, src0);
    tmp2 = __lsx_vpickod_b(src3, src2);
    tmp3 = __lsx_vpickev_b(src3, src2);
    tmpb = __lsx_vpickod_b(tmp2, tmp0);
    tmpr = __lsx_vpickev_b(tmp2, tmp0);
    tmpg = __lsx_vpickod_b(tmp3, tmp1);
    tmp0 = __lsx_vpickod_b(nex1, nex0);
    tmp1 = __lsx_vpickev_b(nex1, nex0);
    tmp2 = __lsx_vpickod_b(nex3, nex2);
    tmp3 = __lsx_vpickev_b(nex3, nex2);
    nexb = __lsx_vpickod_b(tmp2, tmp0);
    nexr = __lsx_vpickev_b(tmp2, tmp0);
    nexg = __lsx_vpickod_b(tmp3, tmp1);
    RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, dst0);
    __lsx_vstelm_d(dst0, dst_u, 0, 0);
    __lsx_vstelm_d(dst0, dst_v, 0, 1);
    dst_u += 8;
    dst_v += 8;
    src_bgra += 64;
    next_bgra += 64;
  }
}

void ABGRToUVRow_LSX(const uint8_t* src_abgr,
                     int src_stride_abgr,
                     uint8_t* dst_u,
                     uint8_t* dst_v,
                     int width) {
  int x;
  const uint8_t* next_abgr = src_abgr + src_stride_abgr;
  int len = width / 16;
  __m128i src0, src1, src2, src3;
  __m128i nex0, nex1, nex2, nex3;
  __m128i tmp0, tmp1, tmp2, tmp3, dst0;
  __m128i tmpb, tmpg, tmpr, nexb, nexg, nexr;
  __m128i const_112 = __lsx_vldi(0x438);
  __m128i const_74 = __lsx_vldi(0x425);
  __m128i const_38 = __lsx_vldi(0x413);
  __m128i const_94 = __lsx_vldi(0x42F);
  __m128i const_18 = __lsx_vldi(0x409);
  __m128i const_8080 = (__m128i)v2u64{0x8080808080808080, 0x8080808080808080};

  for (x = 0; x < len; x++) {
    DUP4_ARG2(__lsx_vld, src_abgr, 0, src_abgr, 16, src_abgr, 32, src_abgr, 48,
              src0, src1, src2, src3);
    DUP4_ARG2(__lsx_vld, next_abgr, 0, next_abgr, 16, next_abgr, 32, next_abgr,
              48, nex0, nex1, nex2, nex3);
    tmp0 = __lsx_vpickev_b(src1, src0);
    tmp1 = __lsx_vpickod_b(src1, src0);
    tmp2 = __lsx_vpickev_b(src3, src2);
    tmp3 = __lsx_vpickod_b(src3, src2);
    tmpb = __lsx_vpickod_b(tmp2, tmp0);
    tmpr = __lsx_vpickev_b(tmp2, tmp0);
    tmpg = __lsx_vpickev_b(tmp3, tmp1);
    tmp0 = __lsx_vpickev_b(nex1, nex0);
    tmp1 = __lsx_vpickod_b(nex1, nex0);
    tmp2 = __lsx_vpickev_b(nex3, nex2);
    tmp3 = __lsx_vpickod_b(nex3, nex2);
    nexb = __lsx_vpickod_b(tmp2, tmp0);
    nexr = __lsx_vpickev_b(tmp2, tmp0);
    nexg = __lsx_vpickev_b(tmp3, tmp1);
    RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, dst0);
    __lsx_vstelm_d(dst0, dst_u, 0, 0);
    __lsx_vstelm_d(dst0, dst_v, 0, 1);
    dst_u += 8;
    dst_v += 8;
    src_abgr += 64;
    next_abgr += 64;
  }
}

void RGBAToUVRow_LSX(const uint8_t* src_rgba,
                     int src_stride_rgba,
                     uint8_t* dst_u,
                     uint8_t* dst_v,
                     int width) {
  int x;
  const uint8_t* next_rgba = src_rgba + src_stride_rgba;
  int len = width / 16;
  __m128i src0, src1, src2, src3;
  __m128i nex0, nex1, nex2, nex3;
  __m128i tmp0, tmp1, tmp2, tmp3, dst0;
  __m128i tmpb, tmpg, tmpr, nexb, nexg, nexr;
  __m128i const_112 = __lsx_vldi(0x438);
  __m128i const_74 = __lsx_vldi(0x425);
  __m128i const_38 = __lsx_vldi(0x413);
  __m128i const_94 = __lsx_vldi(0x42F);
  __m128i const_18 = __lsx_vldi(0x409);
  __m128i const_8080 = (__m128i)v2u64{0x8080808080808080, 0x8080808080808080};

  for (x = 0; x < len; x++) {
    DUP4_ARG2(__lsx_vld, src_rgba, 0, src_rgba, 16, src_rgba, 32, src_rgba, 48,
              src0, src1, src2, src3);
    DUP4_ARG2(__lsx_vld, next_rgba, 0, next_rgba, 16, next_rgba, 32, next_rgba,
              48, nex0, nex1, nex2, nex3);
    tmp0 = __lsx_vpickod_b(src1, src0);
    tmp1 = __lsx_vpickev_b(src1, src0);
    tmp2 = __lsx_vpickod_b(src3, src2);
    tmp3 = __lsx_vpickev_b(src3, src2);
    tmpr = __lsx_vpickod_b(tmp2, tmp0);
    tmpb = __lsx_vpickev_b(tmp2, tmp0);
    tmpg = __lsx_vpickod_b(tmp3, tmp1);
    tmp0 = __lsx_vpickod_b(nex1, nex0);
    tmp1 = __lsx_vpickev_b(nex1, nex0);
    tmp2 = __lsx_vpickod_b(nex3, nex2);
    tmp3 = __lsx_vpickev_b(nex3, nex2);
    nexr = __lsx_vpickod_b(tmp2, tmp0);
    nexb = __lsx_vpickev_b(tmp2, tmp0);
    nexg = __lsx_vpickod_b(tmp3, tmp1);
    RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, dst0);
    __lsx_vstelm_d(dst0, dst_u, 0, 0);
    __lsx_vstelm_d(dst0, dst_v, 0, 1);
    dst_u += 8;
    dst_v += 8;
    src_rgba += 64;
    next_rgba += 64;
  }
}

void ARGBToUVJRow_LSX(const uint8_t* src_argb,
                      int src_stride_argb,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  int x;
  const uint8_t* next_argb = src_argb + src_stride_argb;
  int len = width / 16;
  __m128i src0, src1, src2, src3;
  __m128i nex0, nex1, nex2, nex3;
  __m128i tmp0, tmp1, tmp2, tmp3;
  __m128i reg0, reg1, dst0;
  __m128i tmpb, tmpg, tmpr, nexb, nexg, nexr;
  __m128i const_63 = __lsx_vldi(0x43F);
  __m128i const_42 = __lsx_vldi(0x42A);
  __m128i const_21 = __lsx_vldi(0x415);
  __m128i const_53 = __lsx_vldi(0x435);
  __m128i const_10 = __lsx_vldi(0x40A);
  __m128i const_8080 = (__m128i)v2u64{0x8080808080808080, 0x8080808080808080};

  for (x = 0; x < len; x++) {
    DUP4_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src_argb, 32, src_argb, 48,
              src0, src1, src2, src3);
    DUP4_ARG2(__lsx_vld, next_argb, 0, next_argb, 16, next_argb, 32, next_argb,
              48, nex0, nex1, nex2, nex3);
    tmp0 = __lsx_vpickev_b(src1, src0);
    tmp1 = __lsx_vpickod_b(src1, src0);
    tmp2 = __lsx_vpickev_b(src3, src2);
    tmp3 = __lsx_vpickod_b(src3, src2);
    tmpr = __lsx_vpickod_b(tmp2, tmp0);
    tmpb = __lsx_vpickev_b(tmp2, tmp0);
    tmpg = __lsx_vpickev_b(tmp3, tmp1);
    tmp0 = __lsx_vpickev_b(nex1, nex0);
    tmp1 = __lsx_vpickod_b(nex1, nex0);
    tmp2 = __lsx_vpickev_b(nex3, nex2);
    tmp3 = __lsx_vpickod_b(nex3, nex2);
    nexr = __lsx_vpickod_b(tmp2, tmp0);
    nexb = __lsx_vpickev_b(tmp2, tmp0);
    nexg = __lsx_vpickev_b(tmp3, tmp1);
    tmp0 = __lsx_vaddwev_h_bu(tmpb, nexb);
    tmp1 = __lsx_vaddwod_h_bu(tmpb, nexb);
    tmp2 = __lsx_vaddwev_h_bu(tmpg, nexg);
    tmp3 = __lsx_vaddwod_h_bu(tmpg, nexg);
    reg0 = __lsx_vaddwev_h_bu(tmpr, nexr);
    reg1 = __lsx_vaddwod_h_bu(tmpr, nexr);
    tmpb = __lsx_vavgr_hu(tmp0, tmp1);
    tmpg = __lsx_vavgr_hu(tmp2, tmp3);
    tmpr = __lsx_vavgr_hu(reg0, reg1);
    reg0 = __lsx_vmadd_h(const_8080, const_63, tmpb);
    reg1 = __lsx_vmadd_h(const_8080, const_63, tmpr);
    reg0 = __lsx_vmsub_h(reg0, const_42, tmpg);
    reg1 = __lsx_vmsub_h(reg1, const_53, tmpg);
    reg0 = __lsx_vmsub_h(reg0, const_21, tmpr);
    reg1 = __lsx_vmsub_h(reg1, const_10, tmpb);
    dst0 = __lsx_vpickod_b(reg1, reg0);
    __lsx_vstelm_d(dst0, dst_u, 0, 0);
    __lsx_vstelm_d(dst0, dst_v, 0, 1);
    dst_u += 8;
    dst_v += 8;
    src_argb += 64;
    next_argb += 64;
  }
}

void I444ToARGBRow_LSX(const uint8_t* src_y,
                       const uint8_t* src_u,
                       const uint8_t* src_v,
                       uint8_t* dst_argb,
                       const struct YuvConstants* yuvconstants,
                       int width) {
  int x;
  int len = width / 16;
  __m128i vec_y, vec_u, vec_v, out_b, out_g, out_r;
  __m128i vec_yl, vec_yh, vec_ul, vec_vl, vec_uh, vec_vh;
  __m128i vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb, vec_ugvg;
  __m128i const_80 = __lsx_vldi(0x480);
  __m128i alpha = __lsx_vldi(0xFF);
  __m128i zero = __lsx_vldi(0);

  YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb);
  vec_ugvg = __lsx_vilvl_h(vec_ug, vec_vg);

  for (x = 0; x < len; x++) {
    vec_y = __lsx_vld(src_y, 0);
    vec_u = __lsx_vld(src_u, 0);
    vec_v = __lsx_vld(src_v, 0);
    vec_yl = __lsx_vilvl_b(vec_y, vec_y);
    vec_ul = __lsx_vilvl_b(zero, vec_u);
    vec_vl = __lsx_vilvl_b(zero, vec_v);
    I444TORGB(vec_yl, vec_ul, vec_vl, vec_ub, vec_vr, vec_ugvg, vec_yg, vec_yb,
              out_b, out_g, out_r);
    STOREARGB(alpha, out_r, out_g, out_b, dst_argb);
    vec_yh = __lsx_vilvh_b(vec_y, vec_y);
    vec_uh = __lsx_vilvh_b(zero, vec_u);
    vec_vh = __lsx_vilvh_b(zero, vec_v);
    I444TORGB(vec_yh, vec_uh, vec_vh, vec_ub, vec_vr, vec_ugvg, vec_yg, vec_yb,
              out_b, out_g, out_r);
    STOREARGB(alpha, out_r, out_g, out_b, dst_argb);
    src_y += 16;
    src_u += 16;
    src_v += 16;
  }
}

void I400ToARGBRow_LSX(const uint8_t* src_y,
                       uint8_t* dst_argb,
                       const struct YuvConstants* yuvconstants,
                       int width) {
  int x;
  int len = width / 16;
  __m128i vec_y, vec_yl, vec_yh, out0;
  __m128i y_ev, y_od, dst0, dst1, dst2, dst3;
  __m128i temp0, temp1;
  __m128i alpha = __lsx_vldi(0xFF);
  __m128i vec_yg = __lsx_vreplgr2vr_h(yuvconstants->kYToRgb[0]);
  __m128i vec_yb = __lsx_vreplgr2vr_w(yuvconstants->kYBiasToRgb[0]);

  for (x = 0; x < len; x++) {
    vec_y = __lsx_vld(src_y, 0);
    vec_yl = __lsx_vilvl_b(vec_y, vec_y);
    y_ev = __lsx_vmulwev_w_hu_h(vec_yl, vec_yg);
    y_od = __lsx_vmulwod_w_hu_h(vec_yl, vec_yg);
    y_ev = __lsx_vsrai_w(y_ev, 16);
    y_od = __lsx_vsrai_w(y_od, 16);
    y_ev = __lsx_vadd_w(y_ev, vec_yb);
    y_od = __lsx_vadd_w(y_od, vec_yb);
    y_ev = __lsx_vsrai_w(y_ev, 6);
    y_od = __lsx_vsrai_w(y_od, 6);
    y_ev = __lsx_vclip255_w(y_ev);
    y_od = __lsx_vclip255_w(y_od);
    out0 = __lsx_vpackev_h(y_od, y_ev);
    temp0 = __lsx_vpackev_b(out0, out0);
    temp1 = __lsx_vpackev_b(alpha, out0);
    dst0 = __lsx_vilvl_h(temp1, temp0);
    dst1 = __lsx_vilvh_h(temp1, temp0);
    vec_yh = __lsx_vilvh_b(vec_y, vec_y);
    y_ev = __lsx_vmulwev_w_hu_h(vec_yh, vec_yg);
    y_od = __lsx_vmulwod_w_hu_h(vec_yh, vec_yg);
    y_ev = __lsx_vsrai_w(y_ev, 16);
    y_od = __lsx_vsrai_w(y_od, 16);
    y_ev = __lsx_vadd_w(y_ev, vec_yb);
    y_od = __lsx_vadd_w(y_od, vec_yb);
    y_ev = __lsx_vsrai_w(y_ev, 6);
    y_od = __lsx_vsrai_w(y_od, 6);
    y_ev = __lsx_vclip255_w(y_ev);
    y_od = __lsx_vclip255_w(y_od);
    out0 = __lsx_vpackev_h(y_od, y_ev);
    temp0 = __lsx_vpackev_b(out0, out0);
    temp1 = __lsx_vpackev_b(alpha, out0);
    dst2 = __lsx_vilvl_h(temp1, temp0);
    dst3 = __lsx_vilvh_h(temp1, temp0);
    __lsx_vst(dst0, dst_argb, 0);
    __lsx_vst(dst1, dst_argb, 16);
    __lsx_vst(dst2, dst_argb, 32);
    __lsx_vst(dst3, dst_argb, 48);
    dst_argb += 64;
    src_y += 16;
  }
}

void J400ToARGBRow_LSX(const uint8_t* src_y, uint8_t* dst_argb, int width) {
  int x;
  int len = width / 16;
  __m128i vec_y, dst0, dst1, dst2, dst3;
  __m128i tmp0, tmp1, tmp2, tmp3;
  __m128i alpha = __lsx_vldi(0xFF);

  for (x = 0; x < len; x++) {
    vec_y = __lsx_vld(src_y, 0);
    tmp0 = __lsx_vilvl_b(vec_y, vec_y);
    tmp1 = __lsx_vilvh_b(vec_y, vec_y);
    tmp2 = __lsx_vilvl_b(alpha, vec_y);
    tmp3 = __lsx_vilvh_b(alpha, vec_y);
    dst0 = __lsx_vilvl_h(tmp2, tmp0);
    dst1 = __lsx_vilvh_h(tmp2, tmp0);
    dst2 = __lsx_vilvl_h(tmp3, tmp1);
    dst3 = __lsx_vilvh_h(tmp3, tmp1);
    __lsx_vst(dst0, dst_argb, 0);
    __lsx_vst(dst1, dst_argb, 16);
    __lsx_vst(dst2, dst_argb, 32);
    __lsx_vst(dst3, dst_argb, 48);
    dst_argb += 64;
    src_y += 16;
  }
}

void YUY2ToARGBRow_LSX(const uint8_t* src_yuy2,
                       uint8_t* dst_argb,
                       const struct YuvConstants* yuvconstants,
                       int width) {
  int x;
  int len = width / 8;
  __m128i src0, vec_y, vec_vu;
  __m128i vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb;
  __m128i vec_vrub, vec_vgug;
  __m128i out_b, out_g, out_r;
  __m128i const_80 = __lsx_vldi(0x480);
  __m128i zero = __lsx_vldi(0);
  __m128i alpha = __lsx_vldi(0xFF);

  YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb);
  vec_vrub = __lsx_vilvl_h(vec_vr, vec_ub);
  vec_vgug = __lsx_vilvl_h(vec_vg, vec_ug);

  for (x = 0; x < len; x++) {
    src0 = __lsx_vld(src_yuy2, 0);
    vec_y = __lsx_vpickev_b(src0, src0);
    vec_vu = __lsx_vpickod_b(src0, src0);
    YUVTORGB(vec_y, vec_vu, vec_vrub, vec_vgug, vec_yg, vec_yb, out_b, out_g,
             out_r);
    STOREARGB(alpha, out_r, out_g, out_b, dst_argb);
    src_yuy2 += 16;
  }
}

void UYVYToARGBRow_LSX(const uint8_t* src_uyvy,
                       uint8_t* dst_argb,
                       const struct YuvConstants* yuvconstants,
                       int width) {
  int x;
  int len = width / 8;
  __m128i src0, vec_y, vec_vu;
  __m128i vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb;
  __m128i vec_vrub, vec_vgug;
  __m128i out_b, out_g, out_r;
  __m128i const_80 = __lsx_vldi(0x480);
  __m128i zero = __lsx_vldi(0);
  __m128i alpha = __lsx_vldi(0xFF);

  YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb);
  vec_vrub = __lsx_vilvl_h(vec_vr, vec_ub);
  vec_vgug = __lsx_vilvl_h(vec_vg, vec_ug);

  for (x = 0; x < len; x++) {
    src0 = __lsx_vld(src_uyvy, 0);
    vec_y = __lsx_vpickod_b(src0, src0);
    vec_vu = __lsx_vpickev_b(src0, src0);
    YUVTORGB(vec_y, vec_vu, vec_vrub, vec_vgug, vec_yg, vec_yb, out_b, out_g,
             out_r);
    STOREARGB(alpha, out_r, out_g, out_b, dst_argb);
    src_uyvy += 16;
  }
}

void InterpolateRow_LSX(uint8_t* dst_ptr,
                        const uint8_t* src_ptr,
                        ptrdiff_t src_stride,
                        int width,
                        int32_t source_y_fraction) {
  int x;
  int y1_fraction = source_y_fraction;
  int y0_fraction = 256 - y1_fraction;
  const uint8_t* nex_ptr = src_ptr + src_stride;
  uint16_t y_fractions;
  int len = width / 32;
  __m128i src0, src1, nex0, nex1;
  __m128i dst0, dst1, y_frac;
  __m128i tmp0, tmp1, tmp2, tmp3;
  __m128i const_128 = __lsx_vldi(0x480);

  if (y1_fraction == 0) {
    for (x = 0; x < len; x++) {
      DUP2_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src0, src1);
      __lsx_vst(src0, dst_ptr, 0);
      __lsx_vst(src1, dst_ptr, 16);
      src_ptr += 32;
      dst_ptr += 32;
    }
    return;
  }

  if (y1_fraction == 128) {
    for (x = 0; x < len; x++) {
      DUP2_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src0, src1);
      DUP2_ARG2(__lsx_vld, nex_ptr, 0, nex_ptr, 16, nex0, nex1);
      dst0 = __lsx_vavgr_bu(src0, nex0);
      dst1 = __lsx_vavgr_bu(src1, nex1);
      __lsx_vst(dst0, dst_ptr, 0);
      __lsx_vst(dst1, dst_ptr, 16);
      src_ptr += 32;
      nex_ptr += 32;
      dst_ptr += 32;
    }
    return;
  }

  y_fractions = (uint16_t)(y0_fraction + (y1_fraction << 8));
  y_frac = __lsx_vreplgr2vr_h(y_fractions);

  for (x = 0; x < len; x++) {
    DUP2_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src0, src1);
    DUP2_ARG2(__lsx_vld, nex_ptr, 0, nex_ptr, 16, nex0, nex1);
    tmp0 = __lsx_vilvl_b(nex0, src0);
    tmp1 = __lsx_vilvh_b(nex0, src0);
    tmp2 = __lsx_vilvl_b(nex1, src1);
    tmp3 = __lsx_vilvh_b(nex1, src1);
    tmp0 = __lsx_vdp2add_h_bu(const_128, tmp0, y_frac);
    tmp1 = __lsx_vdp2add_h_bu(const_128, tmp1, y_frac);
    tmp2 = __lsx_vdp2add_h_bu(const_128, tmp2, y_frac);
    tmp3 = __lsx_vdp2add_h_bu(const_128, tmp3, y_frac);
    dst0 = __lsx_vsrlni_b_h(tmp1, tmp0, 8);
    dst1 = __lsx_vsrlni_b_h(tmp3, tmp2, 8);
    __lsx_vst(dst0, dst_ptr, 0);
    __lsx_vst(dst1, dst_ptr, 16);
    src_ptr += 32;
    nex_ptr += 32;
    dst_ptr += 32;
  }
}

void ARGBSetRow_LSX(uint8_t* dst_argb, uint32_t v32, int width) {
  int x;
  int len = width / 4;
  __m128i dst0 = __lsx_vreplgr2vr_w(v32);

  for (x = 0; x < len; x++) {
    __lsx_vst(dst0, dst_argb, 0);
    dst_argb += 16;
  }
}

void RAWToRGB24Row_LSX(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
  int x;
  int len = width / 16;
  __m128i src0, src1, src2;
  __m128i dst0, dst1, dst2;
  __m128i shuf0 = {0x0708030405000102, 0x110C0D0E090A0B06};
  __m128i shuf1 = {0x1516171213140F10, 0x1F1E1B1C1D18191A};
  __m128i shuf2 = {0x090405060102031E, 0x0D0E0F0A0B0C0708};

  for (x = 0; x < len; x++) {
    DUP2_ARG2(__lsx_vld, src_raw, 0, src_raw, 16, src0, src1);
    src2 = __lsx_vld(src_raw, 32);
    DUP2_ARG3(__lsx_vshuf_b, src1, src0, shuf0, src1, src0, shuf1, dst0, dst1);
    dst2 = __lsx_vshuf_b(src1, src2, shuf2);
    dst1 = __lsx_vinsgr2vr_b(dst1, src_raw[32], 0x0E);
    __lsx_vst(dst0, dst_rgb24, 0);
    __lsx_vst(dst1, dst_rgb24, 16);
    __lsx_vst(dst2, dst_rgb24, 32);
    dst_rgb24 += 48;
    src_raw += 48;
  }
}

void MergeUVRow_LSX(const uint8_t* src_u,
                    const uint8_t* src_v,
                    uint8_t* dst_uv,
                    int width) {
  int x;
  int len = width / 16;
  __m128i src0, src1, dst0, dst1;

  for (x = 0; x < len; x++) {
    DUP2_ARG2(__lsx_vld, src_u, 0, src_v, 0, src0, src1);
    dst0 = __lsx_vilvl_b(src1, src0);
    dst1 = __lsx_vilvh_b(src1, src0);
    __lsx_vst(dst0, dst_uv, 0);
    __lsx_vst(dst1, dst_uv, 16);
    src_u += 16;
    src_v += 16;
    dst_uv += 32;
  }
}

void ARGBExtractAlphaRow_LSX(const uint8_t* src_argb,
                             uint8_t* dst_a,
                             int width) {
  int x;
  int len = width / 16;
  __m128i src0, src1, src2, src3, tmp0, tmp1, dst0;

  for (x = 0; x < len; x++) {
    DUP4_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src_argb, 32, src_argb, 48,
              src0, src1, src2, src3);
    tmp0 = __lsx_vpickod_b(src1, src0);
    tmp1 = __lsx_vpickod_b(src3, src2);
    dst0 = __lsx_vpickod_b(tmp1, tmp0);
    __lsx_vst(dst0, dst_a, 0);
    src_argb += 64;
    dst_a += 16;
  }
}

void ARGBBlendRow_LSX(const uint8_t* src_argb,
                      const uint8_t* src_argb1,
                      uint8_t* dst_argb,
                      int width) {
  int x;
  int len = width / 8;
  __m128i src0, src1, src2, src3;
  __m128i tmp0, tmp1, dst0, dst1;
  __m128i reg0, reg1, reg2, reg3;
  __m128i a0, a1, a2, a3;
  __m128i const_256 = __lsx_vldi(0x500);
  __m128i zero = __lsx_vldi(0);
  __m128i alpha = __lsx_vldi(0xFF);
  __m128i control = (__m128i)v2u64{0xFF000000FF000000, 0xFF000000FF000000};

  for (x = 0; x < len; x++) {
    DUP4_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src_argb1, 0, src_argb1, 16,
              src0, src1, src2, src3);
    tmp0 = __lsx_vshuf4i_b(src0, 0xFF);
    tmp1 = __lsx_vshuf4i_b(src1, 0xFF);
    a0 = __lsx_vilvl_b(zero, tmp0);
    a1 = __lsx_vilvh_b(zero, tmp0);
    a2 = __lsx_vilvl_b(zero, tmp1);
    a3 = __lsx_vilvh_b(zero, tmp1);
    reg0 = __lsx_vilvl_b(zero, src2);
    reg1 = __lsx_vilvh_b(zero, src2);
    reg2 = __lsx_vilvl_b(zero, src3);
    reg3 = __lsx_vilvh_b(zero, src3);
    DUP4_ARG2(__lsx_vsub_h, const_256, a0, const_256, a1, const_256, a2,
              const_256, a3, a0, a1, a2, a3);
    DUP4_ARG2(__lsx_vmul_h, a0, reg0, a1, reg1, a2, reg2, a3, reg3, reg0, reg1,
              reg2, reg3);
    DUP2_ARG3(__lsx_vsrani_b_h, reg1, reg0, 8, reg3, reg2, 8, dst0, dst1);
    dst0 = __lsx_vsadd_bu(dst0, src0);
    dst1 = __lsx_vsadd_bu(dst1, src1);
    dst0 = __lsx_vbitsel_v(dst0, alpha, control);
    dst1 = __lsx_vbitsel_v(dst1, alpha, control);
    __lsx_vst(dst0, dst_argb, 0);
    __lsx_vst(dst1, dst_argb, 16);
    src_argb += 32;
    src_argb1 += 32;
    dst_argb += 32;
  }
}

void ARGBQuantizeRow_LSX(uint8_t* dst_argb,
                         int scale,
                         int interval_size,
                         int interval_offset,
                         int width) {
  int x;
  int len = width / 16;
  __m128i src0, src1, src2, src3, dst0, dst1, dst2, dst3;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
  __m128i vec_size = __lsx_vreplgr2vr_b(interval_size);
  __m128i vec_offset = __lsx_vreplgr2vr_b(interval_offset);
  __m128i vec_scale = __lsx_vreplgr2vr_w(scale);
  __m128i zero = __lsx_vldi(0);
  __m128i control = (__m128i)v2u64{0xFF000000FF000000, 0xFF000000FF000000};

  for (x = 0; x < len; x++) {
    DUP4_ARG2(__lsx_vld, dst_argb, 0, dst_argb, 16, dst_argb, 32, dst_argb, 48,
              src0, src1, src2, src3);
    reg0 = __lsx_vilvl_b(zero, src0);
    reg1 = __lsx_vilvh_b(zero, src0);
    reg2 = __lsx_vilvl_b(zero, src1);
    reg3 = __lsx_vilvh_b(zero, src1);
    reg4 = __lsx_vilvl_b(zero, src2);
    reg5 = __lsx_vilvh_b(zero, src2);
    reg6 = __lsx_vilvl_b(zero, src3);
    reg7 = __lsx_vilvh_b(zero, src3);
    tmp0 = __lsx_vilvl_h(zero, reg0);
    tmp1 = __lsx_vilvh_h(zero, reg0);
    tmp2 = __lsx_vilvl_h(zero, reg1);
    tmp3 = __lsx_vilvh_h(zero, reg1);
    tmp4 = __lsx_vilvl_h(zero, reg2);
    tmp5 = __lsx_vilvh_h(zero, reg2);
    tmp6 = __lsx_vilvl_h(zero, reg3);
    tmp7 = __lsx_vilvh_h(zero, reg3);
    DUP4_ARG2(__lsx_vmul_w, tmp0, vec_scale, tmp1, vec_scale, tmp2, vec_scale,
              tmp3, vec_scale, tmp0, tmp1, tmp2, tmp3);
    DUP4_ARG2(__lsx_vmul_w, tmp4, vec_scale, tmp5, vec_scale, tmp6, vec_scale,
              tmp7, vec_scale, tmp4, tmp5, tmp6, tmp7);
    DUP4_ARG3(__lsx_vsrani_h_w, tmp1, tmp0, 16, tmp3, tmp2, 16, tmp5, tmp4, 16,
              tmp7, tmp6, 16, reg0, reg1, reg2, reg3);
    dst0 = __lsx_vpickev_b(reg1, reg0);
    dst1 = __lsx_vpickev_b(reg3, reg2);
    tmp0 = __lsx_vilvl_h(zero, reg4);
    tmp1 = __lsx_vilvh_h(zero, reg4);
    tmp2 = __lsx_vilvl_h(zero, reg5);
    tmp3 = __lsx_vilvh_h(zero, reg5);
    tmp4 = __lsx_vilvl_h(zero, reg6);
    tmp5 = __lsx_vilvh_h(zero, reg6);
    tmp6 = __lsx_vilvl_h(zero, reg7);
    tmp7 = __lsx_vilvh_h(zero, reg7);
    DUP4_ARG2(__lsx_vmul_w, tmp0, vec_scale, tmp1, vec_scale, tmp2, vec_scale,
              tmp3, vec_scale, tmp0, tmp1, tmp2, tmp3);
    DUP4_ARG2(__lsx_vmul_w, tmp4, vec_scale, tmp5, vec_scale, tmp6, vec_scale,
              tmp7, vec_scale, tmp4, tmp5, tmp6, tmp7);
    DUP4_ARG3(__lsx_vsrani_h_w, tmp1, tmp0, 16, tmp3, tmp2, 16, tmp5, tmp4, 16,
              tmp7, tmp6, 16, reg0, reg1, reg2, reg3);
    dst2 = __lsx_vpickev_b(reg1, reg0);
    dst3 = __lsx_vpickev_b(reg3, reg2);
    DUP4_ARG2(__lsx_vmul_b, dst0, vec_size, dst1, vec_size, dst2, vec_size,
              dst3, vec_size, dst0, dst1, dst2, dst3);
    DUP4_ARG2(__lsx_vadd_b, dst0, vec_offset, dst1, vec_offset, dst2,
              vec_offset, dst3, vec_offset, dst0, dst1, dst2, dst3);
    DUP4_ARG3(__lsx_vbitsel_v, dst0, src0, control, dst1, src1, control, dst2,
              src2, control, dst3, src3, control, dst0, dst1, dst2, dst3);
    __lsx_vst(dst0, dst_argb, 0);
    __lsx_vst(dst1, dst_argb, 16);
    __lsx_vst(dst2, dst_argb, 32);
    __lsx_vst(dst3, dst_argb, 48);
    dst_argb += 64;
  }
}

void ARGBColorMatrixRow_LSX(const uint8_t* src_argb,
                            uint8_t* dst_argb,
                            const int8_t* matrix_argb,
                            int width) {
  int x;
  int len = width / 8;
  __m128i src0, src1, tmp0, tmp1, dst0, dst1;
  __m128i tmp_b, tmp_g, tmp_r, tmp_a;
  __m128i reg_b, reg_g, reg_r, reg_a;
  __m128i matrix_b = __lsx_vldrepl_w(matrix_argb, 0);
  __m128i matrix_g = __lsx_vldrepl_w(matrix_argb, 4);
  __m128i matrix_r = __lsx_vldrepl_w(matrix_argb, 8);
  __m128i matrix_a = __lsx_vldrepl_w(matrix_argb, 12);

  for (x = 0; x < len; x++) {
    DUP2_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src0, src1);
    DUP4_ARG2(__lsx_vdp2_h_bu_b, src0, matrix_b, src0, matrix_g, src0, matrix_r,
              src0, matrix_a, tmp_b, tmp_g, tmp_r, tmp_a);
    DUP4_ARG2(__lsx_vdp2_h_bu_b, src1, matrix_b, src1, matrix_g, src1, matrix_r,
              src1, matrix_a, reg_b, reg_g, reg_r, reg_a);
    DUP4_ARG2(__lsx_vhaddw_w_h, tmp_b, tmp_b, tmp_g, tmp_g, tmp_r, tmp_r, tmp_a,
              tmp_a, tmp_b, tmp_g, tmp_r, tmp_a);
    DUP4_ARG2(__lsx_vhaddw_w_h, reg_b, reg_b, reg_g, reg_g, reg_r, reg_r, reg_a,
              reg_a, reg_b, reg_g, reg_r, reg_a);
    DUP4_ARG2(__lsx_vsrai_w, tmp_b, 6, tmp_g, 6, tmp_r, 6, tmp_a, 6, tmp_b,
              tmp_g, tmp_r, tmp_a);
    DUP4_ARG2(__lsx_vsrai_w, reg_b, 6, reg_g, 6, reg_r, 6, reg_a, 6, reg_b,
              reg_g, reg_r, reg_a);
    DUP4_ARG1(__lsx_vclip255_w, tmp_b, tmp_g, tmp_r, tmp_a, tmp_b, tmp_g, tmp_r,
              tmp_a)
    DUP4_ARG1(__lsx_vclip255_w, reg_b, reg_g, reg_r, reg_a, reg_b, reg_g, reg_r,
              reg_a)
    DUP4_ARG2(__lsx_vpickev_h, reg_b, tmp_b, reg_g, tmp_g, reg_r, tmp_r, reg_a,
              tmp_a, tmp_b, tmp_g, tmp_r, tmp_a);
    tmp0 = __lsx_vpackev_b(tmp_g, tmp_b);
    tmp1 = __lsx_vpackev_b(tmp_a, tmp_r);
    dst0 = __lsx_vilvl_h(tmp1, tmp0);
    dst1 = __lsx_vilvh_h(tmp1, tmp0);
    __lsx_vst(dst0, dst_argb, 0);
    __lsx_vst(dst1, dst_argb, 16);
    src_argb += 32;
    dst_argb += 32;
  }
}

void SplitUVRow_LSX(const uint8_t* src_uv,
                    uint8_t* dst_u,
                    uint8_t* dst_v,
                    int width) {
  int x;
  int len = width / 32;
  __m128i src0, src1, src2, src3;
  __m128i dst0, dst1, dst2, dst3;

  for (x = 0; x < len; x++) {
    DUP4_ARG2(__lsx_vld, src_uv, 0, src_uv, 16, src_uv, 32, src_uv, 48, src0,
              src1, src2, src3);
    DUP2_ARG2(__lsx_vpickev_b, src1, src0, src3, src2, dst0, dst1);
    DUP2_ARG2(__lsx_vpickod_b, src1, src0, src3, src2, dst2, dst3);
    __lsx_vst(dst0, dst_u, 0);
    __lsx_vst(dst1, dst_u, 16);
    __lsx_vst(dst2, dst_v, 0);
    __lsx_vst(dst3, dst_v, 16);
    src_uv += 64;
    dst_u += 32;
    dst_v += 32;
  }
}

void SetRow_LSX(uint8_t* dst, uint8_t v8, int width) {
  int x;
  int len = width / 16;
  __m128i dst0 = __lsx_vreplgr2vr_b(v8);

  for (x = 0; x < len; x++) {
    __lsx_vst(dst0, dst, 0);
    dst += 16;
  }
}

void MirrorSplitUVRow_LSX(const uint8_t* src_uv,
                          uint8_t* dst_u,
                          uint8_t* dst_v,
                          int width) {
  int x;
  int len = width / 32;
  __m128i src0, src1, src2, src3;
  __m128i dst0, dst1, dst2, dst3;
  __m128i shuff0 = {0x10121416181A1C1E, 0x00020406080A0C0E};
  __m128i shuff1 = {0x11131517191B1D1F, 0x01030507090B0D0F};

  src_uv += (width << 1);
  for (x = 0; x < len; x++) {
    src_uv -= 64;
    DUP4_ARG2(__lsx_vld, src_uv, 0, src_uv, 16, src_uv, 32, src_uv, 48, src2,
              src3, src0, src1);
    DUP4_ARG3(__lsx_vshuf_b, src1, src0, shuff1, src3, src2, shuff1, src1, src0,
              shuff0, src3, src2, shuff0, dst0, dst1, dst2, dst3);
    __lsx_vst(dst0, dst_v, 0);
    __lsx_vst(dst1, dst_v, 16);
    __lsx_vst(dst2, dst_u, 0);
    __lsx_vst(dst3, dst_u, 16);
    dst_u += 32;
    dst_v += 32;
  }
}

void HalfFloatRow_LSX(const uint16_t* src,
                      uint16_t* dst,
                      float scale,
                      int width) {
  int x;
  int len = width / 32;
  float mult = 1.9259299444e-34f * scale;
  __m128i src0, src1, src2, src3, dst0, dst1, dst2, dst3;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  __m128 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
  __m128 vec_mult = (__m128)__lsx_vldrepl_w(&mult, 0);
  __m128i zero = __lsx_vldi(0);

  for (x = 0; x < len; x++) {
    DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2,
              src3);
    DUP4_ARG2(__lsx_vilvl_h, zero, src0, zero, src1, zero, src2, zero, src3,
              tmp0, tmp2, tmp4, tmp6);
    DUP4_ARG2(__lsx_vilvh_h, zero, src0, zero, src1, zero, src2, zero, src3,
              tmp1, tmp3, tmp5, tmp7);
    DUP4_ARG1(__lsx_vffint_s_wu, tmp0, tmp2, tmp4, tmp6, reg0, reg2, reg4,
              reg6);
    DUP4_ARG1(__lsx_vffint_s_wu, tmp1, tmp3, tmp5, tmp7, reg1, reg3, reg5,
              reg7);
    DUP4_ARG2(__lsx_vfmul_s, reg0, vec_mult, reg1, vec_mult, reg2, vec_mult,
              reg3, vec_mult, reg0, reg1, reg2, reg3);
    DUP4_ARG2(__lsx_vfmul_s, reg4, vec_mult, reg5, vec_mult, reg6, vec_mult,
              reg7, vec_mult, reg4, reg5, reg6, reg7);
    DUP4_ARG2(__lsx_vsrli_w, (v4u32)reg0, 13, (v4u32)reg1, 13, (v4u32)reg2, 13,
              (v4u32)reg3, 13, tmp0, tmp1, tmp2, tmp3);
    DUP4_ARG2(__lsx_vsrli_w, (v4u32)reg4, 13, (v4u32)reg5, 13, (v4u32)reg6, 13,
              (v4u32)reg7, 13, tmp4, tmp5, tmp6, tmp7);
    DUP4_ARG2(__lsx_vpickev_h, tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6,
              dst0, dst1, dst2, dst3);
    __lsx_vst(dst0, dst, 0);
    __lsx_vst(dst1, dst, 16);
    __lsx_vst(dst2, dst, 32);
    __lsx_vst(dst3, dst, 48);
    src += 32;
    dst += 32;
  }
}

struct RgbConstants {
  uint8_t kRGBToY[4];
  uint16_t kAddY;
  uint16_t pad;
};

// RGB to JPEG coefficients
// B * 0.1140 coefficient = 29
// G * 0.5870 coefficient = 150
// R * 0.2990 coefficient = 77
// Add 0.5 = 0x80
static const struct RgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0},
                                                        128,
                                                        0};

static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128, 0};

// RGB to BT.601 coefficients
// B * 0.1016 coefficient = 25
// G * 0.5078 coefficient = 129
// R * 0.2578 coefficient = 66
// Add 16.5 = 0x1080

static const struct RgbConstants kRgb24I601Constants = {{25, 129, 66, 0},
                                                        0x1080,
                                                        0};

static const struct RgbConstants kRawI601Constants = {{66, 129, 25, 0},
                                                      0x1080,
                                                      0};

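// Illustrative scalar reference for the Y computation below (not part of the
// library API; a minimal sketch, assuming the same 8.8 fixed-point scheme the
// LSX assembly uses). Each output byte is the high byte of
//   kAddY + kRGBToY[0] * c0 + kRGBToY[1] * c1 + kRGBToY[2] * c2
// where c0..c2 are the first three color channels in memory order (B, G, R
// for ARGB). For kRgb24I601Constants this is (25*B + 129*G + 66*R + 0x1080)
// >> 8. The function name is hypothetical and exists only for illustration.
static inline uint8_t ScalarRGBToYReference(uint8_t c0,
                                            uint8_t c1,
                                            uint8_t c2,
                                            const struct RgbConstants* k) {
  uint32_t y = (uint32_t)k->kAddY + (uint32_t)k->kRGBToY[0] * c0 +
               (uint32_t)k->kRGBToY[1] * c1 + (uint32_t)k->kRGBToY[2] * c2;
  return (uint8_t)(y >> 8);  // take the high byte, matching vpickod.b
}
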
// ARGB expects the first 3 values to contain RGB and the 4th value (alpha) to
// be ignored.
static void ARGBToYMatrixRow_LSX(const uint8_t* src_argb,
                                 uint8_t* dst_y,
                                 int width,
                                 const struct RgbConstants* rgbconstants) {
  asm volatile(
      "vldrepl.b      $vr0,  %3,    0        \n\t"  // load rgbconstants
      "vldrepl.b      $vr1,  %3,    1        \n\t"  // load rgbconstants
      "vldrepl.b      $vr2,  %3,    2        \n\t"  // load rgbconstants
      "vldrepl.h      $vr3,  %3,    4        \n\t"  // load rgbconstants
      "1:                                    \n\t"
      "vld            $vr4,  %0,    0        \n\t"
      "vld            $vr5,  %0,    16       \n\t"
      "vld            $vr6,  %0,    32       \n\t"
      "vld            $vr7,  %0,    48       \n\t"  // load 16 pixels of ARGB
      "vor.v          $vr12, $vr3,  $vr3     \n\t"
      "vor.v          $vr13, $vr3,  $vr3     \n\t"
      "addi.d         %2,    %2,    -16      \n\t"  // 16 processed per loop.
      "vpickev.b      $vr8,  $vr5,  $vr4     \n\t"  // BR
      "vpickev.b      $vr10, $vr7,  $vr6     \n\t"
      "vpickod.b      $vr9,  $vr5,  $vr4     \n\t"  // GA
      "vpickod.b      $vr11, $vr7,  $vr6     \n\t"
      "vmaddwev.h.bu  $vr12, $vr8,  $vr0     \n\t"  // B
      "vmaddwev.h.bu  $vr13, $vr10, $vr0     \n\t"
      "vmaddwev.h.bu  $vr12, $vr9,  $vr1     \n\t"  // G
      "vmaddwev.h.bu  $vr13, $vr11, $vr1     \n\t"
      "vmaddwod.h.bu  $vr12, $vr8,  $vr2     \n\t"  // R
      "vmaddwod.h.bu  $vr13, $vr10, $vr2     \n\t"
      "addi.d         %0,    %0,    64       \n\t"
      "vpickod.b      $vr10, $vr13, $vr12    \n\t"
      "vst            $vr10, %1,    0        \n\t"
      "addi.d         %1,    %1,    16       \n\t"
      "bnez           %2,    1b              \n\t"
      : "+&r"(src_argb),  // %0
        "+&r"(dst_y),     // %1
        "+&r"(width)      // %2
      : "r"(rgbconstants)
      : "memory");
}

void ARGBToYRow_LSX(const uint8_t* src_argb, uint8_t* dst_y, int width) {
  ARGBToYMatrixRow_LSX(src_argb, dst_y, width, &kRgb24I601Constants);
}

void ARGBToYJRow_LSX(const uint8_t* src_argb, uint8_t* dst_yj, int width) {
  ARGBToYMatrixRow_LSX(src_argb, dst_yj, width, &kRgb24JPEGConstants);
}

void ABGRToYRow_LSX(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
  ARGBToYMatrixRow_LSX(src_abgr, dst_y, width, &kRawI601Constants);
}

void ABGRToYJRow_LSX(const uint8_t* src_abgr, uint8_t* dst_yj, int width) {
  ARGBToYMatrixRow_LSX(src_abgr, dst_yj, width, &kRawJPEGConstants);
}

// RGBA expects the first value to be A (ignored), followed by 3 values
// containing RGB.
// Same code as ARGB, except the LD4
static void RGBAToYMatrixRow_LSX(const uint8_t* src_rgba,
                                 uint8_t* dst_y,
                                 int width,
                                 const struct RgbConstants* rgbconstants) {
  asm volatile(
      "vldrepl.b      $vr0,  %3,    0        \n\t"  // load rgbconstants
      "vldrepl.b      $vr1,  %3,    1        \n\t"  // load rgbconstants
      "vldrepl.b      $vr2,  %3,    2        \n\t"  // load rgbconstants
      "vldrepl.h      $vr3,  %3,    4        \n\t"  // load rgbconstants
      "1:                                    \n\t"
      "vld            $vr4,  %0,    0        \n\t"
      "vld            $vr5,  %0,    16       \n\t"
      "vld            $vr6,  %0,    32       \n\t"
      "vld            $vr7,  %0,    48       \n\t"  // load 16 pixels of RGBA
      "vor.v          $vr12, $vr3,  $vr3     \n\t"
      "vor.v          $vr13, $vr3,  $vr3     \n\t"
      "addi.d         %2,    %2,    -16      \n\t"  // 16 processed per loop.
      "vpickev.b      $vr8,  $vr5,  $vr4     \n\t"  // AG
      "vpickev.b      $vr10, $vr7,  $vr6     \n\t"
      "vpickod.b      $vr9,  $vr5,  $vr4     \n\t"  // BR
      "vpickod.b      $vr11, $vr7,  $vr6     \n\t"
      "vmaddwev.h.bu  $vr12, $vr9,  $vr0     \n\t"  // B
      "vmaddwev.h.bu  $vr13, $vr11, $vr0     \n\t"
      "vmaddwod.h.bu  $vr12, $vr8,  $vr1     \n\t"  // G
      "vmaddwod.h.bu  $vr13, $vr10, $vr1     \n\t"
      "vmaddwod.h.bu  $vr12, $vr9,  $vr2     \n\t"  // R
      "vmaddwod.h.bu  $vr13, $vr11, $vr2     \n\t"
      "addi.d         %0,    %0,    64       \n\t"
      "vpickod.b      $vr10, $vr13, $vr12    \n\t"
      "vst            $vr10, %1,    0        \n\t"
      "addi.d         %1,    %1,    16       \n\t"
      "bnez           %2,    1b              \n\t"
      : "+&r"(src_rgba),  // %0
        "+&r"(dst_y),     // %1
        "+&r"(width)      // %2
      : "r"(rgbconstants)
      : "memory");
}

void RGBAToYRow_LSX(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
  RGBAToYMatrixRow_LSX(src_rgba, dst_y, width, &kRgb24I601Constants);
}

void RGBAToYJRow_LSX(const uint8_t* src_rgba, uint8_t* dst_yj, int width) {
  RGBAToYMatrixRow_LSX(src_rgba, dst_yj, width, &kRgb24JPEGConstants);
}

void BGRAToYRow_LSX(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
  RGBAToYMatrixRow_LSX(src_bgra, dst_y, width, &kRawI601Constants);
}

static void RGBToYMatrixRow_LSX(const uint8_t* src_rgba,
                                uint8_t* dst_y,
                                int width,
                                const struct RgbConstants* rgbconstants) {
  int8_t shuff[64] = {0,  2,  3,  5,  6,  8,  9,  11, 12, 14, 15, 17, 18,
                      20, 21, 23, 24, 26, 27, 29, 30, 0,  1,  3,  4,  6,
                      7,  9,  10, 12, 13, 15, 1,  0,  4,  0,  7,  0,  10,
                      0,  13, 0,  16, 0,  19, 0,  22, 0,  25, 0,  28, 0,
                      31, 0,  2,  0,  5,  0,  8,  0,  11, 0,  14, 0};
  asm volatile(
      "vldrepl.b      $vr0,  %3,    0        \n\t"  // load rgbconstants
      "vldrepl.b      $vr1,  %3,    1        \n\t"  // load rgbconstants
      "vldrepl.b      $vr2,  %3,    2        \n\t"  // load rgbconstants
      "vldrepl.h      $vr3,  %3,    4        \n\t"  // load rgbconstants
      "vld            $vr4,  %4,    0        \n\t"  // load shuff
      "vld            $vr5,  %4,    16       \n\t"
      "vld            $vr6,  %4,    32       \n\t"
      "vld            $vr7,  %4,    48       \n\t"
      "1:                                    \n\t"
      "vld            $vr8,  %0,    0        \n\t"
      "vld            $vr9,  %0,    16       \n\t"
      "vld            $vr10, %0,    32       \n\t"  // load 16 pixels of RGB
      "vor.v          $vr12, $vr3,  $vr3     \n\t"
      "vor.v          $vr13, $vr3,  $vr3     \n\t"
      "addi.d         %2,    %2,    -16      \n\t"  // 16 processed per loop.
      "vshuf.b        $vr14, $vr9,  $vr8,  $vr4 \n\t"
      "vshuf.b        $vr15, $vr9,  $vr10, $vr5 \n\t"
      "vshuf.b        $vr16, $vr9,  $vr8,  $vr6 \n\t"
      "vshuf.b        $vr17, $vr9,  $vr10, $vr7 \n\t"
      "vmaddwev.h.bu  $vr12, $vr16, $vr1     \n\t"  // G
      "vmaddwev.h.bu  $vr13, $vr17, $vr1     \n\t"
      "vmaddwev.h.bu  $vr12, $vr14, $vr0     \n\t"  // B
      "vmaddwev.h.bu  $vr13, $vr15, $vr0     \n\t"
      "vmaddwod.h.bu  $vr12, $vr14, $vr2     \n\t"  // R
      "vmaddwod.h.bu  $vr13, $vr15, $vr2     \n\t"
      "addi.d         %0,    %0,    48       \n\t"
      "vpickod.b      $vr10, $vr13, $vr12    \n\t"
      "vst            $vr10, %1,    0        \n\t"
      "addi.d         %1,    %1,    16       \n\t"
      "bnez           %2,    1b              \n\t"
      : "+&r"(src_rgba),    // %0
        "+&r"(dst_y),       // %1
        "+&r"(width)        // %2
      : "r"(rgbconstants),  // %3
        "r"(shuff)          // %4
      : "memory");
}

void RGB24ToYJRow_LSX(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {
  RGBToYMatrixRow_LSX(src_rgb24, dst_yj, width, &kRgb24JPEGConstants);
}

void RAWToYJRow_LSX(const uint8_t* src_raw, uint8_t* dst_yj, int width) {
  RGBToYMatrixRow_LSX(src_raw, dst_yj, width, &kRawJPEGConstants);
}

void RGB24ToYRow_LSX(const uint8_t* src_rgb24, uint8_t* dst_y, int width) {
  RGBToYMatrixRow_LSX(src_rgb24, dst_y, width, &kRgb24I601Constants);
}

void RAWToYRow_LSX(const uint8_t* src_raw, uint8_t* dst_y, int width) {
  RGBToYMatrixRow_LSX(src_raw, dst_y, width, &kRawI601Constants);
}

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif

#endif  // !defined(LIBYUV_DISABLE_LSX) && defined(__loongarch_sx)