/*
 *  Copyright 2022 The LibYuv Project Authors. All rights reserved.
 *
 *  Copyright (c) 2022 Loongson Technology Corporation Limited
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
12*4e366538SXin Li
13*4e366538SXin Li #include "libyuv/rotate_row.h"
14*4e366538SXin Li
15*4e366538SXin Li #if !defined(LIBYUV_DISABLE_LSX) && defined(__loongarch_sx)
16*4e366538SXin Li #include "libyuv/loongson_intrinsics.h"
17*4e366538SXin Li
18*4e366538SXin Li #ifdef __cplusplus
19*4e366538SXin Li namespace libyuv {
20*4e366538SXin Li extern "C" {
21*4e366538SXin Li #endif
22*4e366538SXin Li
// Byte-granularity interleave of two vector pairs.
//   out0 = __lsx_vilvl_b(in1, in0)    out1 = __lsx_vilvh_b(in1, in0)
//   out2 = __lsx_vilvl_b(in3, in2)    out3 = __lsx_vilvh_b(in3, in2)
// DUP2_ARG2 comes from loongson_intrinsics.h; it is assumed to apply the
// given intrinsic to each argument pair and assign the two results.
// The body is wrapped in do/while(0) so that the expansion behaves as a
// single statement (safe in an unbraced if/else).
#define ILVLH_B(in0, in1, in2, in3, out0, out1, out2, out3)   \
  do {                                                        \
    DUP2_ARG2(__lsx_vilvl_b, in1, in0, in3, in2, out0, out2); \
    DUP2_ARG2(__lsx_vilvh_b, in1, in0, in3, in2, out1, out3); \
  } while (0)
28*4e366538SXin Li
// Halfword (16-bit) granularity interleave of two vector pairs.
//   out0 = __lsx_vilvl_h(in1, in0)    out1 = __lsx_vilvh_h(in1, in0)
//   out2 = __lsx_vilvl_h(in3, in2)    out3 = __lsx_vilvh_h(in3, in2)
// DUP2_ARG2 comes from loongson_intrinsics.h; it is assumed to apply the
// given intrinsic to each argument pair and assign the two results.
// The body is wrapped in do/while(0) so that the expansion behaves as a
// single statement (safe in an unbraced if/else).
#define ILVLH_H(in0, in1, in2, in3, out0, out1, out2, out3)   \
  do {                                                        \
    DUP2_ARG2(__lsx_vilvl_h, in1, in0, in3, in2, out0, out2); \
    DUP2_ARG2(__lsx_vilvh_h, in1, in0, in3, in2, out1, out3); \
  } while (0)
34*4e366538SXin Li
// Word (32-bit) granularity interleave of two vector pairs.
//   out0 = __lsx_vilvl_w(in1, in0)    out1 = __lsx_vilvh_w(in1, in0)
//   out2 = __lsx_vilvl_w(in3, in2)    out3 = __lsx_vilvh_w(in3, in2)
// DUP2_ARG2 comes from loongson_intrinsics.h; it is assumed to apply the
// given intrinsic to each argument pair and assign the two results.
// The body is wrapped in do/while(0) so that the expansion behaves as a
// single statement (safe in an unbraced if/else).
#define ILVLH_W(in0, in1, in2, in3, out0, out1, out2, out3)   \
  do {                                                        \
    DUP2_ARG2(__lsx_vilvl_w, in1, in0, in3, in2, out0, out2); \
    DUP2_ARG2(__lsx_vilvh_w, in1, in0, in3, in2, out1, out3); \
  } while (0)
40*4e366538SXin Li
// Doubleword (64-bit) granularity interleave of two vector pairs.
//   out0 = __lsx_vilvl_d(in1, in0)    out1 = __lsx_vilvh_d(in1, in0)
//   out2 = __lsx_vilvl_d(in3, in2)    out3 = __lsx_vilvh_d(in3, in2)
// DUP2_ARG2 comes from loongson_intrinsics.h; it is assumed to apply the
// given intrinsic to each argument pair and assign the two results.
// The body is wrapped in do/while(0) so that the expansion behaves as a
// single statement (safe in an unbraced if/else).
#define ILVLH_D(in0, in1, in2, in3, out0, out1, out2, out3)   \
  do {                                                        \
    DUP2_ARG2(__lsx_vilvl_d, in1, in0, in3, in2, out0, out2); \
    DUP2_ARG2(__lsx_vilvh_d, in1, in0, in3, in2, out1, out3); \
  } while (0)
46*4e366538SXin Li
// Stores four vectors at _dst, _dst + _stride, _dst + _stride2 and
// _dst + _stride3, then advances _dst by _stride4.
// _dst must be a modifiable pointer lvalue because the macro updates it.
// Wrapped in do/while(0) so the expansion behaves as a single statement;
// arguments are parenthesized so expression arguments expand safely.
#define LSX_ST_4(_dst0, _dst1, _dst2, _dst3, _dst, _stride, _stride2, \
                 _stride3, _stride4)                                  \
  do {                                                                \
    __lsx_vst((_dst0), (_dst), 0);                                    \
    __lsx_vstx((_dst1), (_dst), (_stride));                           \
    __lsx_vstx((_dst2), (_dst), (_stride2));                          \
    __lsx_vstx((_dst3), (_dst), (_stride3));                          \
    (_dst) += (_stride4);                                             \
  } while (0)
56*4e366538SXin Li
// Stores two vectors at _dst and _dst + _stride, then advances _dst by
// _stride2.
// _dst must be a modifiable pointer lvalue because the macro updates it.
// Wrapped in do/while(0) so the expansion behaves as a single statement;
// arguments are parenthesized so expression arguments expand safely.
#define LSX_ST_2(_dst0, _dst1, _dst, _stride, _stride2) \
  do {                                                  \
    __lsx_vst((_dst0), (_dst), 0);                      \
    __lsx_vstx((_dst1), (_dst), (_stride));             \
    (_dst) += (_stride2);                               \
  } while (0)
63*4e366538SXin Li
// 16-row wrapper built from two calls to the 8-row routine TransposeWx8_C:
// the first call receives (src, dst), the second receives the source
// advanced by 8 rows (8 * src_stride bytes) and the destination advanced
// by 8 bytes. Strides and width are forwarded unchanged to both calls.
void TransposeWx16_C(const uint8_t* src,
                     int src_stride,
                     uint8_t* dst,
                     int dst_stride,
                     int width) {
  const uint8_t* src_second_half = src + 8 * src_stride;
  uint8_t* dst_second_half = dst + 8;

  TransposeWx8_C(src, src_stride, dst, dst_stride, width);
  TransposeWx8_C(src_second_half, src_stride, dst_second_half, dst_stride,
                 width);
}
73*4e366538SXin Li
// 16-row wrapper built from two calls to the 8-row routine TransposeUVWx8_C:
// the first call receives (src, dst_a, dst_b), the second receives the
// source advanced by 8 rows (8 * src_stride bytes) and both destinations
// advanced by 8 bytes. Strides and width are forwarded unchanged.
void TransposeUVWx16_C(const uint8_t* src,
                       int src_stride,
                       uint8_t* dst_a,
                       int dst_stride_a,
                       uint8_t* dst_b,
                       int dst_stride_b,
                       int width) {
  const uint8_t* src_second_half = src + 8 * src_stride;
  uint8_t* dst_a_second_half = dst_a + 8;
  uint8_t* dst_b_second_half = dst_b + 8;

  TransposeUVWx8_C(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b,
                   width);
  TransposeUVWx8_C(src_second_half, src_stride, dst_a_second_half,
                   dst_stride_a, dst_b_second_half, dst_stride_b, width);
}
86*4e366538SXin Li
TransposeWx16_LSX(const uint8_t * src,int src_stride,uint8_t * dst,int dst_stride,int width)87*4e366538SXin Li void TransposeWx16_LSX(const uint8_t* src,
88*4e366538SXin Li int src_stride,
89*4e366538SXin Li uint8_t* dst,
90*4e366538SXin Li int dst_stride,
91*4e366538SXin Li int width) {
92*4e366538SXin Li int x;
93*4e366538SXin Li int len = width / 16;
94*4e366538SXin Li uint8_t* s;
95*4e366538SXin Li int src_stride2 = src_stride << 1;
96*4e366538SXin Li int src_stride3 = src_stride + src_stride2;
97*4e366538SXin Li int src_stride4 = src_stride2 << 1;
98*4e366538SXin Li int dst_stride2 = dst_stride << 1;
99*4e366538SXin Li int dst_stride3 = dst_stride + dst_stride2;
100*4e366538SXin Li int dst_stride4 = dst_stride2 << 1;
101*4e366538SXin Li __m128i src0, src1, src2, src3, dst0, dst1, dst2, dst3;
102*4e366538SXin Li __m128i tmp0, tmp1, tmp2, tmp3;
103*4e366538SXin Li __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
104*4e366538SXin Li __m128i res0, res1, res2, res3, res4, res5, res6, res7, res8, res9;
105*4e366538SXin Li
106*4e366538SXin Li for (x = 0; x < len; x++) {
107*4e366538SXin Li s = (uint8_t*)src;
108*4e366538SXin Li src0 = __lsx_vld(s, 0);
109*4e366538SXin Li src1 = __lsx_vldx(s, src_stride);
110*4e366538SXin Li src2 = __lsx_vldx(s, src_stride2);
111*4e366538SXin Li src3 = __lsx_vldx(s, src_stride3);
112*4e366538SXin Li s += src_stride4;
113*4e366538SXin Li ILVLH_B(src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3);
114*4e366538SXin Li ILVLH_H(tmp0, tmp2, tmp1, tmp3, reg0, reg1, reg2, reg3);
115*4e366538SXin Li src0 = __lsx_vld(s, 0);
116*4e366538SXin Li src1 = __lsx_vldx(s, src_stride);
117*4e366538SXin Li src2 = __lsx_vldx(s, src_stride2);
118*4e366538SXin Li src3 = __lsx_vldx(s, src_stride3);
119*4e366538SXin Li s += src_stride4;
120*4e366538SXin Li ILVLH_B(src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3);
121*4e366538SXin Li ILVLH_H(tmp0, tmp2, tmp1, tmp3, reg4, reg5, reg6, reg7);
122*4e366538SXin Li ILVLH_W(reg0, reg4, reg1, reg5, res0, res1, res2, res3);
123*4e366538SXin Li ILVLH_W(reg2, reg6, reg3, reg7, res4, res5, res6, res7);
124*4e366538SXin Li src0 = __lsx_vld(s, 0);
125*4e366538SXin Li src1 = __lsx_vldx(s, src_stride);
126*4e366538SXin Li src2 = __lsx_vldx(s, src_stride2);
127*4e366538SXin Li src3 = __lsx_vldx(s, src_stride3);
128*4e366538SXin Li s += src_stride4;
129*4e366538SXin Li ILVLH_B(src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3);
130*4e366538SXin Li ILVLH_H(tmp0, tmp2, tmp1, tmp3, reg0, reg1, reg2, reg3);
131*4e366538SXin Li src0 = __lsx_vld(s, 0);
132*4e366538SXin Li src1 = __lsx_vldx(s, src_stride);
133*4e366538SXin Li src2 = __lsx_vldx(s, src_stride2);
134*4e366538SXin Li src3 = __lsx_vldx(s, src_stride3);
135*4e366538SXin Li s += src_stride4;
136*4e366538SXin Li ILVLH_B(src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3);
137*4e366538SXin Li ILVLH_H(tmp0, tmp2, tmp1, tmp3, reg4, reg5, reg6, reg7);
138*4e366538SXin Li res8 = __lsx_vilvl_w(reg4, reg0);
139*4e366538SXin Li res9 = __lsx_vilvh_w(reg4, reg0);
140*4e366538SXin Li ILVLH_D(res0, res8, res1, res9, dst0, dst1, dst2, dst3);
141*4e366538SXin Li LSX_ST_4(dst0, dst1, dst2, dst3, dst, dst_stride, dst_stride2, dst_stride3,
142*4e366538SXin Li dst_stride4);
143*4e366538SXin Li res8 = __lsx_vilvl_w(reg5, reg1);
144*4e366538SXin Li res9 = __lsx_vilvh_w(reg5, reg1);
145*4e366538SXin Li ILVLH_D(res2, res8, res3, res9, dst0, dst1, dst2, dst3);
146*4e366538SXin Li LSX_ST_4(dst0, dst1, dst2, dst3, dst, dst_stride, dst_stride2, dst_stride3,
147*4e366538SXin Li dst_stride4);
148*4e366538SXin Li res8 = __lsx_vilvl_w(reg6, reg2);
149*4e366538SXin Li res9 = __lsx_vilvh_w(reg6, reg2);
150*4e366538SXin Li ILVLH_D(res4, res8, res5, res9, dst0, dst1, dst2, dst3);
151*4e366538SXin Li LSX_ST_4(dst0, dst1, dst2, dst3, dst, dst_stride, dst_stride2, dst_stride3,
152*4e366538SXin Li dst_stride4);
153*4e366538SXin Li res8 = __lsx_vilvl_w(reg7, reg3);
154*4e366538SXin Li res9 = __lsx_vilvh_w(reg7, reg3);
155*4e366538SXin Li ILVLH_D(res6, res8, res7, res9, dst0, dst1, dst2, dst3);
156*4e366538SXin Li LSX_ST_4(dst0, dst1, dst2, dst3, dst, dst_stride, dst_stride2, dst_stride3,
157*4e366538SXin Li dst_stride4);
158*4e366538SXin Li src += 16;
159*4e366538SXin Li }
160*4e366538SXin Li }
161*4e366538SXin Li
TransposeUVWx16_LSX(const uint8_t * src,int src_stride,uint8_t * dst_a,int dst_stride_a,uint8_t * dst_b,int dst_stride_b,int width)162*4e366538SXin Li void TransposeUVWx16_LSX(const uint8_t* src,
163*4e366538SXin Li int src_stride,
164*4e366538SXin Li uint8_t* dst_a,
165*4e366538SXin Li int dst_stride_a,
166*4e366538SXin Li uint8_t* dst_b,
167*4e366538SXin Li int dst_stride_b,
168*4e366538SXin Li int width) {
169*4e366538SXin Li int x;
170*4e366538SXin Li int len = width / 8;
171*4e366538SXin Li uint8_t* s;
172*4e366538SXin Li int src_stride2 = src_stride << 1;
173*4e366538SXin Li int src_stride3 = src_stride + src_stride2;
174*4e366538SXin Li int src_stride4 = src_stride2 << 1;
175*4e366538SXin Li int dst_stride_a2 = dst_stride_a << 1;
176*4e366538SXin Li int dst_stride_b2 = dst_stride_b << 1;
177*4e366538SXin Li __m128i src0, src1, src2, src3, dst0, dst1, dst2, dst3;
178*4e366538SXin Li __m128i tmp0, tmp1, tmp2, tmp3;
179*4e366538SXin Li __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
180*4e366538SXin Li __m128i res0, res1, res2, res3, res4, res5, res6, res7, res8, res9;
181*4e366538SXin Li
182*4e366538SXin Li for (x = 0; x < len; x++) {
183*4e366538SXin Li s = (uint8_t*)src;
184*4e366538SXin Li src0 = __lsx_vld(s, 0);
185*4e366538SXin Li src1 = __lsx_vldx(s, src_stride);
186*4e366538SXin Li src2 = __lsx_vldx(s, src_stride2);
187*4e366538SXin Li src3 = __lsx_vldx(s, src_stride3);
188*4e366538SXin Li s += src_stride4;
189*4e366538SXin Li ILVLH_B(src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3);
190*4e366538SXin Li ILVLH_H(tmp0, tmp2, tmp1, tmp3, reg0, reg1, reg2, reg3);
191*4e366538SXin Li src0 = __lsx_vld(s, 0);
192*4e366538SXin Li src1 = __lsx_vldx(s, src_stride);
193*4e366538SXin Li src2 = __lsx_vldx(s, src_stride2);
194*4e366538SXin Li src3 = __lsx_vldx(s, src_stride3);
195*4e366538SXin Li s += src_stride4;
196*4e366538SXin Li ILVLH_B(src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3);
197*4e366538SXin Li ILVLH_H(tmp0, tmp2, tmp1, tmp3, reg4, reg5, reg6, reg7);
198*4e366538SXin Li ILVLH_W(reg0, reg4, reg1, reg5, res0, res1, res2, res3);
199*4e366538SXin Li ILVLH_W(reg2, reg6, reg3, reg7, res4, res5, res6, res7);
200*4e366538SXin Li src0 = __lsx_vld(s, 0);
201*4e366538SXin Li src1 = __lsx_vldx(s, src_stride);
202*4e366538SXin Li src2 = __lsx_vldx(s, src_stride2);
203*4e366538SXin Li src3 = __lsx_vldx(s, src_stride3);
204*4e366538SXin Li s += src_stride4;
205*4e366538SXin Li ILVLH_B(src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3);
206*4e366538SXin Li ILVLH_H(tmp0, tmp2, tmp1, tmp3, reg0, reg1, reg2, reg3);
207*4e366538SXin Li src0 = __lsx_vld(s, 0);
208*4e366538SXin Li src1 = __lsx_vldx(s, src_stride);
209*4e366538SXin Li src2 = __lsx_vldx(s, src_stride2);
210*4e366538SXin Li src3 = __lsx_vldx(s, src_stride3);
211*4e366538SXin Li s += src_stride4;
212*4e366538SXin Li ILVLH_B(src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3);
213*4e366538SXin Li ILVLH_H(tmp0, tmp2, tmp1, tmp3, reg4, reg5, reg6, reg7);
214*4e366538SXin Li res8 = __lsx_vilvl_w(reg4, reg0);
215*4e366538SXin Li res9 = __lsx_vilvh_w(reg4, reg0);
216*4e366538SXin Li ILVLH_D(res0, res8, res1, res9, dst0, dst1, dst2, dst3);
217*4e366538SXin Li LSX_ST_2(dst0, dst2, dst_a, dst_stride_a, dst_stride_a2);
218*4e366538SXin Li LSX_ST_2(dst1, dst3, dst_b, dst_stride_b, dst_stride_b2);
219*4e366538SXin Li res8 = __lsx_vilvl_w(reg5, reg1);
220*4e366538SXin Li res9 = __lsx_vilvh_w(reg5, reg1);
221*4e366538SXin Li ILVLH_D(res2, res8, res3, res9, dst0, dst1, dst2, dst3);
222*4e366538SXin Li LSX_ST_2(dst0, dst2, dst_a, dst_stride_a, dst_stride_a2);
223*4e366538SXin Li LSX_ST_2(dst1, dst3, dst_b, dst_stride_b, dst_stride_b2);
224*4e366538SXin Li res8 = __lsx_vilvl_w(reg6, reg2);
225*4e366538SXin Li res9 = __lsx_vilvh_w(reg6, reg2);
226*4e366538SXin Li ILVLH_D(res4, res8, res5, res9, dst0, dst1, dst2, dst3);
227*4e366538SXin Li LSX_ST_2(dst0, dst2, dst_a, dst_stride_a, dst_stride_a2);
228*4e366538SXin Li LSX_ST_2(dst1, dst3, dst_b, dst_stride_b, dst_stride_b2);
229*4e366538SXin Li res8 = __lsx_vilvl_w(reg7, reg3);
230*4e366538SXin Li res9 = __lsx_vilvh_w(reg7, reg3);
231*4e366538SXin Li ILVLH_D(res6, res8, res7, res9, dst0, dst1, dst2, dst3);
232*4e366538SXin Li LSX_ST_2(dst0, dst2, dst_a, dst_stride_a, dst_stride_a2);
233*4e366538SXin Li LSX_ST_2(dst1, dst3, dst_b, dst_stride_b, dst_stride_b2);
234*4e366538SXin Li src += 16;
235*4e366538SXin Li }
236*4e366538SXin Li }
237*4e366538SXin Li
238*4e366538SXin Li #ifdef __cplusplus
239*4e366538SXin Li } // extern "C"
240*4e366538SXin Li } // namespace libyuv
241*4e366538SXin Li #endif
242*4e366538SXin Li
243*4e366538SXin Li #endif // !defined(LIBYUV_DISABLE_LSX) && defined(__loongarch_sx)
244