/*
 * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 *
 */

#ifndef VPX_VPX_UTIL_LOONGSON_INTRINSICS_H_
#define VPX_VPX_UTIL_LOONGSON_INTRINSICS_H_

/*
 * Copyright (c) 2021 Loongson Technology Corporation Limited
 * All rights reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 *
 * Contributed by Shiyou Yin <[email protected]>
 *                Xiwei Gu   <[email protected]>
 *                Lu Wang    <[email protected]>
 *
 * This file is a header file for loongarch builtin extension.
 *
 */

#ifndef LOONGSON_INTRINSICS_H
#define LOONGSON_INTRINSICS_H

/**
 * Version of this intrinsics helper header.
 *
 * MAJOR version: Macro usage changes.
 * MINOR version: Add new functions, or bug fixes.
 * MICRO version: Comment changes or implementation changes.
 */
#define LSOM_VERSION_MAJOR 1
#define LSOM_VERSION_MINOR 2
#define LSOM_VERSION_MICRO 1

/*
 * Apply the one-operand operation _INS to two independent inputs:
 *   _OUT0 = _INS(_IN0);
 *   _OUT1 = _INS(_IN1);
 * _INS may be a function or a function-like macro. The expansion is a
 * brace-enclosed block, so _OUT0/_OUT1 must be assignable lvalues and the
 * macro can only be used where a statement is allowed.
 */
#define DUP2_ARG1(_INS, _IN0, _IN1, _OUT0, _OUT1) \
  {                                               \
    _OUT0 = _INS(_IN0);                           \
    _OUT1 = _INS(_IN1);                           \
  }

/*
 * Apply the two-operand operation _INS to two independent operand pairs:
 *   _OUT0 = _INS(_IN0, _IN1);
 *   _OUT1 = _INS(_IN2, _IN3);
 * _INS may be a function or a function-like macro. The expansion is a
 * brace-enclosed block, so _OUT0/_OUT1 must be assignable lvalues and the
 * macro can only be used where a statement is allowed.
 */
#define DUP2_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1) \
  {                                                           \
    _OUT0 = _INS(_IN0, _IN1);                                 \
    _OUT1 = _INS(_IN2, _IN3);                                 \
  }

/*
 * Apply the three-operand operation _INS to two independent operand triples:
 *   _OUT0 = _INS(_IN0, _IN1, _IN2);
 *   _OUT1 = _INS(_IN3, _IN4, _IN5);
 * _INS may be a function or a function-like macro. The expansion is a
 * brace-enclosed block, so _OUT0/_OUT1 must be assignable lvalues and the
 * macro can only be used where a statement is allowed.
 */
#define DUP2_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _OUT0, _OUT1) \
  {                                                                       \
    _OUT0 = _INS(_IN0, _IN1, _IN2);                                       \
    _OUT1 = _INS(_IN3, _IN4, _IN5);                                       \
  }

/*
 * Apply the one-operand operation _INS to four independent inputs:
 *   _OUTn = _INS(_INn)   for n = 0..3
 * Implemented as two DUP2_ARG1 expansions (inputs 0-1, then inputs 2-3).
 */
#define DUP4_ARG1(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1, _OUT2, _OUT3) \
  {                                                                         \
    DUP2_ARG1(_INS, _IN0, _IN1, _OUT0, _OUT1);                              \
    DUP2_ARG1(_INS, _IN2, _IN3, _OUT2, _OUT3);                              \
  }

/*
 * Apply the two-operand operation _INS to four independent operand pairs:
 *   _OUT0 = _INS(_IN0, _IN1);   _OUT1 = _INS(_IN2, _IN3);
 *   _OUT2 = _INS(_IN4, _IN5);   _OUT3 = _INS(_IN6, _IN7);
 * Implemented as two DUP2_ARG2 expansions.
 */
#define DUP4_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _IN6, _IN7, _OUT0, \
                  _OUT1, _OUT2, _OUT3)                                         \
  {                                                                            \
    DUP2_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1);                     \
    DUP2_ARG2(_INS, _IN4, _IN5, _IN6, _IN7, _OUT2, _OUT3);                     \
  }

/*
 * Apply the three-operand operation _INS to four independent operand triples:
 *   _OUT0 = _INS(_IN0, _IN1, _IN2);    _OUT1 = _INS(_IN3, _IN4, _IN5);
 *   _OUT2 = _INS(_IN6, _IN7, _IN8);    _OUT3 = _INS(_IN9, _IN10, _IN11);
 * Implemented as two DUP2_ARG3 expansions.
 */
#define DUP4_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _IN6, _IN7, _IN8, \
                  _IN9, _IN10, _IN11, _OUT0, _OUT1, _OUT2, _OUT3)             \
  {                                                                           \
    DUP2_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _OUT0, _OUT1);        \
    DUP2_ARG3(_INS, _IN6, _IN7, _IN8, _IN9, _IN10, _IN11, _OUT2, _OUT3);      \
  }

#ifdef __loongarch_sx
#include <lsxintrin.h>
85*fb1b10abSAndroid Build Coastguard Worker /*
86*fb1b10abSAndroid Build Coastguard Worker * =============================================================================
87*fb1b10abSAndroid Build Coastguard Worker * Description : Dot product & addition of byte vector elements
88*fb1b10abSAndroid Build Coastguard Worker * Arguments : Inputs - in_c, in_h, in_l
89*fb1b10abSAndroid Build Coastguard Worker * Outputs - out
90*fb1b10abSAndroid Build Coastguard Worker * Return Type - halfword
91*fb1b10abSAndroid Build Coastguard Worker * Details : Signed byte elements from in_h are multiplied by
92*fb1b10abSAndroid Build Coastguard Worker * signed byte elements from in_l, and then added adjacent to
93*fb1b10abSAndroid Build Coastguard Worker * each other to get a result twice the size of input. Then
94*fb1b10abSAndroid Build Coastguard Worker * the results are added to signed half-word elements from in_c.
95*fb1b10abSAndroid Build Coastguard Worker * Example : out = __lsx_vdp2add_h_b(in_c, in_h, in_l)
96*fb1b10abSAndroid Build Coastguard Worker * in_c : 1,2,3,4, 1,2,3,4
97*fb1b10abSAndroid Build Coastguard Worker * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
98*fb1b10abSAndroid Build Coastguard Worker * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
99*fb1b10abSAndroid Build Coastguard Worker * out : 23,40,41,26, 23,40,41,26
100*fb1b10abSAndroid Build Coastguard Worker * =============================================================================
101*fb1b10abSAndroid Build Coastguard Worker */
__lsx_vdp2add_h_b(__m128i in_c,__m128i in_h,__m128i in_l)102*fb1b10abSAndroid Build Coastguard Worker static inline __m128i __lsx_vdp2add_h_b(__m128i in_c, __m128i in_h,
103*fb1b10abSAndroid Build Coastguard Worker __m128i in_l) {
104*fb1b10abSAndroid Build Coastguard Worker __m128i out;
105*fb1b10abSAndroid Build Coastguard Worker
106*fb1b10abSAndroid Build Coastguard Worker out = __lsx_vmaddwev_h_b(in_c, in_h, in_l);
107*fb1b10abSAndroid Build Coastguard Worker out = __lsx_vmaddwod_h_b(out, in_h, in_l);
108*fb1b10abSAndroid Build Coastguard Worker return out;
109*fb1b10abSAndroid Build Coastguard Worker }
110*fb1b10abSAndroid Build Coastguard Worker
111*fb1b10abSAndroid Build Coastguard Worker /*
112*fb1b10abSAndroid Build Coastguard Worker * =============================================================================
113*fb1b10abSAndroid Build Coastguard Worker * Description : Dot product & addition of byte vector elements
114*fb1b10abSAndroid Build Coastguard Worker * Arguments : Inputs - in_c, in_h, in_l
115*fb1b10abSAndroid Build Coastguard Worker * Outputs - out
116*fb1b10abSAndroid Build Coastguard Worker * Return Type - halfword
117*fb1b10abSAndroid Build Coastguard Worker * Details : Unsigned byte elements from in_h are multiplied by
118*fb1b10abSAndroid Build Coastguard Worker * unsigned byte elements from in_l, and then added adjacent to
119*fb1b10abSAndroid Build Coastguard Worker * each other to get a result twice the size of input.
120*fb1b10abSAndroid Build Coastguard Worker * The results are added to signed half-word elements from in_c.
121*fb1b10abSAndroid Build Coastguard Worker * Example : out = __lsx_vdp2add_h_bu(in_c, in_h, in_l)
122*fb1b10abSAndroid Build Coastguard Worker * in_c : 1,2,3,4, 1,2,3,4
123*fb1b10abSAndroid Build Coastguard Worker * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
124*fb1b10abSAndroid Build Coastguard Worker * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
125*fb1b10abSAndroid Build Coastguard Worker * out : 23,40,41,26, 23,40,41,26
126*fb1b10abSAndroid Build Coastguard Worker * =============================================================================
127*fb1b10abSAndroid Build Coastguard Worker */
__lsx_vdp2add_h_bu(__m128i in_c,__m128i in_h,__m128i in_l)128*fb1b10abSAndroid Build Coastguard Worker static inline __m128i __lsx_vdp2add_h_bu(__m128i in_c, __m128i in_h,
129*fb1b10abSAndroid Build Coastguard Worker __m128i in_l) {
130*fb1b10abSAndroid Build Coastguard Worker __m128i out;
131*fb1b10abSAndroid Build Coastguard Worker
132*fb1b10abSAndroid Build Coastguard Worker out = __lsx_vmaddwev_h_bu(in_c, in_h, in_l);
133*fb1b10abSAndroid Build Coastguard Worker out = __lsx_vmaddwod_h_bu(out, in_h, in_l);
134*fb1b10abSAndroid Build Coastguard Worker return out;
135*fb1b10abSAndroid Build Coastguard Worker }
136*fb1b10abSAndroid Build Coastguard Worker
137*fb1b10abSAndroid Build Coastguard Worker /*
138*fb1b10abSAndroid Build Coastguard Worker * =============================================================================
139*fb1b10abSAndroid Build Coastguard Worker * Description : Dot product & addition of byte vector elements
140*fb1b10abSAndroid Build Coastguard Worker * Arguments : Inputs - in_c, in_h, in_l
141*fb1b10abSAndroid Build Coastguard Worker * Outputs - out
142*fb1b10abSAndroid Build Coastguard Worker * Return Type - halfword
143*fb1b10abSAndroid Build Coastguard Worker * Details : Unsigned byte elements from in_h are multiplied by
144*fb1b10abSAndroid Build Coastguard Worker * signed byte elements from in_l, and then added adjacent to
145*fb1b10abSAndroid Build Coastguard Worker * each other to get a result twice the size of input.
146*fb1b10abSAndroid Build Coastguard Worker * The results are added to signed half-word elements from in_c.
147*fb1b10abSAndroid Build Coastguard Worker * Example : out = __lsx_vdp2add_h_bu_b(in_c, in_h, in_l)
148*fb1b10abSAndroid Build Coastguard Worker * in_c : 1,1,1,1, 1,1,1,1
149*fb1b10abSAndroid Build Coastguard Worker * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
150*fb1b10abSAndroid Build Coastguard Worker * in_l : -1,-2,-3,-4, -5,-6,-7,-8, 1,2,3,4, 5,6,7,8
151*fb1b10abSAndroid Build Coastguard Worker * out : -4,-24,-60,-112, 6,26,62,114
152*fb1b10abSAndroid Build Coastguard Worker * =============================================================================
153*fb1b10abSAndroid Build Coastguard Worker */
__lsx_vdp2add_h_bu_b(__m128i in_c,__m128i in_h,__m128i in_l)154*fb1b10abSAndroid Build Coastguard Worker static inline __m128i __lsx_vdp2add_h_bu_b(__m128i in_c, __m128i in_h,
155*fb1b10abSAndroid Build Coastguard Worker __m128i in_l) {
156*fb1b10abSAndroid Build Coastguard Worker __m128i out;
157*fb1b10abSAndroid Build Coastguard Worker
158*fb1b10abSAndroid Build Coastguard Worker out = __lsx_vmaddwev_h_bu_b(in_c, in_h, in_l);
159*fb1b10abSAndroid Build Coastguard Worker out = __lsx_vmaddwod_h_bu_b(out, in_h, in_l);
160*fb1b10abSAndroid Build Coastguard Worker return out;
161*fb1b10abSAndroid Build Coastguard Worker }
162*fb1b10abSAndroid Build Coastguard Worker
163*fb1b10abSAndroid Build Coastguard Worker /*
164*fb1b10abSAndroid Build Coastguard Worker * =============================================================================
165*fb1b10abSAndroid Build Coastguard Worker * Description : Dot product & addition of half-word vector elements
166*fb1b10abSAndroid Build Coastguard Worker * Arguments : Inputs - in_c, in_h, in_l
167*fb1b10abSAndroid Build Coastguard Worker * Outputs - out
168*fb1b10abSAndroid Build Coastguard Worker * Return Type - __m128i
169*fb1b10abSAndroid Build Coastguard Worker * Details : Signed half-word elements from in_h are multiplied by
170*fb1b10abSAndroid Build Coastguard Worker * signed half-word elements from in_l, and then added adjacent to
171*fb1b10abSAndroid Build Coastguard Worker * each other to get a result twice the size of input.
172*fb1b10abSAndroid Build Coastguard Worker * Then the results are added to signed word elements from in_c.
173*fb1b10abSAndroid Build Coastguard Worker * Example : out = __lsx_vdp2add_h_b(in_c, in_h, in_l)
174*fb1b10abSAndroid Build Coastguard Worker * in_c : 1,2,3,4
175*fb1b10abSAndroid Build Coastguard Worker * in_h : 1,2,3,4, 5,6,7,8
176*fb1b10abSAndroid Build Coastguard Worker * in_l : 8,7,6,5, 4,3,2,1
177*fb1b10abSAndroid Build Coastguard Worker * out : 23,40,41,26
178*fb1b10abSAndroid Build Coastguard Worker * =============================================================================
179*fb1b10abSAndroid Build Coastguard Worker */
__lsx_vdp2add_w_h(__m128i in_c,__m128i in_h,__m128i in_l)180*fb1b10abSAndroid Build Coastguard Worker static inline __m128i __lsx_vdp2add_w_h(__m128i in_c, __m128i in_h,
181*fb1b10abSAndroid Build Coastguard Worker __m128i in_l) {
182*fb1b10abSAndroid Build Coastguard Worker __m128i out;
183*fb1b10abSAndroid Build Coastguard Worker
184*fb1b10abSAndroid Build Coastguard Worker out = __lsx_vmaddwev_w_h(in_c, in_h, in_l);
185*fb1b10abSAndroid Build Coastguard Worker out = __lsx_vmaddwod_w_h(out, in_h, in_l);
186*fb1b10abSAndroid Build Coastguard Worker return out;
187*fb1b10abSAndroid Build Coastguard Worker }
188*fb1b10abSAndroid Build Coastguard Worker
189*fb1b10abSAndroid Build Coastguard Worker /*
190*fb1b10abSAndroid Build Coastguard Worker * =============================================================================
191*fb1b10abSAndroid Build Coastguard Worker * Description : Dot product of byte vector elements
192*fb1b10abSAndroid Build Coastguard Worker * Arguments : Inputs - in_h, in_l
193*fb1b10abSAndroid Build Coastguard Worker * Outputs - out
194*fb1b10abSAndroid Build Coastguard Worker * Return Type - halfword
195*fb1b10abSAndroid Build Coastguard Worker * Details : Signed byte elements from in_h are multiplied by
196*fb1b10abSAndroid Build Coastguard Worker * signed byte elements from in_l, and then added adjacent to
197*fb1b10abSAndroid Build Coastguard Worker * each other to get a result twice the size of input.
198*fb1b10abSAndroid Build Coastguard Worker * Example : out = __lsx_vdp2_h_b(in_h, in_l)
199*fb1b10abSAndroid Build Coastguard Worker * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
200*fb1b10abSAndroid Build Coastguard Worker * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
201*fb1b10abSAndroid Build Coastguard Worker * out : 22,38,38,22, 22,38,38,22
202*fb1b10abSAndroid Build Coastguard Worker * =============================================================================
203*fb1b10abSAndroid Build Coastguard Worker */
__lsx_vdp2_h_b(__m128i in_h,__m128i in_l)204*fb1b10abSAndroid Build Coastguard Worker static inline __m128i __lsx_vdp2_h_b(__m128i in_h, __m128i in_l) {
205*fb1b10abSAndroid Build Coastguard Worker __m128i out;
206*fb1b10abSAndroid Build Coastguard Worker
207*fb1b10abSAndroid Build Coastguard Worker out = __lsx_vmulwev_h_b(in_h, in_l);
208*fb1b10abSAndroid Build Coastguard Worker out = __lsx_vmaddwod_h_b(out, in_h, in_l);
209*fb1b10abSAndroid Build Coastguard Worker return out;
210*fb1b10abSAndroid Build Coastguard Worker }
211*fb1b10abSAndroid Build Coastguard Worker
212*fb1b10abSAndroid Build Coastguard Worker /*
213*fb1b10abSAndroid Build Coastguard Worker * =============================================================================
214*fb1b10abSAndroid Build Coastguard Worker * Description : Dot product of byte vector elements
215*fb1b10abSAndroid Build Coastguard Worker * Arguments : Inputs - in_h, in_l
216*fb1b10abSAndroid Build Coastguard Worker * Outputs - out
217*fb1b10abSAndroid Build Coastguard Worker * Return Type - halfword
218*fb1b10abSAndroid Build Coastguard Worker * Details : Unsigned byte elements from in_h are multiplied by
219*fb1b10abSAndroid Build Coastguard Worker * unsigned byte elements from in_l, and then added adjacent to
220*fb1b10abSAndroid Build Coastguard Worker * each other to get a result twice the size of input.
221*fb1b10abSAndroid Build Coastguard Worker * Example : out = __lsx_vdp2_h_bu(in_h, in_l)
222*fb1b10abSAndroid Build Coastguard Worker * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
223*fb1b10abSAndroid Build Coastguard Worker * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
224*fb1b10abSAndroid Build Coastguard Worker * out : 22,38,38,22, 22,38,38,22
225*fb1b10abSAndroid Build Coastguard Worker * =============================================================================
226*fb1b10abSAndroid Build Coastguard Worker */
__lsx_vdp2_h_bu(__m128i in_h,__m128i in_l)227*fb1b10abSAndroid Build Coastguard Worker static inline __m128i __lsx_vdp2_h_bu(__m128i in_h, __m128i in_l) {
228*fb1b10abSAndroid Build Coastguard Worker __m128i out;
229*fb1b10abSAndroid Build Coastguard Worker
230*fb1b10abSAndroid Build Coastguard Worker out = __lsx_vmulwev_h_bu(in_h, in_l);
231*fb1b10abSAndroid Build Coastguard Worker out = __lsx_vmaddwod_h_bu(out, in_h, in_l);
232*fb1b10abSAndroid Build Coastguard Worker return out;
233*fb1b10abSAndroid Build Coastguard Worker }
234*fb1b10abSAndroid Build Coastguard Worker
235*fb1b10abSAndroid Build Coastguard Worker /*
236*fb1b10abSAndroid Build Coastguard Worker * =============================================================================
237*fb1b10abSAndroid Build Coastguard Worker * Description : Dot product of byte vector elements
238*fb1b10abSAndroid Build Coastguard Worker * Arguments : Inputs - in_h, in_l
239*fb1b10abSAndroid Build Coastguard Worker * Outputs - out
240*fb1b10abSAndroid Build Coastguard Worker * Return Type - halfword
241*fb1b10abSAndroid Build Coastguard Worker * Details : Unsigned byte elements from in_h are multiplied by
242*fb1b10abSAndroid Build Coastguard Worker * signed byte elements from in_l, and then added adjacent to
243*fb1b10abSAndroid Build Coastguard Worker * each other to get a result twice the size of input.
244*fb1b10abSAndroid Build Coastguard Worker * Example : out = __lsx_vdp2_h_bu_b(in_h, in_l)
245*fb1b10abSAndroid Build Coastguard Worker * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
246*fb1b10abSAndroid Build Coastguard Worker * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,-1
247*fb1b10abSAndroid Build Coastguard Worker * out : 22,38,38,22, 22,38,38,6
248*fb1b10abSAndroid Build Coastguard Worker * =============================================================================
249*fb1b10abSAndroid Build Coastguard Worker */
__lsx_vdp2_h_bu_b(__m128i in_h,__m128i in_l)250*fb1b10abSAndroid Build Coastguard Worker static inline __m128i __lsx_vdp2_h_bu_b(__m128i in_h, __m128i in_l) {
251*fb1b10abSAndroid Build Coastguard Worker __m128i out;
252*fb1b10abSAndroid Build Coastguard Worker
253*fb1b10abSAndroid Build Coastguard Worker out = __lsx_vmulwev_h_bu_b(in_h, in_l);
254*fb1b10abSAndroid Build Coastguard Worker out = __lsx_vmaddwod_h_bu_b(out, in_h, in_l);
255*fb1b10abSAndroid Build Coastguard Worker return out;
256*fb1b10abSAndroid Build Coastguard Worker }
257*fb1b10abSAndroid Build Coastguard Worker
258*fb1b10abSAndroid Build Coastguard Worker /*
259*fb1b10abSAndroid Build Coastguard Worker * =============================================================================
260*fb1b10abSAndroid Build Coastguard Worker * Description : Dot product of byte vector elements
261*fb1b10abSAndroid Build Coastguard Worker * Arguments : Inputs - in_h, in_l
262*fb1b10abSAndroid Build Coastguard Worker * Outputs - out
263*fb1b10abSAndroid Build Coastguard Worker * Return Type - halfword
264*fb1b10abSAndroid Build Coastguard Worker * Details : Signed byte elements from in_h are multiplied by
265*fb1b10abSAndroid Build Coastguard Worker * signed byte elements from in_l, and then added adjacent to
266*fb1b10abSAndroid Build Coastguard Worker * each other to get a result twice the size of input.
267*fb1b10abSAndroid Build Coastguard Worker * Example : out = __lsx_vdp2_w_h(in_h, in_l)
268*fb1b10abSAndroid Build Coastguard Worker * in_h : 1,2,3,4, 5,6,7,8
269*fb1b10abSAndroid Build Coastguard Worker * in_l : 8,7,6,5, 4,3,2,1
270*fb1b10abSAndroid Build Coastguard Worker * out : 22,38,38,22
271*fb1b10abSAndroid Build Coastguard Worker * =============================================================================
272*fb1b10abSAndroid Build Coastguard Worker */
__lsx_vdp2_w_h(__m128i in_h,__m128i in_l)273*fb1b10abSAndroid Build Coastguard Worker static inline __m128i __lsx_vdp2_w_h(__m128i in_h, __m128i in_l) {
274*fb1b10abSAndroid Build Coastguard Worker __m128i out;
275*fb1b10abSAndroid Build Coastguard Worker
276*fb1b10abSAndroid Build Coastguard Worker out = __lsx_vmulwev_w_h(in_h, in_l);
277*fb1b10abSAndroid Build Coastguard Worker out = __lsx_vmaddwod_w_h(out, in_h, in_l);
278*fb1b10abSAndroid Build Coastguard Worker return out;
279*fb1b10abSAndroid Build Coastguard Worker }
280*fb1b10abSAndroid Build Coastguard Worker
281*fb1b10abSAndroid Build Coastguard Worker /*
282*fb1b10abSAndroid Build Coastguard Worker * =============================================================================
283*fb1b10abSAndroid Build Coastguard Worker * Description : Dot product of byte vector elements
284*fb1b10abSAndroid Build Coastguard Worker * Arguments : Inputs - in_h, in_l
285*fb1b10abSAndroid Build Coastguard Worker * Outputs - out
286*fb1b10abSAndroid Build Coastguard Worker * Return Type - double
287*fb1b10abSAndroid Build Coastguard Worker * Details : Signed byte elements from in_h are multiplied by
288*fb1b10abSAndroid Build Coastguard Worker * signed byte elements from in_l, and then added adjacent to
289*fb1b10abSAndroid Build Coastguard Worker * each other to get a result twice the size of input.
290*fb1b10abSAndroid Build Coastguard Worker * Example : out = __lsx_vdp2_d_w(in_h, in_l)
291*fb1b10abSAndroid Build Coastguard Worker * in_h : 1,2,3,4
292*fb1b10abSAndroid Build Coastguard Worker * in_l : 8,7,6,5
293*fb1b10abSAndroid Build Coastguard Worker * out : 22,38
294*fb1b10abSAndroid Build Coastguard Worker * =============================================================================
295*fb1b10abSAndroid Build Coastguard Worker */
__lsx_vdp2_d_w(__m128i in_h,__m128i in_l)296*fb1b10abSAndroid Build Coastguard Worker static inline __m128i __lsx_vdp2_d_w(__m128i in_h, __m128i in_l) {
297*fb1b10abSAndroid Build Coastguard Worker __m128i out;
298*fb1b10abSAndroid Build Coastguard Worker
299*fb1b10abSAndroid Build Coastguard Worker out = __lsx_vmulwev_d_w(in_h, in_l);
300*fb1b10abSAndroid Build Coastguard Worker out = __lsx_vmaddwod_d_w(out, in_h, in_l);
301*fb1b10abSAndroid Build Coastguard Worker return out;
302*fb1b10abSAndroid Build Coastguard Worker }
303*fb1b10abSAndroid Build Coastguard Worker
304*fb1b10abSAndroid Build Coastguard Worker /*
305*fb1b10abSAndroid Build Coastguard Worker * =============================================================================
306*fb1b10abSAndroid Build Coastguard Worker * Description : Clip all halfword elements of input vector between min & max
307*fb1b10abSAndroid Build Coastguard Worker * out = ((_in) < (min)) ? (min) : (((_in) > (max)) ? (max) :
308*fb1b10abSAndroid Build Coastguard Worker * (_in))
309*fb1b10abSAndroid Build Coastguard Worker * Arguments : Inputs - _in (input vector)
310*fb1b10abSAndroid Build Coastguard Worker * - min (min threshold)
311*fb1b10abSAndroid Build Coastguard Worker * - max (max threshold)
312*fb1b10abSAndroid Build Coastguard Worker * Outputs - out (output vector with clipped elements)
313*fb1b10abSAndroid Build Coastguard Worker * Return Type - signed halfword
314*fb1b10abSAndroid Build Coastguard Worker * Example : out = __lsx_vclip_h(_in)
315*fb1b10abSAndroid Build Coastguard Worker * _in : -8,2,280,249, -8,255,280,249
316*fb1b10abSAndroid Build Coastguard Worker * min : 1,1,1,1, 1,1,1,1
317*fb1b10abSAndroid Build Coastguard Worker * max : 9,9,9,9, 9,9,9,9
318*fb1b10abSAndroid Build Coastguard Worker * out : 1,2,9,9, 1,9,9,9
319*fb1b10abSAndroid Build Coastguard Worker * =============================================================================
320*fb1b10abSAndroid Build Coastguard Worker */
__lsx_vclip_h(__m128i _in,__m128i min,__m128i max)321*fb1b10abSAndroid Build Coastguard Worker static inline __m128i __lsx_vclip_h(__m128i _in, __m128i min, __m128i max) {
322*fb1b10abSAndroid Build Coastguard Worker __m128i out;
323*fb1b10abSAndroid Build Coastguard Worker
324*fb1b10abSAndroid Build Coastguard Worker out = __lsx_vmax_h(min, _in);
325*fb1b10abSAndroid Build Coastguard Worker out = __lsx_vmin_h(max, out);
326*fb1b10abSAndroid Build Coastguard Worker return out;
327*fb1b10abSAndroid Build Coastguard Worker }
328*fb1b10abSAndroid Build Coastguard Worker
329*fb1b10abSAndroid Build Coastguard Worker /*
330*fb1b10abSAndroid Build Coastguard Worker * =============================================================================
331*fb1b10abSAndroid Build Coastguard Worker * Description : Set each element of vector between 0 and 255
332*fb1b10abSAndroid Build Coastguard Worker * Arguments : Inputs - _in
333*fb1b10abSAndroid Build Coastguard Worker * Outputs - out
334*fb1b10abSAndroid Build Coastguard Worker * Return Type - halfword
335*fb1b10abSAndroid Build Coastguard Worker * Details : Signed byte elements from _in are clamped between 0 and 255.
336*fb1b10abSAndroid Build Coastguard Worker * Example : out = __lsx_vclip255_h(_in)
337*fb1b10abSAndroid Build Coastguard Worker * _in : -8,255,280,249, -8,255,280,249
338*fb1b10abSAndroid Build Coastguard Worker * out : 0,255,255,249, 0,255,255,249
339*fb1b10abSAndroid Build Coastguard Worker * =============================================================================
340*fb1b10abSAndroid Build Coastguard Worker */
__lsx_vclip255_h(__m128i _in)341*fb1b10abSAndroid Build Coastguard Worker static inline __m128i __lsx_vclip255_h(__m128i _in) {
342*fb1b10abSAndroid Build Coastguard Worker __m128i out;
343*fb1b10abSAndroid Build Coastguard Worker
344*fb1b10abSAndroid Build Coastguard Worker out = __lsx_vmaxi_h(_in, 0);
345*fb1b10abSAndroid Build Coastguard Worker out = __lsx_vsat_hu(out, 7);
346*fb1b10abSAndroid Build Coastguard Worker return out;
347*fb1b10abSAndroid Build Coastguard Worker }
348*fb1b10abSAndroid Build Coastguard Worker
349*fb1b10abSAndroid Build Coastguard Worker /*
350*fb1b10abSAndroid Build Coastguard Worker * =============================================================================
351*fb1b10abSAndroid Build Coastguard Worker * Description : Set each element of vector between 0 and 255
352*fb1b10abSAndroid Build Coastguard Worker * Arguments : Inputs - _in
353*fb1b10abSAndroid Build Coastguard Worker * Outputs - out
354*fb1b10abSAndroid Build Coastguard Worker * Return Type - word
355*fb1b10abSAndroid Build Coastguard Worker * Details : Signed byte elements from _in are clamped between 0 and 255.
356*fb1b10abSAndroid Build Coastguard Worker * Example : out = __lsx_vclip255_w(_in)
357*fb1b10abSAndroid Build Coastguard Worker * _in : -8,255,280,249
358*fb1b10abSAndroid Build Coastguard Worker * out : 0,255,255,249
359*fb1b10abSAndroid Build Coastguard Worker * =============================================================================
360*fb1b10abSAndroid Build Coastguard Worker */
__lsx_vclip255_w(__m128i _in)361*fb1b10abSAndroid Build Coastguard Worker static inline __m128i __lsx_vclip255_w(__m128i _in) {
362*fb1b10abSAndroid Build Coastguard Worker __m128i out;
363*fb1b10abSAndroid Build Coastguard Worker
364*fb1b10abSAndroid Build Coastguard Worker out = __lsx_vmaxi_w(_in, 0);
365*fb1b10abSAndroid Build Coastguard Worker out = __lsx_vsat_wu(out, 7);
366*fb1b10abSAndroid Build Coastguard Worker return out;
367*fb1b10abSAndroid Build Coastguard Worker }
368*fb1b10abSAndroid Build Coastguard Worker
/*
 * =============================================================================
 * Description : Swap two variables
 * Arguments   : Inputs  - _in0, _in1
 *               Outputs - _in0, _in1 (in-place)
 * Details     : Exchanges the two vectors through a temporary. Unlike an
 *               xor-based swap, this stays correct when both arguments name
 *               the same object (an xor swap would zero it).
 *               Each argument is expanded more than once, so pass plain
 *               lvalues without side effects.
 * Example     : LSX_SWAP(_in0, _in1)
 *         _in0 : 1,2,3,4
 *         _in1 : 5,6,7,8
 *   _in0(out) : 5,6,7,8
 *   _in1(out) : 1,2,3,4
 * =============================================================================
 */
#define LSX_SWAP(_in0, _in1)        \
  {                                 \
    __m128i _lsx_swap_tmp = (_in0); \
    (_in0) = (_in1);                \
    (_in1) = _lsx_swap_tmp;         \
  }

/*
 * =============================================================================
 * Description : Transpose 4x4 block with word elements in vectors
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3
 *               Outputs - _out0, _out1, _out2, _out3
 * Details     : Rows are first interleaved pairwise at word granularity
 *               (_in0 with _in1, _in2 with _in3), then the intermediate
 *               vectors are interleaved at double-word granularity.
 *               All inputs are read into temporaries before any output is
 *               written, so an output may be the same variable as an input.
 *               The temporaries are named _t0.._t3; do not pass variables
 *               with those names as arguments.
 * Example     :
 *               1, 2, 3, 4            1, 5, 9,13
 *               5, 6, 7, 8    to      2, 6,10,14
 *               9,10,11,12  =====>    3, 7,11,15
 *              13,14,15,16            4, 8,12,16
 * =============================================================================
 */
#define LSX_TRANSPOSE4x4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  {                                                                            \
    __m128i _t0, _t1, _t2, _t3;                                                \
                                                                               \
    _t0 = __lsx_vilvl_w(_in1, _in0);                                           \
    _t1 = __lsx_vilvh_w(_in1, _in0);                                           \
    _t2 = __lsx_vilvl_w(_in3, _in2);                                           \
    _t3 = __lsx_vilvh_w(_in3, _in2);                                           \
    _out0 = __lsx_vilvl_d(_t2, _t0);                                           \
    _out1 = __lsx_vilvh_d(_t2, _t0);                                           \
    _out2 = __lsx_vilvl_d(_t3, _t1);                                           \
    _out3 = __lsx_vilvh_d(_t3, _t1);                                           \
  }

/*
 * =============================================================================
 * Description : Transpose 8x8 block with byte elements in vectors
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
 *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
 *                         _out7
 * Details     : The rows of the matrix become columns, and the columns
 *               become rows. Only the low eight bytes of each input take part.
 *               Every input is read before the first output is written.
 *               Each even output is the direct result of a word interleave, so
 *               its upper eight bytes repeat the row that the following odd
 *               output holds. Each odd output is extracted from the upper
 *               eight bytes of the preceding even output by a byte shuffle
 *               whose remaining lanes are taken from a zero vector.
 *               The macro-local helpers carry a leading underscore and an _m
 *               suffix so that a caller argument called `zero` or `shuf8` is
 *               not captured by the expansion.
 * Example     : LSX_TRANSPOSE8x8_B
 *        _in0  : 00,01,02,03,04,05,06,07, 00,00,00,00,00,00,00,00
 *        _in1  : 10,11,12,13,14,15,16,17, 00,00,00,00,00,00,00,00
 *        _in2  : 20,21,22,23,24,25,26,27, 00,00,00,00,00,00,00,00
 *        _in3  : 30,31,32,33,34,35,36,37, 00,00,00,00,00,00,00,00
 *        _in4  : 40,41,42,43,44,45,46,47, 00,00,00,00,00,00,00,00
 *        _in5  : 50,51,52,53,54,55,56,57, 00,00,00,00,00,00,00,00
 *        _in6  : 60,61,62,63,64,65,66,67, 00,00,00,00,00,00,00,00
 *        _in7  : 70,71,72,73,74,75,76,77, 00,00,00,00,00,00,00,00
 *
 *        _out0 : 00,10,20,30,40,50,60,70, 01,11,21,31,41,51,61,71
 *        _out1 : 01,11,21,31,41,51,61,71, 00,00,00,00,00,00,00,00
 *        _out2 : 02,12,22,32,42,52,62,72, 03,13,23,33,43,53,63,73
 *        _out3 : 03,13,23,33,43,53,63,73, 00,00,00,00,00,00,00,00
 *        _out4 : 04,14,24,34,44,54,64,74, 05,15,25,35,45,55,65,75
 *        _out5 : 05,15,25,35,45,55,65,75, 00,00,00,00,00,00,00,00
 *        _out6 : 06,16,26,36,46,56,66,76, 07,17,27,37,47,57,67,77
 *        _out7 : 07,17,27,37,47,57,67,77, 00,00,00,00,00,00,00,00
 * =============================================================================
 */
#define LSX_TRANSPOSE8x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                           _out7) \
  { \
    __m128i _zero_m = { 0 }; \
    /* Byte indices 0x08..0x0F select the upper half of the even row; */ \
    /* indices 0x10..0x17 select bytes of _zero_m. */ \
    __m128i _shuf8_m = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; \
    __m128i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7; \
    \
    _t0 = __lsx_vilvl_b(_in2, _in0); \
    _t1 = __lsx_vilvl_b(_in3, _in1); \
    _t2 = __lsx_vilvl_b(_in6, _in4); \
    _t3 = __lsx_vilvl_b(_in7, _in5); \
    _t4 = __lsx_vilvl_b(_t1, _t0); \
    _t5 = __lsx_vilvh_b(_t1, _t0); \
    _t6 = __lsx_vilvl_b(_t3, _t2); \
    _t7 = __lsx_vilvh_b(_t3, _t2); \
    _out0 = __lsx_vilvl_w(_t6, _t4); \
    _out2 = __lsx_vilvh_w(_t6, _t4); \
    _out4 = __lsx_vilvl_w(_t7, _t5); \
    _out6 = __lsx_vilvh_w(_t7, _t5); \
    _out1 = __lsx_vshuf_b(_zero_m, _out0, _shuf8_m); \
    _out3 = __lsx_vshuf_b(_zero_m, _out2, _shuf8_m); \
    _out5 = __lsx_vshuf_b(_zero_m, _out4, _shuf8_m); \
    _out7 = __lsx_vshuf_b(_zero_m, _out6, _shuf8_m); \
  }
469*fb1b10abSAndroid Build Coastguard Worker
/*
 * =============================================================================
 * Description : Transpose 8x8 block with half-word elements in vectors
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
 *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
 *                         _out7
 * Details     : Rows 0-3 and rows 4-7 are each reduced, by two rounds of
 *               half-word interleaves, to four vectors holding a pair of
 *               half-columns. The even/odd double-words of the matching
 *               top and bottom vectors are then combined into the outputs.
 *               Every input is read before the first output is written.
 * Example     :
 *              00,01,02,03,04,05,06,07           00,10,20,30,40,50,60,70
 *              10,11,12,13,14,15,16,17           01,11,21,31,41,51,61,71
 *              20,21,22,23,24,25,26,27           02,12,22,32,42,52,62,72
 *              30,31,32,33,34,35,36,37    to     03,13,23,33,43,53,63,73
 *              40,41,42,43,44,45,46,47  ======>  04,14,24,34,44,54,64,74
 *              50,51,52,53,54,55,56,57           05,15,25,35,45,55,65,75
 *              60,61,62,63,64,65,66,67           06,16,26,36,46,56,66,76
 *              70,71,72,73,74,75,76,77           07,17,27,37,47,57,67,77
 * =============================================================================
 */
#define LSX_TRANSPOSE8x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                           _out7) \
  { \
    __m128i _r02_lo, _r13_lo, _r02_hi, _r13_hi; \
    __m128i _r46_lo, _r57_lo, _r46_hi, _r57_hi; \
    __m128i _top01, _top23, _top45, _top67; \
    __m128i _bot01, _bot23, _bot45, _bot67; \
    \
    /* Round 1: interleave rows that are two apart. */ \
    _r02_lo = __lsx_vilvl_h(_in2, _in0); \
    _r13_lo = __lsx_vilvl_h(_in3, _in1); \
    _r02_hi = __lsx_vilvh_h(_in2, _in0); \
    _r13_hi = __lsx_vilvh_h(_in3, _in1); \
    _r46_lo = __lsx_vilvl_h(_in6, _in4); \
    _r57_lo = __lsx_vilvl_h(_in7, _in5); \
    _r46_hi = __lsx_vilvh_h(_in6, _in4); \
    _r57_hi = __lsx_vilvh_h(_in7, _in5); \
    \
    /* Round 2: rows 0-3 ("top") and rows 4-7 ("bot"), two columns each. */ \
    _top01 = __lsx_vilvl_h(_r13_lo, _r02_lo); \
    _top23 = __lsx_vilvh_h(_r13_lo, _r02_lo); \
    _top45 = __lsx_vilvl_h(_r13_hi, _r02_hi); \
    _top67 = __lsx_vilvh_h(_r13_hi, _r02_hi); \
    _bot01 = __lsx_vilvl_h(_r57_lo, _r46_lo); \
    _bot23 = __lsx_vilvh_h(_r57_lo, _r46_lo); \
    _bot45 = __lsx_vilvl_h(_r57_hi, _r46_hi); \
    _bot67 = __lsx_vilvh_h(_r57_hi, _r46_hi); \
    \
    /* Round 3: join the top and bottom halves of every column. */ \
    _out0 = __lsx_vpickev_d(_bot01, _top01); \
    _out1 = __lsx_vpickod_d(_bot01, _top01); \
    _out2 = __lsx_vpickev_d(_bot23, _top23); \
    _out3 = __lsx_vpickod_d(_bot23, _top23); \
    _out4 = __lsx_vpickev_d(_bot45, _top45); \
    _out5 = __lsx_vpickod_d(_bot45, _top45); \
    _out6 = __lsx_vpickev_d(_bot67, _top67); \
    _out7 = __lsx_vpickod_d(_bot67, _top67); \
  }
519*fb1b10abSAndroid Build Coastguard Worker
/*
 * =============================================================================
 * Description : Transpose input 8x4 byte block into 4x8
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
 *                         (input 8x4 byte block, four bytes per row)
 *               Outputs - _out0, _out1, _out2, _out3
 *                         (output 4x8 byte block, eight bytes per row)
 * Details     : The rows of the matrix become columns, and the columns become
 *               rows. The transposed rows are delivered in the low eight
 *               bytes of the outputs; the macro does not clear the upper
 *               eight bytes, which carry by-products of the interleaves.
 *               _out1 and _out3 are derived from _out0 and _out2, so the four
 *               outputs must be distinct variables.
 * Example     : LSX_TRANSPOSE8x4_B (low bytes shown)
 *        _in0  : 00,01,02,03
 *        _in1  : 10,11,12,13
 *        _in2  : 20,21,22,23
 *        _in3  : 30,31,32,33
 *        _in4  : 40,41,42,43
 *        _in5  : 50,51,52,53
 *        _in6  : 60,61,62,63
 *        _in7  : 70,71,72,73
 *
 *        _out0 : 00,10,20,30,40,50,60,70
 *        _out1 : 01,11,21,31,41,51,61,71
 *        _out2 : 02,12,22,32,42,52,62,72
 *        _out3 : 03,13,23,33,43,53,63,73
 * =============================================================================
 */
#define LSX_TRANSPOSE8x4_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                           _out0, _out1, _out2, _out3) \
  { \
    __m128i _r04_m, _r15_m, _r26_m, _r37_m; \
    __m128i _b0145_m, _b2367_m, _h_lo_m, _h_hi_m; \
    \
    /* Pair row k with row k + 4 in adjacent words. */ \
    _r04_m = __lsx_vpackev_w(_in4, _in0); \
    _r15_m = __lsx_vpackev_w(_in5, _in1); \
    _r26_m = __lsx_vpackev_w(_in6, _in2); \
    _r37_m = __lsx_vpackev_w(_in7, _in3); \
    \
    _b0145_m = __lsx_vilvl_b(_r15_m, _r04_m); \
    _b2367_m = __lsx_vilvl_b(_r37_m, _r26_m); \
    \
    _h_lo_m = __lsx_vilvl_h(_b2367_m, _b0145_m); \
    _h_hi_m = __lsx_vilvh_h(_b2367_m, _b0145_m); \
    \
    _out0 = __lsx_vilvl_w(_h_hi_m, _h_lo_m); \
    _out2 = __lsx_vilvh_w(_h_hi_m, _h_lo_m); \
    _out1 = __lsx_vilvh_d(_out2, _out0); \
    _out3 = __lsx_vilvh_d(_out0, _out2); \
  }
564*fb1b10abSAndroid Build Coastguard Worker
/*
 * =============================================================================
 * Description : Transpose 16x8 block with byte elements in vectors
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,
 *                         _in8, _in9, _in10, _in11, _in12, _in13, _in14,
 *                         _in15 (16 rows, eight bytes used from each)
 *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
 *                         _out7 (8 rows of 16 bytes)
 * Details     : Rows 0-7 ("top") and rows 8-15 ("bot") are transposed side by
 *               side with byte and word interleaves; the final double-word
 *               interleave joins the top and bottom halves of each column.
 *               Every input is read before the first output is written.
 * Example     :
 *              000,001,002,003,004,005,006,007
 *              008,009,010,011,012,013,014,015
 *              016,017,018,019,020,021,022,023
 *              024,025,026,027,028,029,030,031
 *              032,033,034,035,036,037,038,039
 *              040,041,042,043,044,045,046,047        000,008,...,112,120
 *              048,049,050,051,052,053,054,055        001,009,...,113,121
 *              056,057,058,059,060,061,062,063   to   002,010,...,114,122
 *              064,065,066,067,068,069,070,071 =====> 003,011,...,115,123
 *              072,073,074,075,076,077,078,079        004,012,...,116,124
 *              080,081,082,083,084,085,086,087        005,013,...,117,125
 *              088,089,090,091,092,093,094,095        006,014,...,118,126
 *              096,097,098,099,100,101,102,103        007,015,...,119,127
 *              104,105,106,107,108,109,110,111
 *              112,113,114,115,116,117,118,119
 *              120,121,122,123,124,125,126,127
 * =============================================================================
 */
#define LSX_TRANSPOSE16x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                            _in8, _in9, _in10, _in11, _in12, _in13, _in14, \
                            _in15, _out0, _out1, _out2, _out3, _out4, _out5, \
                            _out6, _out7) \
  { \
    __m128i _a0, _a1, _a2, _a3, _a4, _a5, _a6, _a7; \
    __m128i _b0, _b1, _b2, _b3, _b4, _b5, _b6, _b7; \
    __m128i _c01_top, _c23_top, _c45_top, _c67_top; \
    __m128i _c01_bot, _c23_bot, _c45_bot, _c67_bot; \
    \
    /* Stage 1: byte-interleave rows that are two apart. */ \
    DUP4_ARG2(__lsx_vilvl_b, _in2, _in0, _in3, _in1, _in6, _in4, _in7, _in5, \
              _a0, _a1, _a2, _a3); \
    DUP4_ARG2(__lsx_vilvl_b, _in10, _in8, _in11, _in9, _in14, _in12, _in15, \
              _in13, _a4, _a5, _a6, _a7); \
    \
    /* Stage 2: byte-interleave neighbouring stage-1 results. */ \
    DUP4_ARG2(__lsx_vilvl_b, _a1, _a0, _a3, _a2, _a5, _a4, _a7, _a6, \
              _b0, _b2, _b4, _b6); \
    DUP4_ARG2(__lsx_vilvh_b, _a1, _a0, _a3, _a2, _a5, _a4, _a7, _a6, \
              _b1, _b3, _b5, _b7); \
    \
    /* Stage 3: word-interleave; each result holds two half-columns. */ \
    DUP4_ARG2(__lsx_vilvl_w, _b2, _b0, _b3, _b1, _b6, _b4, _b7, _b5, \
              _c01_top, _c45_top, _c01_bot, _c45_bot); \
    DUP4_ARG2(__lsx_vilvh_w, _b2, _b0, _b3, _b1, _b6, _b4, _b7, _b5, \
              _c23_top, _c67_top, _c23_bot, _c67_bot); \
    \
    /* Stage 4: join the top and bottom half of every column. */ \
    DUP4_ARG2(__lsx_vilvl_d, _c01_bot, _c01_top, _c23_bot, _c23_top, \
              _c45_bot, _c45_top, _c67_bot, _c67_top, \
              _out0, _out2, _out4, _out6); \
    DUP4_ARG2(__lsx_vilvh_d, _c01_bot, _c01_top, _c23_bot, _c23_top, \
              _c45_bot, _c45_top, _c67_bot, _c67_top, \
              _out1, _out3, _out5, _out7); \
  }
615*fb1b10abSAndroid Build Coastguard Worker
616*fb1b10abSAndroid Build Coastguard Worker /*
617*fb1b10abSAndroid Build Coastguard Worker * =============================================================================
618*fb1b10abSAndroid Build Coastguard Worker * Description : Butterfly of 4 input vectors
619*fb1b10abSAndroid Build Coastguard Worker * Arguments : Inputs - in0, in1, in2, in3
620*fb1b10abSAndroid Build Coastguard Worker * Outputs - out0, out1, out2, out3
621*fb1b10abSAndroid Build Coastguard Worker * Details : Butterfly operation
622*fb1b10abSAndroid Build Coastguard Worker * Example :
623*fb1b10abSAndroid Build Coastguard Worker * out0 = in0 + in3;
624*fb1b10abSAndroid Build Coastguard Worker * out1 = in1 + in2;
625*fb1b10abSAndroid Build Coastguard Worker * out2 = in1 - in2;
626*fb1b10abSAndroid Build Coastguard Worker * out3 = in0 - in3;
627*fb1b10abSAndroid Build Coastguard Worker * =============================================================================
628*fb1b10abSAndroid Build Coastguard Worker */
/*
 * 4-input butterfly using the byte-element add/sub intrinsics.
 * Every sum and difference is computed into a temporary before the first
 * output is written, so any _outN may name the same variable as any _inN.
 */
#define LSX_BUTTERFLY_4_B(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  { \
    __m128i _sum0_m = __lsx_vadd_b(_in0, _in3); \
    __m128i _sum1_m = __lsx_vadd_b(_in1, _in2); \
    __m128i _dif1_m = __lsx_vsub_b(_in1, _in2); \
    __m128i _dif0_m = __lsx_vsub_b(_in0, _in3); \
    \
    _out0 = _sum0_m; \
    _out1 = _sum1_m; \
    _out2 = _dif1_m; \
    _out3 = _dif0_m; \
  }
/*
 * 4-input butterfly using the half-word-element add/sub intrinsics.
 * Every sum and difference is computed into a temporary before the first
 * output is written, so any _outN may name the same variable as any _inN.
 */
#define LSX_BUTTERFLY_4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  { \
    __m128i _sum0_m = __lsx_vadd_h(_in0, _in3); \
    __m128i _sum1_m = __lsx_vadd_h(_in1, _in2); \
    __m128i _dif1_m = __lsx_vsub_h(_in1, _in2); \
    __m128i _dif0_m = __lsx_vsub_h(_in0, _in3); \
    \
    _out0 = _sum0_m; \
    _out1 = _sum1_m; \
    _out2 = _dif1_m; \
    _out3 = _dif0_m; \
  }
/*
 * 4-input butterfly using the word-element add/sub intrinsics.
 * Every sum and difference is computed into a temporary before the first
 * output is written, so any _outN may name the same variable as any _inN.
 */
#define LSX_BUTTERFLY_4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  { \
    __m128i _sum0_m = __lsx_vadd_w(_in0, _in3); \
    __m128i _sum1_m = __lsx_vadd_w(_in1, _in2); \
    __m128i _dif1_m = __lsx_vsub_w(_in1, _in2); \
    __m128i _dif0_m = __lsx_vsub_w(_in0, _in3); \
    \
    _out0 = _sum0_m; \
    _out1 = _sum1_m; \
    _out2 = _dif1_m; \
    _out3 = _dif0_m; \
  }
/*
 * 4-input butterfly using the double-word-element add/sub intrinsics.
 * Every sum and difference is computed into a temporary before the first
 * output is written, so any _outN may name the same variable as any _inN.
 */
#define LSX_BUTTERFLY_4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  { \
    __m128i _sum0_m = __lsx_vadd_d(_in0, _in3); \
    __m128i _sum1_m = __lsx_vadd_d(_in1, _in2); \
    __m128i _dif1_m = __lsx_vsub_d(_in1, _in2); \
    __m128i _dif0_m = __lsx_vsub_d(_in0, _in3); \
    \
    _out0 = _sum0_m; \
    _out1 = _sum1_m; \
    _out2 = _dif1_m; \
    _out3 = _dif0_m; \
  }
657*fb1b10abSAndroid Build Coastguard Worker
658*fb1b10abSAndroid Build Coastguard Worker /*
659*fb1b10abSAndroid Build Coastguard Worker * =============================================================================
660*fb1b10abSAndroid Build Coastguard Worker * Description : Butterfly of 8 input vectors
661*fb1b10abSAndroid Build Coastguard Worker * Arguments : Inputs - _in0, _in1, _in2, _in3, ~
662*fb1b10abSAndroid Build Coastguard Worker * Outputs - _out0, _out1, _out2, _out3, ~
663*fb1b10abSAndroid Build Coastguard Worker * Details : Butterfly operation
664*fb1b10abSAndroid Build Coastguard Worker * Example :
665*fb1b10abSAndroid Build Coastguard Worker * _out0 = _in0 + _in7;
666*fb1b10abSAndroid Build Coastguard Worker * _out1 = _in1 + _in6;
667*fb1b10abSAndroid Build Coastguard Worker * _out2 = _in2 + _in5;
668*fb1b10abSAndroid Build Coastguard Worker * _out3 = _in3 + _in4;
669*fb1b10abSAndroid Build Coastguard Worker * _out4 = _in3 - _in4;
670*fb1b10abSAndroid Build Coastguard Worker * _out5 = _in2 - _in5;
671*fb1b10abSAndroid Build Coastguard Worker * _out6 = _in1 - _in6;
672*fb1b10abSAndroid Build Coastguard Worker * _out7 = _in0 - _in7;
673*fb1b10abSAndroid Build Coastguard Worker * =============================================================================
674*fb1b10abSAndroid Build Coastguard Worker */
/*
 * 8-input butterfly using the byte-element add/sub intrinsics.
 * Every sum and difference is computed into a temporary before the first
 * output is written, so any _outN may name the same variable as any _inN.
 */
#define LSX_BUTTERFLY_8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                          _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                          _out7) \
  { \
    __m128i _sum0_m = __lsx_vadd_b(_in0, _in7); \
    __m128i _sum1_m = __lsx_vadd_b(_in1, _in6); \
    __m128i _sum2_m = __lsx_vadd_b(_in2, _in5); \
    __m128i _sum3_m = __lsx_vadd_b(_in3, _in4); \
    __m128i _dif3_m = __lsx_vsub_b(_in3, _in4); \
    __m128i _dif2_m = __lsx_vsub_b(_in2, _in5); \
    __m128i _dif1_m = __lsx_vsub_b(_in1, _in6); \
    __m128i _dif0_m = __lsx_vsub_b(_in0, _in7); \
    \
    _out0 = _sum0_m; \
    _out1 = _sum1_m; \
    _out2 = _sum2_m; \
    _out3 = _sum3_m; \
    _out4 = _dif3_m; \
    _out5 = _dif2_m; \
    _out6 = _dif1_m; \
    _out7 = _dif0_m; \
  }
688*fb1b10abSAndroid Build Coastguard Worker
/*
 * 8-input butterfly using the half-word-element add/sub intrinsics.
 * Every sum and difference is computed into a temporary before the first
 * output is written, so any _outN may name the same variable as any _inN.
 */
#define LSX_BUTTERFLY_8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                          _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                          _out7) \
  { \
    __m128i _sum0_m = __lsx_vadd_h(_in0, _in7); \
    __m128i _sum1_m = __lsx_vadd_h(_in1, _in6); \
    __m128i _sum2_m = __lsx_vadd_h(_in2, _in5); \
    __m128i _sum3_m = __lsx_vadd_h(_in3, _in4); \
    __m128i _dif3_m = __lsx_vsub_h(_in3, _in4); \
    __m128i _dif2_m = __lsx_vsub_h(_in2, _in5); \
    __m128i _dif1_m = __lsx_vsub_h(_in1, _in6); \
    __m128i _dif0_m = __lsx_vsub_h(_in0, _in7); \
    \
    _out0 = _sum0_m; \
    _out1 = _sum1_m; \
    _out2 = _sum2_m; \
    _out3 = _sum3_m; \
    _out4 = _dif3_m; \
    _out5 = _dif2_m; \
    _out6 = _dif1_m; \
    _out7 = _dif0_m; \
  }
702*fb1b10abSAndroid Build Coastguard Worker
/*
 * 8-input butterfly using the word-element add/sub intrinsics.
 * Every sum and difference is computed into a temporary before the first
 * output is written, so any _outN may name the same variable as any _inN.
 */
#define LSX_BUTTERFLY_8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                          _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                          _out7) \
  { \
    __m128i _sum0_m = __lsx_vadd_w(_in0, _in7); \
    __m128i _sum1_m = __lsx_vadd_w(_in1, _in6); \
    __m128i _sum2_m = __lsx_vadd_w(_in2, _in5); \
    __m128i _sum3_m = __lsx_vadd_w(_in3, _in4); \
    __m128i _dif3_m = __lsx_vsub_w(_in3, _in4); \
    __m128i _dif2_m = __lsx_vsub_w(_in2, _in5); \
    __m128i _dif1_m = __lsx_vsub_w(_in1, _in6); \
    __m128i _dif0_m = __lsx_vsub_w(_in0, _in7); \
    \
    _out0 = _sum0_m; \
    _out1 = _sum1_m; \
    _out2 = _sum2_m; \
    _out3 = _sum3_m; \
    _out4 = _dif3_m; \
    _out5 = _dif2_m; \
    _out6 = _dif1_m; \
    _out7 = _dif0_m; \
  }
716*fb1b10abSAndroid Build Coastguard Worker
/*
 * 8-input butterfly using the double-word-element add/sub intrinsics.
 * Every sum and difference is computed into a temporary before the first
 * output is written, so any _outN may name the same variable as any _inN.
 */
#define LSX_BUTTERFLY_8_D(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                          _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                          _out7) \
  { \
    __m128i _sum0_m = __lsx_vadd_d(_in0, _in7); \
    __m128i _sum1_m = __lsx_vadd_d(_in1, _in6); \
    __m128i _sum2_m = __lsx_vadd_d(_in2, _in5); \
    __m128i _sum3_m = __lsx_vadd_d(_in3, _in4); \
    __m128i _dif3_m = __lsx_vsub_d(_in3, _in4); \
    __m128i _dif2_m = __lsx_vsub_d(_in2, _in5); \
    __m128i _dif1_m = __lsx_vsub_d(_in1, _in6); \
    __m128i _dif0_m = __lsx_vsub_d(_in0, _in7); \
    \
    _out0 = _sum0_m; \
    _out1 = _sum1_m; \
    _out2 = _sum2_m; \
    _out3 = _sum3_m; \
    _out4 = _dif3_m; \
    _out5 = _dif2_m; \
    _out6 = _dif1_m; \
    _out7 = _dif0_m; \
  }
730*fb1b10abSAndroid Build Coastguard Worker
731*fb1b10abSAndroid Build Coastguard Worker /*
732*fb1b10abSAndroid Build Coastguard Worker * =============================================================================
733*fb1b10abSAndroid Build Coastguard Worker * Description : Butterfly of 16 input vectors
734*fb1b10abSAndroid Build Coastguard Worker * Arguments : Inputs - _in0, _in1, _in2, _in3, ~
735*fb1b10abSAndroid Build Coastguard Worker * Outputs - _out0, _out1, _out2, _out3, ~
736*fb1b10abSAndroid Build Coastguard Worker * Details : Butterfly operation
737*fb1b10abSAndroid Build Coastguard Worker * Example :
738*fb1b10abSAndroid Build Coastguard Worker * _out0 = _in0 + _in15;
739*fb1b10abSAndroid Build Coastguard Worker * _out1 = _in1 + _in14;
740*fb1b10abSAndroid Build Coastguard Worker * _out2 = _in2 + _in13;
741*fb1b10abSAndroid Build Coastguard Worker * _out3 = _in3 + _in12;
742*fb1b10abSAndroid Build Coastguard Worker * _out4 = _in4 + _in11;
743*fb1b10abSAndroid Build Coastguard Worker * _out5 = _in5 + _in10;
744*fb1b10abSAndroid Build Coastguard Worker * _out6 = _in6 + _in9;
745*fb1b10abSAndroid Build Coastguard Worker * _out7 = _in7 + _in8;
746*fb1b10abSAndroid Build Coastguard Worker * _out8 = _in7 - _in8;
747*fb1b10abSAndroid Build Coastguard Worker * _out9 = _in6 - _in9;
748*fb1b10abSAndroid Build Coastguard Worker * _out10 = _in5 - _in10;
749*fb1b10abSAndroid Build Coastguard Worker * _out11 = _in4 - _in11;
750*fb1b10abSAndroid Build Coastguard Worker * _out12 = _in3 - _in12;
751*fb1b10abSAndroid Build Coastguard Worker * _out13 = _in2 - _in13;
752*fb1b10abSAndroid Build Coastguard Worker * _out14 = _in1 - _in14;
753*fb1b10abSAndroid Build Coastguard Worker * _out15 = _in0 - _in15;
754*fb1b10abSAndroid Build Coastguard Worker * =============================================================================
755*fb1b10abSAndroid Build Coastguard Worker */
756*fb1b10abSAndroid Build Coastguard Worker
/*
 * 16-input butterfly using the byte-element add/sub intrinsics.
 * Every sum and difference is computed into a temporary before the first
 * output is written, so any _outN may name the same variable as any _inN.
 */
#define LSX_BUTTERFLY_16_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                           _in8, _in9, _in10, _in11, _in12, _in13, _in14, \
                           _in15, _out0, _out1, _out2, _out3, _out4, _out5, \
                           _out6, _out7, _out8, _out9, _out10, _out11, _out12, \
                           _out13, _out14, _out15) \
  { \
    __m128i _sum0_m = __lsx_vadd_b(_in0, _in15); \
    __m128i _sum1_m = __lsx_vadd_b(_in1, _in14); \
    __m128i _sum2_m = __lsx_vadd_b(_in2, _in13); \
    __m128i _sum3_m = __lsx_vadd_b(_in3, _in12); \
    __m128i _sum4_m = __lsx_vadd_b(_in4, _in11); \
    __m128i _sum5_m = __lsx_vadd_b(_in5, _in10); \
    __m128i _sum6_m = __lsx_vadd_b(_in6, _in9); \
    __m128i _sum7_m = __lsx_vadd_b(_in7, _in8); \
    __m128i _dif7_m = __lsx_vsub_b(_in7, _in8); \
    __m128i _dif6_m = __lsx_vsub_b(_in6, _in9); \
    __m128i _dif5_m = __lsx_vsub_b(_in5, _in10); \
    __m128i _dif4_m = __lsx_vsub_b(_in4, _in11); \
    __m128i _dif3_m = __lsx_vsub_b(_in3, _in12); \
    __m128i _dif2_m = __lsx_vsub_b(_in2, _in13); \
    __m128i _dif1_m = __lsx_vsub_b(_in1, _in14); \
    __m128i _dif0_m = __lsx_vsub_b(_in0, _in15); \
    \
    _out0 = _sum0_m; \
    _out1 = _sum1_m; \
    _out2 = _sum2_m; \
    _out3 = _sum3_m; \
    _out4 = _sum4_m; \
    _out5 = _sum5_m; \
    _out6 = _sum6_m; \
    _out7 = _sum7_m; \
    \
    _out8 = _dif7_m; \
    _out9 = _dif6_m; \
    _out10 = _dif5_m; \
    _out11 = _dif4_m; \
    _out12 = _dif3_m; \
    _out13 = _dif2_m; \
    _out14 = _dif1_m; \
    _out15 = _dif0_m; \
  }
781*fb1b10abSAndroid Build Coastguard Worker
/*
 * 16-input butterfly using the half-word-element add/sub intrinsics.
 * Every sum and difference is computed into a temporary before the first
 * output is written, so any _outN may name the same variable as any _inN.
 */
#define LSX_BUTTERFLY_16_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                           _in8, _in9, _in10, _in11, _in12, _in13, _in14, \
                           _in15, _out0, _out1, _out2, _out3, _out4, _out5, \
                           _out6, _out7, _out8, _out9, _out10, _out11, _out12, \
                           _out13, _out14, _out15) \
  { \
    __m128i _sum0_m = __lsx_vadd_h(_in0, _in15); \
    __m128i _sum1_m = __lsx_vadd_h(_in1, _in14); \
    __m128i _sum2_m = __lsx_vadd_h(_in2, _in13); \
    __m128i _sum3_m = __lsx_vadd_h(_in3, _in12); \
    __m128i _sum4_m = __lsx_vadd_h(_in4, _in11); \
    __m128i _sum5_m = __lsx_vadd_h(_in5, _in10); \
    __m128i _sum6_m = __lsx_vadd_h(_in6, _in9); \
    __m128i _sum7_m = __lsx_vadd_h(_in7, _in8); \
    __m128i _dif7_m = __lsx_vsub_h(_in7, _in8); \
    __m128i _dif6_m = __lsx_vsub_h(_in6, _in9); \
    __m128i _dif5_m = __lsx_vsub_h(_in5, _in10); \
    __m128i _dif4_m = __lsx_vsub_h(_in4, _in11); \
    __m128i _dif3_m = __lsx_vsub_h(_in3, _in12); \
    __m128i _dif2_m = __lsx_vsub_h(_in2, _in13); \
    __m128i _dif1_m = __lsx_vsub_h(_in1, _in14); \
    __m128i _dif0_m = __lsx_vsub_h(_in0, _in15); \
    \
    _out0 = _sum0_m; \
    _out1 = _sum1_m; \
    _out2 = _sum2_m; \
    _out3 = _sum3_m; \
    _out4 = _sum4_m; \
    _out5 = _sum5_m; \
    _out6 = _sum6_m; \
    _out7 = _sum7_m; \
    \
    _out8 = _dif7_m; \
    _out9 = _dif6_m; \
    _out10 = _dif5_m; \
    _out11 = _dif4_m; \
    _out12 = _dif3_m; \
    _out13 = _dif2_m; \
    _out14 = _dif1_m; \
    _out15 = _dif0_m; \
  }
806*fb1b10abSAndroid Build Coastguard Worker
/*
 * =============================================================================
 * Description : Butterfly of 16 input vectors (word lanes)
 * Arguments   : Inputs  - _in0 ... _in15
 *               Outputs - _out0 ... _out15
 * Details     : For k = 0..7:
 *                 _out[k]      = _in[k] + _in[15 - k]
 *                 _out[15 - k] = _in[k] - _in[15 - k]
 *               using __lsx_vadd_w / __lsx_vsub_w.
 * Note        : Every input is expanded twice and the outputs are assigned
 *               in order (_out0 first, _out15 last), so pass side-effect-free
 *               arguments and do not use an input that is still to be read
 *               as an output. The macro expands to a brace-enclosed block,
 *               not an expression.
 * =============================================================================
 */
#define LSX_BUTTERFLY_16_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                           _in8, _in9, _in10, _in11, _in12, _in13, _in14, \
                           _in15, _out0, _out1, _out2, _out3, _out4, _out5, \
                           _out6, _out7, _out8, _out9, _out10, _out11, _out12, \
                           _out13, _out14, _out15) \
  { \
    _out0 = __lsx_vadd_w(_in0, _in15); \
    _out1 = __lsx_vadd_w(_in1, _in14); \
    _out2 = __lsx_vadd_w(_in2, _in13); \
    _out3 = __lsx_vadd_w(_in3, _in12); \
    _out4 = __lsx_vadd_w(_in4, _in11); \
    _out5 = __lsx_vadd_w(_in5, _in10); \
    _out6 = __lsx_vadd_w(_in6, _in9); \
    _out7 = __lsx_vadd_w(_in7, _in8); \
    \
    _out8 = __lsx_vsub_w(_in7, _in8); \
    _out9 = __lsx_vsub_w(_in6, _in9); \
    _out10 = __lsx_vsub_w(_in5, _in10); \
    _out11 = __lsx_vsub_w(_in4, _in11); \
    _out12 = __lsx_vsub_w(_in3, _in12); \
    _out13 = __lsx_vsub_w(_in2, _in13); \
    _out14 = __lsx_vsub_w(_in1, _in14); \
    _out15 = __lsx_vsub_w(_in0, _in15); \
  }
831*fb1b10abSAndroid Build Coastguard Worker
/*
 * =============================================================================
 * Description : Butterfly of 16 input vectors (doubleword lanes)
 * Arguments   : Inputs  - _in0 ... _in15
 *               Outputs - _out0 ... _out15
 * Details     : For k = 0..7:
 *                 _out[k]      = _in[k] + _in[15 - k]
 *                 _out[15 - k] = _in[k] - _in[15 - k]
 *               using __lsx_vadd_d / __lsx_vsub_d.
 * Note        : Every input is expanded twice and the outputs are assigned
 *               in order (_out0 first, _out15 last), so pass side-effect-free
 *               arguments and do not use an input that is still to be read
 *               as an output. The macro expands to a brace-enclosed block,
 *               not an expression.
 * =============================================================================
 */
#define LSX_BUTTERFLY_16_D(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                           _in8, _in9, _in10, _in11, _in12, _in13, _in14, \
                           _in15, _out0, _out1, _out2, _out3, _out4, _out5, \
                           _out6, _out7, _out8, _out9, _out10, _out11, _out12, \
                           _out13, _out14, _out15) \
  { \
    _out0 = __lsx_vadd_d(_in0, _in15); \
    _out1 = __lsx_vadd_d(_in1, _in14); \
    _out2 = __lsx_vadd_d(_in2, _in13); \
    _out3 = __lsx_vadd_d(_in3, _in12); \
    _out4 = __lsx_vadd_d(_in4, _in11); \
    _out5 = __lsx_vadd_d(_in5, _in10); \
    _out6 = __lsx_vadd_d(_in6, _in9); \
    _out7 = __lsx_vadd_d(_in7, _in8); \
    \
    _out8 = __lsx_vsub_d(_in7, _in8); \
    _out9 = __lsx_vsub_d(_in6, _in9); \
    _out10 = __lsx_vsub_d(_in5, _in10); \
    _out11 = __lsx_vsub_d(_in4, _in11); \
    _out12 = __lsx_vsub_d(_in3, _in12); \
    _out13 = __lsx_vsub_d(_in2, _in13); \
    _out14 = __lsx_vsub_d(_in1, _in14); \
    _out15 = __lsx_vsub_d(_in0, _in15); \
  }
856*fb1b10abSAndroid Build Coastguard Worker
857*fb1b10abSAndroid Build Coastguard Worker #endif // LSX
858*fb1b10abSAndroid Build Coastguard Worker
859*fb1b10abSAndroid Build Coastguard Worker #ifdef __loongarch_asx
860*fb1b10abSAndroid Build Coastguard Worker #include <lasxintrin.h>
861*fb1b10abSAndroid Build Coastguard Worker /*
862*fb1b10abSAndroid Build Coastguard Worker * =============================================================================
863*fb1b10abSAndroid Build Coastguard Worker * Description : Dot product of byte vector elements
864*fb1b10abSAndroid Build Coastguard Worker * Arguments : Inputs - in_h, in_l
865*fb1b10abSAndroid Build Coastguard Worker * Output - out
866*fb1b10abSAndroid Build Coastguard Worker * Return Type - signed halfword
867*fb1b10abSAndroid Build Coastguard Worker * Details : Unsigned byte elements from in_h are multiplied with
868*fb1b10abSAndroid Build Coastguard Worker * unsigned byte elements from in_l producing a result
869*fb1b10abSAndroid Build Coastguard Worker * twice the size of input i.e. signed halfword.
870*fb1b10abSAndroid Build Coastguard Worker * Then these multiplied results of adjacent odd-even elements
871*fb1b10abSAndroid Build Coastguard Worker * are added to the out vector
872*fb1b10abSAndroid Build Coastguard Worker * Example : See out = __lasx_xvdp2_w_h(in_h, in_l)
873*fb1b10abSAndroid Build Coastguard Worker * =============================================================================
874*fb1b10abSAndroid Build Coastguard Worker */
__lasx_xvdp2_h_bu(__m256i in_h,__m256i in_l)875*fb1b10abSAndroid Build Coastguard Worker static inline __m256i __lasx_xvdp2_h_bu(__m256i in_h, __m256i in_l) {
876*fb1b10abSAndroid Build Coastguard Worker __m256i out;
877*fb1b10abSAndroid Build Coastguard Worker
878*fb1b10abSAndroid Build Coastguard Worker out = __lasx_xvmulwev_h_bu(in_h, in_l);
879*fb1b10abSAndroid Build Coastguard Worker out = __lasx_xvmaddwod_h_bu(out, in_h, in_l);
880*fb1b10abSAndroid Build Coastguard Worker return out;
881*fb1b10abSAndroid Build Coastguard Worker }
882*fb1b10abSAndroid Build Coastguard Worker
883*fb1b10abSAndroid Build Coastguard Worker /*
884*fb1b10abSAndroid Build Coastguard Worker * =============================================================================
885*fb1b10abSAndroid Build Coastguard Worker * Description : Dot product of byte vector elements
886*fb1b10abSAndroid Build Coastguard Worker * Arguments : Inputs - in_h, in_l
887*fb1b10abSAndroid Build Coastguard Worker * Output - out
888*fb1b10abSAndroid Build Coastguard Worker * Return Type - signed halfword
889*fb1b10abSAndroid Build Coastguard Worker * Details : Signed byte elements from in_h are multiplied with
890*fb1b10abSAndroid Build Coastguard Worker * signed byte elements from in_l producing a result
891*fb1b10abSAndroid Build Coastguard Worker * twice the size of input i.e. signed halfword.
892*fb1b10abSAndroid Build Coastguard Worker * Then these multiplication results of adjacent odd-even elements
893*fb1b10abSAndroid Build Coastguard Worker * are added to the out vector
894*fb1b10abSAndroid Build Coastguard Worker * Example : See out = __lasx_xvdp2_w_h(in_h, in_l)
895*fb1b10abSAndroid Build Coastguard Worker * =============================================================================
896*fb1b10abSAndroid Build Coastguard Worker */
__lasx_xvdp2_h_b(__m256i in_h,__m256i in_l)897*fb1b10abSAndroid Build Coastguard Worker static inline __m256i __lasx_xvdp2_h_b(__m256i in_h, __m256i in_l) {
898*fb1b10abSAndroid Build Coastguard Worker __m256i out;
899*fb1b10abSAndroid Build Coastguard Worker
900*fb1b10abSAndroid Build Coastguard Worker out = __lasx_xvmulwev_h_b(in_h, in_l);
901*fb1b10abSAndroid Build Coastguard Worker out = __lasx_xvmaddwod_h_b(out, in_h, in_l);
902*fb1b10abSAndroid Build Coastguard Worker return out;
903*fb1b10abSAndroid Build Coastguard Worker }
904*fb1b10abSAndroid Build Coastguard Worker
905*fb1b10abSAndroid Build Coastguard Worker /*
906*fb1b10abSAndroid Build Coastguard Worker * =============================================================================
907*fb1b10abSAndroid Build Coastguard Worker * Description : Dot product of halfword vector elements
908*fb1b10abSAndroid Build Coastguard Worker * Arguments : Inputs - in_h, in_l
909*fb1b10abSAndroid Build Coastguard Worker * Output - out
910*fb1b10abSAndroid Build Coastguard Worker * Return Type - signed word
911*fb1b10abSAndroid Build Coastguard Worker * Details : Signed halfword elements from in_h are multiplied with
912*fb1b10abSAndroid Build Coastguard Worker * signed halfword elements from in_l producing a result
913*fb1b10abSAndroid Build Coastguard Worker * twice the size of input i.e. signed word.
914*fb1b10abSAndroid Build Coastguard Worker * Then these multiplied results of adjacent odd-even elements
915*fb1b10abSAndroid Build Coastguard Worker * are added to the out vector.
916*fb1b10abSAndroid Build Coastguard Worker * Example : out = __lasx_xvdp2_w_h(in_h, in_l)
917*fb1b10abSAndroid Build Coastguard Worker * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
918*fb1b10abSAndroid Build Coastguard Worker * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
919*fb1b10abSAndroid Build Coastguard Worker * out : 22,38,38,22, 22,38,38,22
920*fb1b10abSAndroid Build Coastguard Worker * =============================================================================
921*fb1b10abSAndroid Build Coastguard Worker */
__lasx_xvdp2_w_h(__m256i in_h,__m256i in_l)922*fb1b10abSAndroid Build Coastguard Worker static inline __m256i __lasx_xvdp2_w_h(__m256i in_h, __m256i in_l) {
923*fb1b10abSAndroid Build Coastguard Worker __m256i out;
924*fb1b10abSAndroid Build Coastguard Worker
925*fb1b10abSAndroid Build Coastguard Worker out = __lasx_xvmulwev_w_h(in_h, in_l);
926*fb1b10abSAndroid Build Coastguard Worker out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
927*fb1b10abSAndroid Build Coastguard Worker return out;
928*fb1b10abSAndroid Build Coastguard Worker }
929*fb1b10abSAndroid Build Coastguard Worker
930*fb1b10abSAndroid Build Coastguard Worker /*
931*fb1b10abSAndroid Build Coastguard Worker * =============================================================================
932*fb1b10abSAndroid Build Coastguard Worker * Description : Dot product of word vector elements
933*fb1b10abSAndroid Build Coastguard Worker * Arguments : Inputs - in_h, in_l
934*fb1b10abSAndroid Build Coastguard Worker * Output - out
935*fb1b10abSAndroid Build Coastguard Worker * Return Type - signed double
936*fb1b10abSAndroid Build Coastguard Worker * Details : Signed word elements from in_h are multiplied with
937*fb1b10abSAndroid Build Coastguard Worker * signed word elements from in_l producing a result
938*fb1b10abSAndroid Build Coastguard Worker * twice the size of input i.e. signed double-word.
939*fb1b10abSAndroid Build Coastguard Worker * Then these multiplied results of adjacent odd-even elements
940*fb1b10abSAndroid Build Coastguard Worker * are added to the out vector.
941*fb1b10abSAndroid Build Coastguard Worker * Example : See out = __lasx_xvdp2_w_h(in_h, in_l)
942*fb1b10abSAndroid Build Coastguard Worker * =============================================================================
943*fb1b10abSAndroid Build Coastguard Worker */
__lasx_xvdp2_d_w(__m256i in_h,__m256i in_l)944*fb1b10abSAndroid Build Coastguard Worker static inline __m256i __lasx_xvdp2_d_w(__m256i in_h, __m256i in_l) {
945*fb1b10abSAndroid Build Coastguard Worker __m256i out;
946*fb1b10abSAndroid Build Coastguard Worker
947*fb1b10abSAndroid Build Coastguard Worker out = __lasx_xvmulwev_d_w(in_h, in_l);
948*fb1b10abSAndroid Build Coastguard Worker out = __lasx_xvmaddwod_d_w(out, in_h, in_l);
949*fb1b10abSAndroid Build Coastguard Worker return out;
950*fb1b10abSAndroid Build Coastguard Worker }
951*fb1b10abSAndroid Build Coastguard Worker
952*fb1b10abSAndroid Build Coastguard Worker /*
953*fb1b10abSAndroid Build Coastguard Worker * =============================================================================
954*fb1b10abSAndroid Build Coastguard Worker * Description : Dot product of halfword vector elements
955*fb1b10abSAndroid Build Coastguard Worker * Arguments : Inputs - in_h, in_l
956*fb1b10abSAndroid Build Coastguard Worker * Output - out
957*fb1b10abSAndroid Build Coastguard Worker * Return Type - signed word
958*fb1b10abSAndroid Build Coastguard Worker * Details : Unsigned halfword elements from in_h are multiplied with
959*fb1b10abSAndroid Build Coastguard Worker * signed halfword elements from in_l producing a result
960*fb1b10abSAndroid Build Coastguard Worker * twice the size of input i.e. unsigned word.
961*fb1b10abSAndroid Build Coastguard Worker * Multiplication result of adjacent odd-even elements
962*fb1b10abSAndroid Build Coastguard Worker * are added to the out vector
963*fb1b10abSAndroid Build Coastguard Worker * Example : See out = __lasx_xvdp2_w_h(in_h, in_l)
964*fb1b10abSAndroid Build Coastguard Worker * =============================================================================
965*fb1b10abSAndroid Build Coastguard Worker */
__lasx_xvdp2_w_hu_h(__m256i in_h,__m256i in_l)966*fb1b10abSAndroid Build Coastguard Worker static inline __m256i __lasx_xvdp2_w_hu_h(__m256i in_h, __m256i in_l) {
967*fb1b10abSAndroid Build Coastguard Worker __m256i out;
968*fb1b10abSAndroid Build Coastguard Worker
969*fb1b10abSAndroid Build Coastguard Worker out = __lasx_xvmulwev_w_hu_h(in_h, in_l);
970*fb1b10abSAndroid Build Coastguard Worker out = __lasx_xvmaddwod_w_hu_h(out, in_h, in_l);
971*fb1b10abSAndroid Build Coastguard Worker return out;
972*fb1b10abSAndroid Build Coastguard Worker }
973*fb1b10abSAndroid Build Coastguard Worker
974*fb1b10abSAndroid Build Coastguard Worker /*
975*fb1b10abSAndroid Build Coastguard Worker * =============================================================================
976*fb1b10abSAndroid Build Coastguard Worker * Description : Dot product & addition of byte vector elements
977*fb1b10abSAndroid Build Coastguard Worker * Arguments : Inputs - in_h, in_l
978*fb1b10abSAndroid Build Coastguard Worker * Output - out
979*fb1b10abSAndroid Build Coastguard Worker * Return Type - halfword
980*fb1b10abSAndroid Build Coastguard Worker * Details : Signed byte elements from in_h are multiplied with
981*fb1b10abSAndroid Build Coastguard Worker * signed byte elements from in_l producing a result
982*fb1b10abSAndroid Build Coastguard Worker * twice the size of input i.e. signed halfword.
983*fb1b10abSAndroid Build Coastguard Worker * Then these multiplied results of adjacent odd-even elements
984*fb1b10abSAndroid Build Coastguard Worker * are added to the in_c vector.
985*fb1b10abSAndroid Build Coastguard Worker * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
986*fb1b10abSAndroid Build Coastguard Worker * =============================================================================
987*fb1b10abSAndroid Build Coastguard Worker */
__lasx_xvdp2add_h_b(__m256i in_c,__m256i in_h,__m256i in_l)988*fb1b10abSAndroid Build Coastguard Worker static inline __m256i __lasx_xvdp2add_h_b(__m256i in_c, __m256i in_h,
989*fb1b10abSAndroid Build Coastguard Worker __m256i in_l) {
990*fb1b10abSAndroid Build Coastguard Worker __m256i out;
991*fb1b10abSAndroid Build Coastguard Worker
992*fb1b10abSAndroid Build Coastguard Worker out = __lasx_xvmaddwev_h_b(in_c, in_h, in_l);
993*fb1b10abSAndroid Build Coastguard Worker out = __lasx_xvmaddwod_h_b(out, in_h, in_l);
994*fb1b10abSAndroid Build Coastguard Worker return out;
995*fb1b10abSAndroid Build Coastguard Worker }
996*fb1b10abSAndroid Build Coastguard Worker
997*fb1b10abSAndroid Build Coastguard Worker /*
998*fb1b10abSAndroid Build Coastguard Worker * =============================================================================
999*fb1b10abSAndroid Build Coastguard Worker * Description : Dot product & addition of byte vector elements
1000*fb1b10abSAndroid Build Coastguard Worker * Arguments : Inputs - in_h, in_l
1001*fb1b10abSAndroid Build Coastguard Worker * Output - out
1002*fb1b10abSAndroid Build Coastguard Worker * Return Type - halfword
1003*fb1b10abSAndroid Build Coastguard Worker * Details : Unsigned byte elements from in_h are multiplied with
1004*fb1b10abSAndroid Build Coastguard Worker * unsigned byte elements from in_l producing a result
1005*fb1b10abSAndroid Build Coastguard Worker * twice the size of input i.e. signed halfword.
1006*fb1b10abSAndroid Build Coastguard Worker * Then these multiplied results of adjacent odd-even elements
1007*fb1b10abSAndroid Build Coastguard Worker * are added to the in_c vector.
1008*fb1b10abSAndroid Build Coastguard Worker * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
1009*fb1b10abSAndroid Build Coastguard Worker * =============================================================================
1010*fb1b10abSAndroid Build Coastguard Worker */
__lasx_xvdp2add_h_bu(__m256i in_c,__m256i in_h,__m256i in_l)1011*fb1b10abSAndroid Build Coastguard Worker static inline __m256i __lasx_xvdp2add_h_bu(__m256i in_c, __m256i in_h,
1012*fb1b10abSAndroid Build Coastguard Worker __m256i in_l) {
1013*fb1b10abSAndroid Build Coastguard Worker __m256i out;
1014*fb1b10abSAndroid Build Coastguard Worker
1015*fb1b10abSAndroid Build Coastguard Worker out = __lasx_xvmaddwev_h_bu(in_c, in_h, in_l);
1016*fb1b10abSAndroid Build Coastguard Worker out = __lasx_xvmaddwod_h_bu(out, in_h, in_l);
1017*fb1b10abSAndroid Build Coastguard Worker return out;
1018*fb1b10abSAndroid Build Coastguard Worker }
1019*fb1b10abSAndroid Build Coastguard Worker
1020*fb1b10abSAndroid Build Coastguard Worker /*
1021*fb1b10abSAndroid Build Coastguard Worker * =============================================================================
1022*fb1b10abSAndroid Build Coastguard Worker * Description : Dot product & addition of byte vector elements
1023*fb1b10abSAndroid Build Coastguard Worker * Arguments : Inputs - in_h, in_l
1024*fb1b10abSAndroid Build Coastguard Worker * Output - out
1025*fb1b10abSAndroid Build Coastguard Worker * Return Type - halfword
1026*fb1b10abSAndroid Build Coastguard Worker * Details : Unsigned byte elements from in_h are multiplied with
1027*fb1b10abSAndroid Build Coastguard Worker * signed byte elements from in_l producing a result
1028*fb1b10abSAndroid Build Coastguard Worker * twice the size of input i.e. signed halfword.
1029*fb1b10abSAndroid Build Coastguard Worker * Then these multiplied results of adjacent odd-even elements
1030*fb1b10abSAndroid Build Coastguard Worker * are added to the in_c vector.
1031*fb1b10abSAndroid Build Coastguard Worker * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
1032*fb1b10abSAndroid Build Coastguard Worker * =============================================================================
1033*fb1b10abSAndroid Build Coastguard Worker */
__lasx_xvdp2add_h_bu_b(__m256i in_c,__m256i in_h,__m256i in_l)1034*fb1b10abSAndroid Build Coastguard Worker static inline __m256i __lasx_xvdp2add_h_bu_b(__m256i in_c, __m256i in_h,
1035*fb1b10abSAndroid Build Coastguard Worker __m256i in_l) {
1036*fb1b10abSAndroid Build Coastguard Worker __m256i out;
1037*fb1b10abSAndroid Build Coastguard Worker
1038*fb1b10abSAndroid Build Coastguard Worker out = __lasx_xvmaddwev_h_bu_b(in_c, in_h, in_l);
1039*fb1b10abSAndroid Build Coastguard Worker out = __lasx_xvmaddwod_h_bu_b(out, in_h, in_l);
1040*fb1b10abSAndroid Build Coastguard Worker return out;
1041*fb1b10abSAndroid Build Coastguard Worker }
1042*fb1b10abSAndroid Build Coastguard Worker
1043*fb1b10abSAndroid Build Coastguard Worker /*
1044*fb1b10abSAndroid Build Coastguard Worker * =============================================================================
1045*fb1b10abSAndroid Build Coastguard Worker * Description : Dot product of halfword vector elements
1046*fb1b10abSAndroid Build Coastguard Worker * Arguments : Inputs - in_c, in_h, in_l
1047*fb1b10abSAndroid Build Coastguard Worker * Output - out
1048*fb1b10abSAndroid Build Coastguard Worker * Return Type - per RTYPE
1049*fb1b10abSAndroid Build Coastguard Worker * Details : Signed halfword elements from in_h are multiplied with
1050*fb1b10abSAndroid Build Coastguard Worker * signed halfword elements from in_l producing a result
1051*fb1b10abSAndroid Build Coastguard Worker * twice the size of input i.e. signed word.
1052*fb1b10abSAndroid Build Coastguard Worker * Multiplication result of adjacent odd-even elements
1053*fb1b10abSAndroid Build Coastguard Worker * are added to the in_c vector.
1054*fb1b10abSAndroid Build Coastguard Worker * Example : out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
1055*fb1b10abSAndroid Build Coastguard Worker * in_c : 1,2,3,4, 1,2,3,4
1056*fb1b10abSAndroid Build Coastguard Worker * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8,
1057*fb1b10abSAndroid Build Coastguard Worker * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1,
1058*fb1b10abSAndroid Build Coastguard Worker * out : 23,40,41,26, 23,40,41,26
1059*fb1b10abSAndroid Build Coastguard Worker * =============================================================================
1060*fb1b10abSAndroid Build Coastguard Worker */
__lasx_xvdp2add_w_h(__m256i in_c,__m256i in_h,__m256i in_l)1061*fb1b10abSAndroid Build Coastguard Worker static inline __m256i __lasx_xvdp2add_w_h(__m256i in_c, __m256i in_h,
1062*fb1b10abSAndroid Build Coastguard Worker __m256i in_l) {
1063*fb1b10abSAndroid Build Coastguard Worker __m256i out;
1064*fb1b10abSAndroid Build Coastguard Worker
1065*fb1b10abSAndroid Build Coastguard Worker out = __lasx_xvmaddwev_w_h(in_c, in_h, in_l);
1066*fb1b10abSAndroid Build Coastguard Worker out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
1067*fb1b10abSAndroid Build Coastguard Worker return out;
1068*fb1b10abSAndroid Build Coastguard Worker }
1069*fb1b10abSAndroid Build Coastguard Worker
1070*fb1b10abSAndroid Build Coastguard Worker /*
1071*fb1b10abSAndroid Build Coastguard Worker * =============================================================================
1072*fb1b10abSAndroid Build Coastguard Worker * Description : Dot product of halfword vector elements
1073*fb1b10abSAndroid Build Coastguard Worker * Arguments : Inputs - in_c, in_h, in_l
1074*fb1b10abSAndroid Build Coastguard Worker * Output - out
1075*fb1b10abSAndroid Build Coastguard Worker * Return Type - signed word
1076*fb1b10abSAndroid Build Coastguard Worker * Details : Unsigned halfword elements from in_h are multiplied with
1077*fb1b10abSAndroid Build Coastguard Worker * unsigned halfword elements from in_l producing a result
1078*fb1b10abSAndroid Build Coastguard Worker * twice the size of input i.e. signed word.
1079*fb1b10abSAndroid Build Coastguard Worker * Multiplication result of adjacent odd-even elements
1080*fb1b10abSAndroid Build Coastguard Worker * are added to the in_c vector.
1081*fb1b10abSAndroid Build Coastguard Worker * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
1082*fb1b10abSAndroid Build Coastguard Worker * =============================================================================
1083*fb1b10abSAndroid Build Coastguard Worker */
__lasx_xvdp2add_w_hu(__m256i in_c,__m256i in_h,__m256i in_l)1084*fb1b10abSAndroid Build Coastguard Worker static inline __m256i __lasx_xvdp2add_w_hu(__m256i in_c, __m256i in_h,
1085*fb1b10abSAndroid Build Coastguard Worker __m256i in_l) {
1086*fb1b10abSAndroid Build Coastguard Worker __m256i out;
1087*fb1b10abSAndroid Build Coastguard Worker
1088*fb1b10abSAndroid Build Coastguard Worker out = __lasx_xvmaddwev_w_hu(in_c, in_h, in_l);
1089*fb1b10abSAndroid Build Coastguard Worker out = __lasx_xvmaddwod_w_hu(out, in_h, in_l);
1090*fb1b10abSAndroid Build Coastguard Worker return out;
1091*fb1b10abSAndroid Build Coastguard Worker }
1092*fb1b10abSAndroid Build Coastguard Worker
1093*fb1b10abSAndroid Build Coastguard Worker /*
1094*fb1b10abSAndroid Build Coastguard Worker * =============================================================================
1095*fb1b10abSAndroid Build Coastguard Worker * Description : Dot product of halfword vector elements
1096*fb1b10abSAndroid Build Coastguard Worker * Arguments : Inputs - in_c, in_h, in_l
1097*fb1b10abSAndroid Build Coastguard Worker * Output - out
1098*fb1b10abSAndroid Build Coastguard Worker * Return Type - signed word
1099*fb1b10abSAndroid Build Coastguard Worker * Details : Unsigned halfword elements from in_h are multiplied with
1100*fb1b10abSAndroid Build Coastguard Worker * signed halfword elements from in_l producing a result
1101*fb1b10abSAndroid Build Coastguard Worker * twice the size of input i.e. signed word.
1102*fb1b10abSAndroid Build Coastguard Worker * Multiplication result of adjacent odd-even elements
1103*fb1b10abSAndroid Build Coastguard Worker * are added to the in_c vector
1104*fb1b10abSAndroid Build Coastguard Worker * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
1105*fb1b10abSAndroid Build Coastguard Worker * =============================================================================
1106*fb1b10abSAndroid Build Coastguard Worker */
__lasx_xvdp2add_w_hu_h(__m256i in_c,__m256i in_h,__m256i in_l)1107*fb1b10abSAndroid Build Coastguard Worker static inline __m256i __lasx_xvdp2add_w_hu_h(__m256i in_c, __m256i in_h,
1108*fb1b10abSAndroid Build Coastguard Worker __m256i in_l) {
1109*fb1b10abSAndroid Build Coastguard Worker __m256i out;
1110*fb1b10abSAndroid Build Coastguard Worker
1111*fb1b10abSAndroid Build Coastguard Worker out = __lasx_xvmaddwev_w_hu_h(in_c, in_h, in_l);
1112*fb1b10abSAndroid Build Coastguard Worker out = __lasx_xvmaddwod_w_hu_h(out, in_h, in_l);
1113*fb1b10abSAndroid Build Coastguard Worker return out;
1114*fb1b10abSAndroid Build Coastguard Worker }
1115*fb1b10abSAndroid Build Coastguard Worker
1116*fb1b10abSAndroid Build Coastguard Worker /*
1117*fb1b10abSAndroid Build Coastguard Worker * =============================================================================
1118*fb1b10abSAndroid Build Coastguard Worker * Description : Vector Unsigned Dot Product and Subtract
1119*fb1b10abSAndroid Build Coastguard Worker * Arguments : Inputs - in_c, in_h, in_l
1120*fb1b10abSAndroid Build Coastguard Worker * Output - out
1121*fb1b10abSAndroid Build Coastguard Worker * Return Type - signed halfword
1122*fb1b10abSAndroid Build Coastguard Worker * Details : Unsigned byte elements from in_h are multiplied with
1123*fb1b10abSAndroid Build Coastguard Worker * unsigned byte elements from in_l producing a result
1124*fb1b10abSAndroid Build Coastguard Worker * twice the size of input i.e. signed halfword.
1125*fb1b10abSAndroid Build Coastguard Worker * Multiplication result of adjacent odd-even elements
1126*fb1b10abSAndroid Build Coastguard Worker * are added together and subtracted from double width elements
1127*fb1b10abSAndroid Build Coastguard Worker * in_c vector.
1128*fb1b10abSAndroid Build Coastguard Worker * Example : See out = __lasx_xvdp2sub_w_h(in_c, in_h, in_l)
1129*fb1b10abSAndroid Build Coastguard Worker * =============================================================================
1130*fb1b10abSAndroid Build Coastguard Worker */
__lasx_xvdp2sub_h_bu(__m256i in_c,__m256i in_h,__m256i in_l)1131*fb1b10abSAndroid Build Coastguard Worker static inline __m256i __lasx_xvdp2sub_h_bu(__m256i in_c, __m256i in_h,
1132*fb1b10abSAndroid Build Coastguard Worker __m256i in_l) {
1133*fb1b10abSAndroid Build Coastguard Worker __m256i out;
1134*fb1b10abSAndroid Build Coastguard Worker
1135*fb1b10abSAndroid Build Coastguard Worker out = __lasx_xvmulwev_h_bu(in_h, in_l);
1136*fb1b10abSAndroid Build Coastguard Worker out = __lasx_xvmaddwod_h_bu(out, in_h, in_l);
1137*fb1b10abSAndroid Build Coastguard Worker out = __lasx_xvsub_h(in_c, out);
1138*fb1b10abSAndroid Build Coastguard Worker return out;
1139*fb1b10abSAndroid Build Coastguard Worker }
1140*fb1b10abSAndroid Build Coastguard Worker
1141*fb1b10abSAndroid Build Coastguard Worker /*
1142*fb1b10abSAndroid Build Coastguard Worker * =============================================================================
1143*fb1b10abSAndroid Build Coastguard Worker * Description : Vector Signed Dot Product and Subtract
1144*fb1b10abSAndroid Build Coastguard Worker * Arguments : Inputs - in_c, in_h, in_l
1145*fb1b10abSAndroid Build Coastguard Worker * Output - out
1146*fb1b10abSAndroid Build Coastguard Worker * Return Type - signed word
1147*fb1b10abSAndroid Build Coastguard Worker * Details : Signed halfword elements from in_h are multiplied with
1148*fb1b10abSAndroid Build Coastguard Worker * Signed halfword elements from in_l producing a result
1149*fb1b10abSAndroid Build Coastguard Worker * twice the size of input i.e. signed word.
1150*fb1b10abSAndroid Build Coastguard Worker * Multiplication result of adjacent odd-even elements
1151*fb1b10abSAndroid Build Coastguard Worker * are added together and subtracted from double width elements
1152*fb1b10abSAndroid Build Coastguard Worker * in_c vector.
1153*fb1b10abSAndroid Build Coastguard Worker * Example : out = __lasx_xvdp2sub_w_h(in_c, in_h, in_l)
1154*fb1b10abSAndroid Build Coastguard Worker * in_c : 0,0,0,0, 0,0,0,0
1155*fb1b10abSAndroid Build Coastguard Worker * in_h : 3,1,3,0, 0,0,0,1, 0,0,1,1, 0,0,0,1
1156*fb1b10abSAndroid Build Coastguard Worker * in_l : 2,1,1,0, 1,0,0,0, 0,0,1,0, 1,0,0,1
1157*fb1b10abSAndroid Build Coastguard Worker * out : -7,-3,0,0, 0,-1,0,-1
1158*fb1b10abSAndroid Build Coastguard Worker * =============================================================================
1159*fb1b10abSAndroid Build Coastguard Worker */
__lasx_xvdp2sub_w_h(__m256i in_c,__m256i in_h,__m256i in_l)1160*fb1b10abSAndroid Build Coastguard Worker static inline __m256i __lasx_xvdp2sub_w_h(__m256i in_c, __m256i in_h,
1161*fb1b10abSAndroid Build Coastguard Worker __m256i in_l) {
1162*fb1b10abSAndroid Build Coastguard Worker __m256i out;
1163*fb1b10abSAndroid Build Coastguard Worker
1164*fb1b10abSAndroid Build Coastguard Worker out = __lasx_xvmulwev_w_h(in_h, in_l);
1165*fb1b10abSAndroid Build Coastguard Worker out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
1166*fb1b10abSAndroid Build Coastguard Worker out = __lasx_xvsub_w(in_c, out);
1167*fb1b10abSAndroid Build Coastguard Worker return out;
1168*fb1b10abSAndroid Build Coastguard Worker }
1169*fb1b10abSAndroid Build Coastguard Worker
1170*fb1b10abSAndroid Build Coastguard Worker /*
1171*fb1b10abSAndroid Build Coastguard Worker * =============================================================================
1172*fb1b10abSAndroid Build Coastguard Worker * Description : Dot product of halfword vector elements
1173*fb1b10abSAndroid Build Coastguard Worker * Arguments : Inputs - in_h, in_l
1174*fb1b10abSAndroid Build Coastguard Worker * Output - out
1175*fb1b10abSAndroid Build Coastguard Worker * Return Type - signed word
1176*fb1b10abSAndroid Build Coastguard Worker * Details : Signed halfword elements from in_h are multiplied with
1177*fb1b10abSAndroid Build Coastguard Worker * signed halfword elements from in_l producing a result
1178*fb1b10abSAndroid Build Coastguard Worker * four times the size of input i.e. signed doubleword.
1179*fb1b10abSAndroid Build Coastguard Worker * Then these multiplication results of four adjacent elements
1180*fb1b10abSAndroid Build Coastguard Worker * are added together and stored to the out vector.
1181*fb1b10abSAndroid Build Coastguard Worker * Example : out = __lasx_xvdp4_d_h(in_h, in_l)
1182*fb1b10abSAndroid Build Coastguard Worker * in_h : 3,1,3,0, 0,0,0,1, 0,0,1,-1, 0,0,0,1
1183*fb1b10abSAndroid Build Coastguard Worker * in_l : -2,1,1,0, 1,0,0,0, 0,0,1, 0, 1,0,0,1
1184*fb1b10abSAndroid Build Coastguard Worker * out : -2,0,1,1
1185*fb1b10abSAndroid Build Coastguard Worker * =============================================================================
1186*fb1b10abSAndroid Build Coastguard Worker */
__lasx_xvdp4_d_h(__m256i in_h,__m256i in_l)1187*fb1b10abSAndroid Build Coastguard Worker static inline __m256i __lasx_xvdp4_d_h(__m256i in_h, __m256i in_l) {
1188*fb1b10abSAndroid Build Coastguard Worker __m256i out;
1189*fb1b10abSAndroid Build Coastguard Worker
1190*fb1b10abSAndroid Build Coastguard Worker out = __lasx_xvmulwev_w_h(in_h, in_l);
1191*fb1b10abSAndroid Build Coastguard Worker out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
1192*fb1b10abSAndroid Build Coastguard Worker out = __lasx_xvhaddw_d_w(out, out);
1193*fb1b10abSAndroid Build Coastguard Worker return out;
1194*fb1b10abSAndroid Build Coastguard Worker }
1195*fb1b10abSAndroid Build Coastguard Worker
1196*fb1b10abSAndroid Build Coastguard Worker /*
1197*fb1b10abSAndroid Build Coastguard Worker * =============================================================================
1198*fb1b10abSAndroid Build Coastguard Worker * Description : The high half of the vector elements are expanded and
1199*fb1b10abSAndroid Build Coastguard Worker * added after being doubled.
1200*fb1b10abSAndroid Build Coastguard Worker * Arguments : Inputs - in_h, in_l
1201*fb1b10abSAndroid Build Coastguard Worker * Output - out
1202*fb1b10abSAndroid Build Coastguard Worker * Details : The in_h vector and the in_l vector are added after the
1203*fb1b10abSAndroid Build Coastguard Worker * higher half of the two-fold sign extension (signed byte
1204*fb1b10abSAndroid Build Coastguard Worker * to signed halfword) and stored to the out vector.
1205*fb1b10abSAndroid Build Coastguard Worker * Example : See out = __lasx_xvaddwh_w_h(in_h, in_l)
1206*fb1b10abSAndroid Build Coastguard Worker * =============================================================================
1207*fb1b10abSAndroid Build Coastguard Worker */
__lasx_xvaddwh_h_b(__m256i in_h,__m256i in_l)1208*fb1b10abSAndroid Build Coastguard Worker static inline __m256i __lasx_xvaddwh_h_b(__m256i in_h, __m256i in_l) {
1209*fb1b10abSAndroid Build Coastguard Worker __m256i out;
1210*fb1b10abSAndroid Build Coastguard Worker
1211*fb1b10abSAndroid Build Coastguard Worker out = __lasx_xvilvh_b(in_h, in_l);
1212*fb1b10abSAndroid Build Coastguard Worker out = __lasx_xvhaddw_h_b(out, out);
1213*fb1b10abSAndroid Build Coastguard Worker return out;
1214*fb1b10abSAndroid Build Coastguard Worker }
1215*fb1b10abSAndroid Build Coastguard Worker
1216*fb1b10abSAndroid Build Coastguard Worker /*
1217*fb1b10abSAndroid Build Coastguard Worker * =============================================================================
1218*fb1b10abSAndroid Build Coastguard Worker * Description : The high half of the vector elements are expanded and
1219*fb1b10abSAndroid Build Coastguard Worker * added after being doubled.
1220*fb1b10abSAndroid Build Coastguard Worker * Arguments : Inputs - in_h, in_l
1221*fb1b10abSAndroid Build Coastguard Worker * Output - out
1222*fb1b10abSAndroid Build Coastguard Worker * Details : The in_h vector and the in_l vector are added after the
1223*fb1b10abSAndroid Build Coastguard Worker * higher half of the two-fold sign extension (signed halfword
1224*fb1b10abSAndroid Build Coastguard Worker * to signed word) and stored to the out vector.
1225*fb1b10abSAndroid Build Coastguard Worker * Example : out = __lasx_xvaddwh_w_h(in_h, in_l)
1226*fb1b10abSAndroid Build Coastguard Worker * in_h : 3, 0,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1
1227*fb1b10abSAndroid Build Coastguard Worker * in_l : 2,-1,1,2, 1,0,0, 0, 1,0,1, 0, 1,0,0,1
1228*fb1b10abSAndroid Build Coastguard Worker * out : 1,0,0,-1, 1,0,0, 2
1229*fb1b10abSAndroid Build Coastguard Worker * =============================================================================
1230*fb1b10abSAndroid Build Coastguard Worker */
__lasx_xvaddwh_w_h(__m256i in_h,__m256i in_l)1231*fb1b10abSAndroid Build Coastguard Worker static inline __m256i __lasx_xvaddwh_w_h(__m256i in_h, __m256i in_l) {
1232*fb1b10abSAndroid Build Coastguard Worker __m256i out;
1233*fb1b10abSAndroid Build Coastguard Worker
1234*fb1b10abSAndroid Build Coastguard Worker out = __lasx_xvilvh_h(in_h, in_l);
1235*fb1b10abSAndroid Build Coastguard Worker out = __lasx_xvhaddw_w_h(out, out);
1236*fb1b10abSAndroid Build Coastguard Worker return out;
1237*fb1b10abSAndroid Build Coastguard Worker }
1238*fb1b10abSAndroid Build Coastguard Worker
1239*fb1b10abSAndroid Build Coastguard Worker /*
1240*fb1b10abSAndroid Build Coastguard Worker * =============================================================================
1241*fb1b10abSAndroid Build Coastguard Worker * Description : The low half of the vector elements are expanded and
1242*fb1b10abSAndroid Build Coastguard Worker * added after being doubled.
1243*fb1b10abSAndroid Build Coastguard Worker * Arguments : Inputs - in_h, in_l
1244*fb1b10abSAndroid Build Coastguard Worker * Output - out
1245*fb1b10abSAndroid Build Coastguard Worker * Details : The in_h vector and the in_l vector are added after the
1246*fb1b10abSAndroid Build Coastguard Worker * lower half of the two-fold sign extension (signed byte
1247*fb1b10abSAndroid Build Coastguard Worker * to signed halfword) and stored to the out vector.
1248*fb1b10abSAndroid Build Coastguard Worker * Example : See out = __lasx_xvaddwl_w_h(in_h, in_l)
1249*fb1b10abSAndroid Build Coastguard Worker * =============================================================================
1250*fb1b10abSAndroid Build Coastguard Worker */
__lasx_xvaddwl_h_b(__m256i in_h,__m256i in_l)1251*fb1b10abSAndroid Build Coastguard Worker static inline __m256i __lasx_xvaddwl_h_b(__m256i in_h, __m256i in_l) {
1252*fb1b10abSAndroid Build Coastguard Worker __m256i out;
1253*fb1b10abSAndroid Build Coastguard Worker
1254*fb1b10abSAndroid Build Coastguard Worker out = __lasx_xvilvl_b(in_h, in_l);
1255*fb1b10abSAndroid Build Coastguard Worker out = __lasx_xvhaddw_h_b(out, out);
1256*fb1b10abSAndroid Build Coastguard Worker return out;
1257*fb1b10abSAndroid Build Coastguard Worker }
1258*fb1b10abSAndroid Build Coastguard Worker
1259*fb1b10abSAndroid Build Coastguard Worker /*
1260*fb1b10abSAndroid Build Coastguard Worker * =============================================================================
1261*fb1b10abSAndroid Build Coastguard Worker * Description : The low half of the vector elements are expanded and
1262*fb1b10abSAndroid Build Coastguard Worker * added after being doubled.
1263*fb1b10abSAndroid Build Coastguard Worker * Arguments : Inputs - in_h, in_l
1264*fb1b10abSAndroid Build Coastguard Worker * Output - out
1265*fb1b10abSAndroid Build Coastguard Worker * Details : The in_h vector and the in_l vector are added after the
1266*fb1b10abSAndroid Build Coastguard Worker * lower half of the two-fold sign extension (signed halfword
1267*fb1b10abSAndroid Build Coastguard Worker * to signed word) and stored to the out vector.
1268*fb1b10abSAndroid Build Coastguard Worker * Example : out = __lasx_xvaddwl_w_h(in_h, in_l)
1269*fb1b10abSAndroid Build Coastguard Worker * in_h : 3, 0,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1
1270*fb1b10abSAndroid Build Coastguard Worker * in_l : 2,-1,1,2, 1,0,0, 0, 1,0,1, 0, 1,0,0,1
1271*fb1b10abSAndroid Build Coastguard Worker * out : 5,-1,4,2, 1,0,2,-1
1272*fb1b10abSAndroid Build Coastguard Worker * =============================================================================
1273*fb1b10abSAndroid Build Coastguard Worker */
__lasx_xvaddwl_w_h(__m256i in_h,__m256i in_l)1274*fb1b10abSAndroid Build Coastguard Worker static inline __m256i __lasx_xvaddwl_w_h(__m256i in_h, __m256i in_l) {
1275*fb1b10abSAndroid Build Coastguard Worker __m256i out;
1276*fb1b10abSAndroid Build Coastguard Worker
1277*fb1b10abSAndroid Build Coastguard Worker out = __lasx_xvilvl_h(in_h, in_l);
1278*fb1b10abSAndroid Build Coastguard Worker out = __lasx_xvhaddw_w_h(out, out);
1279*fb1b10abSAndroid Build Coastguard Worker return out;
1280*fb1b10abSAndroid Build Coastguard Worker }
1281*fb1b10abSAndroid Build Coastguard Worker
1282*fb1b10abSAndroid Build Coastguard Worker /*
1283*fb1b10abSAndroid Build Coastguard Worker * =============================================================================
1284*fb1b10abSAndroid Build Coastguard Worker * Description : The low half of the vector elements are expanded and
1285*fb1b10abSAndroid Build Coastguard Worker * added after being doubled.
1286*fb1b10abSAndroid Build Coastguard Worker * Arguments : Inputs - in_h, in_l
1287*fb1b10abSAndroid Build Coastguard Worker * Output - out
1288*fb1b10abSAndroid Build Coastguard Worker * Details : The out vector and the out vector are added after the
1289*fb1b10abSAndroid Build Coastguard Worker * lower half of the two-fold zero extension (unsigned byte
1290*fb1b10abSAndroid Build Coastguard Worker * to unsigned halfword) and stored to the out vector.
1291*fb1b10abSAndroid Build Coastguard Worker * Example : See out = __lasx_xvaddwl_w_h(in_h, in_l)
1292*fb1b10abSAndroid Build Coastguard Worker * =============================================================================
1293*fb1b10abSAndroid Build Coastguard Worker */
__lasx_xvaddwl_h_bu(__m256i in_h,__m256i in_l)1294*fb1b10abSAndroid Build Coastguard Worker static inline __m256i __lasx_xvaddwl_h_bu(__m256i in_h, __m256i in_l) {
1295*fb1b10abSAndroid Build Coastguard Worker __m256i out;
1296*fb1b10abSAndroid Build Coastguard Worker
1297*fb1b10abSAndroid Build Coastguard Worker out = __lasx_xvilvl_b(in_h, in_l);
1298*fb1b10abSAndroid Build Coastguard Worker out = __lasx_xvhaddw_hu_bu(out, out);
1299*fb1b10abSAndroid Build Coastguard Worker return out;
1300*fb1b10abSAndroid Build Coastguard Worker }
1301*fb1b10abSAndroid Build Coastguard Worker
1302*fb1b10abSAndroid Build Coastguard Worker /*
1303*fb1b10abSAndroid Build Coastguard Worker * =============================================================================
1304*fb1b10abSAndroid Build Coastguard Worker * Description : The low half of the vector elements are expanded and
1305*fb1b10abSAndroid Build Coastguard Worker * added after being doubled.
1306*fb1b10abSAndroid Build Coastguard Worker * Arguments : Inputs - in_h, in_l
1307*fb1b10abSAndroid Build Coastguard Worker * Output - out
1308*fb1b10abSAndroid Build Coastguard Worker * Details : The in_l vector after double zero extension (unsigned byte to
1309*fb1b10abSAndroid Build Coastguard Worker * signed halfword),added to the in_h vector.
1310*fb1b10abSAndroid Build Coastguard Worker * Example : See out = __lasx_xvaddw_w_w_h(in_h, in_l)
1311*fb1b10abSAndroid Build Coastguard Worker * =============================================================================
1312*fb1b10abSAndroid Build Coastguard Worker */
__lasx_xvaddw_h_h_bu(__m256i in_h,__m256i in_l)1313*fb1b10abSAndroid Build Coastguard Worker static inline __m256i __lasx_xvaddw_h_h_bu(__m256i in_h, __m256i in_l) {
1314*fb1b10abSAndroid Build Coastguard Worker __m256i out;
1315*fb1b10abSAndroid Build Coastguard Worker
1316*fb1b10abSAndroid Build Coastguard Worker out = __lasx_xvsllwil_hu_bu(in_l, 0);
1317*fb1b10abSAndroid Build Coastguard Worker out = __lasx_xvadd_h(in_h, out);
1318*fb1b10abSAndroid Build Coastguard Worker return out;
1319*fb1b10abSAndroid Build Coastguard Worker }
1320*fb1b10abSAndroid Build Coastguard Worker
1321*fb1b10abSAndroid Build Coastguard Worker /*
1322*fb1b10abSAndroid Build Coastguard Worker * =============================================================================
1323*fb1b10abSAndroid Build Coastguard Worker * Description : The low half of the vector elements are expanded and
1324*fb1b10abSAndroid Build Coastguard Worker * added after being doubled.
1325*fb1b10abSAndroid Build Coastguard Worker * Arguments : Inputs - in_h, in_l
1326*fb1b10abSAndroid Build Coastguard Worker * Output - out
1327*fb1b10abSAndroid Build Coastguard Worker * Details : The in_l vector after double sign extension (signed halfword to
1328*fb1b10abSAndroid Build Coastguard Worker * signed word), added to the in_h vector.
1329*fb1b10abSAndroid Build Coastguard Worker * Example : out = __lasx_xvaddw_w_w_h(in_h, in_l)
1330*fb1b10abSAndroid Build Coastguard Worker * in_h : 0, 1,0,0, -1,0,0,1,
1331*fb1b10abSAndroid Build Coastguard Worker * in_l : 2,-1,1,2, 1,0,0,0, 0,0,1,0, 1,0,0,1,
1332*fb1b10abSAndroid Build Coastguard Worker * out : 2, 0,1,2, -1,0,1,1,
1333*fb1b10abSAndroid Build Coastguard Worker * =============================================================================
1334*fb1b10abSAndroid Build Coastguard Worker */
__lasx_xvaddw_w_w_h(__m256i in_h,__m256i in_l)1335*fb1b10abSAndroid Build Coastguard Worker static inline __m256i __lasx_xvaddw_w_w_h(__m256i in_h, __m256i in_l) {
1336*fb1b10abSAndroid Build Coastguard Worker __m256i out;
1337*fb1b10abSAndroid Build Coastguard Worker
1338*fb1b10abSAndroid Build Coastguard Worker out = __lasx_xvsllwil_w_h(in_l, 0);
1339*fb1b10abSAndroid Build Coastguard Worker out = __lasx_xvadd_w(in_h, out);
1340*fb1b10abSAndroid Build Coastguard Worker return out;
1341*fb1b10abSAndroid Build Coastguard Worker }
1342*fb1b10abSAndroid Build Coastguard Worker
1343*fb1b10abSAndroid Build Coastguard Worker /*
1344*fb1b10abSAndroid Build Coastguard Worker * =============================================================================
1345*fb1b10abSAndroid Build Coastguard Worker * Description : Multiplication and addition calculation after expansion
1346*fb1b10abSAndroid Build Coastguard Worker * of the lower half of the vector.
1347*fb1b10abSAndroid Build Coastguard Worker * Arguments : Inputs - in_c, in_h, in_l
1348*fb1b10abSAndroid Build Coastguard Worker * Output - out
1349*fb1b10abSAndroid Build Coastguard Worker * Details : The in_h vector and the in_l vector are multiplied after
1350*fb1b10abSAndroid Build Coastguard Worker * the lower half of the two-fold sign extension (signed halfword
1351*fb1b10abSAndroid Build Coastguard Worker * to signed word), and the result is added to the vector in_c,
1352*fb1b10abSAndroid Build Coastguard Worker * then stored to the out vector.
1353*fb1b10abSAndroid Build Coastguard Worker * Example : out = __lasx_xvmaddwl_w_h(in_c, in_h, in_l)
1354*fb1b10abSAndroid Build Coastguard Worker * in_c : 1,2,3,4, 5,6,7,8
1355*fb1b10abSAndroid Build Coastguard Worker * in_h : 1,2,3,4, 1,2,3,4, 5,6,7,8, 5,6,7,8
1356*fb1b10abSAndroid Build Coastguard Worker * in_l : 200, 300, 400, 500, 2000, 3000, 4000, 5000,
1357*fb1b10abSAndroid Build Coastguard Worker * -200,-300,-400,-500, -2000,-3000,-4000,-5000
1358*fb1b10abSAndroid Build Coastguard Worker * out : 201, 602,1203,2004, -995, -1794,-2793,-3992
1359*fb1b10abSAndroid Build Coastguard Worker * =============================================================================
1360*fb1b10abSAndroid Build Coastguard Worker */
__lasx_xvmaddwl_w_h(__m256i in_c,__m256i in_h,__m256i in_l)1361*fb1b10abSAndroid Build Coastguard Worker static inline __m256i __lasx_xvmaddwl_w_h(__m256i in_c, __m256i in_h,
1362*fb1b10abSAndroid Build Coastguard Worker __m256i in_l) {
1363*fb1b10abSAndroid Build Coastguard Worker __m256i tmp0, tmp1, out;
1364*fb1b10abSAndroid Build Coastguard Worker
1365*fb1b10abSAndroid Build Coastguard Worker tmp0 = __lasx_xvsllwil_w_h(in_h, 0);
1366*fb1b10abSAndroid Build Coastguard Worker tmp1 = __lasx_xvsllwil_w_h(in_l, 0);
1367*fb1b10abSAndroid Build Coastguard Worker tmp0 = __lasx_xvmul_w(tmp0, tmp1);
1368*fb1b10abSAndroid Build Coastguard Worker out = __lasx_xvadd_w(tmp0, in_c);
1369*fb1b10abSAndroid Build Coastguard Worker return out;
1370*fb1b10abSAndroid Build Coastguard Worker }
1371*fb1b10abSAndroid Build Coastguard Worker
1372*fb1b10abSAndroid Build Coastguard Worker /*
1373*fb1b10abSAndroid Build Coastguard Worker * =============================================================================
1374*fb1b10abSAndroid Build Coastguard Worker * Description : Multiplication and addition calculation after expansion
1375*fb1b10abSAndroid Build Coastguard Worker * of the higher half of the vector.
1376*fb1b10abSAndroid Build Coastguard Worker * Arguments : Inputs - in_c, in_h, in_l
1377*fb1b10abSAndroid Build Coastguard Worker * Output - out
1378*fb1b10abSAndroid Build Coastguard Worker * Details : The in_h vector and the in_l vector are multiplied after
1379*fb1b10abSAndroid Build Coastguard Worker * the higher half of the two-fold sign extension (signed
1380*fb1b10abSAndroid Build Coastguard Worker * halfword to signed word), and the result is added to
1381*fb1b10abSAndroid Build Coastguard Worker * the vector in_c, then stored to the out vector.
1382*fb1b10abSAndroid Build Coastguard Worker * Example : See out = __lasx_xvmaddwl_w_h(in_c, in_h, in_l)
1383*fb1b10abSAndroid Build Coastguard Worker * =============================================================================
1384*fb1b10abSAndroid Build Coastguard Worker */
__lasx_xvmaddwh_w_h(__m256i in_c,__m256i in_h,__m256i in_l)1385*fb1b10abSAndroid Build Coastguard Worker static inline __m256i __lasx_xvmaddwh_w_h(__m256i in_c, __m256i in_h,
1386*fb1b10abSAndroid Build Coastguard Worker __m256i in_l) {
1387*fb1b10abSAndroid Build Coastguard Worker __m256i tmp0, tmp1, out;
1388*fb1b10abSAndroid Build Coastguard Worker
1389*fb1b10abSAndroid Build Coastguard Worker tmp0 = __lasx_xvilvh_h(in_h, in_h);
1390*fb1b10abSAndroid Build Coastguard Worker tmp1 = __lasx_xvilvh_h(in_l, in_l);
1391*fb1b10abSAndroid Build Coastguard Worker tmp0 = __lasx_xvmulwev_w_h(tmp0, tmp1);
1392*fb1b10abSAndroid Build Coastguard Worker out = __lasx_xvadd_w(tmp0, in_c);
1393*fb1b10abSAndroid Build Coastguard Worker return out;
1394*fb1b10abSAndroid Build Coastguard Worker }
1395*fb1b10abSAndroid Build Coastguard Worker
1396*fb1b10abSAndroid Build Coastguard Worker /*
1397*fb1b10abSAndroid Build Coastguard Worker * =============================================================================
1398*fb1b10abSAndroid Build Coastguard Worker * Description : Multiplication calculation after expansion of the lower
1399*fb1b10abSAndroid Build Coastguard Worker * half of the vector.
1400*fb1b10abSAndroid Build Coastguard Worker * Arguments : Inputs - in_h, in_l
1401*fb1b10abSAndroid Build Coastguard Worker * Output - out
1402*fb1b10abSAndroid Build Coastguard Worker * Details : The in_h vector and the in_l vector are multiplied after
1403*fb1b10abSAndroid Build Coastguard Worker * the lower half of the two-fold sign extension (signed
1404*fb1b10abSAndroid Build Coastguard Worker * halfword to signed word), then stored to the out vector.
1405*fb1b10abSAndroid Build Coastguard Worker * Example : out = __lasx_xvmulwl_w_h(in_h, in_l)
1406*fb1b10abSAndroid Build Coastguard Worker * in_h : 3,-1,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1
1407*fb1b10abSAndroid Build Coastguard Worker * in_l : 2,-1,1,2, 1,0,0, 0, 0,0,1, 0, 1,0,0,1
1408*fb1b10abSAndroid Build Coastguard Worker * out : 6,1,3,0, 0,0,1,0
1409*fb1b10abSAndroid Build Coastguard Worker * =============================================================================
1410*fb1b10abSAndroid Build Coastguard Worker */
__lasx_xvmulwl_w_h(__m256i in_h,__m256i in_l)1411*fb1b10abSAndroid Build Coastguard Worker static inline __m256i __lasx_xvmulwl_w_h(__m256i in_h, __m256i in_l) {
1412*fb1b10abSAndroid Build Coastguard Worker __m256i tmp0, tmp1, out;
1413*fb1b10abSAndroid Build Coastguard Worker
1414*fb1b10abSAndroid Build Coastguard Worker tmp0 = __lasx_xvsllwil_w_h(in_h, 0);
1415*fb1b10abSAndroid Build Coastguard Worker tmp1 = __lasx_xvsllwil_w_h(in_l, 0);
1416*fb1b10abSAndroid Build Coastguard Worker out = __lasx_xvmul_w(tmp0, tmp1);
1417*fb1b10abSAndroid Build Coastguard Worker return out;
1418*fb1b10abSAndroid Build Coastguard Worker }
1419*fb1b10abSAndroid Build Coastguard Worker
1420*fb1b10abSAndroid Build Coastguard Worker /*
1421*fb1b10abSAndroid Build Coastguard Worker * =============================================================================
1422*fb1b10abSAndroid Build Coastguard Worker * Description : Multiplication calculation after expansion of the lower
1423*fb1b10abSAndroid Build Coastguard Worker * half of the vector.
1424*fb1b10abSAndroid Build Coastguard Worker * Arguments : Inputs - in_h, in_l
1425*fb1b10abSAndroid Build Coastguard Worker * Output - out
1426*fb1b10abSAndroid Build Coastguard Worker * Details : The in_h vector and the in_l vector are multiplied after
1427*fb1b10abSAndroid Build Coastguard Worker * the lower half of the two-fold sign extension (signed
1428*fb1b10abSAndroid Build Coastguard Worker * halfword to signed word), then stored to the out vector.
1429*fb1b10abSAndroid Build Coastguard Worker * Example : out = __lasx_xvmulwh_w_h(in_h, in_l)
1430*fb1b10abSAndroid Build Coastguard Worker * in_h : 3,-1,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1
1431*fb1b10abSAndroid Build Coastguard Worker * in_l : 2,-1,1,2, 1,0,0, 0, 0,0,1, 0, 1,0,0,1
1432*fb1b10abSAndroid Build Coastguard Worker * out : 0,0,0,0, 0,0,0,1
1433*fb1b10abSAndroid Build Coastguard Worker * =============================================================================
1434*fb1b10abSAndroid Build Coastguard Worker */
__lasx_xvmulwh_w_h(__m256i in_h,__m256i in_l)1435*fb1b10abSAndroid Build Coastguard Worker static inline __m256i __lasx_xvmulwh_w_h(__m256i in_h, __m256i in_l) {
1436*fb1b10abSAndroid Build Coastguard Worker __m256i tmp0, tmp1, out;
1437*fb1b10abSAndroid Build Coastguard Worker
1438*fb1b10abSAndroid Build Coastguard Worker tmp0 = __lasx_xvilvh_h(in_h, in_h);
1439*fb1b10abSAndroid Build Coastguard Worker tmp1 = __lasx_xvilvh_h(in_l, in_l);
1440*fb1b10abSAndroid Build Coastguard Worker out = __lasx_xvmulwev_w_h(tmp0, tmp1);
1441*fb1b10abSAndroid Build Coastguard Worker return out;
1442*fb1b10abSAndroid Build Coastguard Worker }
1443*fb1b10abSAndroid Build Coastguard Worker
1444*fb1b10abSAndroid Build Coastguard Worker /*
1445*fb1b10abSAndroid Build Coastguard Worker * =============================================================================
1446*fb1b10abSAndroid Build Coastguard Worker * Description : The low half of the vector elements are added to the high half
1447*fb1b10abSAndroid Build Coastguard Worker * after being doubled, then saturated.
1448*fb1b10abSAndroid Build Coastguard Worker * Arguments : Inputs - in_h, in_l
1449*fb1b10abSAndroid Build Coastguard Worker * Output - out
1450*fb1b10abSAndroid Build Coastguard Worker * Details : The in_h vector adds the in_l vector after the lower half of
1451*fb1b10abSAndroid Build Coastguard Worker * the two-fold zero extension (unsigned byte to unsigned
1452*fb1b10abSAndroid Build Coastguard Worker * halfword) and then saturated. The results are stored to the out
1453*fb1b10abSAndroid Build Coastguard Worker * vector.
1454*fb1b10abSAndroid Build Coastguard Worker * Example : out = __lasx_xvsaddw_hu_hu_bu(in_h, in_l)
1455*fb1b10abSAndroid Build Coastguard Worker * in_h : 2,65532,1,2, 1,0,0,0, 0,0,1,0, 1,0,0,1
1456*fb1b10abSAndroid Build Coastguard Worker * in_l : 3,6,3,0, 0,0,0,1, 0,0,1,1, 0,0,0,1, 3,18,3,0, 0,0,0,1, 0,0,1,1,
1457*fb1b10abSAndroid Build Coastguard Worker * 0,0,0,1
1458*fb1b10abSAndroid Build Coastguard Worker * out : 5,65535,4,2, 1,0,0,1, 3,18,4,0, 1,0,0,2,
1459*fb1b10abSAndroid Build Coastguard Worker * =============================================================================
1460*fb1b10abSAndroid Build Coastguard Worker */
__lasx_xvsaddw_hu_hu_bu(__m256i in_h,__m256i in_l)1461*fb1b10abSAndroid Build Coastguard Worker static inline __m256i __lasx_xvsaddw_hu_hu_bu(__m256i in_h, __m256i in_l) {
1462*fb1b10abSAndroid Build Coastguard Worker __m256i tmp1, out;
1463*fb1b10abSAndroid Build Coastguard Worker __m256i zero = { 0 };
1464*fb1b10abSAndroid Build Coastguard Worker
1465*fb1b10abSAndroid Build Coastguard Worker tmp1 = __lasx_xvilvl_b(zero, in_l);
1466*fb1b10abSAndroid Build Coastguard Worker out = __lasx_xvsadd_hu(in_h, tmp1);
1467*fb1b10abSAndroid Build Coastguard Worker return out;
1468*fb1b10abSAndroid Build Coastguard Worker }
1469*fb1b10abSAndroid Build Coastguard Worker
1470*fb1b10abSAndroid Build Coastguard Worker /*
1471*fb1b10abSAndroid Build Coastguard Worker * =============================================================================
1472*fb1b10abSAndroid Build Coastguard Worker * Description : Clip all halfword elements of input vector between min & max
1473*fb1b10abSAndroid Build Coastguard Worker * out = ((in) < (min)) ? (min) : (((in) > (max)) ? (max) : (in))
1474*fb1b10abSAndroid Build Coastguard Worker * Arguments : Inputs - in (input vector)
1475*fb1b10abSAndroid Build Coastguard Worker * - min (min threshold)
1476*fb1b10abSAndroid Build Coastguard Worker * - max (max threshold)
1477*fb1b10abSAndroid Build Coastguard Worker * Outputs - in (output vector with clipped elements)
1478*fb1b10abSAndroid Build Coastguard Worker * Return Type - signed halfword
1479*fb1b10abSAndroid Build Coastguard Worker * Example : out = __lasx_xvclip_h(in, min, max)
1480*fb1b10abSAndroid Build Coastguard Worker * in : -8,2,280,249, -8,255,280,249, 4,4,4,4, 5,5,5,5
1481*fb1b10abSAndroid Build Coastguard Worker * min : 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1
1482*fb1b10abSAndroid Build Coastguard Worker * max : 9,9,9,9, 9,9,9,9, 9,9,9,9, 9,9,9,9
1483*fb1b10abSAndroid Build Coastguard Worker * out : 1,2,9,9, 1,9,9,9, 4,4,4,4, 5,5,5,5
1484*fb1b10abSAndroid Build Coastguard Worker * =============================================================================
1485*fb1b10abSAndroid Build Coastguard Worker */
__lasx_xvclip_h(__m256i in,__m256i min,__m256i max)1486*fb1b10abSAndroid Build Coastguard Worker static inline __m256i __lasx_xvclip_h(__m256i in, __m256i min, __m256i max) {
1487*fb1b10abSAndroid Build Coastguard Worker __m256i out;
1488*fb1b10abSAndroid Build Coastguard Worker
1489*fb1b10abSAndroid Build Coastguard Worker out = __lasx_xvmax_h(min, in);
1490*fb1b10abSAndroid Build Coastguard Worker out = __lasx_xvmin_h(max, out);
1491*fb1b10abSAndroid Build Coastguard Worker return out;
1492*fb1b10abSAndroid Build Coastguard Worker }
1493*fb1b10abSAndroid Build Coastguard Worker
1494*fb1b10abSAndroid Build Coastguard Worker /*
1495*fb1b10abSAndroid Build Coastguard Worker * =============================================================================
1496*fb1b10abSAndroid Build Coastguard Worker * Description : Clip all signed halfword elements of input vector
1497*fb1b10abSAndroid Build Coastguard Worker * between 0 & 255
1498*fb1b10abSAndroid Build Coastguard Worker * Arguments : Inputs - in (input vector)
1499*fb1b10abSAndroid Build Coastguard Worker * Outputs - out (output vector with clipped elements)
1500*fb1b10abSAndroid Build Coastguard Worker * Return Type - signed halfword
1501*fb1b10abSAndroid Build Coastguard Worker * Example : See out = __lasx_xvclip255_w(in)
1502*fb1b10abSAndroid Build Coastguard Worker * =============================================================================
1503*fb1b10abSAndroid Build Coastguard Worker */
__lasx_xvclip255_h(__m256i in)1504*fb1b10abSAndroid Build Coastguard Worker static inline __m256i __lasx_xvclip255_h(__m256i in) {
1505*fb1b10abSAndroid Build Coastguard Worker __m256i out;
1506*fb1b10abSAndroid Build Coastguard Worker
1507*fb1b10abSAndroid Build Coastguard Worker out = __lasx_xvmaxi_h(in, 0);
1508*fb1b10abSAndroid Build Coastguard Worker out = __lasx_xvsat_hu(out, 7);
1509*fb1b10abSAndroid Build Coastguard Worker return out;
1510*fb1b10abSAndroid Build Coastguard Worker }
1511*fb1b10abSAndroid Build Coastguard Worker
1512*fb1b10abSAndroid Build Coastguard Worker /*
1513*fb1b10abSAndroid Build Coastguard Worker * =============================================================================
1514*fb1b10abSAndroid Build Coastguard Worker * Description : Clip all signed word elements of input vector
1515*fb1b10abSAndroid Build Coastguard Worker * between 0 & 255
1516*fb1b10abSAndroid Build Coastguard Worker * Arguments : Inputs - in (input vector)
1517*fb1b10abSAndroid Build Coastguard Worker * Output - out (output vector with clipped elements)
1518*fb1b10abSAndroid Build Coastguard Worker * Return Type - signed word
1519*fb1b10abSAndroid Build Coastguard Worker * Example : out = __lasx_xvclip255_w(in)
1520*fb1b10abSAndroid Build Coastguard Worker * in : -8,255,280,249, -8,255,280,249
1521*fb1b10abSAndroid Build Coastguard Worker * out : 0,255,255,249, 0,255,255,249
1522*fb1b10abSAndroid Build Coastguard Worker * =============================================================================
1523*fb1b10abSAndroid Build Coastguard Worker */
__lasx_xvclip255_w(__m256i in)1524*fb1b10abSAndroid Build Coastguard Worker static inline __m256i __lasx_xvclip255_w(__m256i in) {
1525*fb1b10abSAndroid Build Coastguard Worker __m256i out;
1526*fb1b10abSAndroid Build Coastguard Worker
1527*fb1b10abSAndroid Build Coastguard Worker out = __lasx_xvmaxi_w(in, 0);
1528*fb1b10abSAndroid Build Coastguard Worker out = __lasx_xvsat_wu(out, 7);
1529*fb1b10abSAndroid Build Coastguard Worker return out;
1530*fb1b10abSAndroid Build Coastguard Worker }
1531*fb1b10abSAndroid Build Coastguard Worker
1532*fb1b10abSAndroid Build Coastguard Worker /*
1533*fb1b10abSAndroid Build Coastguard Worker * =============================================================================
1534*fb1b10abSAndroid Build Coastguard Worker * Description : Indexed halfword element values are replicated to all
1535*fb1b10abSAndroid Build Coastguard Worker * elements in output vector. If 'idx < 8' use xvsplati_l_*,
1536*fb1b10abSAndroid Build Coastguard Worker * if 'idx >= 8' use xvsplati_h_*.
1537*fb1b10abSAndroid Build Coastguard Worker * Arguments : Inputs - in, idx
1538*fb1b10abSAndroid Build Coastguard Worker * Output - out
1539*fb1b10abSAndroid Build Coastguard Worker * Details : Idx element value from in vector is replicated to all
1540*fb1b10abSAndroid Build Coastguard Worker * elements in out vector.
1541*fb1b10abSAndroid Build Coastguard Worker * Valid index range for halfword operation is 0-7
1542*fb1b10abSAndroid Build Coastguard Worker * Example : out = __lasx_xvsplati_l_h(in, idx)
1543*fb1b10abSAndroid Build Coastguard Worker * in : 20,10,11,12, 13,14,15,16, 0,0,2,0, 0,0,0,0
1544*fb1b10abSAndroid Build Coastguard Worker * idx : 0x02
1545*fb1b10abSAndroid Build Coastguard Worker * out : 11,11,11,11, 11,11,11,11, 11,11,11,11, 11,11,11,11
1546*fb1b10abSAndroid Build Coastguard Worker * =============================================================================
1547*fb1b10abSAndroid Build Coastguard Worker */
__lasx_xvsplati_l_h(__m256i in,int idx)1548*fb1b10abSAndroid Build Coastguard Worker static inline __m256i __lasx_xvsplati_l_h(__m256i in, int idx) {
1549*fb1b10abSAndroid Build Coastguard Worker __m256i out;
1550*fb1b10abSAndroid Build Coastguard Worker
1551*fb1b10abSAndroid Build Coastguard Worker out = __lasx_xvpermi_q(in, in, 0x02);
1552*fb1b10abSAndroid Build Coastguard Worker out = __lasx_xvreplve_h(out, idx);
1553*fb1b10abSAndroid Build Coastguard Worker return out;
1554*fb1b10abSAndroid Build Coastguard Worker }
1555*fb1b10abSAndroid Build Coastguard Worker
1556*fb1b10abSAndroid Build Coastguard Worker /*
1557*fb1b10abSAndroid Build Coastguard Worker * =============================================================================
1558*fb1b10abSAndroid Build Coastguard Worker * Description : Indexed halfword element values are replicated to all
1559*fb1b10abSAndroid Build Coastguard Worker * elements in output vector. If 'idx < 8' use xvsplati_l_*,
1560*fb1b10abSAndroid Build Coastguard Worker * if 'idx >= 8' use xvsplati_h_*.
1561*fb1b10abSAndroid Build Coastguard Worker * Arguments : Inputs - in, idx
1562*fb1b10abSAndroid Build Coastguard Worker * Output - out
1563*fb1b10abSAndroid Build Coastguard Worker * Details : Idx element value from in vector is replicated to all
1564*fb1b10abSAndroid Build Coastguard Worker * elements in out vector.
1565*fb1b10abSAndroid Build Coastguard Worker * Valid index range for halfword operation is 0-7
1566*fb1b10abSAndroid Build Coastguard Worker * Example : out = __lasx_xvsplati_h_h(in, idx)
1567*fb1b10abSAndroid Build Coastguard Worker * in : 20,10,11,12, 13,14,15,16, 0,2,0,0, 0,0,0,0
1568*fb1b10abSAndroid Build Coastguard Worker * idx : 0x09
1569*fb1b10abSAndroid Build Coastguard Worker * out : 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2
1570*fb1b10abSAndroid Build Coastguard Worker * =============================================================================
1571*fb1b10abSAndroid Build Coastguard Worker */
__lasx_xvsplati_h_h(__m256i in,int idx)1572*fb1b10abSAndroid Build Coastguard Worker static inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx) {
1573*fb1b10abSAndroid Build Coastguard Worker __m256i out;
1574*fb1b10abSAndroid Build Coastguard Worker
1575*fb1b10abSAndroid Build Coastguard Worker out = __lasx_xvpermi_q(in, in, 0x13);
1576*fb1b10abSAndroid Build Coastguard Worker out = __lasx_xvreplve_h(out, idx);
1577*fb1b10abSAndroid Build Coastguard Worker return out;
1578*fb1b10abSAndroid Build Coastguard Worker }
1579*fb1b10abSAndroid Build Coastguard Worker
/*
 * =============================================================================
 * Description : Transpose a 4x4 block of double-word elements
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3 (one row per vector)
 *               Outputs - _out0, _out1, _out2, _out3 (one column per vector)
 * Details     : All four inputs are read before the first output is written.
 * Example     : LASX_TRANSPOSE4x4_D
 *        _in0 : 1,2,3,4
 *        _in1 : 1,2,3,4
 *        _in2 : 1,2,3,4
 *        _in3 : 1,2,3,4
 *
 *       _out0 : 1,1,1,1
 *       _out1 : 2,2,2,2
 *       _out2 : 3,3,3,3
 *       _out3 : 4,4,4,4
 * =============================================================================
 */
#define LASX_TRANSPOSE4x4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, \
                            _out3)                                       \
  {                                                                      \
    /* Gather the even / odd columns of rows 0-1 and of rows 2-3. */     \
    __m256i _tmp0 = __lasx_xvilvl_d(_in1, _in0);                         \
    __m256i _tmp1 = __lasx_xvilvh_d(_in1, _in0);                         \
    __m256i _tmp2 = __lasx_xvilvl_d(_in3, _in2);                         \
    __m256i _tmp3 = __lasx_xvilvh_d(_in3, _in2);                         \
                                                                         \
    /* Join the row pairs: 0x20 takes low halves, 0x31 high halves. */   \
    _out0 = __lasx_xvpermi_q(_tmp2, _tmp0, 0x20);                        \
    _out2 = __lasx_xvpermi_q(_tmp2, _tmp0, 0x31);                        \
    _out1 = __lasx_xvpermi_q(_tmp3, _tmp1, 0x20);                        \
    _out3 = __lasx_xvpermi_q(_tmp3, _tmp1, 0x31);                        \
  }
1610*fb1b10abSAndroid Build Coastguard Worker
/*
 * =============================================================================
 * Description : Transpose an 8x8 block of word elements
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
 *                         (one row per vector)
 *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
 *                         _out7 (one column per vector)
 * Details     : Every input is read before the first output is written.
 * Example     : LASX_TRANSPOSE8x8_W
 *        _in0 : 1,2,3,4,5,6,7,8
 *        _in1 : 2,2,3,4,5,6,7,8
 *        _in2 : 3,2,3,4,5,6,7,8
 *        _in3 : 4,2,3,4,5,6,7,8
 *        _in4 : 5,2,3,4,5,6,7,8
 *        _in5 : 6,2,3,4,5,6,7,8
 *        _in6 : 7,2,3,4,5,6,7,8
 *        _in7 : 8,2,3,4,5,6,7,8
 *
 *       _out0 : 1,2,3,4,5,6,7,8
 *       _out1 : 2,2,2,2,2,2,2,2
 *       _out2 : 3,3,3,3,3,3,3,3
 *       _out3 : 4,4,4,4,4,4,4,4
 *       _out4 : 5,5,5,5,5,5,5,5
 *       _out5 : 6,6,6,6,6,6,6,6
 *       _out6 : 7,7,7,7,7,7,7,7
 *       _out7 : 8,8,8,8,8,8,8,8
 * =============================================================================
 */
#define LASX_TRANSPOSE8x8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                            _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                            _out7)                                           \
  {                                                                          \
    /* Rows 0-3: _tmp0_m.._tmp3_m hold columns 0|4, 1|5, 2|6, 3|7. */        \
    __m256i _s0_m = __lasx_xvilvl_w(_in2, _in0);                             \
    __m256i _s1_m = __lasx_xvilvl_w(_in3, _in1);                             \
    __m256i _tmp0_m = __lasx_xvilvl_w(_s1_m, _s0_m);                         \
    __m256i _tmp1_m = __lasx_xvilvh_w(_s1_m, _s0_m);                         \
    __m256i _tmp2_m, _tmp3_m, _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m;            \
                                                                             \
    _s0_m = __lasx_xvilvh_w(_in2, _in0);                                     \
    _s1_m = __lasx_xvilvh_w(_in3, _in1);                                     \
    _tmp2_m = __lasx_xvilvl_w(_s1_m, _s0_m);                                 \
    _tmp3_m = __lasx_xvilvh_w(_s1_m, _s0_m);                                 \
                                                                             \
    /* Rows 4-7: same layout in _tmp4_m.._tmp7_m. */                         \
    _s0_m = __lasx_xvilvl_w(_in6, _in4);                                     \
    _s1_m = __lasx_xvilvl_w(_in7, _in5);                                     \
    _tmp4_m = __lasx_xvilvl_w(_s1_m, _s0_m);                                 \
    _tmp5_m = __lasx_xvilvh_w(_s1_m, _s0_m);                                 \
                                                                             \
    _s0_m = __lasx_xvilvh_w(_in6, _in4);                                     \
    _s1_m = __lasx_xvilvh_w(_in7, _in5);                                     \
    _tmp6_m = __lasx_xvilvl_w(_s1_m, _s0_m);                                 \
    _tmp7_m = __lasx_xvilvh_w(_s1_m, _s0_m);                                 \
                                                                             \
    /* Low halves give columns 0-3, high halves give columns 4-7. */         \
    _out0 = __lasx_xvpermi_q(_tmp4_m, _tmp0_m, 0x20);                        \
    _out1 = __lasx_xvpermi_q(_tmp5_m, _tmp1_m, 0x20);                        \
    _out2 = __lasx_xvpermi_q(_tmp6_m, _tmp2_m, 0x20);                        \
    _out3 = __lasx_xvpermi_q(_tmp7_m, _tmp3_m, 0x20);                        \
    _out4 = __lasx_xvpermi_q(_tmp4_m, _tmp0_m, 0x31);                        \
    _out5 = __lasx_xvpermi_q(_tmp5_m, _tmp1_m, 0x31);                        \
    _out6 = __lasx_xvpermi_q(_tmp6_m, _tmp2_m, 0x31);                        \
    _out7 = __lasx_xvpermi_q(_tmp7_m, _tmp3_m, 0x31);                        \
  }
1670*fb1b10abSAndroid Build Coastguard Worker
/*
 * =============================================================================
 * Description : Transpose input 16x8 byte block
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,
 *                         _in8, _in9, _in10, _in11, _in12, _in13, _in14, _in15
 *                         (input 16x8 byte block)
 *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
 *                         _out7 (output 8x16 byte block)
 * Details     : The rows of the matrix become columns, and the columns become
 *               rows. The outputs double as scratch storage part-way through,
 *               but only after every input has been read.
 * Example     : See LASX_TRANSPOSE16x8_H
 * =============================================================================
 */
#define LASX_TRANSPOSE16x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                             _in8, _in9, _in10, _in11, _in12, _in13, _in14,   \
                             _in15, _out0, _out1, _out2, _out3, _out4, _out5, \
                             _out6, _out7)                                    \
  {                                                                           \
    /* Stage 1: byte-interleave rows n and n + 2 (low 8 bytes). */            \
    __m256i _tmp0_m = __lasx_xvilvl_b(_in2, _in0);                            \
    __m256i _tmp1_m = __lasx_xvilvl_b(_in3, _in1);                            \
    __m256i _tmp2_m = __lasx_xvilvl_b(_in6, _in4);                            \
    __m256i _tmp3_m = __lasx_xvilvl_b(_in7, _in5);                            \
    __m256i _tmp4_m = __lasx_xvilvl_b(_in10, _in8);                           \
    __m256i _tmp5_m = __lasx_xvilvl_b(_in11, _in9);                           \
    __m256i _tmp6_m = __lasx_xvilvl_b(_in14, _in12);                          \
    __m256i _tmp7_m = __lasx_xvilvl_b(_in15, _in13);                          \
                                                                              \
    /* Stage 2: 4-row groups; the outputs serve as scratch here. */           \
    _out0 = __lasx_xvilvl_b(_tmp1_m, _tmp0_m);                                \
    _out1 = __lasx_xvilvh_b(_tmp1_m, _tmp0_m);                                \
    _out2 = __lasx_xvilvl_b(_tmp3_m, _tmp2_m);                                \
    _out3 = __lasx_xvilvh_b(_tmp3_m, _tmp2_m);                                \
    _out4 = __lasx_xvilvl_b(_tmp5_m, _tmp4_m);                                \
    _out5 = __lasx_xvilvh_b(_tmp5_m, _tmp4_m);                                \
    _out6 = __lasx_xvilvl_b(_tmp7_m, _tmp6_m);                                \
    _out7 = __lasx_xvilvh_b(_tmp7_m, _tmp6_m);                                \
                                                                              \
    /* Stage 3: word-interleave into 8-row column pairs. */                   \
    _tmp0_m = __lasx_xvilvl_w(_out2, _out0);                                  \
    _tmp2_m = __lasx_xvilvh_w(_out2, _out0);                                  \
    _tmp4_m = __lasx_xvilvl_w(_out3, _out1);                                  \
    _tmp6_m = __lasx_xvilvh_w(_out3, _out1);                                  \
    _tmp1_m = __lasx_xvilvl_w(_out6, _out4);                                  \
    _tmp3_m = __lasx_xvilvh_w(_out6, _out4);                                  \
    _tmp5_m = __lasx_xvilvl_w(_out7, _out5);                                  \
    _tmp7_m = __lasx_xvilvh_w(_out7, _out5);                                  \
                                                                              \
    /* Stage 4: join rows 0-7 and rows 8-15 of each column. */                \
    _out0 = __lasx_xvilvl_d(_tmp1_m, _tmp0_m);                                \
    _out1 = __lasx_xvilvh_d(_tmp1_m, _tmp0_m);                                \
    _out2 = __lasx_xvilvl_d(_tmp3_m, _tmp2_m);                                \
    _out3 = __lasx_xvilvh_d(_tmp3_m, _tmp2_m);                                \
    _out4 = __lasx_xvilvl_d(_tmp5_m, _tmp4_m);                                \
    _out5 = __lasx_xvilvh_d(_tmp5_m, _tmp4_m);                                \
    _out6 = __lasx_xvilvl_d(_tmp7_m, _tmp6_m);                                \
    _out7 = __lasx_xvilvh_d(_tmp7_m, _tmp6_m);                                \
  }
1725*fb1b10abSAndroid Build Coastguard Worker
/*
 * =============================================================================
 * Description : Transpose input 16x8 halfword block
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,
 *                         _in8, _in9, _in10, _in11, _in12, _in13, _in14, _in15
 *                         (input 16x8 halfword block, one row per vector;
 *                         only halfwords 0-7 of each row are used)
 *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
 *                         _out7 (output 8x16 halfword block, one column per
 *                         vector)
 * Details     : The rows of the matrix become columns, and the columns become
 *               rows. Every input is read before the first output is written,
 *               so the outputs may alias the inputs (in-place use).
 * Example     : LASX_TRANSPOSE16x8_H
 *        _in0 : 1,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *        _in1 : 2,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *        _in2 : 3,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *        _in3 : 4,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *        _in4 : 5,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *        _in5 : 6,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *        _in6 : 7,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *        _in7 : 8,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *        _in8 : 9,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *        _in9 : 1,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *       _in10 : 0,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *       _in11 : 2,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *       _in12 : 3,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *       _in13 : 7,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *       _in14 : 5,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *       _in15 : 6,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *
 *       _out0 : 1,2,3,4,5,6,7,8,9,1,0,2,3,7,5,6
 *       _out1 : 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
 *       _out2 : 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3
 *       _out3 : 4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4
 *       _out4 : 5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5
 *       _out5 : 6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6
 *       _out6 : 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7
 *       _out7 : 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8
 * =============================================================================
 */
#define LASX_TRANSPOSE16x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                             _in8, _in9, _in10, _in11, _in12, _in13, _in14,   \
                             _in15, _out0, _out1, _out2, _out3, _out4, _out5, \
                             _out6, _out7)                                    \
  {                                                                           \
    __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m;                               \
    __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m;                               \
    __m256i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7;                           \
                                                                              \
    /* Columns 0-3: halfwords 0-3 of every row. */                            \
    _tmp0_m = __lasx_xvilvl_h(_in2, _in0);                                    \
    _tmp1_m = __lasx_xvilvl_h(_in3, _in1);                                    \
    _tmp2_m = __lasx_xvilvl_h(_in6, _in4);                                    \
    _tmp3_m = __lasx_xvilvl_h(_in7, _in5);                                    \
    _tmp4_m = __lasx_xvilvl_h(_in10, _in8);                                   \
    _tmp5_m = __lasx_xvilvl_h(_in11, _in9);                                   \
    _tmp6_m = __lasx_xvilvl_h(_in14, _in12);                                  \
    _tmp7_m = __lasx_xvilvl_h(_in15, _in13);                                  \
    _t0 = __lasx_xvilvl_h(_tmp1_m, _tmp0_m);                                  \
    _t1 = __lasx_xvilvh_h(_tmp1_m, _tmp0_m);                                  \
    _t2 = __lasx_xvilvl_h(_tmp3_m, _tmp2_m);                                  \
    _t3 = __lasx_xvilvh_h(_tmp3_m, _tmp2_m);                                  \
    _t4 = __lasx_xvilvl_h(_tmp5_m, _tmp4_m);                                  \
    _t5 = __lasx_xvilvh_h(_tmp5_m, _tmp4_m);                                  \
    _t6 = __lasx_xvilvl_h(_tmp7_m, _tmp6_m);                                  \
    _t7 = __lasx_xvilvh_h(_tmp7_m, _tmp6_m);                                  \
    _tmp0_m = __lasx_xvilvl_d(_t2, _t0);                                      \
    _tmp2_m = __lasx_xvilvh_d(_t2, _t0);                                      \
    _tmp4_m = __lasx_xvilvl_d(_t3, _t1);                                      \
    _tmp6_m = __lasx_xvilvh_d(_t3, _t1);                                      \
    _tmp1_m = __lasx_xvilvl_d(_t6, _t4);                                      \
    _tmp3_m = __lasx_xvilvh_d(_t6, _t4);                                      \
    _tmp5_m = __lasx_xvilvl_d(_t7, _t5);                                      \
    _tmp7_m = __lasx_xvilvh_d(_t7, _t5);                                      \
    /* Park columns 0-3 in scratch; the inputs are read again below. */       \
    _t0 = __lasx_xvpermi_q(_tmp1_m, _tmp0_m, 0x20);                           \
    _t1 = __lasx_xvpermi_q(_tmp3_m, _tmp2_m, 0x20);                           \
    _t2 = __lasx_xvpermi_q(_tmp5_m, _tmp4_m, 0x20);                           \
    _t3 = __lasx_xvpermi_q(_tmp7_m, _tmp6_m, 0x20);                           \
                                                                              \
    /* Columns 4-7: halfwords 4-7 of every row. */                            \
    _tmp0_m = __lasx_xvilvh_h(_in2, _in0);                                    \
    _tmp1_m = __lasx_xvilvh_h(_in3, _in1);                                    \
    _tmp2_m = __lasx_xvilvh_h(_in6, _in4);                                    \
    _tmp3_m = __lasx_xvilvh_h(_in7, _in5);                                    \
    _tmp4_m = __lasx_xvilvh_h(_in10, _in8);                                   \
    _tmp5_m = __lasx_xvilvh_h(_in11, _in9);                                   \
    _tmp6_m = __lasx_xvilvh_h(_in14, _in12);                                  \
    _tmp7_m = __lasx_xvilvh_h(_in15, _in13);                                  \
    /* Every input has been read: outputs may alias inputs from here. */      \
    _out0 = _t0;                                                              \
    _out1 = _t1;                                                              \
    _out2 = _t2;                                                              \
    _out3 = _t3;                                                              \
    _t0 = __lasx_xvilvl_h(_tmp1_m, _tmp0_m);                                  \
    _t1 = __lasx_xvilvh_h(_tmp1_m, _tmp0_m);                                  \
    _t2 = __lasx_xvilvl_h(_tmp3_m, _tmp2_m);                                  \
    _t3 = __lasx_xvilvh_h(_tmp3_m, _tmp2_m);                                  \
    _t4 = __lasx_xvilvl_h(_tmp5_m, _tmp4_m);                                  \
    _t5 = __lasx_xvilvh_h(_tmp5_m, _tmp4_m);                                  \
    _t6 = __lasx_xvilvl_h(_tmp7_m, _tmp6_m);                                  \
    _t7 = __lasx_xvilvh_h(_tmp7_m, _tmp6_m);                                  \
    _tmp0_m = __lasx_xvilvl_d(_t2, _t0);                                      \
    _tmp2_m = __lasx_xvilvh_d(_t2, _t0);                                      \
    _tmp4_m = __lasx_xvilvl_d(_t3, _t1);                                      \
    _tmp6_m = __lasx_xvilvh_d(_t3, _t1);                                      \
    _tmp1_m = __lasx_xvilvl_d(_t6, _t4);                                      \
    _tmp3_m = __lasx_xvilvh_d(_t6, _t4);                                      \
    _tmp5_m = __lasx_xvilvl_d(_t7, _t5);                                      \
    _tmp7_m = __lasx_xvilvh_d(_t7, _t5);                                      \
    _out4 = __lasx_xvpermi_q(_tmp1_m, _tmp0_m, 0x20);                         \
    _out5 = __lasx_xvpermi_q(_tmp3_m, _tmp2_m, 0x20);                         \
    _out6 = __lasx_xvpermi_q(_tmp5_m, _tmp4_m, 0x20);                         \
    _out7 = __lasx_xvpermi_q(_tmp7_m, _tmp6_m, 0x20);                         \
  }
1831*fb1b10abSAndroid Build Coastguard Worker
/*
 * =============================================================================
 * Description : Transpose 4x4 block with halfword elements in vectors
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3
 *               Outputs - _out0, _out1, _out2, _out3
 *               Return Type - signed halfword
 * Details     : The rows of the matrix become columns, and the columns become
 *               rows. _out0 and _out2 are written first and then read back to
 *               derive _out1 and _out3.
 * Example     : See LASX_TRANSPOSE8x8_H
 * =============================================================================
 */
#define LASX_TRANSPOSE4x4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, \
                            _out3)                                       \
  {                                                                      \
    /* Halfword-interleave columns 0-3 of rows 0/1 and rows 2/3. */      \
    __m256i _s0_m = __lasx_xvilvl_h(_in1, _in0);                         \
    __m256i _s1_m = __lasx_xvilvl_h(_in3, _in2);                         \
                                                                         \
    /* _out0 = columns 0|1, _out2 = columns 2|3 (64 bits each). */       \
    _out0 = __lasx_xvilvl_w(_s1_m, _s0_m);                               \
    _out2 = __lasx_xvilvh_w(_s1_m, _s0_m);                               \
    /* Duplicate the upper column into the low 64 bits. */               \
    _out1 = __lasx_xvilvh_d(_out0, _out0);                               \
    _out3 = __lasx_xvilvh_d(_out2, _out2);                               \
  }
1855*fb1b10abSAndroid Build Coastguard Worker
/*
 * =============================================================================
 * Description : Transpose input 8x8 byte block
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
 *                         (input 8x8 byte block)
 *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
 *                         _out7 (output 8x8 byte block)
 * Details     : Every input is read before the first output is written. The
 *               odd outputs are derived from the even ones by a byte shift.
 * Example     : See LASX_TRANSPOSE8x8_H
 * =============================================================================
 */
#define LASX_TRANSPOSE8x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                            _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                            _out7)                                           \
  {                                                                          \
    /* Two byte-interleave passes build 4-row groups of columns. */          \
    __m256i _tmp0_m = __lasx_xvilvl_b(_in2, _in0);                           \
    __m256i _tmp1_m = __lasx_xvilvl_b(_in3, _in1);                           \
    __m256i _tmp2_m = __lasx_xvilvl_b(_in6, _in4);                           \
    __m256i _tmp3_m = __lasx_xvilvl_b(_in7, _in5);                           \
    __m256i _tmp4_m = __lasx_xvilvl_b(_tmp1_m, _tmp0_m);                     \
    __m256i _tmp5_m = __lasx_xvilvh_b(_tmp1_m, _tmp0_m);                     \
    __m256i _tmp6_m = __lasx_xvilvl_b(_tmp3_m, _tmp2_m);                     \
    __m256i _tmp7_m = __lasx_xvilvh_b(_tmp3_m, _tmp2_m);                     \
                                                                             \
    /* Even outputs: column n in bytes 0-7, column n + 1 in bytes 8-15. */   \
    _out0 = __lasx_xvilvl_w(_tmp6_m, _tmp4_m);                               \
    _out2 = __lasx_xvilvh_w(_tmp6_m, _tmp4_m);                               \
    _out4 = __lasx_xvilvl_w(_tmp7_m, _tmp5_m);                               \
    _out6 = __lasx_xvilvh_w(_tmp7_m, _tmp5_m);                               \
    /* Odd outputs: shift the neighbour right by 8 bytes. */                 \
    _out1 = __lasx_xvbsrl_v(_out0, 8);                                       \
    _out3 = __lasx_xvbsrl_v(_out2, 8);                                       \
    _out5 = __lasx_xvbsrl_v(_out4, 8);                                       \
    _out7 = __lasx_xvbsrl_v(_out6, 8);                                       \
  }
1889*fb1b10abSAndroid Build Coastguard Worker
/*
 * =============================================================================
 * Description : Transpose 8x8 block with halfword elements in vectors.
 * Arguments   : Inputs  - _in0, _in1, ~
 *               Outputs - _out0, _out1, ~
 * Details     : The rows of the matrix become columns, and the columns become
 *               rows. Each 128-bit half of the vectors is transposed on its
 *               own, as the example shows. Every input is read before the
 *               first output is written.
 * Example     : LASX_TRANSPOSE8x8_H
 *        _in0 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
 *        _in1 : 8,2,3,4, 5,6,7,8, 8,2,3,4, 5,6,7,8
 *        _in2 : 8,2,3,4, 5,6,7,8, 8,2,3,4, 5,6,7,8
 *        _in3 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
 *        _in4 : 9,2,3,4, 5,6,7,8, 9,2,3,4, 5,6,7,8
 *        _in5 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
 *        _in6 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
 *        _in7 : 9,2,3,4, 5,6,7,8, 9,2,3,4, 5,6,7,8
 *
 *       _out0 : 1,8,8,1, 9,1,1,9, 1,8,8,1, 9,1,1,9
 *       _out1 : 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2
 *       _out2 : 3,3,3,3, 3,3,3,3, 3,3,3,3, 3,3,3,3
 *       _out3 : 4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4
 *       _out4 : 5,5,5,5, 5,5,5,5, 5,5,5,5, 5,5,5,5
 *       _out5 : 6,6,6,6, 6,6,6,6, 6,6,6,6, 6,6,6,6
 *       _out6 : 7,7,7,7, 7,7,7,7, 7,7,7,7, 7,7,7,7
 *       _out7 : 8,8,8,8, 8,8,8,8, 8,8,8,8, 8,8,8,8
 * =============================================================================
 */
#define LASX_TRANSPOSE8x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                            _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                            _out7)                                           \
  {                                                                          \
    /* Rows 4-7: _tmp0_m.._tmp3_m hold columns 0|1, 2|3, 4|5, 6|7. */        \
    __m256i _s0_m = __lasx_xvilvl_h(_in6, _in4);                             \
    __m256i _s1_m = __lasx_xvilvl_h(_in7, _in5);                             \
    __m256i _tmp0_m = __lasx_xvilvl_h(_s1_m, _s0_m);                         \
    __m256i _tmp1_m = __lasx_xvilvh_h(_s1_m, _s0_m);                         \
    __m256i _tmp2_m, _tmp3_m, _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m;            \
                                                                             \
    _s0_m = __lasx_xvilvh_h(_in6, _in4);                                     \
    _s1_m = __lasx_xvilvh_h(_in7, _in5);                                     \
    _tmp2_m = __lasx_xvilvl_h(_s1_m, _s0_m);                                 \
    _tmp3_m = __lasx_xvilvh_h(_s1_m, _s0_m);                                 \
                                                                             \
    /* Rows 0-3: same layout in _tmp4_m.._tmp7_m. */                         \
    _s0_m = __lasx_xvilvl_h(_in2, _in0);                                     \
    _s1_m = __lasx_xvilvl_h(_in3, _in1);                                     \
    _tmp4_m = __lasx_xvilvl_h(_s1_m, _s0_m);                                 \
    _tmp5_m = __lasx_xvilvh_h(_s1_m, _s0_m);                                 \
                                                                             \
    _s0_m = __lasx_xvilvh_h(_in2, _in0);                                     \
    _s1_m = __lasx_xvilvh_h(_in3, _in1);                                     \
    _tmp6_m = __lasx_xvilvl_h(_s1_m, _s0_m);                                 \
    _tmp7_m = __lasx_xvilvh_h(_s1_m, _s0_m);                                 \
                                                                             \
    /* Even outputs pick the first column of a pair, odd the second. */      \
    _out0 = __lasx_xvpickev_d(_tmp0_m, _tmp4_m);                             \
    _out2 = __lasx_xvpickev_d(_tmp1_m, _tmp5_m);                             \
    _out4 = __lasx_xvpickev_d(_tmp2_m, _tmp6_m);                             \
    _out6 = __lasx_xvpickev_d(_tmp3_m, _tmp7_m);                             \
    _out1 = __lasx_xvpickod_d(_tmp0_m, _tmp4_m);                             \
    _out3 = __lasx_xvpickod_d(_tmp1_m, _tmp5_m);                             \
    _out5 = __lasx_xvpickod_d(_tmp2_m, _tmp6_m);                             \
    _out7 = __lasx_xvpickod_d(_tmp3_m, _tmp7_m);                             \
  }
1952*fb1b10abSAndroid Build Coastguard Worker
1953*fb1b10abSAndroid Build Coastguard Worker /*
1954*fb1b10abSAndroid Build Coastguard Worker * =============================================================================
1955*fb1b10abSAndroid Build Coastguard Worker * Description : Butterfly of 4 input vectors
1956*fb1b10abSAndroid Build Coastguard Worker * Arguments : Inputs - _in0, _in1, _in2, _in3
1957*fb1b10abSAndroid Build Coastguard Worker * Outputs - _out0, _out1, _out2, _out3
1958*fb1b10abSAndroid Build Coastguard Worker * Details : Butterfly operation
1959*fb1b10abSAndroid Build Coastguard Worker * Example : LASX_BUTTERFLY_4
1960*fb1b10abSAndroid Build Coastguard Worker * _out0 = _in0 + _in3;
1961*fb1b10abSAndroid Build Coastguard Worker * _out1 = _in1 + _in2;
1962*fb1b10abSAndroid Build Coastguard Worker * _out2 = _in1 - _in2;
1963*fb1b10abSAndroid Build Coastguard Worker * _out3 = _in0 - _in3;
1964*fb1b10abSAndroid Build Coastguard Worker * =============================================================================
1965*fb1b10abSAndroid Build Coastguard Worker */
/*
 * 4-input butterfly on byte elements:
 *   _out0 = _in0 + _in3;  _out1 = _in1 + _in2;
 *   _out2 = _in1 - _in2;  _out3 = _in0 - _in3;
 * Every result is computed into a temporary before any output is written, so
 * an output argument may name the same variable as an input (in-place use).
 */
#define LASX_BUTTERFLY_4_B(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  {                                                                            \
    __m256i _sum0_m = __lasx_xvadd_b(_in0, _in3);                              \
    __m256i _sum1_m = __lasx_xvadd_b(_in1, _in2);                              \
    __m256i _dif1_m = __lasx_xvsub_b(_in1, _in2);                              \
    __m256i _dif0_m = __lasx_xvsub_b(_in0, _in3);                              \
                                                                               \
    _out0 = _sum0_m;                                                           \
    _out1 = _sum1_m;                                                           \
    _out2 = _dif1_m;                                                           \
    _out3 = _dif0_m;                                                           \
  }
/*
 * 4-input butterfly on halfword elements:
 *   _out0 = _in0 + _in3;  _out1 = _in1 + _in2;
 *   _out2 = _in1 - _in2;  _out3 = _in0 - _in3;
 * Every result is computed into a temporary before any output is written, so
 * an output argument may name the same variable as an input (in-place use).
 */
#define LASX_BUTTERFLY_4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  {                                                                            \
    __m256i _sum0_m = __lasx_xvadd_h(_in0, _in3);                              \
    __m256i _sum1_m = __lasx_xvadd_h(_in1, _in2);                              \
    __m256i _dif1_m = __lasx_xvsub_h(_in1, _in2);                              \
    __m256i _dif0_m = __lasx_xvsub_h(_in0, _in3);                              \
                                                                               \
    _out0 = _sum0_m;                                                           \
    _out1 = _sum1_m;                                                           \
    _out2 = _dif1_m;                                                           \
    _out3 = _dif0_m;                                                           \
  }
/*
 * 4-input butterfly on word elements:
 *   _out0 = _in0 + _in3;  _out1 = _in1 + _in2;
 *   _out2 = _in1 - _in2;  _out3 = _in0 - _in3;
 * Every result is computed into a temporary before any output is written, so
 * an output argument may name the same variable as an input (in-place use).
 */
#define LASX_BUTTERFLY_4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  {                                                                            \
    __m256i _sum0_m = __lasx_xvadd_w(_in0, _in3);                              \
    __m256i _sum1_m = __lasx_xvadd_w(_in1, _in2);                              \
    __m256i _dif1_m = __lasx_xvsub_w(_in1, _in2);                              \
    __m256i _dif0_m = __lasx_xvsub_w(_in0, _in3);                              \
                                                                               \
    _out0 = _sum0_m;                                                           \
    _out1 = _sum1_m;                                                           \
    _out2 = _dif1_m;                                                           \
    _out3 = _dif0_m;                                                           \
  }
/*
 * 4-input butterfly on doubleword elements:
 *   _out0 = _in0 + _in3;  _out1 = _in1 + _in2;
 *   _out2 = _in1 - _in2;  _out3 = _in0 - _in3;
 * Every result is computed into a temporary before any output is written, so
 * an output argument may name the same variable as an input (in-place use).
 */
#define LASX_BUTTERFLY_4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  {                                                                            \
    __m256i _sum0_m = __lasx_xvadd_d(_in0, _in3);                              \
    __m256i _sum1_m = __lasx_xvadd_d(_in1, _in2);                              \
    __m256i _dif1_m = __lasx_xvsub_d(_in1, _in2);                              \
    __m256i _dif0_m = __lasx_xvsub_d(_in0, _in3);                              \
                                                                               \
    _out0 = _sum0_m;                                                           \
    _out1 = _sum1_m;                                                           \
    _out2 = _dif1_m;                                                           \
    _out3 = _dif0_m;                                                           \
  }
1994*fb1b10abSAndroid Build Coastguard Worker
1995*fb1b10abSAndroid Build Coastguard Worker /*
1996*fb1b10abSAndroid Build Coastguard Worker * =============================================================================
1997*fb1b10abSAndroid Build Coastguard Worker * Description : Butterfly of 8 input vectors
1998*fb1b10abSAndroid Build Coastguard Worker * Arguments : Inputs - _in0, _in1, _in2, _in3, ~
1999*fb1b10abSAndroid Build Coastguard Worker * Outputs - _out0, _out1, _out2, _out3, ~
2000*fb1b10abSAndroid Build Coastguard Worker * Details : Butterfly operation
2001*fb1b10abSAndroid Build Coastguard Worker * Example : LASX_BUTTERFLY_8
2002*fb1b10abSAndroid Build Coastguard Worker * _out0 = _in0 + _in7;
2003*fb1b10abSAndroid Build Coastguard Worker * _out1 = _in1 + _in6;
2004*fb1b10abSAndroid Build Coastguard Worker * _out2 = _in2 + _in5;
2005*fb1b10abSAndroid Build Coastguard Worker * _out3 = _in3 + _in4;
2006*fb1b10abSAndroid Build Coastguard Worker * _out4 = _in3 - _in4;
2007*fb1b10abSAndroid Build Coastguard Worker * _out5 = _in2 - _in5;
2008*fb1b10abSAndroid Build Coastguard Worker * _out6 = _in1 - _in6;
2009*fb1b10abSAndroid Build Coastguard Worker * _out7 = _in0 - _in7;
2010*fb1b10abSAndroid Build Coastguard Worker * =============================================================================
2011*fb1b10abSAndroid Build Coastguard Worker */
/*
 * 8-input butterfly on byte elements:
 *   _out0 = _in0 + _in7;  _out1 = _in1 + _in6;
 *   _out2 = _in2 + _in5;  _out3 = _in3 + _in4;
 *   _out4 = _in3 - _in4;  _out5 = _in2 - _in5;
 *   _out6 = _in1 - _in6;  _out7 = _in0 - _in7;
 * Every result is computed into a temporary before any output is written, so
 * an output argument may name the same variable as an input (in-place use).
 */
#define LASX_BUTTERFLY_8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                           _out7)                                           \
  {                                                                         \
    __m256i _sum0_m = __lasx_xvadd_b(_in0, _in7);                           \
    __m256i _sum1_m = __lasx_xvadd_b(_in1, _in6);                           \
    __m256i _sum2_m = __lasx_xvadd_b(_in2, _in5);                           \
    __m256i _sum3_m = __lasx_xvadd_b(_in3, _in4);                           \
    __m256i _dif3_m = __lasx_xvsub_b(_in3, _in4);                           \
    __m256i _dif2_m = __lasx_xvsub_b(_in2, _in5);                           \
    __m256i _dif1_m = __lasx_xvsub_b(_in1, _in6);                           \
    __m256i _dif0_m = __lasx_xvsub_b(_in0, _in7);                           \
                                                                            \
    _out0 = _sum0_m;                                                        \
    _out1 = _sum1_m;                                                        \
    _out2 = _sum2_m;                                                        \
    _out3 = _sum3_m;                                                        \
    _out4 = _dif3_m;                                                        \
    _out5 = _dif2_m;                                                        \
    _out6 = _dif1_m;                                                        \
    _out7 = _dif0_m;                                                        \
  }
2025*fb1b10abSAndroid Build Coastguard Worker
/*
 * 8-input butterfly on halfword elements:
 *   _out0 = _in0 + _in7;  _out1 = _in1 + _in6;
 *   _out2 = _in2 + _in5;  _out3 = _in3 + _in4;
 *   _out4 = _in3 - _in4;  _out5 = _in2 - _in5;
 *   _out6 = _in1 - _in6;  _out7 = _in0 - _in7;
 * Every result is computed into a temporary before any output is written, so
 * an output argument may name the same variable as an input (in-place use).
 */
#define LASX_BUTTERFLY_8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                           _out7)                                           \
  {                                                                         \
    __m256i _sum0_m = __lasx_xvadd_h(_in0, _in7);                           \
    __m256i _sum1_m = __lasx_xvadd_h(_in1, _in6);                           \
    __m256i _sum2_m = __lasx_xvadd_h(_in2, _in5);                           \
    __m256i _sum3_m = __lasx_xvadd_h(_in3, _in4);                           \
    __m256i _dif3_m = __lasx_xvsub_h(_in3, _in4);                           \
    __m256i _dif2_m = __lasx_xvsub_h(_in2, _in5);                           \
    __m256i _dif1_m = __lasx_xvsub_h(_in1, _in6);                           \
    __m256i _dif0_m = __lasx_xvsub_h(_in0, _in7);                           \
                                                                            \
    _out0 = _sum0_m;                                                        \
    _out1 = _sum1_m;                                                        \
    _out2 = _sum2_m;                                                        \
    _out3 = _sum3_m;                                                        \
    _out4 = _dif3_m;                                                        \
    _out5 = _dif2_m;                                                        \
    _out6 = _dif1_m;                                                        \
    _out7 = _dif0_m;                                                        \
  }
2039*fb1b10abSAndroid Build Coastguard Worker
/*
 * 8-input butterfly on word elements:
 *   _out0 = _in0 + _in7;  _out1 = _in1 + _in6;
 *   _out2 = _in2 + _in5;  _out3 = _in3 + _in4;
 *   _out4 = _in3 - _in4;  _out5 = _in2 - _in5;
 *   _out6 = _in1 - _in6;  _out7 = _in0 - _in7;
 * Every result is computed into a temporary before any output is written, so
 * an output argument may name the same variable as an input (in-place use).
 */
#define LASX_BUTTERFLY_8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                           _out7)                                           \
  {                                                                         \
    __m256i _sum0_m = __lasx_xvadd_w(_in0, _in7);                           \
    __m256i _sum1_m = __lasx_xvadd_w(_in1, _in6);                           \
    __m256i _sum2_m = __lasx_xvadd_w(_in2, _in5);                           \
    __m256i _sum3_m = __lasx_xvadd_w(_in3, _in4);                           \
    __m256i _dif3_m = __lasx_xvsub_w(_in3, _in4);                           \
    __m256i _dif2_m = __lasx_xvsub_w(_in2, _in5);                           \
    __m256i _dif1_m = __lasx_xvsub_w(_in1, _in6);                           \
    __m256i _dif0_m = __lasx_xvsub_w(_in0, _in7);                           \
                                                                            \
    _out0 = _sum0_m;                                                        \
    _out1 = _sum1_m;                                                        \
    _out2 = _sum2_m;                                                        \
    _out3 = _sum3_m;                                                        \
    _out4 = _dif3_m;                                                        \
    _out5 = _dif2_m;                                                        \
    _out6 = _dif1_m;                                                        \
    _out7 = _dif0_m;                                                        \
  }
2053*fb1b10abSAndroid Build Coastguard Worker
/*
 * 8-input butterfly on doubleword elements:
 *   _out0 = _in0 + _in7;  _out1 = _in1 + _in6;
 *   _out2 = _in2 + _in5;  _out3 = _in3 + _in4;
 *   _out4 = _in3 - _in4;  _out5 = _in2 - _in5;
 *   _out6 = _in1 - _in6;  _out7 = _in0 - _in7;
 * Every result is computed into a temporary before any output is written, so
 * an output argument may name the same variable as an input (in-place use).
 */
#define LASX_BUTTERFLY_8_D(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                           _out7)                                           \
  {                                                                         \
    __m256i _sum0_m = __lasx_xvadd_d(_in0, _in7);                           \
    __m256i _sum1_m = __lasx_xvadd_d(_in1, _in6);                           \
    __m256i _sum2_m = __lasx_xvadd_d(_in2, _in5);                           \
    __m256i _sum3_m = __lasx_xvadd_d(_in3, _in4);                           \
    __m256i _dif3_m = __lasx_xvsub_d(_in3, _in4);                           \
    __m256i _dif2_m = __lasx_xvsub_d(_in2, _in5);                           \
    __m256i _dif1_m = __lasx_xvsub_d(_in1, _in6);                           \
    __m256i _dif0_m = __lasx_xvsub_d(_in0, _in7);                           \
                                                                            \
    _out0 = _sum0_m;                                                        \
    _out1 = _sum1_m;                                                        \
    _out2 = _sum2_m;                                                        \
    _out3 = _sum3_m;                                                        \
    _out4 = _dif3_m;                                                        \
    _out5 = _dif2_m;                                                        \
    _out6 = _dif1_m;                                                        \
    _out7 = _dif0_m;                                                        \
  }
2067*fb1b10abSAndroid Build Coastguard Worker
2068*fb1b10abSAndroid Build Coastguard Worker #endif // LASX
2069*fb1b10abSAndroid Build Coastguard Worker
2070*fb1b10abSAndroid Build Coastguard Worker /*
2071*fb1b10abSAndroid Build Coastguard Worker * =============================================================================
2072*fb1b10abSAndroid Build Coastguard Worker * Description : Print out elements in vector.
2073*fb1b10abSAndroid Build Coastguard Worker * Arguments : Inputs - RTYPE, _element_num, _in0, _enter
2074*fb1b10abSAndroid Build Coastguard Worker * Outputs -
2075*fb1b10abSAndroid Build Coastguard Worker * Details : Print out '_element_num' elements in 'RTYPE' vector '_in0', if
2076*fb1b10abSAndroid Build Coastguard Worker * '_enter' is TRUE, prefix "\nVP:" will be added first.
2077*fb1b10abSAndroid Build Coastguard Worker * Example : VECT_PRINT(v4i32,4,in0,1); // in0: 1,2,3,4
2078*fb1b10abSAndroid Build Coastguard Worker * VP:1,2,3,4,
2079*fb1b10abSAndroid Build Coastguard Worker * =============================================================================
2080*fb1b10abSAndroid Build Coastguard Worker */
/*
 * Debug helper: prints the first 'element_num' elements of 'in0' (cast to
 * RTYPE and indexed with []) via printf("%d,"), preceded by "\nVP:" when
 * 'enter' is non-zero.
 *
 * 'in0' and 'element_num' are parenthesized so that compound expressions
 * (e.g. 'a + b', 'c ? 2 : 4') expand as the caller intended, and the locals
 * use macro-specific names so they cannot shadow a caller variable that is
 * passed as an argument.
 * NOTE(review): "%d" is only correct for element types that promote to int;
 * 64-bit or floating-point element types would need a different format.
 */
#define VECT_PRINT(RTYPE, element_num, in0, enter)                             \
  {                                                                            \
    RTYPE _vp_vec_m = (RTYPE)(in0);                                            \
    int _vp_idx_m;                                                             \
    if (enter) printf("\nVP:");                                                \
    for (_vp_idx_m = 0; _vp_idx_m < (element_num); _vp_idx_m++) {              \
      printf("%d,", _vp_vec_m[_vp_idx_m]);                                     \
    }                                                                          \
  }
2088*fb1b10abSAndroid Build Coastguard Worker
2089*fb1b10abSAndroid Build Coastguard Worker #endif /* LOONGSON_INTRINSICS_H */
2090*fb1b10abSAndroid Build Coastguard Worker #endif /* VPX_VPX_UTIL_LOONGSON_INTRINSICS_H_ */
2091