xref: /aosp_15_r20/external/libvpx/vpx_util/loongson_intrinsics.h (revision fb1b10ab9aebc7c7068eedab379b749d7e3900be)
/*
 * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS.  All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 *
 */
11*fb1b10abSAndroid Build Coastguard Worker 
12*fb1b10abSAndroid Build Coastguard Worker #ifndef VPX_VPX_UTIL_LOONGSON_INTRINSICS_H_
13*fb1b10abSAndroid Build Coastguard Worker #define VPX_VPX_UTIL_LOONGSON_INTRINSICS_H_
14*fb1b10abSAndroid Build Coastguard Worker 
/*
 * Copyright (c) 2021 Loongson Technology Corporation Limited
 * All rights reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS.  All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 *
 * Contributed by Shiyou Yin <[email protected]>
 *                Xiwei Gu   <[email protected]>
 *                Lu Wang    <[email protected]>
 *
 * This file is a header file for loongarch builtin extension.
 *
 */
32*fb1b10abSAndroid Build Coastguard Worker 
33*fb1b10abSAndroid Build Coastguard Worker #ifndef LOONGSON_INTRINSICS_H
34*fb1b10abSAndroid Build Coastguard Worker #define LOONGSON_INTRINSICS_H
35*fb1b10abSAndroid Build Coastguard Worker 
/**
 * Version of this helper header.
 * MAJOR version: Macro usage changes.
 * MINOR version: New functions added, or bug fixes.
 * MICRO version: Comment changes or implementation changes.
 */
#define LSOM_VERSION_MAJOR 1
#define LSOM_VERSION_MINOR 2
#define LSOM_VERSION_MICRO 1
44*fb1b10abSAndroid Build Coastguard Worker 
/*
 * Apply the one-argument operation _INS to two independent inputs:
 *   _OUT0 = _INS(_IN0)
 *   _OUT1 = _INS(_IN1)
 * Expands to a brace-enclosed block; _OUT0 and _OUT1 must be assignable.
 */
#define DUP2_ARG1(_INS, _IN0, _IN1, _OUT0, _OUT1) \
  {                                               \
    _OUT0 = _INS(_IN0);                           \
    _OUT1 = _INS(_IN1);                           \
  }
50*fb1b10abSAndroid Build Coastguard Worker 
/*
 * Apply the two-argument operation _INS to two independent input pairs:
 *   _OUT0 = _INS(_IN0, _IN1)
 *   _OUT1 = _INS(_IN2, _IN3)
 * Expands to a brace-enclosed block; _OUT0 and _OUT1 must be assignable.
 */
#define DUP2_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1) \
  {                                                           \
    _OUT0 = _INS(_IN0, _IN1);                                 \
    _OUT1 = _INS(_IN2, _IN3);                                 \
  }
56*fb1b10abSAndroid Build Coastguard Worker 
/*
 * Apply the three-argument operation _INS to two independent input triples:
 *   _OUT0 = _INS(_IN0, _IN1, _IN2)
 *   _OUT1 = _INS(_IN3, _IN4, _IN5)
 * Expands to a brace-enclosed block; _OUT0 and _OUT1 must be assignable.
 */
#define DUP2_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _OUT0, _OUT1) \
  {                                                                       \
    _OUT0 = _INS(_IN0, _IN1, _IN2);                                       \
    _OUT1 = _INS(_IN3, _IN4, _IN5);                                       \
  }
62*fb1b10abSAndroid Build Coastguard Worker 
/*
 * Apply the one-argument operation _INS to four independent inputs:
 *   _OUTn = _INS(_INn)   for n = 0..3, evaluated in that order.
 * Expands to a brace-enclosed block; every _OUTn must be assignable.
 */
#define DUP4_ARG1(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1, _OUT2, _OUT3) \
  {                                                                         \
    _OUT0 = _INS(_IN0);                                                     \
    _OUT1 = _INS(_IN1);                                                     \
    _OUT2 = _INS(_IN2);                                                     \
    _OUT3 = _INS(_IN3);                                                     \
  }
68*fb1b10abSAndroid Build Coastguard Worker 
/*
 * Apply the two-argument operation _INS to four independent input pairs:
 *   _OUT0 = _INS(_IN0, _IN1)
 *   _OUT1 = _INS(_IN2, _IN3)
 *   _OUT2 = _INS(_IN4, _IN5)
 *   _OUT3 = _INS(_IN6, _IN7)
 * Expands to a brace-enclosed block; every _OUTn must be assignable.
 */
#define DUP4_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _IN6, _IN7, _OUT0, \
                  _OUT1, _OUT2, _OUT3)                                         \
  {                                                                            \
    _OUT0 = _INS(_IN0, _IN1);                                                  \
    _OUT1 = _INS(_IN2, _IN3);                                                  \
    _OUT2 = _INS(_IN4, _IN5);                                                  \
    _OUT3 = _INS(_IN6, _IN7);                                                  \
  }
75*fb1b10abSAndroid Build Coastguard Worker 
/*
 * Apply the three-argument operation _INS to four independent input triples:
 *   _OUT0 = _INS(_IN0, _IN1, _IN2)
 *   _OUT1 = _INS(_IN3, _IN4, _IN5)
 *   _OUT2 = _INS(_IN6, _IN7, _IN8)
 *   _OUT3 = _INS(_IN9, _IN10, _IN11)
 * Expands to a brace-enclosed block; every _OUTn must be assignable.
 */
#define DUP4_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _IN6, _IN7, _IN8, \
                  _IN9, _IN10, _IN11, _OUT0, _OUT1, _OUT2, _OUT3)             \
  {                                                                           \
    _OUT0 = _INS(_IN0, _IN1, _IN2);                                           \
    _OUT1 = _INS(_IN3, _IN4, _IN5);                                           \
    _OUT2 = _INS(_IN6, _IN7, _IN8);                                           \
    _OUT3 = _INS(_IN9, _IN10, _IN11);                                         \
  }
82*fb1b10abSAndroid Build Coastguard Worker 
83*fb1b10abSAndroid Build Coastguard Worker #ifdef __loongarch_sx
84*fb1b10abSAndroid Build Coastguard Worker #include <lsxintrin.h>
85*fb1b10abSAndroid Build Coastguard Worker /*
86*fb1b10abSAndroid Build Coastguard Worker  * =============================================================================
87*fb1b10abSAndroid Build Coastguard Worker  * Description : Dot product & addition of byte vector elements
88*fb1b10abSAndroid Build Coastguard Worker  * Arguments   : Inputs  - in_c, in_h, in_l
89*fb1b10abSAndroid Build Coastguard Worker  *               Outputs - out
90*fb1b10abSAndroid Build Coastguard Worker  *               Return Type - halfword
91*fb1b10abSAndroid Build Coastguard Worker  * Details     : Signed byte elements from in_h are multiplied by
92*fb1b10abSAndroid Build Coastguard Worker  *               signed byte elements from in_l, and then added adjacent to
93*fb1b10abSAndroid Build Coastguard Worker  *               each other to get a result twice the size of input. Then
94*fb1b10abSAndroid Build Coastguard Worker  *               the results are added to signed half-word elements from in_c.
95*fb1b10abSAndroid Build Coastguard Worker  * Example     : out = __lsx_vdp2add_h_b(in_c, in_h, in_l)
96*fb1b10abSAndroid Build Coastguard Worker  *        in_c : 1,2,3,4, 1,2,3,4
97*fb1b10abSAndroid Build Coastguard Worker  *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
98*fb1b10abSAndroid Build Coastguard Worker  *        in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
99*fb1b10abSAndroid Build Coastguard Worker  *         out : 23,40,41,26, 23,40,41,26
100*fb1b10abSAndroid Build Coastguard Worker  * =============================================================================
101*fb1b10abSAndroid Build Coastguard Worker  */
__lsx_vdp2add_h_b(__m128i in_c,__m128i in_h,__m128i in_l)102*fb1b10abSAndroid Build Coastguard Worker static inline __m128i __lsx_vdp2add_h_b(__m128i in_c, __m128i in_h,
103*fb1b10abSAndroid Build Coastguard Worker                                         __m128i in_l) {
104*fb1b10abSAndroid Build Coastguard Worker   __m128i out;
105*fb1b10abSAndroid Build Coastguard Worker 
106*fb1b10abSAndroid Build Coastguard Worker   out = __lsx_vmaddwev_h_b(in_c, in_h, in_l);
107*fb1b10abSAndroid Build Coastguard Worker   out = __lsx_vmaddwod_h_b(out, in_h, in_l);
108*fb1b10abSAndroid Build Coastguard Worker   return out;
109*fb1b10abSAndroid Build Coastguard Worker }
110*fb1b10abSAndroid Build Coastguard Worker 
111*fb1b10abSAndroid Build Coastguard Worker /*
112*fb1b10abSAndroid Build Coastguard Worker  * =============================================================================
113*fb1b10abSAndroid Build Coastguard Worker  * Description : Dot product & addition of byte vector elements
114*fb1b10abSAndroid Build Coastguard Worker  * Arguments   : Inputs  - in_c, in_h, in_l
115*fb1b10abSAndroid Build Coastguard Worker  *               Outputs - out
116*fb1b10abSAndroid Build Coastguard Worker  *               Return Type - halfword
117*fb1b10abSAndroid Build Coastguard Worker  * Details     : Unsigned byte elements from in_h are multiplied by
118*fb1b10abSAndroid Build Coastguard Worker  *               unsigned byte elements from in_l, and then added adjacent to
119*fb1b10abSAndroid Build Coastguard Worker  *               each other to get a result twice the size of input.
120*fb1b10abSAndroid Build Coastguard Worker  *               The results are added to signed half-word elements from in_c.
121*fb1b10abSAndroid Build Coastguard Worker  * Example     : out = __lsx_vdp2add_h_bu(in_c, in_h, in_l)
122*fb1b10abSAndroid Build Coastguard Worker  *        in_c : 1,2,3,4, 1,2,3,4
123*fb1b10abSAndroid Build Coastguard Worker  *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
124*fb1b10abSAndroid Build Coastguard Worker  *        in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
125*fb1b10abSAndroid Build Coastguard Worker  *         out : 23,40,41,26, 23,40,41,26
126*fb1b10abSAndroid Build Coastguard Worker  * =============================================================================
127*fb1b10abSAndroid Build Coastguard Worker  */
__lsx_vdp2add_h_bu(__m128i in_c,__m128i in_h,__m128i in_l)128*fb1b10abSAndroid Build Coastguard Worker static inline __m128i __lsx_vdp2add_h_bu(__m128i in_c, __m128i in_h,
129*fb1b10abSAndroid Build Coastguard Worker                                          __m128i in_l) {
130*fb1b10abSAndroid Build Coastguard Worker   __m128i out;
131*fb1b10abSAndroid Build Coastguard Worker 
132*fb1b10abSAndroid Build Coastguard Worker   out = __lsx_vmaddwev_h_bu(in_c, in_h, in_l);
133*fb1b10abSAndroid Build Coastguard Worker   out = __lsx_vmaddwod_h_bu(out, in_h, in_l);
134*fb1b10abSAndroid Build Coastguard Worker   return out;
135*fb1b10abSAndroid Build Coastguard Worker }
136*fb1b10abSAndroid Build Coastguard Worker 
137*fb1b10abSAndroid Build Coastguard Worker /*
138*fb1b10abSAndroid Build Coastguard Worker  * =============================================================================
139*fb1b10abSAndroid Build Coastguard Worker  * Description : Dot product & addition of byte vector elements
140*fb1b10abSAndroid Build Coastguard Worker  * Arguments   : Inputs  - in_c, in_h, in_l
141*fb1b10abSAndroid Build Coastguard Worker  *               Outputs - out
142*fb1b10abSAndroid Build Coastguard Worker  *               Return Type - halfword
143*fb1b10abSAndroid Build Coastguard Worker  * Details     : Unsigned byte elements from in_h are multiplied by
144*fb1b10abSAndroid Build Coastguard Worker  *               signed byte elements from in_l, and then added adjacent to
145*fb1b10abSAndroid Build Coastguard Worker  *               each other to get a result twice the size of input.
146*fb1b10abSAndroid Build Coastguard Worker  *               The results are added to signed half-word elements from in_c.
147*fb1b10abSAndroid Build Coastguard Worker  * Example     : out = __lsx_vdp2add_h_bu_b(in_c, in_h, in_l)
148*fb1b10abSAndroid Build Coastguard Worker  *        in_c : 1,1,1,1, 1,1,1,1
149*fb1b10abSAndroid Build Coastguard Worker  *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
150*fb1b10abSAndroid Build Coastguard Worker  *        in_l : -1,-2,-3,-4, -5,-6,-7,-8, 1,2,3,4, 5,6,7,8
151*fb1b10abSAndroid Build Coastguard Worker  *         out : -4,-24,-60,-112, 6,26,62,114
152*fb1b10abSAndroid Build Coastguard Worker  * =============================================================================
153*fb1b10abSAndroid Build Coastguard Worker  */
__lsx_vdp2add_h_bu_b(__m128i in_c,__m128i in_h,__m128i in_l)154*fb1b10abSAndroid Build Coastguard Worker static inline __m128i __lsx_vdp2add_h_bu_b(__m128i in_c, __m128i in_h,
155*fb1b10abSAndroid Build Coastguard Worker                                            __m128i in_l) {
156*fb1b10abSAndroid Build Coastguard Worker   __m128i out;
157*fb1b10abSAndroid Build Coastguard Worker 
158*fb1b10abSAndroid Build Coastguard Worker   out = __lsx_vmaddwev_h_bu_b(in_c, in_h, in_l);
159*fb1b10abSAndroid Build Coastguard Worker   out = __lsx_vmaddwod_h_bu_b(out, in_h, in_l);
160*fb1b10abSAndroid Build Coastguard Worker   return out;
161*fb1b10abSAndroid Build Coastguard Worker }
162*fb1b10abSAndroid Build Coastguard Worker 
163*fb1b10abSAndroid Build Coastguard Worker /*
164*fb1b10abSAndroid Build Coastguard Worker  * =============================================================================
165*fb1b10abSAndroid Build Coastguard Worker  * Description : Dot product & addition of half-word vector elements
166*fb1b10abSAndroid Build Coastguard Worker  * Arguments   : Inputs  - in_c, in_h, in_l
167*fb1b10abSAndroid Build Coastguard Worker  *               Outputs - out
168*fb1b10abSAndroid Build Coastguard Worker  *               Return Type - __m128i
169*fb1b10abSAndroid Build Coastguard Worker  * Details     : Signed half-word elements from in_h are multiplied by
170*fb1b10abSAndroid Build Coastguard Worker  *               signed half-word elements from in_l, and then added adjacent to
171*fb1b10abSAndroid Build Coastguard Worker  *               each other to get a result twice the size of input.
172*fb1b10abSAndroid Build Coastguard Worker  *               Then the results are added to signed word elements from in_c.
173*fb1b10abSAndroid Build Coastguard Worker  * Example     : out = __lsx_vdp2add_h_b(in_c, in_h, in_l)
174*fb1b10abSAndroid Build Coastguard Worker  *        in_c : 1,2,3,4
175*fb1b10abSAndroid Build Coastguard Worker  *        in_h : 1,2,3,4, 5,6,7,8
176*fb1b10abSAndroid Build Coastguard Worker  *        in_l : 8,7,6,5, 4,3,2,1
177*fb1b10abSAndroid Build Coastguard Worker  *         out : 23,40,41,26
178*fb1b10abSAndroid Build Coastguard Worker  * =============================================================================
179*fb1b10abSAndroid Build Coastguard Worker  */
__lsx_vdp2add_w_h(__m128i in_c,__m128i in_h,__m128i in_l)180*fb1b10abSAndroid Build Coastguard Worker static inline __m128i __lsx_vdp2add_w_h(__m128i in_c, __m128i in_h,
181*fb1b10abSAndroid Build Coastguard Worker                                         __m128i in_l) {
182*fb1b10abSAndroid Build Coastguard Worker   __m128i out;
183*fb1b10abSAndroid Build Coastguard Worker 
184*fb1b10abSAndroid Build Coastguard Worker   out = __lsx_vmaddwev_w_h(in_c, in_h, in_l);
185*fb1b10abSAndroid Build Coastguard Worker   out = __lsx_vmaddwod_w_h(out, in_h, in_l);
186*fb1b10abSAndroid Build Coastguard Worker   return out;
187*fb1b10abSAndroid Build Coastguard Worker }
188*fb1b10abSAndroid Build Coastguard Worker 
189*fb1b10abSAndroid Build Coastguard Worker /*
190*fb1b10abSAndroid Build Coastguard Worker  * =============================================================================
191*fb1b10abSAndroid Build Coastguard Worker  * Description : Dot product of byte vector elements
192*fb1b10abSAndroid Build Coastguard Worker  * Arguments   : Inputs  - in_h, in_l
193*fb1b10abSAndroid Build Coastguard Worker  *               Outputs - out
194*fb1b10abSAndroid Build Coastguard Worker  *               Return Type - halfword
195*fb1b10abSAndroid Build Coastguard Worker  * Details     : Signed byte elements from in_h are multiplied by
196*fb1b10abSAndroid Build Coastguard Worker  *               signed byte elements from in_l, and then added adjacent to
197*fb1b10abSAndroid Build Coastguard Worker  *               each other to get a result twice the size of input.
198*fb1b10abSAndroid Build Coastguard Worker  * Example     : out = __lsx_vdp2_h_b(in_h, in_l)
199*fb1b10abSAndroid Build Coastguard Worker  *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
200*fb1b10abSAndroid Build Coastguard Worker  *        in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
201*fb1b10abSAndroid Build Coastguard Worker  *         out : 22,38,38,22, 22,38,38,22
202*fb1b10abSAndroid Build Coastguard Worker  * =============================================================================
203*fb1b10abSAndroid Build Coastguard Worker  */
__lsx_vdp2_h_b(__m128i in_h,__m128i in_l)204*fb1b10abSAndroid Build Coastguard Worker static inline __m128i __lsx_vdp2_h_b(__m128i in_h, __m128i in_l) {
205*fb1b10abSAndroid Build Coastguard Worker   __m128i out;
206*fb1b10abSAndroid Build Coastguard Worker 
207*fb1b10abSAndroid Build Coastguard Worker   out = __lsx_vmulwev_h_b(in_h, in_l);
208*fb1b10abSAndroid Build Coastguard Worker   out = __lsx_vmaddwod_h_b(out, in_h, in_l);
209*fb1b10abSAndroid Build Coastguard Worker   return out;
210*fb1b10abSAndroid Build Coastguard Worker }
211*fb1b10abSAndroid Build Coastguard Worker 
212*fb1b10abSAndroid Build Coastguard Worker /*
213*fb1b10abSAndroid Build Coastguard Worker  * =============================================================================
214*fb1b10abSAndroid Build Coastguard Worker  * Description : Dot product of byte vector elements
215*fb1b10abSAndroid Build Coastguard Worker  * Arguments   : Inputs  - in_h, in_l
216*fb1b10abSAndroid Build Coastguard Worker  *               Outputs - out
217*fb1b10abSAndroid Build Coastguard Worker  *               Return Type - halfword
218*fb1b10abSAndroid Build Coastguard Worker  * Details     : Unsigned byte elements from in_h are multiplied by
219*fb1b10abSAndroid Build Coastguard Worker  *               unsigned byte elements from in_l, and then added adjacent to
220*fb1b10abSAndroid Build Coastguard Worker  *               each other to get a result twice the size of input.
221*fb1b10abSAndroid Build Coastguard Worker  * Example     : out = __lsx_vdp2_h_bu(in_h, in_l)
222*fb1b10abSAndroid Build Coastguard Worker  *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
223*fb1b10abSAndroid Build Coastguard Worker  *        in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
224*fb1b10abSAndroid Build Coastguard Worker  *         out : 22,38,38,22, 22,38,38,22
225*fb1b10abSAndroid Build Coastguard Worker  * =============================================================================
226*fb1b10abSAndroid Build Coastguard Worker  */
__lsx_vdp2_h_bu(__m128i in_h,__m128i in_l)227*fb1b10abSAndroid Build Coastguard Worker static inline __m128i __lsx_vdp2_h_bu(__m128i in_h, __m128i in_l) {
228*fb1b10abSAndroid Build Coastguard Worker   __m128i out;
229*fb1b10abSAndroid Build Coastguard Worker 
230*fb1b10abSAndroid Build Coastguard Worker   out = __lsx_vmulwev_h_bu(in_h, in_l);
231*fb1b10abSAndroid Build Coastguard Worker   out = __lsx_vmaddwod_h_bu(out, in_h, in_l);
232*fb1b10abSAndroid Build Coastguard Worker   return out;
233*fb1b10abSAndroid Build Coastguard Worker }
234*fb1b10abSAndroid Build Coastguard Worker 
235*fb1b10abSAndroid Build Coastguard Worker /*
236*fb1b10abSAndroid Build Coastguard Worker  * =============================================================================
237*fb1b10abSAndroid Build Coastguard Worker  * Description : Dot product of byte vector elements
238*fb1b10abSAndroid Build Coastguard Worker  * Arguments   : Inputs  - in_h, in_l
239*fb1b10abSAndroid Build Coastguard Worker  *               Outputs - out
240*fb1b10abSAndroid Build Coastguard Worker  *               Return Type - halfword
241*fb1b10abSAndroid Build Coastguard Worker  * Details     : Unsigned byte elements from in_h are multiplied by
242*fb1b10abSAndroid Build Coastguard Worker  *               signed byte elements from in_l, and then added adjacent to
243*fb1b10abSAndroid Build Coastguard Worker  *               each other to get a result twice the size of input.
244*fb1b10abSAndroid Build Coastguard Worker  * Example     : out = __lsx_vdp2_h_bu_b(in_h, in_l)
245*fb1b10abSAndroid Build Coastguard Worker  *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
246*fb1b10abSAndroid Build Coastguard Worker  *        in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,-1
247*fb1b10abSAndroid Build Coastguard Worker  *         out : 22,38,38,22, 22,38,38,6
248*fb1b10abSAndroid Build Coastguard Worker  * =============================================================================
249*fb1b10abSAndroid Build Coastguard Worker  */
__lsx_vdp2_h_bu_b(__m128i in_h,__m128i in_l)250*fb1b10abSAndroid Build Coastguard Worker static inline __m128i __lsx_vdp2_h_bu_b(__m128i in_h, __m128i in_l) {
251*fb1b10abSAndroid Build Coastguard Worker   __m128i out;
252*fb1b10abSAndroid Build Coastguard Worker 
253*fb1b10abSAndroid Build Coastguard Worker   out = __lsx_vmulwev_h_bu_b(in_h, in_l);
254*fb1b10abSAndroid Build Coastguard Worker   out = __lsx_vmaddwod_h_bu_b(out, in_h, in_l);
255*fb1b10abSAndroid Build Coastguard Worker   return out;
256*fb1b10abSAndroid Build Coastguard Worker }
257*fb1b10abSAndroid Build Coastguard Worker 
258*fb1b10abSAndroid Build Coastguard Worker /*
259*fb1b10abSAndroid Build Coastguard Worker  * =============================================================================
260*fb1b10abSAndroid Build Coastguard Worker  * Description : Dot product of byte vector elements
261*fb1b10abSAndroid Build Coastguard Worker  * Arguments   : Inputs  - in_h, in_l
262*fb1b10abSAndroid Build Coastguard Worker  *               Outputs - out
263*fb1b10abSAndroid Build Coastguard Worker  *               Return Type - halfword
264*fb1b10abSAndroid Build Coastguard Worker  * Details     : Signed byte elements from in_h are multiplied by
265*fb1b10abSAndroid Build Coastguard Worker  *               signed byte elements from in_l, and then added adjacent to
266*fb1b10abSAndroid Build Coastguard Worker  *               each other to get a result twice the size of input.
267*fb1b10abSAndroid Build Coastguard Worker  * Example     : out = __lsx_vdp2_w_h(in_h, in_l)
268*fb1b10abSAndroid Build Coastguard Worker  *        in_h : 1,2,3,4, 5,6,7,8
269*fb1b10abSAndroid Build Coastguard Worker  *        in_l : 8,7,6,5, 4,3,2,1
270*fb1b10abSAndroid Build Coastguard Worker  *         out : 22,38,38,22
271*fb1b10abSAndroid Build Coastguard Worker  * =============================================================================
272*fb1b10abSAndroid Build Coastguard Worker  */
__lsx_vdp2_w_h(__m128i in_h,__m128i in_l)273*fb1b10abSAndroid Build Coastguard Worker static inline __m128i __lsx_vdp2_w_h(__m128i in_h, __m128i in_l) {
274*fb1b10abSAndroid Build Coastguard Worker   __m128i out;
275*fb1b10abSAndroid Build Coastguard Worker 
276*fb1b10abSAndroid Build Coastguard Worker   out = __lsx_vmulwev_w_h(in_h, in_l);
277*fb1b10abSAndroid Build Coastguard Worker   out = __lsx_vmaddwod_w_h(out, in_h, in_l);
278*fb1b10abSAndroid Build Coastguard Worker   return out;
279*fb1b10abSAndroid Build Coastguard Worker }
280*fb1b10abSAndroid Build Coastguard Worker 
281*fb1b10abSAndroid Build Coastguard Worker /*
282*fb1b10abSAndroid Build Coastguard Worker  * =============================================================================
283*fb1b10abSAndroid Build Coastguard Worker  * Description : Dot product of byte vector elements
284*fb1b10abSAndroid Build Coastguard Worker  * Arguments   : Inputs  - in_h, in_l
285*fb1b10abSAndroid Build Coastguard Worker  *               Outputs - out
286*fb1b10abSAndroid Build Coastguard Worker  *               Return Type - double
287*fb1b10abSAndroid Build Coastguard Worker  * Details     : Signed byte elements from in_h are multiplied by
288*fb1b10abSAndroid Build Coastguard Worker  *               signed byte elements from in_l, and then added adjacent to
289*fb1b10abSAndroid Build Coastguard Worker  *               each other to get a result twice the size of input.
290*fb1b10abSAndroid Build Coastguard Worker  * Example     : out = __lsx_vdp2_d_w(in_h, in_l)
291*fb1b10abSAndroid Build Coastguard Worker  *        in_h : 1,2,3,4
292*fb1b10abSAndroid Build Coastguard Worker  *        in_l : 8,7,6,5
293*fb1b10abSAndroid Build Coastguard Worker  *         out : 22,38
294*fb1b10abSAndroid Build Coastguard Worker  * =============================================================================
295*fb1b10abSAndroid Build Coastguard Worker  */
__lsx_vdp2_d_w(__m128i in_h,__m128i in_l)296*fb1b10abSAndroid Build Coastguard Worker static inline __m128i __lsx_vdp2_d_w(__m128i in_h, __m128i in_l) {
297*fb1b10abSAndroid Build Coastguard Worker   __m128i out;
298*fb1b10abSAndroid Build Coastguard Worker 
299*fb1b10abSAndroid Build Coastguard Worker   out = __lsx_vmulwev_d_w(in_h, in_l);
300*fb1b10abSAndroid Build Coastguard Worker   out = __lsx_vmaddwod_d_w(out, in_h, in_l);
301*fb1b10abSAndroid Build Coastguard Worker   return out;
302*fb1b10abSAndroid Build Coastguard Worker }
303*fb1b10abSAndroid Build Coastguard Worker 
304*fb1b10abSAndroid Build Coastguard Worker /*
305*fb1b10abSAndroid Build Coastguard Worker  * =============================================================================
306*fb1b10abSAndroid Build Coastguard Worker  * Description : Clip all halfword elements of input vector between min & max
307*fb1b10abSAndroid Build Coastguard Worker  *               out = ((_in) < (min)) ? (min) : (((_in) > (max)) ? (max) :
308*fb1b10abSAndroid Build Coastguard Worker  *               (_in))
309*fb1b10abSAndroid Build Coastguard Worker  * Arguments   : Inputs  - _in  (input vector)
310*fb1b10abSAndroid Build Coastguard Worker  *                       - min  (min threshold)
311*fb1b10abSAndroid Build Coastguard Worker  *                       - max  (max threshold)
312*fb1b10abSAndroid Build Coastguard Worker  *               Outputs - out  (output vector with clipped elements)
313*fb1b10abSAndroid Build Coastguard Worker  *               Return Type - signed halfword
314*fb1b10abSAndroid Build Coastguard Worker  * Example     : out = __lsx_vclip_h(_in)
315*fb1b10abSAndroid Build Coastguard Worker  *         _in : -8,2,280,249, -8,255,280,249
316*fb1b10abSAndroid Build Coastguard Worker  *         min : 1,1,1,1, 1,1,1,1
317*fb1b10abSAndroid Build Coastguard Worker  *         max : 9,9,9,9, 9,9,9,9
318*fb1b10abSAndroid Build Coastguard Worker  *         out : 1,2,9,9, 1,9,9,9
319*fb1b10abSAndroid Build Coastguard Worker  * =============================================================================
320*fb1b10abSAndroid Build Coastguard Worker  */
__lsx_vclip_h(__m128i _in,__m128i min,__m128i max)321*fb1b10abSAndroid Build Coastguard Worker static inline __m128i __lsx_vclip_h(__m128i _in, __m128i min, __m128i max) {
322*fb1b10abSAndroid Build Coastguard Worker   __m128i out;
323*fb1b10abSAndroid Build Coastguard Worker 
324*fb1b10abSAndroid Build Coastguard Worker   out = __lsx_vmax_h(min, _in);
325*fb1b10abSAndroid Build Coastguard Worker   out = __lsx_vmin_h(max, out);
326*fb1b10abSAndroid Build Coastguard Worker   return out;
327*fb1b10abSAndroid Build Coastguard Worker }
328*fb1b10abSAndroid Build Coastguard Worker 
329*fb1b10abSAndroid Build Coastguard Worker /*
330*fb1b10abSAndroid Build Coastguard Worker  * =============================================================================
331*fb1b10abSAndroid Build Coastguard Worker  * Description : Set each element of vector between 0 and 255
332*fb1b10abSAndroid Build Coastguard Worker  * Arguments   : Inputs  - _in
333*fb1b10abSAndroid Build Coastguard Worker  *               Outputs - out
334*fb1b10abSAndroid Build Coastguard Worker  *               Return Type - halfword
335*fb1b10abSAndroid Build Coastguard Worker  * Details     : Signed byte elements from _in are clamped between 0 and 255.
336*fb1b10abSAndroid Build Coastguard Worker  * Example     : out = __lsx_vclip255_h(_in)
337*fb1b10abSAndroid Build Coastguard Worker  *         _in : -8,255,280,249, -8,255,280,249
338*fb1b10abSAndroid Build Coastguard Worker  *         out : 0,255,255,249, 0,255,255,249
339*fb1b10abSAndroid Build Coastguard Worker  * =============================================================================
340*fb1b10abSAndroid Build Coastguard Worker  */
__lsx_vclip255_h(__m128i _in)341*fb1b10abSAndroid Build Coastguard Worker static inline __m128i __lsx_vclip255_h(__m128i _in) {
342*fb1b10abSAndroid Build Coastguard Worker   __m128i out;
343*fb1b10abSAndroid Build Coastguard Worker 
344*fb1b10abSAndroid Build Coastguard Worker   out = __lsx_vmaxi_h(_in, 0);
345*fb1b10abSAndroid Build Coastguard Worker   out = __lsx_vsat_hu(out, 7);
346*fb1b10abSAndroid Build Coastguard Worker   return out;
347*fb1b10abSAndroid Build Coastguard Worker }
348*fb1b10abSAndroid Build Coastguard Worker 
349*fb1b10abSAndroid Build Coastguard Worker /*
350*fb1b10abSAndroid Build Coastguard Worker  * =============================================================================
351*fb1b10abSAndroid Build Coastguard Worker  * Description : Set each element of vector between 0 and 255
352*fb1b10abSAndroid Build Coastguard Worker  * Arguments   : Inputs  - _in
353*fb1b10abSAndroid Build Coastguard Worker  *               Outputs - out
354*fb1b10abSAndroid Build Coastguard Worker  *               Return Type - word
355*fb1b10abSAndroid Build Coastguard Worker  * Details     : Signed byte elements from _in are clamped between 0 and 255.
356*fb1b10abSAndroid Build Coastguard Worker  * Example     : out = __lsx_vclip255_w(_in)
357*fb1b10abSAndroid Build Coastguard Worker  *         _in : -8,255,280,249
358*fb1b10abSAndroid Build Coastguard Worker  *         out : 0,255,255,249
359*fb1b10abSAndroid Build Coastguard Worker  * =============================================================================
360*fb1b10abSAndroid Build Coastguard Worker  */
__lsx_vclip255_w(__m128i _in)361*fb1b10abSAndroid Build Coastguard Worker static inline __m128i __lsx_vclip255_w(__m128i _in) {
362*fb1b10abSAndroid Build Coastguard Worker   __m128i out;
363*fb1b10abSAndroid Build Coastguard Worker 
364*fb1b10abSAndroid Build Coastguard Worker   out = __lsx_vmaxi_w(_in, 0);
365*fb1b10abSAndroid Build Coastguard Worker   out = __lsx_vsat_wu(out, 7);
366*fb1b10abSAndroid Build Coastguard Worker   return out;
367*fb1b10abSAndroid Build Coastguard Worker }
368*fb1b10abSAndroid Build Coastguard Worker 
/*
 * =============================================================================
 * Description : Swap two variables
 * Arguments   : Inputs  - _in0, _in1
 *               Outputs - _in0, _in1 (in-place)
 * Details     : Swapping of two input variables using xor.
 *               _in0 and _in1 must be two distinct variables: passing the
 *               same variable for both leaves it all-zero, because the
 *               first xor clears it.
 * Example     : LSX_SWAP(_in0, _in1)
 *        _in0 : 1,2,3,4
 *        _in1 : 5,6,7,8
 *   _in0(out) : 5,6,7,8
 *   _in1(out) : 1,2,3,4
 * =============================================================================
 */
#define LSX_SWAP(_in0, _in1)         \
  {                                  \
    _in0 = __lsx_vxor_v(_in0, _in1); \
    _in1 = __lsx_vxor_v(_in0, _in1); \
    _in0 = __lsx_vxor_v(_in0, _in1); \
  }
388*fb1b10abSAndroid Build Coastguard Worker 
/*
 * =============================================================================
 * Description : Transpose 4x4 block with word elements in vectors
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3
 *               Outputs - _out0, _out1, _out2, _out3
 * Details     : Rows are first interleaved pairwise at word granularity
 *               (_in0/_in1 and _in2/_in3), then the intermediate vectors are
 *               interleaved at double-word granularity to form the columns.
 *               All inputs are read into temporaries before any output is
 *               written.
 * Example     :
 *               1, 2, 3, 4            1, 5, 9,13
 *               5, 6, 7, 8    to      2, 6,10,14
 *               9,10,11,12  =====>    3, 7,11,15
 *              13,14,15,16            4, 8,12,16
 * =============================================================================
 */
#define LSX_TRANSPOSE4x4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  {                                                                            \
    __m128i _t0, _t1, _t2, _t3;                                                \
                                                                               \
    _t0 = __lsx_vilvl_w(_in1, _in0);                                           \
    _t1 = __lsx_vilvh_w(_in1, _in0);                                           \
    _t2 = __lsx_vilvl_w(_in3, _in2);                                           \
    _t3 = __lsx_vilvh_w(_in3, _in2);                                           \
    _out0 = __lsx_vilvl_d(_t2, _t0);                                           \
    _out1 = __lsx_vilvh_d(_t2, _t0);                                           \
    _out2 = __lsx_vilvl_d(_t3, _t1);                                           \
    _out3 = __lsx_vilvh_d(_t3, _t1);                                           \
  }
415*fb1b10abSAndroid Build Coastguard Worker 
/*
 * =============================================================================
 * Description : Transpose 8x8 block with byte elements in vectors
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
 *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
 *               _out7
 * Details     : The rows of the matrix become columns, and the columns
 *               become rows. Only the low 8 bytes of each input are used.
 *               The even outputs are built by interleaving; each odd output
 *               is then taken from the high 8 bytes of the preceding even
 *               output with a byte shuffle against a zero vector.
 *               NOTE(review): because of that, the high 8 bytes of the even
 *               outputs (_out0, _out2, _out4, _out6) appear to hold the next
 *               column rather than zeros; rely only on the low 8 bytes.
 * Example     : LSX_TRANSPOSE8x8_B
 *        _in0 : 00,01,02,03,04,05,06,07, 00,00,00,00,00,00,00,00
 *        _in1 : 10,11,12,13,14,15,16,17, 00,00,00,00,00,00,00,00
 *        _in2 : 20,21,22,23,24,25,26,27, 00,00,00,00,00,00,00,00
 *        _in3 : 30,31,32,33,34,35,36,37, 00,00,00,00,00,00,00,00
 *        _in4 : 40,41,42,43,44,45,46,47, 00,00,00,00,00,00,00,00
 *        _in5 : 50,51,52,53,54,55,56,57, 00,00,00,00,00,00,00,00
 *        _in6 : 60,61,62,63,64,65,66,67, 00,00,00,00,00,00,00,00
 *        _in7 : 70,71,72,73,74,75,76,77, 00,00,00,00,00,00,00,00
 *
 *       _out0 : 00,10,20,30,40,50,60,70, (see note above)
 *       _out1 : 01,11,21,31,41,51,61,71, 00,00,00,00,00,00,00,00
 *       _out2 : 02,12,22,32,42,52,62,72, (see note above)
 *       _out3 : 03,13,23,33,43,53,63,73, 00,00,00,00,00,00,00,00
 *       _out4 : 04,14,24,34,44,54,64,74, (see note above)
 *       _out5 : 05,15,25,35,45,55,65,75, 00,00,00,00,00,00,00,00
 *       _out6 : 06,16,26,36,46,56,66,76, (see note above)
 *       _out7 : 07,17,27,37,47,57,67,77, 00,00,00,00,00,00,00,00
 * =============================================================================
 */
#define LSX_TRANSPOSE8x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                           _out7)                                           \
  {                                                                         \
    /* Locals carry a _m suffix so they cannot capture caller arguments  */ \
    /* with common names such as `zero`.                                 */ \
    __m128i _zero_m = { 0 };                                                \
    __m128i _shuf8_m = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };          \
    __m128i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7;                         \
                                                                            \
    _t0 = __lsx_vilvl_b(_in2, _in0);                                        \
    _t1 = __lsx_vilvl_b(_in3, _in1);                                        \
    _t2 = __lsx_vilvl_b(_in6, _in4);                                        \
    _t3 = __lsx_vilvl_b(_in7, _in5);                                        \
    _t4 = __lsx_vilvl_b(_t1, _t0);                                          \
    _t5 = __lsx_vilvh_b(_t1, _t0);                                          \
    _t6 = __lsx_vilvl_b(_t3, _t2);                                          \
    _t7 = __lsx_vilvh_b(_t3, _t2);                                          \
    _out0 = __lsx_vilvl_w(_t6, _t4);                                        \
    _out2 = __lsx_vilvh_w(_t6, _t4);                                        \
    _out4 = __lsx_vilvl_w(_t7, _t5);                                        \
    _out6 = __lsx_vilvh_w(_t7, _t5);                                        \
    _out1 = __lsx_vshuf_b(_zero_m, _out0, _shuf8_m);                        \
    _out3 = __lsx_vshuf_b(_zero_m, _out2, _shuf8_m);                        \
    _out5 = __lsx_vshuf_b(_zero_m, _out4, _shuf8_m);                        \
    _out7 = __lsx_vshuf_b(_zero_m, _out6, _shuf8_m);                        \
  }
469*fb1b10abSAndroid Build Coastguard Worker 
/*
 * =============================================================================
 * Description : Transpose 8x8 block with half-word elements in vectors
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
 *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
 *               _out7
 * Details     : Rows 4-7 and rows 0-3 are each interleaved in two half-word
 *               stages into _t0.._t3 and _t4.._t7; the outputs are then formed
 *               by picking the even (even-numbered outputs) or odd
 *               (odd-numbered outputs) double-words of those pairs.
 *               All inputs are read into temporaries before any output is
 *               written.
 * Example     :
 *              00,01,02,03,04,05,06,07           00,10,20,30,40,50,60,70
 *              10,11,12,13,14,15,16,17           01,11,21,31,41,51,61,71
 *              20,21,22,23,24,25,26,27           02,12,22,32,42,52,62,72
 *              30,31,32,33,34,35,36,37    to     03,13,23,33,43,53,63,73
 *              40,41,42,43,44,45,46,47  ======>  04,14,24,34,44,54,64,74
 *              50,51,52,53,54,55,56,57           05,15,25,35,45,55,65,75
 *              60,61,62,63,64,65,66,67           06,16,26,36,46,56,66,76
 *              70,71,72,73,74,75,76,77           07,17,27,37,47,57,67,77
 * =============================================================================
 */
#define LSX_TRANSPOSE8x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                           _out7)                                           \
  {                                                                         \
    __m128i _s0, _s1, _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7;               \
                                                                            \
    _s0 = __lsx_vilvl_h(_in6, _in4);                                        \
    _s1 = __lsx_vilvl_h(_in7, _in5);                                        \
    _t0 = __lsx_vilvl_h(_s1, _s0);                                          \
    _t1 = __lsx_vilvh_h(_s1, _s0);                                          \
    _s0 = __lsx_vilvh_h(_in6, _in4);                                        \
    _s1 = __lsx_vilvh_h(_in7, _in5);                                        \
    _t2 = __lsx_vilvl_h(_s1, _s0);                                          \
    _t3 = __lsx_vilvh_h(_s1, _s0);                                          \
    _s0 = __lsx_vilvl_h(_in2, _in0);                                        \
    _s1 = __lsx_vilvl_h(_in3, _in1);                                        \
    _t4 = __lsx_vilvl_h(_s1, _s0);                                          \
    _t5 = __lsx_vilvh_h(_s1, _s0);                                          \
    _s0 = __lsx_vilvh_h(_in2, _in0);                                        \
    _s1 = __lsx_vilvh_h(_in3, _in1);                                        \
    _t6 = __lsx_vilvl_h(_s1, _s0);                                          \
    _t7 = __lsx_vilvh_h(_s1, _s0);                                          \
                                                                            \
    _out0 = __lsx_vpickev_d(_t0, _t4);                                      \
    _out2 = __lsx_vpickev_d(_t1, _t5);                                      \
    _out4 = __lsx_vpickev_d(_t2, _t6);                                      \
    _out6 = __lsx_vpickev_d(_t3, _t7);                                      \
    _out1 = __lsx_vpickod_d(_t0, _t4);                                      \
    _out3 = __lsx_vpickod_d(_t1, _t5);                                      \
    _out5 = __lsx_vpickod_d(_t2, _t6);                                      \
    _out7 = __lsx_vpickod_d(_t3, _t7);                                      \
  }
519*fb1b10abSAndroid Build Coastguard Worker 
/*
 * =============================================================================
 * Description : Transpose input 8x4 byte block into 4x8
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
 *                         (input 8x4 byte block: 8 rows, 4 bytes used per row)
 *               Outputs - _out0, _out1, _out2, _out3  (output 4x8 byte block)
 * Details     : The rows of the matrix become columns, and the columns become
 *               rows. Only the lowest word (4 bytes) of each input row takes
 *               part in the result shown below.
 *               _out0 and _out2 are read back while computing _out1 and
 *               _out3, so the four outputs must be distinct variables.
 *               NOTE(review): the example shows zeros in the high 8 bytes of
 *               the outputs, but the final word/double-word interleaves appear
 *               to leave other transposed rows there; rely only on the low
 *               8 bytes of each output.
 * Example     : LSX_TRANSPOSE8x4_B
 *        _in0 : 00,01,02,03,00,00,00,00, 00,00,00,00,00,00,00,00
 *        _in1 : 10,11,12,13,00,00,00,00, 00,00,00,00,00,00,00,00
 *        _in2 : 20,21,22,23,00,00,00,00, 00,00,00,00,00,00,00,00
 *        _in3 : 30,31,32,33,00,00,00,00, 00,00,00,00,00,00,00,00
 *        _in4 : 40,41,42,43,00,00,00,00, 00,00,00,00,00,00,00,00
 *        _in5 : 50,51,52,53,00,00,00,00, 00,00,00,00,00,00,00,00
 *        _in6 : 60,61,62,63,00,00,00,00, 00,00,00,00,00,00,00,00
 *        _in7 : 70,71,72,73,00,00,00,00, 00,00,00,00,00,00,00,00
 *
 *       _out0 : 00,10,20,30,40,50,60,70, 00,00,00,00,00,00,00,00
 *       _out1 : 01,11,21,31,41,51,61,71, 00,00,00,00,00,00,00,00
 *       _out2 : 02,12,22,32,42,52,62,72, 00,00,00,00,00,00,00,00
 *       _out3 : 03,13,23,33,43,53,63,73, 00,00,00,00,00,00,00,00
 * =============================================================================
 */
#define LSX_TRANSPOSE8x4_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                           _out0, _out1, _out2, _out3)                     \
  {                                                                        \
    __m128i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m;                            \
                                                                           \
    _tmp0_m = __lsx_vpackev_w(_in4, _in0);                                 \
    _tmp1_m = __lsx_vpackev_w(_in5, _in1);                                 \
    _tmp2_m = __lsx_vilvl_b(_tmp1_m, _tmp0_m);                             \
    _tmp0_m = __lsx_vpackev_w(_in6, _in2);                                 \
    _tmp1_m = __lsx_vpackev_w(_in7, _in3);                                 \
                                                                           \
    _tmp3_m = __lsx_vilvl_b(_tmp1_m, _tmp0_m);                             \
    _tmp0_m = __lsx_vilvl_h(_tmp3_m, _tmp2_m);                             \
    _tmp1_m = __lsx_vilvh_h(_tmp3_m, _tmp2_m);                             \
                                                                           \
    _out0 = __lsx_vilvl_w(_tmp1_m, _tmp0_m);                               \
    _out2 = __lsx_vilvh_w(_tmp1_m, _tmp0_m);                               \
    _out1 = __lsx_vilvh_d(_out2, _out0);                                   \
    _out3 = __lsx_vilvh_d(_out0, _out2);                                   \
  }
564*fb1b10abSAndroid Build Coastguard Worker 
/*
 * =============================================================================
 * Description : Transpose 16x8 block with byte elements in vectors
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,
 *                         _in8, _in9, _in10, _in11, _in12, _in13, _in14, _in15
 *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
 *                         _out7
 * Details     : 16 input rows (the low 8 bytes of each are used) become
 *               8 output rows of 16 bytes. Rows 0-7 and rows 8-15 are
 *               interleaved separately at byte, then word granularity, and
 *               the two halves are joined with double-word interleaves.
 *               Built on the DUP2_ARG2 / DUP4_ARG2 helper macros of this file.
 * Example     :
 *              000,001,002,003,004,005,006,007
 *              008,009,010,011,012,013,014,015
 *              016,017,018,019,020,021,022,023
 *              024,025,026,027,028,029,030,031
 *              032,033,034,035,036,037,038,039
 *              040,041,042,043,044,045,046,047        000,008,...,112,120
 *              048,049,050,051,052,053,054,055        001,009,...,113,121
 *              056,057,058,059,060,061,062,063   to   002,010,...,114,122
 *              064,065,066,067,068,069,070,071 =====> 003,011,...,115,123
 *              072,073,074,075,076,077,078,079        004,012,...,116,124
 *              080,081,082,083,084,085,086,087        005,013,...,117,125
 *              088,089,090,091,092,093,094,095        006,014,...,118,126
 *              096,097,098,099,100,101,102,103        007,015,...,119,127
 *              104,105,106,107,108,109,110,111
 *              112,113,114,115,116,117,118,119
 *              120,121,122,123,124,125,126,127
 * =============================================================================
 */
#define LSX_TRANSPOSE16x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                            _in8, _in9, _in10, _in11, _in12, _in13, _in14,   \
                            _in15, _out0, _out1, _out2, _out3, _out4, _out5, \
                            _out6, _out7)                                    \
  {                                                                          \
    __m128i _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7;          \
    __m128i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7;                          \
    DUP4_ARG2(__lsx_vilvl_b, _in2, _in0, _in3, _in1, _in6, _in4, _in7, _in5, \
              _tmp0, _tmp1, _tmp2, _tmp3);                                   \
    DUP4_ARG2(__lsx_vilvl_b, _in10, _in8, _in11, _in9, _in14, _in12, _in15,  \
              _in13, _tmp4, _tmp5, _tmp6, _tmp7);                            \
    DUP2_ARG2(__lsx_vilvl_b, _tmp1, _tmp0, _tmp3, _tmp2, _t0, _t2);          \
    DUP2_ARG2(__lsx_vilvh_b, _tmp1, _tmp0, _tmp3, _tmp2, _t1, _t3);          \
    DUP2_ARG2(__lsx_vilvl_b, _tmp5, _tmp4, _tmp7, _tmp6, _t4, _t6);          \
    DUP2_ARG2(__lsx_vilvh_b, _tmp5, _tmp4, _tmp7, _tmp6, _t5, _t7);          \
    DUP2_ARG2(__lsx_vilvl_w, _t2, _t0, _t3, _t1, _tmp0, _tmp4);              \
    DUP2_ARG2(__lsx_vilvh_w, _t2, _t0, _t3, _t1, _tmp2, _tmp6);              \
    DUP2_ARG2(__lsx_vilvl_w, _t6, _t4, _t7, _t5, _tmp1, _tmp5);              \
    DUP2_ARG2(__lsx_vilvh_w, _t6, _t4, _t7, _t5, _tmp3, _tmp7);              \
    DUP2_ARG2(__lsx_vilvl_d, _tmp1, _tmp0, _tmp3, _tmp2, _out0, _out2);      \
    DUP2_ARG2(__lsx_vilvh_d, _tmp1, _tmp0, _tmp3, _tmp2, _out1, _out3);      \
    DUP2_ARG2(__lsx_vilvl_d, _tmp5, _tmp4, _tmp7, _tmp6, _out4, _out6);      \
    DUP2_ARG2(__lsx_vilvh_d, _tmp5, _tmp4, _tmp7, _tmp6, _out5, _out7);      \
  }
615*fb1b10abSAndroid Build Coastguard Worker 
/*
 * =============================================================================
 * Description : Butterfly of 4 input vectors
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3
 *               Outputs - _out0, _out1, _out2, _out3
 * Details     : Butterfly operation. The _B/_H/_W/_D variants use the byte,
 *               half-word, word and double-word add/sub intrinsics.
 *               Outputs are assigned one after another while the inputs are
 *               still being read, so an output must not be the same variable
 *               as an input that a later statement reads (for example, do not
 *               pass the input variables again as the outputs).
 * Example     :
 *               _out0 = _in0 + _in3;
 *               _out1 = _in1 + _in2;
 *               _out2 = _in1 - _in2;
 *               _out3 = _in0 - _in3;
 * =============================================================================
 */
#define LSX_BUTTERFLY_4_B(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  {                                                                           \
    _out0 = __lsx_vadd_b(_in0, _in3);                                         \
    _out1 = __lsx_vadd_b(_in1, _in2);                                         \
    _out2 = __lsx_vsub_b(_in1, _in2);                                         \
    _out3 = __lsx_vsub_b(_in0, _in3);                                         \
  }
/* Half-word butterfly of 4 vectors: sums of the outer (_in0,_in3) and inner
 * (_in1,_in2) pairs go to _out0/_out1, their differences to _out3/_out2.
 * Outputs must not alias inputs that a later statement still reads. */
#define LSX_BUTTERFLY_4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  {                                                                           \
    _out0 = __lsx_vadd_h(_in0, _in3);                                         \
    _out1 = __lsx_vadd_h(_in1, _in2);                                         \
    _out2 = __lsx_vsub_h(_in1, _in2);                                         \
    _out3 = __lsx_vsub_h(_in0, _in3);                                         \
  }
/* Word butterfly of 4 vectors: sums of the outer (_in0,_in3) and inner
 * (_in1,_in2) pairs go to _out0/_out1, their differences to _out3/_out2.
 * Outputs must not alias inputs that a later statement still reads. */
#define LSX_BUTTERFLY_4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  {                                                                           \
    _out0 = __lsx_vadd_w(_in0, _in3);                                         \
    _out1 = __lsx_vadd_w(_in1, _in2);                                         \
    _out2 = __lsx_vsub_w(_in1, _in2);                                         \
    _out3 = __lsx_vsub_w(_in0, _in3);                                         \
  }
/* Double-word butterfly of 4 vectors: sums of the outer (_in0,_in3) and inner
 * (_in1,_in2) pairs go to _out0/_out1, their differences to _out3/_out2.
 * Outputs must not alias inputs that a later statement still reads. */
#define LSX_BUTTERFLY_4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  {                                                                           \
    _out0 = __lsx_vadd_d(_in0, _in3);                                         \
    _out1 = __lsx_vadd_d(_in1, _in2);                                         \
    _out2 = __lsx_vsub_d(_in1, _in2);                                         \
    _out3 = __lsx_vsub_d(_in0, _in3);                                         \
  }
657*fb1b10abSAndroid Build Coastguard Worker 
/*
 * =============================================================================
 * Description : Butterfly of 8 input vectors
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
 *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
 *                         _out7
 * Details     : Butterfly operation on byte elements.
 *               Outputs are assigned one after another while the inputs are
 *               still being read, so an output must not be the same variable
 *               as an input that a later statement reads (for example, do not
 *               pass the input variables again as the outputs).
 * Example     :
 *              _out0 = _in0 + _in7;
 *              _out1 = _in1 + _in6;
 *              _out2 = _in2 + _in5;
 *              _out3 = _in3 + _in4;
 *              _out4 = _in3 - _in4;
 *              _out5 = _in2 - _in5;
 *              _out6 = _in1 - _in6;
 *              _out7 = _in0 - _in7;
 * =============================================================================
 */
#define LSX_BUTTERFLY_8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                          _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                          _out7)                                           \
  {                                                                        \
    _out0 = __lsx_vadd_b(_in0, _in7);                                      \
    _out1 = __lsx_vadd_b(_in1, _in6);                                      \
    _out2 = __lsx_vadd_b(_in2, _in5);                                      \
    _out3 = __lsx_vadd_b(_in3, _in4);                                      \
    _out4 = __lsx_vsub_b(_in3, _in4);                                      \
    _out5 = __lsx_vsub_b(_in2, _in5);                                      \
    _out6 = __lsx_vsub_b(_in1, _in6);                                      \
    _out7 = __lsx_vsub_b(_in0, _in7);                                      \
  }
688*fb1b10abSAndroid Build Coastguard Worker 
/*
 * 8-vector butterfly on halfword (16-bit) elements: for i in 0..3,
 *   _out<i>   = __lsx_vadd_h(_in<i>, _in<7-i>)
 *   _out<7-i> = __lsx_vsub_h(_in<i>, _in<7-i>)
 * Expands to a braced block of eight assignments executed in the order
 * _out0 .. _out7. Arguments are re-read at every use, so an output that
 * names an input still needed by a later assignment alters that result.
 */
#define LSX_BUTTERFLY_8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                          _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                          _out7)                                           \
  {                                                                        \
    _out0 = __lsx_vadd_h(_in0, _in7);                                      \
    _out1 = __lsx_vadd_h(_in1, _in6);                                      \
    _out2 = __lsx_vadd_h(_in2, _in5);                                      \
    _out3 = __lsx_vadd_h(_in3, _in4);                                      \
    _out4 = __lsx_vsub_h(_in3, _in4);                                      \
    _out5 = __lsx_vsub_h(_in2, _in5);                                      \
    _out6 = __lsx_vsub_h(_in1, _in6);                                      \
    _out7 = __lsx_vsub_h(_in0, _in7);                                      \
  }
702*fb1b10abSAndroid Build Coastguard Worker 
/*
 * 8-vector butterfly on word (32-bit) elements: for i in 0..3,
 *   _out<i>   = __lsx_vadd_w(_in<i>, _in<7-i>)
 *   _out<7-i> = __lsx_vsub_w(_in<i>, _in<7-i>)
 * Expands to a braced block of eight assignments executed in the order
 * _out0 .. _out7. Arguments are re-read at every use, so an output that
 * names an input still needed by a later assignment alters that result.
 */
#define LSX_BUTTERFLY_8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                          _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                          _out7)                                           \
  {                                                                        \
    _out0 = __lsx_vadd_w(_in0, _in7);                                      \
    _out1 = __lsx_vadd_w(_in1, _in6);                                      \
    _out2 = __lsx_vadd_w(_in2, _in5);                                      \
    _out3 = __lsx_vadd_w(_in3, _in4);                                      \
    _out4 = __lsx_vsub_w(_in3, _in4);                                      \
    _out5 = __lsx_vsub_w(_in2, _in5);                                      \
    _out6 = __lsx_vsub_w(_in1, _in6);                                      \
    _out7 = __lsx_vsub_w(_in0, _in7);                                      \
  }
716*fb1b10abSAndroid Build Coastguard Worker 
/*
 * 8-vector butterfly on doubleword (64-bit) elements: for i in 0..3,
 *   _out<i>   = __lsx_vadd_d(_in<i>, _in<7-i>)
 *   _out<7-i> = __lsx_vsub_d(_in<i>, _in<7-i>)
 * Expands to a braced block of eight assignments executed in the order
 * _out0 .. _out7. Arguments are re-read at every use, so an output that
 * names an input still needed by a later assignment alters that result.
 */
#define LSX_BUTTERFLY_8_D(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                          _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                          _out7)                                           \
  {                                                                        \
    _out0 = __lsx_vadd_d(_in0, _in7);                                      \
    _out1 = __lsx_vadd_d(_in1, _in6);                                      \
    _out2 = __lsx_vadd_d(_in2, _in5);                                      \
    _out3 = __lsx_vadd_d(_in3, _in4);                                      \
    _out4 = __lsx_vsub_d(_in3, _in4);                                      \
    _out5 = __lsx_vsub_d(_in2, _in5);                                      \
    _out6 = __lsx_vsub_d(_in1, _in6);                                      \
    _out7 = __lsx_vsub_d(_in0, _in7);                                      \
  }
730*fb1b10abSAndroid Build Coastguard Worker 
/*
 * =============================================================================
 * Description : Butterfly of 16 input vectors
 *               (LSX_BUTTERFLY_16_B / _H / _W / _D)
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, ..., _in15
 *               Outputs - _out0, _out1, _out2, _out3, ..., _out15
 * Details     : Butterfly operation
 * Example     :
 *              _out0 = _in0 + _in15;
 *              _out1 = _in1 + _in14;
 *              _out2 = _in2 + _in13;
 *              _out3 = _in3 + _in12;
 *              _out4 = _in4 + _in11;
 *              _out5 = _in5 + _in10;
 *              _out6 = _in6 + _in9;
 *              _out7 = _in7 + _in8;
 *              _out8 = _in7 - _in8;
 *              _out9 = _in6 - _in9;
 *              _out10 = _in5 - _in10;
 *              _out11 = _in4 - _in11;
 *              _out12 = _in3 - _in12;
 *              _out13 = _in2 - _in13;
 *              _out14 = _in1 - _in14;
 *              _out15 = _in0 - _in15;
 * =============================================================================
 */
756*fb1b10abSAndroid Build Coastguard Worker 
/*
 * 16-vector butterfly on byte (8-bit) elements: for i in 0..7,
 *   _out<i>    = __lsx_vadd_b(_in<i>, _in<15-i>)
 *   _out<15-i> = __lsx_vsub_b(_in<i>, _in<15-i>)
 * Expands to a braced block of sixteen assignments executed in the order
 * _out0 .. _out15. Arguments are re-read at every use, so an output that
 * names an input still needed by a later assignment alters that result.
 */
#define LSX_BUTTERFLY_16_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,     \
                           _in8, _in9, _in10, _in11, _in12, _in13, _in14,      \
                           _in15, _out0, _out1, _out2, _out3, _out4, _out5,    \
                           _out6, _out7, _out8, _out9, _out10, _out11, _out12, \
                           _out13, _out14, _out15)                             \
  {                                                                            \
    _out0 = __lsx_vadd_b(_in0, _in15);                                         \
    _out1 = __lsx_vadd_b(_in1, _in14);                                         \
    _out2 = __lsx_vadd_b(_in2, _in13);                                         \
    _out3 = __lsx_vadd_b(_in3, _in12);                                         \
    _out4 = __lsx_vadd_b(_in4, _in11);                                         \
    _out5 = __lsx_vadd_b(_in5, _in10);                                         \
    _out6 = __lsx_vadd_b(_in6, _in9);                                          \
    _out7 = __lsx_vadd_b(_in7, _in8);                                          \
                                                                               \
    _out8 = __lsx_vsub_b(_in7, _in8);                                          \
    _out9 = __lsx_vsub_b(_in6, _in9);                                          \
    _out10 = __lsx_vsub_b(_in5, _in10);                                        \
    _out11 = __lsx_vsub_b(_in4, _in11);                                        \
    _out12 = __lsx_vsub_b(_in3, _in12);                                        \
    _out13 = __lsx_vsub_b(_in2, _in13);                                        \
    _out14 = __lsx_vsub_b(_in1, _in14);                                        \
    _out15 = __lsx_vsub_b(_in0, _in15);                                        \
  }
781*fb1b10abSAndroid Build Coastguard Worker 
/*
 * 16-vector butterfly on halfword (16-bit) elements: for i in 0..7,
 *   _out<i>    = __lsx_vadd_h(_in<i>, _in<15-i>)
 *   _out<15-i> = __lsx_vsub_h(_in<i>, _in<15-i>)
 * Expands to a braced block of sixteen assignments executed in the order
 * _out0 .. _out15. Arguments are re-read at every use, so an output that
 * names an input still needed by a later assignment alters that result.
 */
#define LSX_BUTTERFLY_16_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,     \
                           _in8, _in9, _in10, _in11, _in12, _in13, _in14,      \
                           _in15, _out0, _out1, _out2, _out3, _out4, _out5,    \
                           _out6, _out7, _out8, _out9, _out10, _out11, _out12, \
                           _out13, _out14, _out15)                             \
  {                                                                            \
    _out0 = __lsx_vadd_h(_in0, _in15);                                         \
    _out1 = __lsx_vadd_h(_in1, _in14);                                         \
    _out2 = __lsx_vadd_h(_in2, _in13);                                         \
    _out3 = __lsx_vadd_h(_in3, _in12);                                         \
    _out4 = __lsx_vadd_h(_in4, _in11);                                         \
    _out5 = __lsx_vadd_h(_in5, _in10);                                         \
    _out6 = __lsx_vadd_h(_in6, _in9);                                          \
    _out7 = __lsx_vadd_h(_in7, _in8);                                          \
                                                                               \
    _out8 = __lsx_vsub_h(_in7, _in8);                                          \
    _out9 = __lsx_vsub_h(_in6, _in9);                                          \
    _out10 = __lsx_vsub_h(_in5, _in10);                                        \
    _out11 = __lsx_vsub_h(_in4, _in11);                                        \
    _out12 = __lsx_vsub_h(_in3, _in12);                                        \
    _out13 = __lsx_vsub_h(_in2, _in13);                                        \
    _out14 = __lsx_vsub_h(_in1, _in14);                                        \
    _out15 = __lsx_vsub_h(_in0, _in15);                                        \
  }
806*fb1b10abSAndroid Build Coastguard Worker 
/*
 * 16-vector butterfly on word (32-bit) elements: for i in 0..7,
 *   _out<i>    = __lsx_vadd_w(_in<i>, _in<15-i>)
 *   _out<15-i> = __lsx_vsub_w(_in<i>, _in<15-i>)
 * Expands to a braced block of sixteen assignments executed in the order
 * _out0 .. _out15. Arguments are re-read at every use, so an output that
 * names an input still needed by a later assignment alters that result.
 */
#define LSX_BUTTERFLY_16_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,     \
                           _in8, _in9, _in10, _in11, _in12, _in13, _in14,      \
                           _in15, _out0, _out1, _out2, _out3, _out4, _out5,    \
                           _out6, _out7, _out8, _out9, _out10, _out11, _out12, \
                           _out13, _out14, _out15)                             \
  {                                                                            \
    _out0 = __lsx_vadd_w(_in0, _in15);                                         \
    _out1 = __lsx_vadd_w(_in1, _in14);                                         \
    _out2 = __lsx_vadd_w(_in2, _in13);                                         \
    _out3 = __lsx_vadd_w(_in3, _in12);                                         \
    _out4 = __lsx_vadd_w(_in4, _in11);                                         \
    _out5 = __lsx_vadd_w(_in5, _in10);                                         \
    _out6 = __lsx_vadd_w(_in6, _in9);                                          \
    _out7 = __lsx_vadd_w(_in7, _in8);                                          \
                                                                               \
    _out8 = __lsx_vsub_w(_in7, _in8);                                          \
    _out9 = __lsx_vsub_w(_in6, _in9);                                          \
    _out10 = __lsx_vsub_w(_in5, _in10);                                        \
    _out11 = __lsx_vsub_w(_in4, _in11);                                        \
    _out12 = __lsx_vsub_w(_in3, _in12);                                        \
    _out13 = __lsx_vsub_w(_in2, _in13);                                        \
    _out14 = __lsx_vsub_w(_in1, _in14);                                        \
    _out15 = __lsx_vsub_w(_in0, _in15);                                        \
  }
831*fb1b10abSAndroid Build Coastguard Worker 
/*
 * 16-vector butterfly on doubleword (64-bit) elements: for i in 0..7,
 *   _out<i>    = __lsx_vadd_d(_in<i>, _in<15-i>)
 *   _out<15-i> = __lsx_vsub_d(_in<i>, _in<15-i>)
 * Expands to a braced block of sixteen assignments executed in the order
 * _out0 .. _out15. Arguments are re-read at every use, so an output that
 * names an input still needed by a later assignment alters that result.
 */
#define LSX_BUTTERFLY_16_D(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,     \
                           _in8, _in9, _in10, _in11, _in12, _in13, _in14,      \
                           _in15, _out0, _out1, _out2, _out3, _out4, _out5,    \
                           _out6, _out7, _out8, _out9, _out10, _out11, _out12, \
                           _out13, _out14, _out15)                             \
  {                                                                            \
    _out0 = __lsx_vadd_d(_in0, _in15);                                         \
    _out1 = __lsx_vadd_d(_in1, _in14);                                         \
    _out2 = __lsx_vadd_d(_in2, _in13);                                         \
    _out3 = __lsx_vadd_d(_in3, _in12);                                         \
    _out4 = __lsx_vadd_d(_in4, _in11);                                         \
    _out5 = __lsx_vadd_d(_in5, _in10);                                         \
    _out6 = __lsx_vadd_d(_in6, _in9);                                          \
    _out7 = __lsx_vadd_d(_in7, _in8);                                          \
                                                                               \
    _out8 = __lsx_vsub_d(_in7, _in8);                                          \
    _out9 = __lsx_vsub_d(_in6, _in9);                                          \
    _out10 = __lsx_vsub_d(_in5, _in10);                                        \
    _out11 = __lsx_vsub_d(_in4, _in11);                                        \
    _out12 = __lsx_vsub_d(_in3, _in12);                                        \
    _out13 = __lsx_vsub_d(_in2, _in13);                                        \
    _out14 = __lsx_vsub_d(_in1, _in14);                                        \
    _out15 = __lsx_vsub_d(_in0, _in15);                                        \
  }
856*fb1b10abSAndroid Build Coastguard Worker 
857*fb1b10abSAndroid Build Coastguard Worker #endif  // LSX
858*fb1b10abSAndroid Build Coastguard Worker 
859*fb1b10abSAndroid Build Coastguard Worker #ifdef __loongarch_asx
860*fb1b10abSAndroid Build Coastguard Worker #include <lasxintrin.h>
861*fb1b10abSAndroid Build Coastguard Worker /*
862*fb1b10abSAndroid Build Coastguard Worker  * =============================================================================
863*fb1b10abSAndroid Build Coastguard Worker  * Description : Dot product of byte vector elements
864*fb1b10abSAndroid Build Coastguard Worker  * Arguments   : Inputs - in_h, in_l
865*fb1b10abSAndroid Build Coastguard Worker  *               Output - out
866*fb1b10abSAndroid Build Coastguard Worker  *               Return Type - signed halfword
867*fb1b10abSAndroid Build Coastguard Worker  * Details     : Unsigned byte elements from in_h are multiplied with
868*fb1b10abSAndroid Build Coastguard Worker  *               unsigned byte elements from in_l producing a result
869*fb1b10abSAndroid Build Coastguard Worker  *               twice the size of input i.e. signed halfword.
870*fb1b10abSAndroid Build Coastguard Worker  *               Then these multiplied results of adjacent odd-even elements
871*fb1b10abSAndroid Build Coastguard Worker  *               are added to the out vector
872*fb1b10abSAndroid Build Coastguard Worker  * Example     : See out = __lasx_xvdp2_w_h(in_h, in_l)
873*fb1b10abSAndroid Build Coastguard Worker  * =============================================================================
874*fb1b10abSAndroid Build Coastguard Worker  */
__lasx_xvdp2_h_bu(__m256i in_h,__m256i in_l)875*fb1b10abSAndroid Build Coastguard Worker static inline __m256i __lasx_xvdp2_h_bu(__m256i in_h, __m256i in_l) {
876*fb1b10abSAndroid Build Coastguard Worker   __m256i out;
877*fb1b10abSAndroid Build Coastguard Worker 
878*fb1b10abSAndroid Build Coastguard Worker   out = __lasx_xvmulwev_h_bu(in_h, in_l);
879*fb1b10abSAndroid Build Coastguard Worker   out = __lasx_xvmaddwod_h_bu(out, in_h, in_l);
880*fb1b10abSAndroid Build Coastguard Worker   return out;
881*fb1b10abSAndroid Build Coastguard Worker }
882*fb1b10abSAndroid Build Coastguard Worker 
883*fb1b10abSAndroid Build Coastguard Worker /*
884*fb1b10abSAndroid Build Coastguard Worker  * =============================================================================
885*fb1b10abSAndroid Build Coastguard Worker  * Description : Dot product of byte vector elements
886*fb1b10abSAndroid Build Coastguard Worker  * Arguments   : Inputs - in_h, in_l
887*fb1b10abSAndroid Build Coastguard Worker  *               Output - out
888*fb1b10abSAndroid Build Coastguard Worker  *               Return Type - signed halfword
889*fb1b10abSAndroid Build Coastguard Worker  * Details     : Signed byte elements from in_h are multiplied with
890*fb1b10abSAndroid Build Coastguard Worker  *               signed byte elements from in_l producing a result
891*fb1b10abSAndroid Build Coastguard Worker  *               twice the size of input i.e. signed halfword.
892*fb1b10abSAndroid Build Coastguard Worker  *               Then these multiplication results of adjacent odd-even elements
893*fb1b10abSAndroid Build Coastguard Worker  *               are added to the out vector
894*fb1b10abSAndroid Build Coastguard Worker  * Example     : See out = __lasx_xvdp2_w_h(in_h, in_l)
895*fb1b10abSAndroid Build Coastguard Worker  * =============================================================================
896*fb1b10abSAndroid Build Coastguard Worker  */
__lasx_xvdp2_h_b(__m256i in_h,__m256i in_l)897*fb1b10abSAndroid Build Coastguard Worker static inline __m256i __lasx_xvdp2_h_b(__m256i in_h, __m256i in_l) {
898*fb1b10abSAndroid Build Coastguard Worker   __m256i out;
899*fb1b10abSAndroid Build Coastguard Worker 
900*fb1b10abSAndroid Build Coastguard Worker   out = __lasx_xvmulwev_h_b(in_h, in_l);
901*fb1b10abSAndroid Build Coastguard Worker   out = __lasx_xvmaddwod_h_b(out, in_h, in_l);
902*fb1b10abSAndroid Build Coastguard Worker   return out;
903*fb1b10abSAndroid Build Coastguard Worker }
904*fb1b10abSAndroid Build Coastguard Worker 
905*fb1b10abSAndroid Build Coastguard Worker /*
906*fb1b10abSAndroid Build Coastguard Worker  * =============================================================================
907*fb1b10abSAndroid Build Coastguard Worker  * Description : Dot product of halfword vector elements
908*fb1b10abSAndroid Build Coastguard Worker  * Arguments   : Inputs - in_h, in_l
909*fb1b10abSAndroid Build Coastguard Worker  *               Output - out
910*fb1b10abSAndroid Build Coastguard Worker  *               Return Type - signed word
911*fb1b10abSAndroid Build Coastguard Worker  * Details     : Signed halfword elements from in_h are multiplied with
912*fb1b10abSAndroid Build Coastguard Worker  *               signed halfword elements from in_l producing a result
913*fb1b10abSAndroid Build Coastguard Worker  *               twice the size of input i.e. signed word.
914*fb1b10abSAndroid Build Coastguard Worker  *               Then these multiplied results of adjacent odd-even elements
915*fb1b10abSAndroid Build Coastguard Worker  *               are added to the out vector.
916*fb1b10abSAndroid Build Coastguard Worker  * Example     : out = __lasx_xvdp2_w_h(in_h, in_l)
917*fb1b10abSAndroid Build Coastguard Worker  *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
918*fb1b10abSAndroid Build Coastguard Worker  *        in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
919*fb1b10abSAndroid Build Coastguard Worker  *         out : 22,38,38,22, 22,38,38,22
920*fb1b10abSAndroid Build Coastguard Worker  * =============================================================================
921*fb1b10abSAndroid Build Coastguard Worker  */
__lasx_xvdp2_w_h(__m256i in_h,__m256i in_l)922*fb1b10abSAndroid Build Coastguard Worker static inline __m256i __lasx_xvdp2_w_h(__m256i in_h, __m256i in_l) {
923*fb1b10abSAndroid Build Coastguard Worker   __m256i out;
924*fb1b10abSAndroid Build Coastguard Worker 
925*fb1b10abSAndroid Build Coastguard Worker   out = __lasx_xvmulwev_w_h(in_h, in_l);
926*fb1b10abSAndroid Build Coastguard Worker   out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
927*fb1b10abSAndroid Build Coastguard Worker   return out;
928*fb1b10abSAndroid Build Coastguard Worker }
929*fb1b10abSAndroid Build Coastguard Worker 
930*fb1b10abSAndroid Build Coastguard Worker /*
931*fb1b10abSAndroid Build Coastguard Worker  * =============================================================================
932*fb1b10abSAndroid Build Coastguard Worker  * Description : Dot product of word vector elements
933*fb1b10abSAndroid Build Coastguard Worker  * Arguments   : Inputs - in_h, in_l
934*fb1b10abSAndroid Build Coastguard Worker  *               Output - out
935*fb1b10abSAndroid Build Coastguard Worker  *               Return Type - signed double
936*fb1b10abSAndroid Build Coastguard Worker  * Details     : Signed word elements from in_h are multiplied with
937*fb1b10abSAndroid Build Coastguard Worker  *               signed word elements from in_l producing a result
938*fb1b10abSAndroid Build Coastguard Worker  *               twice the size of input i.e. signed double-word.
939*fb1b10abSAndroid Build Coastguard Worker  *               Then these multiplied results of adjacent odd-even elements
940*fb1b10abSAndroid Build Coastguard Worker  *               are added to the out vector.
941*fb1b10abSAndroid Build Coastguard Worker  * Example     : See out = __lasx_xvdp2_w_h(in_h, in_l)
942*fb1b10abSAndroid Build Coastguard Worker  * =============================================================================
943*fb1b10abSAndroid Build Coastguard Worker  */
__lasx_xvdp2_d_w(__m256i in_h,__m256i in_l)944*fb1b10abSAndroid Build Coastguard Worker static inline __m256i __lasx_xvdp2_d_w(__m256i in_h, __m256i in_l) {
945*fb1b10abSAndroid Build Coastguard Worker   __m256i out;
946*fb1b10abSAndroid Build Coastguard Worker 
947*fb1b10abSAndroid Build Coastguard Worker   out = __lasx_xvmulwev_d_w(in_h, in_l);
948*fb1b10abSAndroid Build Coastguard Worker   out = __lasx_xvmaddwod_d_w(out, in_h, in_l);
949*fb1b10abSAndroid Build Coastguard Worker   return out;
950*fb1b10abSAndroid Build Coastguard Worker }
951*fb1b10abSAndroid Build Coastguard Worker 
952*fb1b10abSAndroid Build Coastguard Worker /*
953*fb1b10abSAndroid Build Coastguard Worker  * =============================================================================
954*fb1b10abSAndroid Build Coastguard Worker  * Description : Dot product of halfword vector elements
955*fb1b10abSAndroid Build Coastguard Worker  * Arguments   : Inputs - in_h, in_l
956*fb1b10abSAndroid Build Coastguard Worker  *               Output - out
957*fb1b10abSAndroid Build Coastguard Worker  *               Return Type - signed word
958*fb1b10abSAndroid Build Coastguard Worker  * Details     : Unsigned halfword elements from in_h are multiplied with
959*fb1b10abSAndroid Build Coastguard Worker  *               signed halfword elements from in_l producing a result
960*fb1b10abSAndroid Build Coastguard Worker  *               twice the size of input i.e. unsigned word.
961*fb1b10abSAndroid Build Coastguard Worker  *               Multiplication result of adjacent odd-even elements
962*fb1b10abSAndroid Build Coastguard Worker  *               are added to the out vector
963*fb1b10abSAndroid Build Coastguard Worker  * Example     : See out = __lasx_xvdp2_w_h(in_h, in_l)
964*fb1b10abSAndroid Build Coastguard Worker  * =============================================================================
965*fb1b10abSAndroid Build Coastguard Worker  */
__lasx_xvdp2_w_hu_h(__m256i in_h,__m256i in_l)966*fb1b10abSAndroid Build Coastguard Worker static inline __m256i __lasx_xvdp2_w_hu_h(__m256i in_h, __m256i in_l) {
967*fb1b10abSAndroid Build Coastguard Worker   __m256i out;
968*fb1b10abSAndroid Build Coastguard Worker 
969*fb1b10abSAndroid Build Coastguard Worker   out = __lasx_xvmulwev_w_hu_h(in_h, in_l);
970*fb1b10abSAndroid Build Coastguard Worker   out = __lasx_xvmaddwod_w_hu_h(out, in_h, in_l);
971*fb1b10abSAndroid Build Coastguard Worker   return out;
972*fb1b10abSAndroid Build Coastguard Worker }
973*fb1b10abSAndroid Build Coastguard Worker 
974*fb1b10abSAndroid Build Coastguard Worker /*
975*fb1b10abSAndroid Build Coastguard Worker  * =============================================================================
976*fb1b10abSAndroid Build Coastguard Worker  * Description : Dot product & addition of byte vector elements
977*fb1b10abSAndroid Build Coastguard Worker  * Arguments   : Inputs - in_h, in_l
978*fb1b10abSAndroid Build Coastguard Worker  *               Output - out
979*fb1b10abSAndroid Build Coastguard Worker  *               Return Type - halfword
980*fb1b10abSAndroid Build Coastguard Worker  * Details     : Signed byte elements from in_h are multiplied with
981*fb1b10abSAndroid Build Coastguard Worker  *               signed byte elements from in_l producing a result
982*fb1b10abSAndroid Build Coastguard Worker  *               twice the size of input i.e. signed halfword.
983*fb1b10abSAndroid Build Coastguard Worker  *               Then these multiplied results of adjacent odd-even elements
984*fb1b10abSAndroid Build Coastguard Worker  *               are added to the in_c vector.
985*fb1b10abSAndroid Build Coastguard Worker  * Example     : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
986*fb1b10abSAndroid Build Coastguard Worker  * =============================================================================
987*fb1b10abSAndroid Build Coastguard Worker  */
__lasx_xvdp2add_h_b(__m256i in_c,__m256i in_h,__m256i in_l)988*fb1b10abSAndroid Build Coastguard Worker static inline __m256i __lasx_xvdp2add_h_b(__m256i in_c, __m256i in_h,
989*fb1b10abSAndroid Build Coastguard Worker                                           __m256i in_l) {
990*fb1b10abSAndroid Build Coastguard Worker   __m256i out;
991*fb1b10abSAndroid Build Coastguard Worker 
992*fb1b10abSAndroid Build Coastguard Worker   out = __lasx_xvmaddwev_h_b(in_c, in_h, in_l);
993*fb1b10abSAndroid Build Coastguard Worker   out = __lasx_xvmaddwod_h_b(out, in_h, in_l);
994*fb1b10abSAndroid Build Coastguard Worker   return out;
995*fb1b10abSAndroid Build Coastguard Worker }
996*fb1b10abSAndroid Build Coastguard Worker 
997*fb1b10abSAndroid Build Coastguard Worker /*
998*fb1b10abSAndroid Build Coastguard Worker  * =============================================================================
999*fb1b10abSAndroid Build Coastguard Worker  * Description : Dot product & addition of byte vector elements
1000*fb1b10abSAndroid Build Coastguard Worker  * Arguments   : Inputs - in_h, in_l
1001*fb1b10abSAndroid Build Coastguard Worker  *               Output - out
1002*fb1b10abSAndroid Build Coastguard Worker  *               Return Type - halfword
1003*fb1b10abSAndroid Build Coastguard Worker  * Details     : Unsigned byte elements from in_h are multiplied with
1004*fb1b10abSAndroid Build Coastguard Worker  *               unsigned byte elements from in_l producing a result
1005*fb1b10abSAndroid Build Coastguard Worker  *               twice the size of input i.e. signed halfword.
1006*fb1b10abSAndroid Build Coastguard Worker  *               Then these multiplied results of adjacent odd-even elements
1007*fb1b10abSAndroid Build Coastguard Worker  *               are added to the in_c vector.
1008*fb1b10abSAndroid Build Coastguard Worker  * Example     : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
1009*fb1b10abSAndroid Build Coastguard Worker  * =============================================================================
1010*fb1b10abSAndroid Build Coastguard Worker  */
__lasx_xvdp2add_h_bu(__m256i in_c,__m256i in_h,__m256i in_l)1011*fb1b10abSAndroid Build Coastguard Worker static inline __m256i __lasx_xvdp2add_h_bu(__m256i in_c, __m256i in_h,
1012*fb1b10abSAndroid Build Coastguard Worker                                            __m256i in_l) {
1013*fb1b10abSAndroid Build Coastguard Worker   __m256i out;
1014*fb1b10abSAndroid Build Coastguard Worker 
1015*fb1b10abSAndroid Build Coastguard Worker   out = __lasx_xvmaddwev_h_bu(in_c, in_h, in_l);
1016*fb1b10abSAndroid Build Coastguard Worker   out = __lasx_xvmaddwod_h_bu(out, in_h, in_l);
1017*fb1b10abSAndroid Build Coastguard Worker   return out;
1018*fb1b10abSAndroid Build Coastguard Worker }
1019*fb1b10abSAndroid Build Coastguard Worker 
1020*fb1b10abSAndroid Build Coastguard Worker /*
1021*fb1b10abSAndroid Build Coastguard Worker  * =============================================================================
1022*fb1b10abSAndroid Build Coastguard Worker  * Description : Dot product & addition of byte vector elements
1023*fb1b10abSAndroid Build Coastguard Worker  * Arguments   : Inputs - in_h, in_l
1024*fb1b10abSAndroid Build Coastguard Worker  *               Output - out
1025*fb1b10abSAndroid Build Coastguard Worker  *               Return Type - halfword
1026*fb1b10abSAndroid Build Coastguard Worker  * Details     : Unsigned byte elements from in_h are multiplied with
1027*fb1b10abSAndroid Build Coastguard Worker  *               signed byte elements from in_l producing a result
1028*fb1b10abSAndroid Build Coastguard Worker  *               twice the size of input i.e. signed halfword.
1029*fb1b10abSAndroid Build Coastguard Worker  *               Then these multiplied results of adjacent odd-even elements
1030*fb1b10abSAndroid Build Coastguard Worker  *               are added to the in_c vector.
1031*fb1b10abSAndroid Build Coastguard Worker  * Example     : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
1032*fb1b10abSAndroid Build Coastguard Worker  * =============================================================================
1033*fb1b10abSAndroid Build Coastguard Worker  */
__lasx_xvdp2add_h_bu_b(__m256i in_c,__m256i in_h,__m256i in_l)1034*fb1b10abSAndroid Build Coastguard Worker static inline __m256i __lasx_xvdp2add_h_bu_b(__m256i in_c, __m256i in_h,
1035*fb1b10abSAndroid Build Coastguard Worker                                              __m256i in_l) {
1036*fb1b10abSAndroid Build Coastguard Worker   __m256i out;
1037*fb1b10abSAndroid Build Coastguard Worker 
1038*fb1b10abSAndroid Build Coastguard Worker   out = __lasx_xvmaddwev_h_bu_b(in_c, in_h, in_l);
1039*fb1b10abSAndroid Build Coastguard Worker   out = __lasx_xvmaddwod_h_bu_b(out, in_h, in_l);
1040*fb1b10abSAndroid Build Coastguard Worker   return out;
1041*fb1b10abSAndroid Build Coastguard Worker }
1042*fb1b10abSAndroid Build Coastguard Worker 
1043*fb1b10abSAndroid Build Coastguard Worker /*
1044*fb1b10abSAndroid Build Coastguard Worker  * =============================================================================
1045*fb1b10abSAndroid Build Coastguard Worker  * Description : Dot product of halfword vector elements
1046*fb1b10abSAndroid Build Coastguard Worker  * Arguments   : Inputs - in_c, in_h, in_l
1047*fb1b10abSAndroid Build Coastguard Worker  *               Output - out
1048*fb1b10abSAndroid Build Coastguard Worker  *               Return Type - per RTYPE
1049*fb1b10abSAndroid Build Coastguard Worker  * Details     : Signed halfword elements from in_h are multiplied with
1050*fb1b10abSAndroid Build Coastguard Worker  *               signed halfword elements from in_l producing a result
1051*fb1b10abSAndroid Build Coastguard Worker  *               twice the size of input i.e. signed word.
1052*fb1b10abSAndroid Build Coastguard Worker  *               Multiplication result of adjacent odd-even elements
1053*fb1b10abSAndroid Build Coastguard Worker  *               are added to the in_c vector.
1054*fb1b10abSAndroid Build Coastguard Worker  * Example     : out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
1055*fb1b10abSAndroid Build Coastguard Worker  *        in_c : 1,2,3,4, 1,2,3,4
1056*fb1b10abSAndroid Build Coastguard Worker  *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8,
1057*fb1b10abSAndroid Build Coastguard Worker  *        in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1,
1058*fb1b10abSAndroid Build Coastguard Worker  *         out : 23,40,41,26, 23,40,41,26
1059*fb1b10abSAndroid Build Coastguard Worker  * =============================================================================
1060*fb1b10abSAndroid Build Coastguard Worker  */
__lasx_xvdp2add_w_h(__m256i in_c,__m256i in_h,__m256i in_l)1061*fb1b10abSAndroid Build Coastguard Worker static inline __m256i __lasx_xvdp2add_w_h(__m256i in_c, __m256i in_h,
1062*fb1b10abSAndroid Build Coastguard Worker                                           __m256i in_l) {
1063*fb1b10abSAndroid Build Coastguard Worker   __m256i out;
1064*fb1b10abSAndroid Build Coastguard Worker 
1065*fb1b10abSAndroid Build Coastguard Worker   out = __lasx_xvmaddwev_w_h(in_c, in_h, in_l);
1066*fb1b10abSAndroid Build Coastguard Worker   out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
1067*fb1b10abSAndroid Build Coastguard Worker   return out;
1068*fb1b10abSAndroid Build Coastguard Worker }
1069*fb1b10abSAndroid Build Coastguard Worker 
1070*fb1b10abSAndroid Build Coastguard Worker /*
1071*fb1b10abSAndroid Build Coastguard Worker  * =============================================================================
1072*fb1b10abSAndroid Build Coastguard Worker  * Description : Dot product of halfword vector elements
1073*fb1b10abSAndroid Build Coastguard Worker  * Arguments   : Inputs - in_c, in_h, in_l
1074*fb1b10abSAndroid Build Coastguard Worker  *               Output - out
1075*fb1b10abSAndroid Build Coastguard Worker  *               Return Type - signed word
1076*fb1b10abSAndroid Build Coastguard Worker  * Details     : Unsigned halfword elements from in_h are multiplied with
1077*fb1b10abSAndroid Build Coastguard Worker  *               unsigned halfword elements from in_l producing a result
1078*fb1b10abSAndroid Build Coastguard Worker  *               twice the size of input i.e. signed word.
1079*fb1b10abSAndroid Build Coastguard Worker  *               Multiplication result of adjacent odd-even elements
1080*fb1b10abSAndroid Build Coastguard Worker  *               are added to the in_c vector.
1081*fb1b10abSAndroid Build Coastguard Worker  * Example     : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
1082*fb1b10abSAndroid Build Coastguard Worker  * =============================================================================
1083*fb1b10abSAndroid Build Coastguard Worker  */
__lasx_xvdp2add_w_hu(__m256i in_c,__m256i in_h,__m256i in_l)1084*fb1b10abSAndroid Build Coastguard Worker static inline __m256i __lasx_xvdp2add_w_hu(__m256i in_c, __m256i in_h,
1085*fb1b10abSAndroid Build Coastguard Worker                                            __m256i in_l) {
1086*fb1b10abSAndroid Build Coastguard Worker   __m256i out;
1087*fb1b10abSAndroid Build Coastguard Worker 
1088*fb1b10abSAndroid Build Coastguard Worker   out = __lasx_xvmaddwev_w_hu(in_c, in_h, in_l);
1089*fb1b10abSAndroid Build Coastguard Worker   out = __lasx_xvmaddwod_w_hu(out, in_h, in_l);
1090*fb1b10abSAndroid Build Coastguard Worker   return out;
1091*fb1b10abSAndroid Build Coastguard Worker }
1092*fb1b10abSAndroid Build Coastguard Worker 
1093*fb1b10abSAndroid Build Coastguard Worker /*
1094*fb1b10abSAndroid Build Coastguard Worker  * =============================================================================
1095*fb1b10abSAndroid Build Coastguard Worker  * Description : Dot product of halfword vector elements
1096*fb1b10abSAndroid Build Coastguard Worker  * Arguments   : Inputs - in_c, in_h, in_l
1097*fb1b10abSAndroid Build Coastguard Worker  *               Output - out
1098*fb1b10abSAndroid Build Coastguard Worker  *               Return Type - signed word
1099*fb1b10abSAndroid Build Coastguard Worker  * Details     : Unsigned halfword elements from in_h are multiplied with
1100*fb1b10abSAndroid Build Coastguard Worker  *               signed halfword elements from in_l producing a result
1101*fb1b10abSAndroid Build Coastguard Worker  *               twice the size of input i.e. signed word.
1102*fb1b10abSAndroid Build Coastguard Worker  *               Multiplication result of adjacent odd-even elements
1103*fb1b10abSAndroid Build Coastguard Worker  *               are added to the in_c vector
1104*fb1b10abSAndroid Build Coastguard Worker  * Example     : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
1105*fb1b10abSAndroid Build Coastguard Worker  * =============================================================================
1106*fb1b10abSAndroid Build Coastguard Worker  */
__lasx_xvdp2add_w_hu_h(__m256i in_c,__m256i in_h,__m256i in_l)1107*fb1b10abSAndroid Build Coastguard Worker static inline __m256i __lasx_xvdp2add_w_hu_h(__m256i in_c, __m256i in_h,
1108*fb1b10abSAndroid Build Coastguard Worker                                              __m256i in_l) {
1109*fb1b10abSAndroid Build Coastguard Worker   __m256i out;
1110*fb1b10abSAndroid Build Coastguard Worker 
1111*fb1b10abSAndroid Build Coastguard Worker   out = __lasx_xvmaddwev_w_hu_h(in_c, in_h, in_l);
1112*fb1b10abSAndroid Build Coastguard Worker   out = __lasx_xvmaddwod_w_hu_h(out, in_h, in_l);
1113*fb1b10abSAndroid Build Coastguard Worker   return out;
1114*fb1b10abSAndroid Build Coastguard Worker }
1115*fb1b10abSAndroid Build Coastguard Worker 
1116*fb1b10abSAndroid Build Coastguard Worker /*
1117*fb1b10abSAndroid Build Coastguard Worker  * =============================================================================
1118*fb1b10abSAndroid Build Coastguard Worker  * Description : Vector Unsigned Dot Product and Subtract
1119*fb1b10abSAndroid Build Coastguard Worker  * Arguments   : Inputs - in_c, in_h, in_l
1120*fb1b10abSAndroid Build Coastguard Worker  *               Output - out
1121*fb1b10abSAndroid Build Coastguard Worker  *               Return Type - signed halfword
1122*fb1b10abSAndroid Build Coastguard Worker  * Details     : Unsigned byte elements from in_h are multiplied with
1123*fb1b10abSAndroid Build Coastguard Worker  *               unsigned byte elements from in_l producing a result
1124*fb1b10abSAndroid Build Coastguard Worker  *               twice the size of input i.e. signed halfword.
1125*fb1b10abSAndroid Build Coastguard Worker  *               Multiplication result of adjacent odd-even elements
1126*fb1b10abSAndroid Build Coastguard Worker  *               are added together and subtracted from double width elements
1127*fb1b10abSAndroid Build Coastguard Worker  *               in_c vector.
1128*fb1b10abSAndroid Build Coastguard Worker  * Example     : See out = __lasx_xvdp2sub_w_h(in_c, in_h, in_l)
1129*fb1b10abSAndroid Build Coastguard Worker  * =============================================================================
1130*fb1b10abSAndroid Build Coastguard Worker  */
__lasx_xvdp2sub_h_bu(__m256i in_c,__m256i in_h,__m256i in_l)1131*fb1b10abSAndroid Build Coastguard Worker static inline __m256i __lasx_xvdp2sub_h_bu(__m256i in_c, __m256i in_h,
1132*fb1b10abSAndroid Build Coastguard Worker                                            __m256i in_l) {
1133*fb1b10abSAndroid Build Coastguard Worker   __m256i out;
1134*fb1b10abSAndroid Build Coastguard Worker 
1135*fb1b10abSAndroid Build Coastguard Worker   out = __lasx_xvmulwev_h_bu(in_h, in_l);
1136*fb1b10abSAndroid Build Coastguard Worker   out = __lasx_xvmaddwod_h_bu(out, in_h, in_l);
1137*fb1b10abSAndroid Build Coastguard Worker   out = __lasx_xvsub_h(in_c, out);
1138*fb1b10abSAndroid Build Coastguard Worker   return out;
1139*fb1b10abSAndroid Build Coastguard Worker }
1140*fb1b10abSAndroid Build Coastguard Worker 
1141*fb1b10abSAndroid Build Coastguard Worker /*
1142*fb1b10abSAndroid Build Coastguard Worker  * =============================================================================
1143*fb1b10abSAndroid Build Coastguard Worker  * Description : Vector Signed Dot Product and Subtract
1144*fb1b10abSAndroid Build Coastguard Worker  * Arguments   : Inputs - in_c, in_h, in_l
1145*fb1b10abSAndroid Build Coastguard Worker  *               Output - out
1146*fb1b10abSAndroid Build Coastguard Worker  *               Return Type - signed word
1147*fb1b10abSAndroid Build Coastguard Worker  * Details     : Signed halfword elements from in_h are multiplied with
1148*fb1b10abSAndroid Build Coastguard Worker  *               Signed halfword elements from in_l producing a result
1149*fb1b10abSAndroid Build Coastguard Worker  *               twice the size of input i.e. signed word.
1150*fb1b10abSAndroid Build Coastguard Worker  *               Multiplication result of adjacent odd-even elements
1151*fb1b10abSAndroid Build Coastguard Worker  *               are added together and subtracted from double width elements
1152*fb1b10abSAndroid Build Coastguard Worker  *               in_c vector.
1153*fb1b10abSAndroid Build Coastguard Worker  * Example     : out = __lasx_xvdp2sub_w_h(in_c, in_h, in_l)
1154*fb1b10abSAndroid Build Coastguard Worker  *        in_c : 0,0,0,0, 0,0,0,0
1155*fb1b10abSAndroid Build Coastguard Worker  *        in_h : 3,1,3,0, 0,0,0,1, 0,0,1,1, 0,0,0,1
1156*fb1b10abSAndroid Build Coastguard Worker  *        in_l : 2,1,1,0, 1,0,0,0, 0,0,1,0, 1,0,0,1
1157*fb1b10abSAndroid Build Coastguard Worker  *         out : -7,-3,0,0, 0,-1,0,-1
1158*fb1b10abSAndroid Build Coastguard Worker  * =============================================================================
1159*fb1b10abSAndroid Build Coastguard Worker  */
__lasx_xvdp2sub_w_h(__m256i in_c,__m256i in_h,__m256i in_l)1160*fb1b10abSAndroid Build Coastguard Worker static inline __m256i __lasx_xvdp2sub_w_h(__m256i in_c, __m256i in_h,
1161*fb1b10abSAndroid Build Coastguard Worker                                           __m256i in_l) {
1162*fb1b10abSAndroid Build Coastguard Worker   __m256i out;
1163*fb1b10abSAndroid Build Coastguard Worker 
1164*fb1b10abSAndroid Build Coastguard Worker   out = __lasx_xvmulwev_w_h(in_h, in_l);
1165*fb1b10abSAndroid Build Coastguard Worker   out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
1166*fb1b10abSAndroid Build Coastguard Worker   out = __lasx_xvsub_w(in_c, out);
1167*fb1b10abSAndroid Build Coastguard Worker   return out;
1168*fb1b10abSAndroid Build Coastguard Worker }
1169*fb1b10abSAndroid Build Coastguard Worker 
1170*fb1b10abSAndroid Build Coastguard Worker /*
1171*fb1b10abSAndroid Build Coastguard Worker  * =============================================================================
1172*fb1b10abSAndroid Build Coastguard Worker  * Description : Dot product of halfword vector elements
1173*fb1b10abSAndroid Build Coastguard Worker  * Arguments   : Inputs - in_h, in_l
1174*fb1b10abSAndroid Build Coastguard Worker  *               Output - out
1175*fb1b10abSAndroid Build Coastguard Worker  *               Return Type - signed word
1176*fb1b10abSAndroid Build Coastguard Worker  * Details     : Signed halfword elements from in_h are multiplied with
1177*fb1b10abSAndroid Build Coastguard Worker  *               signed halfword elements from in_l producing a result
1178*fb1b10abSAndroid Build Coastguard Worker  *               four times the size of input i.e. signed doubleword.
1179*fb1b10abSAndroid Build Coastguard Worker  *               Then these multiplication results of four adjacent elements
1180*fb1b10abSAndroid Build Coastguard Worker  *               are added together and stored to the out vector.
1181*fb1b10abSAndroid Build Coastguard Worker  * Example     : out = __lasx_xvdp4_d_h(in_h, in_l)
1182*fb1b10abSAndroid Build Coastguard Worker  *        in_h :  3,1,3,0, 0,0,0,1, 0,0,1,-1, 0,0,0,1
1183*fb1b10abSAndroid Build Coastguard Worker  *        in_l : -2,1,1,0, 1,0,0,0, 0,0,1, 0, 1,0,0,1
1184*fb1b10abSAndroid Build Coastguard Worker  *         out : -2,0,1,1
1185*fb1b10abSAndroid Build Coastguard Worker  * =============================================================================
1186*fb1b10abSAndroid Build Coastguard Worker  */
__lasx_xvdp4_d_h(__m256i in_h,__m256i in_l)1187*fb1b10abSAndroid Build Coastguard Worker static inline __m256i __lasx_xvdp4_d_h(__m256i in_h, __m256i in_l) {
1188*fb1b10abSAndroid Build Coastguard Worker   __m256i out;
1189*fb1b10abSAndroid Build Coastguard Worker 
1190*fb1b10abSAndroid Build Coastguard Worker   out = __lasx_xvmulwev_w_h(in_h, in_l);
1191*fb1b10abSAndroid Build Coastguard Worker   out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
1192*fb1b10abSAndroid Build Coastguard Worker   out = __lasx_xvhaddw_d_w(out, out);
1193*fb1b10abSAndroid Build Coastguard Worker   return out;
1194*fb1b10abSAndroid Build Coastguard Worker }
1195*fb1b10abSAndroid Build Coastguard Worker 
1196*fb1b10abSAndroid Build Coastguard Worker /*
1197*fb1b10abSAndroid Build Coastguard Worker  * =============================================================================
1198*fb1b10abSAndroid Build Coastguard Worker  * Description : The high half of the vector elements are expanded and
1199*fb1b10abSAndroid Build Coastguard Worker  *               added after being doubled.
1200*fb1b10abSAndroid Build Coastguard Worker  * Arguments   : Inputs - in_h, in_l
1201*fb1b10abSAndroid Build Coastguard Worker  *               Output - out
1202*fb1b10abSAndroid Build Coastguard Worker  * Details     : The in_h vector and the in_l vector are added after the
1203*fb1b10abSAndroid Build Coastguard Worker  *               higher half of the two-fold sign extension (signed byte
1204*fb1b10abSAndroid Build Coastguard Worker  *               to signed halfword) and stored to the out vector.
1205*fb1b10abSAndroid Build Coastguard Worker  * Example     : See out = __lasx_xvaddwh_w_h(in_h, in_l)
1206*fb1b10abSAndroid Build Coastguard Worker  * =============================================================================
1207*fb1b10abSAndroid Build Coastguard Worker  */
__lasx_xvaddwh_h_b(__m256i in_h,__m256i in_l)1208*fb1b10abSAndroid Build Coastguard Worker static inline __m256i __lasx_xvaddwh_h_b(__m256i in_h, __m256i in_l) {
1209*fb1b10abSAndroid Build Coastguard Worker   __m256i out;
1210*fb1b10abSAndroid Build Coastguard Worker 
1211*fb1b10abSAndroid Build Coastguard Worker   out = __lasx_xvilvh_b(in_h, in_l);
1212*fb1b10abSAndroid Build Coastguard Worker   out = __lasx_xvhaddw_h_b(out, out);
1213*fb1b10abSAndroid Build Coastguard Worker   return out;
1214*fb1b10abSAndroid Build Coastguard Worker }
1215*fb1b10abSAndroid Build Coastguard Worker 
1216*fb1b10abSAndroid Build Coastguard Worker /*
1217*fb1b10abSAndroid Build Coastguard Worker  * =============================================================================
1218*fb1b10abSAndroid Build Coastguard Worker  * Description : The high half of the vector elements are expanded and
1219*fb1b10abSAndroid Build Coastguard Worker  *               added after being doubled.
1220*fb1b10abSAndroid Build Coastguard Worker  * Arguments   : Inputs - in_h, in_l
1221*fb1b10abSAndroid Build Coastguard Worker  *               Output - out
1222*fb1b10abSAndroid Build Coastguard Worker  * Details     : The in_h vector and the in_l vector are added after the
1223*fb1b10abSAndroid Build Coastguard Worker  *               higher half of the two-fold sign extension (signed halfword
1224*fb1b10abSAndroid Build Coastguard Worker  *               to signed word) and stored to the out vector.
1225*fb1b10abSAndroid Build Coastguard Worker  * Example     : out = __lasx_xvaddwh_w_h(in_h, in_l)
1226*fb1b10abSAndroid Build Coastguard Worker  *        in_h : 3, 0,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1
1227*fb1b10abSAndroid Build Coastguard Worker  *        in_l : 2,-1,1,2, 1,0,0, 0, 1,0,1, 0, 1,0,0,1
1228*fb1b10abSAndroid Build Coastguard Worker  *         out : 1,0,0,-1, 1,0,0, 2
1229*fb1b10abSAndroid Build Coastguard Worker  * =============================================================================
1230*fb1b10abSAndroid Build Coastguard Worker  */
__lasx_xvaddwh_w_h(__m256i in_h,__m256i in_l)1231*fb1b10abSAndroid Build Coastguard Worker static inline __m256i __lasx_xvaddwh_w_h(__m256i in_h, __m256i in_l) {
1232*fb1b10abSAndroid Build Coastguard Worker   __m256i out;
1233*fb1b10abSAndroid Build Coastguard Worker 
1234*fb1b10abSAndroid Build Coastguard Worker   out = __lasx_xvilvh_h(in_h, in_l);
1235*fb1b10abSAndroid Build Coastguard Worker   out = __lasx_xvhaddw_w_h(out, out);
1236*fb1b10abSAndroid Build Coastguard Worker   return out;
1237*fb1b10abSAndroid Build Coastguard Worker }
1238*fb1b10abSAndroid Build Coastguard Worker 
1239*fb1b10abSAndroid Build Coastguard Worker /*
1240*fb1b10abSAndroid Build Coastguard Worker  * =============================================================================
1241*fb1b10abSAndroid Build Coastguard Worker  * Description : The low half of the vector elements are expanded and
1242*fb1b10abSAndroid Build Coastguard Worker  *               added after being doubled.
1243*fb1b10abSAndroid Build Coastguard Worker  * Arguments   : Inputs - in_h, in_l
1244*fb1b10abSAndroid Build Coastguard Worker  *               Output - out
1245*fb1b10abSAndroid Build Coastguard Worker  * Details     : The in_h vector and the in_l vector are added after the
1246*fb1b10abSAndroid Build Coastguard Worker  *               lower half of the two-fold sign extension (signed byte
1247*fb1b10abSAndroid Build Coastguard Worker  *               to signed halfword) and stored to the out vector.
1248*fb1b10abSAndroid Build Coastguard Worker  * Example     : See out = __lasx_xvaddwl_w_h(in_h, in_l)
1249*fb1b10abSAndroid Build Coastguard Worker  * =============================================================================
1250*fb1b10abSAndroid Build Coastguard Worker  */
__lasx_xvaddwl_h_b(__m256i in_h,__m256i in_l)1251*fb1b10abSAndroid Build Coastguard Worker static inline __m256i __lasx_xvaddwl_h_b(__m256i in_h, __m256i in_l) {
1252*fb1b10abSAndroid Build Coastguard Worker   __m256i out;
1253*fb1b10abSAndroid Build Coastguard Worker 
1254*fb1b10abSAndroid Build Coastguard Worker   out = __lasx_xvilvl_b(in_h, in_l);
1255*fb1b10abSAndroid Build Coastguard Worker   out = __lasx_xvhaddw_h_b(out, out);
1256*fb1b10abSAndroid Build Coastguard Worker   return out;
1257*fb1b10abSAndroid Build Coastguard Worker }
1258*fb1b10abSAndroid Build Coastguard Worker 
1259*fb1b10abSAndroid Build Coastguard Worker /*
1260*fb1b10abSAndroid Build Coastguard Worker  * =============================================================================
1261*fb1b10abSAndroid Build Coastguard Worker  * Description : The low half of the vector elements are expanded and
1262*fb1b10abSAndroid Build Coastguard Worker  *               added after being doubled.
1263*fb1b10abSAndroid Build Coastguard Worker  * Arguments   : Inputs - in_h, in_l
1264*fb1b10abSAndroid Build Coastguard Worker  *               Output - out
1265*fb1b10abSAndroid Build Coastguard Worker  * Details     : The in_h vector and the in_l vector are added after the
1266*fb1b10abSAndroid Build Coastguard Worker  *               lower half of the two-fold sign extension (signed halfword
1267*fb1b10abSAndroid Build Coastguard Worker  *               to signed word) and stored to the out vector.
1268*fb1b10abSAndroid Build Coastguard Worker  * Example     : out = __lasx_xvaddwl_w_h(in_h, in_l)
1269*fb1b10abSAndroid Build Coastguard Worker  *        in_h : 3, 0,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1
1270*fb1b10abSAndroid Build Coastguard Worker  *        in_l : 2,-1,1,2, 1,0,0, 0, 1,0,1, 0, 1,0,0,1
1271*fb1b10abSAndroid Build Coastguard Worker  *         out : 5,-1,4,2, 1,0,2,-1
1272*fb1b10abSAndroid Build Coastguard Worker  * =============================================================================
1273*fb1b10abSAndroid Build Coastguard Worker  */
__lasx_xvaddwl_w_h(__m256i in_h,__m256i in_l)1274*fb1b10abSAndroid Build Coastguard Worker static inline __m256i __lasx_xvaddwl_w_h(__m256i in_h, __m256i in_l) {
1275*fb1b10abSAndroid Build Coastguard Worker   __m256i out;
1276*fb1b10abSAndroid Build Coastguard Worker 
1277*fb1b10abSAndroid Build Coastguard Worker   out = __lasx_xvilvl_h(in_h, in_l);
1278*fb1b10abSAndroid Build Coastguard Worker   out = __lasx_xvhaddw_w_h(out, out);
1279*fb1b10abSAndroid Build Coastguard Worker   return out;
1280*fb1b10abSAndroid Build Coastguard Worker }
1281*fb1b10abSAndroid Build Coastguard Worker 
1282*fb1b10abSAndroid Build Coastguard Worker /*
1283*fb1b10abSAndroid Build Coastguard Worker  * =============================================================================
1284*fb1b10abSAndroid Build Coastguard Worker  * Description : The low half of the vector elements are expanded and
1285*fb1b10abSAndroid Build Coastguard Worker  *               added after being doubled.
1286*fb1b10abSAndroid Build Coastguard Worker  * Arguments   : Inputs - in_h, in_l
1287*fb1b10abSAndroid Build Coastguard Worker  *               Output - out
1288*fb1b10abSAndroid Build Coastguard Worker  * Details     : The out vector and the out vector are added after the
1289*fb1b10abSAndroid Build Coastguard Worker  *               lower half of the two-fold zero extension (unsigned byte
1290*fb1b10abSAndroid Build Coastguard Worker  *               to unsigned halfword) and stored to the out vector.
1291*fb1b10abSAndroid Build Coastguard Worker  * Example     : See out = __lasx_xvaddwl_w_h(in_h, in_l)
1292*fb1b10abSAndroid Build Coastguard Worker  * =============================================================================
1293*fb1b10abSAndroid Build Coastguard Worker  */
__lasx_xvaddwl_h_bu(__m256i in_h,__m256i in_l)1294*fb1b10abSAndroid Build Coastguard Worker static inline __m256i __lasx_xvaddwl_h_bu(__m256i in_h, __m256i in_l) {
1295*fb1b10abSAndroid Build Coastguard Worker   __m256i out;
1296*fb1b10abSAndroid Build Coastguard Worker 
1297*fb1b10abSAndroid Build Coastguard Worker   out = __lasx_xvilvl_b(in_h, in_l);
1298*fb1b10abSAndroid Build Coastguard Worker   out = __lasx_xvhaddw_hu_bu(out, out);
1299*fb1b10abSAndroid Build Coastguard Worker   return out;
1300*fb1b10abSAndroid Build Coastguard Worker }
1301*fb1b10abSAndroid Build Coastguard Worker 
1302*fb1b10abSAndroid Build Coastguard Worker /*
1303*fb1b10abSAndroid Build Coastguard Worker  * =============================================================================
1304*fb1b10abSAndroid Build Coastguard Worker  * Description : The low half of the vector elements are expanded and
1305*fb1b10abSAndroid Build Coastguard Worker  *               added after being doubled.
1306*fb1b10abSAndroid Build Coastguard Worker  * Arguments   : Inputs - in_h, in_l
1307*fb1b10abSAndroid Build Coastguard Worker  *               Output - out
1308*fb1b10abSAndroid Build Coastguard Worker  * Details     : The in_l vector after double zero extension (unsigned byte to
1309*fb1b10abSAndroid Build Coastguard Worker  *               signed halfword),added to the in_h vector.
1310*fb1b10abSAndroid Build Coastguard Worker  * Example     : See out = __lasx_xvaddw_w_w_h(in_h, in_l)
1311*fb1b10abSAndroid Build Coastguard Worker  * =============================================================================
1312*fb1b10abSAndroid Build Coastguard Worker  */
__lasx_xvaddw_h_h_bu(__m256i in_h,__m256i in_l)1313*fb1b10abSAndroid Build Coastguard Worker static inline __m256i __lasx_xvaddw_h_h_bu(__m256i in_h, __m256i in_l) {
1314*fb1b10abSAndroid Build Coastguard Worker   __m256i out;
1315*fb1b10abSAndroid Build Coastguard Worker 
1316*fb1b10abSAndroid Build Coastguard Worker   out = __lasx_xvsllwil_hu_bu(in_l, 0);
1317*fb1b10abSAndroid Build Coastguard Worker   out = __lasx_xvadd_h(in_h, out);
1318*fb1b10abSAndroid Build Coastguard Worker   return out;
1319*fb1b10abSAndroid Build Coastguard Worker }
1320*fb1b10abSAndroid Build Coastguard Worker 
1321*fb1b10abSAndroid Build Coastguard Worker /*
1322*fb1b10abSAndroid Build Coastguard Worker  * =============================================================================
1323*fb1b10abSAndroid Build Coastguard Worker  * Description : The low half of the vector elements are expanded and
1324*fb1b10abSAndroid Build Coastguard Worker  *               added after being doubled.
1325*fb1b10abSAndroid Build Coastguard Worker  * Arguments   : Inputs - in_h, in_l
1326*fb1b10abSAndroid Build Coastguard Worker  *               Output - out
1327*fb1b10abSAndroid Build Coastguard Worker  * Details     : The in_l vector after double sign extension (signed halfword to
1328*fb1b10abSAndroid Build Coastguard Worker  *               signed word), added to the in_h vector.
1329*fb1b10abSAndroid Build Coastguard Worker  * Example     : out = __lasx_xvaddw_w_w_h(in_h, in_l)
1330*fb1b10abSAndroid Build Coastguard Worker  *        in_h : 0, 1,0,0, -1,0,0,1,
1331*fb1b10abSAndroid Build Coastguard Worker  *        in_l : 2,-1,1,2,  1,0,0,0, 0,0,1,0, 1,0,0,1,
1332*fb1b10abSAndroid Build Coastguard Worker  *         out : 2, 0,1,2, -1,0,1,1,
1333*fb1b10abSAndroid Build Coastguard Worker  * =============================================================================
1334*fb1b10abSAndroid Build Coastguard Worker  */
__lasx_xvaddw_w_w_h(__m256i in_h,__m256i in_l)1335*fb1b10abSAndroid Build Coastguard Worker static inline __m256i __lasx_xvaddw_w_w_h(__m256i in_h, __m256i in_l) {
1336*fb1b10abSAndroid Build Coastguard Worker   __m256i out;
1337*fb1b10abSAndroid Build Coastguard Worker 
1338*fb1b10abSAndroid Build Coastguard Worker   out = __lasx_xvsllwil_w_h(in_l, 0);
1339*fb1b10abSAndroid Build Coastguard Worker   out = __lasx_xvadd_w(in_h, out);
1340*fb1b10abSAndroid Build Coastguard Worker   return out;
1341*fb1b10abSAndroid Build Coastguard Worker }
1342*fb1b10abSAndroid Build Coastguard Worker 
1343*fb1b10abSAndroid Build Coastguard Worker /*
1344*fb1b10abSAndroid Build Coastguard Worker  * =============================================================================
1345*fb1b10abSAndroid Build Coastguard Worker  * Description : Multiplication and addition calculation after expansion
1346*fb1b10abSAndroid Build Coastguard Worker  *               of the lower half of the vector.
1347*fb1b10abSAndroid Build Coastguard Worker  * Arguments   : Inputs - in_c, in_h, in_l
1348*fb1b10abSAndroid Build Coastguard Worker  *               Output - out
1349*fb1b10abSAndroid Build Coastguard Worker  * Details     : The in_h vector and the in_l vector are multiplied after
1350*fb1b10abSAndroid Build Coastguard Worker  *               the lower half of the two-fold sign extension (signed halfword
1351*fb1b10abSAndroid Build Coastguard Worker  *               to signed word), and the result is added to the vector in_c,
1352*fb1b10abSAndroid Build Coastguard Worker  *               then stored to the out vector.
1353*fb1b10abSAndroid Build Coastguard Worker  * Example     : out = __lasx_xvmaddwl_w_h(in_c, in_h, in_l)
1354*fb1b10abSAndroid Build Coastguard Worker  *        in_c : 1,2,3,4, 5,6,7,8
1355*fb1b10abSAndroid Build Coastguard Worker  *        in_h : 1,2,3,4, 1,2,3,4, 5,6,7,8, 5,6,7,8
1356*fb1b10abSAndroid Build Coastguard Worker  *        in_l : 200, 300, 400, 500,  2000, 3000, 4000, 5000,
1357*fb1b10abSAndroid Build Coastguard Worker  *              -200,-300,-400,-500, -2000,-3000,-4000,-5000
1358*fb1b10abSAndroid Build Coastguard Worker  *         out : 201, 602,1203,2004, -995, -1794,-2793,-3992
1359*fb1b10abSAndroid Build Coastguard Worker  * =============================================================================
1360*fb1b10abSAndroid Build Coastguard Worker  */
__lasx_xvmaddwl_w_h(__m256i in_c,__m256i in_h,__m256i in_l)1361*fb1b10abSAndroid Build Coastguard Worker static inline __m256i __lasx_xvmaddwl_w_h(__m256i in_c, __m256i in_h,
1362*fb1b10abSAndroid Build Coastguard Worker                                           __m256i in_l) {
1363*fb1b10abSAndroid Build Coastguard Worker   __m256i tmp0, tmp1, out;
1364*fb1b10abSAndroid Build Coastguard Worker 
1365*fb1b10abSAndroid Build Coastguard Worker   tmp0 = __lasx_xvsllwil_w_h(in_h, 0);
1366*fb1b10abSAndroid Build Coastguard Worker   tmp1 = __lasx_xvsllwil_w_h(in_l, 0);
1367*fb1b10abSAndroid Build Coastguard Worker   tmp0 = __lasx_xvmul_w(tmp0, tmp1);
1368*fb1b10abSAndroid Build Coastguard Worker   out = __lasx_xvadd_w(tmp0, in_c);
1369*fb1b10abSAndroid Build Coastguard Worker   return out;
1370*fb1b10abSAndroid Build Coastguard Worker }
1371*fb1b10abSAndroid Build Coastguard Worker 
1372*fb1b10abSAndroid Build Coastguard Worker /*
1373*fb1b10abSAndroid Build Coastguard Worker  * =============================================================================
1374*fb1b10abSAndroid Build Coastguard Worker  * Description : Multiplication and addition calculation after expansion
1375*fb1b10abSAndroid Build Coastguard Worker  *               of the higher half of the vector.
1376*fb1b10abSAndroid Build Coastguard Worker  * Arguments   : Inputs - in_c, in_h, in_l
1377*fb1b10abSAndroid Build Coastguard Worker  *               Output - out
1378*fb1b10abSAndroid Build Coastguard Worker  * Details     : The in_h vector and the in_l vector are multiplied after
1379*fb1b10abSAndroid Build Coastguard Worker  *               the higher half of the two-fold sign extension (signed
1380*fb1b10abSAndroid Build Coastguard Worker  *               halfword to signed word), and the result is added to
1381*fb1b10abSAndroid Build Coastguard Worker  *               the vector in_c, then stored to the out vector.
1382*fb1b10abSAndroid Build Coastguard Worker  * Example     : See out = __lasx_xvmaddwl_w_h(in_c, in_h, in_l)
1383*fb1b10abSAndroid Build Coastguard Worker  * =============================================================================
1384*fb1b10abSAndroid Build Coastguard Worker  */
__lasx_xvmaddwh_w_h(__m256i in_c,__m256i in_h,__m256i in_l)1385*fb1b10abSAndroid Build Coastguard Worker static inline __m256i __lasx_xvmaddwh_w_h(__m256i in_c, __m256i in_h,
1386*fb1b10abSAndroid Build Coastguard Worker                                           __m256i in_l) {
1387*fb1b10abSAndroid Build Coastguard Worker   __m256i tmp0, tmp1, out;
1388*fb1b10abSAndroid Build Coastguard Worker 
1389*fb1b10abSAndroid Build Coastguard Worker   tmp0 = __lasx_xvilvh_h(in_h, in_h);
1390*fb1b10abSAndroid Build Coastguard Worker   tmp1 = __lasx_xvilvh_h(in_l, in_l);
1391*fb1b10abSAndroid Build Coastguard Worker   tmp0 = __lasx_xvmulwev_w_h(tmp0, tmp1);
1392*fb1b10abSAndroid Build Coastguard Worker   out = __lasx_xvadd_w(tmp0, in_c);
1393*fb1b10abSAndroid Build Coastguard Worker   return out;
1394*fb1b10abSAndroid Build Coastguard Worker }
1395*fb1b10abSAndroid Build Coastguard Worker 
1396*fb1b10abSAndroid Build Coastguard Worker /*
1397*fb1b10abSAndroid Build Coastguard Worker  * =============================================================================
1398*fb1b10abSAndroid Build Coastguard Worker  * Description : Multiplication calculation after expansion of the lower
1399*fb1b10abSAndroid Build Coastguard Worker  *               half of the vector.
1400*fb1b10abSAndroid Build Coastguard Worker  * Arguments   : Inputs - in_h, in_l
1401*fb1b10abSAndroid Build Coastguard Worker  *               Output - out
1402*fb1b10abSAndroid Build Coastguard Worker  * Details     : The in_h vector and the in_l vector are multiplied after
1403*fb1b10abSAndroid Build Coastguard Worker  *               the lower half of the two-fold sign extension (signed
1404*fb1b10abSAndroid Build Coastguard Worker  *               halfword to signed word), then stored to the out vector.
1405*fb1b10abSAndroid Build Coastguard Worker  * Example     : out = __lasx_xvmulwl_w_h(in_h, in_l)
1406*fb1b10abSAndroid Build Coastguard Worker  *        in_h : 3,-1,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1
1407*fb1b10abSAndroid Build Coastguard Worker  *        in_l : 2,-1,1,2, 1,0,0, 0, 0,0,1, 0, 1,0,0,1
1408*fb1b10abSAndroid Build Coastguard Worker  *         out : 6,1,3,0, 0,0,1,0
1409*fb1b10abSAndroid Build Coastguard Worker  * =============================================================================
1410*fb1b10abSAndroid Build Coastguard Worker  */
__lasx_xvmulwl_w_h(__m256i in_h,__m256i in_l)1411*fb1b10abSAndroid Build Coastguard Worker static inline __m256i __lasx_xvmulwl_w_h(__m256i in_h, __m256i in_l) {
1412*fb1b10abSAndroid Build Coastguard Worker   __m256i tmp0, tmp1, out;
1413*fb1b10abSAndroid Build Coastguard Worker 
1414*fb1b10abSAndroid Build Coastguard Worker   tmp0 = __lasx_xvsllwil_w_h(in_h, 0);
1415*fb1b10abSAndroid Build Coastguard Worker   tmp1 = __lasx_xvsllwil_w_h(in_l, 0);
1416*fb1b10abSAndroid Build Coastguard Worker   out = __lasx_xvmul_w(tmp0, tmp1);
1417*fb1b10abSAndroid Build Coastguard Worker   return out;
1418*fb1b10abSAndroid Build Coastguard Worker }
1419*fb1b10abSAndroid Build Coastguard Worker 
1420*fb1b10abSAndroid Build Coastguard Worker /*
1421*fb1b10abSAndroid Build Coastguard Worker  * =============================================================================
1422*fb1b10abSAndroid Build Coastguard Worker  * Description : Multiplication calculation after expansion of the lower
1423*fb1b10abSAndroid Build Coastguard Worker  *               half of the vector.
1424*fb1b10abSAndroid Build Coastguard Worker  * Arguments   : Inputs - in_h, in_l
1425*fb1b10abSAndroid Build Coastguard Worker  *               Output - out
1426*fb1b10abSAndroid Build Coastguard Worker  * Details     : The in_h vector and the in_l vector are multiplied after
1427*fb1b10abSAndroid Build Coastguard Worker  *               the lower half of the two-fold sign extension (signed
1428*fb1b10abSAndroid Build Coastguard Worker  *               halfword to signed word), then stored to the out vector.
1429*fb1b10abSAndroid Build Coastguard Worker  * Example     : out = __lasx_xvmulwh_w_h(in_h, in_l)
1430*fb1b10abSAndroid Build Coastguard Worker  *        in_h : 3,-1,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1
1431*fb1b10abSAndroid Build Coastguard Worker  *        in_l : 2,-1,1,2, 1,0,0, 0, 0,0,1, 0, 1,0,0,1
1432*fb1b10abSAndroid Build Coastguard Worker  *         out : 0,0,0,0, 0,0,0,1
1433*fb1b10abSAndroid Build Coastguard Worker  * =============================================================================
1434*fb1b10abSAndroid Build Coastguard Worker  */
__lasx_xvmulwh_w_h(__m256i in_h,__m256i in_l)1435*fb1b10abSAndroid Build Coastguard Worker static inline __m256i __lasx_xvmulwh_w_h(__m256i in_h, __m256i in_l) {
1436*fb1b10abSAndroid Build Coastguard Worker   __m256i tmp0, tmp1, out;
1437*fb1b10abSAndroid Build Coastguard Worker 
1438*fb1b10abSAndroid Build Coastguard Worker   tmp0 = __lasx_xvilvh_h(in_h, in_h);
1439*fb1b10abSAndroid Build Coastguard Worker   tmp1 = __lasx_xvilvh_h(in_l, in_l);
1440*fb1b10abSAndroid Build Coastguard Worker   out = __lasx_xvmulwev_w_h(tmp0, tmp1);
1441*fb1b10abSAndroid Build Coastguard Worker   return out;
1442*fb1b10abSAndroid Build Coastguard Worker }
1443*fb1b10abSAndroid Build Coastguard Worker 
1444*fb1b10abSAndroid Build Coastguard Worker /*
1445*fb1b10abSAndroid Build Coastguard Worker  * =============================================================================
1446*fb1b10abSAndroid Build Coastguard Worker  * Description : The low half of the vector elements are added to the high half
1447*fb1b10abSAndroid Build Coastguard Worker  *               after being doubled, then saturated.
1448*fb1b10abSAndroid Build Coastguard Worker  * Arguments   : Inputs - in_h, in_l
1449*fb1b10abSAndroid Build Coastguard Worker  *               Output - out
1450*fb1b10abSAndroid Build Coastguard Worker  * Details     : The in_h vector adds the in_l vector after the lower half of
1451*fb1b10abSAndroid Build Coastguard Worker  *               the two-fold zero extension (unsigned byte to unsigned
1452*fb1b10abSAndroid Build Coastguard Worker  *               halfword) and then saturated. The results are stored to the out
1453*fb1b10abSAndroid Build Coastguard Worker  *               vector.
1454*fb1b10abSAndroid Build Coastguard Worker  * Example     : out = __lasx_xvsaddw_hu_hu_bu(in_h, in_l)
1455*fb1b10abSAndroid Build Coastguard Worker  *        in_h : 2,65532,1,2, 1,0,0,0, 0,0,1,0, 1,0,0,1
1456*fb1b10abSAndroid Build Coastguard Worker  *        in_l : 3,6,3,0, 0,0,0,1, 0,0,1,1, 0,0,0,1, 3,18,3,0, 0,0,0,1, 0,0,1,1,
1457*fb1b10abSAndroid Build Coastguard Worker  *               0,0,0,1
1458*fb1b10abSAndroid Build Coastguard Worker  *        out  : 5,65535,4,2, 1,0,0,1, 3,18,4,0, 1,0,0,2,
1459*fb1b10abSAndroid Build Coastguard Worker  * =============================================================================
1460*fb1b10abSAndroid Build Coastguard Worker  */
__lasx_xvsaddw_hu_hu_bu(__m256i in_h,__m256i in_l)1461*fb1b10abSAndroid Build Coastguard Worker static inline __m256i __lasx_xvsaddw_hu_hu_bu(__m256i in_h, __m256i in_l) {
1462*fb1b10abSAndroid Build Coastguard Worker   __m256i tmp1, out;
1463*fb1b10abSAndroid Build Coastguard Worker   __m256i zero = { 0 };
1464*fb1b10abSAndroid Build Coastguard Worker 
1465*fb1b10abSAndroid Build Coastguard Worker   tmp1 = __lasx_xvilvl_b(zero, in_l);
1466*fb1b10abSAndroid Build Coastguard Worker   out = __lasx_xvsadd_hu(in_h, tmp1);
1467*fb1b10abSAndroid Build Coastguard Worker   return out;
1468*fb1b10abSAndroid Build Coastguard Worker }
1469*fb1b10abSAndroid Build Coastguard Worker 
1470*fb1b10abSAndroid Build Coastguard Worker /*
1471*fb1b10abSAndroid Build Coastguard Worker  * =============================================================================
1472*fb1b10abSAndroid Build Coastguard Worker  * Description : Clip all halfword elements of input vector between min & max
1473*fb1b10abSAndroid Build Coastguard Worker  *               out = ((in) < (min)) ? (min) : (((in) > (max)) ? (max) : (in))
1474*fb1b10abSAndroid Build Coastguard Worker  * Arguments   : Inputs  - in    (input vector)
1475*fb1b10abSAndroid Build Coastguard Worker  *                       - min   (min threshold)
1476*fb1b10abSAndroid Build Coastguard Worker  *                       - max   (max threshold)
1477*fb1b10abSAndroid Build Coastguard Worker  *               Outputs - in    (output vector with clipped elements)
1478*fb1b10abSAndroid Build Coastguard Worker  *               Return Type - signed halfword
1479*fb1b10abSAndroid Build Coastguard Worker  * Example     : out = __lasx_xvclip_h(in, min, max)
1480*fb1b10abSAndroid Build Coastguard Worker  *          in : -8,2,280,249, -8,255,280,249, 4,4,4,4, 5,5,5,5
1481*fb1b10abSAndroid Build Coastguard Worker  *         min : 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1
1482*fb1b10abSAndroid Build Coastguard Worker  *         max : 9,9,9,9, 9,9,9,9, 9,9,9,9, 9,9,9,9
1483*fb1b10abSAndroid Build Coastguard Worker  *         out : 1,2,9,9, 1,9,9,9, 4,4,4,4, 5,5,5,5
1484*fb1b10abSAndroid Build Coastguard Worker  * =============================================================================
1485*fb1b10abSAndroid Build Coastguard Worker  */
__lasx_xvclip_h(__m256i in,__m256i min,__m256i max)1486*fb1b10abSAndroid Build Coastguard Worker static inline __m256i __lasx_xvclip_h(__m256i in, __m256i min, __m256i max) {
1487*fb1b10abSAndroid Build Coastguard Worker   __m256i out;
1488*fb1b10abSAndroid Build Coastguard Worker 
1489*fb1b10abSAndroid Build Coastguard Worker   out = __lasx_xvmax_h(min, in);
1490*fb1b10abSAndroid Build Coastguard Worker   out = __lasx_xvmin_h(max, out);
1491*fb1b10abSAndroid Build Coastguard Worker   return out;
1492*fb1b10abSAndroid Build Coastguard Worker }
1493*fb1b10abSAndroid Build Coastguard Worker 
1494*fb1b10abSAndroid Build Coastguard Worker /*
1495*fb1b10abSAndroid Build Coastguard Worker  * =============================================================================
1496*fb1b10abSAndroid Build Coastguard Worker  * Description : Clip all signed halfword elements of input vector
1497*fb1b10abSAndroid Build Coastguard Worker  *               between 0 & 255
1498*fb1b10abSAndroid Build Coastguard Worker  * Arguments   : Inputs  - in   (input vector)
1499*fb1b10abSAndroid Build Coastguard Worker  *               Outputs - out  (output vector with clipped elements)
1500*fb1b10abSAndroid Build Coastguard Worker  *               Return Type - signed halfword
1501*fb1b10abSAndroid Build Coastguard Worker  * Example     : See out = __lasx_xvclip255_w(in)
1502*fb1b10abSAndroid Build Coastguard Worker  * =============================================================================
1503*fb1b10abSAndroid Build Coastguard Worker  */
__lasx_xvclip255_h(__m256i in)1504*fb1b10abSAndroid Build Coastguard Worker static inline __m256i __lasx_xvclip255_h(__m256i in) {
1505*fb1b10abSAndroid Build Coastguard Worker   __m256i out;
1506*fb1b10abSAndroid Build Coastguard Worker 
1507*fb1b10abSAndroid Build Coastguard Worker   out = __lasx_xvmaxi_h(in, 0);
1508*fb1b10abSAndroid Build Coastguard Worker   out = __lasx_xvsat_hu(out, 7);
1509*fb1b10abSAndroid Build Coastguard Worker   return out;
1510*fb1b10abSAndroid Build Coastguard Worker }
1511*fb1b10abSAndroid Build Coastguard Worker 
1512*fb1b10abSAndroid Build Coastguard Worker /*
1513*fb1b10abSAndroid Build Coastguard Worker  * =============================================================================
1514*fb1b10abSAndroid Build Coastguard Worker  * Description : Clip all signed word elements of input vector
1515*fb1b10abSAndroid Build Coastguard Worker  *               between 0 & 255
1516*fb1b10abSAndroid Build Coastguard Worker  * Arguments   : Inputs - in   (input vector)
1517*fb1b10abSAndroid Build Coastguard Worker  *               Output - out  (output vector with clipped elements)
1518*fb1b10abSAndroid Build Coastguard Worker  *               Return Type - signed word
1519*fb1b10abSAndroid Build Coastguard Worker  * Example     : out = __lasx_xvclip255_w(in)
1520*fb1b10abSAndroid Build Coastguard Worker  *          in : -8,255,280,249, -8,255,280,249
1521*fb1b10abSAndroid Build Coastguard Worker  *         out :  0,255,255,249,  0,255,255,249
1522*fb1b10abSAndroid Build Coastguard Worker  * =============================================================================
1523*fb1b10abSAndroid Build Coastguard Worker  */
__lasx_xvclip255_w(__m256i in)1524*fb1b10abSAndroid Build Coastguard Worker static inline __m256i __lasx_xvclip255_w(__m256i in) {
1525*fb1b10abSAndroid Build Coastguard Worker   __m256i out;
1526*fb1b10abSAndroid Build Coastguard Worker 
1527*fb1b10abSAndroid Build Coastguard Worker   out = __lasx_xvmaxi_w(in, 0);
1528*fb1b10abSAndroid Build Coastguard Worker   out = __lasx_xvsat_wu(out, 7);
1529*fb1b10abSAndroid Build Coastguard Worker   return out;
1530*fb1b10abSAndroid Build Coastguard Worker }
1531*fb1b10abSAndroid Build Coastguard Worker 
1532*fb1b10abSAndroid Build Coastguard Worker /*
1533*fb1b10abSAndroid Build Coastguard Worker  * =============================================================================
1534*fb1b10abSAndroid Build Coastguard Worker  * Description : Indexed halfword element values are replicated to all
1535*fb1b10abSAndroid Build Coastguard Worker  *               elements in output vector. If 'idx < 8' use xvsplati_l_*,
1536*fb1b10abSAndroid Build Coastguard Worker  *               if 'idx >= 8' use xvsplati_h_*.
1537*fb1b10abSAndroid Build Coastguard Worker  * Arguments   : Inputs - in, idx
1538*fb1b10abSAndroid Build Coastguard Worker  *               Output - out
1539*fb1b10abSAndroid Build Coastguard Worker  * Details     : Idx element value from in vector is replicated to all
1540*fb1b10abSAndroid Build Coastguard Worker  *               elements in out vector.
1541*fb1b10abSAndroid Build Coastguard Worker  *               Valid index range for halfword operation is 0-7
1542*fb1b10abSAndroid Build Coastguard Worker  * Example     : out = __lasx_xvsplati_l_h(in, idx)
1543*fb1b10abSAndroid Build Coastguard Worker  *          in : 20,10,11,12, 13,14,15,16, 0,0,2,0, 0,0,0,0
1544*fb1b10abSAndroid Build Coastguard Worker  *         idx : 0x02
1545*fb1b10abSAndroid Build Coastguard Worker  *         out : 11,11,11,11, 11,11,11,11, 11,11,11,11, 11,11,11,11
1546*fb1b10abSAndroid Build Coastguard Worker  * =============================================================================
1547*fb1b10abSAndroid Build Coastguard Worker  */
__lasx_xvsplati_l_h(__m256i in,int idx)1548*fb1b10abSAndroid Build Coastguard Worker static inline __m256i __lasx_xvsplati_l_h(__m256i in, int idx) {
1549*fb1b10abSAndroid Build Coastguard Worker   __m256i out;
1550*fb1b10abSAndroid Build Coastguard Worker 
1551*fb1b10abSAndroid Build Coastguard Worker   out = __lasx_xvpermi_q(in, in, 0x02);
1552*fb1b10abSAndroid Build Coastguard Worker   out = __lasx_xvreplve_h(out, idx);
1553*fb1b10abSAndroid Build Coastguard Worker   return out;
1554*fb1b10abSAndroid Build Coastguard Worker }
1555*fb1b10abSAndroid Build Coastguard Worker 
1556*fb1b10abSAndroid Build Coastguard Worker /*
1557*fb1b10abSAndroid Build Coastguard Worker  * =============================================================================
1558*fb1b10abSAndroid Build Coastguard Worker  * Description : Indexed halfword element values are replicated to all
1559*fb1b10abSAndroid Build Coastguard Worker  *               elements in output vector. If 'idx < 8' use xvsplati_l_*,
1560*fb1b10abSAndroid Build Coastguard Worker  *               if 'idx >= 8' use xvsplati_h_*.
1561*fb1b10abSAndroid Build Coastguard Worker  * Arguments   : Inputs - in, idx
1562*fb1b10abSAndroid Build Coastguard Worker  *               Output - out
1563*fb1b10abSAndroid Build Coastguard Worker  * Details     : Idx element value from in vector is replicated to all
1564*fb1b10abSAndroid Build Coastguard Worker  *               elements in out vector.
1565*fb1b10abSAndroid Build Coastguard Worker  *               Valid index range for halfword operation is 0-7
1566*fb1b10abSAndroid Build Coastguard Worker  * Example     : out = __lasx_xvsplati_h_h(in, idx)
1567*fb1b10abSAndroid Build Coastguard Worker  *          in : 20,10,11,12, 13,14,15,16, 0,2,0,0, 0,0,0,0
1568*fb1b10abSAndroid Build Coastguard Worker  *         idx : 0x09
1569*fb1b10abSAndroid Build Coastguard Worker  *         out : 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2
1570*fb1b10abSAndroid Build Coastguard Worker  * =============================================================================
1571*fb1b10abSAndroid Build Coastguard Worker  */
__lasx_xvsplati_h_h(__m256i in,int idx)1572*fb1b10abSAndroid Build Coastguard Worker static inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx) {
1573*fb1b10abSAndroid Build Coastguard Worker   __m256i out;
1574*fb1b10abSAndroid Build Coastguard Worker 
1575*fb1b10abSAndroid Build Coastguard Worker   out = __lasx_xvpermi_q(in, in, 0x13);
1576*fb1b10abSAndroid Build Coastguard Worker   out = __lasx_xvreplve_h(out, idx);
1577*fb1b10abSAndroid Build Coastguard Worker   return out;
1578*fb1b10abSAndroid Build Coastguard Worker }
1579*fb1b10abSAndroid Build Coastguard Worker 
/*
 * =============================================================================
 * Description : Transpose a 4x4 block of double-word elements held in four
 *               vectors (one row per vector).
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3
 *               Outputs - _out0, _out1, _out2, _out3
 * Details     : All inputs are read into temporaries before the first output
 *               is written. The macro expands to a brace-enclosed block and
 *               evaluates each input argument more than once.
 * Example     : LASX_TRANSPOSE4x4_D
 *        _in0 : 1,2,3,4
 *        _in1 : 1,2,3,4
 *        _in2 : 1,2,3,4
 *        _in3 : 1,2,3,4
 *
 *       _out0 : 1,1,1,1
 *       _out1 : 2,2,2,2
 *       _out2 : 3,3,3,3
 *       _out3 : 4,4,4,4
 * =============================================================================
 */
#define LASX_TRANSPOSE4x4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, \
                            _out3)                                       \
  {                                                                      \
    /* Pair rows 0/1 and rows 2/3, double-word by double-word. */        \
    __m256i _tmp0 = __lasx_xvilvl_d(_in1, _in0);                         \
    __m256i _tmp1 = __lasx_xvilvh_d(_in1, _in0);                         \
    __m256i _tmp2 = __lasx_xvilvl_d(_in3, _in2);                         \
    __m256i _tmp3 = __lasx_xvilvh_d(_in3, _in2);                         \
    /* Regroup the 128-bit halves of the pairs into output rows. */      \
    _out0 = __lasx_xvpermi_q(_tmp2, _tmp0, 0x20);                        \
    _out2 = __lasx_xvpermi_q(_tmp2, _tmp0, 0x31);                        \
    _out1 = __lasx_xvpermi_q(_tmp3, _tmp1, 0x20);                        \
    _out3 = __lasx_xvpermi_q(_tmp3, _tmp1, 0x31);                        \
  }
1610*fb1b10abSAndroid Build Coastguard Worker 
/*
 * =============================================================================
 * Description : Transpose an 8x8 block of word elements held in eight vectors
 *               (one row per vector).
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
 *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
 *               _out7
 * Details     : All inputs are read into temporaries before the first output
 *               is written. The macro expands to a brace-enclosed block and
 *               evaluates each input argument more than once.
 * Example     : LASX_TRANSPOSE8x8_W
 *        _in0 : 1,2,3,4,5,6,7,8
 *        _in1 : 2,2,3,4,5,6,7,8
 *        _in2 : 3,2,3,4,5,6,7,8
 *        _in3 : 4,2,3,4,5,6,7,8
 *        _in4 : 5,2,3,4,5,6,7,8
 *        _in5 : 6,2,3,4,5,6,7,8
 *        _in6 : 7,2,3,4,5,6,7,8
 *        _in7 : 8,2,3,4,5,6,7,8
 *
 *       _out0 : 1,2,3,4,5,6,7,8
 *       _out1 : 2,2,2,2,2,2,2,2
 *       _out2 : 3,3,3,3,3,3,3,3
 *       _out3 : 4,4,4,4,4,4,4,4
 *       _out4 : 5,5,5,5,5,5,5,5
 *       _out5 : 6,6,6,6,6,6,6,6
 *       _out6 : 7,7,7,7,7,7,7,7
 *       _out7 : 8,8,8,8,8,8,8,8
 * =============================================================================
 */
#define LASX_TRANSPOSE8x8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                            _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                            _out7)                                           \
  {                                                                          \
    __m256i _s0_m, _s1_m;                                                    \
    __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m;                              \
    __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m;                              \
                                                                             \
    /* Rows 0-3: interleave the low words of each 128-bit half. */           \
    _s0_m = __lasx_xvilvl_w(_in2, _in0);                                     \
    _s1_m = __lasx_xvilvl_w(_in3, _in1);                                     \
    _tmp0_m = __lasx_xvilvl_w(_s1_m, _s0_m);                                 \
    _tmp1_m = __lasx_xvilvh_w(_s1_m, _s0_m);                                 \
    /* Rows 0-3: interleave the high words of each 128-bit half. */          \
    _s0_m = __lasx_xvilvh_w(_in2, _in0);                                     \
    _s1_m = __lasx_xvilvh_w(_in3, _in1);                                     \
    _tmp2_m = __lasx_xvilvl_w(_s1_m, _s0_m);                                 \
    _tmp3_m = __lasx_xvilvh_w(_s1_m, _s0_m);                                 \
    /* Rows 4-7: interleave the low words of each 128-bit half. */           \
    _s0_m = __lasx_xvilvl_w(_in6, _in4);                                     \
    _s1_m = __lasx_xvilvl_w(_in7, _in5);                                     \
    _tmp4_m = __lasx_xvilvl_w(_s1_m, _s0_m);                                 \
    _tmp5_m = __lasx_xvilvh_w(_s1_m, _s0_m);                                 \
    /* Rows 4-7: interleave the high words of each 128-bit half. */          \
    _s0_m = __lasx_xvilvh_w(_in6, _in4);                                     \
    _s1_m = __lasx_xvilvh_w(_in7, _in5);                                     \
    _tmp6_m = __lasx_xvilvl_w(_s1_m, _s0_m);                                 \
    _tmp7_m = __lasx_xvilvh_w(_s1_m, _s0_m);                                 \
    /* Merge the 128-bit halves of both row groups into the outputs. */      \
    _out0 = __lasx_xvpermi_q(_tmp4_m, _tmp0_m, 0x20);                        \
    _out1 = __lasx_xvpermi_q(_tmp5_m, _tmp1_m, 0x20);                        \
    _out2 = __lasx_xvpermi_q(_tmp6_m, _tmp2_m, 0x20);                        \
    _out3 = __lasx_xvpermi_q(_tmp7_m, _tmp3_m, 0x20);                        \
    _out4 = __lasx_xvpermi_q(_tmp4_m, _tmp0_m, 0x31);                        \
    _out5 = __lasx_xvpermi_q(_tmp5_m, _tmp1_m, 0x31);                        \
    _out6 = __lasx_xvpermi_q(_tmp6_m, _tmp2_m, 0x31);                        \
    _out7 = __lasx_xvpermi_q(_tmp7_m, _tmp3_m, 0x31);                        \
  }
1670*fb1b10abSAndroid Build Coastguard Worker 
1671*fb1b10abSAndroid Build Coastguard Worker /*
1672*fb1b10abSAndroid Build Coastguard Worker  * =============================================================================
1673*fb1b10abSAndroid Build Coastguard Worker  * Description : Transpose input 16x8 byte block
1674*fb1b10abSAndroid Build Coastguard Worker  * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,
1675*fb1b10abSAndroid Build Coastguard Worker  *                         _in8, _in9, _in10, _in11, _in12, _in13, _in14, _in15
1676*fb1b10abSAndroid Build Coastguard Worker  *                         (input 16x8 byte block)
1677*fb1b10abSAndroid Build Coastguard Worker  *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
1678*fb1b10abSAndroid Build Coastguard Worker  *                         _out7 (output 8x16 byte block)
1679*fb1b10abSAndroid Build Coastguard Worker  * Details     : The rows of the matrix become columns, and the columns become
1680*fb1b10abSAndroid Build Coastguard Worker  *               rows.
1681*fb1b10abSAndroid Build Coastguard Worker  * Example     : See LASX_TRANSPOSE16x8_H
1682*fb1b10abSAndroid Build Coastguard Worker  * =============================================================================
1683*fb1b10abSAndroid Build Coastguard Worker  */
#define LASX_TRANSPOSE16x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                             _in8, _in9, _in10, _in11, _in12, _in13, _in14,   \
                             _in15, _out0, _out1, _out2, _out3, _out4, _out5, \
                             _out6, _out7)                                    \
  {                                                                           \
    /* Stage 1: xvilvl_b on rows k and k+2; every input is read here. */      \
    __m256i _tmp0_m = __lasx_xvilvl_b(_in2, _in0);                            \
    __m256i _tmp1_m = __lasx_xvilvl_b(_in3, _in1);                            \
    __m256i _tmp2_m = __lasx_xvilvl_b(_in6, _in4);                            \
    __m256i _tmp3_m = __lasx_xvilvl_b(_in7, _in5);                            \
    __m256i _tmp4_m = __lasx_xvilvl_b(_in10, _in8);                           \
    __m256i _tmp5_m = __lasx_xvilvl_b(_in11, _in9);                           \
    __m256i _tmp6_m = __lasx_xvilvl_b(_in14, _in12);                          \
    __m256i _tmp7_m = __lasx_xvilvl_b(_in15, _in13);                          \
                                                                              \
    /* Stage 2: byte-interleave the pairs; outputs serve as scratch. */       \
    _out0 = __lasx_xvilvl_b(_tmp1_m, _tmp0_m);                                \
    _out1 = __lasx_xvilvh_b(_tmp1_m, _tmp0_m);                                \
    _out2 = __lasx_xvilvl_b(_tmp3_m, _tmp2_m);                                \
    _out3 = __lasx_xvilvh_b(_tmp3_m, _tmp2_m);                                \
    _out4 = __lasx_xvilvl_b(_tmp5_m, _tmp4_m);                                \
    _out5 = __lasx_xvilvh_b(_tmp5_m, _tmp4_m);                                \
    _out6 = __lasx_xvilvl_b(_tmp7_m, _tmp6_m);                                \
    _out7 = __lasx_xvilvh_b(_tmp7_m, _tmp6_m);                                \
                                                                              \
    /* Stage 3: word-interleave the partial results back into temps. */       \
    _tmp0_m = __lasx_xvilvl_w(_out2, _out0);                                  \
    _tmp1_m = __lasx_xvilvl_w(_out6, _out4);                                  \
    _tmp2_m = __lasx_xvilvh_w(_out2, _out0);                                  \
    _tmp3_m = __lasx_xvilvh_w(_out6, _out4);                                  \
    _tmp4_m = __lasx_xvilvl_w(_out3, _out1);                                  \
    _tmp5_m = __lasx_xvilvl_w(_out7, _out5);                                  \
    _tmp6_m = __lasx_xvilvh_w(_out3, _out1);                                  \
    _tmp7_m = __lasx_xvilvh_w(_out7, _out5);                                  \
                                                                              \
    /* Stage 4: doubleword-interleave to form the final output rows. */       \
    _out0 = __lasx_xvilvl_d(_tmp1_m, _tmp0_m);                                \
    _out1 = __lasx_xvilvh_d(_tmp1_m, _tmp0_m);                                \
    _out2 = __lasx_xvilvl_d(_tmp3_m, _tmp2_m);                                \
    _out3 = __lasx_xvilvh_d(_tmp3_m, _tmp2_m);                                \
    _out4 = __lasx_xvilvl_d(_tmp5_m, _tmp4_m);                                \
    _out5 = __lasx_xvilvh_d(_tmp5_m, _tmp4_m);                                \
    _out6 = __lasx_xvilvl_d(_tmp7_m, _tmp6_m);                                \
    _out7 = __lasx_xvilvh_d(_tmp7_m, _tmp6_m);                                \
  }
1725*fb1b10abSAndroid Build Coastguard Worker 
/*
 * =============================================================================
 * Description : Transpose input 16x8 halfword block
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,
 *                         _in8, _in9, _in10, _in11, _in12, _in13, _in14, _in15
 *                         (input 16x8 halfword block)
 *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
 *                         _out7 (output 8x16 halfword block)
 * Details     : The rows of the matrix become columns, and the columns become
 *               rows. Every input is read for the last time before the first
 *               output is written, so an output may be the same variable as
 *               an input (in-place use).
 * Example     : LASX_TRANSPOSE16x8_H
 *        _in0 : 1,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *        _in1 : 2,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *        _in2 : 3,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *        _in3 : 4,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *        _in4 : 5,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *        _in5 : 6,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *        _in6 : 7,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *        _in7 : 8,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *        _in8 : 9,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *        _in9 : 1,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *       _in10 : 0,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *       _in11 : 2,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *       _in12 : 3,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *       _in13 : 7,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *       _in14 : 5,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *       _in15 : 6,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *
 *       _out0 : 1,2,3,4,5,6,7,8,9,1,0,2,3,7,5,6
 *       _out1 : 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
 *       _out2 : 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3
 *       _out3 : 4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4
 *       _out4 : 5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5
 *       _out5 : 6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6
 *       _out6 : 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7
 *       _out7 : 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8
 * =============================================================================
 */
#define LASX_TRANSPOSE16x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                             _in8, _in9, _in10, _in11, _in12, _in13, _in14,   \
                             _in15, _out0, _out1, _out2, _out3, _out4, _out5, \
                             _out6, _out7)                                    \
  {                                                                           \
    __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m;                               \
    __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m;                               \
    __m256i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7;                           \
                                                                              \
    /* Columns 0-3: low-half halfword interleaves of rows k and k+2. */       \
    _tmp0_m = __lasx_xvilvl_h(_in2, _in0);                                    \
    _tmp1_m = __lasx_xvilvl_h(_in3, _in1);                                    \
    _tmp2_m = __lasx_xvilvl_h(_in6, _in4);                                    \
    _tmp3_m = __lasx_xvilvl_h(_in7, _in5);                                    \
    _tmp4_m = __lasx_xvilvl_h(_in10, _in8);                                   \
    _tmp5_m = __lasx_xvilvl_h(_in11, _in9);                                   \
    _tmp6_m = __lasx_xvilvl_h(_in14, _in12);                                  \
    _tmp7_m = __lasx_xvilvl_h(_in15, _in13);                                  \
    _t0 = __lasx_xvilvl_h(_tmp1_m, _tmp0_m);                                  \
    _t1 = __lasx_xvilvh_h(_tmp1_m, _tmp0_m);                                  \
    _t2 = __lasx_xvilvl_h(_tmp3_m, _tmp2_m);                                  \
    _t3 = __lasx_xvilvh_h(_tmp3_m, _tmp2_m);                                  \
    _t4 = __lasx_xvilvl_h(_tmp5_m, _tmp4_m);                                  \
    _t5 = __lasx_xvilvh_h(_tmp5_m, _tmp4_m);                                  \
    _t6 = __lasx_xvilvl_h(_tmp7_m, _tmp6_m);                                  \
    _t7 = __lasx_xvilvh_h(_tmp7_m, _tmp6_m);                                  \
    _tmp0_m = __lasx_xvilvl_d(_t2, _t0);                                      \
    _tmp2_m = __lasx_xvilvh_d(_t2, _t0);                                      \
    _tmp4_m = __lasx_xvilvl_d(_t3, _t1);                                      \
    _tmp6_m = __lasx_xvilvh_d(_t3, _t1);                                      \
    _tmp1_m = __lasx_xvilvl_d(_t6, _t4);                                      \
    _tmp3_m = __lasx_xvilvh_d(_t6, _t4);                                      \
    _tmp5_m = __lasx_xvilvl_d(_t7, _t5);                                      \
    _tmp7_m = __lasx_xvilvh_d(_t7, _t5);                                      \
    /* Park the four results in _t0.._t3 (dead by now) rather than in the */  \
    /* outputs: the inputs are read again below and may alias an output. */   \
    _t0 = __lasx_xvpermi_q(_tmp1_m, _tmp0_m, 0x20);                           \
    _t1 = __lasx_xvpermi_q(_tmp3_m, _tmp2_m, 0x20);                           \
    _t2 = __lasx_xvpermi_q(_tmp5_m, _tmp4_m, 0x20);                           \
    _t3 = __lasx_xvpermi_q(_tmp7_m, _tmp6_m, 0x20);                           \
                                                                              \
    /* Columns 4-7: high-half interleaves; last read of the inputs. */        \
    _tmp0_m = __lasx_xvilvh_h(_in2, _in0);                                    \
    _tmp1_m = __lasx_xvilvh_h(_in3, _in1);                                    \
    _tmp2_m = __lasx_xvilvh_h(_in6, _in4);                                    \
    _tmp3_m = __lasx_xvilvh_h(_in7, _in5);                                    \
    _tmp4_m = __lasx_xvilvh_h(_in10, _in8);                                   \
    _tmp5_m = __lasx_xvilvh_h(_in11, _in9);                                   \
    _tmp6_m = __lasx_xvilvh_h(_in14, _in12);                                  \
    _tmp7_m = __lasx_xvilvh_h(_in15, _in13);                                  \
                                                                              \
    /* Every input has been consumed, so columns 0-3 can be stored. */        \
    _out0 = _t0;                                                              \
    _out1 = _t1;                                                              \
    _out2 = _t2;                                                              \
    _out3 = _t3;                                                              \
                                                                              \
    _t0 = __lasx_xvilvl_h(_tmp1_m, _tmp0_m);                                  \
    _t1 = __lasx_xvilvh_h(_tmp1_m, _tmp0_m);                                  \
    _t2 = __lasx_xvilvl_h(_tmp3_m, _tmp2_m);                                  \
    _t3 = __lasx_xvilvh_h(_tmp3_m, _tmp2_m);                                  \
    _t4 = __lasx_xvilvl_h(_tmp5_m, _tmp4_m);                                  \
    _t5 = __lasx_xvilvh_h(_tmp5_m, _tmp4_m);                                  \
    _t6 = __lasx_xvilvl_h(_tmp7_m, _tmp6_m);                                  \
    _t7 = __lasx_xvilvh_h(_tmp7_m, _tmp6_m);                                  \
    _tmp0_m = __lasx_xvilvl_d(_t2, _t0);                                      \
    _tmp2_m = __lasx_xvilvh_d(_t2, _t0);                                      \
    _tmp4_m = __lasx_xvilvl_d(_t3, _t1);                                      \
    _tmp6_m = __lasx_xvilvh_d(_t3, _t1);                                      \
    _tmp1_m = __lasx_xvilvl_d(_t6, _t4);                                      \
    _tmp3_m = __lasx_xvilvh_d(_t6, _t4);                                      \
    _tmp5_m = __lasx_xvilvl_d(_t7, _t5);                                      \
    _tmp7_m = __lasx_xvilvh_d(_t7, _t5);                                      \
    _out4 = __lasx_xvpermi_q(_tmp1_m, _tmp0_m, 0x20);                         \
    _out5 = __lasx_xvpermi_q(_tmp3_m, _tmp2_m, 0x20);                         \
    _out6 = __lasx_xvpermi_q(_tmp5_m, _tmp4_m, 0x20);                         \
    _out7 = __lasx_xvpermi_q(_tmp7_m, _tmp6_m, 0x20);                         \
  }
1831*fb1b10abSAndroid Build Coastguard Worker 
/*
 * =============================================================================
 * Description : Transpose 4x4 block with halfword elements in vectors
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3
 *               Outputs - _out0, _out1, _out2, _out3
 *               Return Type - signed halfword
 * Details     : The rows of the matrix become columns, and the columns become
 *               rows. Built from low-half halfword interleaves followed by
 *               word and doubleword interleaves; all inputs are read before
 *               the first output is written.
 * Example     : See LASX_TRANSPOSE8x8_H
 * =============================================================================
 */
#define LASX_TRANSPOSE4x4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, \
                            _out3)                                       \
  {                                                                      \
    /* Halfword-interleave rows 0/1 and rows 2/3 (low halves). */        \
    __m256i _s0_m = __lasx_xvilvl_h(_in1, _in0);                         \
    __m256i _s1_m = __lasx_xvilvl_h(_in3, _in2);                         \
                                                                         \
    /* Word-interleave the pairs: two output rows per vector. */         \
    _out0 = __lasx_xvilvl_w(_s1_m, _s0_m);                               \
    _out2 = __lasx_xvilvh_w(_s1_m, _s0_m);                               \
    /* Odd rows: upper doubleword of the matching even result. */        \
    _out1 = __lasx_xvilvh_d(_out0, _out0);                               \
    _out3 = __lasx_xvilvh_d(_out2, _out2);                               \
  }
1855*fb1b10abSAndroid Build Coastguard Worker 
/*
 * =============================================================================
 * Description : Transpose input 8x8 byte block
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
 *                         (input 8x8 byte block)
 *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
 *                         _out7 (output 8x8 byte block)
 * Details     : Two rounds of byte interleaves and one round of word
 *               interleaves produce the even outputs; each odd output is the
 *               preceding even output passed through __lasx_xvbsrl_v(.., 8).
 *               All inputs are read before the first output is written.
 * Example     : See LASX_TRANSPOSE8x8_H
 * =============================================================================
 */
#define LASX_TRANSPOSE8x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                            _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                            _out7)                                           \
  {                                                                          \
    /* Byte-interleave rows k and k+2 (low halves). */                       \
    __m256i _tmp0_m = __lasx_xvilvl_b(_in2, _in0);                           \
    __m256i _tmp1_m = __lasx_xvilvl_b(_in3, _in1);                           \
    __m256i _tmp2_m = __lasx_xvilvl_b(_in6, _in4);                           \
    __m256i _tmp3_m = __lasx_xvilvl_b(_in7, _in5);                           \
    /* Interleave the pairs: four rows of every column side by side. */      \
    __m256i _tmp4_m = __lasx_xvilvl_b(_tmp1_m, _tmp0_m);                     \
    __m256i _tmp5_m = __lasx_xvilvh_b(_tmp1_m, _tmp0_m);                     \
    __m256i _tmp6_m = __lasx_xvilvl_b(_tmp3_m, _tmp2_m);                     \
    __m256i _tmp7_m = __lasx_xvilvh_b(_tmp3_m, _tmp2_m);                     \
                                                                             \
    /* Word-interleave: each even output holds two finished rows. */         \
    _out0 = __lasx_xvilvl_w(_tmp6_m, _tmp4_m);                               \
    _out2 = __lasx_xvilvh_w(_tmp6_m, _tmp4_m);                               \
    _out4 = __lasx_xvilvl_w(_tmp7_m, _tmp5_m);                               \
    _out6 = __lasx_xvilvh_w(_tmp7_m, _tmp5_m);                               \
    /* Odd outputs: the even result byte-shifted right by 8. */              \
    _out1 = __lasx_xvbsrl_v(_out0, 8);                                       \
    _out3 = __lasx_xvbsrl_v(_out2, 8);                                       \
    _out5 = __lasx_xvbsrl_v(_out4, 8);                                       \
    _out7 = __lasx_xvbsrl_v(_out6, 8);                                       \
  }
1889*fb1b10abSAndroid Build Coastguard Worker 
/*
 * =============================================================================
 * Description : Transpose 8x8 block with halfword elements in vectors.
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
 *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
 *                         _out7
 * Details     : The rows of the matrix become columns, and the columns become
 *               rows. Rows 0-3 and rows 4-7 are interleaved separately and
 *               then joined with even/odd doubleword picks; all inputs are
 *               read before the first output is written.
 * Example     : LASX_TRANSPOSE8x8_H
 *        _in0 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
 *        _in1 : 8,2,3,4, 5,6,7,8, 8,2,3,4, 5,6,7,8
 *        _in2 : 8,2,3,4, 5,6,7,8, 8,2,3,4, 5,6,7,8
 *        _in3 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
 *        _in4 : 9,2,3,4, 5,6,7,8, 9,2,3,4, 5,6,7,8
 *        _in5 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
 *        _in6 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
 *        _in7 : 9,2,3,4, 5,6,7,8, 9,2,3,4, 5,6,7,8
 *
 *       _out0 : 1,8,8,1, 9,1,1,9, 1,8,8,1, 9,1,1,9
 *       _out1 : 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2
 *       _out2 : 3,3,3,3, 3,3,3,3, 3,3,3,3, 3,3,3,3
 *       _out3 : 4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4
 *       _out4 : 5,5,5,5, 5,5,5,5, 5,5,5,5, 5,5,5,5
 *       _out5 : 6,6,6,6, 6,6,6,6, 6,6,6,6, 6,6,6,6
 *       _out6 : 7,7,7,7, 7,7,7,7, 7,7,7,7, 7,7,7,7
 *       _out7 : 8,8,8,8, 8,8,8,8, 8,8,8,8, 8,8,8,8
 * =============================================================================
 */
#define LASX_TRANSPOSE8x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                            _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                            _out7)                                           \
  {                                                                          \
    __m256i _s0_m, _s1_m;                                                    \
    __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m;                              \
    __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m;                              \
                                                                             \
    /* Rows 0-3, low halves: columns 0/1 -> _tmp4_m, 2/3 -> _tmp5_m. */      \
    _s0_m = __lasx_xvilvl_h(_in2, _in0);                                     \
    _s1_m = __lasx_xvilvl_h(_in3, _in1);                                     \
    _tmp4_m = __lasx_xvilvl_h(_s1_m, _s0_m);                                 \
    _tmp5_m = __lasx_xvilvh_h(_s1_m, _s0_m);                                 \
    /* Rows 0-3, high halves: columns 4/5 -> _tmp6_m, 6/7 -> _tmp7_m. */     \
    _s0_m = __lasx_xvilvh_h(_in2, _in0);                                     \
    _s1_m = __lasx_xvilvh_h(_in3, _in1);                                     \
    _tmp6_m = __lasx_xvilvl_h(_s1_m, _s0_m);                                 \
    _tmp7_m = __lasx_xvilvh_h(_s1_m, _s0_m);                                 \
                                                                             \
    /* Rows 4-7, low halves: columns 0/1 -> _tmp0_m, 2/3 -> _tmp1_m. */      \
    _s0_m = __lasx_xvilvl_h(_in6, _in4);                                     \
    _s1_m = __lasx_xvilvl_h(_in7, _in5);                                     \
    _tmp0_m = __lasx_xvilvl_h(_s1_m, _s0_m);                                 \
    _tmp1_m = __lasx_xvilvh_h(_s1_m, _s0_m);                                 \
    /* Rows 4-7, high halves: columns 4/5 -> _tmp2_m, 6/7 -> _tmp3_m. */     \
    _s0_m = __lasx_xvilvh_h(_in6, _in4);                                     \
    _s1_m = __lasx_xvilvh_h(_in7, _in5);                                     \
    _tmp2_m = __lasx_xvilvl_h(_s1_m, _s0_m);                                 \
    _tmp3_m = __lasx_xvilvh_h(_s1_m, _s0_m);                                 \
                                                                             \
    /* Join the row halves: even/odd doublewords give columns 2k/2k+1. */    \
    _out0 = __lasx_xvpickev_d(_tmp0_m, _tmp4_m);                             \
    _out2 = __lasx_xvpickev_d(_tmp1_m, _tmp5_m);                             \
    _out4 = __lasx_xvpickev_d(_tmp2_m, _tmp6_m);                             \
    _out6 = __lasx_xvpickev_d(_tmp3_m, _tmp7_m);                             \
    _out1 = __lasx_xvpickod_d(_tmp0_m, _tmp4_m);                             \
    _out3 = __lasx_xvpickod_d(_tmp1_m, _tmp5_m);                             \
    _out5 = __lasx_xvpickod_d(_tmp2_m, _tmp6_m);                             \
    _out7 = __lasx_xvpickod_d(_tmp3_m, _tmp7_m);                             \
  }
1952*fb1b10abSAndroid Build Coastguard Worker 
1953*fb1b10abSAndroid Build Coastguard Worker /*
1954*fb1b10abSAndroid Build Coastguard Worker  * =============================================================================
1955*fb1b10abSAndroid Build Coastguard Worker  * Description : Butterfly of 4 input vectors
1956*fb1b10abSAndroid Build Coastguard Worker  * Arguments   : Inputs  - _in0, _in1, _in2, _in3
1957*fb1b10abSAndroid Build Coastguard Worker  *               Outputs - _out0, _out1, _out2, _out3
1958*fb1b10abSAndroid Build Coastguard Worker  * Details     : Butterfly operation
1959*fb1b10abSAndroid Build Coastguard Worker  * Example     : LASX_BUTTERFLY_4
1960*fb1b10abSAndroid Build Coastguard Worker  *               _out0 = _in0 + _in3;
1961*fb1b10abSAndroid Build Coastguard Worker  *               _out1 = _in1 + _in2;
1962*fb1b10abSAndroid Build Coastguard Worker  *               _out2 = _in1 - _in2;
1963*fb1b10abSAndroid Build Coastguard Worker  *               _out3 = _in0 - _in3;
1964*fb1b10abSAndroid Build Coastguard Worker  * =============================================================================
1965*fb1b10abSAndroid Build Coastguard Worker  */
1966*fb1b10abSAndroid Build Coastguard Worker #define LASX_BUTTERFLY_4_B(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
1967*fb1b10abSAndroid Build Coastguard Worker   {                                                                            \
1968*fb1b10abSAndroid Build Coastguard Worker     _out0 = __lasx_xvadd_b(_in0, _in3);                                        \
1969*fb1b10abSAndroid Build Coastguard Worker     _out1 = __lasx_xvadd_b(_in1, _in2);                                        \
1970*fb1b10abSAndroid Build Coastguard Worker     _out2 = __lasx_xvsub_b(_in1, _in2);                                        \
1971*fb1b10abSAndroid Build Coastguard Worker     _out3 = __lasx_xvsub_b(_in0, _in3);                                        \
1972*fb1b10abSAndroid Build Coastguard Worker   }
/*
 * LASX_BUTTERFLY_4_H: 4-input butterfly built on the __lasx_xvadd_h /
 * __lasx_xvsub_h intrinsics.
 *   _out0 = _in0 + _in3;   _out1 = _in1 + _in2;
 *   _out2 = _in1 - _in2;   _out3 = _in0 - _in3;
 * All four results are computed into temporaries before any output is
 * written, so an output may name the same variable as any input (in-place
 * use) without corrupting a later sum or difference.
 * Each input expression is evaluated twice; the outputs must be assignable.
 * Expands to a braced block.
 */
#define LASX_BUTTERFLY_4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  {                                                                            \
    __m256i _bf4_r0 = __lasx_xvadd_h(_in0, _in3);                              \
    __m256i _bf4_r1 = __lasx_xvadd_h(_in1, _in2);                              \
    __m256i _bf4_r2 = __lasx_xvsub_h(_in1, _in2);                              \
    __m256i _bf4_r3 = __lasx_xvsub_h(_in0, _in3);                              \
    /* Store only after every input has been consumed. */                     \
    _out0 = _bf4_r0;                                                           \
    _out1 = _bf4_r1;                                                           \
    _out2 = _bf4_r2;                                                           \
    _out3 = _bf4_r3;                                                           \
  }
/*
 * LASX_BUTTERFLY_4_W: 4-input butterfly built on the __lasx_xvadd_w /
 * __lasx_xvsub_w intrinsics.
 *   _out0 = _in0 + _in3;   _out1 = _in1 + _in2;
 *   _out2 = _in1 - _in2;   _out3 = _in0 - _in3;
 * All four results are computed into temporaries before any output is
 * written, so an output may name the same variable as any input (in-place
 * use) without corrupting a later sum or difference.
 * Each input expression is evaluated twice; the outputs must be assignable.
 * Expands to a braced block.
 */
#define LASX_BUTTERFLY_4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  {                                                                            \
    __m256i _bf4_r0 = __lasx_xvadd_w(_in0, _in3);                              \
    __m256i _bf4_r1 = __lasx_xvadd_w(_in1, _in2);                              \
    __m256i _bf4_r2 = __lasx_xvsub_w(_in1, _in2);                              \
    __m256i _bf4_r3 = __lasx_xvsub_w(_in0, _in3);                              \
    /* Store only after every input has been consumed. */                     \
    _out0 = _bf4_r0;                                                           \
    _out1 = _bf4_r1;                                                           \
    _out2 = _bf4_r2;                                                           \
    _out3 = _bf4_r3;                                                           \
  }
/*
 * LASX_BUTTERFLY_4_D: 4-input butterfly built on the __lasx_xvadd_d /
 * __lasx_xvsub_d intrinsics.
 *   _out0 = _in0 + _in3;   _out1 = _in1 + _in2;
 *   _out2 = _in1 - _in2;   _out3 = _in0 - _in3;
 * All four results are computed into temporaries before any output is
 * written, so an output may name the same variable as any input (in-place
 * use) without corrupting a later sum or difference.
 * Each input expression is evaluated twice; the outputs must be assignable.
 * Expands to a braced block.
 */
#define LASX_BUTTERFLY_4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  {                                                                            \
    __m256i _bf4_r0 = __lasx_xvadd_d(_in0, _in3);                              \
    __m256i _bf4_r1 = __lasx_xvadd_d(_in1, _in2);                              \
    __m256i _bf4_r2 = __lasx_xvsub_d(_in1, _in2);                              \
    __m256i _bf4_r3 = __lasx_xvsub_d(_in0, _in3);                              \
    /* Store only after every input has been consumed. */                     \
    _out0 = _bf4_r0;                                                           \
    _out1 = _bf4_r1;                                                           \
    _out2 = _bf4_r2;                                                           \
    _out3 = _bf4_r3;                                                           \
  }
1994*fb1b10abSAndroid Build Coastguard Worker 
1995*fb1b10abSAndroid Build Coastguard Worker /*
1996*fb1b10abSAndroid Build Coastguard Worker  * =============================================================================
1997*fb1b10abSAndroid Build Coastguard Worker  * Description : Butterfly of 8 input vectors
1998*fb1b10abSAndroid Build Coastguard Worker  * Arguments   : Inputs  - _in0, _in1, _in2, _in3, ~
1999*fb1b10abSAndroid Build Coastguard Worker  *               Outputs - _out0, _out1, _out2, _out3, ~
2000*fb1b10abSAndroid Build Coastguard Worker  * Details     : Butterfly operation
2001*fb1b10abSAndroid Build Coastguard Worker  * Example     : LASX_BUTTERFLY_8
2002*fb1b10abSAndroid Build Coastguard Worker  *               _out0 = _in0 + _in7;
2003*fb1b10abSAndroid Build Coastguard Worker  *               _out1 = _in1 + _in6;
2004*fb1b10abSAndroid Build Coastguard Worker  *               _out2 = _in2 + _in5;
2005*fb1b10abSAndroid Build Coastguard Worker  *               _out3 = _in3 + _in4;
2006*fb1b10abSAndroid Build Coastguard Worker  *               _out4 = _in3 - _in4;
2007*fb1b10abSAndroid Build Coastguard Worker  *               _out5 = _in2 - _in5;
2008*fb1b10abSAndroid Build Coastguard Worker  *               _out6 = _in1 - _in6;
2009*fb1b10abSAndroid Build Coastguard Worker  *               _out7 = _in0 - _in7;
2010*fb1b10abSAndroid Build Coastguard Worker  * =============================================================================
2011*fb1b10abSAndroid Build Coastguard Worker  */
/*
 * LASX_BUTTERFLY_8_B: 8-input butterfly built on the __lasx_xvadd_b /
 * __lasx_xvsub_b intrinsics.
 *   _out0 = _in0 + _in7;   _out7 = _in0 - _in7;
 *   _out1 = _in1 + _in6;   _out6 = _in1 - _in6;
 *   _out2 = _in2 + _in5;   _out5 = _in2 - _in5;
 *   _out3 = _in3 + _in4;   _out4 = _in3 - _in4;
 * All eight results are computed into temporaries before any output is
 * written, so an output may name the same variable as any input (in-place
 * use) without corrupting a later sum or difference.
 * Each input expression is evaluated twice; the outputs must be assignable.
 * Expands to a braced block.
 */
#define LASX_BUTTERFLY_8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                           _out7)                                           \
  {                                                                         \
    __m256i _bf8_r0 = __lasx_xvadd_b(_in0, _in7);                           \
    __m256i _bf8_r1 = __lasx_xvadd_b(_in1, _in6);                           \
    __m256i _bf8_r2 = __lasx_xvadd_b(_in2, _in5);                           \
    __m256i _bf8_r3 = __lasx_xvadd_b(_in3, _in4);                           \
    __m256i _bf8_r4 = __lasx_xvsub_b(_in3, _in4);                           \
    __m256i _bf8_r5 = __lasx_xvsub_b(_in2, _in5);                           \
    __m256i _bf8_r6 = __lasx_xvsub_b(_in1, _in6);                           \
    __m256i _bf8_r7 = __lasx_xvsub_b(_in0, _in7);                           \
    /* Store only after every input has been consumed. */                  \
    _out0 = _bf8_r0;                                                        \
    _out1 = _bf8_r1;                                                        \
    _out2 = _bf8_r2;                                                        \
    _out3 = _bf8_r3;                                                        \
    _out4 = _bf8_r4;                                                        \
    _out5 = _bf8_r5;                                                        \
    _out6 = _bf8_r6;                                                        \
    _out7 = _bf8_r7;                                                        \
  }
2025*fb1b10abSAndroid Build Coastguard Worker 
/*
 * LASX_BUTTERFLY_8_H: 8-input butterfly built on the __lasx_xvadd_h /
 * __lasx_xvsub_h intrinsics.
 *   _out0 = _in0 + _in7;   _out7 = _in0 - _in7;
 *   _out1 = _in1 + _in6;   _out6 = _in1 - _in6;
 *   _out2 = _in2 + _in5;   _out5 = _in2 - _in5;
 *   _out3 = _in3 + _in4;   _out4 = _in3 - _in4;
 * All eight results are computed into temporaries before any output is
 * written, so an output may name the same variable as any input (in-place
 * use) without corrupting a later sum or difference.
 * Each input expression is evaluated twice; the outputs must be assignable.
 * Expands to a braced block.
 */
#define LASX_BUTTERFLY_8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                           _out7)                                           \
  {                                                                         \
    __m256i _bf8_r0 = __lasx_xvadd_h(_in0, _in7);                           \
    __m256i _bf8_r1 = __lasx_xvadd_h(_in1, _in6);                           \
    __m256i _bf8_r2 = __lasx_xvadd_h(_in2, _in5);                           \
    __m256i _bf8_r3 = __lasx_xvadd_h(_in3, _in4);                           \
    __m256i _bf8_r4 = __lasx_xvsub_h(_in3, _in4);                           \
    __m256i _bf8_r5 = __lasx_xvsub_h(_in2, _in5);                           \
    __m256i _bf8_r6 = __lasx_xvsub_h(_in1, _in6);                           \
    __m256i _bf8_r7 = __lasx_xvsub_h(_in0, _in7);                           \
    /* Store only after every input has been consumed. */                  \
    _out0 = _bf8_r0;                                                        \
    _out1 = _bf8_r1;                                                        \
    _out2 = _bf8_r2;                                                        \
    _out3 = _bf8_r3;                                                        \
    _out4 = _bf8_r4;                                                        \
    _out5 = _bf8_r5;                                                        \
    _out6 = _bf8_r6;                                                        \
    _out7 = _bf8_r7;                                                        \
  }
2039*fb1b10abSAndroid Build Coastguard Worker 
/*
 * LASX_BUTTERFLY_8_W: 8-input butterfly built on the __lasx_xvadd_w /
 * __lasx_xvsub_w intrinsics.
 *   _out0 = _in0 + _in7;   _out7 = _in0 - _in7;
 *   _out1 = _in1 + _in6;   _out6 = _in1 - _in6;
 *   _out2 = _in2 + _in5;   _out5 = _in2 - _in5;
 *   _out3 = _in3 + _in4;   _out4 = _in3 - _in4;
 * All eight results are computed into temporaries before any output is
 * written, so an output may name the same variable as any input (in-place
 * use) without corrupting a later sum or difference.
 * Each input expression is evaluated twice; the outputs must be assignable.
 * Expands to a braced block.
 */
#define LASX_BUTTERFLY_8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                           _out7)                                           \
  {                                                                         \
    __m256i _bf8_r0 = __lasx_xvadd_w(_in0, _in7);                           \
    __m256i _bf8_r1 = __lasx_xvadd_w(_in1, _in6);                           \
    __m256i _bf8_r2 = __lasx_xvadd_w(_in2, _in5);                           \
    __m256i _bf8_r3 = __lasx_xvadd_w(_in3, _in4);                           \
    __m256i _bf8_r4 = __lasx_xvsub_w(_in3, _in4);                           \
    __m256i _bf8_r5 = __lasx_xvsub_w(_in2, _in5);                           \
    __m256i _bf8_r6 = __lasx_xvsub_w(_in1, _in6);                           \
    __m256i _bf8_r7 = __lasx_xvsub_w(_in0, _in7);                           \
    /* Store only after every input has been consumed. */                  \
    _out0 = _bf8_r0;                                                        \
    _out1 = _bf8_r1;                                                        \
    _out2 = _bf8_r2;                                                        \
    _out3 = _bf8_r3;                                                        \
    _out4 = _bf8_r4;                                                        \
    _out5 = _bf8_r5;                                                        \
    _out6 = _bf8_r6;                                                        \
    _out7 = _bf8_r7;                                                        \
  }
2053*fb1b10abSAndroid Build Coastguard Worker 
/*
 * LASX_BUTTERFLY_8_D: 8-input butterfly built on the __lasx_xvadd_d /
 * __lasx_xvsub_d intrinsics.
 *   _out0 = _in0 + _in7;   _out7 = _in0 - _in7;
 *   _out1 = _in1 + _in6;   _out6 = _in1 - _in6;
 *   _out2 = _in2 + _in5;   _out5 = _in2 - _in5;
 *   _out3 = _in3 + _in4;   _out4 = _in3 - _in4;
 * All eight results are computed into temporaries before any output is
 * written, so an output may name the same variable as any input (in-place
 * use) without corrupting a later sum or difference.
 * Each input expression is evaluated twice; the outputs must be assignable.
 * Expands to a braced block.
 */
#define LASX_BUTTERFLY_8_D(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                           _out7)                                           \
  {                                                                         \
    __m256i _bf8_r0 = __lasx_xvadd_d(_in0, _in7);                           \
    __m256i _bf8_r1 = __lasx_xvadd_d(_in1, _in6);                           \
    __m256i _bf8_r2 = __lasx_xvadd_d(_in2, _in5);                           \
    __m256i _bf8_r3 = __lasx_xvadd_d(_in3, _in4);                           \
    __m256i _bf8_r4 = __lasx_xvsub_d(_in3, _in4);                           \
    __m256i _bf8_r5 = __lasx_xvsub_d(_in2, _in5);                           \
    __m256i _bf8_r6 = __lasx_xvsub_d(_in1, _in6);                           \
    __m256i _bf8_r7 = __lasx_xvsub_d(_in0, _in7);                           \
    /* Store only after every input has been consumed. */                  \
    _out0 = _bf8_r0;                                                        \
    _out1 = _bf8_r1;                                                        \
    _out2 = _bf8_r2;                                                        \
    _out3 = _bf8_r3;                                                        \
    _out4 = _bf8_r4;                                                        \
    _out5 = _bf8_r5;                                                        \
    _out6 = _bf8_r6;                                                        \
    _out7 = _bf8_r7;                                                        \
  }
2067*fb1b10abSAndroid Build Coastguard Worker 
2068*fb1b10abSAndroid Build Coastguard Worker #endif  // LASX
2069*fb1b10abSAndroid Build Coastguard Worker 
2070*fb1b10abSAndroid Build Coastguard Worker /*
2071*fb1b10abSAndroid Build Coastguard Worker  * =============================================================================
2072*fb1b10abSAndroid Build Coastguard Worker  * Description : Print out elements in vector.
2073*fb1b10abSAndroid Build Coastguard Worker  * Arguments   : Inputs  - RTYPE, _element_num, _in0, _enter
2074*fb1b10abSAndroid Build Coastguard Worker  *               Outputs -
2075*fb1b10abSAndroid Build Coastguard Worker  * Details     : Print out '_element_num' elements in 'RTYPE' vector '_in0', if
2076*fb1b10abSAndroid Build Coastguard Worker  *               '_enter' is TRUE, prefix "\nVP:" will be added first.
2077*fb1b10abSAndroid Build Coastguard Worker  * Example     : VECT_PRINT(v4i32,4,in0,1); // in0: 1,2,3,4
2078*fb1b10abSAndroid Build Coastguard Worker  *               VP:1,2,3,4,
2079*fb1b10abSAndroid Build Coastguard Worker  * =============================================================================
2080*fb1b10abSAndroid Build Coastguard Worker  */
/*
 * VECT_PRINT: print the leading elements of a vector with printf.
 *   RTYPE       : type 'in0' is cast to; the result must support [] indexing
 *   element_num : number of elements to print (evaluated once, as an int)
 *   in0         : vector expression to print (evaluated once)
 *   enter       : when non-zero, "\nVP:" is printed before the elements
 * Each element is converted to int before being handed to "%d," so the
 * argument always matches the format; values that do not fit in an int are
 * not preserved by that conversion.
 * Arguments are parenthesized so compound expressions (e.g. a conditional
 * expression as 'element_num') expand correctly. Expands to a braced block.
 */
#define VECT_PRINT(RTYPE, element_num, in0, enter)                 \
  {                                                                \
    RTYPE _vp_vec = (RTYPE)(in0);                                  \
    const int _vp_count = (int)(element_num);                      \
    int _vp_idx;                                                   \
    if (enter) printf("\nVP:");                                    \
    for (_vp_idx = 0; _vp_idx < _vp_count; _vp_idx++) {            \
      printf("%d,", (int)_vp_vec[_vp_idx]);                        \
    }                                                              \
  }
2088*fb1b10abSAndroid Build Coastguard Worker 
2089*fb1b10abSAndroid Build Coastguard Worker #endif /* LOONGSON_INTRINSICS_H */
2090*fb1b10abSAndroid Build Coastguard Worker #endif /* VPX_VPX_UTIL_LOONGSON_INTRINSICS_H_ */
2091