xref: /aosp_15_r20/external/libvpx/vpx_util/loongson_intrinsics.h (revision fb1b10ab9aebc7c7068eedab379b749d7e3900be)
1 /*
2  * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
3  *
4  * Use of this source code is governed by a BSD-style license
5  * that can be found in the LICENSE file in the root of the source
6  * tree. An additional intellectual property rights grant can be found
7  * in the file PATENTS.  All contributing project authors may
8  * be found in the AUTHORS file in the root of the source tree.
9  *
10  */
11 
12 #ifndef VPX_VPX_UTIL_LOONGSON_INTRINSICS_H_
13 #define VPX_VPX_UTIL_LOONGSON_INTRINSICS_H_
14 
15 /*
16  * Copyright (c) 2021 Loongson Technology Corporation Limited
17  * All rights reserved.
18  *
19  * Use of this source code is governed by a BSD-style license
20  * that can be found in the LICENSE file in the root of the source
21  * tree. An additional intellectual property rights grant can be found
22  * in the file PATENTS.  All contributing project authors may
23  * be found in the AUTHORS file in the root of the source tree.
24  *
25  * Contributed by Shiyou Yin <[email protected]>
26  *                Xiwei Gu   <[email protected]>
27  *                Lu Wang    <[email protected]>
28  *
29  * This file is a header file for loongarch builtin extension.
30  *
31  */
32 
33 #ifndef LOONGSON_INTRINSICS_H
34 #define LOONGSON_INTRINSICS_H
35 
/**
 * Version of this intrinsics helper header.
 *
 * MAJOR version: Macro usage changes.
 * MINOR version: Add new functions, or bug fixes.
 * MICRO version: Comment changes or implementation changes.
 */
#define LSOM_VERSION_MAJOR 1
#define LSOM_VERSION_MINOR 2
#define LSOM_VERSION_MICRO 1
44 
/*
 * Apply the one-argument operation _INS to two independent inputs:
 *   _OUT0 = _INS(_IN0);
 *   _OUT1 = _INS(_IN1);
 * The expansion is a brace-enclosed block, so every _OUTn must be an
 * assignable lvalue.
 */
#define DUP2_ARG1(_INS, _IN0, _IN1, _OUT0, _OUT1) \
  {                                               \
    _OUT0 = _INS(_IN0);                           \
    _OUT1 = _INS(_IN1);                           \
  }
50 
/*
 * Apply the two-argument operation _INS to two independent input pairs:
 *   _OUT0 = _INS(_IN0, _IN1);
 *   _OUT1 = _INS(_IN2, _IN3);
 * The expansion is a brace-enclosed block, so every _OUTn must be an
 * assignable lvalue.
 */
#define DUP2_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1) \
  {                                                           \
    _OUT0 = _INS(_IN0, _IN1);                                 \
    _OUT1 = _INS(_IN2, _IN3);                                 \
  }
56 
/*
 * Apply the three-argument operation _INS to two independent input triples:
 *   _OUT0 = _INS(_IN0, _IN1, _IN2);
 *   _OUT1 = _INS(_IN3, _IN4, _IN5);
 * The expansion is a brace-enclosed block, so every _OUTn must be an
 * assignable lvalue.
 */
#define DUP2_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _OUT0, _OUT1) \
  {                                                                       \
    _OUT0 = _INS(_IN0, _IN1, _IN2);                                       \
    _OUT1 = _INS(_IN3, _IN4, _IN5);                                       \
  }
62 
/*
 * Apply the one-argument operation _INS to four independent inputs:
 *   _OUTn = _INS(_INn)   for n = 0..3
 * The expansion is a brace-enclosed block, so every _OUTn must be an
 * assignable lvalue.
 */
#define DUP4_ARG1(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1, _OUT2, _OUT3) \
  {                                                                         \
    _OUT0 = _INS(_IN0);                                                     \
    _OUT1 = _INS(_IN1);                                                     \
    _OUT2 = _INS(_IN2);                                                     \
    _OUT3 = _INS(_IN3);                                                     \
  }
68 
/*
 * Apply the two-argument operation _INS to four independent input pairs:
 *   _OUT0 = _INS(_IN0, _IN1);
 *   _OUT1 = _INS(_IN2, _IN3);
 *   _OUT2 = _INS(_IN4, _IN5);
 *   _OUT3 = _INS(_IN6, _IN7);
 * The expansion is a brace-enclosed block, so every _OUTn must be an
 * assignable lvalue.
 */
#define DUP4_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _IN6, _IN7, _OUT0, \
                  _OUT1, _OUT2, _OUT3)                                         \
  {                                                                            \
    _OUT0 = _INS(_IN0, _IN1);                                                  \
    _OUT1 = _INS(_IN2, _IN3);                                                  \
    _OUT2 = _INS(_IN4, _IN5);                                                  \
    _OUT3 = _INS(_IN6, _IN7);                                                  \
  }
75 
/*
 * Apply the three-argument operation _INS to four independent input triples:
 *   _OUT0 = _INS(_IN0, _IN1, _IN2);
 *   _OUT1 = _INS(_IN3, _IN4, _IN5);
 *   _OUT2 = _INS(_IN6, _IN7, _IN8);
 *   _OUT3 = _INS(_IN9, _IN10, _IN11);
 * The expansion is a brace-enclosed block, so every _OUTn must be an
 * assignable lvalue.
 */
#define DUP4_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _IN6, _IN7, _IN8, \
                  _IN9, _IN10, _IN11, _OUT0, _OUT1, _OUT2, _OUT3)             \
  {                                                                           \
    _OUT0 = _INS(_IN0, _IN1, _IN2);                                           \
    _OUT1 = _INS(_IN3, _IN4, _IN5);                                           \
    _OUT2 = _INS(_IN6, _IN7, _IN8);                                           \
    _OUT3 = _INS(_IN9, _IN10, _IN11);                                         \
  }
82 
83 #ifdef __loongarch_sx
84 #include <lsxintrin.h>
85 /*
86  * =============================================================================
87  * Description : Dot product & addition of byte vector elements
88  * Arguments   : Inputs  - in_c, in_h, in_l
89  *               Outputs - out
90  *               Return Type - halfword
91  * Details     : Signed byte elements from in_h are multiplied by
92  *               signed byte elements from in_l, and then added adjacent to
93  *               each other to get a result twice the size of input. Then
94  *               the results are added to signed half-word elements from in_c.
95  * Example     : out = __lsx_vdp2add_h_b(in_c, in_h, in_l)
96  *        in_c : 1,2,3,4, 1,2,3,4
97  *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
98  *        in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
99  *         out : 23,40,41,26, 23,40,41,26
100  * =============================================================================
101  */
__lsx_vdp2add_h_b(__m128i in_c,__m128i in_h,__m128i in_l)102 static inline __m128i __lsx_vdp2add_h_b(__m128i in_c, __m128i in_h,
103                                         __m128i in_l) {
104   __m128i out;
105 
106   out = __lsx_vmaddwev_h_b(in_c, in_h, in_l);
107   out = __lsx_vmaddwod_h_b(out, in_h, in_l);
108   return out;
109 }
110 
111 /*
112  * =============================================================================
113  * Description : Dot product & addition of byte vector elements
114  * Arguments   : Inputs  - in_c, in_h, in_l
115  *               Outputs - out
116  *               Return Type - halfword
117  * Details     : Unsigned byte elements from in_h are multiplied by
118  *               unsigned byte elements from in_l, and then added adjacent to
119  *               each other to get a result twice the size of input.
120  *               The results are added to signed half-word elements from in_c.
121  * Example     : out = __lsx_vdp2add_h_bu(in_c, in_h, in_l)
122  *        in_c : 1,2,3,4, 1,2,3,4
123  *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
124  *        in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
125  *         out : 23,40,41,26, 23,40,41,26
126  * =============================================================================
127  */
__lsx_vdp2add_h_bu(__m128i in_c,__m128i in_h,__m128i in_l)128 static inline __m128i __lsx_vdp2add_h_bu(__m128i in_c, __m128i in_h,
129                                          __m128i in_l) {
130   __m128i out;
131 
132   out = __lsx_vmaddwev_h_bu(in_c, in_h, in_l);
133   out = __lsx_vmaddwod_h_bu(out, in_h, in_l);
134   return out;
135 }
136 
137 /*
138  * =============================================================================
139  * Description : Dot product & addition of byte vector elements
140  * Arguments   : Inputs  - in_c, in_h, in_l
141  *               Outputs - out
142  *               Return Type - halfword
143  * Details     : Unsigned byte elements from in_h are multiplied by
144  *               signed byte elements from in_l, and then added adjacent to
145  *               each other to get a result twice the size of input.
146  *               The results are added to signed half-word elements from in_c.
147  * Example     : out = __lsx_vdp2add_h_bu_b(in_c, in_h, in_l)
148  *        in_c : 1,1,1,1, 1,1,1,1
149  *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
150  *        in_l : -1,-2,-3,-4, -5,-6,-7,-8, 1,2,3,4, 5,6,7,8
151  *         out : -4,-24,-60,-112, 6,26,62,114
152  * =============================================================================
153  */
__lsx_vdp2add_h_bu_b(__m128i in_c,__m128i in_h,__m128i in_l)154 static inline __m128i __lsx_vdp2add_h_bu_b(__m128i in_c, __m128i in_h,
155                                            __m128i in_l) {
156   __m128i out;
157 
158   out = __lsx_vmaddwev_h_bu_b(in_c, in_h, in_l);
159   out = __lsx_vmaddwod_h_bu_b(out, in_h, in_l);
160   return out;
161 }
162 
163 /*
164  * =============================================================================
165  * Description : Dot product & addition of half-word vector elements
166  * Arguments   : Inputs  - in_c, in_h, in_l
167  *               Outputs - out
168  *               Return Type - __m128i
169  * Details     : Signed half-word elements from in_h are multiplied by
170  *               signed half-word elements from in_l, and then added adjacent to
171  *               each other to get a result twice the size of input.
172  *               Then the results are added to signed word elements from in_c.
173  * Example     : out = __lsx_vdp2add_h_b(in_c, in_h, in_l)
174  *        in_c : 1,2,3,4
175  *        in_h : 1,2,3,4, 5,6,7,8
176  *        in_l : 8,7,6,5, 4,3,2,1
177  *         out : 23,40,41,26
178  * =============================================================================
179  */
__lsx_vdp2add_w_h(__m128i in_c,__m128i in_h,__m128i in_l)180 static inline __m128i __lsx_vdp2add_w_h(__m128i in_c, __m128i in_h,
181                                         __m128i in_l) {
182   __m128i out;
183 
184   out = __lsx_vmaddwev_w_h(in_c, in_h, in_l);
185   out = __lsx_vmaddwod_w_h(out, in_h, in_l);
186   return out;
187 }
188 
189 /*
190  * =============================================================================
191  * Description : Dot product of byte vector elements
192  * Arguments   : Inputs  - in_h, in_l
193  *               Outputs - out
194  *               Return Type - halfword
195  * Details     : Signed byte elements from in_h are multiplied by
196  *               signed byte elements from in_l, and then added adjacent to
197  *               each other to get a result twice the size of input.
198  * Example     : out = __lsx_vdp2_h_b(in_h, in_l)
199  *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
200  *        in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
201  *         out : 22,38,38,22, 22,38,38,22
202  * =============================================================================
203  */
__lsx_vdp2_h_b(__m128i in_h,__m128i in_l)204 static inline __m128i __lsx_vdp2_h_b(__m128i in_h, __m128i in_l) {
205   __m128i out;
206 
207   out = __lsx_vmulwev_h_b(in_h, in_l);
208   out = __lsx_vmaddwod_h_b(out, in_h, in_l);
209   return out;
210 }
211 
212 /*
213  * =============================================================================
214  * Description : Dot product of byte vector elements
215  * Arguments   : Inputs  - in_h, in_l
216  *               Outputs - out
217  *               Return Type - halfword
218  * Details     : Unsigned byte elements from in_h are multiplied by
219  *               unsigned byte elements from in_l, and then added adjacent to
220  *               each other to get a result twice the size of input.
221  * Example     : out = __lsx_vdp2_h_bu(in_h, in_l)
222  *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
223  *        in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
224  *         out : 22,38,38,22, 22,38,38,22
225  * =============================================================================
226  */
__lsx_vdp2_h_bu(__m128i in_h,__m128i in_l)227 static inline __m128i __lsx_vdp2_h_bu(__m128i in_h, __m128i in_l) {
228   __m128i out;
229 
230   out = __lsx_vmulwev_h_bu(in_h, in_l);
231   out = __lsx_vmaddwod_h_bu(out, in_h, in_l);
232   return out;
233 }
234 
235 /*
236  * =============================================================================
237  * Description : Dot product of byte vector elements
238  * Arguments   : Inputs  - in_h, in_l
239  *               Outputs - out
240  *               Return Type - halfword
241  * Details     : Unsigned byte elements from in_h are multiplied by
242  *               signed byte elements from in_l, and then added adjacent to
243  *               each other to get a result twice the size of input.
244  * Example     : out = __lsx_vdp2_h_bu_b(in_h, in_l)
245  *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
246  *        in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,-1
247  *         out : 22,38,38,22, 22,38,38,6
248  * =============================================================================
249  */
__lsx_vdp2_h_bu_b(__m128i in_h,__m128i in_l)250 static inline __m128i __lsx_vdp2_h_bu_b(__m128i in_h, __m128i in_l) {
251   __m128i out;
252 
253   out = __lsx_vmulwev_h_bu_b(in_h, in_l);
254   out = __lsx_vmaddwod_h_bu_b(out, in_h, in_l);
255   return out;
256 }
257 
258 /*
259  * =============================================================================
260  * Description : Dot product of byte vector elements
261  * Arguments   : Inputs  - in_h, in_l
262  *               Outputs - out
263  *               Return Type - halfword
264  * Details     : Signed byte elements from in_h are multiplied by
265  *               signed byte elements from in_l, and then added adjacent to
266  *               each other to get a result twice the size of input.
267  * Example     : out = __lsx_vdp2_w_h(in_h, in_l)
268  *        in_h : 1,2,3,4, 5,6,7,8
269  *        in_l : 8,7,6,5, 4,3,2,1
270  *         out : 22,38,38,22
271  * =============================================================================
272  */
__lsx_vdp2_w_h(__m128i in_h,__m128i in_l)273 static inline __m128i __lsx_vdp2_w_h(__m128i in_h, __m128i in_l) {
274   __m128i out;
275 
276   out = __lsx_vmulwev_w_h(in_h, in_l);
277   out = __lsx_vmaddwod_w_h(out, in_h, in_l);
278   return out;
279 }
280 
281 /*
282  * =============================================================================
283  * Description : Dot product of byte vector elements
284  * Arguments   : Inputs  - in_h, in_l
285  *               Outputs - out
286  *               Return Type - double
287  * Details     : Signed byte elements from in_h are multiplied by
288  *               signed byte elements from in_l, and then added adjacent to
289  *               each other to get a result twice the size of input.
290  * Example     : out = __lsx_vdp2_d_w(in_h, in_l)
291  *        in_h : 1,2,3,4
292  *        in_l : 8,7,6,5
293  *         out : 22,38
294  * =============================================================================
295  */
__lsx_vdp2_d_w(__m128i in_h,__m128i in_l)296 static inline __m128i __lsx_vdp2_d_w(__m128i in_h, __m128i in_l) {
297   __m128i out;
298 
299   out = __lsx_vmulwev_d_w(in_h, in_l);
300   out = __lsx_vmaddwod_d_w(out, in_h, in_l);
301   return out;
302 }
303 
304 /*
305  * =============================================================================
306  * Description : Clip all halfword elements of input vector between min & max
307  *               out = ((_in) < (min)) ? (min) : (((_in) > (max)) ? (max) :
308  *               (_in))
309  * Arguments   : Inputs  - _in  (input vector)
310  *                       - min  (min threshold)
311  *                       - max  (max threshold)
312  *               Outputs - out  (output vector with clipped elements)
313  *               Return Type - signed halfword
314  * Example     : out = __lsx_vclip_h(_in)
315  *         _in : -8,2,280,249, -8,255,280,249
316  *         min : 1,1,1,1, 1,1,1,1
317  *         max : 9,9,9,9, 9,9,9,9
318  *         out : 1,2,9,9, 1,9,9,9
319  * =============================================================================
320  */
__lsx_vclip_h(__m128i _in,__m128i min,__m128i max)321 static inline __m128i __lsx_vclip_h(__m128i _in, __m128i min, __m128i max) {
322   __m128i out;
323 
324   out = __lsx_vmax_h(min, _in);
325   out = __lsx_vmin_h(max, out);
326   return out;
327 }
328 
329 /*
330  * =============================================================================
331  * Description : Set each element of vector between 0 and 255
332  * Arguments   : Inputs  - _in
333  *               Outputs - out
334  *               Return Type - halfword
335  * Details     : Signed byte elements from _in are clamped between 0 and 255.
336  * Example     : out = __lsx_vclip255_h(_in)
337  *         _in : -8,255,280,249, -8,255,280,249
338  *         out : 0,255,255,249, 0,255,255,249
339  * =============================================================================
340  */
__lsx_vclip255_h(__m128i _in)341 static inline __m128i __lsx_vclip255_h(__m128i _in) {
342   __m128i out;
343 
344   out = __lsx_vmaxi_h(_in, 0);
345   out = __lsx_vsat_hu(out, 7);
346   return out;
347 }
348 
349 /*
350  * =============================================================================
351  * Description : Set each element of vector between 0 and 255
352  * Arguments   : Inputs  - _in
353  *               Outputs - out
354  *               Return Type - word
355  * Details     : Signed byte elements from _in are clamped between 0 and 255.
356  * Example     : out = __lsx_vclip255_w(_in)
357  *         _in : -8,255,280,249
358  *         out : 0,255,255,249
359  * =============================================================================
360  */
__lsx_vclip255_w(__m128i _in)361 static inline __m128i __lsx_vclip255_w(__m128i _in) {
362   __m128i out;
363 
364   out = __lsx_vmaxi_w(_in, 0);
365   out = __lsx_vsat_wu(out, 7);
366   return out;
367 }
368 
/*
 * =============================================================================
 * Description : Swap two variables
 * Arguments   : Inputs  - _in0, _in1
 *               Outputs - _in0, _in1 (in-place)
 * Details     : Swapping of two input variables using xor.
 *               Both arguments must be distinct, assignable variables:
 *               passing the same variable twice clears it, because the
 *               first xor yields zero.
 * Example     : LSX_SWAP(_in0, _in1)
 *        _in0 : 1,2,3,4
 *        _in1 : 5,6,7,8
 *   _in0(out) : 5,6,7,8
 *   _in1(out) : 1,2,3,4
 * =============================================================================
 */
#define LSX_SWAP(_in0, _in1)         \
  {                                  \
    _in0 = __lsx_vxor_v(_in0, _in1); \
    _in1 = __lsx_vxor_v(_in0, _in1); \
    _in0 = __lsx_vxor_v(_in0, _in1); \
  }
388 
/*
 * =============================================================================
 * Description : Transpose 4x4 block with word elements in vectors
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3
 *               Outputs - _out0, _out1, _out2, _out3
 * Details     : Rows are first interleaved pairwise at word granularity,
 *               then the intermediate results are interleaved at double-word
 *               granularity.
 * Example     :
 *               1, 2, 3, 4            1, 5, 9,13
 *               5, 6, 7, 8    to      2, 6,10,14
 *               9,10,11,12  =====>    3, 7,11,15
 *              13,14,15,16            4, 8,12,16
 * =============================================================================
 */
#define LSX_TRANSPOSE4x4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  {                                                                            \
    __m128i _t0, _t1, _t2, _t3;                                                \
                                                                               \
    _t0 = __lsx_vilvl_w(_in1, _in0);                                           \
    _t1 = __lsx_vilvh_w(_in1, _in0);                                           \
    _t2 = __lsx_vilvl_w(_in3, _in2);                                           \
    _t3 = __lsx_vilvh_w(_in3, _in2);                                           \
    _out0 = __lsx_vilvl_d(_t2, _t0);                                           \
    _out1 = __lsx_vilvh_d(_t2, _t0);                                           \
    _out2 = __lsx_vilvl_d(_t3, _t1);                                           \
    _out3 = __lsx_vilvh_d(_t3, _t1);                                           \
  }
415 
/*
 * =============================================================================
 * Description : Transpose 8x8 block with byte elements in vectors
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
 *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
 *               _out7
 * Details     : The rows of the matrix become columns, and the columns
 *               become rows.
 *               NOTE(review): the even-numbered outputs are produced by a
 *               word interleave and do not appear to have their upper
 *               8 bytes cleared, unlike the example below - confirm before
 *               relying on those bytes.
 * Example     : LSX_TRANSPOSE8x8_B
 *        _in0 : 00,01,02,03,04,05,06,07, 00,00,00,00,00,00,00,00
 *        _in1 : 10,11,12,13,14,15,16,17, 00,00,00,00,00,00,00,00
 *        _in2 : 20,21,22,23,24,25,26,27, 00,00,00,00,00,00,00,00
 *        _in3 : 30,31,32,33,34,35,36,37, 00,00,00,00,00,00,00,00
 *        _in4 : 40,41,42,43,44,45,46,47, 00,00,00,00,00,00,00,00
 *        _in5 : 50,51,52,53,54,55,56,57, 00,00,00,00,00,00,00,00
 *        _in6 : 60,61,62,63,64,65,66,67, 00,00,00,00,00,00,00,00
 *        _in7 : 70,71,72,73,74,75,76,77, 00,00,00,00,00,00,00,00
 *
 *       _out0 : 00,10,20,30,40,50,60,70, 00,00,00,00,00,00,00,00
 *       _out1 : 01,11,21,31,41,51,61,71, 00,00,00,00,00,00,00,00
 *       _out2 : 02,12,22,32,42,52,62,72, 00,00,00,00,00,00,00,00
 *       _out3 : 03,13,23,33,43,53,63,73, 00,00,00,00,00,00,00,00
 *       _out4 : 04,14,24,34,44,54,64,74, 00,00,00,00,00,00,00,00
 *       _out5 : 05,15,25,35,45,55,65,75, 00,00,00,00,00,00,00,00
 *       _out6 : 06,16,26,36,46,56,66,76, 00,00,00,00,00,00,00,00
 *       _out7 : 07,17,27,37,47,57,67,77, 00,00,00,00,00,00,00,00
 * =============================================================================
 */
#define LSX_TRANSPOSE8x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                           _out7)                                           \
  {                                                                         \
    /* Locals carry a macro-private prefix/suffix so that caller variables  \
     * such as `zero` cannot be captured by this block. */                  \
    __m128i _zero_m = { 0 };                                                \
    __m128i _shuf8_m = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };          \
    __m128i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7;                         \
                                                                            \
    _t0 = __lsx_vilvl_b(_in2, _in0);                                        \
    _t1 = __lsx_vilvl_b(_in3, _in1);                                        \
    _t2 = __lsx_vilvl_b(_in6, _in4);                                        \
    _t3 = __lsx_vilvl_b(_in7, _in5);                                        \
    _t4 = __lsx_vilvl_b(_t1, _t0);                                          \
    _t5 = __lsx_vilvh_b(_t1, _t0);                                          \
    _t6 = __lsx_vilvl_b(_t3, _t2);                                          \
    _t7 = __lsx_vilvh_b(_t3, _t2);                                          \
    _out0 = __lsx_vilvl_w(_t6, _t4);                                        \
    _out2 = __lsx_vilvh_w(_t6, _t4);                                        \
    _out4 = __lsx_vilvl_w(_t7, _t5);                                        \
    _out6 = __lsx_vilvh_w(_t7, _t5);                                        \
    /* Odd outputs are derived from the even output computed just before. */ \
    _out1 = __lsx_vshuf_b(_zero_m, _out0, _shuf8_m);                        \
    _out3 = __lsx_vshuf_b(_zero_m, _out2, _shuf8_m);                        \
    _out5 = __lsx_vshuf_b(_zero_m, _out4, _shuf8_m);                        \
    _out7 = __lsx_vshuf_b(_zero_m, _out6, _shuf8_m);                        \
  }
469 
/*
 * =============================================================================
 * Description : Transpose 8x8 block with half-word elements in vectors
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
 *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
 *               _out7
 * Details     : The rows of the matrix become columns, and the columns
 *               become rows.
 * Example     :
 *              00,01,02,03,04,05,06,07           00,10,20,30,40,50,60,70
 *              10,11,12,13,14,15,16,17           01,11,21,31,41,51,61,71
 *              20,21,22,23,24,25,26,27           02,12,22,32,42,52,62,72
 *              30,31,32,33,34,35,36,37    to     03,13,23,33,43,53,63,73
 *              40,41,42,43,44,45,46,47  ======>  04,14,24,34,44,54,64,74
 *              50,51,52,53,54,55,56,57           05,15,25,35,45,55,65,75
 *              60,61,62,63,64,65,66,67           06,16,26,36,46,56,66,76
 *              70,71,72,73,74,75,76,77           07,17,27,37,47,57,67,77
 * =============================================================================
 */
#define LSX_TRANSPOSE8x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                           _out7)                                           \
  {                                                                         \
    __m128i _s0, _s1, _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7;               \
                                                                            \
    /* Rows 4..7: interleave at half-word granularity into _t0.._t3. */     \
    _s0 = __lsx_vilvl_h(_in6, _in4);                                        \
    _s1 = __lsx_vilvl_h(_in7, _in5);                                        \
    _t0 = __lsx_vilvl_h(_s1, _s0);                                          \
    _t1 = __lsx_vilvh_h(_s1, _s0);                                          \
    _s0 = __lsx_vilvh_h(_in6, _in4);                                        \
    _s1 = __lsx_vilvh_h(_in7, _in5);                                        \
    _t2 = __lsx_vilvl_h(_s1, _s0);                                          \
    _t3 = __lsx_vilvh_h(_s1, _s0);                                          \
    /* Rows 0..3: same interleave pattern into _t4.._t7. */                 \
    _s0 = __lsx_vilvl_h(_in2, _in0);                                        \
    _s1 = __lsx_vilvl_h(_in3, _in1);                                        \
    _t4 = __lsx_vilvl_h(_s1, _s0);                                          \
    _t5 = __lsx_vilvh_h(_s1, _s0);                                          \
    _s0 = __lsx_vilvh_h(_in2, _in0);                                        \
    _s1 = __lsx_vilvh_h(_in3, _in1);                                        \
    _t6 = __lsx_vilvl_h(_s1, _s0);                                          \
    _t7 = __lsx_vilvh_h(_s1, _s0);                                          \
                                                                            \
    /* Combine the two halves at double-word granularity. */                \
    _out0 = __lsx_vpickev_d(_t0, _t4);                                      \
    _out2 = __lsx_vpickev_d(_t1, _t5);                                      \
    _out4 = __lsx_vpickev_d(_t2, _t6);                                      \
    _out6 = __lsx_vpickev_d(_t3, _t7);                                      \
    _out1 = __lsx_vpickod_d(_t0, _t4);                                      \
    _out3 = __lsx_vpickod_d(_t1, _t5);                                      \
    _out5 = __lsx_vpickod_d(_t2, _t6);                                      \
    _out7 = __lsx_vpickod_d(_t3, _t7);                                      \
  }
519 
/*
 * =============================================================================
 * Description : Transpose input 8x4 byte block into 4x8
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
 *                         (input 8x4 byte block, one row per vector)
 *               Outputs - _out0, _out1, _out2, _out3  (output 4x8 byte block)
 * Details     : The rows of the matrix become columns, and the columns become
 *               rows.
 *               NOTE(review): the outputs are produced by word / double-word
 *               interleaves and their upper 8 bytes do not appear to be
 *               cleared, unlike the example below - confirm before relying
 *               on those bytes.
 * Example     : LSX_TRANSPOSE8x4_B
 *        _in0 : 00,01,02,03,00,00,00,00, 00,00,00,00,00,00,00,00
 *        _in1 : 10,11,12,13,00,00,00,00, 00,00,00,00,00,00,00,00
 *        _in2 : 20,21,22,23,00,00,00,00, 00,00,00,00,00,00,00,00
 *        _in3 : 30,31,32,33,00,00,00,00, 00,00,00,00,00,00,00,00
 *        _in4 : 40,41,42,43,00,00,00,00, 00,00,00,00,00,00,00,00
 *        _in5 : 50,51,52,53,00,00,00,00, 00,00,00,00,00,00,00,00
 *        _in6 : 60,61,62,63,00,00,00,00, 00,00,00,00,00,00,00,00
 *        _in7 : 70,71,72,73,00,00,00,00, 00,00,00,00,00,00,00,00
 *
 *       _out0 : 00,10,20,30,40,50,60,70, 00,00,00,00,00,00,00,00
 *       _out1 : 01,11,21,31,41,51,61,71, 00,00,00,00,00,00,00,00
 *       _out2 : 02,12,22,32,42,52,62,72, 00,00,00,00,00,00,00,00
 *       _out3 : 03,13,23,33,43,53,63,73, 00,00,00,00,00,00,00,00
 * =============================================================================
 */
#define LSX_TRANSPOSE8x4_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                           _out0, _out1, _out2, _out3)                     \
  {                                                                        \
    __m128i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m;                            \
                                                                           \
    _tmp0_m = __lsx_vpackev_w(_in4, _in0);                                 \
    _tmp1_m = __lsx_vpackev_w(_in5, _in1);                                 \
    _tmp2_m = __lsx_vilvl_b(_tmp1_m, _tmp0_m);                             \
    _tmp0_m = __lsx_vpackev_w(_in6, _in2);                                 \
    _tmp1_m = __lsx_vpackev_w(_in7, _in3);                                 \
                                                                           \
    _tmp3_m = __lsx_vilvl_b(_tmp1_m, _tmp0_m);                             \
    _tmp0_m = __lsx_vilvl_h(_tmp3_m, _tmp2_m);                             \
    _tmp1_m = __lsx_vilvh_h(_tmp3_m, _tmp2_m);                             \
                                                                           \
    _out0 = __lsx_vilvl_w(_tmp1_m, _tmp0_m);                               \
    _out2 = __lsx_vilvh_w(_tmp1_m, _tmp0_m);                               \
    /* _out1/_out3 are taken from the high double-words of _out0/_out2. */ \
    _out1 = __lsx_vilvh_d(_out2, _out0);                                   \
    _out3 = __lsx_vilvh_d(_out0, _out2);                                   \
  }
564 
/*
 * =============================================================================
 * Description : Transpose 16x8 block with byte elements in vectors
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,
 *                         _in8, _in9, _in10, _in11, _in12, _in13, _in14,
 *                         _in15
 *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
 *                         _out7
 * Details     : The rows of the matrix become columns, and the columns
 *               become rows.
 * Example     :
 *              000,001,002,003,004,005,006,007
 *              008,009,010,011,012,013,014,015
 *              016,017,018,019,020,021,022,023
 *              024,025,026,027,028,029,030,031
 *              032,033,034,035,036,037,038,039
 *              040,041,042,043,044,045,046,047        000,008,...,112,120
 *              048,049,050,051,052,053,054,055        001,009,...,113,121
 *              056,057,058,059,060,061,062,063   to   002,010,...,114,122
 *              064,065,066,067,068,069,070,071 =====> 003,011,...,115,123
 *              072,073,074,075,076,077,078,079        004,012,...,116,124
 *              080,081,082,083,084,085,086,087        005,013,...,117,125
 *              088,089,090,091,092,093,094,095        006,014,...,118,126
 *              096,097,098,099,100,101,102,103        007,015,...,119,127
 *              104,105,106,107,108,109,110,111
 *              112,113,114,115,116,117,118,119
 *              120,121,122,123,124,125,126,127
 * =============================================================================
 */
#define LSX_TRANSPOSE16x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                            _in8, _in9, _in10, _in11, _in12, _in13, _in14,   \
                            _in15, _out0, _out1, _out2, _out3, _out4, _out5, \
                            _out6, _out7)                                    \
  {                                                                          \
    __m128i _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7;          \
    __m128i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7;                          \
    /* Stage 1: byte interleave of row pairs (rows 0..7, then rows 8..15). */ \
    DUP4_ARG2(__lsx_vilvl_b, _in2, _in0, _in3, _in1, _in6, _in4, _in7, _in5, \
              _tmp0, _tmp1, _tmp2, _tmp3);                                   \
    DUP4_ARG2(__lsx_vilvl_b, _in10, _in8, _in11, _in9, _in14, _in12, _in15,  \
              _in13, _tmp4, _tmp5, _tmp6, _tmp7);                            \
    /* Stage 2: second byte interleave of the stage-1 results. */            \
    DUP2_ARG2(__lsx_vilvl_b, _tmp1, _tmp0, _tmp3, _tmp2, _t0, _t2);          \
    DUP2_ARG2(__lsx_vilvh_b, _tmp1, _tmp0, _tmp3, _tmp2, _t1, _t3);          \
    DUP2_ARG2(__lsx_vilvl_b, _tmp5, _tmp4, _tmp7, _tmp6, _t4, _t6);          \
    DUP2_ARG2(__lsx_vilvh_b, _tmp5, _tmp4, _tmp7, _tmp6, _t5, _t7);          \
    /* Stage 3: word interleave. */                                          \
    DUP2_ARG2(__lsx_vilvl_w, _t2, _t0, _t3, _t1, _tmp0, _tmp4);              \
    DUP2_ARG2(__lsx_vilvh_w, _t2, _t0, _t3, _t1, _tmp2, _tmp6);              \
    DUP2_ARG2(__lsx_vilvl_w, _t6, _t4, _t7, _t5, _tmp1, _tmp5);              \
    DUP2_ARG2(__lsx_vilvh_w, _t6, _t4, _t7, _t5, _tmp3, _tmp7);              \
    /* Stage 4: double-word interleave into the eight outputs. */            \
    DUP2_ARG2(__lsx_vilvl_d, _tmp1, _tmp0, _tmp3, _tmp2, _out0, _out2);      \
    DUP2_ARG2(__lsx_vilvh_d, _tmp1, _tmp0, _tmp3, _tmp2, _out1, _out3);      \
    DUP2_ARG2(__lsx_vilvl_d, _tmp5, _tmp4, _tmp7, _tmp6, _out4, _out6);      \
    DUP2_ARG2(__lsx_vilvh_d, _tmp5, _tmp4, _tmp7, _tmp6, _out5, _out7);      \
  }
615 
/*
 * =============================================================================
 * Description : Butterfly of 4 input vectors
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3
 *               Outputs - _out0, _out1, _out2, _out3
 * Details     : Sums and differences of the inputs taken in mirrored pairs
 *               (first with last, second with third). The _B, _H, _W and _D
 *               variants differ only in the add/sub intrinsic suffix used.
 *               Outputs are assigned in the order _out0 .. _out3, so an
 *               output may only reuse an input variable that no later
 *               assignment still reads.
 * Example     :
 *               _out0 = _in0 + _in3;
 *               _out1 = _in1 + _in2;
 *               _out2 = _in1 - _in2;
 *               _out3 = _in0 - _in3;
 * =============================================================================
 */
#define LSX_BUTTERFLY_4_B(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  {                                                                           \
    DUP2_ARG2(__lsx_vadd_b, _in0, _in3, _in1, _in2, _out0, _out1);            \
    DUP2_ARG2(__lsx_vsub_b, _in1, _in2, _in0, _in3, _out2, _out3);            \
  }
/* 4-input butterfly using __lsx_vadd_h / __lsx_vsub_h. */
#define LSX_BUTTERFLY_4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  {                                                                           \
    DUP2_ARG2(__lsx_vadd_h, _in0, _in3, _in1, _in2, _out0, _out1);            \
    DUP2_ARG2(__lsx_vsub_h, _in1, _in2, _in0, _in3, _out2, _out3);            \
  }
/* 4-input butterfly using __lsx_vadd_w / __lsx_vsub_w. */
#define LSX_BUTTERFLY_4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  {                                                                           \
    DUP2_ARG2(__lsx_vadd_w, _in0, _in3, _in1, _in2, _out0, _out1);            \
    DUP2_ARG2(__lsx_vsub_w, _in1, _in2, _in0, _in3, _out2, _out3);            \
  }
/* 4-input butterfly using __lsx_vadd_d / __lsx_vsub_d. */
#define LSX_BUTTERFLY_4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  {                                                                           \
    DUP2_ARG2(__lsx_vadd_d, _in0, _in3, _in1, _in2, _out0, _out1);            \
    DUP2_ARG2(__lsx_vsub_d, _in1, _in2, _in0, _in3, _out2, _out3);            \
  }
657 
/*
 * =============================================================================
 * Description : Butterfly of 8 input vectors
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
 *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
 *                         _out7
 * Details     : Sums and differences of the inputs taken in mirrored pairs
 *               (_in0 with _in7, _in1 with _in6, ...). The _B, _H, _W and _D
 *               variants differ only in the add/sub intrinsic suffix used.
 *               Outputs are assigned in the order _out0 .. _out7, so an
 *               output may only reuse an input variable that no later
 *               assignment still reads.
 * Example     :
 *              _out0 = _in0 + _in7;
 *              _out1 = _in1 + _in6;
 *              _out2 = _in2 + _in5;
 *              _out3 = _in3 + _in4;
 *              _out4 = _in3 - _in4;
 *              _out5 = _in2 - _in5;
 *              _out6 = _in1 - _in6;
 *              _out7 = _in0 - _in7;
 * =============================================================================
 */
#define LSX_BUTTERFLY_8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,    \
                          _out0, _out1, _out2, _out3, _out4, _out5, _out6,   \
                          _out7)                                             \
  {                                                                          \
    DUP4_ARG2(__lsx_vadd_b, _in0, _in7, _in1, _in6, _in2, _in5, _in3, _in4,  \
              _out0, _out1, _out2, _out3);                                   \
    DUP4_ARG2(__lsx_vsub_b, _in3, _in4, _in2, _in5, _in1, _in6, _in0, _in7,  \
              _out4, _out5, _out6, _out7);                                   \
  }
688 
/* 8-input butterfly using __lsx_vadd_h / __lsx_vsub_h. */
#define LSX_BUTTERFLY_8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,    \
                          _out0, _out1, _out2, _out3, _out4, _out5, _out6,   \
                          _out7)                                             \
  {                                                                          \
    DUP4_ARG2(__lsx_vadd_h, _in0, _in7, _in1, _in6, _in2, _in5, _in3, _in4,  \
              _out0, _out1, _out2, _out3);                                   \
    DUP4_ARG2(__lsx_vsub_h, _in3, _in4, _in2, _in5, _in1, _in6, _in0, _in7,  \
              _out4, _out5, _out6, _out7);                                   \
  }
702 
/* 8-input butterfly using __lsx_vadd_w / __lsx_vsub_w. */
#define LSX_BUTTERFLY_8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,    \
                          _out0, _out1, _out2, _out3, _out4, _out5, _out6,   \
                          _out7)                                             \
  {                                                                          \
    DUP4_ARG2(__lsx_vadd_w, _in0, _in7, _in1, _in6, _in2, _in5, _in3, _in4,  \
              _out0, _out1, _out2, _out3);                                   \
    DUP4_ARG2(__lsx_vsub_w, _in3, _in4, _in2, _in5, _in1, _in6, _in0, _in7,  \
              _out4, _out5, _out6, _out7);                                   \
  }
716 
/* 8-input butterfly using __lsx_vadd_d / __lsx_vsub_d. */
#define LSX_BUTTERFLY_8_D(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,    \
                          _out0, _out1, _out2, _out3, _out4, _out5, _out6,   \
                          _out7)                                             \
  {                                                                          \
    DUP4_ARG2(__lsx_vadd_d, _in0, _in7, _in1, _in6, _in2, _in5, _in3, _in4,  \
              _out0, _out1, _out2, _out3);                                   \
    DUP4_ARG2(__lsx_vsub_d, _in3, _in4, _in2, _in5, _in1, _in6, _in0, _in7,  \
              _out4, _out5, _out6, _out7);                                   \
  }
730 
/*
 * =============================================================================
 * Description : Butterfly of 16 input vectors
 * Arguments   : Inputs  - _in0, _in1, _in2, ..., _in15
 *               Outputs - _out0, _out1, _out2, ..., _out15
 * Details     : Sums and differences of the inputs taken in mirrored pairs
 *               (_in0 with _in15, _in1 with _in14, ...). The _B, _H, _W and
 *               _D variants differ only in the add/sub intrinsic suffix used.
 *               Outputs are assigned in the order _out0 .. _out15, so an
 *               output may only reuse an input variable that no later
 *               assignment still reads.
 * Example     :
 *              _out0 = _in0 + _in15;
 *              _out1 = _in1 + _in14;
 *              _out2 = _in2 + _in13;
 *              _out3 = _in3 + _in12;
 *              _out4 = _in4 + _in11;
 *              _out5 = _in5 + _in10;
 *              _out6 = _in6 + _in9;
 *              _out7 = _in7 + _in8;
 *              _out8 = _in7 - _in8;
 *              _out9 = _in6 - _in9;
 *              _out10 = _in5 - _in10;
 *              _out11 = _in4 - _in11;
 *              _out12 = _in3 - _in12;
 *              _out13 = _in2 - _in13;
 *              _out14 = _in1 - _in14;
 *              _out15 = _in0 - _in15;
 * =============================================================================
 */

#define LSX_BUTTERFLY_16_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,     \
                           _in8, _in9, _in10, _in11, _in12, _in13, _in14,      \
                           _in15, _out0, _out1, _out2, _out3, _out4, _out5,    \
                           _out6, _out7, _out8, _out9, _out10, _out11, _out12, \
                           _out13, _out14, _out15)                             \
  {                                                                            \
    DUP4_ARG2(__lsx_vadd_b, _in0, _in15, _in1, _in14, _in2, _in13, _in3,       \
              _in12, _out0, _out1, _out2, _out3);                              \
    DUP4_ARG2(__lsx_vadd_b, _in4, _in11, _in5, _in10, _in6, _in9, _in7, _in8,  \
              _out4, _out5, _out6, _out7);                                     \
    DUP4_ARG2(__lsx_vsub_b, _in7, _in8, _in6, _in9, _in5, _in10, _in4, _in11,  \
              _out8, _out9, _out10, _out11);                                   \
    DUP4_ARG2(__lsx_vsub_b, _in3, _in12, _in2, _in13, _in1, _in14, _in0,       \
              _in15, _out12, _out13, _out14, _out15);                          \
  }
781 
/* 16-input butterfly using __lsx_vadd_h / __lsx_vsub_h. */
#define LSX_BUTTERFLY_16_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,     \
                           _in8, _in9, _in10, _in11, _in12, _in13, _in14,      \
                           _in15, _out0, _out1, _out2, _out3, _out4, _out5,    \
                           _out6, _out7, _out8, _out9, _out10, _out11, _out12, \
                           _out13, _out14, _out15)                             \
  {                                                                            \
    DUP4_ARG2(__lsx_vadd_h, _in0, _in15, _in1, _in14, _in2, _in13, _in3,       \
              _in12, _out0, _out1, _out2, _out3);                              \
    DUP4_ARG2(__lsx_vadd_h, _in4, _in11, _in5, _in10, _in6, _in9, _in7, _in8,  \
              _out4, _out5, _out6, _out7);                                     \
    DUP4_ARG2(__lsx_vsub_h, _in7, _in8, _in6, _in9, _in5, _in10, _in4, _in11,  \
              _out8, _out9, _out10, _out11);                                   \
    DUP4_ARG2(__lsx_vsub_h, _in3, _in12, _in2, _in13, _in1, _in14, _in0,       \
              _in15, _out12, _out13, _out14, _out15);                          \
  }
806 
/* 16-input butterfly using __lsx_vadd_w / __lsx_vsub_w. */
#define LSX_BUTTERFLY_16_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,     \
                           _in8, _in9, _in10, _in11, _in12, _in13, _in14,      \
                           _in15, _out0, _out1, _out2, _out3, _out4, _out5,    \
                           _out6, _out7, _out8, _out9, _out10, _out11, _out12, \
                           _out13, _out14, _out15)                             \
  {                                                                            \
    DUP4_ARG2(__lsx_vadd_w, _in0, _in15, _in1, _in14, _in2, _in13, _in3,       \
              _in12, _out0, _out1, _out2, _out3);                              \
    DUP4_ARG2(__lsx_vadd_w, _in4, _in11, _in5, _in10, _in6, _in9, _in7, _in8,  \
              _out4, _out5, _out6, _out7);                                     \
    DUP4_ARG2(__lsx_vsub_w, _in7, _in8, _in6, _in9, _in5, _in10, _in4, _in11,  \
              _out8, _out9, _out10, _out11);                                   \
    DUP4_ARG2(__lsx_vsub_w, _in3, _in12, _in2, _in13, _in1, _in14, _in0,       \
              _in15, _out12, _out13, _out14, _out15);                          \
  }
831 
/* 16-input butterfly using __lsx_vadd_d / __lsx_vsub_d. */
#define LSX_BUTTERFLY_16_D(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,     \
                           _in8, _in9, _in10, _in11, _in12, _in13, _in14,      \
                           _in15, _out0, _out1, _out2, _out3, _out4, _out5,    \
                           _out6, _out7, _out8, _out9, _out10, _out11, _out12, \
                           _out13, _out14, _out15)                             \
  {                                                                            \
    DUP4_ARG2(__lsx_vadd_d, _in0, _in15, _in1, _in14, _in2, _in13, _in3,       \
              _in12, _out0, _out1, _out2, _out3);                              \
    DUP4_ARG2(__lsx_vadd_d, _in4, _in11, _in5, _in10, _in6, _in9, _in7, _in8,  \
              _out4, _out5, _out6, _out7);                                     \
    DUP4_ARG2(__lsx_vsub_d, _in7, _in8, _in6, _in9, _in5, _in10, _in4, _in11,  \
              _out8, _out9, _out10, _out11);                                   \
    DUP4_ARG2(__lsx_vsub_d, _in3, _in12, _in2, _in13, _in1, _in14, _in0,       \
              _in15, _out12, _out13, _out14, _out15);                          \
  }
856 
857 #endif  // LSX
858 
859 #ifdef __loongarch_asx
860 #include <lasxintrin.h>
861 /*
862  * =============================================================================
863  * Description : Dot product of byte vector elements
864  * Arguments   : Inputs - in_h, in_l
865  *               Output - out
866  *               Return Type - signed halfword
867  * Details     : Unsigned byte elements from in_h are multiplied with
868  *               unsigned byte elements from in_l producing a result
869  *               twice the size of input i.e. signed halfword.
870  *               Then these multiplied results of adjacent odd-even elements
871  *               are added to the out vector
872  * Example     : See out = __lasx_xvdp2_w_h(in_h, in_l)
873  * =============================================================================
874  */
__lasx_xvdp2_h_bu(__m256i in_h,__m256i in_l)875 static inline __m256i __lasx_xvdp2_h_bu(__m256i in_h, __m256i in_l) {
876   __m256i out;
877 
878   out = __lasx_xvmulwev_h_bu(in_h, in_l);
879   out = __lasx_xvmaddwod_h_bu(out, in_h, in_l);
880   return out;
881 }
882 
883 /*
884  * =============================================================================
885  * Description : Dot product of byte vector elements
886  * Arguments   : Inputs - in_h, in_l
887  *               Output - out
888  *               Return Type - signed halfword
889  * Details     : Signed byte elements from in_h are multiplied with
890  *               signed byte elements from in_l producing a result
891  *               twice the size of input i.e. signed halfword.
892  *               Then these multiplication results of adjacent odd-even elements
893  *               are added to the out vector
894  * Example     : See out = __lasx_xvdp2_w_h(in_h, in_l)
895  * =============================================================================
896  */
__lasx_xvdp2_h_b(__m256i in_h,__m256i in_l)897 static inline __m256i __lasx_xvdp2_h_b(__m256i in_h, __m256i in_l) {
898   __m256i out;
899 
900   out = __lasx_xvmulwev_h_b(in_h, in_l);
901   out = __lasx_xvmaddwod_h_b(out, in_h, in_l);
902   return out;
903 }
904 
905 /*
906  * =============================================================================
907  * Description : Dot product of halfword vector elements
908  * Arguments   : Inputs - in_h, in_l
909  *               Output - out
910  *               Return Type - signed word
911  * Details     : Signed halfword elements from in_h are multiplied with
912  *               signed halfword elements from in_l producing a result
913  *               twice the size of input i.e. signed word.
914  *               Then these multiplied results of adjacent odd-even elements
915  *               are added to the out vector.
916  * Example     : out = __lasx_xvdp2_w_h(in_h, in_l)
917  *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
918  *        in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
919  *         out : 22,38,38,22, 22,38,38,22
920  * =============================================================================
921  */
__lasx_xvdp2_w_h(__m256i in_h,__m256i in_l)922 static inline __m256i __lasx_xvdp2_w_h(__m256i in_h, __m256i in_l) {
923   __m256i out;
924 
925   out = __lasx_xvmulwev_w_h(in_h, in_l);
926   out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
927   return out;
928 }
929 
930 /*
931  * =============================================================================
932  * Description : Dot product of word vector elements
933  * Arguments   : Inputs - in_h, in_l
934  *               Output - out
935  *               Return Type - signed double
936  * Details     : Signed word elements from in_h are multiplied with
937  *               signed word elements from in_l producing a result
938  *               twice the size of input i.e. signed double-word.
939  *               Then these multiplied results of adjacent odd-even elements
940  *               are added to the out vector.
941  * Example     : See out = __lasx_xvdp2_w_h(in_h, in_l)
942  * =============================================================================
943  */
__lasx_xvdp2_d_w(__m256i in_h,__m256i in_l)944 static inline __m256i __lasx_xvdp2_d_w(__m256i in_h, __m256i in_l) {
945   __m256i out;
946 
947   out = __lasx_xvmulwev_d_w(in_h, in_l);
948   out = __lasx_xvmaddwod_d_w(out, in_h, in_l);
949   return out;
950 }
951 
952 /*
953  * =============================================================================
954  * Description : Dot product of halfword vector elements
955  * Arguments   : Inputs - in_h, in_l
956  *               Output - out
957  *               Return Type - signed word
958  * Details     : Unsigned halfword elements from in_h are multiplied with
959  *               signed halfword elements from in_l producing a result
960  *               twice the size of input i.e. unsigned word.
961  *               Multiplication result of adjacent odd-even elements
962  *               are added to the out vector
963  * Example     : See out = __lasx_xvdp2_w_h(in_h, in_l)
964  * =============================================================================
965  */
__lasx_xvdp2_w_hu_h(__m256i in_h,__m256i in_l)966 static inline __m256i __lasx_xvdp2_w_hu_h(__m256i in_h, __m256i in_l) {
967   __m256i out;
968 
969   out = __lasx_xvmulwev_w_hu_h(in_h, in_l);
970   out = __lasx_xvmaddwod_w_hu_h(out, in_h, in_l);
971   return out;
972 }
973 
974 /*
975  * =============================================================================
976  * Description : Dot product & addition of byte vector elements
977  * Arguments   : Inputs - in_h, in_l
978  *               Output - out
979  *               Return Type - halfword
980  * Details     : Signed byte elements from in_h are multiplied with
981  *               signed byte elements from in_l producing a result
982  *               twice the size of input i.e. signed halfword.
983  *               Then these multiplied results of adjacent odd-even elements
984  *               are added to the in_c vector.
985  * Example     : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
986  * =============================================================================
987  */
__lasx_xvdp2add_h_b(__m256i in_c,__m256i in_h,__m256i in_l)988 static inline __m256i __lasx_xvdp2add_h_b(__m256i in_c, __m256i in_h,
989                                           __m256i in_l) {
990   __m256i out;
991 
992   out = __lasx_xvmaddwev_h_b(in_c, in_h, in_l);
993   out = __lasx_xvmaddwod_h_b(out, in_h, in_l);
994   return out;
995 }
996 
997 /*
998  * =============================================================================
999  * Description : Dot product & addition of byte vector elements
1000  * Arguments   : Inputs - in_h, in_l
1001  *               Output - out
1002  *               Return Type - halfword
1003  * Details     : Unsigned byte elements from in_h are multiplied with
1004  *               unsigned byte elements from in_l producing a result
1005  *               twice the size of input i.e. signed halfword.
1006  *               Then these multiplied results of adjacent odd-even elements
1007  *               are added to the in_c vector.
1008  * Example     : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
1009  * =============================================================================
1010  */
__lasx_xvdp2add_h_bu(__m256i in_c,__m256i in_h,__m256i in_l)1011 static inline __m256i __lasx_xvdp2add_h_bu(__m256i in_c, __m256i in_h,
1012                                            __m256i in_l) {
1013   __m256i out;
1014 
1015   out = __lasx_xvmaddwev_h_bu(in_c, in_h, in_l);
1016   out = __lasx_xvmaddwod_h_bu(out, in_h, in_l);
1017   return out;
1018 }
1019 
1020 /*
1021  * =============================================================================
1022  * Description : Dot product & addition of byte vector elements
1023  * Arguments   : Inputs - in_h, in_l
1024  *               Output - out
1025  *               Return Type - halfword
1026  * Details     : Unsigned byte elements from in_h are multiplied with
1027  *               signed byte elements from in_l producing a result
1028  *               twice the size of input i.e. signed halfword.
1029  *               Then these multiplied results of adjacent odd-even elements
1030  *               are added to the in_c vector.
1031  * Example     : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
1032  * =============================================================================
1033  */
__lasx_xvdp2add_h_bu_b(__m256i in_c,__m256i in_h,__m256i in_l)1034 static inline __m256i __lasx_xvdp2add_h_bu_b(__m256i in_c, __m256i in_h,
1035                                              __m256i in_l) {
1036   __m256i out;
1037 
1038   out = __lasx_xvmaddwev_h_bu_b(in_c, in_h, in_l);
1039   out = __lasx_xvmaddwod_h_bu_b(out, in_h, in_l);
1040   return out;
1041 }
1042 
1043 /*
1044  * =============================================================================
1045  * Description : Dot product of halfword vector elements
1046  * Arguments   : Inputs - in_c, in_h, in_l
1047  *               Output - out
1048  *               Return Type - per RTYPE
1049  * Details     : Signed halfword elements from in_h are multiplied with
1050  *               signed halfword elements from in_l producing a result
1051  *               twice the size of input i.e. signed word.
1052  *               Multiplication result of adjacent odd-even elements
1053  *               are added to the in_c vector.
1054  * Example     : out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
1055  *        in_c : 1,2,3,4, 1,2,3,4
1056  *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8,
1057  *        in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1,
1058  *         out : 23,40,41,26, 23,40,41,26
1059  * =============================================================================
1060  */
__lasx_xvdp2add_w_h(__m256i in_c,__m256i in_h,__m256i in_l)1061 static inline __m256i __lasx_xvdp2add_w_h(__m256i in_c, __m256i in_h,
1062                                           __m256i in_l) {
1063   __m256i out;
1064 
1065   out = __lasx_xvmaddwev_w_h(in_c, in_h, in_l);
1066   out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
1067   return out;
1068 }
1069 
1070 /*
1071  * =============================================================================
1072  * Description : Dot product of halfword vector elements
1073  * Arguments   : Inputs - in_c, in_h, in_l
1074  *               Output - out
1075  *               Return Type - signed word
1076  * Details     : Unsigned halfword elements from in_h are multiplied with
1077  *               unsigned halfword elements from in_l producing a result
1078  *               twice the size of input i.e. signed word.
1079  *               Multiplication result of adjacent odd-even elements
1080  *               are added to the in_c vector.
1081  * Example     : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
1082  * =============================================================================
1083  */
__lasx_xvdp2add_w_hu(__m256i in_c,__m256i in_h,__m256i in_l)1084 static inline __m256i __lasx_xvdp2add_w_hu(__m256i in_c, __m256i in_h,
1085                                            __m256i in_l) {
1086   __m256i out;
1087 
1088   out = __lasx_xvmaddwev_w_hu(in_c, in_h, in_l);
1089   out = __lasx_xvmaddwod_w_hu(out, in_h, in_l);
1090   return out;
1091 }
1092 
1093 /*
1094  * =============================================================================
1095  * Description : Dot product of halfword vector elements
1096  * Arguments   : Inputs - in_c, in_h, in_l
1097  *               Output - out
1098  *               Return Type - signed word
1099  * Details     : Unsigned halfword elements from in_h are multiplied with
1100  *               signed halfword elements from in_l producing a result
1101  *               twice the size of input i.e. signed word.
1102  *               Multiplication result of adjacent odd-even elements
1103  *               are added to the in_c vector
1104  * Example     : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
1105  * =============================================================================
1106  */
__lasx_xvdp2add_w_hu_h(__m256i in_c,__m256i in_h,__m256i in_l)1107 static inline __m256i __lasx_xvdp2add_w_hu_h(__m256i in_c, __m256i in_h,
1108                                              __m256i in_l) {
1109   __m256i out;
1110 
1111   out = __lasx_xvmaddwev_w_hu_h(in_c, in_h, in_l);
1112   out = __lasx_xvmaddwod_w_hu_h(out, in_h, in_l);
1113   return out;
1114 }
1115 
1116 /*
1117  * =============================================================================
1118  * Description : Vector Unsigned Dot Product and Subtract
1119  * Arguments   : Inputs - in_c, in_h, in_l
1120  *               Output - out
1121  *               Return Type - signed halfword
1122  * Details     : Unsigned byte elements from in_h are multiplied with
1123  *               unsigned byte elements from in_l producing a result
1124  *               twice the size of input i.e. signed halfword.
1125  *               Multiplication result of adjacent odd-even elements
1126  *               are added together and subtracted from double width elements
1127  *               in_c vector.
1128  * Example     : See out = __lasx_xvdp2sub_w_h(in_c, in_h, in_l)
1129  * =============================================================================
1130  */
__lasx_xvdp2sub_h_bu(__m256i in_c,__m256i in_h,__m256i in_l)1131 static inline __m256i __lasx_xvdp2sub_h_bu(__m256i in_c, __m256i in_h,
1132                                            __m256i in_l) {
1133   __m256i out;
1134 
1135   out = __lasx_xvmulwev_h_bu(in_h, in_l);
1136   out = __lasx_xvmaddwod_h_bu(out, in_h, in_l);
1137   out = __lasx_xvsub_h(in_c, out);
1138   return out;
1139 }
1140 
1141 /*
1142  * =============================================================================
1143  * Description : Vector Signed Dot Product and Subtract
1144  * Arguments   : Inputs - in_c, in_h, in_l
1145  *               Output - out
1146  *               Return Type - signed word
1147  * Details     : Signed halfword elements from in_h are multiplied with
1148  *               Signed halfword elements from in_l producing a result
1149  *               twice the size of input i.e. signed word.
1150  *               Multiplication result of adjacent odd-even elements
1151  *               are added together and subtracted from double width elements
1152  *               in_c vector.
1153  * Example     : out = __lasx_xvdp2sub_w_h(in_c, in_h, in_l)
1154  *        in_c : 0,0,0,0, 0,0,0,0
1155  *        in_h : 3,1,3,0, 0,0,0,1, 0,0,1,1, 0,0,0,1
1156  *        in_l : 2,1,1,0, 1,0,0,0, 0,0,1,0, 1,0,0,1
1157  *         out : -7,-3,0,0, 0,-1,0,-1
1158  * =============================================================================
1159  */
__lasx_xvdp2sub_w_h(__m256i in_c,__m256i in_h,__m256i in_l)1160 static inline __m256i __lasx_xvdp2sub_w_h(__m256i in_c, __m256i in_h,
1161                                           __m256i in_l) {
1162   __m256i out;
1163 
1164   out = __lasx_xvmulwev_w_h(in_h, in_l);
1165   out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
1166   out = __lasx_xvsub_w(in_c, out);
1167   return out;
1168 }
1169 
1170 /*
1171  * =============================================================================
1172  * Description : Dot product of halfword vector elements
1173  * Arguments   : Inputs - in_h, in_l
1174  *               Output - out
1175  *               Return Type - signed word
1176  * Details     : Signed halfword elements from in_h are multiplied with
1177  *               signed halfword elements from in_l producing a result
1178  *               four times the size of input i.e. signed doubleword.
1179  *               Then these multiplication results of four adjacent elements
1180  *               are added together and stored to the out vector.
1181  * Example     : out = __lasx_xvdp4_d_h(in_h, in_l)
1182  *        in_h :  3,1,3,0, 0,0,0,1, 0,0,1,-1, 0,0,0,1
1183  *        in_l : -2,1,1,0, 1,0,0,0, 0,0,1, 0, 1,0,0,1
1184  *         out : -2,0,1,1
1185  * =============================================================================
1186  */
__lasx_xvdp4_d_h(__m256i in_h,__m256i in_l)1187 static inline __m256i __lasx_xvdp4_d_h(__m256i in_h, __m256i in_l) {
1188   __m256i out;
1189 
1190   out = __lasx_xvmulwev_w_h(in_h, in_l);
1191   out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
1192   out = __lasx_xvhaddw_d_w(out, out);
1193   return out;
1194 }
1195 
1196 /*
1197  * =============================================================================
1198  * Description : The high half of the vector elements are expanded and
1199  *               added after being doubled.
1200  * Arguments   : Inputs - in_h, in_l
1201  *               Output - out
1202  * Details     : The in_h vector and the in_l vector are added after the
1203  *               higher half of the two-fold sign extension (signed byte
1204  *               to signed halfword) and stored to the out vector.
1205  * Example     : See out = __lasx_xvaddwh_w_h(in_h, in_l)
1206  * =============================================================================
1207  */
__lasx_xvaddwh_h_b(__m256i in_h,__m256i in_l)1208 static inline __m256i __lasx_xvaddwh_h_b(__m256i in_h, __m256i in_l) {
1209   __m256i out;
1210 
1211   out = __lasx_xvilvh_b(in_h, in_l);
1212   out = __lasx_xvhaddw_h_b(out, out);
1213   return out;
1214 }
1215 
1216 /*
1217  * =============================================================================
1218  * Description : The high half of the vector elements are expanded and
1219  *               added after being doubled.
1220  * Arguments   : Inputs - in_h, in_l
1221  *               Output - out
1222  * Details     : The in_h vector and the in_l vector are added after the
1223  *               higher half of the two-fold sign extension (signed halfword
1224  *               to signed word) and stored to the out vector.
1225  * Example     : out = __lasx_xvaddwh_w_h(in_h, in_l)
1226  *        in_h : 3, 0,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1
1227  *        in_l : 2,-1,1,2, 1,0,0, 0, 1,0,1, 0, 1,0,0,1
1228  *         out : 1,0,0,-1, 1,0,0, 2
1229  * =============================================================================
1230  */
__lasx_xvaddwh_w_h(__m256i in_h,__m256i in_l)1231 static inline __m256i __lasx_xvaddwh_w_h(__m256i in_h, __m256i in_l) {
1232   __m256i out;
1233 
1234   out = __lasx_xvilvh_h(in_h, in_l);
1235   out = __lasx_xvhaddw_w_h(out, out);
1236   return out;
1237 }
1238 
1239 /*
1240  * =============================================================================
1241  * Description : The low half of the vector elements are expanded and
1242  *               added after being doubled.
1243  * Arguments   : Inputs - in_h, in_l
1244  *               Output - out
1245  * Details     : The in_h vector and the in_l vector are added after the
1246  *               lower half of the two-fold sign extension (signed byte
1247  *               to signed halfword) and stored to the out vector.
1248  * Example     : See out = __lasx_xvaddwl_w_h(in_h, in_l)
1249  * =============================================================================
1250  */
__lasx_xvaddwl_h_b(__m256i in_h,__m256i in_l)1251 static inline __m256i __lasx_xvaddwl_h_b(__m256i in_h, __m256i in_l) {
1252   __m256i out;
1253 
1254   out = __lasx_xvilvl_b(in_h, in_l);
1255   out = __lasx_xvhaddw_h_b(out, out);
1256   return out;
1257 }
1258 
1259 /*
1260  * =============================================================================
1261  * Description : The low half of the vector elements are expanded and
1262  *               added after being doubled.
1263  * Arguments   : Inputs - in_h, in_l
1264  *               Output - out
1265  * Details     : The in_h vector and the in_l vector are added after the
1266  *               lower half of the two-fold sign extension (signed halfword
1267  *               to signed word) and stored to the out vector.
1268  * Example     : out = __lasx_xvaddwl_w_h(in_h, in_l)
1269  *        in_h : 3, 0,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1
1270  *        in_l : 2,-1,1,2, 1,0,0, 0, 1,0,1, 0, 1,0,0,1
1271  *         out : 5,-1,4,2, 1,0,2,-1
1272  * =============================================================================
1273  */
__lasx_xvaddwl_w_h(__m256i in_h,__m256i in_l)1274 static inline __m256i __lasx_xvaddwl_w_h(__m256i in_h, __m256i in_l) {
1275   __m256i out;
1276 
1277   out = __lasx_xvilvl_h(in_h, in_l);
1278   out = __lasx_xvhaddw_w_h(out, out);
1279   return out;
1280 }
1281 
1282 /*
1283  * =============================================================================
1284  * Description : The low half of the vector elements are expanded and
1285  *               added after being doubled.
1286  * Arguments   : Inputs - in_h, in_l
1287  *               Output - out
1288  * Details     : The out vector and the out vector are added after the
1289  *               lower half of the two-fold zero extension (unsigned byte
1290  *               to unsigned halfword) and stored to the out vector.
1291  * Example     : See out = __lasx_xvaddwl_w_h(in_h, in_l)
1292  * =============================================================================
1293  */
__lasx_xvaddwl_h_bu(__m256i in_h,__m256i in_l)1294 static inline __m256i __lasx_xvaddwl_h_bu(__m256i in_h, __m256i in_l) {
1295   __m256i out;
1296 
1297   out = __lasx_xvilvl_b(in_h, in_l);
1298   out = __lasx_xvhaddw_hu_bu(out, out);
1299   return out;
1300 }
1301 
1302 /*
1303  * =============================================================================
1304  * Description : The low half of the vector elements are expanded and
1305  *               added after being doubled.
1306  * Arguments   : Inputs - in_h, in_l
1307  *               Output - out
1308  * Details     : The in_l vector after double zero extension (unsigned byte to
1309  *               signed halfword),added to the in_h vector.
1310  * Example     : See out = __lasx_xvaddw_w_w_h(in_h, in_l)
1311  * =============================================================================
1312  */
__lasx_xvaddw_h_h_bu(__m256i in_h,__m256i in_l)1313 static inline __m256i __lasx_xvaddw_h_h_bu(__m256i in_h, __m256i in_l) {
1314   __m256i out;
1315 
1316   out = __lasx_xvsllwil_hu_bu(in_l, 0);
1317   out = __lasx_xvadd_h(in_h, out);
1318   return out;
1319 }
1320 
1321 /*
1322  * =============================================================================
1323  * Description : The low half of the vector elements are expanded and
1324  *               added after being doubled.
1325  * Arguments   : Inputs - in_h, in_l
1326  *               Output - out
1327  * Details     : The in_l vector after double sign extension (signed halfword to
1328  *               signed word), added to the in_h vector.
1329  * Example     : out = __lasx_xvaddw_w_w_h(in_h, in_l)
1330  *        in_h : 0, 1,0,0, -1,0,0,1,
1331  *        in_l : 2,-1,1,2,  1,0,0,0, 0,0,1,0, 1,0,0,1,
1332  *         out : 2, 0,1,2, -1,0,1,1,
1333  * =============================================================================
1334  */
__lasx_xvaddw_w_w_h(__m256i in_h,__m256i in_l)1335 static inline __m256i __lasx_xvaddw_w_w_h(__m256i in_h, __m256i in_l) {
1336   __m256i out;
1337 
1338   out = __lasx_xvsllwil_w_h(in_l, 0);
1339   out = __lasx_xvadd_w(in_h, out);
1340   return out;
1341 }
1342 
1343 /*
1344  * =============================================================================
1345  * Description : Multiplication and addition calculation after expansion
1346  *               of the lower half of the vector.
1347  * Arguments   : Inputs - in_c, in_h, in_l
1348  *               Output - out
1349  * Details     : The in_h vector and the in_l vector are multiplied after
1350  *               the lower half of the two-fold sign extension (signed halfword
1351  *               to signed word), and the result is added to the vector in_c,
1352  *               then stored to the out vector.
1353  * Example     : out = __lasx_xvmaddwl_w_h(in_c, in_h, in_l)
1354  *        in_c : 1,2,3,4, 5,6,7,8
1355  *        in_h : 1,2,3,4, 1,2,3,4, 5,6,7,8, 5,6,7,8
1356  *        in_l : 200, 300, 400, 500,  2000, 3000, 4000, 5000,
1357  *              -200,-300,-400,-500, -2000,-3000,-4000,-5000
1358  *         out : 201, 602,1203,2004, -995, -1794,-2793,-3992
1359  * =============================================================================
1360  */
__lasx_xvmaddwl_w_h(__m256i in_c,__m256i in_h,__m256i in_l)1361 static inline __m256i __lasx_xvmaddwl_w_h(__m256i in_c, __m256i in_h,
1362                                           __m256i in_l) {
1363   __m256i tmp0, tmp1, out;
1364 
1365   tmp0 = __lasx_xvsllwil_w_h(in_h, 0);
1366   tmp1 = __lasx_xvsllwil_w_h(in_l, 0);
1367   tmp0 = __lasx_xvmul_w(tmp0, tmp1);
1368   out = __lasx_xvadd_w(tmp0, in_c);
1369   return out;
1370 }
1371 
1372 /*
1373  * =============================================================================
1374  * Description : Multiplication and addition calculation after expansion
1375  *               of the higher half of the vector.
1376  * Arguments   : Inputs - in_c, in_h, in_l
1377  *               Output - out
1378  * Details     : The in_h vector and the in_l vector are multiplied after
1379  *               the higher half of the two-fold sign extension (signed
1380  *               halfword to signed word), and the result is added to
1381  *               the vector in_c, then stored to the out vector.
1382  * Example     : See out = __lasx_xvmaddwl_w_h(in_c, in_h, in_l)
1383  * =============================================================================
1384  */
__lasx_xvmaddwh_w_h(__m256i in_c,__m256i in_h,__m256i in_l)1385 static inline __m256i __lasx_xvmaddwh_w_h(__m256i in_c, __m256i in_h,
1386                                           __m256i in_l) {
1387   __m256i tmp0, tmp1, out;
1388 
1389   tmp0 = __lasx_xvilvh_h(in_h, in_h);
1390   tmp1 = __lasx_xvilvh_h(in_l, in_l);
1391   tmp0 = __lasx_xvmulwev_w_h(tmp0, tmp1);
1392   out = __lasx_xvadd_w(tmp0, in_c);
1393   return out;
1394 }
1395 
1396 /*
1397  * =============================================================================
1398  * Description : Multiplication calculation after expansion of the lower
1399  *               half of the vector.
1400  * Arguments   : Inputs - in_h, in_l
1401  *               Output - out
1402  * Details     : The in_h vector and the in_l vector are multiplied after
1403  *               the lower half of the two-fold sign extension (signed
1404  *               halfword to signed word), then stored to the out vector.
1405  * Example     : out = __lasx_xvmulwl_w_h(in_h, in_l)
1406  *        in_h : 3,-1,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1
1407  *        in_l : 2,-1,1,2, 1,0,0, 0, 0,0,1, 0, 1,0,0,1
1408  *         out : 6,1,3,0, 0,0,1,0
1409  * =============================================================================
1410  */
__lasx_xvmulwl_w_h(__m256i in_h,__m256i in_l)1411 static inline __m256i __lasx_xvmulwl_w_h(__m256i in_h, __m256i in_l) {
1412   __m256i tmp0, tmp1, out;
1413 
1414   tmp0 = __lasx_xvsllwil_w_h(in_h, 0);
1415   tmp1 = __lasx_xvsllwil_w_h(in_l, 0);
1416   out = __lasx_xvmul_w(tmp0, tmp1);
1417   return out;
1418 }
1419 
1420 /*
1421  * =============================================================================
1422  * Description : Multiplication calculation after expansion of the lower
1423  *               half of the vector.
1424  * Arguments   : Inputs - in_h, in_l
1425  *               Output - out
1426  * Details     : The in_h vector and the in_l vector are multiplied after
1427  *               the lower half of the two-fold sign extension (signed
1428  *               halfword to signed word), then stored to the out vector.
1429  * Example     : out = __lasx_xvmulwh_w_h(in_h, in_l)
1430  *        in_h : 3,-1,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1
1431  *        in_l : 2,-1,1,2, 1,0,0, 0, 0,0,1, 0, 1,0,0,1
1432  *         out : 0,0,0,0, 0,0,0,1
1433  * =============================================================================
1434  */
__lasx_xvmulwh_w_h(__m256i in_h,__m256i in_l)1435 static inline __m256i __lasx_xvmulwh_w_h(__m256i in_h, __m256i in_l) {
1436   __m256i tmp0, tmp1, out;
1437 
1438   tmp0 = __lasx_xvilvh_h(in_h, in_h);
1439   tmp1 = __lasx_xvilvh_h(in_l, in_l);
1440   out = __lasx_xvmulwev_w_h(tmp0, tmp1);
1441   return out;
1442 }
1443 
1444 /*
1445  * =============================================================================
1446  * Description : The low half of the vector elements are added to the high half
1447  *               after being doubled, then saturated.
1448  * Arguments   : Inputs - in_h, in_l
1449  *               Output - out
1450  * Details     : The in_h vector adds the in_l vector after the lower half of
1451  *               the two-fold zero extension (unsigned byte to unsigned
1452  *               halfword) and then saturated. The results are stored to the out
1453  *               vector.
1454  * Example     : out = __lasx_xvsaddw_hu_hu_bu(in_h, in_l)
1455  *        in_h : 2,65532,1,2, 1,0,0,0, 0,0,1,0, 1,0,0,1
1456  *        in_l : 3,6,3,0, 0,0,0,1, 0,0,1,1, 0,0,0,1, 3,18,3,0, 0,0,0,1, 0,0,1,1,
1457  *               0,0,0,1
1458  *        out  : 5,65535,4,2, 1,0,0,1, 3,18,4,0, 1,0,0,2,
1459  * =============================================================================
1460  */
__lasx_xvsaddw_hu_hu_bu(__m256i in_h,__m256i in_l)1461 static inline __m256i __lasx_xvsaddw_hu_hu_bu(__m256i in_h, __m256i in_l) {
1462   __m256i tmp1, out;
1463   __m256i zero = { 0 };
1464 
1465   tmp1 = __lasx_xvilvl_b(zero, in_l);
1466   out = __lasx_xvsadd_hu(in_h, tmp1);
1467   return out;
1468 }
1469 
1470 /*
1471  * =============================================================================
1472  * Description : Clip all halfword elements of input vector between min & max
1473  *               out = ((in) < (min)) ? (min) : (((in) > (max)) ? (max) : (in))
1474  * Arguments   : Inputs  - in    (input vector)
1475  *                       - min   (min threshold)
1476  *                       - max   (max threshold)
1477  *               Outputs - in    (output vector with clipped elements)
1478  *               Return Type - signed halfword
1479  * Example     : out = __lasx_xvclip_h(in, min, max)
1480  *          in : -8,2,280,249, -8,255,280,249, 4,4,4,4, 5,5,5,5
1481  *         min : 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1
1482  *         max : 9,9,9,9, 9,9,9,9, 9,9,9,9, 9,9,9,9
1483  *         out : 1,2,9,9, 1,9,9,9, 4,4,4,4, 5,5,5,5
1484  * =============================================================================
1485  */
__lasx_xvclip_h(__m256i in,__m256i min,__m256i max)1486 static inline __m256i __lasx_xvclip_h(__m256i in, __m256i min, __m256i max) {
1487   __m256i out;
1488 
1489   out = __lasx_xvmax_h(min, in);
1490   out = __lasx_xvmin_h(max, out);
1491   return out;
1492 }
1493 
1494 /*
1495  * =============================================================================
1496  * Description : Clip all signed halfword elements of input vector
1497  *               between 0 & 255
1498  * Arguments   : Inputs  - in   (input vector)
1499  *               Outputs - out  (output vector with clipped elements)
1500  *               Return Type - signed halfword
1501  * Example     : See out = __lasx_xvclip255_w(in)
1502  * =============================================================================
1503  */
__lasx_xvclip255_h(__m256i in)1504 static inline __m256i __lasx_xvclip255_h(__m256i in) {
1505   __m256i out;
1506 
1507   out = __lasx_xvmaxi_h(in, 0);
1508   out = __lasx_xvsat_hu(out, 7);
1509   return out;
1510 }
1511 
1512 /*
1513  * =============================================================================
1514  * Description : Clip all signed word elements of input vector
1515  *               between 0 & 255
1516  * Arguments   : Inputs - in   (input vector)
1517  *               Output - out  (output vector with clipped elements)
1518  *               Return Type - signed word
1519  * Example     : out = __lasx_xvclip255_w(in)
1520  *          in : -8,255,280,249, -8,255,280,249
1521  *         out :  0,255,255,249,  0,255,255,249
1522  * =============================================================================
1523  */
__lasx_xvclip255_w(__m256i in)1524 static inline __m256i __lasx_xvclip255_w(__m256i in) {
1525   __m256i out;
1526 
1527   out = __lasx_xvmaxi_w(in, 0);
1528   out = __lasx_xvsat_wu(out, 7);
1529   return out;
1530 }
1531 
1532 /*
1533  * =============================================================================
1534  * Description : Indexed halfword element values are replicated to all
1535  *               elements in output vector. If 'idx < 8' use xvsplati_l_*,
1536  *               if 'idx >= 8' use xvsplati_h_*.
1537  * Arguments   : Inputs - in, idx
1538  *               Output - out
1539  * Details     : Idx element value from in vector is replicated to all
1540  *               elements in out vector.
1541  *               Valid index range for halfword operation is 0-7
1542  * Example     : out = __lasx_xvsplati_l_h(in, idx)
1543  *          in : 20,10,11,12, 13,14,15,16, 0,0,2,0, 0,0,0,0
1544  *         idx : 0x02
1545  *         out : 11,11,11,11, 11,11,11,11, 11,11,11,11, 11,11,11,11
1546  * =============================================================================
1547  */
__lasx_xvsplati_l_h(__m256i in,int idx)1548 static inline __m256i __lasx_xvsplati_l_h(__m256i in, int idx) {
1549   __m256i out;
1550 
1551   out = __lasx_xvpermi_q(in, in, 0x02);
1552   out = __lasx_xvreplve_h(out, idx);
1553   return out;
1554 }
1555 
1556 /*
1557  * =============================================================================
1558  * Description : Indexed halfword element values are replicated to all
1559  *               elements in output vector. If 'idx < 8' use xvsplati_l_*,
1560  *               if 'idx >= 8' use xvsplati_h_*.
1561  * Arguments   : Inputs - in, idx
1562  *               Output - out
1563  * Details     : Idx element value from in vector is replicated to all
1564  *               elements in out vector.
1565  *               Valid index range for halfword operation is 0-7
1566  * Example     : out = __lasx_xvsplati_h_h(in, idx)
1567  *          in : 20,10,11,12, 13,14,15,16, 0,2,0,0, 0,0,0,0
1568  *         idx : 0x09
1569  *         out : 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2
1570  * =============================================================================
1571  */
__lasx_xvsplati_h_h(__m256i in,int idx)1572 static inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx) {
1573   __m256i out;
1574 
1575   out = __lasx_xvpermi_q(in, in, 0x13);
1576   out = __lasx_xvreplve_h(out, idx);
1577   return out;
1578 }
1579 
/*
 * =============================================================================
 * Description : Transpose a 4x4 block of double-word elements held in four
 *               vectors
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3
 *               Outputs - _out0, _out1, _out2, _out3
 * Details     : All four inputs are read into temporaries before any output
 *               is written.
 * Example     : LASX_TRANSPOSE4x4_D
 *        _in0 : 1,2,3,4
 *        _in1 : 1,2,3,4
 *        _in2 : 1,2,3,4
 *        _in3 : 1,2,3,4
 *
 *       _out0 : 1,1,1,1
 *       _out1 : 2,2,2,2
 *       _out2 : 3,3,3,3
 *       _out3 : 4,4,4,4
 * =============================================================================
 */
#define LASX_TRANSPOSE4x4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, \
                            _out3)                                       \
  {                                                                      \
    /* Merge rows 0/1 and rows 2/3 doubleword by doubleword. */          \
    __m256i _lo01_m = __lasx_xvilvl_d(_in1, _in0);                       \
    __m256i _hi01_m = __lasx_xvilvh_d(_in1, _in0);                       \
    __m256i _lo23_m = __lasx_xvilvl_d(_in3, _in2);                       \
    __m256i _hi23_m = __lasx_xvilvh_d(_in3, _in2);                       \
                                                                         \
    /* Gather matching 128-bit halves into each output row. */           \
    _out0 = __lasx_xvpermi_q(_lo23_m, _lo01_m, 0x20);                    \
    _out1 = __lasx_xvpermi_q(_hi23_m, _hi01_m, 0x20);                    \
    _out2 = __lasx_xvpermi_q(_lo23_m, _lo01_m, 0x31);                    \
    _out3 = __lasx_xvpermi_q(_hi23_m, _hi01_m, 0x31);                    \
  }
1610 
/*
 * =============================================================================
 * Description : Transpose an 8x8 block of word elements held in eight
 *               vectors
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
 *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
 *               _out7
 * Details     : All eight inputs are read into temporaries before any
 *               output is written.
 * Example     : LASX_TRANSPOSE8x8_W
 *        _in0 : 1,2,3,4,5,6,7,8
 *        _in1 : 2,2,3,4,5,6,7,8
 *        _in2 : 3,2,3,4,5,6,7,8
 *        _in3 : 4,2,3,4,5,6,7,8
 *        _in4 : 5,2,3,4,5,6,7,8
 *        _in5 : 6,2,3,4,5,6,7,8
 *        _in6 : 7,2,3,4,5,6,7,8
 *        _in7 : 8,2,3,4,5,6,7,8
 *
 *       _out0 : 1,2,3,4,5,6,7,8
 *       _out1 : 2,2,2,2,2,2,2,2
 *       _out2 : 3,3,3,3,3,3,3,3
 *       _out3 : 4,4,4,4,4,4,4,4
 *       _out4 : 5,5,5,5,5,5,5,5
 *       _out5 : 6,6,6,6,6,6,6,6
 *       _out6 : 7,7,7,7,7,7,7,7
 *       _out7 : 8,8,8,8,8,8,8,8
 * =============================================================================
 */
#define LASX_TRANSPOSE8x8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                            _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                            _out7)                                           \
  {                                                                          \
    __m256i _lo02_m, _lo13_m, _hi02_m, _hi13_m;                              \
    __m256i _lo46_m, _lo57_m, _hi46_m, _hi57_m;                              \
    __m256i _q0_m, _q1_m, _q2_m, _q3_m;                                      \
    __m256i _q4_m, _q5_m, _q6_m, _q7_m;                                      \
                                                                             \
    /* Interleave words of rows two apart: 0/2, 1/3, 4/6 and 5/7. */         \
    _lo02_m = __lasx_xvilvl_w(_in2, _in0);                                   \
    _lo13_m = __lasx_xvilvl_w(_in3, _in1);                                   \
    _hi02_m = __lasx_xvilvh_w(_in2, _in0);                                   \
    _hi13_m = __lasx_xvilvh_w(_in3, _in1);                                   \
    _lo46_m = __lasx_xvilvl_w(_in6, _in4);                                   \
    _lo57_m = __lasx_xvilvl_w(_in7, _in5);                                   \
    _hi46_m = __lasx_xvilvh_w(_in6, _in4);                                   \
    _hi57_m = __lasx_xvilvh_w(_in7, _in5);                                   \
                                                                             \
    /* Second pass: combine the 0/2 results with the 1/3 (4/6 with 5/7). */  \
    _q0_m = __lasx_xvilvl_w(_lo13_m, _lo02_m);                               \
    _q1_m = __lasx_xvilvh_w(_lo13_m, _lo02_m);                               \
    _q2_m = __lasx_xvilvl_w(_hi13_m, _hi02_m);                               \
    _q3_m = __lasx_xvilvh_w(_hi13_m, _hi02_m);                               \
    _q4_m = __lasx_xvilvl_w(_lo57_m, _lo46_m);                               \
    _q5_m = __lasx_xvilvh_w(_lo57_m, _lo46_m);                               \
    _q6_m = __lasx_xvilvl_w(_hi57_m, _hi46_m);                               \
    _q7_m = __lasx_xvilvh_w(_hi57_m, _hi46_m);                               \
                                                                             \
    /* Join 128-bit halves: 0x20 yields _out0-3, 0x31 yields _out4-7. */     \
    _out0 = __lasx_xvpermi_q(_q4_m, _q0_m, 0x20);                            \
    _out4 = __lasx_xvpermi_q(_q4_m, _q0_m, 0x31);                            \
    _out1 = __lasx_xvpermi_q(_q5_m, _q1_m, 0x20);                            \
    _out5 = __lasx_xvpermi_q(_q5_m, _q1_m, 0x31);                            \
    _out2 = __lasx_xvpermi_q(_q6_m, _q2_m, 0x20);                            \
    _out6 = __lasx_xvpermi_q(_q6_m, _q2_m, 0x31);                            \
    _out3 = __lasx_xvpermi_q(_q7_m, _q3_m, 0x20);                            \
    _out7 = __lasx_xvpermi_q(_q7_m, _q3_m, 0x31);                            \
  }
1670 
/*
 * =============================================================================
 * Description : Transpose input 16x8 byte block
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,
 *                         _in8, _in9, _in10, _in11, _in12, _in13, _in14, _in15
 *                         (input 16x8 byte block)
 *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
 *                         _out7 (output 8x16 byte block)
 * Details     : The rows of the matrix become columns, and the columns become
 *               rows. The transpose is built from four interleave passes
 *               (byte, byte, word, doubleword); all sixteen inputs are read
 *               in the first pass, before any output is written.
 * Example     : See LASX_TRANSPOSE16x8_H
 * =============================================================================
 */
#define LASX_TRANSPOSE16x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                             _in8, _in9, _in10, _in11, _in12, _in13, _in14,   \
                             _in15, _out0, _out1, _out2, _out3, _out4, _out5, \
                             _out6, _out7)                                    \
  {                                                                           \
    __m256i _p0_m, _p1_m, _p2_m, _p3_m, _p4_m, _p5_m, _p6_m, _p7_m;           \
    __m256i _q0_m, _q1_m, _q2_m, _q3_m, _q4_m, _q5_m, _q6_m, _q7_m;           \
    __m256i _r0_m, _r1_m, _r2_m, _r3_m, _r4_m, _r5_m, _r6_m, _r7_m;           \
                                                                              \
    /* Pass 1: byte-interleave the low halves of rows two apart. */           \
    _p0_m = __lasx_xvilvl_b(_in2, _in0);                                      \
    _p1_m = __lasx_xvilvl_b(_in3, _in1);                                      \
    _p2_m = __lasx_xvilvl_b(_in6, _in4);                                      \
    _p3_m = __lasx_xvilvl_b(_in7, _in5);                                      \
    _p4_m = __lasx_xvilvl_b(_in10, _in8);                                     \
    _p5_m = __lasx_xvilvl_b(_in11, _in9);                                     \
    _p6_m = __lasx_xvilvl_b(_in14, _in12);                                    \
    _p7_m = __lasx_xvilvl_b(_in15, _in13);                                    \
                                                                              \
    /* Pass 2: byte-interleave neighbouring pass-1 results. */                \
    _q0_m = __lasx_xvilvl_b(_p1_m, _p0_m);                                    \
    _q1_m = __lasx_xvilvh_b(_p1_m, _p0_m);                                    \
    _q2_m = __lasx_xvilvl_b(_p3_m, _p2_m);                                    \
    _q3_m = __lasx_xvilvh_b(_p3_m, _p2_m);                                    \
    _q4_m = __lasx_xvilvl_b(_p5_m, _p4_m);                                    \
    _q5_m = __lasx_xvilvh_b(_p5_m, _p4_m);                                    \
    _q6_m = __lasx_xvilvl_b(_p7_m, _p6_m);                                    \
    _q7_m = __lasx_xvilvh_b(_p7_m, _p6_m);                                    \
                                                                              \
    /* Pass 3: word-interleave pass-2 results that sit two apart. */          \
    _r0_m = __lasx_xvilvl_w(_q2_m, _q0_m);                                    \
    _r2_m = __lasx_xvilvh_w(_q2_m, _q0_m);                                    \
    _r4_m = __lasx_xvilvl_w(_q3_m, _q1_m);                                    \
    _r6_m = __lasx_xvilvh_w(_q3_m, _q1_m);                                    \
    _r1_m = __lasx_xvilvl_w(_q6_m, _q4_m);                                    \
    _r3_m = __lasx_xvilvh_w(_q6_m, _q4_m);                                    \
    _r5_m = __lasx_xvilvl_w(_q7_m, _q5_m);                                    \
    _r7_m = __lasx_xvilvh_w(_q7_m, _q5_m);                                    \
                                                                              \
    /* Pass 4: doubleword-interleave to finish the eight output rows. */      \
    _out0 = __lasx_xvilvl_d(_r1_m, _r0_m);                                    \
    _out1 = __lasx_xvilvh_d(_r1_m, _r0_m);                                    \
    _out2 = __lasx_xvilvl_d(_r3_m, _r2_m);                                    \
    _out3 = __lasx_xvilvh_d(_r3_m, _r2_m);                                    \
    _out4 = __lasx_xvilvl_d(_r5_m, _r4_m);                                    \
    _out5 = __lasx_xvilvh_d(_r5_m, _r4_m);                                    \
    _out6 = __lasx_xvilvl_d(_r7_m, _r6_m);                                    \
    _out7 = __lasx_xvilvh_d(_r7_m, _r6_m);                                    \
  }
1725 
1726 /*
1727  * =============================================================================
 * Description : Transpose input 16x8 halfword block
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,
 *                         _in8, _in9, _in10, _in11, _in12, _in13, _in14, _in15
 *                         (input 16x8 halfword block)
 *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
 *                         _out7 (output 8x16 halfword block)
 * Details     : The rows of the matrix become columns, and the columns become
 *               rows.
1736  * Example     : LASX_TRANSPOSE16x8_H
1737  *        _in0 : 1,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
1738  *        _in1 : 2,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
1739  *        _in2 : 3,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
1740  *        _in3 : 4,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
1741  *        _in4 : 5,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
1742  *        _in5 : 6,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
1743  *        _in6 : 7,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
1744  *        _in7 : 8,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
1745  *        _in8 : 9,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
1746  *        _in9 : 1,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
1747  *       _in10 : 0,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
1748  *       _in11 : 2,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
1749  *       _in12 : 3,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
1750  *       _in13 : 7,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
1751  *       _in14 : 5,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
1752  *       _in15 : 6,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
1753  *
1754  *       _out0 : 1,2,3,4,5,6,7,8,9,1,0,2,3,7,5,6
1755  *       _out1 : 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
1756  *       _out2 : 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3
1757  *       _out3 : 4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4
1758  *       _out4 : 5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5
1759  *       _out5 : 6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6
1760  *       _out6 : 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7
1761  *       _out7 : 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8
1762  * =============================================================================
1763  */
/*
 * Implementation note for LASX_TRANSPOSE16x8_H: _out0.._out3 are stored before
 * the inputs are read a second time for _out4.._out7, so none of _out0.._out3
 * may name the same variable as an input.
 */
#define LASX_TRANSPOSE16x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                             _in8, _in9, _in10, _in11, _in12, _in13, _in14,   \
                             _in15, _out0, _out1, _out2, _out3, _out4, _out5, \
                             _out6, _out7)                                    \
  {                                                                           \
    /* Output rows 0..3: stage one uses xvilvl_h on each pair of inputs. */   \
    {                                                                         \
      /* Pairs of input rows. */                                              \
      __m256i _p0_m = __lasx_xvilvl_h(_in2, _in0);                            \
      __m256i _p1_m = __lasx_xvilvl_h(_in3, _in1);                            \
      __m256i _p2_m = __lasx_xvilvl_h(_in6, _in4);                            \
      __m256i _p3_m = __lasx_xvilvl_h(_in7, _in5);                            \
      __m256i _p4_m = __lasx_xvilvl_h(_in10, _in8);                           \
      __m256i _p5_m = __lasx_xvilvl_h(_in11, _in9);                           \
      __m256i _p6_m = __lasx_xvilvl_h(_in14, _in12);                          \
      __m256i _p7_m = __lasx_xvilvl_h(_in15, _in13);                          \
      /* Groups of four input rows. */                                        \
      __m256i _q0_m = __lasx_xvilvl_h(_p1_m, _p0_m);                          \
      __m256i _q1_m = __lasx_xvilvh_h(_p1_m, _p0_m);                          \
      __m256i _q2_m = __lasx_xvilvl_h(_p3_m, _p2_m);                          \
      __m256i _q3_m = __lasx_xvilvh_h(_p3_m, _p2_m);                          \
      __m256i _q4_m = __lasx_xvilvl_h(_p5_m, _p4_m);                          \
      __m256i _q5_m = __lasx_xvilvh_h(_p5_m, _p4_m);                          \
      __m256i _q6_m = __lasx_xvilvl_h(_p7_m, _p6_m);                          \
      __m256i _q7_m = __lasx_xvilvh_h(_p7_m, _p6_m);                          \
      /* Doubleword merge: rows 0..7 (_top) and rows 8..15 (_bot). */         \
      __m256i _top0_m = __lasx_xvilvl_d(_q2_m, _q0_m);                        \
      __m256i _top1_m = __lasx_xvilvh_d(_q2_m, _q0_m);                        \
      __m256i _top2_m = __lasx_xvilvl_d(_q3_m, _q1_m);                        \
      __m256i _top3_m = __lasx_xvilvh_d(_q3_m, _q1_m);                        \
      __m256i _bot0_m = __lasx_xvilvl_d(_q6_m, _q4_m);                        \
      __m256i _bot1_m = __lasx_xvilvh_d(_q6_m, _q4_m);                        \
      __m256i _bot2_m = __lasx_xvilvl_d(_q7_m, _q5_m);                        \
      __m256i _bot3_m = __lasx_xvilvh_d(_q7_m, _q5_m);                        \
      /* Join the low 128-bit lanes: _top first, then _bot. */                \
      _out0 = __lasx_xvpermi_q(_bot0_m, _top0_m, 0x20);                       \
      _out1 = __lasx_xvpermi_q(_bot1_m, _top1_m, 0x20);                       \
      _out2 = __lasx_xvpermi_q(_bot2_m, _top2_m, 0x20);                       \
      _out3 = __lasx_xvpermi_q(_bot3_m, _top3_m, 0x20);                       \
    }                                                                         \
    /* Output rows 4..7: same network, stage one uses xvilvh_h instead. */    \
    {                                                                         \
      __m256i _p0_m = __lasx_xvilvh_h(_in2, _in0);                            \
      __m256i _p1_m = __lasx_xvilvh_h(_in3, _in1);                            \
      __m256i _p2_m = __lasx_xvilvh_h(_in6, _in4);                            \
      __m256i _p3_m = __lasx_xvilvh_h(_in7, _in5);                            \
      __m256i _p4_m = __lasx_xvilvh_h(_in10, _in8);                           \
      __m256i _p5_m = __lasx_xvilvh_h(_in11, _in9);                           \
      __m256i _p6_m = __lasx_xvilvh_h(_in14, _in12);                          \
      __m256i _p7_m = __lasx_xvilvh_h(_in15, _in13);                          \
      __m256i _q0_m = __lasx_xvilvl_h(_p1_m, _p0_m);                          \
      __m256i _q1_m = __lasx_xvilvh_h(_p1_m, _p0_m);                          \
      __m256i _q2_m = __lasx_xvilvl_h(_p3_m, _p2_m);                          \
      __m256i _q3_m = __lasx_xvilvh_h(_p3_m, _p2_m);                          \
      __m256i _q4_m = __lasx_xvilvl_h(_p5_m, _p4_m);                          \
      __m256i _q5_m = __lasx_xvilvh_h(_p5_m, _p4_m);                          \
      __m256i _q6_m = __lasx_xvilvl_h(_p7_m, _p6_m);                          \
      __m256i _q7_m = __lasx_xvilvh_h(_p7_m, _p6_m);                          \
      __m256i _top0_m = __lasx_xvilvl_d(_q2_m, _q0_m);                        \
      __m256i _top1_m = __lasx_xvilvh_d(_q2_m, _q0_m);                        \
      __m256i _top2_m = __lasx_xvilvl_d(_q3_m, _q1_m);                        \
      __m256i _top3_m = __lasx_xvilvh_d(_q3_m, _q1_m);                        \
      __m256i _bot0_m = __lasx_xvilvl_d(_q6_m, _q4_m);                        \
      __m256i _bot1_m = __lasx_xvilvh_d(_q6_m, _q4_m);                        \
      __m256i _bot2_m = __lasx_xvilvl_d(_q7_m, _q5_m);                        \
      __m256i _bot3_m = __lasx_xvilvh_d(_q7_m, _q5_m);                        \
      _out4 = __lasx_xvpermi_q(_bot0_m, _top0_m, 0x20);                       \
      _out5 = __lasx_xvpermi_q(_bot1_m, _top1_m, 0x20);                       \
      _out6 = __lasx_xvpermi_q(_bot2_m, _top2_m, 0x20);                       \
      _out7 = __lasx_xvpermi_q(_bot3_m, _top3_m, 0x20);                       \
    }                                                                         \
  }
1831 
/*
 * =============================================================================
 * Description : Transpose a 4x4 block of halfword elements held in vectors.
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3 (one row per vector)
 *               Outputs - _out0, _out1, _out2, _out3 (one column per vector)
 * Details     : The rows of the matrix become columns, and the columns become
 *               rows. All four inputs are read before any output is written.
 *               _out1 and _out3 are taken from the upper doubleword of _out0
 *               and _out2 respectively (xvilvh_d of the value with itself).
 * Example     : See LASX_TRANSPOSE8x8_H
 * =============================================================================
 */
#define LASX_TRANSPOSE4x4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, \
                            _out3)                                       \
  {                                                                      \
    /* Rows (0,1) and (2,3), interleaved halfword by halfword. */        \
    __m256i _rows01_m = __lasx_xvilvl_h(_in1, _in0);                     \
    __m256i _rows23_m = __lasx_xvilvl_h(_in3, _in2);                     \
                                                                         \
    _out0 = __lasx_xvilvl_w(_rows23_m, _rows01_m);                       \
    _out2 = __lasx_xvilvh_w(_rows23_m, _rows01_m);                       \
    /* Odd columns are the upper doubleword of the even results. */      \
    _out1 = __lasx_xvilvh_d(_out0, _out0);                               \
    _out3 = __lasx_xvilvh_d(_out2, _out2);                               \
  }
1855 
/*
 * =============================================================================
 * Description : Transpose an 8x8 block of byte elements held in vectors.
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
 *                         (input 8x8 byte block, one row per vector)
 *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
 *                         _out7 (output 8x8 byte block, one column per vector)
 * Details     : The inputs are consumed only through xvilvl_b, and all of them
 *               are read before any output is written. Each odd output is the
 *               preceding even output shifted right by 8 bytes (xvbsrl_v).
 * Example     : See LASX_TRANSPOSE8x8_H
 * =============================================================================
 */
#define LASX_TRANSPOSE8x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                            _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                            _out7)                                           \
  {                                                                          \
    /* Byte-interleaved row pairs. */                                        \
    __m256i _r02_m = __lasx_xvilvl_b(_in2, _in0);                            \
    __m256i _r13_m = __lasx_xvilvl_b(_in3, _in1);                            \
    __m256i _r46_m = __lasx_xvilvl_b(_in6, _in4);                            \
    __m256i _r57_m = __lasx_xvilvl_b(_in7, _in5);                            \
    /* Rows 0..3 (_top) and rows 4..7 (_bot), low/high column halves. */     \
    __m256i _topl_m = __lasx_xvilvl_b(_r13_m, _r02_m);                       \
    __m256i _toph_m = __lasx_xvilvh_b(_r13_m, _r02_m);                       \
    __m256i _botl_m = __lasx_xvilvl_b(_r57_m, _r46_m);                       \
    __m256i _both_m = __lasx_xvilvh_b(_r57_m, _r46_m);                       \
                                                                             \
    _out0 = __lasx_xvilvl_w(_botl_m, _topl_m);                               \
    _out2 = __lasx_xvilvh_w(_botl_m, _topl_m);                               \
    _out4 = __lasx_xvilvl_w(_both_m, _toph_m);                               \
    _out6 = __lasx_xvilvh_w(_both_m, _toph_m);                               \
    /* Odd columns sit 8 bytes above the even ones. */                       \
    _out1 = __lasx_xvbsrl_v(_out0, 8);                                       \
    _out3 = __lasx_xvbsrl_v(_out2, 8);                                       \
    _out5 = __lasx_xvbsrl_v(_out4, 8);                                       \
    _out7 = __lasx_xvbsrl_v(_out6, 8);                                       \
  }
1889 
/*
 * =============================================================================
 * Description : Transpose 8x8 block with halfword elements in vectors.
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
 *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
 *                         _out7
 * Details     : The rows of the matrix become columns, and the columns become
 *               rows. As the example shows, each group of eight halfwords in
 *               a vector is transposed separately. All inputs are read before
 *               any output is written.
 * Example     : LASX_TRANSPOSE8x8_H
 *        _in0 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
 *        _in1 : 8,2,3,4, 5,6,7,8, 8,2,3,4, 5,6,7,8
 *        _in2 : 8,2,3,4, 5,6,7,8, 8,2,3,4, 5,6,7,8
 *        _in3 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
 *        _in4 : 9,2,3,4, 5,6,7,8, 9,2,3,4, 5,6,7,8
 *        _in5 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
 *        _in6 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
 *        _in7 : 9,2,3,4, 5,6,7,8, 9,2,3,4, 5,6,7,8
 *
 *       _out0 : 1,8,8,1, 9,1,1,9, 1,8,8,1, 9,1,1,9
 *       _out1 : 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2
 *       _out2 : 3,3,3,3, 3,3,3,3, 3,3,3,3, 3,3,3,3
 *       _out3 : 4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4
 *       _out4 : 5,5,5,5, 5,5,5,5, 5,5,5,5, 5,5,5,5
 *       _out5 : 6,6,6,6, 6,6,6,6, 6,6,6,6, 6,6,6,6
 *       _out6 : 7,7,7,7, 7,7,7,7, 7,7,7,7, 7,7,7,7
 *       _out7 : 8,8,8,8, 8,8,8,8, 8,8,8,8, 8,8,8,8
 * =============================================================================
 */
#define LASX_TRANSPOSE8x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                            _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                            _out7)                                           \
  {                                                                          \
    /* Halfword-interleaved row pairs, low and high halves. */               \
    __m256i _lo46_m = __lasx_xvilvl_h(_in6, _in4);                           \
    __m256i _lo57_m = __lasx_xvilvl_h(_in7, _in5);                           \
    __m256i _hi46_m = __lasx_xvilvh_h(_in6, _in4);                           \
    __m256i _hi57_m = __lasx_xvilvh_h(_in7, _in5);                           \
    __m256i _lo02_m = __lasx_xvilvl_h(_in2, _in0);                           \
    __m256i _lo13_m = __lasx_xvilvl_h(_in3, _in1);                           \
    __m256i _hi02_m = __lasx_xvilvh_h(_in2, _in0);                           \
    __m256i _hi13_m = __lasx_xvilvh_h(_in3, _in1);                           \
    /* Rows 4..7 (_bot) and rows 0..3 (_top), two columns per vector. */     \
    __m256i _bot01_m = __lasx_xvilvl_h(_lo57_m, _lo46_m);                    \
    __m256i _bot23_m = __lasx_xvilvh_h(_lo57_m, _lo46_m);                    \
    __m256i _bot45_m = __lasx_xvilvl_h(_hi57_m, _hi46_m);                    \
    __m256i _bot67_m = __lasx_xvilvh_h(_hi57_m, _hi46_m);                    \
    __m256i _top01_m = __lasx_xvilvl_h(_lo13_m, _lo02_m);                    \
    __m256i _top23_m = __lasx_xvilvh_h(_lo13_m, _lo02_m);                    \
    __m256i _top45_m = __lasx_xvilvl_h(_hi13_m, _hi02_m);                    \
    __m256i _top67_m = __lasx_xvilvh_h(_hi13_m, _hi02_m);                    \
                                                                             \
    /* Even columns take the even doublewords, odd columns the odd ones. */  \
    _out0 = __lasx_xvpickev_d(_bot01_m, _top01_m);                           \
    _out2 = __lasx_xvpickev_d(_bot23_m, _top23_m);                           \
    _out4 = __lasx_xvpickev_d(_bot45_m, _top45_m);                           \
    _out6 = __lasx_xvpickev_d(_bot67_m, _top67_m);                           \
    _out1 = __lasx_xvpickod_d(_bot01_m, _top01_m);                           \
    _out3 = __lasx_xvpickod_d(_bot23_m, _top23_m);                           \
    _out5 = __lasx_xvpickod_d(_bot45_m, _top45_m);                           \
    _out7 = __lasx_xvpickod_d(_bot67_m, _top67_m);                           \
  }
1952 
/*
 * =============================================================================
 * Description : Butterfly of 4 input vectors
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3
 *               Outputs - _out0, _out1, _out2, _out3
 * Details     : Butterfly operation. The _B, _H, _W and _D variants use the
 *               byte, halfword, word and doubleword add/sub intrinsics.
 *               Outputs are assigned in the order _out0.._out3 while inputs
 *               are still being read, so an output that names an input used
 *               by a later statement changes the result.
 * Example     : LASX_BUTTERFLY_4_H
 *               _out0 = _in0 + _in3;
 *               _out1 = _in1 + _in2;
 *               _out2 = _in1 - _in2;
 *               _out3 = _in0 - _in3;
 * =============================================================================
 */
/* Shared body: _ADD and _SUB are the element-wise add/sub intrinsics to use. */
#define LASX_BUTTERFLY_4_GENERIC(_ADD, _SUB, _in0, _in1, _in2, _in3, _out0, \
                                 _out1, _out2, _out3)                       \
  {                                                                         \
    _out0 = _ADD(_in0, _in3);                                               \
    _out1 = _ADD(_in1, _in2);                                               \
    _out2 = _SUB(_in1, _in2);                                               \
    _out3 = _SUB(_in0, _in3);                                               \
  }
#define LASX_BUTTERFLY_4_B(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  LASX_BUTTERFLY_4_GENERIC(__lasx_xvadd_b, __lasx_xvsub_b, _in0, _in1, _in2,   \
                           _in3, _out0, _out1, _out2, _out3)
#define LASX_BUTTERFLY_4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  LASX_BUTTERFLY_4_GENERIC(__lasx_xvadd_h, __lasx_xvsub_h, _in0, _in1, _in2,   \
                           _in3, _out0, _out1, _out2, _out3)
#define LASX_BUTTERFLY_4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  LASX_BUTTERFLY_4_GENERIC(__lasx_xvadd_w, __lasx_xvsub_w, _in0, _in1, _in2,   \
                           _in3, _out0, _out1, _out2, _out3)
#define LASX_BUTTERFLY_4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  LASX_BUTTERFLY_4_GENERIC(__lasx_xvadd_d, __lasx_xvsub_d, _in0, _in1, _in2,   \
                           _in3, _out0, _out1, _out2, _out3)
1994 
/*
 * =============================================================================
 * Description : Butterfly of 8 input vectors
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
 *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
 *                         _out7
 * Details     : Butterfly operation. The _B, _H, _W and _D variants use the
 *               byte, halfword, word and doubleword add/sub intrinsics.
 *               Outputs are assigned in the order _out0.._out7 while inputs
 *               are still being read, so an output that names an input used
 *               by a later statement changes the result.
 * Example     : LASX_BUTTERFLY_8_H
 *               _out0 = _in0 + _in7;
 *               _out1 = _in1 + _in6;
 *               _out2 = _in2 + _in5;
 *               _out3 = _in3 + _in4;
 *               _out4 = _in3 - _in4;
 *               _out5 = _in2 - _in5;
 *               _out6 = _in1 - _in6;
 *               _out7 = _in0 - _in7;
 * =============================================================================
 */
/* Shared body: _ADD and _SUB are the element-wise add/sub intrinsics to use. */
#define LASX_BUTTERFLY_8_GENERIC(_ADD, _SUB, _in0, _in1, _in2, _in3, _in4, \
                                 _in5, _in6, _in7, _out0, _out1, _out2,    \
                                 _out3, _out4, _out5, _out6, _out7)        \
  {                                                                        \
    _out0 = _ADD(_in0, _in7);                                              \
    _out1 = _ADD(_in1, _in6);                                              \
    _out2 = _ADD(_in2, _in5);                                              \
    _out3 = _ADD(_in3, _in4);                                              \
    _out4 = _SUB(_in3, _in4);                                              \
    _out5 = _SUB(_in2, _in5);                                              \
    _out6 = _SUB(_in1, _in6);                                              \
    _out7 = _SUB(_in0, _in7);                                              \
  }

#define LASX_BUTTERFLY_8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                           _out7)                                           \
  LASX_BUTTERFLY_8_GENERIC(__lasx_xvadd_b, __lasx_xvsub_b, _in0, _in1,      \
                           _in2, _in3, _in4, _in5, _in6, _in7, _out0,       \
                           _out1, _out2, _out3, _out4, _out5, _out6, _out7)

#define LASX_BUTTERFLY_8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                           _out7)                                           \
  LASX_BUTTERFLY_8_GENERIC(__lasx_xvadd_h, __lasx_xvsub_h, _in0, _in1,      \
                           _in2, _in3, _in4, _in5, _in6, _in7, _out0,       \
                           _out1, _out2, _out3, _out4, _out5, _out6, _out7)

#define LASX_BUTTERFLY_8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                           _out7)                                           \
  LASX_BUTTERFLY_8_GENERIC(__lasx_xvadd_w, __lasx_xvsub_w, _in0, _in1,      \
                           _in2, _in3, _in4, _in5, _in6, _in7, _out0,       \
                           _out1, _out2, _out3, _out4, _out5, _out6, _out7)

#define LASX_BUTTERFLY_8_D(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                           _out7)                                           \
  LASX_BUTTERFLY_8_GENERIC(__lasx_xvadd_d, __lasx_xvsub_d, _in0, _in1,      \
                           _in2, _in3, _in4, _in5, _in6, _in7, _out0,       \
                           _out1, _out2, _out3, _out4, _out5, _out6, _out7)
2067 
2068 #endif  // LASX
2069 
/*
 * =============================================================================
 * Description : Print out elements in vector.
 * Arguments   : Inputs  - RTYPE, element_num, in0, enter
 *               Outputs - none (text is written with printf)
 * Details     : Casts 'in0' to 'RTYPE' and prints its first 'element_num'
 *               elements, each converted to int and followed by a comma. If
 *               'enter' is non-zero, the prefix "\nVP:" is printed first.
 *               'in0' and 'element_num' may be arbitrary expressions; note
 *               that 'element_num' is evaluated once per loop iteration.
 *               printf must be declared where the macro is expanded.
 * Example     : VECT_PRINT(v4i32, 4, in0, 1); // in0: 1,2,3,4
 *               VP:1,2,3,4,
 * =============================================================================
 */
#define VECT_PRINT(RTYPE, element_num, in0, enter)                        \
  {                                                                       \
    RTYPE _tmp0 = (RTYPE)(in0);                                           \
    int _i = 0;                                                           \
    if (enter) printf("\nVP:");                                           \
    /* The (int) conversion keeps the "%d" argument well-defined. */      \
    for (_i = 0; _i < (element_num); _i++) printf("%d,", (int)_tmp0[_i]); \
  }
2088 
2089 #endif /* LOONGSON_INTRINSICS_H */
2090 #endif /* VPX_VPX_UTIL_LOONGSON_INTRINSICS_H_ */
2091