1 /*
2 * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 *
10 */
11
12 #ifndef VPX_VPX_UTIL_LOONGSON_INTRINSICS_H_
13 #define VPX_VPX_UTIL_LOONGSON_INTRINSICS_H_
14
15 /*
16 * Copyright (c) 2021 Loongson Technology Corporation Limited
17 * All rights reserved.
18 *
19 * Use of this source code is governed by a BSD-style license
20 * that can be found in the LICENSE file in the root of the source
21 * tree. An additional intellectual property rights grant can be found
22 * in the file PATENTS. All contributing project authors may
23 * be found in the AUTHORS file in the root of the source tree.
24 *
25 * Contributed by Shiyou Yin <[email protected]>
26 * Xiwei Gu <[email protected]>
27 * Lu Wang <[email protected]>
28 *
29 * This file is a header file for loongarch builtin extension.
30 *
31 */
32
33 #ifndef LOONGSON_INTRINSICS_H
34 #define LOONGSON_INTRINSICS_H
35
36 /**
37 * MAJOR version: Macro usage changes.
38 * MINOR version: Add new functions, or bug fixes.
39 * MICRO version: Comment changes or implementation changes.
40 */
/* Major: bumped when the way callers use the macros changes. */
#define LSOM_VERSION_MAJOR 1
/* Minor: bumped when functions are added or bugs are fixed. */
#define LSOM_VERSION_MINOR 2
/* Micro: bumped for comment-only or implementation-only changes. */
#define LSOM_VERSION_MICRO 1
44
/*
 * Apply the one-operand operation _INS to two independent inputs:
 *   _OUT0 = _INS(_IN0);
 *   _OUT1 = _INS(_IN1);
 * _INS may be a function or a function-like macro. The do { } while (0)
 * wrapper makes "DUP2_ARG1(...);" exactly one statement, so the macro can be
 * the body of an unbraced if that has an else branch.
 */
#define DUP2_ARG1(_INS, _IN0, _IN1, _OUT0, _OUT1) \
  do { \
    _OUT0 = _INS(_IN0); \
    _OUT1 = _INS(_IN1); \
  } while (0)
50
/*
 * Apply the two-operand operation _INS to two independent operand pairs:
 *   _OUT0 = _INS(_IN0, _IN1);
 *   _OUT1 = _INS(_IN2, _IN3);
 * The do { } while (0) wrapper makes "DUP2_ARG2(...);" exactly one statement,
 * so the macro can be the body of an unbraced if that has an else branch.
 */
#define DUP2_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1) \
  do { \
    _OUT0 = _INS(_IN0, _IN1); \
    _OUT1 = _INS(_IN2, _IN3); \
  } while (0)
56
/*
 * Apply the three-operand operation _INS to two independent operand triples:
 *   _OUT0 = _INS(_IN0, _IN1, _IN2);
 *   _OUT1 = _INS(_IN3, _IN4, _IN5);
 * The do { } while (0) wrapper makes "DUP2_ARG3(...);" exactly one statement,
 * so the macro can be the body of an unbraced if that has an else branch.
 */
#define DUP2_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _OUT0, _OUT1) \
  do { \
    _OUT0 = _INS(_IN0, _IN1, _IN2); \
    _OUT1 = _INS(_IN3, _IN4, _IN5); \
  } while (0)
62
/*
 * Apply the one-operand operation _INS to four independent inputs, producing
 * _OUT0.._OUT3 in that order (two DUP2_ARG1 expansions back to back).
 * The do { } while (0) wrapper makes "DUP4_ARG1(...);" exactly one statement,
 * so the macro can be the body of an unbraced if that has an else branch.
 */
#define DUP4_ARG1(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1, _OUT2, _OUT3) \
  do { \
    DUP2_ARG1(_INS, _IN0, _IN1, _OUT0, _OUT1); \
    DUP2_ARG1(_INS, _IN2, _IN3, _OUT2, _OUT3); \
  } while (0)
68
/*
 * Apply the two-operand operation _INS to four independent operand pairs,
 * producing _OUT0.._OUT3 in that order (two DUP2_ARG2 expansions).
 * The do { } while (0) wrapper makes "DUP4_ARG2(...);" exactly one statement,
 * so the macro can be the body of an unbraced if that has an else branch.
 */
#define DUP4_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _IN6, _IN7, _OUT0, \
                  _OUT1, _OUT2, _OUT3) \
  do { \
    DUP2_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1); \
    DUP2_ARG2(_INS, _IN4, _IN5, _IN6, _IN7, _OUT2, _OUT3); \
  } while (0)
75
/*
 * Apply the three-operand operation _INS to four independent operand triples,
 * producing _OUT0.._OUT3 in that order (two DUP2_ARG3 expansions).
 * The do { } while (0) wrapper makes "DUP4_ARG3(...);" exactly one statement,
 * so the macro can be the body of an unbraced if that has an else branch.
 */
#define DUP4_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _IN6, _IN7, _IN8, \
                  _IN9, _IN10, _IN11, _OUT0, _OUT1, _OUT2, _OUT3) \
  do { \
    DUP2_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _OUT0, _OUT1); \
    DUP2_ARG3(_INS, _IN6, _IN7, _IN8, _IN9, _IN10, _IN11, _OUT2, _OUT3); \
  } while (0)
82
83 #ifdef __loongarch_sx
84 #include <lsxintrin.h>
85 /*
86 * =============================================================================
87 * Description : Dot product & addition of byte vector elements
88 * Arguments : Inputs - in_c, in_h, in_l
89 * Outputs - out
90 * Return Type - halfword
91 * Details : Signed byte elements from in_h are multiplied by
92 * signed byte elements from in_l, and then added adjacent to
93 * each other to get a result twice the size of input. Then
94 * the results are added to signed half-word elements from in_c.
95 * Example : out = __lsx_vdp2add_h_b(in_c, in_h, in_l)
96 * in_c : 1,2,3,4, 1,2,3,4
97 * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
98 * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
99 * out : 23,40,41,26, 23,40,41,26
100 * =============================================================================
101 */
static inline __m128i __lsx_vdp2add_h_b(__m128i in_c, __m128i in_h,
                                        __m128i in_l) {
  /* Accumulate the even-lane signed byte products onto in_c ... */
  const __m128i even_acc = __lsx_vmaddwev_h_b(in_c, in_h, in_l);

  /* ... then add the odd-lane products, completing each adjacent-pair sum. */
  return __lsx_vmaddwod_h_b(even_acc, in_h, in_l);
}
110
111 /*
112 * =============================================================================
113 * Description : Dot product & addition of byte vector elements
114 * Arguments : Inputs - in_c, in_h, in_l
115 * Outputs - out
116 * Return Type - halfword
117 * Details : Unsigned byte elements from in_h are multiplied by
118 * unsigned byte elements from in_l, and then added adjacent to
119 * each other to get a result twice the size of input.
120 * The results are added to signed half-word elements from in_c.
121 * Example : out = __lsx_vdp2add_h_bu(in_c, in_h, in_l)
122 * in_c : 1,2,3,4, 1,2,3,4
123 * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
124 * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
125 * out : 23,40,41,26, 23,40,41,26
126 * =============================================================================
127 */
static inline __m128i __lsx_vdp2add_h_bu(__m128i in_c, __m128i in_h,
                                         __m128i in_l) {
  /* Accumulate the even-lane unsigned byte products onto in_c ... */
  const __m128i even_acc = __lsx_vmaddwev_h_bu(in_c, in_h, in_l);

  /* ... then add the odd-lane products, completing each adjacent-pair sum. */
  return __lsx_vmaddwod_h_bu(even_acc, in_h, in_l);
}
136
137 /*
138 * =============================================================================
139 * Description : Dot product & addition of byte vector elements
140 * Arguments : Inputs - in_c, in_h, in_l
141 * Outputs - out
142 * Return Type - halfword
143 * Details : Unsigned byte elements from in_h are multiplied by
144 * signed byte elements from in_l, and then added adjacent to
145 * each other to get a result twice the size of input.
146 * The results are added to signed half-word elements from in_c.
147 * Example : out = __lsx_vdp2add_h_bu_b(in_c, in_h, in_l)
148 * in_c : 1,1,1,1, 1,1,1,1
149 * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
150 * in_l : -1,-2,-3,-4, -5,-6,-7,-8, 1,2,3,4, 5,6,7,8
151 * out : -4,-24,-60,-112, 6,26,62,114
152 * =============================================================================
153 */
static inline __m128i __lsx_vdp2add_h_bu_b(__m128i in_c, __m128i in_h,
                                           __m128i in_l) {
  /* Accumulate the even-lane (unsigned in_h x signed in_l) products ... */
  const __m128i even_acc = __lsx_vmaddwev_h_bu_b(in_c, in_h, in_l);

  /* ... then add the odd-lane products, completing each adjacent-pair sum. */
  return __lsx_vmaddwod_h_bu_b(even_acc, in_h, in_l);
}
162
163 /*
164 * =============================================================================
165 * Description : Dot product & addition of half-word vector elements
166 * Arguments : Inputs - in_c, in_h, in_l
167 * Outputs - out
168 * Return Type - __m128i
169 * Details : Signed half-word elements from in_h are multiplied by
170 * signed half-word elements from in_l, and then added adjacent to
171 * each other to get a result twice the size of input.
172 * Then the results are added to signed word elements from in_c.
 * Example : out = __lsx_vdp2add_w_h(in_c, in_h, in_l)
174 * in_c : 1,2,3,4
175 * in_h : 1,2,3,4, 5,6,7,8
176 * in_l : 8,7,6,5, 4,3,2,1
177 * out : 23,40,41,26
178 * =============================================================================
179 */
static inline __m128i __lsx_vdp2add_w_h(__m128i in_c, __m128i in_h,
                                        __m128i in_l) {
  /* Accumulate the even-lane signed half-word products onto in_c ... */
  const __m128i even_acc = __lsx_vmaddwev_w_h(in_c, in_h, in_l);

  /* ... then add the odd-lane products, completing each adjacent-pair sum. */
  return __lsx_vmaddwod_w_h(even_acc, in_h, in_l);
}
188
189 /*
190 * =============================================================================
191 * Description : Dot product of byte vector elements
192 * Arguments : Inputs - in_h, in_l
193 * Outputs - out
194 * Return Type - halfword
195 * Details : Signed byte elements from in_h are multiplied by
196 * signed byte elements from in_l, and then added adjacent to
197 * each other to get a result twice the size of input.
198 * Example : out = __lsx_vdp2_h_b(in_h, in_l)
199 * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
200 * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
201 * out : 22,38,38,22, 22,38,38,22
202 * =============================================================================
203 */
static inline __m128i __lsx_vdp2_h_b(__m128i in_h, __m128i in_l) {
  /* Start from the widened even-lane signed byte products ... */
  const __m128i even_prod = __lsx_vmulwev_h_b(in_h, in_l);

  /* ... and accumulate the odd-lane products to form each pair sum. */
  return __lsx_vmaddwod_h_b(even_prod, in_h, in_l);
}
211
212 /*
213 * =============================================================================
214 * Description : Dot product of byte vector elements
215 * Arguments : Inputs - in_h, in_l
216 * Outputs - out
217 * Return Type - halfword
218 * Details : Unsigned byte elements from in_h are multiplied by
219 * unsigned byte elements from in_l, and then added adjacent to
220 * each other to get a result twice the size of input.
221 * Example : out = __lsx_vdp2_h_bu(in_h, in_l)
222 * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
223 * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
224 * out : 22,38,38,22, 22,38,38,22
225 * =============================================================================
226 */
static inline __m128i __lsx_vdp2_h_bu(__m128i in_h, __m128i in_l) {
  /* Start from the widened even-lane unsigned byte products ... */
  const __m128i even_prod = __lsx_vmulwev_h_bu(in_h, in_l);

  /* ... and accumulate the odd-lane products to form each pair sum. */
  return __lsx_vmaddwod_h_bu(even_prod, in_h, in_l);
}
234
235 /*
236 * =============================================================================
237 * Description : Dot product of byte vector elements
238 * Arguments : Inputs - in_h, in_l
239 * Outputs - out
240 * Return Type - halfword
241 * Details : Unsigned byte elements from in_h are multiplied by
242 * signed byte elements from in_l, and then added adjacent to
243 * each other to get a result twice the size of input.
244 * Example : out = __lsx_vdp2_h_bu_b(in_h, in_l)
245 * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
246 * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,-1
247 * out : 22,38,38,22, 22,38,38,6
248 * =============================================================================
249 */
static inline __m128i __lsx_vdp2_h_bu_b(__m128i in_h, __m128i in_l) {
  /* Start from the even-lane (unsigned in_h x signed in_l) products ... */
  const __m128i even_prod = __lsx_vmulwev_h_bu_b(in_h, in_l);

  /* ... and accumulate the odd-lane products to form each pair sum. */
  return __lsx_vmaddwod_h_bu_b(even_prod, in_h, in_l);
}
257
258 /*
259 * =============================================================================
 * Description : Dot product of half-word vector elements
 * Arguments : Inputs - in_h, in_l
 * Outputs - out
 * Return Type - word
 * Details : Signed half-word elements from in_h are multiplied by
 * signed half-word elements from in_l, and then added adjacent to
 * each other to get a result twice the size of input.
267 * Example : out = __lsx_vdp2_w_h(in_h, in_l)
268 * in_h : 1,2,3,4, 5,6,7,8
269 * in_l : 8,7,6,5, 4,3,2,1
270 * out : 22,38,38,22
271 * =============================================================================
272 */
static inline __m128i __lsx_vdp2_w_h(__m128i in_h, __m128i in_l) {
  /* Start from the widened even-lane signed half-word products ... */
  const __m128i even_prod = __lsx_vmulwev_w_h(in_h, in_l);

  /* ... and accumulate the odd-lane products to form each pair sum. */
  return __lsx_vmaddwod_w_h(even_prod, in_h, in_l);
}
280
281 /*
282 * =============================================================================
 * Description : Dot product of word vector elements
 * Arguments : Inputs - in_h, in_l
 * Outputs - out
 * Return Type - double-word
 * Details : Signed word elements from in_h are multiplied by
 * signed word elements from in_l, and then added adjacent to
 * each other to get a result twice the size of input.
290 * Example : out = __lsx_vdp2_d_w(in_h, in_l)
291 * in_h : 1,2,3,4
292 * in_l : 8,7,6,5
293 * out : 22,38
294 * =============================================================================
295 */
static inline __m128i __lsx_vdp2_d_w(__m128i in_h, __m128i in_l) {
  /* Start from the widened even-lane signed word products ... */
  const __m128i even_prod = __lsx_vmulwev_d_w(in_h, in_l);

  /* ... and accumulate the odd-lane products to form each pair sum. */
  return __lsx_vmaddwod_d_w(even_prod, in_h, in_l);
}
303
304 /*
305 * =============================================================================
306 * Description : Clip all halfword elements of input vector between min & max
307 * out = ((_in) < (min)) ? (min) : (((_in) > (max)) ? (max) :
308 * (_in))
309 * Arguments : Inputs - _in (input vector)
310 * - min (min threshold)
311 * - max (max threshold)
312 * Outputs - out (output vector with clipped elements)
313 * Return Type - signed halfword
 * Example : out = __lsx_vclip_h(_in, min, max)
315 * _in : -8,2,280,249, -8,255,280,249
316 * min : 1,1,1,1, 1,1,1,1
317 * max : 9,9,9,9, 9,9,9,9
318 * out : 1,2,9,9, 1,9,9,9
319 * =============================================================================
320 */
static inline __m128i __lsx_vclip_h(__m128i _in, __m128i min, __m128i max) {
  /* Lower bound first: take the per-element maximum of min and _in ... */
  const __m128i floored = __lsx_vmax_h(min, _in);

  /* ... then the upper bound: per-element minimum of max and that result. */
  return __lsx_vmin_h(max, floored);
}
328
329 /*
330 * =============================================================================
331 * Description : Set each element of vector between 0 and 255
332 * Arguments : Inputs - _in
333 * Outputs - out
334 * Return Type - halfword
 * Details : Signed half-word elements from _in are clamped between 0 and 255.
336 * Example : out = __lsx_vclip255_h(_in)
337 * _in : -8,255,280,249, -8,255,280,249
338 * out : 0,255,255,249, 0,255,255,249
339 * =============================================================================
340 */
static inline __m128i __lsx_vclip255_h(__m128i _in) {
  /* Replace negative half-words with 0 (per-element max against 0) ... */
  const __m128i non_negative = __lsx_vmaxi_h(_in, 0);

  /* ... then saturate as unsigned with immediate 7 to apply the 255 cap. */
  return __lsx_vsat_hu(non_negative, 7);
}
348
349 /*
350 * =============================================================================
351 * Description : Set each element of vector between 0 and 255
352 * Arguments : Inputs - _in
353 * Outputs - out
354 * Return Type - word
 * Details : Signed word elements from _in are clamped between 0 and 255.
356 * Example : out = __lsx_vclip255_w(_in)
357 * _in : -8,255,280,249
358 * out : 0,255,255,249
359 * =============================================================================
360 */
static inline __m128i __lsx_vclip255_w(__m128i _in) {
  /* Replace negative words with 0 (per-element max against 0) ... */
  const __m128i non_negative = __lsx_vmaxi_w(_in, 0);

  /* ... then saturate as unsigned with immediate 7 to apply the 255 cap. */
  return __lsx_vsat_wu(non_negative, 7);
}
368
369 /*
370 * =============================================================================
371 * Description : Swap two variables
372 * Arguments : Inputs - _in0, _in1
373 * Outputs - _in0, _in1 (in-place)
374 * Details : Swapping of two input variables using xor
375 * Example : LSX_SWAP(_in0, _in1)
376 * _in0 : 1,2,3,4
377 * _in1 : 5,6,7,8
378 * _in0(out) : 5,6,7,8
379 * _in1(out) : 1,2,3,4
380 * =============================================================================
381 */
/*
 * Exchange the contents of _in0 and _in1 in place with three xors; no
 * temporary is used. Both arguments must be modifiable lvalues naming two
 * distinct objects: xor-swapping an object with itself leaves it all zero.
 * The do { } while (0) wrapper makes "LSX_SWAP(a, b);" exactly one statement,
 * so the macro can be the body of an unbraced if that has an else branch.
 */
#define LSX_SWAP(_in0, _in1) \
  do { \
    _in0 = __lsx_vxor_v(_in0, _in1); /* _in0 = a ^ b */ \
    _in1 = __lsx_vxor_v(_in0, _in1); /* _in1 = (a ^ b) ^ b = a */ \
    _in0 = __lsx_vxor_v(_in0, _in1); /* _in0 = (a ^ b) ^ a = b */ \
  } while (0)
388
389 /*
390 * =============================================================================
391 * Description : Transpose 4x4 block with word elements in vectors
392 * Arguments : Inputs - in0, in1, in2, in3
393 * Outputs - out0, out1, out2, out3
394 * Details :
395 * Example :
396 * 1, 2, 3, 4 1, 5, 9,13
397 * 5, 6, 7, 8 to 2, 6,10,14
398 * 9,10,11,12 =====> 3, 7,11,15
399 * 13,14,15,16 4, 8,12,16
400 * =============================================================================
401 */
/*
 * Transpose a 4x4 block of word elements held in four vectors.
 * Inputs : _in0.._in3   (rows)
 * Outputs: _out0.._out3 (columns; assigned only after every input is read)
 * The do { } while (0) wrapper makes the invocation exactly one statement,
 * so the macro can be the body of an unbraced if that has an else branch.
 */
#define LSX_TRANSPOSE4x4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  do { \
    __m128i _t0, _t1, _t2, _t3; \
    \
    /* Word-interleave row pairs (0,1) and (2,3), low and high halves. */ \
    _t0 = __lsx_vilvl_w(_in1, _in0); \
    _t1 = __lsx_vilvh_w(_in1, _in0); \
    _t2 = __lsx_vilvl_w(_in3, _in2); \
    _t3 = __lsx_vilvh_w(_in3, _in2); \
    /* Double-word interleave the partial results into the four outputs. */ \
    _out0 = __lsx_vilvl_d(_t2, _t0); \
    _out1 = __lsx_vilvh_d(_t2, _t0); \
    _out2 = __lsx_vilvl_d(_t3, _t1); \
    _out3 = __lsx_vilvh_d(_t3, _t1); \
  } while (0)
415
416 /*
417 * =============================================================================
418 * Description : Transpose 8x8 block with byte elements in vectors
419 * Arguments : Inputs - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
420 * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
421 * _out7
422 * Details : The rows of the matrix become columns, and the columns
423 * become rows.
424 * Example : LSX_TRANSPOSE8x8_B
425 * _in0 : 00,01,02,03,04,05,06,07, 00,00,00,00,00,00,00,00
426 * _in1 : 10,11,12,13,14,15,16,17, 00,00,00,00,00,00,00,00
427 * _in2 : 20,21,22,23,24,25,26,27, 00,00,00,00,00,00,00,00
428 * _in3 : 30,31,32,33,34,35,36,37, 00,00,00,00,00,00,00,00
429 * _in4 : 40,41,42,43,44,45,46,47, 00,00,00,00,00,00,00,00
430 * _in5 : 50,51,52,53,54,55,56,57, 00,00,00,00,00,00,00,00
431 * _in6 : 60,61,62,63,64,65,66,67, 00,00,00,00,00,00,00,00
432 * _in7 : 70,71,72,73,74,75,76,77, 00,00,00,00,00,00,00,00
433 *
 * _out0 : 00,10,20,30,40,50,60,70, 00,00,00,00,00,00,00,00
 * _out1 : 01,11,21,31,41,51,61,71, 00,00,00,00,00,00,00,00
 * _out2 : 02,12,22,32,42,52,62,72, 00,00,00,00,00,00,00,00
 * _out3 : 03,13,23,33,43,53,63,73, 00,00,00,00,00,00,00,00
 * _out4 : 04,14,24,34,44,54,64,74, 00,00,00,00,00,00,00,00
 * _out5 : 05,15,25,35,45,55,65,75, 00,00,00,00,00,00,00,00
 * _out6 : 06,16,26,36,46,56,66,76, 00,00,00,00,00,00,00,00
 * _out7 : 07,17,27,37,47,57,67,77, 00,00,00,00,00,00,00,00
442 * =============================================================================
443 */
/*
 * Transpose an 8x8 block of byte elements held in eight vectors.
 * Inputs : _in0.._in7   (rows)
 * Outputs: _out0.._out7 (columns)
 * All locals carry a leading underscore so that a caller argument named
 * "zero" or "shuf8" is not captured by the macro's own temporaries.
 * The do { } while (0) wrapper makes the invocation exactly one statement,
 * so the macro can be the body of an unbraced if that has an else branch.
 */
#define LSX_TRANSPOSE8x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                           _out7) \
  do { \
    __m128i _zero = { 0 }; \
    __m128i _shuf8 = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; \
    __m128i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7; \
    \
    /* Byte-interleave the low halves of rows two apart ... */ \
    _t0 = __lsx_vilvl_b(_in2, _in0); \
    _t1 = __lsx_vilvl_b(_in3, _in1); \
    _t2 = __lsx_vilvl_b(_in6, _in4); \
    _t3 = __lsx_vilvl_b(_in7, _in5); \
    /* ... then byte-interleave those results with each other. */ \
    _t4 = __lsx_vilvl_b(_t1, _t0); \
    _t5 = __lsx_vilvh_b(_t1, _t0); \
    _t6 = __lsx_vilvl_b(_t3, _t2); \
    _t7 = __lsx_vilvh_b(_t3, _t2); \
    /* Word-interleave to produce the even-numbered outputs. */ \
    _out0 = __lsx_vilvl_w(_t6, _t4); \
    _out2 = __lsx_vilvh_w(_t6, _t4); \
    _out4 = __lsx_vilvl_w(_t7, _t5); \
    _out6 = __lsx_vilvh_w(_t7, _t5); \
    /* Each odd output is shuffled out of its even neighbour and _zero. */ \
    /* NOTE(review): _shuf8 appears to select bytes 8..15 of the even */ \
    /* output followed by zero bytes - confirm against vshuf.b indexing. */ \
    _out1 = __lsx_vshuf_b(_zero, _out0, _shuf8); \
    _out3 = __lsx_vshuf_b(_zero, _out2, _shuf8); \
    _out5 = __lsx_vshuf_b(_zero, _out4, _shuf8); \
    _out7 = __lsx_vshuf_b(_zero, _out6, _shuf8); \
  } while (0)
469
470 /*
471 * =============================================================================
472 * Description : Transpose 8x8 block with half-word elements in vectors
473 * Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
474 * Outputs - out0, out1, out2, out3, out4, out5, out6, out7
475 * Details :
476 * Example :
477 * 00,01,02,03,04,05,06,07 00,10,20,30,40,50,60,70
478 * 10,11,12,13,14,15,16,17 01,11,21,31,41,51,61,71
479 * 20,21,22,23,24,25,26,27 02,12,22,32,42,52,62,72
480 * 30,31,32,33,34,35,36,37 to 03,13,23,33,43,53,63,73
481 * 40,41,42,43,44,45,46,47 ======> 04,14,24,34,44,54,64,74
482 * 50,51,52,53,54,55,56,57 05,15,25,35,45,55,65,75
483 * 60,61,62,63,64,65,66,67 06,16,26,36,46,56,66,76
484 * 70,71,72,73,74,75,76,77 07,17,27,37,47,57,67,77
485 * =============================================================================
486 */
/*
 * Transpose an 8x8 block of half-word elements held in eight vectors.
 * Inputs : _in0.._in7   (rows)
 * Outputs: _out0.._out7 (columns; assigned only after every input is read)
 * The do { } while (0) wrapper makes the invocation exactly one statement,
 * so the macro can be the body of an unbraced if that has an else branch.
 */
#define LSX_TRANSPOSE8x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                           _out7) \
  do { \
    __m128i _s0, _s1, _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7; \
    \
    /* Rows 4..7: two rounds of half-word interleave, low halves first. */ \
    _s0 = __lsx_vilvl_h(_in6, _in4); \
    _s1 = __lsx_vilvl_h(_in7, _in5); \
    _t0 = __lsx_vilvl_h(_s1, _s0); \
    _t1 = __lsx_vilvh_h(_s1, _s0); \
    _s0 = __lsx_vilvh_h(_in6, _in4); \
    _s1 = __lsx_vilvh_h(_in7, _in5); \
    _t2 = __lsx_vilvl_h(_s1, _s0); \
    _t3 = __lsx_vilvh_h(_s1, _s0); \
    /* Rows 0..3: the same two rounds. */ \
    _s0 = __lsx_vilvl_h(_in2, _in0); \
    _s1 = __lsx_vilvl_h(_in3, _in1); \
    _t4 = __lsx_vilvl_h(_s1, _s0); \
    _t5 = __lsx_vilvh_h(_s1, _s0); \
    _s0 = __lsx_vilvh_h(_in2, _in0); \
    _s1 = __lsx_vilvh_h(_in3, _in1); \
    _t6 = __lsx_vilvl_h(_s1, _s0); \
    _t7 = __lsx_vilvh_h(_s1, _s0); \
    \
    /* Even outputs pick the even double-words, odd outputs the odd ones. */ \
    _out0 = __lsx_vpickev_d(_t0, _t4); \
    _out2 = __lsx_vpickev_d(_t1, _t5); \
    _out4 = __lsx_vpickev_d(_t2, _t6); \
    _out6 = __lsx_vpickev_d(_t3, _t7); \
    _out1 = __lsx_vpickod_d(_t0, _t4); \
    _out3 = __lsx_vpickod_d(_t1, _t5); \
    _out5 = __lsx_vpickod_d(_t2, _t6); \
    _out7 = __lsx_vpickod_d(_t3, _t7); \
  } while (0)
519
520 /*
521 * =============================================================================
522 * Description : Transpose input 8x4 byte block into 4x8
 * Arguments : Inputs - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
 * (input 8x4 byte block)
 * Outputs - _out0, _out1, _out2, _out3 (output 4x8 byte block)
 * Return Type - none (results are written to the output arguments)
526 * Details : The rows of the matrix become columns, and the columns become
527 * rows.
528 * Example : LSX_TRANSPOSE8x4_B
529 * _in0 : 00,01,02,03,00,00,00,00, 00,00,00,00,00,00,00,00
530 * _in1 : 10,11,12,13,00,00,00,00, 00,00,00,00,00,00,00,00
531 * _in2 : 20,21,22,23,00,00,00,00, 00,00,00,00,00,00,00,00
532 * _in3 : 30,31,32,33,00,00,00,00, 00,00,00,00,00,00,00,00
533 * _in4 : 40,41,42,43,00,00,00,00, 00,00,00,00,00,00,00,00
534 * _in5 : 50,51,52,53,00,00,00,00, 00,00,00,00,00,00,00,00
535 * _in6 : 60,61,62,63,00,00,00,00, 00,00,00,00,00,00,00,00
536 * _in7 : 70,71,72,73,00,00,00,00, 00,00,00,00,00,00,00,00
537 *
538 * _out0 : 00,10,20,30,40,50,60,70, 00,00,00,00,00,00,00,00
539 * _out1 : 01,11,21,31,41,51,61,71, 00,00,00,00,00,00,00,00
540 * _out2 : 02,12,22,32,42,52,62,72, 00,00,00,00,00,00,00,00
541 * _out3 : 03,13,23,33,43,53,63,73, 00,00,00,00,00,00,00,00
542 * =============================================================================
543 */
/*
 * Transpose an 8x4 byte block (eight input rows) into four output vectors.
 * Inputs : _in0.._in7
 * Outputs: _out0.._out3
 * _out1 and _out3 are derived from _out0 and _out2 after those have been
 * assigned, so the four output arguments must be distinct objects.
 * The do { } while (0) wrapper makes the invocation exactly one statement,
 * so the macro can be the body of an unbraced if that has an else branch.
 */
#define LSX_TRANSPOSE8x4_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                           _out0, _out1, _out2, _out3) \
  do { \
    __m128i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \
    \
    /* Pair row k with row k + 4 by even word, then byte-interleave. */ \
    _tmp0_m = __lsx_vpackev_w(_in4, _in0); \
    _tmp1_m = __lsx_vpackev_w(_in5, _in1); \
    _tmp2_m = __lsx_vilvl_b(_tmp1_m, _tmp0_m); \
    _tmp0_m = __lsx_vpackev_w(_in6, _in2); \
    _tmp1_m = __lsx_vpackev_w(_in7, _in3); \
    \
    _tmp3_m = __lsx_vilvl_b(_tmp1_m, _tmp0_m); \
    /* Half-word interleave the two byte-interleaved halves. */ \
    _tmp0_m = __lsx_vilvl_h(_tmp3_m, _tmp2_m); \
    _tmp1_m = __lsx_vilvh_h(_tmp3_m, _tmp2_m); \
    \
    /* Word interleave gives _out0/_out2; the odd outputs reuse them. */ \
    _out0 = __lsx_vilvl_w(_tmp1_m, _tmp0_m); \
    _out2 = __lsx_vilvh_w(_tmp1_m, _tmp0_m); \
    _out1 = __lsx_vilvh_d(_out2, _out0); \
    _out3 = __lsx_vilvh_d(_out0, _out2); \
  } while (0)
564
565 /*
566 * =============================================================================
567 * Description : Transpose 16x8 block with byte elements in vectors
 * Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, in8,
569 * in9, in10, in11, in12, in13, in14, in15
570 * Outputs - out0, out1, out2, out3, out4, out5, out6, out7
571 * Details :
572 * Example :
573 * 000,001,002,003,004,005,006,007
574 * 008,009,010,011,012,013,014,015
575 * 016,017,018,019,020,021,022,023
576 * 024,025,026,027,028,029,030,031
577 * 032,033,034,035,036,037,038,039
578 * 040,041,042,043,044,045,046,047 000,008,...,112,120
579 * 048,049,050,051,052,053,054,055 001,009,...,113,121
580 * 056,057,058,059,060,061,062,063 to 002,010,...,114,122
 * 064,065,066,067,068,069,070,071 =====> 003,011,...,115,123
582 * 072,073,074,075,076,077,078,079 004,012,...,116,124
583 * 080,081,082,083,084,085,086,087 005,013,...,117,125
584 * 088,089,090,091,092,093,094,095 006,014,...,118,126
585 * 096,097,098,099,100,101,102,103 007,015,...,119,127
586 * 104,105,106,107,108,109,110,111
587 * 112,113,114,115,116,117,118,119
588 * 120,121,122,123,124,125,126,127
589 * =============================================================================
590 */
/*
 * Transpose a 16x8 byte block: sixteen input rows become eight output
 * vectors.
 * Inputs : _in0.._in15
 * Outputs: _out0.._out7 (assigned only after every input has been read)
 * Built from the DUP2_ARG2 / DUP4_ARG2 helpers defined earlier in this file.
 * The do { } while (0) wrapper makes the invocation exactly one statement,
 * so the macro can be the body of an unbraced if that has an else branch.
 */
#define LSX_TRANSPOSE16x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                            _in8, _in9, _in10, _in11, _in12, _in13, _in14, \
                            _in15, _out0, _out1, _out2, _out3, _out4, _out5, \
                            _out6, _out7) \
  do { \
    __m128i _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7; \
    __m128i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7; \
    /* Byte-interleave the low halves of rows two apart. */ \
    DUP4_ARG2(__lsx_vilvl_b, _in2, _in0, _in3, _in1, _in6, _in4, _in7, _in5, \
              _tmp0, _tmp1, _tmp2, _tmp3); \
    DUP4_ARG2(__lsx_vilvl_b, _in10, _in8, _in11, _in9, _in14, _in12, _in15, \
              _in13, _tmp4, _tmp5, _tmp6, _tmp7); \
    /* Second byte-interleave round, low and high halves. */ \
    DUP2_ARG2(__lsx_vilvl_b, _tmp1, _tmp0, _tmp3, _tmp2, _t0, _t2); \
    DUP2_ARG2(__lsx_vilvh_b, _tmp1, _tmp0, _tmp3, _tmp2, _t1, _t3); \
    DUP2_ARG2(__lsx_vilvl_b, _tmp5, _tmp4, _tmp7, _tmp6, _t4, _t6); \
    DUP2_ARG2(__lsx_vilvh_b, _tmp5, _tmp4, _tmp7, _tmp6, _t5, _t7); \
    /* Word-interleave round. */ \
    DUP2_ARG2(__lsx_vilvl_w, _t2, _t0, _t3, _t1, _tmp0, _tmp4); \
    DUP2_ARG2(__lsx_vilvh_w, _t2, _t0, _t3, _t1, _tmp2, _tmp6); \
    DUP2_ARG2(__lsx_vilvl_w, _t6, _t4, _t7, _t5, _tmp1, _tmp5); \
    DUP2_ARG2(__lsx_vilvh_w, _t6, _t4, _t7, _t5, _tmp3, _tmp7); \
    /* Final double-word interleave writes the eight outputs. */ \
    DUP2_ARG2(__lsx_vilvl_d, _tmp1, _tmp0, _tmp3, _tmp2, _out0, _out2); \
    DUP2_ARG2(__lsx_vilvh_d, _tmp1, _tmp0, _tmp3, _tmp2, _out1, _out3); \
    DUP2_ARG2(__lsx_vilvl_d, _tmp5, _tmp4, _tmp7, _tmp6, _out4, _out6); \
    DUP2_ARG2(__lsx_vilvh_d, _tmp5, _tmp4, _tmp7, _tmp6, _out5, _out7); \
  } while (0)
615
616 /*
617 * =============================================================================
618 * Description : Butterfly of 4 input vectors
619 * Arguments : Inputs - in0, in1, in2, in3
620 * Outputs - out0, out1, out2, out3
621 * Details : Butterfly operation
622 * Example :
623 * out0 = in0 + in3;
624 * out1 = in1 + in2;
625 * out2 = in1 - in2;
626 * out3 = in0 - in3;
627 * =============================================================================
628 */
/*
 * Shared body of the LSX_BUTTERFLY_4_{B,H,W,D} macros. _add and _sub are the
 * element-width specific add/subtract intrinsics:
 *   _out0 = _in0 + _in3;   _out1 = _in1 + _in2;
 *   _out2 = _in1 - _in2;   _out3 = _in0 - _in3;
 * Outputs are assigned in index order, so an output argument must not alias
 * an input that a later assignment still reads.
 * The do { } while (0) wrapper makes each invocation exactly one statement,
 * so the macros can be the body of an unbraced if that has an else branch.
 */
#define LSX_BUTTERFLY_4_IMPL(_add, _sub, _in0, _in1, _in2, _in3, _out0, \
                             _out1, _out2, _out3) \
  do { \
    _out0 = _add(_in0, _in3); \
    _out1 = _add(_in1, _in2); \
    _out2 = _sub(_in1, _in2); \
    _out3 = _sub(_in0, _in3); \
  } while (0)

/* Byte elements. */
#define LSX_BUTTERFLY_4_B(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  LSX_BUTTERFLY_4_IMPL(__lsx_vadd_b, __lsx_vsub_b, _in0, _in1, _in2, _in3, \
                       _out0, _out1, _out2, _out3)
/* Half-word elements. */
#define LSX_BUTTERFLY_4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  LSX_BUTTERFLY_4_IMPL(__lsx_vadd_h, __lsx_vsub_h, _in0, _in1, _in2, _in3, \
                       _out0, _out1, _out2, _out3)
/* Word elements. */
#define LSX_BUTTERFLY_4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  LSX_BUTTERFLY_4_IMPL(__lsx_vadd_w, __lsx_vsub_w, _in0, _in1, _in2, _in3, \
                       _out0, _out1, _out2, _out3)
/* Double-word elements. */
#define LSX_BUTTERFLY_4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  LSX_BUTTERFLY_4_IMPL(__lsx_vadd_d, __lsx_vsub_d, _in0, _in1, _in2, _in3, \
                       _out0, _out1, _out2, _out3)
657
658 /*
659 * =============================================================================
660 * Description : Butterfly of 8 input vectors
661 * Arguments : Inputs - _in0, _in1, _in2, _in3, ~
662 * Outputs - _out0, _out1, _out2, _out3, ~
663 * Details : Butterfly operation
664 * Example :
665 * _out0 = _in0 + _in7;
666 * _out1 = _in1 + _in6;
667 * _out2 = _in2 + _in5;
668 * _out3 = _in3 + _in4;
669 * _out4 = _in3 - _in4;
670 * _out5 = _in2 - _in5;
671 * _out6 = _in1 - _in6;
672 * _out7 = _in0 - _in7;
673 * =============================================================================
674 */
/*
 * Shared body of the LSX_BUTTERFLY_8_{B,H,W,D} macros. _add and _sub are the
 * element-width specific add/subtract intrinsics. For k = 0..3:
 *   _out<k>     = _in<k>     + _in<7 - k>;
 *   _out<4 + k> = _in<3 - k> - _in<4 + k>;
 * Outputs are assigned in index order, so an output argument must not alias
 * an input that a later assignment still reads.
 * The do { } while (0) wrapper makes each invocation exactly one statement,
 * so the macros can be the body of an unbraced if that has an else branch.
 */
#define LSX_BUTTERFLY_8_IMPL(_add, _sub, _in0, _in1, _in2, _in3, _in4, _in5, \
                             _in6, _in7, _out0, _out1, _out2, _out3, _out4, \
                             _out5, _out6, _out7) \
  do { \
    _out0 = _add(_in0, _in7); \
    _out1 = _add(_in1, _in6); \
    _out2 = _add(_in2, _in5); \
    _out3 = _add(_in3, _in4); \
    _out4 = _sub(_in3, _in4); \
    _out5 = _sub(_in2, _in5); \
    _out6 = _sub(_in1, _in6); \
    _out7 = _sub(_in0, _in7); \
  } while (0)

/* Byte elements. */
#define LSX_BUTTERFLY_8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                          _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                          _out7) \
  LSX_BUTTERFLY_8_IMPL(__lsx_vadd_b, __lsx_vsub_b, _in0, _in1, _in2, _in3, \
                       _in4, _in5, _in6, _in7, _out0, _out1, _out2, _out3, \
                       _out4, _out5, _out6, _out7)

/* Half-word elements. */
#define LSX_BUTTERFLY_8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                          _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                          _out7) \
  LSX_BUTTERFLY_8_IMPL(__lsx_vadd_h, __lsx_vsub_h, _in0, _in1, _in2, _in3, \
                       _in4, _in5, _in6, _in7, _out0, _out1, _out2, _out3, \
                       _out4, _out5, _out6, _out7)

/* Word elements. */
#define LSX_BUTTERFLY_8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                          _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                          _out7) \
  LSX_BUTTERFLY_8_IMPL(__lsx_vadd_w, __lsx_vsub_w, _in0, _in1, _in2, _in3, \
                       _in4, _in5, _in6, _in7, _out0, _out1, _out2, _out3, \
                       _out4, _out5, _out6, _out7)

/* Double-word elements. */
#define LSX_BUTTERFLY_8_D(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                          _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                          _out7) \
  LSX_BUTTERFLY_8_IMPL(__lsx_vadd_d, __lsx_vsub_d, _in0, _in1, _in2, _in3, \
                       _in4, _in5, _in6, _in7, _out0, _out1, _out2, _out3, \
                       _out4, _out5, _out6, _out7)
730
731 /*
732 * =============================================================================
733 * Description : Butterfly of 16 input vectors
734 * Arguments : Inputs - _in0, _in1, _in2, _in3, ~
735 * Outputs - _out0, _out1, _out2, _out3, ~
736 * Details : Butterfly operation
737 * Example :
738 * _out0 = _in0 + _in15;
739 * _out1 = _in1 + _in14;
740 * _out2 = _in2 + _in13;
741 * _out3 = _in3 + _in12;
742 * _out4 = _in4 + _in11;
743 * _out5 = _in5 + _in10;
744 * _out6 = _in6 + _in9;
745 * _out7 = _in7 + _in8;
746 * _out8 = _in7 - _in8;
747 * _out9 = _in6 - _in9;
748 * _out10 = _in5 - _in10;
749 * _out11 = _in4 - _in11;
750 * _out12 = _in3 - _in12;
751 * _out13 = _in2 - _in13;
752 * _out14 = _in1 - _in14;
753 * _out15 = _in0 - _in15;
754 * =============================================================================
755 */
756
757 #define LSX_BUTTERFLY_16_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
758 _in8, _in9, _in10, _in11, _in12, _in13, _in14, \
759 _in15, _out0, _out1, _out2, _out3, _out4, _out5, \
760 _out6, _out7, _out8, _out9, _out10, _out11, _out12, \
761 _out13, _out14, _out15) \
762 { \
763 _out0 = __lsx_vadd_b(_in0, _in15); \
764 _out1 = __lsx_vadd_b(_in1, _in14); \
765 _out2 = __lsx_vadd_b(_in2, _in13); \
766 _out3 = __lsx_vadd_b(_in3, _in12); \
767 _out4 = __lsx_vadd_b(_in4, _in11); \
768 _out5 = __lsx_vadd_b(_in5, _in10); \
769 _out6 = __lsx_vadd_b(_in6, _in9); \
770 _out7 = __lsx_vadd_b(_in7, _in8); \
771 \
772 _out8 = __lsx_vsub_b(_in7, _in8); \
773 _out9 = __lsx_vsub_b(_in6, _in9); \
774 _out10 = __lsx_vsub_b(_in5, _in10); \
775 _out11 = __lsx_vsub_b(_in4, _in11); \
776 _out12 = __lsx_vsub_b(_in3, _in12); \
777 _out13 = __lsx_vsub_b(_in2, _in13); \
778 _out14 = __lsx_vsub_b(_in1, _in14); \
779 _out15 = __lsx_vsub_b(_in0, _in15); \
780 }
781
782 #define LSX_BUTTERFLY_16_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
783 _in8, _in9, _in10, _in11, _in12, _in13, _in14, \
784 _in15, _out0, _out1, _out2, _out3, _out4, _out5, \
785 _out6, _out7, _out8, _out9, _out10, _out11, _out12, \
786 _out13, _out14, _out15) \
787 { \
788 _out0 = __lsx_vadd_h(_in0, _in15); \
789 _out1 = __lsx_vadd_h(_in1, _in14); \
790 _out2 = __lsx_vadd_h(_in2, _in13); \
791 _out3 = __lsx_vadd_h(_in3, _in12); \
792 _out4 = __lsx_vadd_h(_in4, _in11); \
793 _out5 = __lsx_vadd_h(_in5, _in10); \
794 _out6 = __lsx_vadd_h(_in6, _in9); \
795 _out7 = __lsx_vadd_h(_in7, _in8); \
796 \
797 _out8 = __lsx_vsub_h(_in7, _in8); \
798 _out9 = __lsx_vsub_h(_in6, _in9); \
799 _out10 = __lsx_vsub_h(_in5, _in10); \
800 _out11 = __lsx_vsub_h(_in4, _in11); \
801 _out12 = __lsx_vsub_h(_in3, _in12); \
802 _out13 = __lsx_vsub_h(_in2, _in13); \
803 _out14 = __lsx_vsub_h(_in1, _in14); \
804 _out15 = __lsx_vsub_h(_in0, _in15); \
805 }
806
/*
 * 16-point butterfly on word elements.
 * For k = 0..7:  _out[k]      = _in[k] + _in[15 - k]   (__lsx_vadd_w)
 *                _out[15 - k] = _in[k] - _in[15 - k]   (__lsx_vsub_w)
 * Every input argument is expanded twice, so pass side-effect-free
 * expressions. All eight sums are stored before any difference is computed
 * and every input is read again by the differences, so _out0.._out7 must not
 * name the same variable as any input.
 */
#define LSX_BUTTERFLY_16_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                           _in8, _in9, _in10, _in11, _in12, _in13, _in14,   \
                           _in15, _out0, _out1, _out2, _out3, _out4, _out5, \
                           _out6, _out7, _out8, _out9, _out10, _out11, _out12, \
                           _out13, _out14, _out15)                          \
  {                                                                         \
    _out0 = __lsx_vadd_w(_in0, _in15);                                      \
    _out1 = __lsx_vadd_w(_in1, _in14);                                      \
    _out2 = __lsx_vadd_w(_in2, _in13);                                      \
    _out3 = __lsx_vadd_w(_in3, _in12);                                      \
    _out4 = __lsx_vadd_w(_in4, _in11);                                      \
    _out5 = __lsx_vadd_w(_in5, _in10);                                      \
    _out6 = __lsx_vadd_w(_in6, _in9);                                       \
    _out7 = __lsx_vadd_w(_in7, _in8);                                       \
                                                                            \
    _out8 = __lsx_vsub_w(_in7, _in8);                                       \
    _out9 = __lsx_vsub_w(_in6, _in9);                                       \
    _out10 = __lsx_vsub_w(_in5, _in10);                                     \
    _out11 = __lsx_vsub_w(_in4, _in11);                                     \
    _out12 = __lsx_vsub_w(_in3, _in12);                                     \
    _out13 = __lsx_vsub_w(_in2, _in13);                                     \
    _out14 = __lsx_vsub_w(_in1, _in14);                                     \
    _out15 = __lsx_vsub_w(_in0, _in15);                                     \
  }
831
/*
 * 16-point butterfly on doubleword elements.
 * For k = 0..7:  _out[k]      = _in[k] + _in[15 - k]   (__lsx_vadd_d)
 *                _out[15 - k] = _in[k] - _in[15 - k]   (__lsx_vsub_d)
 * Every input argument is expanded twice, so pass side-effect-free
 * expressions. All eight sums are stored before any difference is computed
 * and every input is read again by the differences, so _out0.._out7 must not
 * name the same variable as any input.
 */
#define LSX_BUTTERFLY_16_D(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                           _in8, _in9, _in10, _in11, _in12, _in13, _in14,   \
                           _in15, _out0, _out1, _out2, _out3, _out4, _out5, \
                           _out6, _out7, _out8, _out9, _out10, _out11, _out12, \
                           _out13, _out14, _out15)                          \
  {                                                                         \
    _out0 = __lsx_vadd_d(_in0, _in15);                                      \
    _out1 = __lsx_vadd_d(_in1, _in14);                                      \
    _out2 = __lsx_vadd_d(_in2, _in13);                                      \
    _out3 = __lsx_vadd_d(_in3, _in12);                                      \
    _out4 = __lsx_vadd_d(_in4, _in11);                                      \
    _out5 = __lsx_vadd_d(_in5, _in10);                                      \
    _out6 = __lsx_vadd_d(_in6, _in9);                                       \
    _out7 = __lsx_vadd_d(_in7, _in8);                                       \
                                                                            \
    _out8 = __lsx_vsub_d(_in7, _in8);                                       \
    _out9 = __lsx_vsub_d(_in6, _in9);                                       \
    _out10 = __lsx_vsub_d(_in5, _in10);                                     \
    _out11 = __lsx_vsub_d(_in4, _in11);                                     \
    _out12 = __lsx_vsub_d(_in3, _in12);                                     \
    _out13 = __lsx_vsub_d(_in2, _in13);                                     \
    _out14 = __lsx_vsub_d(_in1, _in14);                                     \
    _out15 = __lsx_vsub_d(_in0, _in15);                                     \
  }
856
857 #endif // LSX
858
859 #ifdef __loongarch_asx
860 #include <lasxintrin.h>
861 /*
862 * =============================================================================
863 * Description : Dot product of byte vector elements
864 * Arguments : Inputs - in_h, in_l
865 * Output - out
866 * Return Type - signed halfword
867 * Details : Unsigned byte elements from in_h are multiplied with
868 * unsigned byte elements from in_l producing a result
869 * twice the size of input i.e. signed halfword.
870 * Then these multiplied results of adjacent odd-even elements
871 * are added to the out vector
872 * Example : See out = __lasx_xvdp2_w_h(in_h, in_l)
873 * =============================================================================
874 */
__lasx_xvdp2_h_bu(__m256i in_h,__m256i in_l)875 static inline __m256i __lasx_xvdp2_h_bu(__m256i in_h, __m256i in_l) {
876 __m256i out;
877
878 out = __lasx_xvmulwev_h_bu(in_h, in_l);
879 out = __lasx_xvmaddwod_h_bu(out, in_h, in_l);
880 return out;
881 }
882
883 /*
884 * =============================================================================
885 * Description : Dot product of byte vector elements
886 * Arguments : Inputs - in_h, in_l
887 * Output - out
888 * Return Type - signed halfword
889 * Details : Signed byte elements from in_h are multiplied with
890 * signed byte elements from in_l producing a result
891 * twice the size of input i.e. signed halfword.
892 * Then these multiplication results of adjacent odd-even elements
893 * are added to the out vector
894 * Example : See out = __lasx_xvdp2_w_h(in_h, in_l)
895 * =============================================================================
896 */
__lasx_xvdp2_h_b(__m256i in_h,__m256i in_l)897 static inline __m256i __lasx_xvdp2_h_b(__m256i in_h, __m256i in_l) {
898 __m256i out;
899
900 out = __lasx_xvmulwev_h_b(in_h, in_l);
901 out = __lasx_xvmaddwod_h_b(out, in_h, in_l);
902 return out;
903 }
904
905 /*
906 * =============================================================================
907 * Description : Dot product of halfword vector elements
908 * Arguments : Inputs - in_h, in_l
909 * Output - out
910 * Return Type - signed word
911 * Details : Signed halfword elements from in_h are multiplied with
912 * signed halfword elements from in_l producing a result
913 * twice the size of input i.e. signed word.
914 * Then these multiplied results of adjacent odd-even elements
915 * are added to the out vector.
916 * Example : out = __lasx_xvdp2_w_h(in_h, in_l)
917 * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
918 * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
919 * out : 22,38,38,22, 22,38,38,22
920 * =============================================================================
921 */
__lasx_xvdp2_w_h(__m256i in_h,__m256i in_l)922 static inline __m256i __lasx_xvdp2_w_h(__m256i in_h, __m256i in_l) {
923 __m256i out;
924
925 out = __lasx_xvmulwev_w_h(in_h, in_l);
926 out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
927 return out;
928 }
929
930 /*
931 * =============================================================================
932 * Description : Dot product of word vector elements
933 * Arguments : Inputs - in_h, in_l
934 * Output - out
935 * Return Type - signed double
936 * Details : Signed word elements from in_h are multiplied with
937 * signed word elements from in_l producing a result
938 * twice the size of input i.e. signed double-word.
939 * Then these multiplied results of adjacent odd-even elements
940 * are added to the out vector.
941 * Example : See out = __lasx_xvdp2_w_h(in_h, in_l)
942 * =============================================================================
943 */
__lasx_xvdp2_d_w(__m256i in_h,__m256i in_l)944 static inline __m256i __lasx_xvdp2_d_w(__m256i in_h, __m256i in_l) {
945 __m256i out;
946
947 out = __lasx_xvmulwev_d_w(in_h, in_l);
948 out = __lasx_xvmaddwod_d_w(out, in_h, in_l);
949 return out;
950 }
951
952 /*
953 * =============================================================================
954 * Description : Dot product of halfword vector elements
955 * Arguments : Inputs - in_h, in_l
956 * Output - out
957 * Return Type - signed word
958 * Details : Unsigned halfword elements from in_h are multiplied with
959 * signed halfword elements from in_l producing a result
960 * twice the size of input i.e. unsigned word.
961 * Multiplication result of adjacent odd-even elements
962 * are added to the out vector
963 * Example : See out = __lasx_xvdp2_w_h(in_h, in_l)
964 * =============================================================================
965 */
__lasx_xvdp2_w_hu_h(__m256i in_h,__m256i in_l)966 static inline __m256i __lasx_xvdp2_w_hu_h(__m256i in_h, __m256i in_l) {
967 __m256i out;
968
969 out = __lasx_xvmulwev_w_hu_h(in_h, in_l);
970 out = __lasx_xvmaddwod_w_hu_h(out, in_h, in_l);
971 return out;
972 }
973
974 /*
975 * =============================================================================
976 * Description : Dot product & addition of byte vector elements
977 * Arguments : Inputs - in_h, in_l
978 * Output - out
979 * Return Type - halfword
980 * Details : Signed byte elements from in_h are multiplied with
981 * signed byte elements from in_l producing a result
982 * twice the size of input i.e. signed halfword.
983 * Then these multiplied results of adjacent odd-even elements
984 * are added to the in_c vector.
985 * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
986 * =============================================================================
987 */
__lasx_xvdp2add_h_b(__m256i in_c,__m256i in_h,__m256i in_l)988 static inline __m256i __lasx_xvdp2add_h_b(__m256i in_c, __m256i in_h,
989 __m256i in_l) {
990 __m256i out;
991
992 out = __lasx_xvmaddwev_h_b(in_c, in_h, in_l);
993 out = __lasx_xvmaddwod_h_b(out, in_h, in_l);
994 return out;
995 }
996
997 /*
998 * =============================================================================
999 * Description : Dot product & addition of byte vector elements
1000 * Arguments : Inputs - in_h, in_l
1001 * Output - out
1002 * Return Type - halfword
1003 * Details : Unsigned byte elements from in_h are multiplied with
1004 * unsigned byte elements from in_l producing a result
1005 * twice the size of input i.e. signed halfword.
1006 * Then these multiplied results of adjacent odd-even elements
1007 * are added to the in_c vector.
1008 * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
1009 * =============================================================================
1010 */
__lasx_xvdp2add_h_bu(__m256i in_c,__m256i in_h,__m256i in_l)1011 static inline __m256i __lasx_xvdp2add_h_bu(__m256i in_c, __m256i in_h,
1012 __m256i in_l) {
1013 __m256i out;
1014
1015 out = __lasx_xvmaddwev_h_bu(in_c, in_h, in_l);
1016 out = __lasx_xvmaddwod_h_bu(out, in_h, in_l);
1017 return out;
1018 }
1019
1020 /*
1021 * =============================================================================
1022 * Description : Dot product & addition of byte vector elements
1023 * Arguments : Inputs - in_h, in_l
1024 * Output - out
1025 * Return Type - halfword
1026 * Details : Unsigned byte elements from in_h are multiplied with
1027 * signed byte elements from in_l producing a result
1028 * twice the size of input i.e. signed halfword.
1029 * Then these multiplied results of adjacent odd-even elements
1030 * are added to the in_c vector.
1031 * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
1032 * =============================================================================
1033 */
__lasx_xvdp2add_h_bu_b(__m256i in_c,__m256i in_h,__m256i in_l)1034 static inline __m256i __lasx_xvdp2add_h_bu_b(__m256i in_c, __m256i in_h,
1035 __m256i in_l) {
1036 __m256i out;
1037
1038 out = __lasx_xvmaddwev_h_bu_b(in_c, in_h, in_l);
1039 out = __lasx_xvmaddwod_h_bu_b(out, in_h, in_l);
1040 return out;
1041 }
1042
1043 /*
1044 * =============================================================================
1045 * Description : Dot product of halfword vector elements
1046 * Arguments : Inputs - in_c, in_h, in_l
1047 * Output - out
1048 * Return Type - per RTYPE
1049 * Details : Signed halfword elements from in_h are multiplied with
1050 * signed halfword elements from in_l producing a result
1051 * twice the size of input i.e. signed word.
1052 * Multiplication result of adjacent odd-even elements
1053 * are added to the in_c vector.
1054 * Example : out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
1055 * in_c : 1,2,3,4, 1,2,3,4
1056 * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8,
1057 * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1,
1058 * out : 23,40,41,26, 23,40,41,26
1059 * =============================================================================
1060 */
__lasx_xvdp2add_w_h(__m256i in_c,__m256i in_h,__m256i in_l)1061 static inline __m256i __lasx_xvdp2add_w_h(__m256i in_c, __m256i in_h,
1062 __m256i in_l) {
1063 __m256i out;
1064
1065 out = __lasx_xvmaddwev_w_h(in_c, in_h, in_l);
1066 out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
1067 return out;
1068 }
1069
1070 /*
1071 * =============================================================================
1072 * Description : Dot product of halfword vector elements
1073 * Arguments : Inputs - in_c, in_h, in_l
1074 * Output - out
1075 * Return Type - signed word
1076 * Details : Unsigned halfword elements from in_h are multiplied with
1077 * unsigned halfword elements from in_l producing a result
1078 * twice the size of input i.e. signed word.
1079 * Multiplication result of adjacent odd-even elements
1080 * are added to the in_c vector.
1081 * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
1082 * =============================================================================
1083 */
__lasx_xvdp2add_w_hu(__m256i in_c,__m256i in_h,__m256i in_l)1084 static inline __m256i __lasx_xvdp2add_w_hu(__m256i in_c, __m256i in_h,
1085 __m256i in_l) {
1086 __m256i out;
1087
1088 out = __lasx_xvmaddwev_w_hu(in_c, in_h, in_l);
1089 out = __lasx_xvmaddwod_w_hu(out, in_h, in_l);
1090 return out;
1091 }
1092
1093 /*
1094 * =============================================================================
1095 * Description : Dot product of halfword vector elements
1096 * Arguments : Inputs - in_c, in_h, in_l
1097 * Output - out
1098 * Return Type - signed word
1099 * Details : Unsigned halfword elements from in_h are multiplied with
1100 * signed halfword elements from in_l producing a result
1101 * twice the size of input i.e. signed word.
1102 * Multiplication result of adjacent odd-even elements
1103 * are added to the in_c vector
1104 * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
1105 * =============================================================================
1106 */
__lasx_xvdp2add_w_hu_h(__m256i in_c,__m256i in_h,__m256i in_l)1107 static inline __m256i __lasx_xvdp2add_w_hu_h(__m256i in_c, __m256i in_h,
1108 __m256i in_l) {
1109 __m256i out;
1110
1111 out = __lasx_xvmaddwev_w_hu_h(in_c, in_h, in_l);
1112 out = __lasx_xvmaddwod_w_hu_h(out, in_h, in_l);
1113 return out;
1114 }
1115
1116 /*
1117 * =============================================================================
1118 * Description : Vector Unsigned Dot Product and Subtract
1119 * Arguments : Inputs - in_c, in_h, in_l
1120 * Output - out
1121 * Return Type - signed halfword
1122 * Details : Unsigned byte elements from in_h are multiplied with
1123 * unsigned byte elements from in_l producing a result
1124 * twice the size of input i.e. signed halfword.
1125 * Multiplication result of adjacent odd-even elements
1126 * are added together and subtracted from double width elements
1127 * in_c vector.
1128 * Example : See out = __lasx_xvdp2sub_w_h(in_c, in_h, in_l)
1129 * =============================================================================
1130 */
__lasx_xvdp2sub_h_bu(__m256i in_c,__m256i in_h,__m256i in_l)1131 static inline __m256i __lasx_xvdp2sub_h_bu(__m256i in_c, __m256i in_h,
1132 __m256i in_l) {
1133 __m256i out;
1134
1135 out = __lasx_xvmulwev_h_bu(in_h, in_l);
1136 out = __lasx_xvmaddwod_h_bu(out, in_h, in_l);
1137 out = __lasx_xvsub_h(in_c, out);
1138 return out;
1139 }
1140
1141 /*
1142 * =============================================================================
1143 * Description : Vector Signed Dot Product and Subtract
1144 * Arguments : Inputs - in_c, in_h, in_l
1145 * Output - out
1146 * Return Type - signed word
1147 * Details : Signed halfword elements from in_h are multiplied with
1148 * Signed halfword elements from in_l producing a result
1149 * twice the size of input i.e. signed word.
1150 * Multiplication result of adjacent odd-even elements
1151 * are added together and subtracted from double width elements
1152 * in_c vector.
1153 * Example : out = __lasx_xvdp2sub_w_h(in_c, in_h, in_l)
1154 * in_c : 0,0,0,0, 0,0,0,0
1155 * in_h : 3,1,3,0, 0,0,0,1, 0,0,1,1, 0,0,0,1
1156 * in_l : 2,1,1,0, 1,0,0,0, 0,0,1,0, 1,0,0,1
1157 * out : -7,-3,0,0, 0,-1,0,-1
1158 * =============================================================================
1159 */
__lasx_xvdp2sub_w_h(__m256i in_c,__m256i in_h,__m256i in_l)1160 static inline __m256i __lasx_xvdp2sub_w_h(__m256i in_c, __m256i in_h,
1161 __m256i in_l) {
1162 __m256i out;
1163
1164 out = __lasx_xvmulwev_w_h(in_h, in_l);
1165 out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
1166 out = __lasx_xvsub_w(in_c, out);
1167 return out;
1168 }
1169
1170 /*
1171 * =============================================================================
1172 * Description : Dot product of halfword vector elements
1173 * Arguments : Inputs - in_h, in_l
1174 * Output - out
1175 * Return Type - signed word
1176 * Details : Signed halfword elements from in_h are multiplied with
1177 * signed halfword elements from in_l producing a result
1178 * four times the size of input i.e. signed doubleword.
1179 * Then these multiplication results of four adjacent elements
1180 * are added together and stored to the out vector.
1181 * Example : out = __lasx_xvdp4_d_h(in_h, in_l)
1182 * in_h : 3,1,3,0, 0,0,0,1, 0,0,1,-1, 0,0,0,1
1183 * in_l : -2,1,1,0, 1,0,0,0, 0,0,1, 0, 1,0,0,1
1184 * out : -2,0,1,1
1185 * =============================================================================
1186 */
__lasx_xvdp4_d_h(__m256i in_h,__m256i in_l)1187 static inline __m256i __lasx_xvdp4_d_h(__m256i in_h, __m256i in_l) {
1188 __m256i out;
1189
1190 out = __lasx_xvmulwev_w_h(in_h, in_l);
1191 out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
1192 out = __lasx_xvhaddw_d_w(out, out);
1193 return out;
1194 }
1195
1196 /*
1197 * =============================================================================
1198 * Description : The high half of the vector elements are expanded and
1199 * added after being doubled.
1200 * Arguments : Inputs - in_h, in_l
1201 * Output - out
1202 * Details : The in_h vector and the in_l vector are added after the
1203 * higher half of the two-fold sign extension (signed byte
1204 * to signed halfword) and stored to the out vector.
1205 * Example : See out = __lasx_xvaddwh_w_h(in_h, in_l)
1206 * =============================================================================
1207 */
__lasx_xvaddwh_h_b(__m256i in_h,__m256i in_l)1208 static inline __m256i __lasx_xvaddwh_h_b(__m256i in_h, __m256i in_l) {
1209 __m256i out;
1210
1211 out = __lasx_xvilvh_b(in_h, in_l);
1212 out = __lasx_xvhaddw_h_b(out, out);
1213 return out;
1214 }
1215
1216 /*
1217 * =============================================================================
1218 * Description : The high half of the vector elements are expanded and
1219 * added after being doubled.
1220 * Arguments : Inputs - in_h, in_l
1221 * Output - out
1222 * Details : The in_h vector and the in_l vector are added after the
1223 * higher half of the two-fold sign extension (signed halfword
1224 * to signed word) and stored to the out vector.
1225 * Example : out = __lasx_xvaddwh_w_h(in_h, in_l)
1226 * in_h : 3, 0,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1
1227 * in_l : 2,-1,1,2, 1,0,0, 0, 1,0,1, 0, 1,0,0,1
1228 * out : 1,0,0,-1, 1,0,0, 2
1229 * =============================================================================
1230 */
__lasx_xvaddwh_w_h(__m256i in_h,__m256i in_l)1231 static inline __m256i __lasx_xvaddwh_w_h(__m256i in_h, __m256i in_l) {
1232 __m256i out;
1233
1234 out = __lasx_xvilvh_h(in_h, in_l);
1235 out = __lasx_xvhaddw_w_h(out, out);
1236 return out;
1237 }
1238
1239 /*
1240 * =============================================================================
1241 * Description : The low half of the vector elements are expanded and
1242 * added after being doubled.
1243 * Arguments : Inputs - in_h, in_l
1244 * Output - out
1245 * Details : The in_h vector and the in_l vector are added after the
1246 * lower half of the two-fold sign extension (signed byte
1247 * to signed halfword) and stored to the out vector.
1248 * Example : See out = __lasx_xvaddwl_w_h(in_h, in_l)
1249 * =============================================================================
1250 */
__lasx_xvaddwl_h_b(__m256i in_h,__m256i in_l)1251 static inline __m256i __lasx_xvaddwl_h_b(__m256i in_h, __m256i in_l) {
1252 __m256i out;
1253
1254 out = __lasx_xvilvl_b(in_h, in_l);
1255 out = __lasx_xvhaddw_h_b(out, out);
1256 return out;
1257 }
1258
1259 /*
1260 * =============================================================================
1261 * Description : The low half of the vector elements are expanded and
1262 * added after being doubled.
1263 * Arguments : Inputs - in_h, in_l
1264 * Output - out
1265 * Details : The in_h vector and the in_l vector are added after the
1266 * lower half of the two-fold sign extension (signed halfword
1267 * to signed word) and stored to the out vector.
1268 * Example : out = __lasx_xvaddwl_w_h(in_h, in_l)
1269 * in_h : 3, 0,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1
1270 * in_l : 2,-1,1,2, 1,0,0, 0, 1,0,1, 0, 1,0,0,1
1271 * out : 5,-1,4,2, 1,0,2,-1
1272 * =============================================================================
1273 */
__lasx_xvaddwl_w_h(__m256i in_h,__m256i in_l)1274 static inline __m256i __lasx_xvaddwl_w_h(__m256i in_h, __m256i in_l) {
1275 __m256i out;
1276
1277 out = __lasx_xvilvl_h(in_h, in_l);
1278 out = __lasx_xvhaddw_w_h(out, out);
1279 return out;
1280 }
1281
1282 /*
1283 * =============================================================================
1284 * Description : The low half of the vector elements are expanded and
1285 * added after being doubled.
1286 * Arguments : Inputs - in_h, in_l
1287 * Output - out
1288 * Details : The out vector and the out vector are added after the
1289 * lower half of the two-fold zero extension (unsigned byte
1290 * to unsigned halfword) and stored to the out vector.
1291 * Example : See out = __lasx_xvaddwl_w_h(in_h, in_l)
1292 * =============================================================================
1293 */
__lasx_xvaddwl_h_bu(__m256i in_h,__m256i in_l)1294 static inline __m256i __lasx_xvaddwl_h_bu(__m256i in_h, __m256i in_l) {
1295 __m256i out;
1296
1297 out = __lasx_xvilvl_b(in_h, in_l);
1298 out = __lasx_xvhaddw_hu_bu(out, out);
1299 return out;
1300 }
1301
1302 /*
1303 * =============================================================================
1304 * Description : The low half of the vector elements are expanded and
1305 * added after being doubled.
1306 * Arguments : Inputs - in_h, in_l
1307 * Output - out
1308 * Details : The in_l vector after double zero extension (unsigned byte to
1309 * signed halfword),added to the in_h vector.
1310 * Example : See out = __lasx_xvaddw_w_w_h(in_h, in_l)
1311 * =============================================================================
1312 */
__lasx_xvaddw_h_h_bu(__m256i in_h,__m256i in_l)1313 static inline __m256i __lasx_xvaddw_h_h_bu(__m256i in_h, __m256i in_l) {
1314 __m256i out;
1315
1316 out = __lasx_xvsllwil_hu_bu(in_l, 0);
1317 out = __lasx_xvadd_h(in_h, out);
1318 return out;
1319 }
1320
1321 /*
1322 * =============================================================================
1323 * Description : The low half of the vector elements are expanded and
1324 * added after being doubled.
1325 * Arguments : Inputs - in_h, in_l
1326 * Output - out
1327 * Details : The in_l vector after double sign extension (signed halfword to
1328 * signed word), added to the in_h vector.
1329 * Example : out = __lasx_xvaddw_w_w_h(in_h, in_l)
1330 * in_h : 0, 1,0,0, -1,0,0,1,
1331 * in_l : 2,-1,1,2, 1,0,0,0, 0,0,1,0, 1,0,0,1,
1332 * out : 2, 0,1,2, -1,0,1,1,
1333 * =============================================================================
1334 */
__lasx_xvaddw_w_w_h(__m256i in_h,__m256i in_l)1335 static inline __m256i __lasx_xvaddw_w_w_h(__m256i in_h, __m256i in_l) {
1336 __m256i out;
1337
1338 out = __lasx_xvsllwil_w_h(in_l, 0);
1339 out = __lasx_xvadd_w(in_h, out);
1340 return out;
1341 }
1342
1343 /*
1344 * =============================================================================
1345 * Description : Multiplication and addition calculation after expansion
1346 * of the lower half of the vector.
1347 * Arguments : Inputs - in_c, in_h, in_l
1348 * Output - out
1349 * Details : The in_h vector and the in_l vector are multiplied after
1350 * the lower half of the two-fold sign extension (signed halfword
1351 * to signed word), and the result is added to the vector in_c,
1352 * then stored to the out vector.
1353 * Example : out = __lasx_xvmaddwl_w_h(in_c, in_h, in_l)
1354 * in_c : 1,2,3,4, 5,6,7,8
1355 * in_h : 1,2,3,4, 1,2,3,4, 5,6,7,8, 5,6,7,8
1356 * in_l : 200, 300, 400, 500, 2000, 3000, 4000, 5000,
1357 * -200,-300,-400,-500, -2000,-3000,-4000,-5000
1358 * out : 201, 602,1203,2004, -995, -1794,-2793,-3992
1359 * =============================================================================
1360 */
__lasx_xvmaddwl_w_h(__m256i in_c,__m256i in_h,__m256i in_l)1361 static inline __m256i __lasx_xvmaddwl_w_h(__m256i in_c, __m256i in_h,
1362 __m256i in_l) {
1363 __m256i tmp0, tmp1, out;
1364
1365 tmp0 = __lasx_xvsllwil_w_h(in_h, 0);
1366 tmp1 = __lasx_xvsllwil_w_h(in_l, 0);
1367 tmp0 = __lasx_xvmul_w(tmp0, tmp1);
1368 out = __lasx_xvadd_w(tmp0, in_c);
1369 return out;
1370 }
1371
1372 /*
1373 * =============================================================================
1374 * Description : Multiplication and addition calculation after expansion
1375 * of the higher half of the vector.
1376 * Arguments : Inputs - in_c, in_h, in_l
1377 * Output - out
1378 * Details : The in_h vector and the in_l vector are multiplied after
1379 * the higher half of the two-fold sign extension (signed
1380 * halfword to signed word), and the result is added to
1381 * the vector in_c, then stored to the out vector.
1382 * Example : See out = __lasx_xvmaddwl_w_h(in_c, in_h, in_l)
1383 * =============================================================================
1384 */
__lasx_xvmaddwh_w_h(__m256i in_c,__m256i in_h,__m256i in_l)1385 static inline __m256i __lasx_xvmaddwh_w_h(__m256i in_c, __m256i in_h,
1386 __m256i in_l) {
1387 __m256i tmp0, tmp1, out;
1388
1389 tmp0 = __lasx_xvilvh_h(in_h, in_h);
1390 tmp1 = __lasx_xvilvh_h(in_l, in_l);
1391 tmp0 = __lasx_xvmulwev_w_h(tmp0, tmp1);
1392 out = __lasx_xvadd_w(tmp0, in_c);
1393 return out;
1394 }
1395
1396 /*
1397 * =============================================================================
1398 * Description : Multiplication calculation after expansion of the lower
1399 * half of the vector.
1400 * Arguments : Inputs - in_h, in_l
1401 * Output - out
1402 * Details : The in_h vector and the in_l vector are multiplied after
1403 * the lower half of the two-fold sign extension (signed
1404 * halfword to signed word), then stored to the out vector.
1405 * Example : out = __lasx_xvmulwl_w_h(in_h, in_l)
1406 * in_h : 3,-1,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1
1407 * in_l : 2,-1,1,2, 1,0,0, 0, 0,0,1, 0, 1,0,0,1
1408 * out : 6,1,3,0, 0,0,1,0
1409 * =============================================================================
1410 */
__lasx_xvmulwl_w_h(__m256i in_h,__m256i in_l)1411 static inline __m256i __lasx_xvmulwl_w_h(__m256i in_h, __m256i in_l) {
1412 __m256i tmp0, tmp1, out;
1413
1414 tmp0 = __lasx_xvsllwil_w_h(in_h, 0);
1415 tmp1 = __lasx_xvsllwil_w_h(in_l, 0);
1416 out = __lasx_xvmul_w(tmp0, tmp1);
1417 return out;
1418 }
1419
1420 /*
1421 * =============================================================================
1422 * Description : Multiplication calculation after expansion of the lower
1423 * half of the vector.
1424 * Arguments : Inputs - in_h, in_l
1425 * Output - out
1426 * Details : The in_h vector and the in_l vector are multiplied after
1427 * the lower half of the two-fold sign extension (signed
1428 * halfword to signed word), then stored to the out vector.
1429 * Example : out = __lasx_xvmulwh_w_h(in_h, in_l)
1430 * in_h : 3,-1,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1
1431 * in_l : 2,-1,1,2, 1,0,0, 0, 0,0,1, 0, 1,0,0,1
1432 * out : 0,0,0,0, 0,0,0,1
1433 * =============================================================================
1434 */
__lasx_xvmulwh_w_h(__m256i in_h,__m256i in_l)1435 static inline __m256i __lasx_xvmulwh_w_h(__m256i in_h, __m256i in_l) {
1436 __m256i tmp0, tmp1, out;
1437
1438 tmp0 = __lasx_xvilvh_h(in_h, in_h);
1439 tmp1 = __lasx_xvilvh_h(in_l, in_l);
1440 out = __lasx_xvmulwev_w_h(tmp0, tmp1);
1441 return out;
1442 }
1443
1444 /*
1445 * =============================================================================
1446 * Description : The low half of the vector elements are added to the high half
1447 * after being doubled, then saturated.
1448 * Arguments : Inputs - in_h, in_l
1449 * Output - out
1450 * Details : The in_h vector adds the in_l vector after the lower half of
1451 * the two-fold zero extension (unsigned byte to unsigned
1452 * halfword) and then saturated. The results are stored to the out
1453 * vector.
1454 * Example : out = __lasx_xvsaddw_hu_hu_bu(in_h, in_l)
1455 * in_h : 2,65532,1,2, 1,0,0,0, 0,0,1,0, 1,0,0,1
1456 * in_l : 3,6,3,0, 0,0,0,1, 0,0,1,1, 0,0,0,1, 3,18,3,0, 0,0,0,1, 0,0,1,1,
1457 * 0,0,0,1
1458 * out : 5,65535,4,2, 1,0,0,1, 3,18,4,0, 1,0,0,2,
1459 * =============================================================================
1460 */
__lasx_xvsaddw_hu_hu_bu(__m256i in_h,__m256i in_l)1461 static inline __m256i __lasx_xvsaddw_hu_hu_bu(__m256i in_h, __m256i in_l) {
1462 __m256i tmp1, out;
1463 __m256i zero = { 0 };
1464
1465 tmp1 = __lasx_xvilvl_b(zero, in_l);
1466 out = __lasx_xvsadd_hu(in_h, tmp1);
1467 return out;
1468 }
1469
1470 /*
1471 * =============================================================================
1472 * Description : Clip all halfword elements of input vector between min & max
1473 * out = ((in) < (min)) ? (min) : (((in) > (max)) ? (max) : (in))
1474 * Arguments : Inputs - in (input vector)
1475 * - min (min threshold)
1476 * - max (max threshold)
1477 * Outputs - in (output vector with clipped elements)
1478 * Return Type - signed halfword
1479 * Example : out = __lasx_xvclip_h(in, min, max)
1480 * in : -8,2,280,249, -8,255,280,249, 4,4,4,4, 5,5,5,5
1481 * min : 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1
1482 * max : 9,9,9,9, 9,9,9,9, 9,9,9,9, 9,9,9,9
1483 * out : 1,2,9,9, 1,9,9,9, 4,4,4,4, 5,5,5,5
1484 * =============================================================================
1485 */
__lasx_xvclip_h(__m256i in,__m256i min,__m256i max)1486 static inline __m256i __lasx_xvclip_h(__m256i in, __m256i min, __m256i max) {
1487 __m256i out;
1488
1489 out = __lasx_xvmax_h(min, in);
1490 out = __lasx_xvmin_h(max, out);
1491 return out;
1492 }
1493
1494 /*
1495 * =============================================================================
1496 * Description : Clip all signed halfword elements of input vector
1497 * between 0 & 255
1498 * Arguments : Inputs - in (input vector)
1499 * Outputs - out (output vector with clipped elements)
1500 * Return Type - signed halfword
1501 * Example : See out = __lasx_xvclip255_w(in)
1502 * =============================================================================
1503 */
__lasx_xvclip255_h(__m256i in)1504 static inline __m256i __lasx_xvclip255_h(__m256i in) {
1505 __m256i out;
1506
1507 out = __lasx_xvmaxi_h(in, 0);
1508 out = __lasx_xvsat_hu(out, 7);
1509 return out;
1510 }
1511
1512 /*
1513 * =============================================================================
1514 * Description : Clip all signed word elements of input vector
1515 * between 0 & 255
1516 * Arguments : Inputs - in (input vector)
1517 * Output - out (output vector with clipped elements)
1518 * Return Type - signed word
1519 * Example : out = __lasx_xvclip255_w(in)
1520 * in : -8,255,280,249, -8,255,280,249
1521 * out : 0,255,255,249, 0,255,255,249
1522 * =============================================================================
1523 */
__lasx_xvclip255_w(__m256i in)1524 static inline __m256i __lasx_xvclip255_w(__m256i in) {
1525 __m256i out;
1526
1527 out = __lasx_xvmaxi_w(in, 0);
1528 out = __lasx_xvsat_wu(out, 7);
1529 return out;
1530 }
1531
1532 /*
1533 * =============================================================================
1534 * Description : Indexed halfword element values are replicated to all
1535 * elements in output vector. If 'idx < 8' use xvsplati_l_*,
1536 * if 'idx >= 8' use xvsplati_h_*.
1537 * Arguments : Inputs - in, idx
1538 * Output - out
1539 * Details : Idx element value from in vector is replicated to all
1540 * elements in out vector.
1541 * Valid index range for halfword operation is 0-7
1542 * Example : out = __lasx_xvsplati_l_h(in, idx)
1543 * in : 20,10,11,12, 13,14,15,16, 0,0,2,0, 0,0,0,0
1544 * idx : 0x02
1545 * out : 11,11,11,11, 11,11,11,11, 11,11,11,11, 11,11,11,11
1546 * =============================================================================
1547 */
__lasx_xvsplati_l_h(__m256i in,int idx)1548 static inline __m256i __lasx_xvsplati_l_h(__m256i in, int idx) {
1549 __m256i out;
1550
1551 out = __lasx_xvpermi_q(in, in, 0x02);
1552 out = __lasx_xvreplve_h(out, idx);
1553 return out;
1554 }
1555
1556 /*
1557 * =============================================================================
1558 * Description : Indexed halfword element values are replicated to all
1559 * elements in output vector. If 'idx < 8' use xvsplati_l_*,
1560 * if 'idx >= 8' use xvsplati_h_*.
1561 * Arguments : Inputs - in, idx
1562 * Output - out
1563 * Details : Idx element value from in vector is replicated to all
1564 * elements in out vector.
1565 * Valid index range for halfword operation is 0-7
1566 * Example : out = __lasx_xvsplati_h_h(in, idx)
1567 * in : 20,10,11,12, 13,14,15,16, 0,2,0,0, 0,0,0,0
1568 * idx : 0x09
1569 * out : 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2
1570 * =============================================================================
1571 */
__lasx_xvsplati_h_h(__m256i in,int idx)1572 static inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx) {
1573 __m256i out;
1574
1575 out = __lasx_xvpermi_q(in, in, 0x13);
1576 out = __lasx_xvreplve_h(out, idx);
1577 return out;
1578 }
1579
/*
 * =============================================================================
 * Description : Transpose a 4x4 block of double-word elements held in four
 *               vectors.
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3
 *               Outputs - _out0, _out1, _out2, _out3
 * Details     : Rows are interleaved pairwise at double-word granularity
 *               (_in0/_in1 and _in2/_in3); the 128-bit halves of those
 *               intermediates are then recombined with __lasx_xvpermi_q.
 *               All inputs are read before the first output is written.
 * Example     : LASX_TRANSPOSE4x4_D
 *        _in0 : 1,2,3,4
 *        _in1 : 1,2,3,4
 *        _in2 : 1,2,3,4
 *        _in3 : 1,2,3,4
 *
 *       _out0 : 1,1,1,1
 *       _out1 : 2,2,2,2
 *       _out2 : 3,3,3,3
 *       _out3 : 4,4,4,4
 * =============================================================================
 */
#define LASX_TRANSPOSE4x4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, \
                            _out3) \
  { \
    __m256i _lo01_m = __lasx_xvilvl_d(_in1, _in0); \
    __m256i _hi01_m = __lasx_xvilvh_d(_in1, _in0); \
    __m256i _lo23_m = __lasx_xvilvl_d(_in3, _in2); \
    __m256i _hi23_m = __lasx_xvilvh_d(_in3, _in2); \
    \
    _out0 = __lasx_xvpermi_q(_lo23_m, _lo01_m, 0x20); \
    _out1 = __lasx_xvpermi_q(_hi23_m, _hi01_m, 0x20); \
    _out2 = __lasx_xvpermi_q(_lo23_m, _lo01_m, 0x31); \
    _out3 = __lasx_xvpermi_q(_hi23_m, _hi01_m, 0x31); \
  }
1610
/*
 * =============================================================================
 * Description : Transpose an 8x8 block of word elements held in eight vectors.
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
 *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
 *                         _out7
 * Details     : Two rounds of word interleaves are applied to rows 0-3 and to
 *               rows 4-7; __lasx_xvpermi_q then joins the matching 128-bit
 *               halves of both groups (0x20 for _out0.._out3, 0x31 for
 *               _out4.._out7). All inputs are read before the first output
 *               is written.
 * Example     : LASX_TRANSPOSE8x8_W
 *        _in0 : 1,2,3,4,5,6,7,8
 *        _in1 : 2,2,3,4,5,6,7,8
 *        _in2 : 3,2,3,4,5,6,7,8
 *        _in3 : 4,2,3,4,5,6,7,8
 *        _in4 : 5,2,3,4,5,6,7,8
 *        _in5 : 6,2,3,4,5,6,7,8
 *        _in6 : 7,2,3,4,5,6,7,8
 *        _in7 : 8,2,3,4,5,6,7,8
 *
 *       _out0 : 1,2,3,4,5,6,7,8
 *       _out1 : 2,2,2,2,2,2,2,2
 *       _out2 : 3,3,3,3,3,3,3,3
 *       _out3 : 4,4,4,4,4,4,4,4
 *       _out4 : 5,5,5,5,5,5,5,5
 *       _out5 : 6,6,6,6,6,6,6,6
 *       _out6 : 7,7,7,7,7,7,7,7
 *       _out7 : 8,8,8,8,8,8,8,8
 * =============================================================================
 */
#define LASX_TRANSPOSE8x8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                            _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                            _out7) \
  { \
    __m256i _lo02_m = __lasx_xvilvl_w(_in2, _in0); \
    __m256i _lo13_m = __lasx_xvilvl_w(_in3, _in1); \
    __m256i _hi02_m = __lasx_xvilvh_w(_in2, _in0); \
    __m256i _hi13_m = __lasx_xvilvh_w(_in3, _in1); \
    __m256i _lo46_m = __lasx_xvilvl_w(_in6, _in4); \
    __m256i _lo57_m = __lasx_xvilvl_w(_in7, _in5); \
    __m256i _hi46_m = __lasx_xvilvh_w(_in6, _in4); \
    __m256i _hi57_m = __lasx_xvilvh_w(_in7, _in5); \
    \
    __m256i _top0_m = __lasx_xvilvl_w(_lo13_m, _lo02_m); \
    __m256i _top1_m = __lasx_xvilvh_w(_lo13_m, _lo02_m); \
    __m256i _top2_m = __lasx_xvilvl_w(_hi13_m, _hi02_m); \
    __m256i _top3_m = __lasx_xvilvh_w(_hi13_m, _hi02_m); \
    __m256i _bot0_m = __lasx_xvilvl_w(_lo57_m, _lo46_m); \
    __m256i _bot1_m = __lasx_xvilvh_w(_lo57_m, _lo46_m); \
    __m256i _bot2_m = __lasx_xvilvl_w(_hi57_m, _hi46_m); \
    __m256i _bot3_m = __lasx_xvilvh_w(_hi57_m, _hi46_m); \
    \
    _out0 = __lasx_xvpermi_q(_bot0_m, _top0_m, 0x20); \
    _out4 = __lasx_xvpermi_q(_bot0_m, _top0_m, 0x31); \
    _out1 = __lasx_xvpermi_q(_bot1_m, _top1_m, 0x20); \
    _out5 = __lasx_xvpermi_q(_bot1_m, _top1_m, 0x31); \
    _out2 = __lasx_xvpermi_q(_bot2_m, _top2_m, 0x20); \
    _out6 = __lasx_xvpermi_q(_bot2_m, _top2_m, 0x31); \
    _out3 = __lasx_xvpermi_q(_bot3_m, _top3_m, 0x20); \
    _out7 = __lasx_xvpermi_q(_bot3_m, _top3_m, 0x31); \
  }
1670
/*
 * =============================================================================
 * Description : Transpose input 16x8 byte block
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,
 *                         _in8, _in9, _in10, _in11, _in12, _in13, _in14, _in15
 *                         (input 16x8 byte block)
 *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
 *                         _out7 (output 8x16 byte block)
 * Details     : The rows of the matrix become columns, and the columns become
 *               rows. Three interleave rounds are used: bytes of row pairs,
 *               bytes again to gather groups of four rows, words to gather
 *               groups of eight rows, and finally double-words to join rows
 *               0-7 with rows 8-15. All inputs are read before the first
 *               output is written.
 * Example     : See LASX_TRANSPOSE16x8_H
 * =============================================================================
 */
#define LASX_TRANSPOSE16x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                             _in8, _in9, _in10, _in11, _in12, _in13, _in14, \
                             _in15, _out0, _out1, _out2, _out3, _out4, _out5, \
                             _out6, _out7) \
  { \
    __m256i _p0_m = __lasx_xvilvl_b(_in2, _in0); \
    __m256i _p1_m = __lasx_xvilvl_b(_in3, _in1); \
    __m256i _p2_m = __lasx_xvilvl_b(_in6, _in4); \
    __m256i _p3_m = __lasx_xvilvl_b(_in7, _in5); \
    __m256i _p4_m = __lasx_xvilvl_b(_in10, _in8); \
    __m256i _p5_m = __lasx_xvilvl_b(_in11, _in9); \
    __m256i _p6_m = __lasx_xvilvl_b(_in14, _in12); \
    __m256i _p7_m = __lasx_xvilvl_b(_in15, _in13); \
    \
    __m256i _q0_m = __lasx_xvilvl_b(_p1_m, _p0_m); \
    __m256i _q1_m = __lasx_xvilvh_b(_p1_m, _p0_m); \
    __m256i _q2_m = __lasx_xvilvl_b(_p3_m, _p2_m); \
    __m256i _q3_m = __lasx_xvilvh_b(_p3_m, _p2_m); \
    __m256i _q4_m = __lasx_xvilvl_b(_p5_m, _p4_m); \
    __m256i _q5_m = __lasx_xvilvh_b(_p5_m, _p4_m); \
    __m256i _q6_m = __lasx_xvilvl_b(_p7_m, _p6_m); \
    __m256i _q7_m = __lasx_xvilvh_b(_p7_m, _p6_m); \
    \
    __m256i _a01_m = __lasx_xvilvl_w(_q2_m, _q0_m); \
    __m256i _a23_m = __lasx_xvilvh_w(_q2_m, _q0_m); \
    __m256i _a45_m = __lasx_xvilvl_w(_q3_m, _q1_m); \
    __m256i _a67_m = __lasx_xvilvh_w(_q3_m, _q1_m); \
    __m256i _b01_m = __lasx_xvilvl_w(_q6_m, _q4_m); \
    __m256i _b23_m = __lasx_xvilvh_w(_q6_m, _q4_m); \
    __m256i _b45_m = __lasx_xvilvl_w(_q7_m, _q5_m); \
    __m256i _b67_m = __lasx_xvilvh_w(_q7_m, _q5_m); \
    \
    _out0 = __lasx_xvilvl_d(_b01_m, _a01_m); \
    _out1 = __lasx_xvilvh_d(_b01_m, _a01_m); \
    _out2 = __lasx_xvilvl_d(_b23_m, _a23_m); \
    _out3 = __lasx_xvilvh_d(_b23_m, _a23_m); \
    _out4 = __lasx_xvilvl_d(_b45_m, _a45_m); \
    _out5 = __lasx_xvilvh_d(_b45_m, _a45_m); \
    _out6 = __lasx_xvilvl_d(_b67_m, _a67_m); \
    _out7 = __lasx_xvilvh_d(_b67_m, _a67_m); \
  }
1725
/*
 * =============================================================================
 * Description : Transpose input 16x8 halfword block
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,
 *                         _in8, _in9, _in10, _in11, _in12, _in13, _in14, _in15
 *                         (input 16x8 halfword block)
 *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
 *                         _out7 (output 8x16 halfword block)
 * Details     : The rows of the matrix become columns, and the columns become
 *               rows.
 *               Every input is read before any output is written, so the
 *               outputs may name the same variables as the inputs (in-place
 *               use). Each input argument is evaluated twice.
 * Example     : LASX_TRANSPOSE16x8_H
 *        _in0 : 1,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *        _in1 : 2,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *        _in2 : 3,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *        _in3 : 4,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *        _in4 : 5,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *        _in5 : 6,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *        _in6 : 7,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *        _in7 : 8,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *        _in8 : 9,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *        _in9 : 1,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *       _in10 : 0,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *       _in11 : 2,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *       _in12 : 3,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *       _in13 : 7,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *       _in14 : 5,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *       _in15 : 6,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *
 *       _out0 : 1,2,3,4,5,6,7,8,9,1,0,2,3,7,5,6
 *       _out1 : 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
 *       _out2 : 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3
 *       _out3 : 4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4
 *       _out4 : 5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5
 *       _out5 : 6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6
 *       _out6 : 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7
 *       _out7 : 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8
 * =============================================================================
 */

/*
 * Helper for LASX_TRANSPOSE16x8_H: produces four of the eight output rows.
 * '_ilv' is the halfword interleave applied to the input row pairs:
 * __lasx_xvilvl_h yields the rows stored to _out0.._out3 and __lasx_xvilvh_h
 * the rows stored to _out4.._out7. '_o0'.._o3' receive the results.
 */
#define LASX_TRANSPOSE16x8_H_HALF(_ilv, _in0, _in1, _in2, _in3, _in4, _in5, \
                                  _in6, _in7, _in8, _in9, _in10, _in11, \
                                  _in12, _in13, _in14, _in15, _o0, _o1, _o2, \
                                  _o3) \
  { \
    __m256i _p0_m, _p1_m, _p2_m, _p3_m, _p4_m, _p5_m, _p6_m, _p7_m; \
    __m256i _q0_m, _q1_m, _q2_m, _q3_m, _q4_m, _q5_m, _q6_m, _q7_m; \
    \
    _p0_m = _ilv(_in2, _in0); \
    _p1_m = _ilv(_in3, _in1); \
    _p2_m = _ilv(_in6, _in4); \
    _p3_m = _ilv(_in7, _in5); \
    _p4_m = _ilv(_in10, _in8); \
    _p5_m = _ilv(_in11, _in9); \
    _p6_m = _ilv(_in14, _in12); \
    _p7_m = _ilv(_in15, _in13); \
    _q0_m = __lasx_xvilvl_h(_p1_m, _p0_m); \
    _q1_m = __lasx_xvilvh_h(_p1_m, _p0_m); \
    _q2_m = __lasx_xvilvl_h(_p3_m, _p2_m); \
    _q3_m = __lasx_xvilvh_h(_p3_m, _p2_m); \
    _q4_m = __lasx_xvilvl_h(_p5_m, _p4_m); \
    _q5_m = __lasx_xvilvh_h(_p5_m, _p4_m); \
    _q6_m = __lasx_xvilvl_h(_p7_m, _p6_m); \
    _q7_m = __lasx_xvilvh_h(_p7_m, _p6_m); \
    _p0_m = __lasx_xvilvl_d(_q2_m, _q0_m); \
    _p2_m = __lasx_xvilvh_d(_q2_m, _q0_m); \
    _p4_m = __lasx_xvilvl_d(_q3_m, _q1_m); \
    _p6_m = __lasx_xvilvh_d(_q3_m, _q1_m); \
    _p1_m = __lasx_xvilvl_d(_q6_m, _q4_m); \
    _p3_m = __lasx_xvilvh_d(_q6_m, _q4_m); \
    _p5_m = __lasx_xvilvl_d(_q7_m, _q5_m); \
    _p7_m = __lasx_xvilvh_d(_q7_m, _q5_m); \
    _o0 = __lasx_xvpermi_q(_p1_m, _p0_m, 0x20); \
    _o1 = __lasx_xvpermi_q(_p3_m, _p2_m, 0x20); \
    _o2 = __lasx_xvpermi_q(_p5_m, _p4_m, 0x20); \
    _o3 = __lasx_xvpermi_q(_p7_m, _p6_m, 0x20); \
  }

#define LASX_TRANSPOSE16x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                             _in8, _in9, _in10, _in11, _in12, _in13, _in14, \
                             _in15, _out0, _out1, _out2, _out3, _out4, _out5, \
                             _out6, _out7) \
  { \
    /* Results are staged in locals so no output is stored while an input */ \
    /* still has to be read.                                              */ \
    __m256i _r0_m, _r1_m, _r2_m, _r3_m, _r4_m, _r5_m, _r6_m, _r7_m; \
    \
    LASX_TRANSPOSE16x8_H_HALF(__lasx_xvilvl_h, _in0, _in1, _in2, _in3, _in4, \
                              _in5, _in6, _in7, _in8, _in9, _in10, _in11, \
                              _in12, _in13, _in14, _in15, _r0_m, _r1_m, \
                              _r2_m, _r3_m) \
    LASX_TRANSPOSE16x8_H_HALF(__lasx_xvilvh_h, _in0, _in1, _in2, _in3, _in4, \
                              _in5, _in6, _in7, _in8, _in9, _in10, _in11, \
                              _in12, _in13, _in14, _in15, _r4_m, _r5_m, \
                              _r6_m, _r7_m) \
    _out0 = _r0_m; \
    _out1 = _r1_m; \
    _out2 = _r2_m; \
    _out3 = _r3_m; \
    _out4 = _r4_m; \
    _out5 = _r5_m; \
    _out6 = _r6_m; \
    _out7 = _r7_m; \
  }
1831
/*
 * =============================================================================
 * Description : Transpose a 4x4 block of halfword elements held in four
 *               vectors.
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3
 *               Outputs - _out0, _out1, _out2, _out3
 *               Return Type - signed halfword
 * Details     : The rows of the matrix become columns, and the columns become
 *               rows. Rows are interleaved pairwise as halfwords and then as
 *               words, which yields the values for _out0 and _out2; _out1 and
 *               _out3 are derived from those two with __lasx_xvilvh_d.
 * Example     : See LASX_TRANSPOSE8x8_H
 * =============================================================================
 */
#define LASX_TRANSPOSE4x4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, \
                            _out3) \
  { \
    __m256i _r01_m = __lasx_xvilvl_h(_in1, _in0); \
    __m256i _r23_m = __lasx_xvilvl_h(_in3, _in2); \
    __m256i _c01_m = __lasx_xvilvl_w(_r23_m, _r01_m); \
    __m256i _c23_m = __lasx_xvilvh_w(_r23_m, _r01_m); \
    \
    _out0 = _c01_m; \
    _out1 = __lasx_xvilvh_d(_c01_m, _c01_m); \
    _out2 = _c23_m; \
    _out3 = __lasx_xvilvh_d(_c23_m, _c23_m); \
  }
1855
/*
 * =============================================================================
 * Description : Transpose input 8x8 byte block
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
 *                         (input 8x8 byte block)
 *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
 *                         _out7 (output 8x8 byte block)
 * Details     : Two byte-interleave rounds and one word-interleave round
 *               produce the even-numbered outputs; each odd-numbered output is
 *               the preceding even one shifted by 8 bytes with
 *               __lasx_xvbsrl_v. All inputs are read before the first output
 *               is written.
 * Example     : See LASX_TRANSPOSE8x8_H
 * =============================================================================
 */
#define LASX_TRANSPOSE8x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                            _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                            _out7) \
  { \
    __m256i _r02_m = __lasx_xvilvl_b(_in2, _in0); \
    __m256i _r13_m = __lasx_xvilvl_b(_in3, _in1); \
    __m256i _r46_m = __lasx_xvilvl_b(_in6, _in4); \
    __m256i _r57_m = __lasx_xvilvl_b(_in7, _in5); \
    \
    __m256i _top_lo_m = __lasx_xvilvl_b(_r13_m, _r02_m); \
    __m256i _top_hi_m = __lasx_xvilvh_b(_r13_m, _r02_m); \
    __m256i _bot_lo_m = __lasx_xvilvl_b(_r57_m, _r46_m); \
    __m256i _bot_hi_m = __lasx_xvilvh_b(_r57_m, _r46_m); \
    \
    __m256i _c01_m = __lasx_xvilvl_w(_bot_lo_m, _top_lo_m); \
    __m256i _c23_m = __lasx_xvilvh_w(_bot_lo_m, _top_lo_m); \
    __m256i _c45_m = __lasx_xvilvl_w(_bot_hi_m, _top_hi_m); \
    __m256i _c67_m = __lasx_xvilvh_w(_bot_hi_m, _top_hi_m); \
    \
    _out0 = _c01_m; \
    _out1 = __lasx_xvbsrl_v(_c01_m, 8); \
    _out2 = _c23_m; \
    _out3 = __lasx_xvbsrl_v(_c23_m, 8); \
    _out4 = _c45_m; \
    _out5 = __lasx_xvbsrl_v(_c45_m, 8); \
    _out6 = _c67_m; \
    _out7 = __lasx_xvbsrl_v(_c67_m, 8); \
  }
1889
/*
 * =============================================================================
 * Description : Transpose 8x8 block with halfword elements in vectors.
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
 *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
 *                         _out7
 * Details     : The rows of the matrix become columns, and the columns become
 *               rows. Rows 0-3 and rows 4-7 each go through two halfword
 *               interleave rounds; the even/odd double-words of the two groups
 *               are then merged with __lasx_xvpickev_d / __lasx_xvpickod_d.
 *               All inputs are read before the first output is written.
 * Example     : LASX_TRANSPOSE8x8_H
 *        _in0 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
 *        _in1 : 8,2,3,4, 5,6,7,8, 8,2,3,4, 5,6,7,8
 *        _in2 : 8,2,3,4, 5,6,7,8, 8,2,3,4, 5,6,7,8
 *        _in3 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
 *        _in4 : 9,2,3,4, 5,6,7,8, 9,2,3,4, 5,6,7,8
 *        _in5 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
 *        _in6 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
 *        _in7 : 9,2,3,4, 5,6,7,8, 9,2,3,4, 5,6,7,8
 *
 *       _out0 : 1,8,8,1, 9,1,1,9, 1,8,8,1, 9,1,1,9
 *       _out1 : 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2
 *       _out2 : 3,3,3,3, 3,3,3,3, 3,3,3,3, 3,3,3,3
 *       _out3 : 4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4
 *       _out4 : 5,5,5,5, 5,5,5,5, 5,5,5,5, 5,5,5,5
 *       _out5 : 6,6,6,6, 6,6,6,6, 6,6,6,6, 6,6,6,6
 *       _out6 : 7,7,7,7, 7,7,7,7, 7,7,7,7, 7,7,7,7
 *       _out7 : 8,8,8,8, 8,8,8,8, 8,8,8,8, 8,8,8,8
 * =============================================================================
 */
#define LASX_TRANSPOSE8x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                            _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                            _out7) \
  { \
    __m256i _lo02_m = __lasx_xvilvl_h(_in2, _in0); \
    __m256i _lo13_m = __lasx_xvilvl_h(_in3, _in1); \
    __m256i _hi02_m = __lasx_xvilvh_h(_in2, _in0); \
    __m256i _hi13_m = __lasx_xvilvh_h(_in3, _in1); \
    __m256i _lo46_m = __lasx_xvilvl_h(_in6, _in4); \
    __m256i _lo57_m = __lasx_xvilvl_h(_in7, _in5); \
    __m256i _hi46_m = __lasx_xvilvh_h(_in6, _in4); \
    __m256i _hi57_m = __lasx_xvilvh_h(_in7, _in5); \
    \
    __m256i _top01_m = __lasx_xvilvl_h(_lo13_m, _lo02_m); \
    __m256i _top23_m = __lasx_xvilvh_h(_lo13_m, _lo02_m); \
    __m256i _top45_m = __lasx_xvilvl_h(_hi13_m, _hi02_m); \
    __m256i _top67_m = __lasx_xvilvh_h(_hi13_m, _hi02_m); \
    __m256i _bot01_m = __lasx_xvilvl_h(_lo57_m, _lo46_m); \
    __m256i _bot23_m = __lasx_xvilvh_h(_lo57_m, _lo46_m); \
    __m256i _bot45_m = __lasx_xvilvl_h(_hi57_m, _hi46_m); \
    __m256i _bot67_m = __lasx_xvilvh_h(_hi57_m, _hi46_m); \
    \
    _out0 = __lasx_xvpickev_d(_bot01_m, _top01_m); \
    _out1 = __lasx_xvpickod_d(_bot01_m, _top01_m); \
    _out2 = __lasx_xvpickev_d(_bot23_m, _top23_m); \
    _out3 = __lasx_xvpickod_d(_bot23_m, _top23_m); \
    _out4 = __lasx_xvpickev_d(_bot45_m, _top45_m); \
    _out5 = __lasx_xvpickod_d(_bot45_m, _top45_m); \
    _out6 = __lasx_xvpickev_d(_bot67_m, _top67_m); \
    _out7 = __lasx_xvpickod_d(_bot67_m, _top67_m); \
  }
1952
/*
 * =============================================================================
 * Description : Butterfly of 4 input vectors
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3
 *               Outputs - _out0, _out1, _out2, _out3
 * Details     : Butterfly operation. The _B/_H/_W/_D suffix selects the
 *               element width of the add/sub intrinsics used.
 *               All four results are computed before any output is stored,
 *               so an output may be the same variable as an input.
 * Example     : LASX_BUTTERFLY_4
 *               _out0 = _in0 + _in3;
 *               _out1 = _in1 + _in2;
 *               _out2 = _in1 - _in2;
 *               _out3 = _in0 - _in3;
 * =============================================================================
 */
#define LASX_BUTTERFLY_4_B(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  { \
    __m256i _sum03_m = __lasx_xvadd_b(_in0, _in3); \
    __m256i _sum12_m = __lasx_xvadd_b(_in1, _in2); \
    __m256i _dif12_m = __lasx_xvsub_b(_in1, _in2); \
    __m256i _dif03_m = __lasx_xvsub_b(_in0, _in3); \
    \
    _out0 = _sum03_m; \
    _out1 = _sum12_m; \
    _out2 = _dif12_m; \
    _out3 = _dif03_m; \
  }
/*
 * Halfword variant of the 4-input butterfly:
 *   _out0 = _in0 + _in3, _out1 = _in1 + _in2,
 *   _out2 = _in1 - _in2, _out3 = _in0 - _in3.
 * Results are staged in locals first, so outputs may alias inputs.
 */
#define LASX_BUTTERFLY_4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  { \
    __m256i _sum03_m = __lasx_xvadd_h(_in0, _in3); \
    __m256i _sum12_m = __lasx_xvadd_h(_in1, _in2); \
    __m256i _dif12_m = __lasx_xvsub_h(_in1, _in2); \
    __m256i _dif03_m = __lasx_xvsub_h(_in0, _in3); \
    \
    _out0 = _sum03_m; \
    _out1 = _sum12_m; \
    _out2 = _dif12_m; \
    _out3 = _dif03_m; \
  }
/*
 * Word variant of the 4-input butterfly:
 *   _out0 = _in0 + _in3, _out1 = _in1 + _in2,
 *   _out2 = _in1 - _in2, _out3 = _in0 - _in3.
 * Results are staged in locals first, so outputs may alias inputs.
 */
#define LASX_BUTTERFLY_4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  { \
    __m256i _sum03_m = __lasx_xvadd_w(_in0, _in3); \
    __m256i _sum12_m = __lasx_xvadd_w(_in1, _in2); \
    __m256i _dif12_m = __lasx_xvsub_w(_in1, _in2); \
    __m256i _dif03_m = __lasx_xvsub_w(_in0, _in3); \
    \
    _out0 = _sum03_m; \
    _out1 = _sum12_m; \
    _out2 = _dif12_m; \
    _out3 = _dif03_m; \
  }
/*
 * Double-word variant of the 4-input butterfly:
 *   _out0 = _in0 + _in3, _out1 = _in1 + _in2,
 *   _out2 = _in1 - _in2, _out3 = _in0 - _in3.
 * Results are staged in locals first, so outputs may alias inputs.
 */
#define LASX_BUTTERFLY_4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  { \
    __m256i _sum03_m = __lasx_xvadd_d(_in0, _in3); \
    __m256i _sum12_m = __lasx_xvadd_d(_in1, _in2); \
    __m256i _dif12_m = __lasx_xvsub_d(_in1, _in2); \
    __m256i _dif03_m = __lasx_xvsub_d(_in0, _in3); \
    \
    _out0 = _sum03_m; \
    _out1 = _sum12_m; \
    _out2 = _dif12_m; \
    _out3 = _dif03_m; \
  }
1994
/*
 * =============================================================================
 * Description : Butterfly of 8 input vectors
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
 *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
 *                         _out7
 * Details     : Butterfly operation. The _B/_H/_W/_D suffix selects the
 *               element width of the add/sub intrinsics used.
 *               All eight results are computed before any output is stored,
 *               so an output may be the same variable as an input.
 * Example     : LASX_BUTTERFLY_8
 *               _out0 = _in0 + _in7;
 *               _out1 = _in1 + _in6;
 *               _out2 = _in2 + _in5;
 *               _out3 = _in3 + _in4;
 *               _out4 = _in3 - _in4;
 *               _out5 = _in2 - _in5;
 *               _out6 = _in1 - _in6;
 *               _out7 = _in0 - _in7;
 * =============================================================================
 */
#define LASX_BUTTERFLY_8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                           _out7) \
  { \
    __m256i _sum07_m = __lasx_xvadd_b(_in0, _in7); \
    __m256i _sum16_m = __lasx_xvadd_b(_in1, _in6); \
    __m256i _sum25_m = __lasx_xvadd_b(_in2, _in5); \
    __m256i _sum34_m = __lasx_xvadd_b(_in3, _in4); \
    __m256i _dif34_m = __lasx_xvsub_b(_in3, _in4); \
    __m256i _dif25_m = __lasx_xvsub_b(_in2, _in5); \
    __m256i _dif16_m = __lasx_xvsub_b(_in1, _in6); \
    __m256i _dif07_m = __lasx_xvsub_b(_in0, _in7); \
    \
    _out0 = _sum07_m; \
    _out1 = _sum16_m; \
    _out2 = _sum25_m; \
    _out3 = _sum34_m; \
    _out4 = _dif34_m; \
    _out5 = _dif25_m; \
    _out6 = _dif16_m; \
    _out7 = _dif07_m; \
  }
2025
/*
 * Halfword variant of the 8-input butterfly:
 *   _out0.._out3 = _in0 + _in7, _in1 + _in6, _in2 + _in5, _in3 + _in4
 *   _out4.._out7 = _in3 - _in4, _in2 - _in5, _in1 - _in6, _in0 - _in7
 * Results are staged in locals first, so outputs may alias inputs.
 */
#define LASX_BUTTERFLY_8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                           _out7) \
  { \
    __m256i _sum07_m = __lasx_xvadd_h(_in0, _in7); \
    __m256i _sum16_m = __lasx_xvadd_h(_in1, _in6); \
    __m256i _sum25_m = __lasx_xvadd_h(_in2, _in5); \
    __m256i _sum34_m = __lasx_xvadd_h(_in3, _in4); \
    __m256i _dif34_m = __lasx_xvsub_h(_in3, _in4); \
    __m256i _dif25_m = __lasx_xvsub_h(_in2, _in5); \
    __m256i _dif16_m = __lasx_xvsub_h(_in1, _in6); \
    __m256i _dif07_m = __lasx_xvsub_h(_in0, _in7); \
    \
    _out0 = _sum07_m; \
    _out1 = _sum16_m; \
    _out2 = _sum25_m; \
    _out3 = _sum34_m; \
    _out4 = _dif34_m; \
    _out5 = _dif25_m; \
    _out6 = _dif16_m; \
    _out7 = _dif07_m; \
  }
2039
/*
 * Word variant of the 8-input butterfly:
 *   _out0.._out3 = _in0 + _in7, _in1 + _in6, _in2 + _in5, _in3 + _in4
 *   _out4.._out7 = _in3 - _in4, _in2 - _in5, _in1 - _in6, _in0 - _in7
 * Results are staged in locals first, so outputs may alias inputs.
 */
#define LASX_BUTTERFLY_8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                           _out7) \
  { \
    __m256i _sum07_m = __lasx_xvadd_w(_in0, _in7); \
    __m256i _sum16_m = __lasx_xvadd_w(_in1, _in6); \
    __m256i _sum25_m = __lasx_xvadd_w(_in2, _in5); \
    __m256i _sum34_m = __lasx_xvadd_w(_in3, _in4); \
    __m256i _dif34_m = __lasx_xvsub_w(_in3, _in4); \
    __m256i _dif25_m = __lasx_xvsub_w(_in2, _in5); \
    __m256i _dif16_m = __lasx_xvsub_w(_in1, _in6); \
    __m256i _dif07_m = __lasx_xvsub_w(_in0, _in7); \
    \
    _out0 = _sum07_m; \
    _out1 = _sum16_m; \
    _out2 = _sum25_m; \
    _out3 = _sum34_m; \
    _out4 = _dif34_m; \
    _out5 = _dif25_m; \
    _out6 = _dif16_m; \
    _out7 = _dif07_m; \
  }
2053
/*
 * Double-word variant of the 8-input butterfly:
 *   _out0.._out3 = _in0 + _in7, _in1 + _in6, _in2 + _in5, _in3 + _in4
 *   _out4.._out7 = _in3 - _in4, _in2 - _in5, _in1 - _in6, _in0 - _in7
 * Results are staged in locals first, so outputs may alias inputs.
 */
#define LASX_BUTTERFLY_8_D(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                           _out7) \
  { \
    __m256i _sum07_m = __lasx_xvadd_d(_in0, _in7); \
    __m256i _sum16_m = __lasx_xvadd_d(_in1, _in6); \
    __m256i _sum25_m = __lasx_xvadd_d(_in2, _in5); \
    __m256i _sum34_m = __lasx_xvadd_d(_in3, _in4); \
    __m256i _dif34_m = __lasx_xvsub_d(_in3, _in4); \
    __m256i _dif25_m = __lasx_xvsub_d(_in2, _in5); \
    __m256i _dif16_m = __lasx_xvsub_d(_in1, _in6); \
    __m256i _dif07_m = __lasx_xvsub_d(_in0, _in7); \
    \
    _out0 = _sum07_m; \
    _out1 = _sum16_m; \
    _out2 = _sum25_m; \
    _out3 = _sum34_m; \
    _out4 = _dif34_m; \
    _out5 = _dif25_m; \
    _out6 = _dif16_m; \
    _out7 = _dif07_m; \
  }
2067
2068 #endif // LASX
2069
/*
 * =============================================================================
 * Description : Print out elements in vector.
 * Arguments   : Inputs  - RTYPE, element_num, in0, enter
 *               Outputs -
 * Details     : Print out 'element_num' elements in 'RTYPE' vector 'in0', if
 *               'enter' is TRUE, prefix "\nVP:" will be added first.
 *               'in0' is cast to RTYPE as a whole expression and the result
 *               is indexed with []. Every element is printed with "%d,", so
 *               the elements must be valid "%d" arguments. 'element_num' is
 *               evaluated on every loop iteration. printf must be declared
 *               (<stdio.h>) wherever the macro is used.
 * Example     : VECT_PRINT(v4i32,4,in0,1); // in0: 1,2,3,4
 *               VP:1,2,3,4,
 * =============================================================================
 */
#define VECT_PRINT(RTYPE, element_num, in0, enter) \
  { \
    RTYPE _tmp0 = (RTYPE)(in0); \
    int _i = 0; \
    if (enter) { \
      printf("\nVP:"); \
    } \
    for (_i = 0; _i < (element_num); _i++) { \
      printf("%d,", _tmp0[_i]); \
    } \
  }
2088
2089 #endif /* LOONGSON_INTRINSICS_H */
2090 #endif /* VPX_VPX_UTIL_LOONGSON_INTRINSICS_H_ */
2091