1*b2055c35SXin Li // Copyright 2016 Google Inc. All Rights Reserved.
2*b2055c35SXin Li //
3*b2055c35SXin Li // Use of this source code is governed by a BSD-style license
4*b2055c35SXin Li // that can be found in the COPYING file in the root of the source
5*b2055c35SXin Li // tree. An additional intellectual property rights grant can be found
6*b2055c35SXin Li // in the file PATENTS. All contributing project authors may
7*b2055c35SXin Li // be found in the AUTHORS file in the root of the source tree.
8*b2055c35SXin Li // -----------------------------------------------------------------------------
9*b2055c35SXin Li //
10*b2055c35SXin Li // MSA common macros
11*b2055c35SXin Li //
12*b2055c35SXin Li // Author(s): Prashant Patil ([email protected])
13*b2055c35SXin Li
14*b2055c35SXin Li #ifndef WEBP_DSP_MSA_MACRO_H_
15*b2055c35SXin Li #define WEBP_DSP_MSA_MACRO_H_
16*b2055c35SXin Li
17*b2055c35SXin Li #include "src/dsp/dsp.h"
18*b2055c35SXin Li
19*b2055c35SXin Li #if defined(WEBP_USE_MSA)
20*b2055c35SXin Li
21*b2055c35SXin Li #include <stdint.h>
22*b2055c35SXin Li #include <msa.h>
23*b2055c35SXin Li
#if defined(__clang__)
  #define CLANG_BUILD
#endif

/* Immediate-operand helpers.
 * With clang each helper forwards to the matching __msa_* intrinsic after
 * casting 'a' to the vector type that intrinsic works on; otherwise the
 * helper is spelled with the plain C operator applied to 'a' and 'b'.
 * ALPHAVAL is (-1) under clang and (0xff) otherwise.
 * NOTE(review): in the operator branch SRAI_* shift 'a' in its own type, so
 * an unsigned operand would be shifted logically -- confirm callers only pass
 * signed vectors.
 */
#ifdef CLANG_BUILD
  #define ALPHAVAL  (-1)
  #define ADDVI_H(a, b)  __msa_addvi_h((v8i16)(a), (b))
  #define ADDVI_W(a, b)  __msa_addvi_w((v4i32)(a), (b))
  #define SRAI_B(a, b)  __msa_srai_b((v16i8)(a), (b))
  #define SRAI_H(a, b)  __msa_srai_h((v8i16)(a), (b))
  #define SRAI_W(a, b)  __msa_srai_w((v4i32)(a), (b))
  #define SRLI_H(a, b)  __msa_srli_h((v8i16)(a), (b))
  /* slli.b works on byte lanes, so the operand is viewed as v16i8. */
  #define SLLI_B(a, b)  __msa_slli_b((v16i8)(a), (b))
  #define ANDI_B(a, b)  __msa_andi_b((v16u8)(a), (b))
  #define ORI_B(a, b)  __msa_ori_b((v16u8)(a), (b))
#else
  #define ALPHAVAL  (0xff)
  #define ADDVI_H(a, b)  ((a) + (b))
  #define ADDVI_W(a, b)  ((a) + (b))
  #define SRAI_B(a, b)  ((a) >> (b))
  #define SRAI_H(a, b)  ((a) >> (b))
  #define SRAI_W(a, b)  ((a) >> (b))
  /* Logical right shift: shift the unsigned halfword view so that zeros are
   * shifted in, then convert back to the operand's own type. */
  #define SRLI_H(a, b)  ((__typeof__(a))((v8u16)(a) >> (b)))
  #define SLLI_B(a, b)  ((a) << (b))
  #define ANDI_B(a, b)  ((a) & (b))
  #define ORI_B(a, b)  ((a) | (b))
#endif
51*b2055c35SXin Li
/* Whole-vector loads: 'psrc' is reinterpreted as a pointer to RTYPE and one
 * RTYPE value is read through it. The LD_U* and LD_S* forms fix RTYPE to the
 * unsigned / signed vector type of the given element width.
 * Each expansion is fully parenthesized so it can be used inside a larger
 * expression.
 */
#define LD_B(RTYPE, psrc) (*((RTYPE*)(psrc)))
#define LD_UB(...) LD_B(v16u8, __VA_ARGS__)
#define LD_SB(...) LD_B(v16i8, __VA_ARGS__)

#define LD_H(RTYPE, psrc) (*((RTYPE*)(psrc)))
#define LD_UH(...) LD_H(v8u16, __VA_ARGS__)
#define LD_SH(...) LD_H(v8i16, __VA_ARGS__)

#define LD_W(RTYPE, psrc) (*((RTYPE*)(psrc)))
#define LD_UW(...) LD_W(v4u32, __VA_ARGS__)
#define LD_SW(...) LD_W(v4i32, __VA_ARGS__)

/* Whole-vector stores: 'pdst' is reinterpreted as a pointer to RTYPE and 'in'
 * is assigned through it.
 */
#define ST_B(RTYPE, in, pdst) (*((RTYPE*)(pdst)) = (in))
#define ST_UB(...) ST_B(v16u8, __VA_ARGS__)
#define ST_SB(...) ST_B(v16i8, __VA_ARGS__)

#define ST_H(RTYPE, in, pdst) (*((RTYPE*)(pdst)) = (in))
#define ST_UH(...) ST_H(v8u16, __VA_ARGS__)
#define ST_SH(...) ST_H(v8i16, __VA_ARGS__)

#define ST_W(RTYPE, in, pdst) (*((RTYPE*)(pdst)) = (in))
#define ST_UW(...) ST_W(v4u32, __VA_ARGS__)
#define ST_SW(...) ST_W(v4i32, __VA_ARGS__)
75*b2055c35SXin Li
/* Defines a static inline function FUNC_NAME(psrc) returning TYPE.
 * The function executes the single assembly instruction INSTR with the
 * result bound to a general-purpose register ("=r") and the byte at 'psrc'
 * as its memory operand ("m"), then returns the loaded value.
 */
#define MSA_LOAD_FUNC(TYPE, INSTR, FUNC_NAME) \
  static inline TYPE FUNC_NAME(const void* const psrc) { \
    const uint8_t* const psrc_m = (const uint8_t*)psrc; \
    TYPE val_m; \
    __asm__ volatile("" #INSTR " %[val_m], %[psrc_m] \n\t" \
                     : [val_m] "=r"(val_m) \
                     : [psrc_m] "m"(*psrc_m)); \
    return val_m; \
  }

/* Calls the load function FUNC_NAME on 'psrc'. */
#define MSA_LOAD(psrc, FUNC_NAME) FUNC_NAME(psrc)

/* Defines a static inline function FUNC_NAME(val, pdst) returning void.
 * The function executes the single assembly instruction INSTR with 'val'
 * held in a general-purpose register ("r") and the byte at 'pdst' as its
 * memory output operand ("=m").
 */
#define MSA_STORE_FUNC(TYPE, INSTR, FUNC_NAME) \
  static inline void FUNC_NAME(TYPE val, void* const pdst) { \
    uint8_t* const pdst_m = (uint8_t*)pdst; \
    TYPE val_m = val; \
    __asm__ volatile(" " #INSTR " %[val_m], %[pdst_m] \n\t" \
                     : [pdst_m] "=m"(*pdst_m) \
                     : [val_m] "r"(val_m)); \
  }

/* Calls the store function FUNC_NAME with 'val' and 'pdst'. */
#define MSA_STORE(val, pdst, FUNC_NAME) FUNC_NAME(val, pdst)
98*b2055c35SXin Li
99*b2055c35SXin Li #if (__mips_isa_rev >= 6)
100*b2055c35SXin Li MSA_LOAD_FUNC(uint16_t, lh, msa_lh);
101*b2055c35SXin Li #define LH(psrc) MSA_LOAD(psrc, msa_lh)
102*b2055c35SXin Li MSA_LOAD_FUNC(uint32_t, lw, msa_lw);
103*b2055c35SXin Li #define LW(psrc) MSA_LOAD(psrc, msa_lw)
104*b2055c35SXin Li #if (__mips == 64)
105*b2055c35SXin Li MSA_LOAD_FUNC(uint64_t, ld, msa_ld);
106*b2055c35SXin Li #define LD(psrc) MSA_LOAD(psrc, msa_ld)
107*b2055c35SXin Li #else // !(__mips == 64)
108*b2055c35SXin Li #define LD(psrc) ((((uint64_t)MSA_LOAD(psrc + 4, msa_lw)) << 32) | \
109*b2055c35SXin Li MSA_LOAD(psrc, msa_lw))
110*b2055c35SXin Li #endif // (__mips == 64)
111*b2055c35SXin Li
112*b2055c35SXin Li MSA_STORE_FUNC(uint16_t, sh, msa_sh);
113*b2055c35SXin Li #define SH(val, pdst) MSA_STORE(val, pdst, msa_sh)
114*b2055c35SXin Li MSA_STORE_FUNC(uint32_t, sw, msa_sw);
115*b2055c35SXin Li #define SW(val, pdst) MSA_STORE(val, pdst, msa_sw)
116*b2055c35SXin Li MSA_STORE_FUNC(uint64_t, sd, msa_sd);
117*b2055c35SXin Li #define SD(val, pdst) MSA_STORE(val, pdst, msa_sd)
118*b2055c35SXin Li #else // !(__mips_isa_rev >= 6)
119*b2055c35SXin Li MSA_LOAD_FUNC(uint16_t, ulh, msa_ulh);
120*b2055c35SXin Li #define LH(psrc) MSA_LOAD(psrc, msa_ulh)
121*b2055c35SXin Li MSA_LOAD_FUNC(uint32_t, ulw, msa_ulw);
122*b2055c35SXin Li #define LW(psrc) MSA_LOAD(psrc, msa_ulw)
123*b2055c35SXin Li #if (__mips == 64)
124*b2055c35SXin Li MSA_LOAD_FUNC(uint64_t, uld, msa_uld);
125*b2055c35SXin Li #define LD(psrc) MSA_LOAD(psrc, msa_uld)
126*b2055c35SXin Li #else // !(__mips == 64)
127*b2055c35SXin Li #define LD(psrc) ((((uint64_t)MSA_LOAD(psrc + 4, msa_ulw)) << 32) | \
128*b2055c35SXin Li MSA_LOAD(psrc, msa_ulw))
129*b2055c35SXin Li #endif // (__mips == 64)
130*b2055c35SXin Li
131*b2055c35SXin Li MSA_STORE_FUNC(uint16_t, ush, msa_ush);
132*b2055c35SXin Li #define SH(val, pdst) MSA_STORE(val, pdst, msa_ush)
133*b2055c35SXin Li MSA_STORE_FUNC(uint32_t, usw, msa_usw);
134*b2055c35SXin Li #define SW(val, pdst) MSA_STORE(val, pdst, msa_usw)
135*b2055c35SXin Li #define SD(val, pdst) do { \
136*b2055c35SXin Li uint8_t* const pdst_sd_m = (uint8_t*)(pdst); \
137*b2055c35SXin Li const uint32_t val0_m = (uint32_t)(val & 0x00000000FFFFFFFF); \
138*b2055c35SXin Li const uint32_t val1_m = (uint32_t)((val >> 32) & 0x00000000FFFFFFFF); \
139*b2055c35SXin Li SW(val0_m, pdst_sd_m); \
140*b2055c35SXin Li SW(val1_m, pdst_sd_m + 4); \
141*b2055c35SXin Li } while (0)
142*b2055c35SXin Li #endif // (__mips_isa_rev >= 6)
143*b2055c35SXin Li
/* Description : Load 4 words with stride
 * Arguments   : Inputs  - psrc, stride
 *               Outputs - out0, out1, out2, out3
 * Details     : Load word in 'out0' from (psrc)
 *               Load word in 'out1' from (psrc + stride)
 *               Load word in 'out2' from (psrc + 2 * stride)
 *               Load word in 'out3' from (psrc + 3 * stride)
 *               'psrc' is converted to a byte pointer, so 'stride' is a byte
 *               count. Both are evaluated as complete expressions, and the
 *               cursor has a macro-private name so that it cannot collide
 *               with a caller variable.
 */
#define LW4(psrc, stride, out0, out1, out2, out3) do { \
    const uint8_t* ptmp_lw4_m = (const uint8_t*)(psrc); \
    out0 = LW(ptmp_lw4_m); \
    ptmp_lw4_m += (stride); \
    out1 = LW(ptmp_lw4_m); \
    ptmp_lw4_m += (stride); \
    out2 = LW(ptmp_lw4_m); \
    ptmp_lw4_m += (stride); \
    out3 = LW(ptmp_lw4_m); \
  } while (0)
162*b2055c35SXin Li
/* Description : Store words with stride
 * Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
 * Details     : Store word from 'in0' to (pdst)
 *               Store word from 'in1' to (pdst + stride)
 *               Store word from 'in2' to (pdst + 2 * stride)
 *               Store word from 'in3' to (pdst + 3 * stride)
 *               SW3 and SW2 store only the first three / two words.
 *               'pdst' is converted to a byte pointer, so 'stride' is a byte
 *               count. Both are evaluated as complete expressions, and each
 *               cursor has a macro-private name.
 */
#define SW4(in0, in1, in2, in3, pdst, stride) do { \
    uint8_t* ptmp_sw4_m = (uint8_t*)(pdst); \
    SW(in0, ptmp_sw4_m); \
    ptmp_sw4_m += (stride); \
    SW(in1, ptmp_sw4_m); \
    ptmp_sw4_m += (stride); \
    SW(in2, ptmp_sw4_m); \
    ptmp_sw4_m += (stride); \
    SW(in3, ptmp_sw4_m); \
  } while (0)

#define SW3(in0, in1, in2, pdst, stride) do { \
    uint8_t* ptmp_sw3_m = (uint8_t*)(pdst); \
    SW(in0, ptmp_sw3_m); \
    ptmp_sw3_m += (stride); \
    SW(in1, ptmp_sw3_m); \
    ptmp_sw3_m += (stride); \
    SW(in2, ptmp_sw3_m); \
  } while (0)

#define SW2(in0, in1, pdst, stride) do { \
    uint8_t* ptmp_sw2_m = (uint8_t*)(pdst); \
    SW(in0, ptmp_sw2_m); \
    ptmp_sw2_m += (stride); \
    SW(in1, ptmp_sw2_m); \
  } while (0)
196*b2055c35SXin Li
/* Description : Store 4 double words with stride
 * Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
 * Details     : Store double word from 'in0' to (pdst)
 *               Store double word from 'in1' to (pdst + stride)
 *               Store double word from 'in2' to (pdst + 2 * stride)
 *               Store double word from 'in3' to (pdst + 3 * stride)
 *               'pdst' is converted to a byte pointer, so 'stride' is a byte
 *               count. Both are evaluated as complete expressions, and the
 *               cursor has a macro-private name.
 */
#define SD4(in0, in1, in2, in3, pdst, stride) do { \
    uint8_t* ptmp_sd4_m = (uint8_t*)(pdst); \
    SD(in0, ptmp_sd4_m); \
    ptmp_sd4_m += (stride); \
    SD(in1, ptmp_sd4_m); \
    ptmp_sd4_m += (stride); \
    SD(in2, ptmp_sd4_m); \
    ptmp_sd4_m += (stride); \
    SD(in3, ptmp_sd4_m); \
  } while (0)
214*b2055c35SXin Li
/* Description : Load vectors with 16 byte elements with stride
 * Arguments   : Inputs  - psrc, stride
 *               Outputs - out0, out1 (LD_B3/LD_B4/LD_B8: up to out7)
 *               Return Type - as per RTYPE
 * Details     : Load 16 byte elements in 'out0' from (psrc)
 *               Load 16 byte elements in 'out1' from (psrc + stride)
 *               The wider variants continue with (psrc + k * stride).
 *               'stride' is added to 'psrc' with ordinary pointer arithmetic,
 *               i.e. it is scaled by the pointee type of 'psrc'.
 *               'psrc' and 'stride' are parenthesized so that compound
 *               expressions keep their meaning in the address computation.
 */
#define LD_B2(RTYPE, psrc, stride, out0, out1) do { \
    out0 = LD_B(RTYPE, (psrc)); \
    out1 = LD_B(RTYPE, (psrc) + (stride)); \
  } while (0)
#define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__)
#define LD_SB2(...) LD_B2(v16i8, __VA_ARGS__)

#define LD_B3(RTYPE, psrc, stride, out0, out1, out2) do { \
    LD_B2(RTYPE, (psrc), (stride), out0, out1); \
    out2 = LD_B(RTYPE, (psrc) + 2 * (stride)); \
  } while (0)
#define LD_UB3(...) LD_B3(v16u8, __VA_ARGS__)
#define LD_SB3(...) LD_B3(v16i8, __VA_ARGS__)

#define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) do { \
    LD_B2(RTYPE, (psrc), (stride), out0, out1); \
    LD_B2(RTYPE, (psrc) + 2 * (stride), (stride), out2, out3); \
  } while (0)
#define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__)
#define LD_SB4(...) LD_B4(v16i8, __VA_ARGS__)

#define LD_B8(RTYPE, psrc, stride, \
              out0, out1, out2, out3, out4, out5, out6, out7) do { \
    LD_B4(RTYPE, (psrc), (stride), out0, out1, out2, out3); \
    LD_B4(RTYPE, (psrc) + 4 * (stride), (stride), out4, out5, out6, out7); \
  } while (0)
#define LD_UB8(...) LD_B8(v16u8, __VA_ARGS__)
#define LD_SB8(...) LD_B8(v16i8, __VA_ARGS__)
250*b2055c35SXin Li
/* Description : Load vectors with 8 halfword elements with stride
 * Arguments   : Inputs  - psrc, stride
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Load 8 halfword elements in 'out0' from (psrc)
 *               Load 8 halfword elements in 'out1' from (psrc + stride)
 *               'stride' is added to 'psrc' with ordinary pointer arithmetic,
 *               i.e. it is scaled by the pointee type of 'psrc'.
 */
#define LD_H2(RTYPE, psrc, stride, out0, out1) do { \
    out0 = LD_H(RTYPE, (psrc)); \
    out1 = LD_H(RTYPE, (psrc) + (stride)); \
  } while (0)
#define LD_UH2(...) LD_H2(v8u16, __VA_ARGS__)
#define LD_SH2(...) LD_H2(v8i16, __VA_ARGS__)
263*b2055c35SXin Li
/* Description : Load vectors with 4 word elements with stride
 * Arguments   : Inputs  - psrc, stride
 *               Outputs - out0, out1 (LD_W3: out2, LD_W4: out2, out3)
 *               Return Type - as per RTYPE
 * Details     : Load 4 word elements in 'out0' from (psrc + 0 * stride)
 *               Load 4 word elements in 'out1' from (psrc + 1 * stride)
 *               Load 4 word elements in 'out2' from (psrc + 2 * stride)
 *               Load 4 word elements in 'out3' from (psrc + 3 * stride)
 *               'stride' is added to 'psrc' with ordinary pointer arithmetic,
 *               i.e. it is scaled by the pointee type of 'psrc'.
 */
#define LD_W2(RTYPE, psrc, stride, out0, out1) do { \
    out0 = LD_W(RTYPE, (psrc)); \
    out1 = LD_W(RTYPE, (psrc) + (stride)); \
  } while (0)
#define LD_UW2(...) LD_W2(v4u32, __VA_ARGS__)
#define LD_SW2(...) LD_W2(v4i32, __VA_ARGS__)

#define LD_W3(RTYPE, psrc, stride, out0, out1, out2) do { \
    LD_W2(RTYPE, (psrc), (stride), out0, out1); \
    out2 = LD_W(RTYPE, (psrc) + 2 * (stride)); \
  } while (0)
#define LD_UW3(...) LD_W3(v4u32, __VA_ARGS__)
#define LD_SW3(...) LD_W3(v4i32, __VA_ARGS__)

#define LD_W4(RTYPE, psrc, stride, out0, out1, out2, out3) do { \
    LD_W2(RTYPE, (psrc), (stride), out0, out1); \
    LD_W2(RTYPE, (psrc) + 2 * (stride), (stride), out2, out3); \
  } while (0)
#define LD_UW4(...) LD_W4(v4u32, __VA_ARGS__)
#define LD_SW4(...) LD_W4(v4i32, __VA_ARGS__)
292*b2055c35SXin Li
/* Description : Store vectors of 16 byte elements with stride
 * Arguments   : Inputs - in0, in1 (ST_B4/ST_B8: up to in7), pdst, stride
 * Details     : Store 16 byte elements from 'in0' to (pdst)
 *               Store 16 byte elements from 'in1' to (pdst + stride)
 *               The wider variants continue with (pdst + k * stride).
 *               'stride' is added to 'pdst' with ordinary pointer arithmetic,
 *               i.e. it is scaled by the pointee type of 'pdst'.
 */
#define ST_B2(RTYPE, in0, in1, pdst, stride) do { \
    ST_B(RTYPE, in0, (pdst)); \
    ST_B(RTYPE, in1, (pdst) + (stride)); \
  } while (0)
#define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__)
#define ST_SB2(...) ST_B2(v16i8, __VA_ARGS__)

#define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride) do { \
    ST_B2(RTYPE, in0, in1, (pdst), (stride)); \
    ST_B2(RTYPE, in2, in3, (pdst) + 2 * (stride), (stride)); \
  } while (0)
#define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__)
#define ST_SB4(...) ST_B4(v16i8, __VA_ARGS__)

#define ST_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
              pdst, stride) do { \
    ST_B4(RTYPE, in0, in1, in2, in3, (pdst), (stride)); \
    ST_B4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * (stride), (stride)); \
  } while (0)
#define ST_UB8(...) ST_B8(v16u8, __VA_ARGS__)
318*b2055c35SXin Li
/* Description : Store vectors of 4 word elements with stride
 * Arguments   : Inputs - in0, in1 (ST_W3: in2, ST_W4: in2, in3), pdst, stride
 * Details     : Store 4 word elements from 'in0' to (pdst + 0 * stride)
 *               Store 4 word elements from 'in1' to (pdst + 1 * stride)
 *               Store 4 word elements from 'in2' to (pdst + 2 * stride)
 *               Store 4 word elements from 'in3' to (pdst + 3 * stride)
 *               'stride' is added to 'pdst' with ordinary pointer arithmetic,
 *               i.e. it is scaled by the pointee type of 'pdst'.
 */
#define ST_W2(RTYPE, in0, in1, pdst, stride) do { \
    ST_W(RTYPE, in0, (pdst)); \
    ST_W(RTYPE, in1, (pdst) + (stride)); \
  } while (0)
#define ST_UW2(...) ST_W2(v4u32, __VA_ARGS__)
#define ST_SW2(...) ST_W2(v4i32, __VA_ARGS__)

#define ST_W3(RTYPE, in0, in1, in2, pdst, stride) do { \
    ST_W2(RTYPE, in0, in1, (pdst), (stride)); \
    ST_W(RTYPE, in2, (pdst) + 2 * (stride)); \
  } while (0)
#define ST_UW3(...) ST_W3(v4u32, __VA_ARGS__)
#define ST_SW3(...) ST_W3(v4i32, __VA_ARGS__)

#define ST_W4(RTYPE, in0, in1, in2, in3, pdst, stride) do { \
    ST_W2(RTYPE, in0, in1, (pdst), (stride)); \
    ST_W2(RTYPE, in2, in3, (pdst) + 2 * (stride), (stride)); \
  } while (0)
#define ST_UW4(...) ST_W4(v4u32, __VA_ARGS__)
#define ST_SW4(...) ST_W4(v4i32, __VA_ARGS__)
346*b2055c35SXin Li
/* Description : Store vectors of 8 halfword elements with stride
 * Arguments   : Inputs - in0, in1, pdst, stride
 * Details     : Store 8 halfword elements from 'in0' to (pdst)
 *               Store 8 halfword elements from 'in1' to (pdst + stride)
 *               'stride' is added to 'pdst' with ordinary pointer arithmetic,
 *               i.e. it is scaled by the pointee type of 'pdst'.
 */
#define ST_H2(RTYPE, in0, in1, pdst, stride) do { \
    ST_H(RTYPE, in0, (pdst)); \
    ST_H(RTYPE, in1, (pdst) + (stride)); \
  } while (0)
#define ST_UH2(...) ST_H2(v8u16, __VA_ARGS__)
#define ST_SH2(...) ST_H2(v8i16, __VA_ARGS__)
358*b2055c35SXin Li
/* Description : Store 2x4 byte block to destination memory from input vector
 * Arguments   : Inputs - in, stidx, pdst, stride
 * Details     : Index 'stidx' halfword element from 'in' vector is copied to
 *               the GP register and stored to (pdst)
 *               Index 'stidx+1' halfword element from 'in' vector is copied to
 *               the GP register and stored to (pdst + stride)
 *               Index 'stidx+2' halfword element from 'in' vector is copied to
 *               the GP register and stored to (pdst + 2 * stride)
 *               Index 'stidx+3' halfword element from 'in' vector is copied to
 *               the GP register and stored to (pdst + 3 * stride)
 *               'pdst' is converted to a byte pointer, so 'stride' is a byte
 *               count. All four elements are read before the first store.
 */
#define ST2x4_UB(in, stidx, pdst, stride) do { \
    uint8_t* pblk_2x4_m = (uint8_t*)(pdst); \
    const uint16_t out0_m = __msa_copy_s_h((v8i16)(in), (stidx)); \
    const uint16_t out1_m = __msa_copy_s_h((v8i16)(in), (stidx) + 1); \
    const uint16_t out2_m = __msa_copy_s_h((v8i16)(in), (stidx) + 2); \
    const uint16_t out3_m = __msa_copy_s_h((v8i16)(in), (stidx) + 3); \
    SH(out0_m, pblk_2x4_m); \
    pblk_2x4_m += (stride); \
    SH(out1_m, pblk_2x4_m); \
    pblk_2x4_m += (stride); \
    SH(out2_m, pblk_2x4_m); \
    pblk_2x4_m += (stride); \
    SH(out3_m, pblk_2x4_m); \
  } while (0)
384*b2055c35SXin Li
/* Description : Store 4x4 byte block to destination memory from input vectors
 * Arguments   : Inputs - in0, in1, idx0, idx1, idx2, idx3, pdst, stride
 * Details     : 'idx0' word element from input vector 'in0' is copied to the
 *               GP register and stored to (pdst)
 *               'idx1' word element from input vector 'in0' is copied to the
 *               GP register and stored to (pdst + stride)
 *               'idx2' word element from input vector 'in1' is copied to the
 *               GP register and stored to (pdst + 2 * stride)
 *               'idx3' word element from input vector 'in1' is copied to the
 *               GP register and stored to (pdst + 3 * stride)
 *               'pdst' is converted to a byte pointer, so 'stride' is a byte
 *               count.
 */
#define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride) do { \
    uint8_t* const pblk_4x4_m = (uint8_t*)(pdst); \
    const uint32_t out0_m = __msa_copy_s_w((v4i32)(in0), (idx0)); \
    const uint32_t out1_m = __msa_copy_s_w((v4i32)(in0), (idx1)); \
    const uint32_t out2_m = __msa_copy_s_w((v4i32)(in1), (idx2)); \
    const uint32_t out3_m = __msa_copy_s_w((v4i32)(in1), (idx3)); \
    SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, (stride)); \
  } while (0)

/* Stores word elements 0..3 of 'in0' to rows 0..3 and word elements 0..3 of
 * 'in1' to rows 4..7, where row k starts at (pdst + k * stride) bytes.
 */
#define ST4x8_UB(in0, in1, pdst, stride) do { \
    uint8_t* const pblk_4x8_m = (uint8_t*)(pdst); \
    ST4x4_UB(in0, in0, 0, 1, 2, 3, pblk_4x8_m, (stride)); \
    ST4x4_UB(in1, in1, 0, 1, 2, 3, pblk_4x8_m + 4 * (stride), (stride)); \
  } while (0)
410*b2055c35SXin Li
/* Description : Immediate number of elements to slide
 * Arguments   : Inputs  - in0, in1, slide_val
 *               Outputs - result of the expression
 *               Return Type - as per RTYPE
 * Details     : Byte elements from 'in1' vector are slid into 'in0' by
 *               value specified in the 'slide_val'
 */
#define SLDI_B(RTYPE, in0, in1, slide_val) \
  ((RTYPE)__msa_sldi_b((v16i8)(in0), (v16i8)(in1), (slide_val)))

#define SLDI_UB(...) SLDI_B(v16u8, __VA_ARGS__)
#define SLDI_SB(...) SLDI_B(v16i8, __VA_ARGS__)
#define SLDI_SH(...) SLDI_B(v8i16, __VA_ARGS__)
424*b2055c35SXin Li
/* Description : Shuffle byte vector elements as per mask vector
 * Arguments   : VSHF_B  : Inputs  - in0, in1, mask
 *                         Result  - value of the expression
 *               VSHF_B2 : Inputs  - in0, in1, in2, in3, mask0, mask1
 *                         Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Byte elements from 'in0' & 'in1' are copied selectively to
 *               'out0' as per control vector 'mask0'; VSHF_B2 does the same
 *               for 'in2' & 'in3' with 'mask1' into 'out1'.
 *               The intrinsic is called with the operands in the order
 *               (mask, in1, in0).
 */
#define VSHF_B(RTYPE, in0, in1, mask) \
  ((RTYPE)__msa_vshf_b((v16i8)(mask), (v16i8)(in1), (v16i8)(in0)))

#define VSHF_UB(...) VSHF_B(v16u8, __VA_ARGS__)
#define VSHF_SB(...) VSHF_B(v16i8, __VA_ARGS__)
#define VSHF_UH(...) VSHF_B(v8u16, __VA_ARGS__)
#define VSHF_SH(...) VSHF_B(v8i16, __VA_ARGS__)

#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) do { \
    out0 = VSHF_B(RTYPE, in0, in1, mask0); \
    out1 = VSHF_B(RTYPE, in2, in3, mask1); \
  } while (0)
#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__)
#define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__)
#define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__)
#define VSHF_B2_SH(...) VSHF_B2(v8i16, __VA_ARGS__)
448*b2055c35SXin Li
/* Description : Shuffle halfword vector elements as per mask vector
 * Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : halfword elements from 'in0' & 'in1' are copied selectively to
 *               'out0' as per control vector 'mask0'; likewise 'in2' & 'in3'
 *               with 'mask1' into 'out1'.
 *               The intrinsic is called with the operands in the order
 *               (mask, second input, first input).
 */
#define VSHF_H2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) do { \
    out0 = (RTYPE)__msa_vshf_h((v8i16)(mask0), (v8i16)(in1), (v8i16)(in0)); \
    out1 = (RTYPE)__msa_vshf_h((v8i16)(mask1), (v8i16)(in3), (v8i16)(in2)); \
  } while (0)
#define VSHF_H2_UH(...) VSHF_H2(v8u16, __VA_ARGS__)
#define VSHF_H2_SH(...) VSHF_H2(v8i16, __VA_ARGS__)
462*b2055c35SXin Li
/* Description : Dot product of byte vector elements
 * Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Signed byte elements from 'mult0' are multiplied with
 *               signed byte elements from 'cnst0' producing a result
 *               twice the size of input i.e. signed halfword.
 *               The multiplication result of adjacent odd-even elements
 *               are added together and written to the 'out0' vector.
 *               'out1' is computed the same way from 'mult1' and 'cnst1'.
 */
#define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) do { \
    out0 = (RTYPE)__msa_dotp_s_h((v16i8)(mult0), (v16i8)(cnst0)); \
    out1 = (RTYPE)__msa_dotp_s_h((v16i8)(mult1), (v16i8)(cnst1)); \
  } while (0)
#define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__)
478*b2055c35SXin Li
/* Description : Dot product of halfword vector elements
 * Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Signed halfword elements from 'mult0' are multiplied with
 *               signed halfword elements from 'cnst0' producing a result
 *               twice the size of input i.e. signed word.
 *               The multiplication result of adjacent odd-even elements
 *               are added together and written to the 'out0' vector.
 *               'out1' is computed the same way from 'mult1' and 'cnst1'.
 */
#define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) do { \
    out0 = (RTYPE)__msa_dotp_s_w((v8i16)(mult0), (v8i16)(cnst0)); \
    out1 = (RTYPE)__msa_dotp_s_w((v8i16)(mult1), (v8i16)(cnst1)); \
  } while (0)
#define DOTP_SH2_SW(...) DOTP_SH2(v4i32, __VA_ARGS__)
494*b2055c35SXin Li
/* Description : Dot product of unsigned word vector elements
 * Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Unsigned word elements from 'mult0' are multiplied with
 *               unsigned word elements from 'cnst0' producing a result
 *               twice the size of input i.e. unsigned double word.
 *               The multiplication result of adjacent odd-even elements
 *               are added together and written to the 'out0' vector.
 *               'out1' is computed the same way from 'mult1' and 'cnst1'.
 */
#define DOTP_UW2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) do { \
    out0 = (RTYPE)__msa_dotp_u_d((v4u32)(mult0), (v4u32)(cnst0)); \
    out1 = (RTYPE)__msa_dotp_u_d((v4u32)(mult1), (v4u32)(cnst1)); \
  } while (0)
#define DOTP_UW2_UD(...) DOTP_UW2(v2u64, __VA_ARGS__)
510*b2055c35SXin Li
/* Description : Dot product & addition of halfword vector elements
 * Arguments   : Inputs        - mult0, mult1, cnst0, cnst1
 *               Input/output  - out0, out1 (accumulators)
 *               Return Type - as per RTYPE
 * Details     : Signed halfword elements from 'mult0' are multiplied with
 *               signed halfword elements from 'cnst0' producing a result
 *               twice the size of input i.e. signed word.
 *               The multiplication result of adjacent odd-even elements
 *               are added to the 'out0' vector, whose previous value is
 *               passed to the intrinsic as the accumulator.
 *               'out1' is updated the same way from 'mult1' and 'cnst1'.
 */
#define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) do { \
    out0 = (RTYPE)__msa_dpadd_s_w((v4i32)(out0), (v8i16)(mult0), \
                                  (v8i16)(cnst0)); \
    out1 = (RTYPE)__msa_dpadd_s_w((v4i32)(out1), (v8i16)(mult1), \
                                  (v8i16)(cnst1)); \
  } while (0)
#define DPADD_SH2_SW(...) DPADD_SH2(v4i32, __VA_ARGS__)
526*b2055c35SXin Li
/* Description : Clips all signed halfword elements of input vector
 *               between 0 & 255
 * Arguments   : Input/output - val (modified in place; the macro is a
 *               statement and yields no value)
 * Details     : Each element is first raised to at least 0, then limited to
 *               at most 255. CLIP_SH2_0_255 / CLIP_SH4_0_255 apply the same
 *               clipping to two / four vectors.
 */
#define CLIP_SH_0_255(val) do { \
    const v8i16 max_m = __msa_ldi_h(255); \
    val = __msa_maxi_s_h((v8i16)(val), 0); \
    val = __msa_min_s_h(max_m, (v8i16)(val)); \
  } while (0)

#define CLIP_SH2_0_255(in0, in1) do { \
    CLIP_SH_0_255(in0); \
    CLIP_SH_0_255(in1); \
  } while (0)

#define CLIP_SH4_0_255(in0, in1, in2, in3) do { \
    CLIP_SH2_0_255(in0, in1); \
    CLIP_SH2_0_255(in2, in3); \
  } while (0)
547*b2055c35SXin Li
/* Description : Clips all unsigned halfword elements of input vector
 *               between 0 & 255
 * Arguments   : Input/output - in (modified in place; the macro is a
 *               statement and yields no value)
 * Details     : Each element is passed through an unsigned max with 0 and
 *               then an unsigned min with 255. CLIP_UH2_0_255 applies the
 *               same clipping to two vectors.
 */
#define CLIP_UH_0_255(in) do { \
    const v8u16 max_m = (v8u16)__msa_ldi_h(255); \
    in = __msa_maxi_u_h((v8u16)(in), 0); \
    in = __msa_min_u_h((v8u16)max_m, (v8u16)(in)); \
  } while (0)

#define CLIP_UH2_0_255(in0, in1) do { \
    CLIP_UH_0_255(in0); \
    CLIP_UH_0_255(in1); \
  } while (0)
564*b2055c35SXin Li
/* Description : Clips all signed word elements of input vector
 *               between 0 & 255
 * Arguments   : Input/output - val (modified in place; the macro is a
 *               statement and yields no value)
 * Details     : Each element is first raised to at least 0, then limited to
 *               at most 255. CLIP_SW4_0_255 applies the same clipping to
 *               four vectors.
 */
#define CLIP_SW_0_255(val) do { \
    const v4i32 max_m = __msa_ldi_w(255); \
    val = __msa_maxi_s_w((v4i32)(val), 0); \
    val = __msa_min_s_w(max_m, (v4i32)(val)); \
  } while (0)

#define CLIP_SW4_0_255(in0, in1, in2, in3) do { \
    CLIP_SW_0_255(in0); \
    CLIP_SW_0_255(in1); \
    CLIP_SW_0_255(in2); \
    CLIP_SW_0_255(in3); \
  } while (0)
582*b2055c35SXin Li
583*b2055c35SXin Li /* Description : Horizontal addition of 4 signed word elements of input vector
584*b2055c35SXin Li * Arguments : Input - in (signed word vector)
585*b2055c35SXin Li * Output - sum_m (i32 sum)
586*b2055c35SXin Li * Return Type - signed word (GP)
587*b2055c35SXin Li * Details : 4 signed word elements of 'in' vector are added together and
588*b2055c35SXin Li * the resulting integer sum is returned
589*b2055c35SXin Li */
func_hadd_sw_s32(v4i32 in)590*b2055c35SXin Li static WEBP_INLINE int32_t func_hadd_sw_s32(v4i32 in) {
591*b2055c35SXin Li const v2i64 res0_m = __msa_hadd_s_d((v4i32)in, (v4i32)in);
592*b2055c35SXin Li const v2i64 res1_m = __msa_splati_d(res0_m, 1);
593*b2055c35SXin Li const v2i64 out = res0_m + res1_m;
594*b2055c35SXin Li int32_t sum_m = __msa_copy_s_w((v4i32)out, 0);
595*b2055c35SXin Li return sum_m;
596*b2055c35SXin Li }
597*b2055c35SXin Li #define HADD_SW_S32(in) func_hadd_sw_s32(in)
598*b2055c35SXin Li
599*b2055c35SXin Li /* Description : Horizontal addition of 8 signed halfword elements
600*b2055c35SXin Li * Arguments : Input - in (signed halfword vector)
601*b2055c35SXin Li * Output - sum_m (s32 sum)
602*b2055c35SXin Li * Return Type - signed word
603*b2055c35SXin Li * Details : 8 signed halfword elements of input vector are added
604*b2055c35SXin Li * together and the resulting integer sum is returned
605*b2055c35SXin Li */
func_hadd_sh_s32(v8i16 in)606*b2055c35SXin Li static WEBP_INLINE int32_t func_hadd_sh_s32(v8i16 in) {
607*b2055c35SXin Li const v4i32 res = __msa_hadd_s_w(in, in);
608*b2055c35SXin Li const v2i64 res0 = __msa_hadd_s_d(res, res);
609*b2055c35SXin Li const v2i64 res1 = __msa_splati_d(res0, 1);
610*b2055c35SXin Li const v2i64 res2 = res0 + res1;
611*b2055c35SXin Li const int32_t sum_m = __msa_copy_s_w((v4i32)res2, 0);
612*b2055c35SXin Li return sum_m;
613*b2055c35SXin Li }
614*b2055c35SXin Li #define HADD_SH_S32(in) func_hadd_sh_s32(in)
615*b2055c35SXin Li
616*b2055c35SXin Li /* Description : Horizontal addition of 8 unsigned halfword elements
617*b2055c35SXin Li * Arguments : Input - in (unsigned halfword vector)
618*b2055c35SXin Li * Output - sum_m (u32 sum)
619*b2055c35SXin Li * Return Type - unsigned word
620*b2055c35SXin Li * Details : 8 unsigned halfword elements of input vector are added
621*b2055c35SXin Li * together and the resulting integer sum is returned
622*b2055c35SXin Li */
func_hadd_uh_u32(v8u16 in)623*b2055c35SXin Li static WEBP_INLINE uint32_t func_hadd_uh_u32(v8u16 in) {
624*b2055c35SXin Li uint32_t sum_m;
625*b2055c35SXin Li const v4u32 res_m = __msa_hadd_u_w(in, in);
626*b2055c35SXin Li v2u64 res0_m = __msa_hadd_u_d(res_m, res_m);
627*b2055c35SXin Li v2u64 res1_m = (v2u64)__msa_splati_d((v2i64)res0_m, 1);
628*b2055c35SXin Li res0_m = res0_m + res1_m;
629*b2055c35SXin Li sum_m = __msa_copy_s_w((v4i32)res0_m, 0);
630*b2055c35SXin Li return sum_m;
631*b2055c35SXin Li }
632*b2055c35SXin Li #define HADD_UH_U32(in) func_hadd_uh_u32(in)
633*b2055c35SXin Li
/* Description : Horizontal addition of signed half word vector elements
 * Arguments   : Inputs  - in0, in1
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Each signed odd half word element from 'in0' is added to
 *               even signed half word element from 'in0' (pairwise) and the
 *               word result is written in 'out0'. 'in1' is reduced the same
 *               way into 'out1'.
 *               Arguments are parenthesized so that an expression argument
 *               is cast as a whole. Each input is expanded twice, so avoid
 *               arguments with side effects.
 */
#define HADD_SH2(RTYPE, in0, in1, out0, out1) do {                \
  (out0) = (RTYPE)__msa_hadd_s_w((v8i16)(in0), (v8i16)(in0));     \
  (out1) = (RTYPE)__msa_hadd_s_w((v8i16)(in1), (v8i16)(in1));     \
} while (0)
#define HADD_SH2_SW(...) HADD_SH2(v4i32, __VA_ARGS__)
647*b2055c35SXin Li
/* Four-vector form of HADD_SH2: in0..in3 are reduced into out0..out3.
 * Inputs are forwarded in parentheses so that expression arguments stay
 * grouped when HADD_SH2 applies its casts.
 */
#define HADD_SH4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3) do {  \
  HADD_SH2(RTYPE, (in0), (in1), out0, out1);                              \
  HADD_SH2(RTYPE, (in2), (in3), out2, out3);                              \
} while (0)
#define HADD_SH4_SW(...) HADD_SH4(v4i32, __VA_ARGS__)
653*b2055c35SXin Li
/* Description : Horizontal subtraction of unsigned byte vector elements
 * Arguments   : Inputs  - in0, in1
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Each unsigned odd byte element from 'in0' is subtracted from
 *               even unsigned byte element from 'in0' (pairwise) and the
 *               halfword result is written to 'out0'. 'in1' is processed the
 *               same way into 'out1'.
 *               Arguments are parenthesized so that an expression argument
 *               is cast as a whole. Each input is expanded twice, so avoid
 *               arguments with side effects.
 */
#define HSUB_UB2(RTYPE, in0, in1, out0, out1) do {                \
  (out0) = (RTYPE)__msa_hsub_u_h((v16u8)(in0), (v16u8)(in0));     \
  (out1) = (RTYPE)__msa_hsub_u_h((v16u8)(in1), (v16u8)(in1));     \
} while (0)
#define HSUB_UB2_UH(...) HSUB_UB2(v8u16, __VA_ARGS__)
#define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__)
#define HSUB_UB2_SW(...) HSUB_UB2(v4i32, __VA_ARGS__)
669*b2055c35SXin Li
/* Description : Set word elements 0 and 1 of a vector to GPR values
 * Arguments   : Inputs - in0, in1
 *               Input/output - out
 *               Return Type - as per RTYPE
 * Details     : 'out' is updated in place: __msa_insert_w writes 'in0' at
 *               element index 0, then 'in1' at element index 1. 'out' is
 *               read before it is written, so it must already be a valid
 *               vector variable.
 */
#define INSERT_W2(RTYPE, word0, word1, out) do {               \
  (out) = (RTYPE)__msa_insert_w((v4i32)(out), 0, (word0));     \
  (out) = (RTYPE)__msa_insert_w((v4i32)(out), 1, (word1));     \
} while (0)
#define INSERT_W2_UB(...) INSERT_W2(v16u8, __VA_ARGS__)
#define INSERT_W2_SB(...) INSERT_W2(v16i8, __VA_ARGS__)
682*b2055c35SXin Li
/* Sets all four word elements of 'out' in place: 'in0'..'in3' are written by
 * __msa_insert_w at element indices 0..3, in that order. The result is cast
 * to RTYPE after every insertion.
 */
#define INSERT_W4(RTYPE, word0, word1, word2, word3, out) do {  \
  (out) = (RTYPE)__msa_insert_w((v4i32)(out), 0, (word0));      \
  (out) = (RTYPE)__msa_insert_w((v4i32)(out), 1, (word1));      \
  (out) = (RTYPE)__msa_insert_w((v4i32)(out), 2, (word2));      \
  (out) = (RTYPE)__msa_insert_w((v4i32)(out), 3, (word3));      \
} while (0)
#define INSERT_W4_UB(...) INSERT_W4(v16u8, __VA_ARGS__)
#define INSERT_W4_SB(...) INSERT_W4(v16i8, __VA_ARGS__)
#define INSERT_W4_SW(...) INSERT_W4(v4i32, __VA_ARGS__)
692*b2055c35SXin Li
/* Description : Set both double word elements of a vector to GPR values
 * Arguments   : Inputs - in0, in1
 *               Input/output - out
 *               Return Type - as per RTYPE
 * Details     : 'out' is updated in place: __msa_insert_d writes 'in0' at
 *               element index 0, then 'in1' at element index 1.
 */
#define INSERT_D2(RTYPE, dword0, dword1, out) do {              \
  (out) = (RTYPE)__msa_insert_d((v2i64)(out), 0, (dword0));     \
  (out) = (RTYPE)__msa_insert_d((v2i64)(out), 1, (dword1));     \
} while (0)
#define INSERT_D2_UB(...) INSERT_D2(v16u8, __VA_ARGS__)
#define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__)
706*b2055c35SXin Li
/* Description : Interleave even byte elements from vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Even byte elements of 'in0' and 'in1' are interleaved
 *               and written to 'out0'; 'in2' and 'in3' are combined the same
 *               way into 'out1'. Note that the second vector of each pair is
 *               passed as the first intrinsic operand.
 *               Arguments are parenthesized so that an expression argument
 *               is cast as a whole.
 */
#define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) do {     \
  (out0) = (RTYPE)__msa_ilvev_b((v16i8)(in1), (v16i8)(in0));     \
  (out1) = (RTYPE)__msa_ilvev_b((v16i8)(in3), (v16i8)(in2));     \
} while (0)
#define ILVEV_B2_UB(...) ILVEV_B2(v16u8, __VA_ARGS__)
#define ILVEV_B2_SB(...) ILVEV_B2(v16i8, __VA_ARGS__)
#define ILVEV_B2_UH(...) ILVEV_B2(v8u16, __VA_ARGS__)
#define ILVEV_B2_SH(...) ILVEV_B2(v8i16, __VA_ARGS__)
#define ILVEV_B2_SD(...) ILVEV_B2(v2i64, __VA_ARGS__)
723*b2055c35SXin Li
/* Description : Interleave odd byte elements from vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Odd byte elements of 'in0' and 'in1' are interleaved
 *               and written to 'out0'; 'in2' and 'in3' are combined the same
 *               way into 'out1'. Note that the second vector of each pair is
 *               passed as the first intrinsic operand.
 *               Arguments are parenthesized so that an expression argument
 *               is cast as a whole.
 */
#define ILVOD_B2(RTYPE, in0, in1, in2, in3, out0, out1) do {     \
  (out0) = (RTYPE)__msa_ilvod_b((v16i8)(in1), (v16i8)(in0));     \
  (out1) = (RTYPE)__msa_ilvod_b((v16i8)(in3), (v16i8)(in2));     \
} while (0)
#define ILVOD_B2_UB(...) ILVOD_B2(v16u8, __VA_ARGS__)
#define ILVOD_B2_SB(...) ILVOD_B2(v16i8, __VA_ARGS__)
#define ILVOD_B2_UH(...) ILVOD_B2(v8u16, __VA_ARGS__)
#define ILVOD_B2_SH(...) ILVOD_B2(v8i16, __VA_ARGS__)
#define ILVOD_B2_SD(...) ILVOD_B2(v2i64, __VA_ARGS__)
740*b2055c35SXin Li
/* Description : Interleave even halfword elements from vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Even halfword elements of 'in0' and 'in1' are interleaved
 *               and written to 'out0'; 'in2' and 'in3' are combined the same
 *               way into 'out1'. Note that the second vector of each pair is
 *               passed as the first intrinsic operand.
 *               Arguments are parenthesized so that an expression argument
 *               is cast as a whole.
 */
#define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) do {     \
  (out0) = (RTYPE)__msa_ilvev_h((v8i16)(in1), (v8i16)(in0));     \
  (out1) = (RTYPE)__msa_ilvev_h((v8i16)(in3), (v8i16)(in2));     \
} while (0)
#define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__)
#define ILVEV_H2_UH(...) ILVEV_H2(v8u16, __VA_ARGS__)
#define ILVEV_H2_SH(...) ILVEV_H2(v8i16, __VA_ARGS__)
#define ILVEV_H2_SW(...) ILVEV_H2(v4i32, __VA_ARGS__)
756*b2055c35SXin Li
/* Description : Interleave odd halfword elements from vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Odd halfword elements of 'in0' and 'in1' are interleaved
 *               and written to 'out0'; 'in2' and 'in3' are combined the same
 *               way into 'out1'. Note that the second vector of each pair is
 *               passed as the first intrinsic operand.
 *               Arguments are parenthesized so that an expression argument
 *               is cast as a whole.
 */
#define ILVOD_H2(RTYPE, in0, in1, in2, in3, out0, out1) do {     \
  (out0) = (RTYPE)__msa_ilvod_h((v8i16)(in1), (v8i16)(in0));     \
  (out1) = (RTYPE)__msa_ilvod_h((v8i16)(in3), (v8i16)(in2));     \
} while (0)
#define ILVOD_H2_UB(...) ILVOD_H2(v16u8, __VA_ARGS__)
#define ILVOD_H2_UH(...) ILVOD_H2(v8u16, __VA_ARGS__)
#define ILVOD_H2_SH(...) ILVOD_H2(v8i16, __VA_ARGS__)
#define ILVOD_H2_SW(...) ILVOD_H2(v4i32, __VA_ARGS__)
772*b2055c35SXin Li
/* Description : Interleave even word elements from vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Even word elements of 'in0' and 'in1' are interleaved
 *               and written to 'out0'; 'in2' and 'in3' are combined the same
 *               way into 'out1'. Note that the second vector of each pair is
 *               passed as the first intrinsic operand.
 *               Arguments are parenthesized so that an expression argument
 *               is cast as a whole.
 */
#define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1) do {     \
  (out0) = (RTYPE)__msa_ilvev_w((v4i32)(in1), (v4i32)(in0));     \
  (out1) = (RTYPE)__msa_ilvev_w((v4i32)(in3), (v4i32)(in2));     \
} while (0)
#define ILVEV_W2_UB(...) ILVEV_W2(v16u8, __VA_ARGS__)
#define ILVEV_W2_SB(...) ILVEV_W2(v16i8, __VA_ARGS__)
#define ILVEV_W2_UH(...) ILVEV_W2(v8u16, __VA_ARGS__)
#define ILVEV_W2_SD(...) ILVEV_W2(v2i64, __VA_ARGS__)
788*b2055c35SXin Li
/* Description : Interleave even-odd word elements from vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Even word elements of 'in0' and 'in1' are interleaved
 *               and written to 'out0'
 *               Odd word elements of 'in2' and 'in3' are interleaved
 *               and written to 'out1'
 *               Note that the second vector of each pair is passed as the
 *               first intrinsic operand. Arguments are parenthesized so that
 *               an expression argument is cast as a whole.
 */
#define ILVEVOD_W2(RTYPE, in0, in1, in2, in3, out0, out1) do {   \
  (out0) = (RTYPE)__msa_ilvev_w((v4i32)(in1), (v4i32)(in0));     \
  (out1) = (RTYPE)__msa_ilvod_w((v4i32)(in3), (v4i32)(in2));     \
} while (0)
#define ILVEVOD_W2_UB(...) ILVEVOD_W2(v16u8, __VA_ARGS__)
#define ILVEVOD_W2_UH(...) ILVEVOD_W2(v8u16, __VA_ARGS__)
#define ILVEVOD_W2_SH(...) ILVEVOD_W2(v8i16, __VA_ARGS__)
#define ILVEVOD_W2_SW(...) ILVEVOD_W2(v4i32, __VA_ARGS__)
806*b2055c35SXin Li
/* Description : Interleave even-odd half-word elements from vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Even half-word elements of 'in0' and 'in1' are interleaved
 *               and written to 'out0'
 *               Odd half-word elements of 'in2' and 'in3' are interleaved
 *               and written to 'out1'
 *               Note that the second vector of each pair is passed as the
 *               first intrinsic operand. Arguments are parenthesized so that
 *               an expression argument is cast as a whole.
 */
#define ILVEVOD_H2(RTYPE, in0, in1, in2, in3, out0, out1) do {   \
  (out0) = (RTYPE)__msa_ilvev_h((v8i16)(in1), (v8i16)(in0));     \
  (out1) = (RTYPE)__msa_ilvod_h((v8i16)(in3), (v8i16)(in2));     \
} while (0)
#define ILVEVOD_H2_UB(...) ILVEVOD_H2(v16u8, __VA_ARGS__)
#define ILVEVOD_H2_UH(...) ILVEVOD_H2(v8u16, __VA_ARGS__)
#define ILVEVOD_H2_SH(...) ILVEVOD_H2(v8i16, __VA_ARGS__)
#define ILVEVOD_H2_SW(...) ILVEVOD_H2(v4i32, __VA_ARGS__)
824*b2055c35SXin Li
/* Description : Interleave even double word elements from vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Even double word elements of 'in0' and 'in1' are interleaved
 *               and written to 'out0'; 'in2' and 'in3' are combined the same
 *               way into 'out1'. Note that the second vector of each pair is
 *               passed as the first intrinsic operand.
 *               Arguments are parenthesized so that an expression argument
 *               is cast as a whole.
 */
#define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) do {     \
  (out0) = (RTYPE)__msa_ilvev_d((v2i64)(in1), (v2i64)(in0));     \
  (out1) = (RTYPE)__msa_ilvev_d((v2i64)(in3), (v2i64)(in2));     \
} while (0)
#define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__)
#define ILVEV_D2_SB(...) ILVEV_D2(v16i8, __VA_ARGS__)
#define ILVEV_D2_SW(...) ILVEV_D2(v4i32, __VA_ARGS__)
#define ILVEV_D2_SD(...) ILVEV_D2(v2i64, __VA_ARGS__)
840*b2055c35SXin Li
/* Description : Interleave left half of byte elements from vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Left half of byte elements of 'in0' and 'in1' are interleaved
 *               and written to 'out0'; 'in2' and 'in3' are combined the same
 *               way into 'out1'.
 *               Arguments are parenthesized so that an expression argument
 *               is cast as a whole.
 */
#define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1) do {     \
  (out0) = (RTYPE)__msa_ilvl_b((v16i8)(in0), (v16i8)(in1));     \
  (out1) = (RTYPE)__msa_ilvl_b((v16i8)(in2), (v16i8)(in3));     \
} while (0)
#define ILVL_B2_UB(...) ILVL_B2(v16u8, __VA_ARGS__)
#define ILVL_B2_SB(...) ILVL_B2(v16i8, __VA_ARGS__)
#define ILVL_B2_UH(...) ILVL_B2(v8u16, __VA_ARGS__)
#define ILVL_B2_SH(...) ILVL_B2(v8i16, __VA_ARGS__)
#define ILVL_B2_SW(...) ILVL_B2(v4i32, __VA_ARGS__)
857*b2055c35SXin Li
/* Description : Interleave right half of byte elements from vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Right half of byte elements of 'in0' and 'in1' are
 *               interleaved and written to 'out0'; 'in2' and 'in3' are
 *               combined the same way into 'out1'.
 *               Arguments are parenthesized so that an expression argument
 *               is cast as a whole.
 */
#define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1) do {     \
  (out0) = (RTYPE)__msa_ilvr_b((v16i8)(in0), (v16i8)(in1));     \
  (out1) = (RTYPE)__msa_ilvr_b((v16i8)(in2), (v16i8)(in3));     \
} while (0)
#define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__)
#define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__)
#define ILVR_B2_UH(...) ILVR_B2(v8u16, __VA_ARGS__)
#define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__)
#define ILVR_B2_SW(...) ILVR_B2(v4i32, __VA_ARGS__)
874*b2055c35SXin Li
/* Four-pair form of ILVR_B2: (in0, in1) -> out0, (in2, in3) -> out1,
 * (in4, in5) -> out2 and (in6, in7) -> out3.
 * Inputs are forwarded in parentheses so that expression arguments stay
 * grouped when ILVR_B2 applies its casts.
 */
#define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3) do {                    \
  ILVR_B2(RTYPE, (in0), (in1), (in2), (in3), out0, out1);       \
  ILVR_B2(RTYPE, (in4), (in5), (in6), (in7), out2, out3);       \
} while (0)
#define ILVR_B4_UB(...) ILVR_B4(v16u8, __VA_ARGS__)
#define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__)
#define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__)
#define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__)
#define ILVR_B4_SW(...) ILVR_B4(v4i32, __VA_ARGS__)
885*b2055c35SXin Li
/* Description : Interleave right half of halfword elements from vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Right half of halfword elements of 'in0' and 'in1' are
 *               interleaved and written to 'out0'; 'in2' and 'in3' are
 *               combined the same way into 'out1'.
 *               Arguments are parenthesized so that an expression argument
 *               is cast as a whole.
 */
#define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1) do {     \
  (out0) = (RTYPE)__msa_ilvr_h((v8i16)(in0), (v8i16)(in1));     \
  (out1) = (RTYPE)__msa_ilvr_h((v8i16)(in2), (v8i16)(in3));     \
} while (0)
#define ILVR_H2_UB(...) ILVR_H2(v16u8, __VA_ARGS__)
#define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__)
#define ILVR_H2_SW(...) ILVR_H2(v4i32, __VA_ARGS__)
900*b2055c35SXin Li
/* Four-pair form of ILVR_H2: (in0, in1) -> out0, (in2, in3) -> out1,
 * (in4, in5) -> out2 and (in6, in7) -> out3.
 * Inputs are forwarded in parentheses so that expression arguments stay
 * grouped when ILVR_H2 applies its casts.
 */
#define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3) do {                    \
  ILVR_H2(RTYPE, (in0), (in1), (in2), (in3), out0, out1);       \
  ILVR_H2(RTYPE, (in4), (in5), (in6), (in7), out2, out3);       \
} while (0)
#define ILVR_H4_UB(...) ILVR_H4(v16u8, __VA_ARGS__)
#define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__)
#define ILVR_H4_SW(...) ILVR_H4(v4i32, __VA_ARGS__)
909*b2055c35SXin Li
/* Description : Interleave right half of double word elements from vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Right half of double word elements of 'in0' and 'in1' are
 *               interleaved and written to 'out0'; 'in2' and 'in3' are
 *               combined the same way into 'out1'.
 *               Arguments are parenthesized so that an expression argument
 *               is cast as a whole.
 */
#define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1) do {     \
  (out0) = (RTYPE)__msa_ilvr_d((v2i64)(in0), (v2i64)(in1));     \
  (out1) = (RTYPE)__msa_ilvr_d((v2i64)(in2), (v2i64)(in3));     \
} while (0)
#define ILVR_D2_UB(...) ILVR_D2(v16u8, __VA_ARGS__)
#define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__)
#define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__)
924*b2055c35SXin Li
/* Four-pair form of ILVR_D2: (in0, in1) -> out0, (in2, in3) -> out1,
 * (in4, in5) -> out2 and (in6, in7) -> out3.
 * Inputs are forwarded in parentheses so that expression arguments stay
 * grouped when ILVR_D2 applies its casts.
 */
#define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3) do {                    \
  ILVR_D2(RTYPE, (in0), (in1), (in2), (in3), out0, out1);       \
  ILVR_D2(RTYPE, (in4), (in5), (in6), (in7), out2, out3);       \
} while (0)
#define ILVR_D4_SB(...) ILVR_D4(v16i8, __VA_ARGS__)
#define ILVR_D4_UB(...) ILVR_D4(v16u8, __VA_ARGS__)
932*b2055c35SXin Li
/* Description : Interleave both left and right half of input vectors
 * Arguments   : Inputs  - in0, in1
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Right half of byte elements from 'in0' and 'in1' are
 *               interleaved and written to 'out0'; the left half is
 *               interleaved by __msa_ilvl_b and written to 'out1'.
 *               Arguments are parenthesized so that an expression argument
 *               is cast as a whole. Each input is expanded twice, so avoid
 *               arguments with side effects.
 */
#define ILVRL_B2(RTYPE, in0, in1, out0, out1) do {              \
  (out0) = (RTYPE)__msa_ilvr_b((v16i8)(in0), (v16i8)(in1));     \
  (out1) = (RTYPE)__msa_ilvl_b((v16i8)(in0), (v16i8)(in1));     \
} while (0)
#define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__)
#define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__)
#define ILVRL_B2_UH(...) ILVRL_B2(v8u16, __VA_ARGS__)
#define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__)
#define ILVRL_B2_SW(...) ILVRL_B2(v4i32, __VA_ARGS__)
949*b2055c35SXin Li
/* Halfword form of ILVRL_B2: __msa_ilvr_h of ('in0', 'in1') is written to
 * 'out0' and __msa_ilvl_h of the same pair to 'out1'.
 * Arguments are parenthesized so that an expression argument is cast as a
 * whole. Each input is expanded twice, so avoid arguments with side effects.
 */
#define ILVRL_H2(RTYPE, in0, in1, out0, out1) do {              \
  (out0) = (RTYPE)__msa_ilvr_h((v8i16)(in0), (v8i16)(in1));     \
  (out1) = (RTYPE)__msa_ilvl_h((v8i16)(in0), (v8i16)(in1));     \
} while (0)
#define ILVRL_H2_UB(...) ILVRL_H2(v16u8, __VA_ARGS__)
#define ILVRL_H2_SB(...) ILVRL_H2(v16i8, __VA_ARGS__)
#define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__)
#define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__)
#define ILVRL_H2_UW(...) ILVRL_H2(v4u32, __VA_ARGS__)
959*b2055c35SXin Li
/* Word form of ILVRL_B2: __msa_ilvr_w of ('in0', 'in1') is written to 'out0'
 * and __msa_ilvl_w of the same pair to 'out1'.
 * Arguments are parenthesized so that an expression argument is cast as a
 * whole. Each input is expanded twice, so avoid arguments with side effects.
 */
#define ILVRL_W2(RTYPE, in0, in1, out0, out1) do {              \
  (out0) = (RTYPE)__msa_ilvr_w((v4i32)(in0), (v4i32)(in1));     \
  (out1) = (RTYPE)__msa_ilvl_w((v4i32)(in0), (v4i32)(in1));     \
} while (0)
#define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__)
#define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__)
#define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)
#define ILVRL_W2_UW(...) ILVRL_W2(v4u32, __VA_ARGS__)
968*b2055c35SXin Li
/* Description : Pack even byte elements of vector pairs
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Even byte elements of 'in0' are copied to the left half of
 *               'out0' & even byte elements of 'in1' are copied to the right
 *               half of 'out0'. 'in2' and 'in3' are packed the same way into
 *               'out1'.
 *               Arguments are parenthesized so that an expression argument
 *               is cast as a whole.
 */
#define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) do {     \
  (out0) = (RTYPE)__msa_pckev_b((v16i8)(in0), (v16i8)(in1));     \
  (out1) = (RTYPE)__msa_pckev_b((v16i8)(in2), (v16i8)(in3));     \
} while (0)
#define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__)
#define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__)
#define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__)
#define PCKEV_B2_SW(...) PCKEV_B2(v4i32, __VA_ARGS__)
985*b2055c35SXin Li
/* Four-pair form of PCKEV_B2: (in0, in1) -> out0, (in2, in3) -> out1,
 * (in4, in5) -> out2 and (in6, in7) -> out3.
 * Inputs are forwarded in parentheses so that expression arguments stay
 * grouped when PCKEV_B2 applies its casts.
 */
#define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                 out0, out1, out2, out3) do {                    \
  PCKEV_B2(RTYPE, (in0), (in1), (in2), (in3), out0, out1);       \
  PCKEV_B2(RTYPE, (in4), (in5), (in6), (in7), out2, out3);       \
} while (0)
#define PCKEV_B4_SB(...) PCKEV_B4(v16i8, __VA_ARGS__)
#define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__)
#define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__)
#define PCKEV_B4_SW(...) PCKEV_B4(v4i32, __VA_ARGS__)
995*b2055c35SXin Li
/* Description : Pack even halfword elements of vector pairs
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Even halfword elements of 'in0' are copied to the left half
 *               of 'out0' & even halfword elements of 'in1' are copied to
 *               the right half of 'out0'. 'in2' and 'in3' are packed the
 *               same way into 'out1'.
 *               Arguments are parenthesized so that an expression argument
 *               is cast as a whole.
 */
#define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) do {     \
  (out0) = (RTYPE)__msa_pckev_h((v8i16)(in0), (v8i16)(in1));     \
  (out1) = (RTYPE)__msa_pckev_h((v8i16)(in2), (v8i16)(in3));     \
} while (0)
#define PCKEV_H2_UH(...) PCKEV_H2(v8u16, __VA_ARGS__)
#define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__)
#define PCKEV_H2_SW(...) PCKEV_H2(v4i32, __VA_ARGS__)
#define PCKEV_H2_UW(...) PCKEV_H2(v4u32, __VA_ARGS__)
1012*b2055c35SXin Li
/* Description : Pack even word elements of vector pairs
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Even word elements of 'in0' are copied to the left half of
 *               'out0' & even word elements of 'in1' are copied to the
 *               right half of 'out0'. 'in2' and 'in3' are packed the same
 *               way into 'out1'.
 *               Arguments are parenthesized so that an expression argument
 *               is cast as a whole.
 */
#define PCKEV_W2(RTYPE, in0, in1, in2, in3, out0, out1) do {     \
  (out0) = (RTYPE)__msa_pckev_w((v4i32)(in0), (v4i32)(in1));     \
  (out1) = (RTYPE)__msa_pckev_w((v4i32)(in2), (v4i32)(in3));     \
} while (0)
#define PCKEV_W2_UH(...) PCKEV_W2(v8u16, __VA_ARGS__)
#define PCKEV_W2_SH(...) PCKEV_W2(v8i16, __VA_ARGS__)
#define PCKEV_W2_SW(...) PCKEV_W2(v4i32, __VA_ARGS__)
#define PCKEV_W2_UW(...) PCKEV_W2(v4u32, __VA_ARGS__)
1029*b2055c35SXin Li
/* Description : Pack odd halfword elements of vector pairs
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Odd halfword elements of 'in0' are copied to the left half
 *               of 'out0' & odd halfword elements of 'in1' are copied to
 *               the right half of 'out0'. 'in2' and 'in3' are packed the
 *               same way into 'out1'.
 *               Arguments are parenthesized so that an expression argument
 *               is cast as a whole.
 */
#define PCKOD_H2(RTYPE, in0, in1, in2, in3, out0, out1) do {     \
  (out0) = (RTYPE)__msa_pckod_h((v8i16)(in0), (v8i16)(in1));     \
  (out1) = (RTYPE)__msa_pckod_h((v8i16)(in2), (v8i16)(in3));     \
} while (0)
#define PCKOD_H2_UH(...) PCKOD_H2(v8u16, __VA_ARGS__)
#define PCKOD_H2_SH(...) PCKOD_H2(v8i16, __VA_ARGS__)
#define PCKOD_H2_SW(...) PCKOD_H2(v4i32, __VA_ARGS__)
#define PCKOD_H2_UW(...) PCKOD_H2(v4u32, __VA_ARGS__)
1046*b2055c35SXin Li
/* Description : Arithmetic immediate shift right all elements of word vector
 * Arguments   : Inputs  - in0, in1, shift_val
 *               Outputs - in place operation
 *               Return Type - as per input vector RTYPE
 * Details     : Each element of vector 'in0' is right shifted by 'shift_val'
 *               and the result is written in-place; 'in1' likewise.
 *               The shift is delegated to SRAI_W. Under CLANG_BUILD that is
 *               the __msa_srai_w intrinsic, so 'shift_val' should be a
 *               compile-time constant to stay portable.
 *               Arguments are forwarded in parentheses so that expression
 *               arguments stay grouped inside SRAI_W.
 */
#define SRAI_W2(RTYPE, in0, in1, shift_val) do {      \
  (in0) = (RTYPE)SRAI_W((in0), (shift_val));          \
  (in1) = (RTYPE)SRAI_W((in1), (shift_val));          \
} while (0)
#define SRAI_W2_SW(...) SRAI_W2(v4i32, __VA_ARGS__)
#define SRAI_W2_UW(...) SRAI_W2(v4u32, __VA_ARGS__)
1060*b2055c35SXin Li
/* Four-vector form of SRAI_W2: in0..in3 are each shifted right in place by
 * 'shift_val'. The shift amount is forwarded in parentheses so that an
 * expression argument stays grouped inside SRAI_W2.
 */
#define SRAI_W4(RTYPE, in0, in1, in2, in3, shift_val) do {  \
  SRAI_W2(RTYPE, in0, in1, (shift_val));                    \
  SRAI_W2(RTYPE, in2, in3, (shift_val));                    \
} while (0)
#define SRAI_W4_SW(...) SRAI_W4(v4i32, __VA_ARGS__)
#define SRAI_W4_UW(...) SRAI_W4(v4u32, __VA_ARGS__)
1067*b2055c35SXin Li
/* Description : Arithmetic shift right all elements of half-word vector
 * Arguments   : Inputs  - in0, in1, shift_val
 *               Outputs - in place operation
 *               Return Type - as per input vector RTYPE
 * Details     : Each element of vector 'in0' is right shifted by 'shift_val'
 *               and the result is written in-place; 'in1' likewise.
 *               The shift is delegated to SRAI_H. Under CLANG_BUILD that is
 *               the __msa_srai_h intrinsic, so 'shift_val' should be a
 *               compile-time constant to stay portable.
 *               Arguments are forwarded in parentheses so that expression
 *               arguments stay grouped inside SRAI_H.
 */
#define SRAI_H2(RTYPE, in0, in1, shift_val) do {      \
  (in0) = (RTYPE)SRAI_H((in0), (shift_val));          \
  (in1) = (RTYPE)SRAI_H((in1), (shift_val));          \
} while (0)
#define SRAI_H2_SH(...) SRAI_H2(v8i16, __VA_ARGS__)
#define SRAI_H2_UH(...) SRAI_H2(v8u16, __VA_ARGS__)
1081*b2055c35SXin Li
/* Description : Arithmetic rounded shift right all elements of word vector
 * Arguments   : Inputs  - in0, in1, shift
 *               Outputs - in place operation
 *               Return Type - as per input vector RTYPE
 * Details     : Each element of vector 'in0' is right shifted by 'shift' and
 *               the result is written in-place; 'in1' likewise.
 *               'shift' is handed unchanged to __msa_srari_w.
 *               NOTE(review): that intrinsic looks like the immediate form,
 *               so 'shift' presumably has to be a compile-time constant -
 *               confirm against the MSA intrinsics documentation.
 */
#define SRARI_W2(RTYPE, vec0, vec1, shift) do {                 \
  (vec0) = (RTYPE)__msa_srari_w((v4i32)(vec0), shift);          \
  (vec1) = (RTYPE)__msa_srari_w((v4i32)(vec1), shift);          \
} while (0)
#define SRARI_W2_SW(...) SRARI_W2(v4i32, __VA_ARGS__)
1094*b2055c35SXin Li
/* Four-vector form of SRARI_W2: the first pair and then the second pair of
 * vectors are rounded-shifted right in place by 'shift'.
 */
#define SRARI_W4(RTYPE, vec0, vec1, vec2, vec3, shift) do {  \
  SRARI_W2(RTYPE, vec0, vec1, shift);                        \
  SRARI_W2(RTYPE, vec2, vec3, shift);                        \
} while (0)
#define SRARI_W4_SH(...) SRARI_W4(v8i16, __VA_ARGS__)
#define SRARI_W4_UW(...) SRARI_W4(v4u32, __VA_ARGS__)
#define SRARI_W4_SW(...) SRARI_W4(v4i32, __VA_ARGS__)
1102*b2055c35SXin Li
/* Description : Shift right arithmetic rounded double words
 * Arguments   : Inputs  - in0, in1, shift
 *               Outputs - in place operation
 *               Return Type - as per RTYPE
 * Details     : Each element of vector 'in0' is shifted right arithmetically
 *               by the number of bits in the corresponding element in the
 *               vector 'shift'. The last discarded bit is added to shifted
 *               value for rounding and the result is written in-place.
 *               'in1' is processed the same way. 'shift' is a vector.
 *               Arguments are parenthesized so that an expression argument
 *               is cast as a whole. 'shift' is expanded twice, so avoid an
 *               argument with side effects.
 */
#define SRAR_D2(RTYPE, in0, in1, shift) do {                       \
  (in0) = (RTYPE)__msa_srar_d((v2i64)(in0), (v2i64)(shift));       \
  (in1) = (RTYPE)__msa_srar_d((v2i64)(in1), (v2i64)(shift));       \
} while (0)
#define SRAR_D2_SW(...) SRAR_D2(v4i32, __VA_ARGS__)
#define SRAR_D2_SD(...) SRAR_D2(v2i64, __VA_ARGS__)
#define SRAR_D2_UD(...) SRAR_D2(v2u64, __VA_ARGS__)
1120*b2055c35SXin Li
1121*b2055c35SXin Li #define SRAR_D4(RTYPE, in0, in1, in2, in3, shift) do { \
1122*b2055c35SXin Li SRAR_D2(RTYPE, in0, in1, shift); \
1123*b2055c35SXin Li SRAR_D2(RTYPE, in2, in3, shift); \
1124*b2055c35SXin Li } while (0)
1125*b2055c35SXin Li #define SRAR_D4_SD(...) SRAR_D4(v2i64, __VA_ARGS__)
1126*b2055c35SXin Li #define SRAR_D4_UD(...) SRAR_D4(v2u64, __VA_ARGS__)
1127*b2055c35SXin Li
/* Description : Addition of a small constant to 2 half-word vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : out0 = ADDVI_H(in0, in1) and out1 = ADDVI_H(in2, in3), each
 *               cast to RTYPE. 'in0'/'in2' are the vectors, 'in1'/'in3' the
 *               addends: in clang builds ADDVI_H maps to __msa_addvi_h,
 *               which takes the addend as an immediate; otherwise ADDVI_H
 *               expands to a plain '+' of its raw arguments.
 *               The inputs are therefore parenthesized here so that compound
 *               argument expressions keep their intended grouping.
 */
#define ADDVI_H2(RTYPE, in0, in1, in2, in3, out0, out1) do {  \
  (out0) = (RTYPE)ADDVI_H((in0), (in1));                      \
  (out1) = (RTYPE)ADDVI_H((in2), (in3));                      \
} while (0)
#define ADDVI_H2_SH(...) ADDVI_H2(v8i16, __VA_ARGS__)
#define ADDVI_H2_UH(...) ADDVI_H2(v8u16, __VA_ARGS__)
1140*b2055c35SXin Li
/* Description : Addition of a small constant to 2 word vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : out0 = ADDVI_W(in0, in1) and out1 = ADDVI_W(in2, in3), each
 *               cast to RTYPE. 'in0'/'in2' are the vectors, 'in1'/'in3' the
 *               addends: in clang builds ADDVI_W maps to __msa_addvi_w,
 *               which takes the addend as an immediate.
 *               The inputs are parenthesized before being forwarded so that
 *               compound argument expressions keep their intended grouping
 *               whatever ADDVI_W expands to.
 */
#define ADDVI_W2(RTYPE, in0, in1, in2, in3, out0, out1) do {  \
  (out0) = (RTYPE)ADDVI_W((in0), (in1));                      \
  (out1) = (RTYPE)ADDVI_W((in2), (in3));                      \
} while (0)
#define ADDVI_W2_SW(...) ADDVI_W2(v4i32, __VA_ARGS__)
1152*b2055c35SXin Li
/* Description : Fill 2 word vectors from 2 GP register values
 * Arguments   : Inputs  - in0, in1
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : __msa_fill_w(in0) is cast to RTYPE and stored in 'out0';
 *               __msa_fill_w(in1) is cast to RTYPE and stored in 'out1'.
 *               'out0' is written before 'in1' is evaluated.
 */
#define FILL_W2(RTYPE, in0, in1, out0, out1) do {  \
  out0 = (RTYPE)__msa_fill_w(in0);                 \
  out1 = (RTYPE)__msa_fill_w(in1);                 \
} while (0)
#define FILL_W2_SW(...) FILL_W2(v4i32, __VA_ARGS__)
1164*b2055c35SXin Li
/* Description : Addition of 2 pairs of vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 * Details     : out0 = in0 + in1 and out1 = in2 + in3, using the '+' operator
 *               of the operand type.
 *               Each input is parenthesized so that an argument that is
 *               itself an expression (e.g. 'a << 1') is added as a whole.
 *               'out0' is assigned before 'in2'/'in3' are evaluated, so it
 *               must not alias either of them.
 */
#define ADD2(in0, in1, in2, in3, out0, out1) do {  \
  (out0) = (in0) + (in1);                          \
  (out1) = (in2) + (in3);                          \
} while (0)

/* Four-pair variant: ADD2 on (in0..in3), then ADD2 on (in4..in7). */
#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7,  \
             out0, out1, out2, out3) do {             \
  ADD2(in0, in1, in2, in3, out0, out1);               \
  ADD2(in4, in5, in6, in7, out2, out3);               \
} while (0)
1181*b2055c35SXin Li
/* Description : Subtraction of 2 pairs of vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 * Details     : out0 = in0 - in1 and out1 = in2 - in3, using the '-' operator
 *               of the operand type.
 *               Each input is parenthesized: without that, a subtrahend such
 *               as 'a + b' would expand to 'in0 - a + b'.
 *               Outputs are assigned in order, so an output must not alias
 *               an input of a later pair.
 */
#define SUB2(in0, in1, in2, in3, out0, out1) do {  \
  (out0) = (in0) - (in1);                          \
  (out1) = (in2) - (in3);                          \
} while (0)

/* Three-pair variant: out0 = in0 - in1, out1 = in2 - in3, out2 = in4 - in5. */
#define SUB3(in0, in1, in2, in3, in4, in5, out0, out1, out2) do {  \
  (out0) = (in0) - (in1);                                          \
  (out1) = (in2) - (in3);                                          \
  (out2) = (in4) - (in5);                                          \
} while (0)

/* Four-pair variant: SUB2 on (in0..in3), then SUB2 on (in4..in7). */
#define SUB4(in0, in1, in2, in3, in4, in5, in6, in7,  \
             out0, out1, out2, out3) do {             \
  SUB2(in0, in1, in2, in3, out0, out1);               \
  SUB2(in4, in5, in6, in7, out2, out3);               \
} while (0)
1206*b2055c35SXin Li
/* Description : Addition - Subtraction of input vectors
 * Arguments   : Inputs  - in0, in1
 *               Outputs - out0, out1
 * Details     : out0 = in0 + in1, then out1 = in0 - in1.
 *               Each input is parenthesized so that an argument that is
 *               itself an expression is used as a whole (otherwise
 *               'in0 - a + b' would be computed for in1 = 'a + b').
 *               'in0' and 'in1' are evaluated twice, and 'out0' is written
 *               before the subtraction reads them, so 'out0' must not alias
 *               'in0' or 'in1'.
 */
#define ADDSUB2(in0, in1, out0, out1) do {  \
  (out0) = (in0) + (in1);                   \
  (out1) = (in0) - (in1);                   \
} while (0)
1219*b2055c35SXin Li
/* Description : Multiplication of pairs of vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 * Details     : out0 = in0 * in1 and out1 = in2 * in3, using the '*' operator
 *               of the operand type.
 *               Each input is parenthesized: without that, a factor such as
 *               'a + b' would expand to 'a + b * in1'.
 *               'out0' is assigned before 'in2'/'in3' are evaluated, so it
 *               must not alias either of them.
 */
#define MUL2(in0, in1, in2, in3, out0, out1) do {  \
  (out0) = (in0) * (in1);                          \
  (out1) = (in2) * (in3);                          \
} while (0)

/* Four-pair variant: MUL2 on (in0..in3), then MUL2 on (in4..in7). */
#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7,  \
             out0, out1, out2, out3) do {             \
  MUL2(in0, in1, in2, in3, out0, out1);               \
  MUL2(in4, in5, in6, in7, out2, out3);               \
} while (0)
1236*b2055c35SXin Li
/* Description : Sign extend halfword elements from right half of the vector
 * Arguments   : Input  - in  (halfword vector)
 *               Output - out (sign extended word vector)
 *               Return Type - signed word
 * Details     : A sign mask is built by comparing every halfword element of
 *               'in' against 0 (__msa_clti_s_h). The mask is then right
 *               interleaved with 'in' itself (__msa_ilvr_h) to generate
 *               4 word elements keeping sign intact.
 *               'in' is parenthesized so that the (v8i16) cast covers the
 *               whole argument expression; it is evaluated twice.
 */
#define UNPCK_R_SH_SW(in, out) do {                      \
  const v8i16 sign_m = __msa_clti_s_h((v8i16)(in), 0);   \
  (out) = (v4i32)__msa_ilvr_h(sign_m, (v8i16)(in));      \
} while (0)
1249*b2055c35SXin Li
/* Description : Sign extend halfword elements from input vector and return
 *               the result in pair of vectors
 * Arguments   : Input   - in (halfword vector)
 *               Outputs - out0, out1 (sign extended word vectors)
 *               Return Type - signed word
 * Details     : A sign mask is built by comparing every halfword element of
 *               'in' against 0 (__msa_clti_s_h). ILVRL_H2_SW then interleaves
 *               the mask with 'in' itself: the right interleave generates
 *               4 signed word elements in 'out0' and the left interleave
 *               generates 4 signed word elements in 'out1'.
 *               'in' is parenthesized both under the (v8i16) cast and when
 *               forwarded, so a compound argument expression is used as a
 *               whole; it is expanded more than once.
 */
#define UNPCK_SH_SW(in, out0, out1) do {                \
  const v8i16 tmp_m = __msa_clti_s_h((v8i16)(in), 0);   \
  ILVRL_H2_SW(tmp_m, (in), out0, out1);                 \
} while (0)
1265*b2055c35SXin Li
/* Description : Butterfly of 4 input vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1, out2, out3
 * Details     : out0 = in0 + in3
 *               out1 = in1 + in2
 *               out2 = in1 - in2
 *               out3 = in0 - in3
 *               Each input is parenthesized so that an argument that is
 *               itself an expression is used as a whole. Every input is
 *               evaluated twice and the outputs are assigned in the order
 *               above, so an output must not alias an input that is still
 *               read by a later line.
 */
#define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3) do {  \
  (out0) = (in0) + (in3);                                             \
  (out1) = (in1) + (in2);                                             \
  (out2) = (in1) - (in2);                                             \
  (out3) = (in0) - (in3);                                             \
} while (0)
1277*b2055c35SXin Li
/* Description : Transpose 16x4 block into 4x16 with byte elements in vectors
 * Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7,
 *                         in8, in9, in10, in11, in12, in13, in14, in15
 *               Outputs - out0, out1, out2, out3
 *               Return Type - unsigned byte
 * Details     : The 16 inputs are first combined with even-word and
 *               even-doubleword interleaves, then split with even/odd byte
 *               and halfword interleaves into the four outputs.
 *               'out1' and 'out3' are written early and reused as
 *               intermediate storage before 'in2', 'in3', ... are read, so
 *               the outputs should not alias any of the inputs.
 */
#define TRANSPOSE16x4_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7,        \
                            in8, in9, in10, in11, in12, in13, in14, in15,  \
                            out0, out1, out2, out3) do {                   \
  v2i64 scr0_m, scr1_m;                                                    \
  v2i64 acc0_m, acc1_m;                                                    \
  v2i64 aux0_m, aux1_m;                                                    \
  ILVEV_W2_SD(in0, in4, in8, in12, acc0_m, acc1_m);                        \
  ILVEV_W2_SD(in1, in5, in9, in13, scr0_m, scr1_m);                        \
  ILVEV_D2_UB(acc0_m, acc1_m, scr0_m, scr1_m, out1, out3);                 \
  ILVEV_W2_SD(in2, in6, in10, in14, aux0_m, aux1_m);                       \
  ILVEV_W2_SD(in3, in7, in11, in15, scr0_m, scr1_m);                       \
  ILVEV_D2_SD(aux0_m, aux1_m, scr0_m, scr1_m, acc0_m, acc1_m);             \
  ILVEV_B2_SD(out1, out3, acc0_m, acc1_m, scr0_m, scr1_m);                 \
  ILVEVOD_H2_UB(scr0_m, scr1_m, scr0_m, scr1_m, out0, out2);               \
  ILVOD_B2_SD(out1, out3, acc0_m, acc1_m, scr0_m, scr1_m);                 \
  ILVEVOD_H2_UB(scr0_m, scr1_m, scr0_m, scr1_m, out1, out3);               \
} while (0)
1299*b2055c35SXin Li
/* Description : Transpose 16x8 block into 8x16 with byte elements in vectors
 * Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7,
 *                         in8, in9, in10, in11, in12, in13, in14, in15
 *               Outputs - out0, out1, out2, out3, out4, out5, out6, out7
 *               Return Type - unsigned byte
 * Details     : Input pairs (inN, inN+8) are first merged with even-doubleword
 *               interleaves into out7..out0, which serve as intermediate
 *               storage. Even/odd byte, halfword and word interleaves then
 *               produce the final eight outputs.
 *               Because the outputs are written before all inputs have been
 *               read, the outputs should not alias any of the inputs.
 */
#define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7,        \
                            in8, in9, in10, in11, in12, in13, in14, in15,  \
                            out0, out1, out2, out3, out4, out5,            \
                            out6, out7) do {                               \
  v8i16 evb0_m, evb1_m;                                                    \
  v8i16 odb0_m, odb1_m, odb2_m, odb3_m;                                    \
  v4i32 hw0_m, hw1_m;                                                      \
  ILVEV_D2_UB(in0, in8, in1, in9, out7, out6);                             \
  ILVEV_D2_UB(in2, in10, in3, in11, out5, out4);                           \
  ILVEV_D2_UB(in4, in12, in5, in13, out3, out2);                           \
  ILVEV_D2_UB(in6, in14, in7, in15, out1, out0);                           \
  ILVEV_B2_SH(out7, out6, out5, out4, evb0_m, evb1_m);                     \
  ILVOD_B2_SH(out7, out6, out5, out4, odb0_m, odb1_m);                     \
  ILVEV_B2_UB(out3, out2, out1, out0, out5, out7);                         \
  ILVOD_B2_SH(out3, out2, out1, out0, odb2_m, odb3_m);                     \
  ILVEV_H2_SW(evb0_m, evb1_m, out5, out7, hw0_m, hw1_m);                   \
  ILVEVOD_W2_UB(hw0_m, hw1_m, hw0_m, hw1_m, out0, out4);                   \
  ILVOD_H2_SW(evb0_m, evb1_m, out5, out7, hw0_m, hw1_m);                   \
  ILVEVOD_W2_UB(hw0_m, hw1_m, hw0_m, hw1_m, out2, out6);                   \
  ILVEV_H2_SW(odb0_m, odb1_m, odb2_m, odb3_m, hw0_m, hw1_m);               \
  ILVEVOD_W2_UB(hw0_m, hw1_m, hw0_m, hw1_m, out1, out5);                   \
  ILVOD_H2_SW(odb0_m, odb1_m, odb2_m, odb3_m, hw0_m, hw1_m);               \
  ILVEVOD_W2_UB(hw0_m, hw1_m, hw0_m, hw1_m, out3, out7);                   \
} while (0)
1329*b2055c35SXin Li
/* Description : Transpose 4x4 block with word elements in vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1, out2, out3
 *               Return Type - as per RTYPE
 * Details     : ILVRL_W2_SW interleaves the word elements of (in1, in0) and
 *               of (in3, in2) into two temporaries each. The four outputs
 *               are then the right/left doubleword interleaves
 *               (__msa_ilvr_d / __msa_ilvl_d) of matching temporaries, cast
 *               to RTYPE.
 */
#define TRANSPOSE4x4_W(RTYPE, in0, in1, in2, in3,                   \
                       out0, out1, out2, out3) do {                 \
  v4i32 t01a_m, t01b_m;                                             \
  v4i32 t23a_m, t23b_m;                                             \
  ILVRL_W2_SW(in1, in0, t01a_m, t01b_m);                            \
  ILVRL_W2_SW(in3, in2, t23a_m, t23b_m);                            \
  out0 = (RTYPE)__msa_ilvr_d((v2i64)t23a_m, (v2i64)t01a_m);         \
  out1 = (RTYPE)__msa_ilvl_d((v2i64)t23a_m, (v2i64)t01a_m);         \
  out2 = (RTYPE)__msa_ilvr_d((v2i64)t23b_m, (v2i64)t01b_m);         \
  out3 = (RTYPE)__msa_ilvl_d((v2i64)t23b_m, (v2i64)t01b_m);         \
} while (0)
#define TRANSPOSE4x4_SW_SW(...) TRANSPOSE4x4_W(v4i32, __VA_ARGS__)
1346*b2055c35SXin Li
/* Description : Add block 4x4
 * Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
 * Details     : The right halves of (in1, in0) and (in3, in2) are gathered
 *               into two halfword vectors. Four 32-bit words are loaded from
 *               'pdst' with LW4 using 'stride', inserted into byte vectors
 *               and widened to halfwords by interleaving with zero. The two
 *               are added, clipped between 0-255, packed back to bytes and
 *               stored to 'pdst' with ST4x4_UB.
 *               'pdst' and 'stride' are expanded twice (load and store).
 */
#define ADDBLK_ST4x4_UB(in0, in1, in2, in3, pdst, stride) do {        \
  uint32_t word0_m, word1_m, word2_m, word3_m;                        \
  v8i16 addend0_m, addend1_m;                                         \
  v8i16 sum0_m, sum1_m;                                               \
  v16i8 dstv0_m = { 0 };                                              \
  v16i8 dstv1_m = { 0 };                                              \
  const v16i8 zeros_m = { 0 };                                        \
  ILVR_D2_SH(in1, in0, in3, in2, addend0_m, addend1_m);               \
  LW4(pdst, stride, word0_m, word1_m, word2_m, word3_m);              \
  INSERT_W2_SB(word0_m, word1_m, dstv0_m);                            \
  INSERT_W2_SB(word2_m, word3_m, dstv1_m);                            \
  ILVR_B2_SH(zeros_m, dstv0_m, zeros_m, dstv1_m, sum0_m, sum1_m);     \
  ADD2(sum0_m, addend0_m, sum1_m, addend1_m, sum0_m, sum1_m);         \
  CLIP_SH2_0_255(sum0_m, sum1_m);                                     \
  PCKEV_B2_SB(sum0_m, sum0_m, sum1_m, sum1_m, dstv0_m, dstv1_m);      \
  ST4x4_UB(dstv0_m, dstv1_m, 0, 1, 0, 1, pdst, stride);               \
} while (0)
1368*b2055c35SXin Li
/* Description : Pack even byte elements, extract 0 & 2 index words from pair
 *               of results and store 4 words in destination memory as per
 *               stride
 * Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
 * Details     : PCKEV_B2_SB packs (in1, in0) and (in3, in2) into two byte
 *               vectors. ST4x4_UB is then given those two vectors with word
 *               indices 0 and 2 for each of them, together with 'pdst' and
 *               'stride'.
 */
#define PCKEV_ST4x4_UB(in0, in1, in2, in3, pdst, stride) do {  \
  v16i8 pck0_m, pck1_m;                                        \
  PCKEV_B2_SB(in1, in0, in3, in2, pck0_m, pck1_m);             \
  ST4x4_UB(pck0_m, pck1_m, 0, 2, 0, 2, pdst, stride);          \
} while (0)
1379*b2055c35SXin Li
/* Description : average with rounding (in0 + in1 + 1) / 2.
 * Arguments   : Inputs  - in0, in1, in2, in3,
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Each unsigned byte element from 'in0' vector is added with
 *               each unsigned byte element from 'in1' vector. Then the
 *               average with rounding is calculated (__msa_aver_u_b) and
 *               written to 'out0'. 'out1' is computed the same way from
 *               'in2' and 'in3'.
 *               Inputs are parenthesized so that the (v16u8) cast applies to
 *               the whole argument expression.
 */
#define AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1) do {      \
  (out0) = (RTYPE)__msa_aver_u_b((v16u8)(in0), (v16u8)(in1));     \
  (out1) = (RTYPE)__msa_aver_u_b((v16u8)(in2), (v16u8)(in3));     \
} while (0)
#define AVER_UB2_UB(...) AVER_UB2(v16u8, __VA_ARGS__)
1393*b2055c35SXin Li
1394*b2055c35SXin Li #endif // WEBP_USE_MSA
1395*b2055c35SXin Li #endif // WEBP_DSP_MSA_MACRO_H_
1396