xref: /aosp_15_r20/external/libgav1/src/dsp/arm/common_neon.h (revision 095378508e87ed692bf8dfeb34008b65b3735891)
1*09537850SAkhilesh Sanikop /*
2*09537850SAkhilesh Sanikop  * Copyright 2019 The libgav1 Authors
3*09537850SAkhilesh Sanikop  *
4*09537850SAkhilesh Sanikop  * Licensed under the Apache License, Version 2.0 (the "License");
5*09537850SAkhilesh Sanikop  * you may not use this file except in compliance with the License.
6*09537850SAkhilesh Sanikop  * You may obtain a copy of the License at
7*09537850SAkhilesh Sanikop  *
8*09537850SAkhilesh Sanikop  *      http://www.apache.org/licenses/LICENSE-2.0
9*09537850SAkhilesh Sanikop  *
10*09537850SAkhilesh Sanikop  * Unless required by applicable law or agreed to in writing, software
11*09537850SAkhilesh Sanikop  * distributed under the License is distributed on an "AS IS" BASIS,
12*09537850SAkhilesh Sanikop  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13*09537850SAkhilesh Sanikop  * See the License for the specific language governing permissions and
14*09537850SAkhilesh Sanikop  * limitations under the License.
15*09537850SAkhilesh Sanikop  */
16*09537850SAkhilesh Sanikop 
17*09537850SAkhilesh Sanikop #ifndef LIBGAV1_SRC_DSP_ARM_COMMON_NEON_H_
18*09537850SAkhilesh Sanikop #define LIBGAV1_SRC_DSP_ARM_COMMON_NEON_H_
19*09537850SAkhilesh Sanikop 
20*09537850SAkhilesh Sanikop #include "src/utils/cpu.h"
21*09537850SAkhilesh Sanikop 
22*09537850SAkhilesh Sanikop #if LIBGAV1_ENABLE_NEON
23*09537850SAkhilesh Sanikop 
24*09537850SAkhilesh Sanikop #include <arm_neon.h>
25*09537850SAkhilesh Sanikop 
26*09537850SAkhilesh Sanikop #include <algorithm>
27*09537850SAkhilesh Sanikop #include <cstddef>
28*09537850SAkhilesh Sanikop #include <cstdint>
29*09537850SAkhilesh Sanikop #include <cstring>
30*09537850SAkhilesh Sanikop 
31*09537850SAkhilesh Sanikop #include "src/utils/compiler_attributes.h"
32*09537850SAkhilesh Sanikop 
#if 0
#include <cstdio>
#include <string>

// Master switch for the debug printing helpers below. The enclosing "#if 0"
// keeps all of them out of production builds; flip it to 1 while debugging.
constexpr bool kEnablePrintRegs = true;
38*09537850SAkhilesh Sanikop 
// Overlays a 64-bit (D) NEON register so individual lanes can be read out for
// printing.
union DebugRegister {
  int8_t i8[8];
  int16_t i16[4];
  int32_t i32[2];
  uint8_t u8[8];
  uint16_t u16[4];
  uint32_t u32[2];
};

// Overlays a 128-bit (Q) NEON register so individual lanes can be read out for
// printing.
union DebugRegisterQ {
  int8_t i8[16];
  int16_t i16[8];
  int32_t i32[4];
  uint8_t u8[16];
  uint16_t u16[8];
  uint32_t u32[4];
};
56*09537850SAkhilesh Sanikop 
57*09537850SAkhilesh Sanikop // Quite useful macro for debugging. Left here for convenience.
58*09537850SAkhilesh Sanikop inline void PrintVect(const DebugRegister r, const char* const name, int size) {
59*09537850SAkhilesh Sanikop   int n;
60*09537850SAkhilesh Sanikop   if (kEnablePrintRegs) {
61*09537850SAkhilesh Sanikop     fprintf(stderr, "%s\t: ", name);
62*09537850SAkhilesh Sanikop     if (size == 8) {
63*09537850SAkhilesh Sanikop       for (n = 0; n < 8; ++n) fprintf(stderr, "%.2x ", r.u8[n]);
64*09537850SAkhilesh Sanikop     } else if (size == 16) {
65*09537850SAkhilesh Sanikop       for (n = 0; n < 4; ++n) fprintf(stderr, "%.4x ", r.u16[n]);
66*09537850SAkhilesh Sanikop     } else if (size == 32) {
67*09537850SAkhilesh Sanikop       for (n = 0; n < 2; ++n) fprintf(stderr, "%.8x ", r.u32[n]);
68*09537850SAkhilesh Sanikop     }
69*09537850SAkhilesh Sanikop     fprintf(stderr, "\n");
70*09537850SAkhilesh Sanikop   }
71*09537850SAkhilesh Sanikop }
72*09537850SAkhilesh Sanikop 
73*09537850SAkhilesh Sanikop // Debugging macro for 128-bit types.
74*09537850SAkhilesh Sanikop inline void PrintVectQ(const DebugRegisterQ r, const char* const name,
75*09537850SAkhilesh Sanikop                        int size) {
76*09537850SAkhilesh Sanikop   int n;
77*09537850SAkhilesh Sanikop   if (kEnablePrintRegs) {
78*09537850SAkhilesh Sanikop     fprintf(stderr, "%s\t: ", name);
79*09537850SAkhilesh Sanikop     if (size == 8) {
80*09537850SAkhilesh Sanikop       for (n = 0; n < 16; ++n) fprintf(stderr, "%.2x ", r.u8[n]);
81*09537850SAkhilesh Sanikop     } else if (size == 16) {
82*09537850SAkhilesh Sanikop       for (n = 0; n < 8; ++n) fprintf(stderr, "%.4x ", r.u16[n]);
83*09537850SAkhilesh Sanikop     } else if (size == 32) {
84*09537850SAkhilesh Sanikop       for (n = 0; n < 4; ++n) fprintf(stderr, "%.8x ", r.u32[n]);
85*09537850SAkhilesh Sanikop     }
86*09537850SAkhilesh Sanikop     fprintf(stderr, "\n");
87*09537850SAkhilesh Sanikop   }
88*09537850SAkhilesh Sanikop }
89*09537850SAkhilesh Sanikop 
// Prints both halves of an int32x4x2_t, one line per val[] member, as 32-bit
// hex lanes.
inline void PrintReg(const int32x4x2_t val, const std::string& name) {
  DebugRegisterQ r;
  vst1q_s32(r.i32, val.val[0]);
  const std::string name0 = name + std::string(".val[0]");
  PrintVectQ(r, name0.c_str(), 32);
  vst1q_s32(r.i32, val.val[1]);
  const std::string name1 = name + std::string(".val[1]");
  PrintVectQ(r, name1.c_str(), 32);
}

// Prints a uint32x4_t as four 32-bit hex lanes.
inline void PrintReg(const uint32x4_t val, const char* name) {
  DebugRegisterQ r;
  vst1q_u32(r.u32, val);
  PrintVectQ(r, name, 32);
}

// Prints a uint32x2_t as two 32-bit hex lanes.
inline void PrintReg(const uint32x2_t val, const char* name) {
  DebugRegister r;
  vst1_u32(r.u32, val);
  PrintVect(r, name, 32);
}
111*09537850SAkhilesh Sanikop 
// Prints a uint16x8_t as eight 16-bit hex lanes.
inline void PrintReg(const uint16x8_t val, const char* name) {
  DebugRegisterQ r;
  vst1q_u16(r.u16, val);
  PrintVectQ(r, name, 16);
}

// Prints a uint16x4_t as four 16-bit hex lanes.
inline void PrintReg(const uint16x4_t val, const char* name) {
  DebugRegister r;
  vst1_u16(r.u16, val);
  PrintVect(r, name, 16);
}

// Prints a uint8x16_t as sixteen 8-bit hex lanes.
inline void PrintReg(const uint8x16_t val, const char* name) {
  DebugRegisterQ r;
  vst1q_u8(r.u8, val);
  PrintVectQ(r, name, 8);
}

// Prints a uint8x8_t as eight 8-bit hex lanes.
inline void PrintReg(const uint8x8_t val, const char* name) {
  DebugRegister r;
  vst1_u8(r.u8, val);
  PrintVect(r, name, 8);
}
135*09537850SAkhilesh Sanikop 
// Signed variants: the lanes are stored through the signed union members but
// printed through the unsigned members, so values appear as raw hex bits.
inline void PrintReg(const int32x4_t val, const char* name) {
  DebugRegisterQ r;
  vst1q_s32(r.i32, val);
  PrintVectQ(r, name, 32);
}

inline void PrintReg(const int32x2_t val, const char* name) {
  DebugRegister r;
  vst1_s32(r.i32, val);
  PrintVect(r, name, 32);
}

inline void PrintReg(const int16x8_t val, const char* name) {
  DebugRegisterQ r;
  vst1q_s16(r.i16, val);
  PrintVectQ(r, name, 16);
}

inline void PrintReg(const int16x4_t val, const char* name) {
  DebugRegister r;
  vst1_s16(r.i16, val);
  PrintVect(r, name, 16);
}
159*09537850SAkhilesh Sanikop 
// Prints an int8x16_t as sixteen 8-bit hex lanes.
inline void PrintReg(const int8x16_t val, const char* name) {
  DebugRegisterQ r;
  vst1q_s8(r.i8, val);
  PrintVectQ(r, name, 8);
}

// Prints an int8x8_t as eight 8-bit hex lanes.
inline void PrintReg(const int8x8_t val, const char* name) {
  DebugRegister r;
  vst1_s8(r.i8, val);
  PrintVect(r, name, 8);
}
171*09537850SAkhilesh Sanikop 
// Print an individual (non-vector) value in decimal format.
inline void PrintReg(const int x, const char* name) {
  if (kEnablePrintRegs) {
    fprintf(stderr, "%s: %d\n", name, x);
  }
}

// Print an individual (non-vector) value in hexadecimal format.
inline void PrintHex(const int x, const char* name) {
  if (kEnablePrintRegs) {
    fprintf(stderr, "%s: %x\n", name, x);
  }
}

// Shorthand macros: print a register (PR), a decimal value (PD), or a hex
// value (PX), labeled with the stringized expression itself.
#define PR(x) PrintReg(x, #x)
#define PD(x) PrintReg(x, #x)
#define PX(x) PrintHex(x, #x)
189*09537850SAkhilesh Sanikop 
#if LIBGAV1_MSAN
#include <sanitizer/msan_interface.h>

// Dumps MemorySanitizer's shadow state (initialized vs. uninitialized bytes)
// for the |size| bytes starting at |r|.
inline void PrintShadow(const void* r, const char* const name,
                        const size_t size) {
  if (kEnablePrintRegs) {
    fprintf(stderr, "Shadow for %s:\n", name);
    __msan_print_shadow(r, size);
  }
}
// Shorthand: print the shadow of |var| covering N bytes.
#define PS(var, N) PrintShadow(var, #var, N)

#endif  // LIBGAV1_MSAN

#endif  // 0
205*09537850SAkhilesh Sanikop 
206*09537850SAkhilesh Sanikop namespace libgav1 {
207*09537850SAkhilesh Sanikop namespace dsp {
208*09537850SAkhilesh Sanikop 
209*09537850SAkhilesh Sanikop //------------------------------------------------------------------------------
210*09537850SAkhilesh Sanikop // Load functions.
211*09537850SAkhilesh Sanikop 
212*09537850SAkhilesh Sanikop // Load 2 uint8_t values into lanes 0 and 1. Zeros the register before loading
213*09537850SAkhilesh Sanikop // the values. Use caution when using this in loops because it will re-zero the
214*09537850SAkhilesh Sanikop // register before loading on every iteration.
Load2(const void * const buf)215*09537850SAkhilesh Sanikop inline uint8x8_t Load2(const void* const buf) {
216*09537850SAkhilesh Sanikop   const uint16x4_t zero = vdup_n_u16(0);
217*09537850SAkhilesh Sanikop   uint16_t temp;
218*09537850SAkhilesh Sanikop   memcpy(&temp, buf, 2);
219*09537850SAkhilesh Sanikop   return vreinterpret_u8_u16(vld1_lane_u16(&temp, zero, 0));
220*09537850SAkhilesh Sanikop }
221*09537850SAkhilesh Sanikop 
// Load 2 uint8_t values into |lane| * 2 and |lane| * 2 + 1.
template <int lane>
inline uint8x8_t Load2(const void* const buf, uint8x8_t val) {
  // memcpy tolerates an unaligned |buf|.
  uint16_t temp;
  memcpy(&temp, buf, 2);
  return vreinterpret_u8_u16(
      vld1_lane_u16(&temp, vreinterpret_u16_u8(val), lane));
}

// Load 2 uint16_t values into lanes |lane| * 2 and |lane| * 2 + 1 of |val|.
template <int lane>
inline uint16x4_t Load2(const void* const buf, uint16x4_t val) {
  uint32_t temp;
  memcpy(&temp, buf, 4);
  return vreinterpret_u16_u32(
      vld1_lane_u32(&temp, vreinterpret_u32_u16(val), lane));
}
238*09537850SAkhilesh Sanikop 
239*09537850SAkhilesh Sanikop // Load 4 uint8_t values into the low half of a uint8x8_t register. Zeros the
240*09537850SAkhilesh Sanikop // register before loading the values. Use caution when using this in loops
241*09537850SAkhilesh Sanikop // because it will re-zero the register before loading on every iteration.
Load4(const void * const buf)242*09537850SAkhilesh Sanikop inline uint8x8_t Load4(const void* const buf) {
243*09537850SAkhilesh Sanikop   const uint32x2_t zero = vdup_n_u32(0);
244*09537850SAkhilesh Sanikop   uint32_t temp;
245*09537850SAkhilesh Sanikop   memcpy(&temp, buf, 4);
246*09537850SAkhilesh Sanikop   return vreinterpret_u8_u32(vld1_lane_u32(&temp, zero, 0));
247*09537850SAkhilesh Sanikop }
248*09537850SAkhilesh Sanikop 
// Load 4 uint8_t values into 4 lanes starting with |lane| * 4.
template <int lane>
inline uint8x8_t Load4(const void* const buf, uint8x8_t val) {
  // memcpy tolerates an unaligned |buf|.
  uint32_t temp;
  memcpy(&temp, buf, 4);
  return vreinterpret_u8_u32(
      vld1_lane_u32(&temp, vreinterpret_u32_u8(val), lane));
}

// Convenience functions for 16-bit loads from a uint8_t* source.
// Loads 4 uint16_t values. |buf| must be suitably aligned for uint16_t.
inline uint16x4_t Load4U16(const void* const buf) {
  return vld1_u16(static_cast<const uint16_t*>(buf));
}

// Loads 8 uint16_t values. |buf| must be suitably aligned for uint16_t.
inline uint16x8_t Load8U16(const void* const buf) {
  return vld1q_u16(static_cast<const uint16_t*>(buf));
}
266*09537850SAkhilesh Sanikop 
267*09537850SAkhilesh Sanikop //------------------------------------------------------------------------------
268*09537850SAkhilesh Sanikop // Load functions to avoid MemorySanitizer's use-of-uninitialized-value warning.
269*09537850SAkhilesh Sanikop 
// Zeroes the final |over_read_in_bytes| bytes of |source| so that MSAN does
// not flag use of lanes read past the end of valid data. A pass-through in
// non-MSAN builds.
inline uint8x8_t MaskOverreads(const uint8x8_t source,
                               const ptrdiff_t over_read_in_bytes) {
  uint8x8_t dst = source;
#if LIBGAV1_MSAN
  if (over_read_in_bytes > 0) {
    uint8x8_t mask = vdup_n_u8(0);
    uint8x8_t valid_element_mask = vdup_n_u8(-1);
    // If |over_read_in_bytes| >= 8 the loop does not run and the mask stays
    // all-zero.
    const int valid_bytes =
        std::min(8, 8 - static_cast<int>(over_read_in_bytes));
    for (int i = 0; i < valid_bytes; ++i) {
      // Feed ff bytes into |mask| one at a time.
      mask = vext_u8(valid_element_mask, mask, 7);
    }
    dst = vand_u8(dst, mask);
  }
#else
  static_cast<void>(over_read_in_bytes);
#endif
  return dst;
}
290*09537850SAkhilesh Sanikop 
// 128-bit variant of MaskOverreads(): zeroes the final |over_read_in_bytes|
// bytes of |source| under MSAN; a pass-through otherwise.
inline uint8x16_t MaskOverreadsQ(const uint8x16_t source,
                                 const ptrdiff_t over_read_in_bytes) {
  uint8x16_t dst = source;
#if LIBGAV1_MSAN
  if (over_read_in_bytes > 0) {
    uint8x16_t mask = vdupq_n_u8(0);
    uint8x16_t valid_element_mask = vdupq_n_u8(-1);
    // If |over_read_in_bytes| >= 16 the loop does not run and the mask stays
    // all-zero.
    const int valid_bytes =
        std::min(16, 16 - static_cast<int>(over_read_in_bytes));
    for (int i = 0; i < valid_bytes; ++i) {
      // Feed ff bytes into |mask| one at a time.
      mask = vextq_u8(valid_element_mask, mask, 15);
    }
    dst = vandq_u8(dst, mask);
  }
#else
  static_cast<void>(over_read_in_bytes);
#endif
  return dst;
}
311*09537850SAkhilesh Sanikop 
// uint16x8_t overload. |over_read_in_bytes| is still measured in bytes, not
// lanes; the masking is delegated to the uint8x16_t overload.
inline uint16x8_t MaskOverreadsQ(const uint16x8_t source,
                                 const ptrdiff_t over_read_in_bytes) {
  return vreinterpretq_u16_u8(
      MaskOverreadsQ(vreinterpretq_u8_u16(source), over_read_in_bytes));
}

// Loads 8 uint8_t values, masking the final |over_read_in_bytes| under MSAN.
inline uint8x8_t Load1MsanU8(const uint8_t* const source,
                             const ptrdiff_t over_read_in_bytes) {
  return MaskOverreads(vld1_u8(source), over_read_in_bytes);
}

// Loads 16 uint8_t values, masking the final |over_read_in_bytes| under MSAN.
inline uint8x16_t Load1QMsanU8(const uint8_t* const source,
                               const ptrdiff_t over_read_in_bytes) {
  return MaskOverreadsQ(vld1q_u8(source), over_read_in_bytes);
}
327*09537850SAkhilesh Sanikop 
Load1QMsanU16(const uint16_t * const source,const ptrdiff_t over_read_in_bytes)328*09537850SAkhilesh Sanikop inline uint16x8_t Load1QMsanU16(const uint16_t* const source,
329*09537850SAkhilesh Sanikop                                 const ptrdiff_t over_read_in_bytes) {
330*09537850SAkhilesh Sanikop   return vreinterpretq_u16_u8(MaskOverreadsQ(
331*09537850SAkhilesh Sanikop       vreinterpretq_u8_u16(vld1q_u16(source)), over_read_in_bytes));
332*09537850SAkhilesh Sanikop }
333*09537850SAkhilesh Sanikop 
// Loads 4 uint32_t values, masking the final |over_read_in_bytes| bytes under
// MSAN. There is no uint32x4_t MaskOverreadsQ overload, so the masking goes
// through the uint8x16_t view explicitly.
inline uint32x4_t Load1QMsanU32(const uint32_t* const source,
                                const ptrdiff_t over_read_in_bytes) {
  return vreinterpretq_u32_u8(MaskOverreadsQ(
      vreinterpretq_u8_u32(vld1q_u32(source)), over_read_in_bytes));
}
339*09537850SAkhilesh Sanikop 
340*09537850SAkhilesh Sanikop //------------------------------------------------------------------------------
341*09537850SAkhilesh Sanikop // Store functions.
342*09537850SAkhilesh Sanikop 
// Propagate type information to the compiler. Without this the compiler may
// assume the required alignment of the type (4 bytes in the case of uint32_t)
// and add alignment hints to the memory access.
template <typename T>
inline void ValueToMem(void* const buf, T val) {
  // A fixed-size memcpy compiles down to a plain (unaligned-safe) store.
  memcpy(buf, &val, sizeof(T));
}
350*09537850SAkhilesh Sanikop 
// Store 4 int8_t values from the low half of an int8x8_t register.
// ValueToMem keeps the store unaligned-safe.
inline void StoreLo4(void* const buf, const int8x8_t val) {
  ValueToMem<int32_t>(buf, vget_lane_s32(vreinterpret_s32_s8(val), 0));
}

// Store 4 uint8_t values from the low half of a uint8x8_t register.
inline void StoreLo4(void* const buf, const uint8x8_t val) {
  ValueToMem<uint32_t>(buf, vget_lane_u32(vreinterpret_u32_u8(val), 0));
}

// Store 4 uint8_t values from the high half of a uint8x8_t register.
inline void StoreHi4(void* const buf, const uint8x8_t val) {
  ValueToMem<uint32_t>(buf, vget_lane_u32(vreinterpret_u32_u8(val), 1));
}
365*09537850SAkhilesh Sanikop 
// Store 2 uint8_t values from |lane| * 2 and |lane| * 2 + 1 of a uint8x8_t
// register.
template <int lane>
inline void Store2(void* const buf, const uint8x8_t val) {
  ValueToMem<uint16_t>(buf, vget_lane_u16(vreinterpret_u16_u8(val), lane));
}

// Store 2 uint16_t values from |lane| * 2 and |lane| * 2 + 1 of a uint16x8_t
// register.
template <int lane>
inline void Store2(void* const buf, const uint16x8_t val) {
  ValueToMem<uint32_t>(buf, vgetq_lane_u32(vreinterpretq_u32_u16(val), lane));
}

// Store 2 uint16_t values from |lane| * 2 and |lane| * 2 + 1 of a uint16x4_t
// register.
template <int lane>
inline void Store2(void* const buf, const uint16x4_t val) {
  ValueToMem<uint32_t>(buf, vget_lane_u32(vreinterpret_u32_u16(val), lane));
}
386*09537850SAkhilesh Sanikop 
// Simplify code when caller has |buf| cast as uint8_t*.
// Stores 4 uint16_t values; |buf| must be suitably aligned for uint16_t.
inline void Store4(void* const buf, const uint16x4_t val) {
  vst1_u16(static_cast<uint16_t*>(buf), val);
}

// Simplify code when caller has |buf| cast as uint8_t*.
// Stores 8 uint16_t values; |buf| must be suitably aligned for uint16_t.
inline void Store8(void* const buf, const uint16x8_t val) {
  vst1q_u16(static_cast<uint16_t*>(buf), val);
}
396*09537850SAkhilesh Sanikop 
// Interleaved store of four int16x8_t vectors (64 bytes) with an MSAN
// workaround for vst4q_s16's incomplete shadow update.
inline void Store4QMsanS16(void* const buf, const int16x8x4_t src) {
#if LIBGAV1_MSAN
  // The memory shadow is incorrect for vst4q_u16, only marking the first 16
  // bytes of the destination as initialized. To avoid missing truly
  // uninitialized memory, check the input vectors first, before marking the
  // whole 64 bytes initialized. If any input vector contains unused values, it
  // should pass through MaskOverreadsQ first.
  __msan_check_mem_is_initialized(&src.val[0], sizeof(src.val[0]));
  __msan_check_mem_is_initialized(&src.val[1], sizeof(src.val[1]));
  __msan_check_mem_is_initialized(&src.val[2], sizeof(src.val[2]));
  __msan_check_mem_is_initialized(&src.val[3], sizeof(src.val[3]));
  vst4q_s16(static_cast<int16_t*>(buf), src);
  __msan_unpoison(buf, sizeof(int16x8x4_t));
#else
  vst4q_s16(static_cast<int16_t*>(buf), src);
#endif  // LIBGAV1_MSAN
}
414*09537850SAkhilesh Sanikop 
415*09537850SAkhilesh Sanikop //------------------------------------------------------------------------------
416*09537850SAkhilesh Sanikop // Pointer helpers.
417*09537850SAkhilesh Sanikop 
// This function adds |stride|, given as a number of bytes, to a pointer to a
// larger type, using native pointer arithmetic.
template <typename T>
inline T* AddByteStride(T* ptr, const ptrdiff_t stride) {
  // Round-trip through a byte pointer so the offset is applied in bytes
  // regardless of sizeof(T). Going via const uint8_t* (then casting the
  // constness away) lets one definition serve both const and non-const T.
  const auto* const base = reinterpret_cast<const uint8_t*>(ptr);
  return reinterpret_cast<T*>(const_cast<uint8_t*>(base + stride));
}
425*09537850SAkhilesh Sanikop 
//------------------------------------------------------------------------------
// Multiply.

// Shim vmull_high_u16 for armv7: widening multiply of the high halves of |a|
// and |b|.
inline uint32x4_t VMullHighU16(const uint16x8_t a, const uint16x8_t b) {
#if defined(__aarch64__)
  return vmull_high_u16(a, b);
#else
  return vmull_u16(vget_high_u16(a), vget_high_u16(b));
#endif
}

// Shim vmull_high_s16 for armv7: widening multiply of the high halves of |a|
// and |b|.
inline int32x4_t VMullHighS16(const int16x8_t a, const int16x8_t b) {
#if defined(__aarch64__)
  return vmull_high_s16(a, b);
#else
  return vmull_s16(vget_high_s16(a), vget_high_s16(b));
#endif
}
446*09537850SAkhilesh Sanikop 
// Shim vmlal_high_u16 for armv7: |a| + widening multiply of the high halves
// of |b| and |c|.
inline uint32x4_t VMlalHighU16(const uint32x4_t a, const uint16x8_t b,
                               const uint16x8_t c) {
#if defined(__aarch64__)
  return vmlal_high_u16(a, b, c);
#else
  return vmlal_u16(a, vget_high_u16(b), vget_high_u16(c));
#endif
}

// Shim vmlal_high_s16 for armv7: |a| + widening multiply of the high halves
// of |b| and |c|.
inline int32x4_t VMlalHighS16(const int32x4_t a, const int16x8_t b,
                              const int16x8_t c) {
#if defined(__aarch64__)
  return vmlal_high_s16(a, b, c);
#else
  return vmlal_s16(a, vget_high_s16(b), vget_high_s16(c));
#endif
}
466*09537850SAkhilesh Sanikop 
// Shim vmul_laneq_u16 for armv7.
// On armv7 the Q-register lane is selected from the low or high D half.
// |lane| is a template parameter, so the branch condition is a compile-time
// constant; the "& 0x3" keeps the lane immediate in range for the D-register
// intrinsic even in the branch that is not taken.
template <int lane>
inline uint16x4_t VMulLaneQU16(const uint16x4_t a, const uint16x8_t b) {
#if defined(__aarch64__)
  return vmul_laneq_u16(a, b, lane);
#else
  if (lane < 4) return vmul_lane_u16(a, vget_low_u16(b), lane & 0x3);
  return vmul_lane_u16(a, vget_high_u16(b), (lane - 4) & 0x3);
#endif
}

// Shim vmulq_laneq_u16 for armv7. See VMulLaneQU16 for the armv7 lane
// handling.
template <int lane>
inline uint16x8_t VMulQLaneQU16(const uint16x8_t a, const uint16x8_t b) {
#if defined(__aarch64__)
  return vmulq_laneq_u16(a, b, lane);
#else
  if (lane < 4) return vmulq_lane_u16(a, vget_low_u16(b), lane & 0x3);
  return vmulq_lane_u16(a, vget_high_u16(b), (lane - 4) & 0x3);
#endif
}
488*09537850SAkhilesh Sanikop 
489*09537850SAkhilesh Sanikop // Shim vmla_laneq_u16 for armv7.
490*09537850SAkhilesh Sanikop template <int lane>
VMlaLaneQU16(const uint16x4_t a,const uint16x4_t b,const uint16x8_t c)491*09537850SAkhilesh Sanikop inline uint16x4_t VMlaLaneQU16(const uint16x4_t a, const uint16x4_t b,
492*09537850SAkhilesh Sanikop                                const uint16x8_t c) {
493*09537850SAkhilesh Sanikop #if defined(__aarch64__)
494*09537850SAkhilesh Sanikop   return vmla_laneq_u16(a, b, c, lane);
495*09537850SAkhilesh Sanikop #else
496*09537850SAkhilesh Sanikop   if (lane < 4) return vmla_lane_u16(a, b, vget_low_u16(c), lane & 0x3);
497*09537850SAkhilesh Sanikop   return vmla_lane_u16(a, b, vget_high_u16(c), (lane - 4) & 0x3);
498*09537850SAkhilesh Sanikop #endif
499*09537850SAkhilesh Sanikop }
500*09537850SAkhilesh Sanikop 
// Shim vmlaq_laneq_u16 for armv7.
template <int lane>
inline uint16x8_t VMlaQLaneQU16(const uint16x8_t a, const uint16x8_t b,
                                const uint16x8_t c) {
#if defined(__aarch64__)
  return vmlaq_laneq_u16(a, b, c, lane);
#else
  // armv7 multiply-accumulate lane intrinsics take a 64-bit lane source; pick
  // the half of |c| holding |lane| and adjust the lane index accordingly.
  if (lane < 4) {
    return vmlaq_lane_u16(a, b, vget_low_u16(c), lane & 0x3);
  } else {
    return vmlaq_lane_u16(a, b, vget_high_u16(c), (lane - 4) & 0x3);
  }
#endif
}
512*09537850SAkhilesh Sanikop 
//------------------------------------------------------------------------------
// Bit manipulation.

// vshXX_n_XX() requires an immediate.
template <int shift>
inline uint8x8_t LeftShiftVector(const uint8x8_t vector) {
  // Treat the 8 bytes as a single 64-bit lane so the whole vector shifts as
  // one unit (bytes move across lane boundaries).
  const uint64x1_t shifted = vshl_n_u64(vreinterpret_u64_u8(vector), shift);
  return vreinterpret_u8_u64(shifted);
}
521*09537850SAkhilesh Sanikop 
template <int shift>
inline uint8x8_t RightShiftVector(const uint8x8_t vector) {
  // Shift the vector as a single 64-bit value; zeros enter from the top.
  const uint64x1_t shifted = vshr_n_u64(vreinterpret_u64_u8(vector), shift);
  return vreinterpret_u8_u64(shifted);
}
526*09537850SAkhilesh Sanikop 
template <int shift>
inline int8x8_t RightShiftVector(const int8x8_t vector) {
  // The shift operates on the unsigned reinterpretation, so zeros (not sign
  // bits) are shifted in at the top of the 64-bit lane.
  const uint64x1_t shifted = vshr_n_u64(vreinterpret_u64_s8(vector), shift);
  return vreinterpret_s8_u64(shifted);
}
531*09537850SAkhilesh Sanikop 
532*09537850SAkhilesh Sanikop // Shim vqtbl1_u8 for armv7.
VQTbl1U8(const uint8x16_t a,const uint8x8_t index)533*09537850SAkhilesh Sanikop inline uint8x8_t VQTbl1U8(const uint8x16_t a, const uint8x8_t index) {
534*09537850SAkhilesh Sanikop #if defined(__aarch64__)
535*09537850SAkhilesh Sanikop   return vqtbl1_u8(a, index);
536*09537850SAkhilesh Sanikop #else
537*09537850SAkhilesh Sanikop   const uint8x8x2_t b = {vget_low_u8(a), vget_high_u8(a)};
538*09537850SAkhilesh Sanikop   return vtbl2_u8(b, index);
539*09537850SAkhilesh Sanikop #endif
540*09537850SAkhilesh Sanikop }
541*09537850SAkhilesh Sanikop 
542*09537850SAkhilesh Sanikop // Shim vqtbl2_u8 for armv7.
VQTbl2U8(const uint8x16x2_t a,const uint8x8_t index)543*09537850SAkhilesh Sanikop inline uint8x8_t VQTbl2U8(const uint8x16x2_t a, const uint8x8_t index) {
544*09537850SAkhilesh Sanikop #if defined(__aarch64__)
545*09537850SAkhilesh Sanikop   return vqtbl2_u8(a, index);
546*09537850SAkhilesh Sanikop #else
547*09537850SAkhilesh Sanikop   const uint8x8x4_t b = {vget_low_u8(a.val[0]), vget_high_u8(a.val[0]),
548*09537850SAkhilesh Sanikop                          vget_low_u8(a.val[1]), vget_high_u8(a.val[1])};
549*09537850SAkhilesh Sanikop   return vtbl4_u8(b, index);
550*09537850SAkhilesh Sanikop #endif
551*09537850SAkhilesh Sanikop }
552*09537850SAkhilesh Sanikop 
553*09537850SAkhilesh Sanikop // Shim vqtbl2q_u8 for armv7.
VQTbl2QU8(const uint8x16x2_t a,const uint8x16_t index)554*09537850SAkhilesh Sanikop inline uint8x16_t VQTbl2QU8(const uint8x16x2_t a, const uint8x16_t index) {
555*09537850SAkhilesh Sanikop #if defined(__aarch64__)
556*09537850SAkhilesh Sanikop   return vqtbl2q_u8(a, index);
557*09537850SAkhilesh Sanikop #else
558*09537850SAkhilesh Sanikop   return vcombine_u8(VQTbl2U8(a, vget_low_u8(index)),
559*09537850SAkhilesh Sanikop                      VQTbl2U8(a, vget_high_u8(index)));
560*09537850SAkhilesh Sanikop #endif
561*09537850SAkhilesh Sanikop }
562*09537850SAkhilesh Sanikop 
563*09537850SAkhilesh Sanikop // Shim vqtbl3q_u8 for armv7.
VQTbl3U8(const uint8x16x3_t a,const uint8x8_t index)564*09537850SAkhilesh Sanikop inline uint8x8_t VQTbl3U8(const uint8x16x3_t a, const uint8x8_t index) {
565*09537850SAkhilesh Sanikop #if defined(__aarch64__)
566*09537850SAkhilesh Sanikop   return vqtbl3_u8(a, index);
567*09537850SAkhilesh Sanikop #else
568*09537850SAkhilesh Sanikop   const uint8x8x4_t b = {vget_low_u8(a.val[0]), vget_high_u8(a.val[0]),
569*09537850SAkhilesh Sanikop                          vget_low_u8(a.val[1]), vget_high_u8(a.val[1])};
570*09537850SAkhilesh Sanikop   const uint8x8x2_t c = {vget_low_u8(a.val[2]), vget_high_u8(a.val[2])};
571*09537850SAkhilesh Sanikop   const uint8x8_t index_ext = vsub_u8(index, vdup_n_u8(32));
572*09537850SAkhilesh Sanikop   const uint8x8_t partial_lookup = vtbl4_u8(b, index);
573*09537850SAkhilesh Sanikop   return vtbx2_u8(partial_lookup, c, index_ext);
574*09537850SAkhilesh Sanikop #endif
575*09537850SAkhilesh Sanikop }
576*09537850SAkhilesh Sanikop 
577*09537850SAkhilesh Sanikop // Shim vqtbl3q_u8 for armv7.
VQTbl3QU8(const uint8x16x3_t a,const uint8x16_t index)578*09537850SAkhilesh Sanikop inline uint8x16_t VQTbl3QU8(const uint8x16x3_t a, const uint8x16_t index) {
579*09537850SAkhilesh Sanikop #if defined(__aarch64__)
580*09537850SAkhilesh Sanikop   return vqtbl3q_u8(a, index);
581*09537850SAkhilesh Sanikop #else
582*09537850SAkhilesh Sanikop   return vcombine_u8(VQTbl3U8(a, vget_low_u8(index)),
583*09537850SAkhilesh Sanikop                      VQTbl3U8(a, vget_high_u8(index)));
584*09537850SAkhilesh Sanikop #endif
585*09537850SAkhilesh Sanikop }
586*09537850SAkhilesh Sanikop 
587*09537850SAkhilesh Sanikop // Shim vqtbl1_s8 for armv7.
VQTbl1S8(const int8x16_t a,const uint8x8_t index)588*09537850SAkhilesh Sanikop inline int8x8_t VQTbl1S8(const int8x16_t a, const uint8x8_t index) {
589*09537850SAkhilesh Sanikop #if defined(__aarch64__)
590*09537850SAkhilesh Sanikop   return vqtbl1_s8(a, index);
591*09537850SAkhilesh Sanikop #else
592*09537850SAkhilesh Sanikop   const int8x8x2_t b = {vget_low_s8(a), vget_high_s8(a)};
593*09537850SAkhilesh Sanikop   return vtbl2_s8(b, vreinterpret_s8_u8(index));
594*09537850SAkhilesh Sanikop #endif
595*09537850SAkhilesh Sanikop }
596*09537850SAkhilesh Sanikop 
//------------------------------------------------------------------------------
// Saturation helpers.

// Clamps each lane of |val| to the inclusive range [|low|, |high|].
inline int16x4_t Clip3S16(const int16x4_t val, const int16x4_t low,
                          const int16x4_t high) {
  const int16x4_t clamped_low = vmax_s16(val, low);
  return vmin_s16(clamped_low, high);
}
604*09537850SAkhilesh Sanikop 
// Clamps each lane of |val| to the inclusive range [|low|, |high|].
inline int16x8_t Clip3S16(const int16x8_t val, const int16x8_t low,
                          const int16x8_t high) {
  const int16x8_t clamped_low = vmaxq_s16(val, low);
  return vminq_s16(clamped_low, high);
}
609*09537850SAkhilesh Sanikop 
// Clamps each signed lane of |val| to the unsigned pixel range
// [0, (1 << bitdepth) - 1] and reinterprets the result as unsigned.
inline uint16x8_t ConvertToUnsignedPixelU16(const int16x8_t val, int bitdepth) {
  const uint16x8_t max_pixel = vdupq_n_u16((1 << bitdepth) - 1);
  // Clamping to zero first makes the unsigned reinterpretation safe.
  const int16x8_t non_negative = vmaxq_s16(val, vdupq_n_s16(0));
  return vminq_u16(vreinterpretq_u16_s16(non_negative), max_pixel);
}
616*09537850SAkhilesh Sanikop 
//------------------------------------------------------------------------------
// Interleave.

// vzipN is exclusive to A64.
inline uint8x8_t InterleaveLow8(const uint8x8_t a, const uint8x8_t b) {
#if defined(__aarch64__)
  return vzip1_u8(a, b);
#else
  const uint8x8x2_t zipped = vzip_u8(a, b);
  // Only the low interleave is needed; |zipped.val[1]| is discarded.
  return zipped.val[0];
#endif
}
629*09537850SAkhilesh Sanikop 
// Interleaves the low 32-bit halves of |a| and |b|.
inline uint8x8_t InterleaveLow32(const uint8x8_t a, const uint8x8_t b) {
#if defined(__aarch64__)
  return vreinterpret_u8_u32(
      vzip1_u32(vreinterpret_u32_u8(a), vreinterpret_u32_u8(b)));
#else
  const uint32x2x2_t zipped =
      vzip_u32(vreinterpret_u32_u8(a), vreinterpret_u32_u8(b));
  // |zipped.val[1]| holds the high halves and is discarded.
  return vreinterpret_u8_u32(zipped.val[0]);
#endif
}
640*09537850SAkhilesh Sanikop 
// Interleaves the low 32-bit halves of |a| and |b|.
inline int8x8_t InterleaveLow32(const int8x8_t a, const int8x8_t b) {
#if defined(__aarch64__)
  return vreinterpret_s8_u32(
      vzip1_u32(vreinterpret_u32_s8(a), vreinterpret_u32_s8(b)));
#else
  const uint32x2x2_t zipped =
      vzip_u32(vreinterpret_u32_s8(a), vreinterpret_u32_s8(b));
  // |zipped.val[1]| holds the high halves and is discarded.
  return vreinterpret_s8_u32(zipped.val[0]);
#endif
}
651*09537850SAkhilesh Sanikop 
// Interleaves the high 32-bit halves of |a| and |b|.
inline uint8x8_t InterleaveHigh32(const uint8x8_t a, const uint8x8_t b) {
#if defined(__aarch64__)
  return vreinterpret_u8_u32(
      vzip2_u32(vreinterpret_u32_u8(a), vreinterpret_u32_u8(b)));
#else
  const uint32x2x2_t zipped =
      vzip_u32(vreinterpret_u32_u8(a), vreinterpret_u32_u8(b));
  // |zipped.val[0]| holds the low halves and is discarded.
  return vreinterpret_u8_u32(zipped.val[1]);
#endif
}
662*09537850SAkhilesh Sanikop 
// Interleaves the high 32-bit halves of |a| and |b|.
inline int8x8_t InterleaveHigh32(const int8x8_t a, const int8x8_t b) {
#if defined(__aarch64__)
  return vreinterpret_s8_u32(
      vzip2_u32(vreinterpret_u32_s8(a), vreinterpret_u32_s8(b)));
#else
  const uint32x2x2_t zipped =
      vzip_u32(vreinterpret_u32_s8(a), vreinterpret_u32_s8(b));
  // |zipped.val[0]| holds the low halves and is discarded.
  return vreinterpret_s8_u32(zipped.val[1]);
#endif
}
673*09537850SAkhilesh Sanikop 
//------------------------------------------------------------------------------
// Sum.

// Returns the sum of all 8 bytes of |a|. The maximum possible sum
// (8 * 255 = 2040) always fits in 16 bits.
inline uint16_t SumVector(const uint8x8_t a) {
#if defined(__aarch64__)
  return vaddlv_u8(a);
#else
  // Successive pairwise widening adds reduce 8x8 -> 4x16 -> 2x32 -> 1x64.
  const uint64x1_t sum = vpaddl_u32(vpaddl_u16(vpaddl_u8(a)));
  return static_cast<uint16_t>(vget_lane_u64(sum, 0));
#endif  // defined(__aarch64__)
}
687*09537850SAkhilesh Sanikop 
// Returns the sum of the two 32-bit lanes of |a|, truncated to 32 bits.
inline uint32_t SumVector(const uint32x2_t a) {
#if defined(__aarch64__)
  return vaddv_u32(a);
#else
  // The widening pairwise add keeps any carry, but only the low 32 bits of
  // the 64-bit result are returned.
  const uint64x1_t sum = vpaddl_u32(a);
  return vget_lane_u32(vreinterpret_u32_u64(sum), 0);
#endif  // defined(__aarch64__)
}
696*09537850SAkhilesh Sanikop 
// Returns the sum of the four 32-bit lanes of |a|, truncated to 32 bits.
inline uint32_t SumVector(const uint32x4_t a) {
#if defined(__aarch64__)
  return vaddvq_u32(a);
#else
  // Pairwise widen to two 64-bit lanes, then add the two lanes together.
  const uint64x2_t pairs = vpaddlq_u32(a);
  const uint64x1_t sum = vadd_u64(vget_low_u64(pairs), vget_high_u64(pairs));
  return static_cast<uint32_t>(vget_lane_u64(sum, 0));
#endif
}
706*09537850SAkhilesh Sanikop 
//------------------------------------------------------------------------------
// Transpose.

// Transpose 32 bit elements such that:
// a: 00 01
// b: 02 03
// returns
// val[0]: 00 02
// val[1]: 01 03
inline uint8x8x2_t Interleave32(const uint8x8_t a, const uint8x8_t b) {
  const uint32x2x2_t transposed =
      vtrn_u32(vreinterpret_u32_u8(a), vreinterpret_u32_u8(b));
  const uint8x8x2_t result = {vreinterpret_u8_u32(transposed.val[0]),
                              vreinterpret_u8_u32(transposed.val[1])};
  return result;
}
724*09537850SAkhilesh Sanikop 
// Swap high and low 32 bit elements.
inline uint8x8_t Transpose32(const uint8x8_t a) {
  const uint32x2_t reversed = vrev64_u32(vreinterpret_u32_u8(a));
  return vreinterpret_u8_u32(reversed);
}
730*09537850SAkhilesh Sanikop 
// Swap high and low halves.
inline uint16x8_t Transpose64(const uint16x8_t a) {
  // Rotating by 4 u16 lanes exchanges the two 64-bit halves of the vector.
  return vextq_u16(a, a, 4);
}
733*09537850SAkhilesh Sanikop 
// Implement vtrnq_s64().
// Input:
// a0: 00 01 02 03 04 05 06 07
// a1: 16 17 18 19 20 21 22 23
// Output:
// b0.val[0]: 00 01 02 03 16 17 18 19
// b0.val[1]: 04 05 06 07 20 21 22 23
inline int16x8x2_t VtrnqS64(const int32x4_t a0, const int32x4_t a1) {
  const int16x8x2_t b0 = {
      vcombine_s16(vreinterpret_s16_s32(vget_low_s32(a0)),
                   vreinterpret_s16_s32(vget_low_s32(a1))),
      vcombine_s16(vreinterpret_s16_s32(vget_high_s32(a0)),
                   vreinterpret_s16_s32(vget_high_s32(a1)))};
  return b0;
}
749*09537850SAkhilesh Sanikop 
// Unsigned counterpart of VtrnqS64(): interleaves the 64-bit halves of |a0|
// and |a1|.
inline uint16x8x2_t VtrnqU64(const uint32x4_t a0, const uint32x4_t a1) {
  const uint16x8x2_t b0 = {
      vcombine_u16(vreinterpret_u16_u32(vget_low_u32(a0)),
                   vreinterpret_u16_u32(vget_low_u32(a1))),
      vcombine_u16(vreinterpret_u16_u32(vget_high_u32(a0)),
                   vreinterpret_u16_u32(vget_high_u32(a1)))};
  return b0;
}
758*09537850SAkhilesh Sanikop 
// Transposes a 4x4 block of 16-bit elements held in four 64-bit vectors.
// Input:
// 00 01 02 03
// 10 11 12 13
// 20 21 22 23
// 30 31 32 33
// Output:
// 00 10 20 30
// 01 11 21 31
// 02 12 22 32
// 03 13 23 33
inline void Transpose4x4(uint16x4_t a[4]) {
  // Swap 16 bit elements within each row pair.
  // b:
  // 00 10 02 12
  // 01 11 03 13
  const uint16x4x2_t b = vtrn_u16(a[0], a[1]);
  // c:
  // 20 30 22 32
  // 21 31 23 33
  const uint16x4x2_t c = vtrn_u16(a[2], a[3]);
  // Swap 32 bit element pairs across the rows to finish the transpose.
  // d:
  // 00 10 20 30
  // 02 12 22 32
  const uint32x2x2_t d =
      vtrn_u32(vreinterpret_u32_u16(b.val[0]), vreinterpret_u32_u16(c.val[0]));
  // e:
  // 01 11 21 31
  // 03 13 23 33
  const uint32x2x2_t e =
      vtrn_u32(vreinterpret_u32_u16(b.val[1]), vreinterpret_u32_u16(c.val[1]));
  a[0] = vreinterpret_u16_u32(d.val[0]);
  a[1] = vreinterpret_u16_u32(e.val[0]);
  a[2] = vreinterpret_u16_u32(d.val[1]);
  a[3] = vreinterpret_u16_u32(e.val[1]);
}
793*09537850SAkhilesh Sanikop 
// Input:
// a: 00 01 02 03 10 11 12 13
// b: 20 21 22 23 30 31 32 33
// Output:
// Note that columns [1] and [2] are transposed.
// a: 00 10 20 30 02 12 22 32
// b: 01 11 21 31 03 13 23 33
inline void Transpose4x4(uint8x8_t* a, uint8x8_t* b) {
  // Swap 16-bit, then 32-bit, then 8-bit elements to complete the transpose.
  const uint16x4x2_t trn16 =
      vtrn_u16(vreinterpret_u16_u8(*a), vreinterpret_u16_u8(*b));
  const uint32x2x2_t trn32 = vtrn_u32(vreinterpret_u32_u16(trn16.val[0]),
                                      vreinterpret_u32_u16(trn16.val[1]));
  const uint8x8x2_t trn8 = vtrn_u8(vreinterpret_u8_u32(trn32.val[0]),
                                   vreinterpret_u8_u32(trn32.val[1]));
  *a = trn8.val[0];
  *b = trn8.val[1];
}
811*09537850SAkhilesh Sanikop 
// Transposes a 4 (rows) x 8 (columns) block of 16-bit elements into 8x4.
// 4x8 Input:
// a[0]: 00 01 02 03 04 05 06 07
// a[1]: 10 11 12 13 14 15 16 17
// a[2]: 20 21 22 23 24 25 26 27
// a[3]: 30 31 32 33 34 35 36 37
// 8x4 Output:
// a[0]: 00 10 20 30 04 14 24 34
// a[1]: 01 11 21 31 05 15 25 35
// a[2]: 02 12 22 32 06 16 26 36
// a[3]: 03 13 23 33 07 17 27 37
inline void Transpose4x8(uint16x8_t a[4]) {
  // Swap 16 bit elements:
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37
  const uint16x8x2_t b0 = vtrnq_u16(a[0], a[1]);
  const uint16x8x2_t b1 = vtrnq_u16(a[2], a[3]);

  // Swap 32 bit elements:
  // c0.val[0]: 00 10 20 30 04 14 24 34
  // c0.val[1]: 02 12 22 32 06 16 26 36
  // c1.val[0]: 01 11 21 31 05 15 25 35
  // c1.val[1]: 03 13 23 33 07 17 27 37
  const uint32x4x2_t c0 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[0]),
                                    vreinterpretq_u32_u16(b1.val[0]));
  const uint32x4x2_t c1 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[1]),
                                    vreinterpretq_u32_u16(b1.val[1]));

  a[0] = vreinterpretq_u16_u32(c0.val[0]);
  a[1] = vreinterpretq_u16_u32(c1.val[0]);
  a[2] = vreinterpretq_u16_u32(c0.val[1]);
  a[3] = vreinterpretq_u16_u32(c1.val[1]);
}
844*09537850SAkhilesh Sanikop 
// Special transpose for loop filter.
// 4x8 Input:
// p_q:  p3 p2 p1 p0 q0 q1 q2 q3
// a[0]: 00 01 02 03 04 05 06 07
// a[1]: 10 11 12 13 14 15 16 17
// a[2]: 20 21 22 23 24 25 26 27
// a[3]: 30 31 32 33 34 35 36 37
// 8x4 Output:
// a[0]: 03 13 23 33 04 14 24 34  p0q0
// a[1]: 02 12 22 32 05 15 25 35  p1q1
// a[2]: 01 11 21 31 06 16 26 36  p2q2
// a[3]: 00 10 20 30 07 17 27 37  p3q3
// Direct reapplication of the function will reset the high halves, but
// reverse the low halves:
// p_q:  p0 p1 p2 p3 q0 q1 q2 q3
// a[0]: 33 32 31 30 04 05 06 07
// a[1]: 23 22 21 20 14 15 16 17
// a[2]: 13 12 11 10 24 25 26 27
// a[3]: 03 02 01 00 34 35 36 37
// Simply reordering the inputs (3, 2, 1, 0) will reset the low halves, but
// reverse the high halves.
// The standard Transpose4x8 will produce the same reversals, but with the
// order of the low halves also restored relative to the high halves. This is
// preferable because it puts all values from the same source row back together,
// but some post-processing is inevitable.
inline void LoopFilterTranspose4x8(uint16x8_t a[4]) {
  // Swap 16 bit elements within each row pair.
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37
  const uint16x8x2_t b0 = vtrnq_u16(a[0], a[1]);
  const uint16x8x2_t b1 = vtrnq_u16(a[2], a[3]);

  // Reverse odd vectors to bring the appropriate items to the front of zips.
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // r0       : 03 13 01 11 07 17 05 15
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // r1       : 23 33 21 31 27 37 25 35
  const uint32x4_t r0 = vrev64q_u32(vreinterpretq_u32_u16(b0.val[1]));
  const uint32x4_t r1 = vrev64q_u32(vreinterpretq_u32_u16(b1.val[1]));

  // Zip to complete the halves.
  // c0.val[0]: 00 10 20 30 02 12 22 32  p3p1
  // c0.val[1]: 04 14 24 34 06 16 26 36  q0q2
  // c1.val[0]: 03 13 23 33 01 11 21 31  p0p2
  // c1.val[1]: 07 17 27 37 05 15 25 35  q3q1
  const uint32x4x2_t c0 = vzipq_u32(vreinterpretq_u32_u16(b0.val[0]),
                                    vreinterpretq_u32_u16(b1.val[0]));
  const uint32x4x2_t c1 = vzipq_u32(r0, r1);

  // Interleave the 64-bit halves to pair each p row with its q counterpart.
  // d0.val[0]: 00 10 20 30 07 17 27 37  p3q3
  // d0.val[1]: 02 12 22 32 05 15 25 35  p1q1
  // d1.val[0]: 03 13 23 33 04 14 24 34  p0q0
  // d1.val[1]: 01 11 21 31 06 16 26 36  p2q2
  const uint16x8x2_t d0 = VtrnqU64(c0.val[0], c1.val[1]);
  // The third row of c comes first here to swap p2 with q0.
  const uint16x8x2_t d1 = VtrnqU64(c1.val[0], c0.val[1]);

  // 8x4 Output:
  // a[0]: 03 13 23 33 04 14 24 34  p0q0
  // a[1]: 02 12 22 32 05 15 25 35  p1q1
  // a[2]: 01 11 21 31 06 16 26 36  p2q2
  // a[3]: 00 10 20 30 07 17 27 37  p3q3
  a[0] = d1.val[0];  // p0q0
  a[1] = d0.val[1];  // p1q1
  a[2] = d1.val[1];  // p2q2
  a[3] = d0.val[0];  // p3q3
}
913*09537850SAkhilesh Sanikop 
// Reversible if the x4 values are packed next to each other.
// x4 input / x8 output:
// a0: 00 01 02 03 40 41 42 43 44
// a1: 10 11 12 13 50 51 52 53 54
// a2: 20 21 22 23 60 61 62 63 64
// a3: 30 31 32 33 70 71 72 73 74
// x8 input / x4 output:
// a0: 00 10 20 30 40 50 60 70
// a1: 01 11 21 31 41 51 61 71
// a2: 02 12 22 32 42 52 62 72
// a3: 03 13 23 33 43 53 63 73
inline void Transpose8x4(uint8x8_t* a0, uint8x8_t* a1, uint8x8_t* a2,
                         uint8x8_t* a3) {
  // Swap 8-bit elements within adjacent row pairs.
  const uint8x8x2_t trn8_01 = vtrn_u8(*a0, *a1);
  const uint8x8x2_t trn8_23 = vtrn_u8(*a2, *a3);

  // Swap 16-bit elements across the pairs to finish the transpose.
  const uint16x4x2_t trn16_even = vtrn_u16(vreinterpret_u16_u8(trn8_01.val[0]),
                                           vreinterpret_u16_u8(trn8_23.val[0]));
  const uint16x4x2_t trn16_odd = vtrn_u16(vreinterpret_u16_u8(trn8_01.val[1]),
                                          vreinterpret_u16_u8(trn8_23.val[1]));

  *a0 = vreinterpret_u8_u16(trn16_even.val[0]);
  *a1 = vreinterpret_u8_u16(trn16_odd.val[0]);
  *a2 = vreinterpret_u8_u16(trn16_even.val[1]);
  *a3 = vreinterpret_u8_u16(trn16_odd.val[1]);
}
940*09537850SAkhilesh Sanikop 
941*09537850SAkhilesh Sanikop // Input:
942*09537850SAkhilesh Sanikop // a[0]: 00 01 02 03 04 05 06 07
943*09537850SAkhilesh Sanikop // a[1]: 10 11 12 13 14 15 16 17
944*09537850SAkhilesh Sanikop // a[2]: 20 21 22 23 24 25 26 27
945*09537850SAkhilesh Sanikop // a[3]: 30 31 32 33 34 35 36 37
946*09537850SAkhilesh Sanikop // a[4]: 40 41 42 43 44 45 46 47
947*09537850SAkhilesh Sanikop // a[5]: 50 51 52 53 54 55 56 57
948*09537850SAkhilesh Sanikop // a[6]: 60 61 62 63 64 65 66 67
949*09537850SAkhilesh Sanikop // a[7]: 70 71 72 73 74 75 76 77
950*09537850SAkhilesh Sanikop 
951*09537850SAkhilesh Sanikop // Output:
952*09537850SAkhilesh Sanikop // a[0]: 00 10 20 30 40 50 60 70
953*09537850SAkhilesh Sanikop // a[1]: 01 11 21 31 41 51 61 71
954*09537850SAkhilesh Sanikop // a[2]: 02 12 22 32 42 52 62 72
955*09537850SAkhilesh Sanikop // a[3]: 03 13 23 33 43 53 63 73
956*09537850SAkhilesh Sanikop // a[4]: 04 14 24 34 44 54 64 74
957*09537850SAkhilesh Sanikop // a[5]: 05 15 25 35 45 55 65 75
958*09537850SAkhilesh Sanikop // a[6]: 06 16 26 36 46 56 66 76
959*09537850SAkhilesh Sanikop // a[7]: 07 17 27 37 47 57 67 77
// Transposes the 8x8 matrix of int8 values held across a[0..7] in place.
inline void Transpose8x8(int8x8_t a[8]) {
  // Pair rows (i, i+4) into 128-bit registers and interleave 8-bit lanes:
  // row01.val[0]: 00 10 02 12 04 14 06 16  40 50 42 52 44 54 46 56
  // row01.val[1]: 01 11 03 13 05 15 07 17  41 51 43 53 45 55 47 57
  // row23.val[0]: 20 30 22 32 24 34 26 36  60 70 62 72 64 74 66 76
  // row23.val[1]: 21 31 23 33 25 35 27 37  61 71 63 73 65 75 67 77
  const int8x16x2_t row01 =
      vtrnq_s8(vcombine_s8(a[0], a[4]), vcombine_s8(a[1], a[5]));
  const int8x16x2_t row23 =
      vtrnq_s8(vcombine_s8(a[2], a[6]), vcombine_s8(a[3], a[7]));

  // Interleave 16-bit lanes:
  // quad0.val[0]: 00 10 20 30 04 14 24 34  40 50 60 70 44 54 64 74
  // quad0.val[1]: 02 12 22 32 06 16 26 36  42 52 62 72 46 56 66 76
  // quad1.val[0]: 01 11 21 31 05 15 25 35  41 51 61 71 45 55 65 75
  // quad1.val[1]: 03 13 23 33 07 17 27 37  43 53 63 73 47 57 67 77
  const int16x8x2_t quad0 = vtrnq_s16(vreinterpretq_s16_s8(row01.val[0]),
                                      vreinterpretq_s16_s8(row23.val[0]));
  const int16x8x2_t quad1 = vtrnq_s16(vreinterpretq_s16_s8(row01.val[1]),
                                      vreinterpretq_s16_s8(row23.val[1]));

  // Unzip 32-bit lanes; each 64-bit half below is a finished output row:
  // col0.val[0]: 00 10 20 30 40 50 60 70  01 11 21 31 41 51 61 71
  // col0.val[1]: 04 14 24 34 44 54 64 74  05 15 25 35 45 55 65 75
  // col1.val[0]: 02 12 22 32 42 52 62 72  03 13 23 33 43 53 63 73
  // col1.val[1]: 06 16 26 36 46 56 66 76  07 17 27 37 47 57 67 77
  const int32x4x2_t col0 = vuzpq_s32(vreinterpretq_s32_s16(quad0.val[0]),
                                     vreinterpretq_s32_s16(quad1.val[0]));
  const int32x4x2_t col1 = vuzpq_s32(vreinterpretq_s32_s16(quad0.val[1]),
                                     vreinterpretq_s32_s16(quad1.val[1]));

  a[0] = vreinterpret_s8_s32(vget_low_s32(col0.val[0]));
  a[1] = vreinterpret_s8_s32(vget_high_s32(col0.val[0]));
  a[2] = vreinterpret_s8_s32(vget_low_s32(col1.val[0]));
  a[3] = vreinterpret_s8_s32(vget_high_s32(col1.val[0]));
  a[4] = vreinterpret_s8_s32(vget_low_s32(col0.val[1]));
  a[5] = vreinterpret_s8_s32(vget_high_s32(col0.val[1]));
  a[6] = vreinterpret_s8_s32(vget_low_s32(col1.val[1]));
  a[7] = vreinterpret_s8_s32(vget_high_s32(col1.val[1]));
}
1009*09537850SAkhilesh Sanikop 
1010*09537850SAkhilesh Sanikop // Unsigned.
// Transposes the 8x8 matrix of uint8 values held across a[0..7] in place.
// Same sequence as the int8x8_t overload, with unsigned lane types.
inline void Transpose8x8(uint8x8_t a[8]) {
  // 8-bit interleave of combined row pairs (0,4)/(1,5) and (2,6)/(3,7).
  const uint8x16x2_t trn8_lo =
      vtrnq_u8(vcombine_u8(a[0], a[4]), vcombine_u8(a[1], a[5]));
  const uint8x16x2_t trn8_hi =
      vtrnq_u8(vcombine_u8(a[2], a[6]), vcombine_u8(a[3], a[7]));

  // 16-bit interleave, gathering 4-element column fragments.
  const uint16x8x2_t trn16_lo =
      vtrnq_u16(vreinterpretq_u16_u8(trn8_lo.val[0]),
                vreinterpretq_u16_u8(trn8_hi.val[0]));
  const uint16x8x2_t trn16_hi =
      vtrnq_u16(vreinterpretq_u16_u8(trn8_lo.val[1]),
                vreinterpretq_u16_u8(trn8_hi.val[1]));

  // 32-bit unzip; each 64-bit half of the results is a finished output row.
  const uint32x4x2_t uzp_lo =
      vuzpq_u32(vreinterpretq_u32_u16(trn16_lo.val[0]),
                vreinterpretq_u32_u16(trn16_hi.val[0]));
  const uint32x4x2_t uzp_hi =
      vuzpq_u32(vreinterpretq_u32_u16(trn16_lo.val[1]),
                vreinterpretq_u32_u16(trn16_hi.val[1]));

  a[0] = vreinterpret_u8_u32(vget_low_u32(uzp_lo.val[0]));
  a[1] = vreinterpret_u8_u32(vget_high_u32(uzp_lo.val[0]));
  a[2] = vreinterpret_u8_u32(vget_low_u32(uzp_hi.val[0]));
  a[3] = vreinterpret_u8_u32(vget_high_u32(uzp_hi.val[0]));
  a[4] = vreinterpret_u8_u32(vget_low_u32(uzp_lo.val[1]));
  a[5] = vreinterpret_u8_u32(vget_high_u32(uzp_lo.val[1]));
  a[6] = vreinterpret_u8_u32(vget_low_u32(uzp_hi.val[1]));
  a[7] = vreinterpret_u8_u32(vget_high_u32(uzp_hi.val[1]));
}
1036*09537850SAkhilesh Sanikop 
// Transposes the 8x8 block held in |in|, leaving the result packed in
// 128-bit registers: out[k] holds transposed rows 2k (low half) and 2k+1
// (high half).
inline void Transpose8x8(uint8x8_t in[8], uint8x16_t out[4]) {
  // 8-bit interleave of combined row pairs (0,4)/(1,5) and (2,6)/(3,7).
  const uint8x16x2_t trn8_lo =
      vtrnq_u8(vcombine_u8(in[0], in[4]), vcombine_u8(in[1], in[5]));
  const uint8x16x2_t trn8_hi =
      vtrnq_u8(vcombine_u8(in[2], in[6]), vcombine_u8(in[3], in[7]));

  // 16-bit interleave.
  const uint16x8x2_t trn16_lo =
      vtrnq_u16(vreinterpretq_u16_u8(trn8_lo.val[0]),
                vreinterpretq_u16_u8(trn8_hi.val[0]));
  const uint16x8x2_t trn16_hi =
      vtrnq_u16(vreinterpretq_u16_u8(trn8_lo.val[1]),
                vreinterpretq_u16_u8(trn8_hi.val[1]));

  // 32-bit unzip; no final 64-bit split since the output stays in q regs.
  const uint32x4x2_t uzp_lo =
      vuzpq_u32(vreinterpretq_u32_u16(trn16_lo.val[0]),
                vreinterpretq_u32_u16(trn16_hi.val[0]));
  const uint32x4x2_t uzp_hi =
      vuzpq_u32(vreinterpretq_u32_u16(trn16_lo.val[1]),
                vreinterpretq_u32_u16(trn16_hi.val[1]));

  out[0] = vreinterpretq_u8_u32(uzp_lo.val[0]);
  out[1] = vreinterpretq_u8_u32(uzp_hi.val[0]);
  out[2] = vreinterpretq_u8_u32(uzp_lo.val[1]);
  out[3] = vreinterpretq_u8_u32(uzp_hi.val[1]);
}
1058*09537850SAkhilesh Sanikop 
1059*09537850SAkhilesh Sanikop // Input:
1060*09537850SAkhilesh Sanikop // a[0]: 00 01 02 03 04 05 06 07
1061*09537850SAkhilesh Sanikop // a[1]: 10 11 12 13 14 15 16 17
1062*09537850SAkhilesh Sanikop // a[2]: 20 21 22 23 24 25 26 27
1063*09537850SAkhilesh Sanikop // a[3]: 30 31 32 33 34 35 36 37
1064*09537850SAkhilesh Sanikop // a[4]: 40 41 42 43 44 45 46 47
1065*09537850SAkhilesh Sanikop // a[5]: 50 51 52 53 54 55 56 57
1066*09537850SAkhilesh Sanikop // a[6]: 60 61 62 63 64 65 66 67
1067*09537850SAkhilesh Sanikop // a[7]: 70 71 72 73 74 75 76 77
1068*09537850SAkhilesh Sanikop 
1069*09537850SAkhilesh Sanikop // Output:
1070*09537850SAkhilesh Sanikop // a[0]: 00 10 20 30 40 50 60 70
1071*09537850SAkhilesh Sanikop // a[1]: 01 11 21 31 41 51 61 71
1072*09537850SAkhilesh Sanikop // a[2]: 02 12 22 32 42 52 62 72
1073*09537850SAkhilesh Sanikop // a[3]: 03 13 23 33 43 53 63 73
1074*09537850SAkhilesh Sanikop // a[4]: 04 14 24 34 44 54 64 74
1075*09537850SAkhilesh Sanikop // a[5]: 05 15 25 35 45 55 65 75
1076*09537850SAkhilesh Sanikop // a[6]: 06 16 26 36 46 56 66 76
1077*09537850SAkhilesh Sanikop // a[7]: 07 17 27 37 47 57 67 77
// Transposes the 8x8 matrix of int16 values in |a| in place.
inline void Transpose8x8(int16x8_t a[8]) {
  // Stage 1: interleave 16-bit lanes of adjacent row pairs.
  const int16x8x2_t pair01 = vtrnq_s16(a[0], a[1]);
  const int16x8x2_t pair23 = vtrnq_s16(a[2], a[3]);
  const int16x8x2_t pair45 = vtrnq_s16(a[4], a[5]);
  const int16x8x2_t pair67 = vtrnq_s16(a[6], a[7]);

  // Stage 2: interleave 32-bit lanes, gathering 4x4 sub-blocks.
  const int32x4x2_t quad0 = vtrnq_s32(vreinterpretq_s32_s16(pair01.val[0]),
                                      vreinterpretq_s32_s16(pair23.val[0]));
  const int32x4x2_t quad1 = vtrnq_s32(vreinterpretq_s32_s16(pair01.val[1]),
                                      vreinterpretq_s32_s16(pair23.val[1]));
  const int32x4x2_t quad2 = vtrnq_s32(vreinterpretq_s32_s16(pair45.val[0]),
                                      vreinterpretq_s32_s16(pair67.val[0]));
  const int32x4x2_t quad3 = vtrnq_s32(vreinterpretq_s32_s16(pair45.val[1]),
                                      vreinterpretq_s32_s16(pair67.val[1]));

  // Stage 3: swap 64-bit halves (VtrnqS64 helper) to finish the transpose.
  const int16x8x2_t out0 = VtrnqS64(quad0.val[0], quad2.val[0]);
  const int16x8x2_t out1 = VtrnqS64(quad1.val[0], quad3.val[0]);
  const int16x8x2_t out2 = VtrnqS64(quad0.val[1], quad2.val[1]);
  const int16x8x2_t out3 = VtrnqS64(quad1.val[1], quad3.val[1]);

  a[0] = out0.val[0];
  a[1] = out1.val[0];
  a[2] = out2.val[0];
  a[3] = out3.val[0];
  a[4] = out0.val[1];
  a[5] = out1.val[1];
  a[6] = out2.val[1];
  a[7] = out3.val[1];
}
1107*09537850SAkhilesh Sanikop 
1108*09537850SAkhilesh Sanikop // Unsigned.
// Transposes the 8x8 matrix of uint16 values in |a| in place.
// Same sequence as the int16x8_t overload, with unsigned lane types.
inline void Transpose8x8(uint16x8_t a[8]) {
  // Stage 1: interleave 16-bit lanes of adjacent row pairs.
  const uint16x8x2_t pair01 = vtrnq_u16(a[0], a[1]);
  const uint16x8x2_t pair23 = vtrnq_u16(a[2], a[3]);
  const uint16x8x2_t pair45 = vtrnq_u16(a[4], a[5]);
  const uint16x8x2_t pair67 = vtrnq_u16(a[6], a[7]);

  // Stage 2: interleave 32-bit lanes, gathering 4x4 sub-blocks.
  const uint32x4x2_t quad0 = vtrnq_u32(vreinterpretq_u32_u16(pair01.val[0]),
                                       vreinterpretq_u32_u16(pair23.val[0]));
  const uint32x4x2_t quad1 = vtrnq_u32(vreinterpretq_u32_u16(pair01.val[1]),
                                       vreinterpretq_u32_u16(pair23.val[1]));
  const uint32x4x2_t quad2 = vtrnq_u32(vreinterpretq_u32_u16(pair45.val[0]),
                                       vreinterpretq_u32_u16(pair67.val[0]));
  const uint32x4x2_t quad3 = vtrnq_u32(vreinterpretq_u32_u16(pair45.val[1]),
                                       vreinterpretq_u32_u16(pair67.val[1]));

  // Stage 3: swap 64-bit halves (VtrnqU64 helper) to finish the transpose.
  const uint16x8x2_t out0 = VtrnqU64(quad0.val[0], quad2.val[0]);
  const uint16x8x2_t out1 = VtrnqU64(quad1.val[0], quad3.val[0]);
  const uint16x8x2_t out2 = VtrnqU64(quad0.val[1], quad2.val[1]);
  const uint16x8x2_t out3 = VtrnqU64(quad1.val[1], quad3.val[1]);

  a[0] = out0.val[0];
  a[1] = out1.val[0];
  a[2] = out2.val[0];
  a[3] = out3.val[0];
  a[4] = out0.val[1];
  a[5] = out1.val[1];
  a[6] = out2.val[1];
  a[7] = out3.val[1];
}
1138*09537850SAkhilesh Sanikop 
1139*09537850SAkhilesh Sanikop // Input:
1140*09537850SAkhilesh Sanikop // a[0]: 00 01 02 03 04 05 06 07  80 81 82 83 84 85 86 87
1141*09537850SAkhilesh Sanikop // a[1]: 10 11 12 13 14 15 16 17  90 91 92 93 94 95 96 97
1142*09537850SAkhilesh Sanikop // a[2]: 20 21 22 23 24 25 26 27  a0 a1 a2 a3 a4 a5 a6 a7
1143*09537850SAkhilesh Sanikop // a[3]: 30 31 32 33 34 35 36 37  b0 b1 b2 b3 b4 b5 b6 b7
1144*09537850SAkhilesh Sanikop // a[4]: 40 41 42 43 44 45 46 47  c0 c1 c2 c3 c4 c5 c6 c7
1145*09537850SAkhilesh Sanikop // a[5]: 50 51 52 53 54 55 56 57  d0 d1 d2 d3 d4 d5 d6 d7
1146*09537850SAkhilesh Sanikop // a[6]: 60 61 62 63 64 65 66 67  e0 e1 e2 e3 e4 e5 e6 e7
1147*09537850SAkhilesh Sanikop // a[7]: 70 71 72 73 74 75 76 77  f0 f1 f2 f3 f4 f5 f6 f7
1148*09537850SAkhilesh Sanikop 
1149*09537850SAkhilesh Sanikop // Output:
1150*09537850SAkhilesh Sanikop // a[0]: 00 10 20 30 40 50 60 70  80 90 a0 b0 c0 d0 e0 f0
1151*09537850SAkhilesh Sanikop // a[1]: 01 11 21 31 41 51 61 71  81 91 a1 b1 c1 d1 e1 f1
1152*09537850SAkhilesh Sanikop // a[2]: 02 12 22 32 42 52 62 72  82 92 a2 b2 c2 d2 e2 f2
1153*09537850SAkhilesh Sanikop // a[3]: 03 13 23 33 43 53 63 73  83 93 a3 b3 c3 d3 e3 f3
1154*09537850SAkhilesh Sanikop // a[4]: 04 14 24 34 44 54 64 74  84 94 a4 b4 c4 d4 e4 f4
1155*09537850SAkhilesh Sanikop // a[5]: 05 15 25 35 45 55 65 75  85 95 a5 b5 c5 d5 e5 f5
1156*09537850SAkhilesh Sanikop // a[6]: 06 16 26 36 46 56 66 76  86 96 a6 b6 c6 d6 e6 f6
1157*09537850SAkhilesh Sanikop // a[7]: 07 17 27 37 47 57 67 77  87 97 a7 b7 c7 d7 e7 f7
inline void Transpose8x16(uint8x16_t a[8]) {
  // Treats |a| as two side-by-side 8x8 byte matrices (low and high halves of
  // each register) and transposes both in place; see the Input/Output diagram
  // above.
  // b0.val[0]: 00 10 02 12 04 14 06 16  80 90 82 92 84 94 86 96
  // b0.val[1]: 01 11 03 13 05 15 07 17  81 91 83 93 85 95 87 97
  // b1.val[0]: 20 30 22 32 24 34 26 36  a0 b0 a2 b2 a4 b4 a6 b6
  // b1.val[1]: 21 31 23 33 25 35 27 37  a1 b1 a3 b3 a5 b5 a7 b7
  // b2.val[0]: 40 50 42 52 44 54 46 56  c0 d0 c2 d2 c4 d4 c6 d6
  // b2.val[1]: 41 51 43 53 45 55 47 57  c1 d1 c3 d3 c5 d5 c7 d7
  // b3.val[0]: 60 70 62 72 64 74 66 76  e0 f0 e2 f2 e4 f4 e6 f6
  // b3.val[1]: 61 71 63 73 65 75 67 77  e1 f1 e3 f3 e5 f5 e7 f7
  const uint8x16x2_t b0 = vtrnq_u8(a[0], a[1]);
  const uint8x16x2_t b1 = vtrnq_u8(a[2], a[3]);
  const uint8x16x2_t b2 = vtrnq_u8(a[4], a[5]);
  const uint8x16x2_t b3 = vtrnq_u8(a[6], a[7]);

  // Swap 16 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34  80 90 a0 b0 84 94 a4 b4
  // c0.val[1]: 02 12 22 32 06 16 26 36  82 92 a2 b2 86 96 a6 b6
  // c1.val[0]: 01 11 21 31 05 15 25 35  81 91 a1 b1 85 95 a5 b5
  // c1.val[1]: 03 13 23 33 07 17 27 37  83 93 a3 b3 87 97 a7 b7
  // c2.val[0]: 40 50 60 70 44 54 64 74  c0 d0 e0 f0 c4 d4 e4 f4
  // c2.val[1]: 42 52 62 72 46 56 66 76  c2 d2 e2 f2 c6 d6 e6 f6
  // c3.val[0]: 41 51 61 71 45 55 65 75  c1 d1 e1 f1 c5 d5 e5 f5
  // c3.val[1]: 43 53 63 73 47 57 67 77  c3 d3 e3 f3 c7 d7 e7 f7
  const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]),
                                    vreinterpretq_u16_u8(b1.val[0]));
  const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]),
                                    vreinterpretq_u16_u8(b1.val[1]));
  const uint16x8x2_t c2 = vtrnq_u16(vreinterpretq_u16_u8(b2.val[0]),
                                    vreinterpretq_u16_u8(b3.val[0]));
  const uint16x8x2_t c3 = vtrnq_u16(vreinterpretq_u16_u8(b2.val[1]),
                                    vreinterpretq_u16_u8(b3.val[1]));

  // Swap 32 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 40 50 60 70  80 90 a0 b0 c0 d0 e0 f0
  // d0.val[1]: 04 14 24 34 44 54 64 74  84 94 a4 b4 c4 d4 e4 f4
  // d1.val[0]: 01 11 21 31 41 51 61 71  81 91 a1 b1 c1 d1 e1 f1
  // d1.val[1]: 05 15 25 35 45 55 65 75  85 95 a5 b5 c5 d5 e5 f5
  // d2.val[0]: 02 12 22 32 42 52 62 72  82 92 a2 b2 c2 d2 e2 f2
  // d2.val[1]: 06 16 26 36 46 56 66 76  86 96 a6 b6 c6 d6 e6 f6
  // d3.val[0]: 03 13 23 33 43 53 63 73  83 93 a3 b3 c3 d3 e3 f3
  // d3.val[1]: 07 17 27 37 47 57 67 77  87 97 a7 b7 c7 d7 e7 f7
  const uint32x4x2_t d0 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[0]),
                                    vreinterpretq_u32_u16(c2.val[0]));
  const uint32x4x2_t d1 = vtrnq_u32(vreinterpretq_u32_u16(c1.val[0]),
                                    vreinterpretq_u32_u16(c3.val[0]));
  const uint32x4x2_t d2 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[1]),
                                    vreinterpretq_u32_u16(c2.val[1]));
  const uint32x4x2_t d3 = vtrnq_u32(vreinterpretq_u32_u16(c1.val[1]),
                                    vreinterpretq_u32_u16(c3.val[1]));

  a[0] = vreinterpretq_u8_u32(d0.val[0]);
  a[1] = vreinterpretq_u8_u32(d1.val[0]);
  a[2] = vreinterpretq_u8_u32(d2.val[0]);
  a[3] = vreinterpretq_u8_u32(d3.val[0]);
  a[4] = vreinterpretq_u8_u32(d0.val[1]);
  a[5] = vreinterpretq_u8_u32(d1.val[1]);
  a[6] = vreinterpretq_u8_u32(d2.val[1]);
  a[7] = vreinterpretq_u8_u32(d3.val[1]);
}
1215*09537850SAkhilesh Sanikop 
// Zero-extends each uint8 lane of |in| to 16 bits and reinterprets the
// widened vector as signed int16 lanes.
inline int16x8_t ZeroExtend(const uint8x8_t in) {
  const uint16x8_t widened = vmovl_u8(in);
  return vreinterpretq_s16_u16(widened);
}
1219*09537850SAkhilesh Sanikop 
1220*09537850SAkhilesh Sanikop }  // namespace dsp
1221*09537850SAkhilesh Sanikop }  // namespace libgav1
1222*09537850SAkhilesh Sanikop 
1223*09537850SAkhilesh Sanikop #endif  // LIBGAV1_ENABLE_NEON
1224*09537850SAkhilesh Sanikop #endif  // LIBGAV1_SRC_DSP_ARM_COMMON_NEON_H_
1225