/*
 * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#ifndef VPX_VPX_DSP_ARM_TRANSPOSE_NEON_H_
#define VPX_VPX_DSP_ARM_TRANSPOSE_NEON_H_

#include <arm_neon.h>

#include "./vpx_config.h"

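// Transpose helpers for blocks of NEON registers. The 64 bit transposes
// below use the AArch64-only TRN1/TRN2 instructions (vtrn1q/vtrn2q) when
// available; on 32 bit Arm the same permutation is built by recombining the
// low and high halves of the two input registers.
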
// Transpose 64 bit elements as follows:
// a0: 00 01 02 03 04 05 06 07
// a1: 16 17 18 19 20 21 22 23
//
// b0.val[0]: 00 01 02 03 16 17 18 19
// b0.val[1]: 04 05 06 07 20 21 22 23
static INLINE int16x8x2_t vpx_vtrnq_s64_to_s16(int32x4_t a0, int32x4_t a1) {
  int16x8x2_t b0;
#if VPX_ARCH_AARCH64
  b0.val[0] = vreinterpretq_s16_s64(
      vtrn1q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1)));
  b0.val[1] = vreinterpretq_s16_s64(
      vtrn2q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1)));
#else
  b0.val[0] = vcombine_s16(vreinterpret_s16_s32(vget_low_s32(a0)),
                           vreinterpret_s16_s32(vget_low_s32(a1)));
  b0.val[1] = vcombine_s16(vreinterpret_s16_s32(vget_high_s32(a0)),
                           vreinterpret_s16_s32(vget_high_s32(a1)));
#endif
  return b0;
}

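// Transpose 64 bit elements as follows:
// a0: 00 01 02 03
// a1: 10 11 12 13
//
// b0.val[0]: 00 01 10 11
// b0.val[1]: 02 03 12 13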
static INLINE int32x4x2_t vpx_vtrnq_s64_to_s32(int32x4_t a0, int32x4_t a1) {
  int32x4x2_t b0;
#if VPX_ARCH_AARCH64
  b0.val[0] = vreinterpretq_s32_s64(
      vtrn1q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1)));
  b0.val[1] = vreinterpretq_s32_s64(
      vtrn2q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1)));
#else
  b0.val[0] = vcombine_s32(vget_low_s32(a0), vget_low_s32(a1));
  b0.val[1] = vcombine_s32(vget_high_s32(a0), vget_high_s32(a1));
#endif
  return b0;
}

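// As above, but keep the result as 64 bit lanes:
// a0: 00 01 02 03
// a1: 10 11 12 13
//
// b0.val[0]: 00 01 10 11
// b0.val[1]: 02 03 12 13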
static INLINE int64x2x2_t vpx_vtrnq_s64(int32x4_t a0, int32x4_t a1) {
  int64x2x2_t b0;
#if VPX_ARCH_AARCH64
  b0.val[0] = vtrn1q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1));
  b0.val[1] = vtrn2q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1));
#else
  b0.val[0] = vcombine_s64(vreinterpret_s64_s32(vget_low_s32(a0)),
                           vreinterpret_s64_s32(vget_low_s32(a1)));
  b0.val[1] = vcombine_s64(vreinterpret_s64_s32(vget_high_s32(a0)),
                           vreinterpret_s64_s32(vget_high_s32(a1)));
#endif
  return b0;
}

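// Transpose 64 bit elements as follows:
// a0: 00 01 02 03 04 05 06 07 08 09 0A 0B 0C 0D 0E 0F
// a1: 10 11 12 13 14 15 16 17 18 19 1A 1B 1C 1D 1E 1F
//
// b0.val[0]: 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17
// b0.val[1]: 08 09 0A 0B 0C 0D 0E 0F 18 19 1A 1B 1C 1D 1E 1F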
static INLINE uint8x16x2_t vpx_vtrnq_u64_to_u8(uint32x4_t a0, uint32x4_t a1) {
  uint8x16x2_t b0;
#if VPX_ARCH_AARCH64
  b0.val[0] = vreinterpretq_u8_u64(
      vtrn1q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1)));
  b0.val[1] = vreinterpretq_u8_u64(
      vtrn2q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1)));
#else
  b0.val[0] = vcombine_u8(vreinterpret_u8_u32(vget_low_u32(a0)),
                          vreinterpret_u8_u32(vget_low_u32(a1)));
  b0.val[1] = vcombine_u8(vreinterpret_u8_u32(vget_high_u32(a0)),
                          vreinterpret_u8_u32(vget_high_u32(a1)));
#endif
  return b0;
}

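// Transpose 64 bit elements as follows:
// a0: 00 01 02 03 04 05 06 07
// a1: 10 11 12 13 14 15 16 17
//
// b0.val[0]: 00 01 02 03 10 11 12 13
// b0.val[1]: 04 05 06 07 14 15 16 17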
static INLINE uint16x8x2_t vpx_vtrnq_u64_to_u16(uint32x4_t a0, uint32x4_t a1) {
  uint16x8x2_t b0;
#if VPX_ARCH_AARCH64
  b0.val[0] = vreinterpretq_u16_u64(
      vtrn1q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1)));
  b0.val[1] = vreinterpretq_u16_u64(
      vtrn2q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1)));
#else
  b0.val[0] = vcombine_u16(vreinterpret_u16_u32(vget_low_u32(a0)),
                           vreinterpret_u16_u32(vget_low_u32(a1)));
  b0.val[1] = vcombine_u16(vreinterpret_u16_u32(vget_high_u32(a0)),
                           vreinterpret_u16_u32(vget_high_u32(a1)));
#endif
  return b0;
}

static INLINE void transpose_u8_4x4(uint8x8_t *a0, uint8x8_t *a1) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03 10 11 12 13
  // a1: 20 21 22 23 30 31 32 33
  // to:
  // b0.val[0]: 00 01 20 21 10 11 30 31
  // b0.val[1]: 02 03 22 23 12 13 32 33

  const uint16x4x2_t b0 =
      vtrn_u16(vreinterpret_u16_u8(*a0), vreinterpret_u16_u8(*a1));

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 01 20 21 02 03 22 23
  // c0.val[1]: 10 11 30 31 12 13 32 33

  const uint32x2x2_t c0 = vtrn_u32(vreinterpret_u32_u16(b0.val[0]),
                                   vreinterpret_u32_u16(b0.val[1]));

  // Swap 8 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 02 12 22 32
  // d0.val[1]: 01 11 21 31 03 13 23 33

  const uint8x8x2_t d0 =
      vtrn_u8(vreinterpret_u8_u32(c0.val[0]), vreinterpret_u8_u32(c0.val[1]));

  *a0 = d0.val[0];
  *a1 = d0.val[1];
}

static INLINE void transpose_s16_4x4d(int16x4_t *a0, int16x4_t *a1,
                                      int16x4_t *a2, int16x4_t *a3) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03
  // a1: 10 11 12 13
  // a2: 20 21 22 23
  // a3: 30 31 32 33
  // to:
  // b0.val[0]: 00 10 02 12
  // b0.val[1]: 01 11 03 13
  // b1.val[0]: 20 30 22 32
  // b1.val[1]: 21 31 23 33

  const int16x4x2_t b0 = vtrn_s16(*a0, *a1);
  const int16x4x2_t b1 = vtrn_s16(*a2, *a3);

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 10 20 30
  // c0.val[1]: 02 12 22 32
  // c1.val[0]: 01 11 21 31
  // c1.val[1]: 03 13 23 33

  const int32x2x2_t c0 = vtrn_s32(vreinterpret_s32_s16(b0.val[0]),
                                  vreinterpret_s32_s16(b1.val[0]));
  const int32x2x2_t c1 = vtrn_s32(vreinterpret_s32_s16(b0.val[1]),
                                  vreinterpret_s32_s16(b1.val[1]));

  *a0 = vreinterpret_s16_s32(c0.val[0]);
  *a1 = vreinterpret_s16_s32(c1.val[0]);
  *a2 = vreinterpret_s16_s32(c0.val[1]);
  *a3 = vreinterpret_s16_s32(c1.val[1]);
}

static INLINE void transpose_s16_4x4q(int16x8_t *a0, int16x8_t *a1) {
  // Swap 32 bit elements. Goes from:
  // a0: 00 01 02 03 10 11 12 13
  // a1: 20 21 22 23 30 31 32 33
  // to:
  // b0.val[0]: 00 01 20 21 10 11 30 31
  // b0.val[1]: 02 03 22 23 12 13 32 33

  const int32x4x2_t b0 =
      vtrnq_s32(vreinterpretq_s32_s16(*a0), vreinterpretq_s32_s16(*a1));

  // Swap 64 bit elements resulting in:
  // c0.val[0]: 00 01 20 21 02 03 22 23
  // c0.val[1]: 10 11 30 31 12 13 32 33

  const int16x8x2_t c0 = vpx_vtrnq_s64_to_s16(b0.val[0], b0.val[1]);

  // Swap 16 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 02 12 22 32
  // d0.val[1]: 01 11 21 31 03 13 23 33

  const int16x8x2_t d0 = vtrnq_s16(c0.val[0], c0.val[1]);

  *a0 = d0.val[0];
  *a1 = d0.val[1];
}

static INLINE void transpose_u16_4x4q(uint16x8_t *a0, uint16x8_t *a1) {
  // Swap 32 bit elements. Goes from:
  // a0: 00 01 02 03 10 11 12 13
  // a1: 20 21 22 23 30 31 32 33
  // to:
  // b0.val[0]: 00 01 20 21 10 11 30 31
  // b0.val[1]: 02 03 22 23 12 13 32 33

  const uint32x4x2_t b0 =
      vtrnq_u32(vreinterpretq_u32_u16(*a0), vreinterpretq_u32_u16(*a1));

  // Swap 64 bit elements resulting in:
  // c0.val[0]: 00 01 20 21 02 03 22 23
  // c0.val[1]: 10 11 30 31 12 13 32 33

  const uint16x8x2_t c0 = vpx_vtrnq_u64_to_u16(b0.val[0], b0.val[1]);

  // Swap 16 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 02 12 22 32
  // d0.val[1]: 01 11 21 31 03 13 23 33

  const uint16x8x2_t d0 = vtrnq_u16(c0.val[0], c0.val[1]);

  *a0 = d0.val[0];
  *a1 = d0.val[1];
}

static INLINE void transpose_u8_4x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
                                    uint8x8_t *a3, const uint8x8_t a4,
                                    const uint8x8_t a5, const uint8x8_t a6,
                                    const uint8x8_t a7) {
  // Swap 32 bit elements. Goes from:
  // a0: 00 01 02 03 XX XX XX XX
  // a1: 10 11 12 13 XX XX XX XX
  // a2: 20 21 22 23 XX XX XX XX
  // a3: 30 31 32 33 XX XX XX XX
  // a4: 40 41 42 43 XX XX XX XX
  // a5: 50 51 52 53 XX XX XX XX
  // a6: 60 61 62 63 XX XX XX XX
  // a7: 70 71 72 73 XX XX XX XX
  // to:
  // b0.val[0]: 00 01 02 03 40 41 42 43
  // b1.val[0]: 10 11 12 13 50 51 52 53
  // b2.val[0]: 20 21 22 23 60 61 62 63
  // b3.val[0]: 30 31 32 33 70 71 72 73

  const uint32x2x2_t b0 =
      vtrn_u32(vreinterpret_u32_u8(*a0), vreinterpret_u32_u8(a4));
  const uint32x2x2_t b1 =
      vtrn_u32(vreinterpret_u32_u8(*a1), vreinterpret_u32_u8(a5));
  const uint32x2x2_t b2 =
      vtrn_u32(vreinterpret_u32_u8(*a2), vreinterpret_u32_u8(a6));
  const uint32x2x2_t b3 =
      vtrn_u32(vreinterpret_u32_u8(*a3), vreinterpret_u32_u8(a7));

  // Swap 16 bit elements resulting in:
  // c0.val[0]: 00 01 20 21 40 41 60 61
  // c0.val[1]: 02 03 22 23 42 43 62 63
  // c1.val[0]: 10 11 30 31 50 51 70 71
  // c1.val[1]: 12 13 32 33 52 53 72 73

  const uint16x4x2_t c0 = vtrn_u16(vreinterpret_u16_u32(b0.val[0]),
                                   vreinterpret_u16_u32(b2.val[0]));
  const uint16x4x2_t c1 = vtrn_u16(vreinterpret_u16_u32(b1.val[0]),
                                   vreinterpret_u16_u32(b3.val[0]));

  // Swap 8 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 40 50 60 70
  // d0.val[1]: 01 11 21 31 41 51 61 71
  // d1.val[0]: 02 12 22 32 42 52 62 72
  // d1.val[1]: 03 13 23 33 43 53 63 73

  const uint8x8x2_t d0 =
      vtrn_u8(vreinterpret_u8_u16(c0.val[0]), vreinterpret_u8_u16(c1.val[0]));
  const uint8x8x2_t d1 =
      vtrn_u8(vreinterpret_u8_u16(c0.val[1]), vreinterpret_u8_u16(c1.val[1]));

  *a0 = d0.val[0];
  *a1 = d0.val[1];
  *a2 = d1.val[0];
  *a3 = d1.val[1];
}

static INLINE void transpose_s32_4x4(int32x4_t *a0, int32x4_t *a1,
                                     int32x4_t *a2, int32x4_t *a3) {
  // Swap 32 bit elements. Goes from:
  // a0: 00 01 02 03
  // a1: 10 11 12 13
  // a2: 20 21 22 23
  // a3: 30 31 32 33
  // to:
  // b0.val[0]: 00 10 02 12
  // b0.val[1]: 01 11 03 13
  // b1.val[0]: 20 30 22 32
  // b1.val[1]: 21 31 23 33

  const int32x4x2_t b0 = vtrnq_s32(*a0, *a1);
  const int32x4x2_t b1 = vtrnq_s32(*a2, *a3);

  // Swap 64 bit elements resulting in:
  // c0.val[0]: 00 10 20 30
  // c0.val[1]: 02 12 22 32
  // c1.val[0]: 01 11 21 31
  // c1.val[1]: 03 13 23 33

  const int32x4x2_t c0 = vpx_vtrnq_s64_to_s32(b0.val[0], b1.val[0]);
  const int32x4x2_t c1 = vpx_vtrnq_s64_to_s32(b0.val[1], b1.val[1]);

  *a0 = c0.val[0];
  *a1 = c1.val[0];
  *a2 = c0.val[1];
  *a3 = c1.val[1];
}

static INLINE void transpose_s16_4x8(const int16x4_t a0, const int16x4_t a1,
                                     const int16x4_t a2, const int16x4_t a3,
                                     const int16x4_t a4, const int16x4_t a5,
                                     const int16x4_t a6, const int16x4_t a7,
                                     int16x8_t *const o0, int16x8_t *const o1,
                                     int16x8_t *const o2, int16x8_t *const o3) {
  // Combine rows. Goes from:
  // a0: 00 01 02 03
  // a1: 10 11 12 13
  // a2: 20 21 22 23
  // a3: 30 31 32 33
  // a4: 40 41 42 43
  // a5: 50 51 52 53
  // a6: 60 61 62 63
  // a7: 70 71 72 73
  // to:
  // b0: 00 01 02 03 40 41 42 43
  // b1: 10 11 12 13 50 51 52 53
  // b2: 20 21 22 23 60 61 62 63
  // b3: 30 31 32 33 70 71 72 73

  const int16x8_t b0 = vcombine_s16(a0, a4);
  const int16x8_t b1 = vcombine_s16(a1, a5);
  const int16x8_t b2 = vcombine_s16(a2, a6);
  const int16x8_t b3 = vcombine_s16(a3, a7);

  // Swap 16 bit elements resulting in:
  // c0.val[0]: 00 10 02 12 40 50 42 52
  // c0.val[1]: 01 11 03 13 41 51 43 53
  // c1.val[0]: 20 30 22 32 60 70 62 72
  // c1.val[1]: 21 31 23 33 61 71 63 73

  const int16x8x2_t c0 = vtrnq_s16(b0, b1);
  const int16x8x2_t c1 = vtrnq_s16(b2, b3);

  // Swap 32 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 40 50 60 70
  // d0.val[1]: 02 12 22 32 42 52 62 72
  // d1.val[0]: 01 11 21 31 41 51 61 71
  // d1.val[1]: 03 13 23 33 43 53 63 73

  const int32x4x2_t d0 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[0]),
                                   vreinterpretq_s32_s16(c1.val[0]));
  const int32x4x2_t d1 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[1]),
                                   vreinterpretq_s32_s16(c1.val[1]));

  *o0 = vreinterpretq_s16_s32(d0.val[0]);
  *o1 = vreinterpretq_s16_s32(d1.val[0]);
  *o2 = vreinterpretq_s16_s32(d0.val[1]);
  *o3 = vreinterpretq_s16_s32(d1.val[1]);
}

static INLINE void transpose_s32_4x8(int32x4_t *const a0, int32x4_t *const a1,
                                     int32x4_t *const a2, int32x4_t *const a3,
                                     int32x4_t *const a4, int32x4_t *const a5,
                                     int32x4_t *const a6, int32x4_t *const a7) {
  // Swap 32 bit elements. Goes from:
  // a0: 00 01 02 03
  // a1: 10 11 12 13
  // a2: 20 21 22 23
  // a3: 30 31 32 33
  // a4: 40 41 42 43
  // a5: 50 51 52 53
  // a6: 60 61 62 63
  // a7: 70 71 72 73
  // to:
  // b0.val[0]: 00 10 02 12
  // b0.val[1]: 01 11 03 13
  // b1.val[0]: 20 30 22 32
  // b1.val[1]: 21 31 23 33
  // b2.val[0]: 40 50 42 52
  // b2.val[1]: 41 51 43 53
  // b3.val[0]: 60 70 62 72
  // b3.val[1]: 61 71 63 73

  const int32x4x2_t b0 = vtrnq_s32(*a0, *a1);
  const int32x4x2_t b1 = vtrnq_s32(*a2, *a3);
  const int32x4x2_t b2 = vtrnq_s32(*a4, *a5);
  const int32x4x2_t b3 = vtrnq_s32(*a6, *a7);

  // Swap 64 bit elements resulting in:
  // c0.val[0]: 00 10 20 30
  // c0.val[1]: 02 12 22 32
  // c1.val[0]: 01 11 21 31
  // c1.val[1]: 03 13 23 33
  // c2.val[0]: 40 50 60 70
  // c2.val[1]: 42 52 62 72
  // c3.val[0]: 41 51 61 71
  // c3.val[1]: 43 53 63 73

  const int64x2x2_t c0 = vpx_vtrnq_s64(b0.val[0], b1.val[0]);
  const int64x2x2_t c1 = vpx_vtrnq_s64(b0.val[1], b1.val[1]);
  const int64x2x2_t c2 = vpx_vtrnq_s64(b2.val[0], b3.val[0]);
  const int64x2x2_t c3 = vpx_vtrnq_s64(b2.val[1], b3.val[1]);

  *a0 = vreinterpretq_s32_s64(c0.val[0]);
  *a1 = vreinterpretq_s32_s64(c2.val[0]);
  *a2 = vreinterpretq_s32_s64(c1.val[0]);
  *a3 = vreinterpretq_s32_s64(c3.val[0]);
  *a4 = vreinterpretq_s32_s64(c0.val[1]);
  *a5 = vreinterpretq_s32_s64(c2.val[1]);
  *a6 = vreinterpretq_s32_s64(c1.val[1]);
  *a7 = vreinterpretq_s32_s64(c3.val[1]);
}

static INLINE void transpose_u8_8x4(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
                                    uint8x8_t *a3) {
  // Swap 8 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 10 11 12 13 14 15 16 17
  // a2: 20 21 22 23 24 25 26 27
  // a3: 30 31 32 33 34 35 36 37
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37

  const uint8x8x2_t b0 = vtrn_u8(*a0, *a1);
  const uint8x8x2_t b1 = vtrn_u8(*a2, *a3);

  // Swap 16 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34
  // c0.val[1]: 02 12 22 32 06 16 26 36
  // c1.val[0]: 01 11 21 31 05 15 25 35
  // c1.val[1]: 03 13 23 33 07 17 27 37

  const uint16x4x2_t c0 =
      vtrn_u16(vreinterpret_u16_u8(b0.val[0]), vreinterpret_u16_u8(b1.val[0]));
  const uint16x4x2_t c1 =
      vtrn_u16(vreinterpret_u16_u8(b0.val[1]), vreinterpret_u16_u8(b1.val[1]));

  *a0 = vreinterpret_u8_u16(c0.val[0]);
  *a1 = vreinterpret_u8_u16(c1.val[0]);
  *a2 = vreinterpret_u8_u16(c0.val[1]);
  *a3 = vreinterpret_u8_u16(c1.val[1]);
}

static INLINE void transpose_u16_8x4(uint16x8_t *a0, uint16x8_t *a1,
                                     uint16x8_t *a2, uint16x8_t *a3) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 10 11 12 13 14 15 16 17
  // a2: 20 21 22 23 24 25 26 27
  // a3: 30 31 32 33 34 35 36 37
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37

  const uint16x8x2_t b0 = vtrnq_u16(*a0, *a1);
  const uint16x8x2_t b1 = vtrnq_u16(*a2, *a3);

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34
  // c0.val[1]: 02 12 22 32 06 16 26 36
  // c1.val[0]: 01 11 21 31 05 15 25 35
  // c1.val[1]: 03 13 23 33 07 17 27 37

  const uint32x4x2_t c0 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[0]),
                                    vreinterpretq_u32_u16(b1.val[0]));
  const uint32x4x2_t c1 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[1]),
                                    vreinterpretq_u32_u16(b1.val[1]));

  *a0 = vreinterpretq_u16_u32(c0.val[0]);
  *a1 = vreinterpretq_u16_u32(c1.val[0]);
  *a2 = vreinterpretq_u16_u32(c0.val[1]);
  *a3 = vreinterpretq_u16_u32(c1.val[1]);
}

static INLINE void transpose_s32_8x4(int32x4_t *const a0, int32x4_t *const a1,
                                     int32x4_t *const a2, int32x4_t *const a3,
                                     int32x4_t *const a4, int32x4_t *const a5,
                                     int32x4_t *const a6, int32x4_t *const a7) {
  // Swap 32 bit elements. Goes from:
  // a0: 00 01 02 03
  // a1: 04 05 06 07
  // a2: 10 11 12 13
  // a3: 14 15 16 17
  // a4: 20 21 22 23
  // a5: 24 25 26 27
  // a6: 30 31 32 33
  // a7: 34 35 36 37
  // to:
  // b0.val[0]: 00 10 02 12
  // b0.val[1]: 01 11 03 13
  // b1.val[0]: 04 14 06 16
  // b1.val[1]: 05 15 07 17
  // b2.val[0]: 20 30 22 32
  // b2.val[1]: 21 31 23 33
  // b3.val[0]: 24 34 26 36
  // b3.val[1]: 25 35 27 37

  const int32x4x2_t b0 = vtrnq_s32(*a0, *a2);
  const int32x4x2_t b1 = vtrnq_s32(*a1, *a3);
  const int32x4x2_t b2 = vtrnq_s32(*a4, *a6);
  const int32x4x2_t b3 = vtrnq_s32(*a5, *a7);

  // Swap 64 bit elements resulting in:
  // c0.val[0]: 00 10 20 30
  // c0.val[1]: 02 12 22 32
  // c1.val[0]: 01 11 21 31
  // c1.val[1]: 03 13 23 33
  // c2.val[0]: 04 14 24 34
  // c2.val[1]: 06 16 26 36
  // c3.val[0]: 05 15 25 35
  // c3.val[1]: 07 17 27 37

  const int64x2x2_t c0 = vpx_vtrnq_s64(b0.val[0], b2.val[0]);
  const int64x2x2_t c1 = vpx_vtrnq_s64(b0.val[1], b2.val[1]);
  const int64x2x2_t c2 = vpx_vtrnq_s64(b1.val[0], b3.val[0]);
  const int64x2x2_t c3 = vpx_vtrnq_s64(b1.val[1], b3.val[1]);

  *a0 = vreinterpretq_s32_s64(c0.val[0]);
  *a1 = vreinterpretq_s32_s64(c1.val[0]);
  *a2 = vreinterpretq_s32_s64(c0.val[1]);
  *a3 = vreinterpretq_s32_s64(c1.val[1]);
  *a4 = vreinterpretq_s32_s64(c2.val[0]);
  *a5 = vreinterpretq_s32_s64(c3.val[0]);
  *a6 = vreinterpretq_s32_s64(c2.val[1]);
  *a7 = vreinterpretq_s32_s64(c3.val[1]);
}

static INLINE void transpose_u8_8x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
                                    uint8x8_t *a3, uint8x8_t *a4, uint8x8_t *a5,
                                    uint8x8_t *a6, uint8x8_t *a7) {
  // Widen to 128-bit registers (usually a no-op once inlined).
  const uint8x16_t a0q = vcombine_u8(*a0, vdup_n_u8(0));
  const uint8x16_t a1q = vcombine_u8(*a1, vdup_n_u8(0));
  const uint8x16_t a2q = vcombine_u8(*a2, vdup_n_u8(0));
  const uint8x16_t a3q = vcombine_u8(*a3, vdup_n_u8(0));
  const uint8x16_t a4q = vcombine_u8(*a4, vdup_n_u8(0));
  const uint8x16_t a5q = vcombine_u8(*a5, vdup_n_u8(0));
  const uint8x16_t a6q = vcombine_u8(*a6, vdup_n_u8(0));
  const uint8x16_t a7q = vcombine_u8(*a7, vdup_n_u8(0));

  // Zip 8 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 10 11 12 13 14 15 16 17
  // a2: 20 21 22 23 24 25 26 27
  // a3: 30 31 32 33 34 35 36 37
  // a4: 40 41 42 43 44 45 46 47
  // a5: 50 51 52 53 54 55 56 57
  // a6: 60 61 62 63 64 65 66 67
  // a7: 70 71 72 73 74 75 76 77
  // to:
  // b0: 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
  // b1: 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
  // b2: 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
  // b3: 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
  const uint8x16_t b0 = vzipq_u8(a0q, a1q).val[0];
  const uint8x16_t b1 = vzipq_u8(a2q, a3q).val[0];
  const uint8x16_t b2 = vzipq_u8(a4q, a5q).val[0];
  const uint8x16_t b3 = vzipq_u8(a6q, a7q).val[0];

  // Zip 16 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
  // c0.val[1]: 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
  // c1.val[0]: 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
  // c1.val[1]: 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
  const uint16x8x2_t c0 =
      vzipq_u16(vreinterpretq_u16_u8(b0), vreinterpretq_u16_u8(b1));
  const uint16x8x2_t c1 =
      vzipq_u16(vreinterpretq_u16_u8(b2), vreinterpretq_u16_u8(b3));

  // Zip 32 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
  // d0.val[1]: 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
  // d1.val[0]: 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
  // d1.val[1]: 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
  const uint32x4x2_t d0 = vzipq_u32(vreinterpretq_u32_u16(c0.val[0]),
                                    vreinterpretq_u32_u16(c1.val[0]));
  const uint32x4x2_t d1 = vzipq_u32(vreinterpretq_u32_u16(c0.val[1]),
                                    vreinterpretq_u32_u16(c1.val[1]));

  *a0 = vreinterpret_u8_u32(vget_low_u32(d0.val[0]));
  *a1 = vreinterpret_u8_u32(vget_high_u32(d0.val[0]));
  *a2 = vreinterpret_u8_u32(vget_low_u32(d0.val[1]));
  *a3 = vreinterpret_u8_u32(vget_high_u32(d0.val[1]));
  *a4 = vreinterpret_u8_u32(vget_low_u32(d1.val[0]));
  *a5 = vreinterpret_u8_u32(vget_high_u32(d1.val[0]));
  *a6 = vreinterpret_u8_u32(vget_low_u32(d1.val[1]));
  *a7 = vreinterpret_u8_u32(vget_high_u32(d1.val[1]));
}

// Transpose 8x8 to a new location.
static INLINE void transpose_s16_8x8q(int16x8_t *a, int16x8_t *out) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 10 11 12 13 14 15 16 17
  // a2: 20 21 22 23 24 25 26 27
  // a3: 30 31 32 33 34 35 36 37
  // a4: 40 41 42 43 44 45 46 47
  // a5: 50 51 52 53 54 55 56 57
  // a6: 60 61 62 63 64 65 66 67
  // a7: 70 71 72 73 74 75 76 77
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37
  // b2.val[0]: 40 50 42 52 44 54 46 56
  // b2.val[1]: 41 51 43 53 45 55 47 57
  // b3.val[0]: 60 70 62 72 64 74 66 76
  // b3.val[1]: 61 71 63 73 65 75 67 77

  const int16x8x2_t b0 = vtrnq_s16(a[0], a[1]);
  const int16x8x2_t b1 = vtrnq_s16(a[2], a[3]);
  const int16x8x2_t b2 = vtrnq_s16(a[4], a[5]);
  const int16x8x2_t b3 = vtrnq_s16(a[6], a[7]);

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34
  // c0.val[1]: 02 12 22 32 06 16 26 36
  // c1.val[0]: 01 11 21 31 05 15 25 35
  // c1.val[1]: 03 13 23 33 07 17 27 37
  // c2.val[0]: 40 50 60 70 44 54 64 74
  // c2.val[1]: 42 52 62 72 46 56 66 76
  // c3.val[0]: 41 51 61 71 45 55 65 75
  // c3.val[1]: 43 53 63 73 47 57 67 77

  const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]),
                                   vreinterpretq_s32_s16(b1.val[0]));
  const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]),
                                   vreinterpretq_s32_s16(b1.val[1]));
  const int32x4x2_t c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]),
                                   vreinterpretq_s32_s16(b3.val[0]));
  const int32x4x2_t c3 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[1]),
                                   vreinterpretq_s32_s16(b3.val[1]));

  // Swap 64 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 40 50 60 70
  // d0.val[1]: 04 14 24 34 44 54 64 74
  // d1.val[0]: 01 11 21 31 41 51 61 71
  // d1.val[1]: 05 15 25 35 45 55 65 75
  // d2.val[0]: 02 12 22 32 42 52 62 72
  // d2.val[1]: 06 16 26 36 46 56 66 76
  // d3.val[0]: 03 13 23 33 43 53 63 73
  // d3.val[1]: 07 17 27 37 47 57 67 77

  const int16x8x2_t d0 = vpx_vtrnq_s64_to_s16(c0.val[0], c2.val[0]);
  const int16x8x2_t d1 = vpx_vtrnq_s64_to_s16(c1.val[0], c3.val[0]);
  const int16x8x2_t d2 = vpx_vtrnq_s64_to_s16(c0.val[1], c2.val[1]);
  const int16x8x2_t d3 = vpx_vtrnq_s64_to_s16(c1.val[1], c3.val[1]);

  out[0] = d0.val[0];
  out[1] = d1.val[0];
  out[2] = d2.val[0];
  out[3] = d3.val[0];
  out[4] = d0.val[1];
  out[5] = d1.val[1];
  out[6] = d2.val[1];
  out[7] = d3.val[1];
}

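// transpose_s16_8x8 transposes the 8x8 block held in *a0..*a7 in place. An
// illustrative sketch of the typical load/transpose/store pattern (the names
// buf and r0..r7 are hypothetical, not callers of this file):
//   int16x8_t r0 = vld1q_s16(buf + 0 * 8);
//   ... load r1..r7 from the remaining rows ...
//   transpose_s16_8x8(&r0, &r1, &r2, &r3, &r4, &r5, &r6, &r7);
//   vst1q_s16(buf + 0 * 8, r0);
//   ... store r1..r7 back ...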
static INLINE void transpose_s16_8x8(int16x8_t *a0, int16x8_t *a1,
                                     int16x8_t *a2, int16x8_t *a3,
                                     int16x8_t *a4, int16x8_t *a5,
                                     int16x8_t *a6, int16x8_t *a7) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 10 11 12 13 14 15 16 17
  // a2: 20 21 22 23 24 25 26 27
  // a3: 30 31 32 33 34 35 36 37
  // a4: 40 41 42 43 44 45 46 47
  // a5: 50 51 52 53 54 55 56 57
  // a6: 60 61 62 63 64 65 66 67
  // a7: 70 71 72 73 74 75 76 77
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37
  // b2.val[0]: 40 50 42 52 44 54 46 56
  // b2.val[1]: 41 51 43 53 45 55 47 57
  // b3.val[0]: 60 70 62 72 64 74 66 76
  // b3.val[1]: 61 71 63 73 65 75 67 77

  const int16x8x2_t b0 = vtrnq_s16(*a0, *a1);
  const int16x8x2_t b1 = vtrnq_s16(*a2, *a3);
  const int16x8x2_t b2 = vtrnq_s16(*a4, *a5);
  const int16x8x2_t b3 = vtrnq_s16(*a6, *a7);

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34
  // c0.val[1]: 02 12 22 32 06 16 26 36
  // c1.val[0]: 01 11 21 31 05 15 25 35
  // c1.val[1]: 03 13 23 33 07 17 27 37
  // c2.val[0]: 40 50 60 70 44 54 64 74
  // c2.val[1]: 42 52 62 72 46 56 66 76
  // c3.val[0]: 41 51 61 71 45 55 65 75
  // c3.val[1]: 43 53 63 73 47 57 67 77

  const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]),
                                   vreinterpretq_s32_s16(b1.val[0]));
  const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]),
                                   vreinterpretq_s32_s16(b1.val[1]));
  const int32x4x2_t c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]),
                                   vreinterpretq_s32_s16(b3.val[0]));
  const int32x4x2_t c3 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[1]),
                                   vreinterpretq_s32_s16(b3.val[1]));

  // Swap 64 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 40 50 60 70
  // d0.val[1]: 04 14 24 34 44 54 64 74
  // d1.val[0]: 01 11 21 31 41 51 61 71
  // d1.val[1]: 05 15 25 35 45 55 65 75
  // d2.val[0]: 02 12 22 32 42 52 62 72
  // d2.val[1]: 06 16 26 36 46 56 66 76
  // d3.val[0]: 03 13 23 33 43 53 63 73
  // d3.val[1]: 07 17 27 37 47 57 67 77

  const int16x8x2_t d0 = vpx_vtrnq_s64_to_s16(c0.val[0], c2.val[0]);
  const int16x8x2_t d1 = vpx_vtrnq_s64_to_s16(c1.val[0], c3.val[0]);
  const int16x8x2_t d2 = vpx_vtrnq_s64_to_s16(c0.val[1], c2.val[1]);
  const int16x8x2_t d3 = vpx_vtrnq_s64_to_s16(c1.val[1], c3.val[1]);

  *a0 = d0.val[0];
  *a1 = d1.val[0];
  *a2 = d2.val[0];
  *a3 = d3.val[0];
  *a4 = d0.val[1];
  *a5 = d1.val[1];
  *a6 = d2.val[1];
  *a7 = d3.val[1];
}

static INLINE void transpose_u16_8x8(uint16x8_t *a0, uint16x8_t *a1,
                                     uint16x8_t *a2, uint16x8_t *a3,
                                     uint16x8_t *a4, uint16x8_t *a5,
                                     uint16x8_t *a6, uint16x8_t *a7) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 10 11 12 13 14 15 16 17
  // a2: 20 21 22 23 24 25 26 27
  // a3: 30 31 32 33 34 35 36 37
  // a4: 40 41 42 43 44 45 46 47
  // a5: 50 51 52 53 54 55 56 57
  // a6: 60 61 62 63 64 65 66 67
  // a7: 70 71 72 73 74 75 76 77
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37
  // b2.val[0]: 40 50 42 52 44 54 46 56
  // b2.val[1]: 41 51 43 53 45 55 47 57
  // b3.val[0]: 60 70 62 72 64 74 66 76
  // b3.val[1]: 61 71 63 73 65 75 67 77

  const uint16x8x2_t b0 = vtrnq_u16(*a0, *a1);
  const uint16x8x2_t b1 = vtrnq_u16(*a2, *a3);
  const uint16x8x2_t b2 = vtrnq_u16(*a4, *a5);
  const uint16x8x2_t b3 = vtrnq_u16(*a6, *a7);

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34
  // c0.val[1]: 02 12 22 32 06 16 26 36
  // c1.val[0]: 01 11 21 31 05 15 25 35
  // c1.val[1]: 03 13 23 33 07 17 27 37
  // c2.val[0]: 40 50 60 70 44 54 64 74
  // c2.val[1]: 42 52 62 72 46 56 66 76
  // c3.val[0]: 41 51 61 71 45 55 65 75
  // c3.val[1]: 43 53 63 73 47 57 67 77

  const uint32x4x2_t c0 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[0]),
                                    vreinterpretq_u32_u16(b1.val[0]));
  const uint32x4x2_t c1 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[1]),
                                    vreinterpretq_u32_u16(b1.val[1]));
  const uint32x4x2_t c2 = vtrnq_u32(vreinterpretq_u32_u16(b2.val[0]),
                                    vreinterpretq_u32_u16(b3.val[0]));
  const uint32x4x2_t c3 = vtrnq_u32(vreinterpretq_u32_u16(b2.val[1]),
                                    vreinterpretq_u32_u16(b3.val[1]));

  // Swap 64 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 40 50 60 70
  // d0.val[1]: 04 14 24 34 44 54 64 74
  // d1.val[0]: 01 11 21 31 41 51 61 71
  // d1.val[1]: 05 15 25 35 45 55 65 75
  // d2.val[0]: 02 12 22 32 42 52 62 72
  // d2.val[1]: 06 16 26 36 46 56 66 76
  // d3.val[0]: 03 13 23 33 43 53 63 73
  // d3.val[1]: 07 17 27 37 47 57 67 77

  const uint16x8x2_t d0 = vpx_vtrnq_u64_to_u16(c0.val[0], c2.val[0]);
  const uint16x8x2_t d1 = vpx_vtrnq_u64_to_u16(c1.val[0], c3.val[0]);
  const uint16x8x2_t d2 = vpx_vtrnq_u64_to_u16(c0.val[1], c2.val[1]);
  const uint16x8x2_t d3 = vpx_vtrnq_u64_to_u16(c1.val[1], c3.val[1]);

  *a0 = d0.val[0];
  *a1 = d1.val[0];
  *a2 = d2.val[0];
  *a3 = d3.val[0];
  *a4 = d0.val[1];
  *a5 = d1.val[1];
  *a6 = d2.val[1];
  *a7 = d3.val[1];
}

static INLINE void transpose_s32_8x8(int32x4x2_t *a0, int32x4x2_t *a1,
                                     int32x4x2_t *a2, int32x4x2_t *a3,
                                     int32x4x2_t *a4, int32x4x2_t *a5,
                                     int32x4x2_t *a6, int32x4x2_t *a7) {
  // Swap 32 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 10 11 12 13 14 15 16 17
  // a2: 20 21 22 23 24 25 26 27
  // a3: 30 31 32 33 34 35 36 37
  // a4: 40 41 42 43 44 45 46 47
  // a5: 50 51 52 53 54 55 56 57
  // a6: 60 61 62 63 64 65 66 67
  // a7: 70 71 72 73 74 75 76 77
  // to:
  // b0: 00 10 02 12 01 11 03 13
  // b1: 20 30 22 32 21 31 23 33
  // b2: 40 50 42 52 41 51 43 53
  // b3: 60 70 62 72 61 71 63 73
  // b4: 04 14 06 16 05 15 07 17
  // b5: 24 34 26 36 25 35 27 37
  // b6: 44 54 46 56 45 55 47 57
  // b7: 64 74 66 76 65 75 67 77

  const int32x4x2_t b0 = vtrnq_s32(a0->val[0], a1->val[0]);
  const int32x4x2_t b1 = vtrnq_s32(a2->val[0], a3->val[0]);
  const int32x4x2_t b2 = vtrnq_s32(a4->val[0], a5->val[0]);
  const int32x4x2_t b3 = vtrnq_s32(a6->val[0], a7->val[0]);
  const int32x4x2_t b4 = vtrnq_s32(a0->val[1], a1->val[1]);
  const int32x4x2_t b5 = vtrnq_s32(a2->val[1], a3->val[1]);
  const int32x4x2_t b6 = vtrnq_s32(a4->val[1], a5->val[1]);
  const int32x4x2_t b7 = vtrnq_s32(a6->val[1], a7->val[1]);

  // Swap 64 bit elements resulting in:
  // c0: 00 10 20 30 02 12 22 32
  // c1: 01 11 21 31 03 13 23 33
  // c2: 40 50 60 70 42 52 62 72
  // c3: 41 51 61 71 43 53 63 73
  // c4: 04 14 24 34 06 16 26 36
  // c5: 05 15 25 35 07 17 27 37
  // c6: 44 54 64 74 46 56 66 76
  // c7: 45 55 65 75 47 57 67 77
  const int32x4x2_t c0 = vpx_vtrnq_s64_to_s32(b0.val[0], b1.val[0]);
  const int32x4x2_t c1 = vpx_vtrnq_s64_to_s32(b0.val[1], b1.val[1]);
  const int32x4x2_t c2 = vpx_vtrnq_s64_to_s32(b2.val[0], b3.val[0]);
  const int32x4x2_t c3 = vpx_vtrnq_s64_to_s32(b2.val[1], b3.val[1]);
  const int32x4x2_t c4 = vpx_vtrnq_s64_to_s32(b4.val[0], b5.val[0]);
  const int32x4x2_t c5 = vpx_vtrnq_s64_to_s32(b4.val[1], b5.val[1]);
  const int32x4x2_t c6 = vpx_vtrnq_s64_to_s32(b6.val[0], b7.val[0]);
  const int32x4x2_t c7 = vpx_vtrnq_s64_to_s32(b6.val[1], b7.val[1]);

  // Swap 128 bit elements resulting in:
  // a0: 00 10 20 30 40 50 60 70
  // a1: 01 11 21 31 41 51 61 71
  // a2: 02 12 22 32 42 52 62 72
  // a3: 03 13 23 33 43 53 63 73
  // a4: 04 14 24 34 44 54 64 74
  // a5: 05 15 25 35 45 55 65 75
  // a6: 06 16 26 36 46 56 66 76
  // a7: 07 17 27 37 47 57 67 77
  a0->val[0] = c0.val[0];
  a0->val[1] = c2.val[0];
  a1->val[0] = c1.val[0];
  a1->val[1] = c3.val[0];
  a2->val[0] = c0.val[1];
  a2->val[1] = c2.val[1];
  a3->val[0] = c1.val[1];
  a3->val[1] = c3.val[1];
  a4->val[0] = c4.val[0];
  a4->val[1] = c6.val[0];
  a5->val[0] = c5.val[0];
  a5->val[1] = c7.val[0];
  a6->val[0] = c4.val[1];
  a6->val[1] = c6.val[1];
  a7->val[0] = c5.val[1];
  a7->val[1] = c7.val[1];
}

// Helper transpose function for highbd FDCT variants. Transposes an 8x8
// block of 32 bit elements held as eight "left" registers (columns 0-3 of
// each row) and eight "right" registers (columns 4-7 of each row).
static INLINE void transpose_s32_8x8_2(int32x4_t *left /*[8]*/,
                                       int32x4_t *right /*[8]*/,
                                       int32x4_t *out_left /*[8]*/,
                                       int32x4_t *out_right /*[8]*/) {
  int32x4x2_t out[8];

  out[0].val[0] = left[0];
  out[0].val[1] = right[0];
  out[1].val[0] = left[1];
  out[1].val[1] = right[1];
  out[2].val[0] = left[2];
  out[2].val[1] = right[2];
  out[3].val[0] = left[3];
  out[3].val[1] = right[3];
  out[4].val[0] = left[4];
  out[4].val[1] = right[4];
  out[5].val[0] = left[5];
  out[5].val[1] = right[5];
  out[6].val[0] = left[6];
  out[6].val[1] = right[6];
  out[7].val[0] = left[7];
  out[7].val[1] = right[7];

  transpose_s32_8x8(&out[0], &out[1], &out[2], &out[3], &out[4], &out[5],
                    &out[6], &out[7]);

  out_left[0] = out[0].val[0];
  out_left[1] = out[1].val[0];
  out_left[2] = out[2].val[0];
  out_left[3] = out[3].val[0];
  out_left[4] = out[4].val[0];
  out_left[5] = out[5].val[0];
  out_left[6] = out[6].val[0];
  out_left[7] = out[7].val[0];
  out_right[0] = out[0].val[1];
  out_right[1] = out[1].val[1];
  out_right[2] = out[2].val[1];
  out_right[3] = out[3].val[1];
  out_right[4] = out[4].val[1];
  out_right[5] = out[5].val[1];
  out_right[6] = out[6].val[1];
  out_right[7] = out[7].val[1];
}

static INLINE void transpose_s32_16x16(int32x4_t *left1, int32x4_t *right1,
                                       int32x4_t *left2, int32x4_t *right2) {
  int32x4_t tl[16], tr[16];

  // Transpose the 4 8x8 quadrants separately but first swap quadrants 2 and 3.
  tl[0] = left1[8];
  tl[1] = left1[9];
  tl[2] = left1[10];
  tl[3] = left1[11];
  tl[4] = left1[12];
  tl[5] = left1[13];
  tl[6] = left1[14];
  tl[7] = left1[15];
  tr[0] = right1[8];
  tr[1] = right1[9];
  tr[2] = right1[10];
  tr[3] = right1[11];
  tr[4] = right1[12];
  tr[5] = right1[13];
  tr[6] = right1[14];
  tr[7] = right1[15];

  left1[8] = left2[0];
  left1[9] = left2[1];
  left1[10] = left2[2];
  left1[11] = left2[3];
  left1[12] = left2[4];
  left1[13] = left2[5];
  left1[14] = left2[6];
  left1[15] = left2[7];
  right1[8] = right2[0];
  right1[9] = right2[1];
  right1[10] = right2[2];
  right1[11] = right2[3];
  right1[12] = right2[4];
  right1[13] = right2[5];
  right1[14] = right2[6];
  right1[15] = right2[7];

  left2[0] = tl[0];
  left2[1] = tl[1];
  left2[2] = tl[2];
  left2[3] = tl[3];
  left2[4] = tl[4];
  left2[5] = tl[5];
  left2[6] = tl[6];
  left2[7] = tl[7];
  right2[0] = tr[0];
  right2[1] = tr[1];
  right2[2] = tr[2];
  right2[3] = tr[3];
  right2[4] = tr[4];
  right2[5] = tr[5];
  right2[6] = tr[6];
  right2[7] = tr[7];

  transpose_s32_8x8_2(left1, right1, left1, right1);
  transpose_s32_8x8_2(left2, right2, left2, right2);
  transpose_s32_8x8_2(left1 + 8, right1 + 8, left1 + 8, right1 + 8);
  transpose_s32_8x8_2(left2 + 8, right2 + 8, left2 + 8, right2 + 8);
}

static INLINE void transpose_u8_16x8(
    const uint8x16_t i0, const uint8x16_t i1, const uint8x16_t i2,
    const uint8x16_t i3, const uint8x16_t i4, const uint8x16_t i5,
    const uint8x16_t i6, const uint8x16_t i7, uint8x8_t *o0, uint8x8_t *o1,
    uint8x8_t *o2, uint8x8_t *o3, uint8x8_t *o4, uint8x8_t *o5, uint8x8_t *o6,
    uint8x8_t *o7, uint8x8_t *o8, uint8x8_t *o9, uint8x8_t *o10, uint8x8_t *o11,
    uint8x8_t *o12, uint8x8_t *o13, uint8x8_t *o14, uint8x8_t *o15) {
  // Swap 8 bit elements. Goes from:
  // i0: 00 01 02 03 04 05 06 07 08 09 0A 0B 0C 0D 0E 0F
  // i1: 10 11 12 13 14 15 16 17 18 19 1A 1B 1C 1D 1E 1F
  // i2: 20 21 22 23 24 25 26 27 28 29 2A 2B 2C 2D 2E 2F
  // i3: 30 31 32 33 34 35 36 37 38 39 3A 3B 3C 3D 3E 3F
  // i4: 40 41 42 43 44 45 46 47 48 49 4A 4B 4C 4D 4E 4F
  // i5: 50 51 52 53 54 55 56 57 58 59 5A 5B 5C 5D 5E 5F
  // i6: 60 61 62 63 64 65 66 67 68 69 6A 6B 6C 6D 6E 6F
  // i7: 70 71 72 73 74 75 76 77 78 79 7A 7B 7C 7D 7E 7F
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E
  // b0.val[1]: 01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F
  // b1.val[0]: 20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E
  // b1.val[1]: 21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F
  // b2.val[0]: 40 50 42 52 44 54 46 56 48 58 4A 5A 4C 5C 4E 5E
  // b2.val[1]: 41 51 43 53 45 55 47 57 49 59 4B 5B 4D 5D 4F 5F
  // b3.val[0]: 60 70 62 72 64 74 66 76 68 78 6A 7A 6C 7C 6E 7E
  // b3.val[1]: 61 71 63 73 65 75 67 77 69 79 6B 7B 6D 7D 6F 7F
  const uint8x16x2_t b0 = vtrnq_u8(i0, i1);
  const uint8x16x2_t b1 = vtrnq_u8(i2, i3);
  const uint8x16x2_t b2 = vtrnq_u8(i4, i5);
  const uint8x16x2_t b3 = vtrnq_u8(i6, i7);

  // Swap 16 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34 08 18 28 38 0C 1C 2C 3C
  // c0.val[1]: 02 12 22 32 06 16 26 36 0A 1A 2A 3A 0E 1E 2E 3E
  // c1.val[0]: 01 11 21 31 05 15 25 35 09 19 29 39 0D 1D 2D 3D
  // c1.val[1]: 03 13 23 33 07 17 27 37 0B 1B 2B 3B 0F 1F 2F 3F
  // c2.val[0]: 40 50 60 70 44 54 64 74 48 58 68 78 4C 5C 6C 7C
  // c2.val[1]: 42 52 62 72 46 56 66 76 4A 5A 6A 7A 4E 5E 6E 7E
  // c3.val[0]: 41 51 61 71 45 55 65 75 49 59 69 79 4D 5D 6D 7D
  // c3.val[1]: 43 53 63 73 47 57 67 77 4B 5B 6B 7B 4F 5F 6F 7F
  const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]),
                                    vreinterpretq_u16_u8(b1.val[0]));
  const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]),
                                    vreinterpretq_u16_u8(b1.val[1]));
  const uint16x8x2_t c2 = vtrnq_u16(vreinterpretq_u16_u8(b2.val[0]),
                                    vreinterpretq_u16_u8(b3.val[0]));
  const uint16x8x2_t c3 = vtrnq_u16(vreinterpretq_u16_u8(b2.val[1]),
                                    vreinterpretq_u16_u8(b3.val[1]));

  // Swap 32 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 40 50 60 70 08 18 28 38 48 58 68 78
  // d0.val[1]: 04 14 24 34 44 54 64 74 0C 1C 2C 3C 4C 5C 6C 7C
  // d1.val[0]: 02 12 22 32 42 52 62 72 0A 1A 2A 3A 4A 5A 6A 7A
  // d1.val[1]: 06 16 26 36 46 56 66 76 0E 1E 2E 3E 4E 5E 6E 7E
  // d2.val[0]: 01 11 21 31 41 51 61 71 09 19 29 39 49 59 69 79
  // d2.val[1]: 05 15 25 35 45 55 65 75 0D 1D 2D 3D 4D 5D 6D 7D
  // d3.val[0]: 03 13 23 33 43 53 63 73 0B 1B 2B 3B 4B 5B 6B 7B
  // d3.val[1]: 07 17 27 37 47 57 67 77 0F 1F 2F 3F 4F 5F 6F 7F
  const uint32x4x2_t d0 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[0]),
                                    vreinterpretq_u32_u16(c2.val[0]));
  const uint32x4x2_t d1 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[1]),
                                    vreinterpretq_u32_u16(c2.val[1]));
  const uint32x4x2_t d2 = vtrnq_u32(vreinterpretq_u32_u16(c1.val[0]),
                                    vreinterpretq_u32_u16(c3.val[0]));
  const uint32x4x2_t d3 = vtrnq_u32(vreinterpretq_u32_u16(c1.val[1]),
                                    vreinterpretq_u32_u16(c3.val[1]));

  // Output:
  // o0 : 00 10 20 30 40 50 60 70
  // o1 : 01 11 21 31 41 51 61 71
  // o2 : 02 12 22 32 42 52 62 72
  // o3 : 03 13 23 33 43 53 63 73
  // o4 : 04 14 24 34 44 54 64 74
  // o5 : 05 15 25 35 45 55 65 75
  // o6 : 06 16 26 36 46 56 66 76
  // o7 : 07 17 27 37 47 57 67 77
  // o8 : 08 18 28 38 48 58 68 78
  // o9 : 09 19 29 39 49 59 69 79
  // o10: 0A 1A 2A 3A 4A 5A 6A 7A
  // o11: 0B 1B 2B 3B 4B 5B 6B 7B
  // o12: 0C 1C 2C 3C 4C 5C 6C 7C
  // o13: 0D 1D 2D 3D 4D 5D 6D 7D
  // o14: 0E 1E 2E 3E 4E 5E 6E 7E
  // o15: 0F 1F 2F 3F 4F 5F 6F 7F
  *o0 = vget_low_u8(vreinterpretq_u8_u32(d0.val[0]));
  *o1 = vget_low_u8(vreinterpretq_u8_u32(d2.val[0]));
  *o2 = vget_low_u8(vreinterpretq_u8_u32(d1.val[0]));
  *o3 = vget_low_u8(vreinterpretq_u8_u32(d3.val[0]));
  *o4 = vget_low_u8(vreinterpretq_u8_u32(d0.val[1]));
  *o5 = vget_low_u8(vreinterpretq_u8_u32(d2.val[1]));
  *o6 = vget_low_u8(vreinterpretq_u8_u32(d1.val[1]));
  *o7 = vget_low_u8(vreinterpretq_u8_u32(d3.val[1]));
  *o8 = vget_high_u8(vreinterpretq_u8_u32(d0.val[0]));
  *o9 = vget_high_u8(vreinterpretq_u8_u32(d2.val[0]));
  *o10 = vget_high_u8(vreinterpretq_u8_u32(d1.val[0]));
  *o11 = vget_high_u8(vreinterpretq_u8_u32(d3.val[0]));
  *o12 = vget_high_u8(vreinterpretq_u8_u32(d0.val[1]));
  *o13 = vget_high_u8(vreinterpretq_u8_u32(d2.val[1]));
  *o14 = vget_high_u8(vreinterpretq_u8_u32(d1.val[1]));
  *o15 = vget_high_u8(vreinterpretq_u8_u32(d3.val[1]));
}

static INLINE void transpose_u8_8x16(
    const uint8x8_t i0, const uint8x8_t i1, const uint8x8_t i2,
    const uint8x8_t i3, const uint8x8_t i4, const uint8x8_t i5,
    const uint8x8_t i6, const uint8x8_t i7, const uint8x8_t i8,
    const uint8x8_t i9, const uint8x8_t i10, const uint8x8_t i11,
    const uint8x8_t i12, const uint8x8_t i13, const uint8x8_t i14,
    const uint8x8_t i15, uint8x16_t *o0, uint8x16_t *o1, uint8x16_t *o2,
    uint8x16_t *o3, uint8x16_t *o4, uint8x16_t *o5, uint8x16_t *o6,
    uint8x16_t *o7) {
  // Combine 8 bit elements. Goes from:
  // i0 : 00 01 02 03 04 05 06 07
  // i1 : 10 11 12 13 14 15 16 17
  // i2 : 20 21 22 23 24 25 26 27
  // i3 : 30 31 32 33 34 35 36 37
  // i4 : 40 41 42 43 44 45 46 47
  // i5 : 50 51 52 53 54 55 56 57
  // i6 : 60 61 62 63 64 65 66 67
  // i7 : 70 71 72 73 74 75 76 77
  // i8 : 80 81 82 83 84 85 86 87
  // i9 : 90 91 92 93 94 95 96 97
  // i10: A0 A1 A2 A3 A4 A5 A6 A7
  // i11: B0 B1 B2 B3 B4 B5 B6 B7
  // i12: C0 C1 C2 C3 C4 C5 C6 C7
  // i13: D0 D1 D2 D3 D4 D5 D6 D7
  // i14: E0 E1 E2 E3 E4 E5 E6 E7
  // i15: F0 F1 F2 F3 F4 F5 F6 F7
  // to:
  // a0: 00 01 02 03 04 05 06 07 80 81 82 83 84 85 86 87
  // a1: 10 11 12 13 14 15 16 17 90 91 92 93 94 95 96 97
  // a2: 20 21 22 23 24 25 26 27 A0 A1 A2 A3 A4 A5 A6 A7
  // a3: 30 31 32 33 34 35 36 37 B0 B1 B2 B3 B4 B5 B6 B7
  // a4: 40 41 42 43 44 45 46 47 C0 C1 C2 C3 C4 C5 C6 C7
  // a5: 50 51 52 53 54 55 56 57 D0 D1 D2 D3 D4 D5 D6 D7
  // a6: 60 61 62 63 64 65 66 67 E0 E1 E2 E3 E4 E5 E6 E7
  // a7: 70 71 72 73 74 75 76 77 F0 F1 F2 F3 F4 F5 F6 F7
  const uint8x16_t a0 = vcombine_u8(i0, i8);
  const uint8x16_t a1 = vcombine_u8(i1, i9);
  const uint8x16_t a2 = vcombine_u8(i2, i10);
  const uint8x16_t a3 = vcombine_u8(i3, i11);
  const uint8x16_t a4 = vcombine_u8(i4, i12);
  const uint8x16_t a5 = vcombine_u8(i5, i13);
  const uint8x16_t a6 = vcombine_u8(i6, i14);
  const uint8x16_t a7 = vcombine_u8(i7, i15);

  // Swap 8 bit elements resulting in:
  // b0.val[0]: 00 10 02 12 04 14 06 16 80 90 82 92 84 94 86 96
  // b0.val[1]: 01 11 03 13 05 15 07 17 81 91 83 93 85 95 87 97
  // b1.val[0]: 20 30 22 32 24 34 26 36 A0 B0 A2 B2 A4 B4 A6 B6
  // b1.val[1]: 21 31 23 33 25 35 27 37 A1 B1 A3 B3 A5 B5 A7 B7
  // b2.val[0]: 40 50 42 52 44 54 46 56 C0 D0 C2 D2 C4 D4 C6 D6
  // b2.val[1]: 41 51 43 53 45 55 47 57 C1 D1 C3 D3 C5 D5 C7 D7
  // b3.val[0]: 60 70 62 72 64 74 66 76 E0 F0 E2 F2 E4 F4 E6 F6
  // b3.val[1]: 61 71 63 73 65 75 67 77 E1 F1 E3 F3 E5 F5 E7 F7
  const uint8x16x2_t b0 = vtrnq_u8(a0, a1);
  const uint8x16x2_t b1 = vtrnq_u8(a2, a3);
  const uint8x16x2_t b2 = vtrnq_u8(a4, a5);
  const uint8x16x2_t b3 = vtrnq_u8(a6, a7);

  // Swap 16 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34 80 90 A0 B0 84 94 A4 B4
  // c0.val[1]: 02 12 22 32 06 16 26 36 82 92 A2 B2 86 96 A6 B6
  // c1.val[0]: 01 11 21 31 05 15 25 35 81 91 A1 B1 85 95 A5 B5
  // c1.val[1]: 03 13 23 33 07 17 27 37 83 93 A3 B3 87 97 A7 B7
  // c2.val[0]: 40 50 60 70 44 54 64 74 C0 D0 E0 F0 C4 D4 E4 F4
  // c2.val[1]: 42 52 62 72 46 56 66 76 C2 D2 E2 F2 C6 D6 E6 F6
  // c3.val[0]: 41 51 61 71 45 55 65 75 C1 D1 E1 F1 C5 D5 E5 F5
  // c3.val[1]: 43 53 63 73 47 57 67 77 C3 D3 E3 F3 C7 D7 E7 F7
  const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]),
                                    vreinterpretq_u16_u8(b1.val[0]));
  const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]),
                                    vreinterpretq_u16_u8(b1.val[1]));
  const uint16x8x2_t c2 = vtrnq_u16(vreinterpretq_u16_u8(b2.val[0]),
                                    vreinterpretq_u16_u8(b3.val[0]));
  const uint16x8x2_t c3 = vtrnq_u16(vreinterpretq_u16_u8(b2.val[1]),
                                    vreinterpretq_u16_u8(b3.val[1]));

  // Swap 32 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 40 50 60 70 80 90 A0 B0 C0 D0 E0 F0
  // d0.val[1]: 04 14 24 34 44 54 64 74 84 94 A4 B4 C4 D4 E4 F4
  // d1.val[0]: 02 12 22 32 42 52 62 72 82 92 A2 B2 C2 D2 E2 F2
  // d1.val[1]: 06 16 26 36 46 56 66 76 86 96 A6 B6 C6 D6 E6 F6
  // d2.val[0]: 01 11 21 31 41 51 61 71 81 91 A1 B1 C1 D1 E1 F1
  // d2.val[1]: 05 15 25 35 45 55 65 75 85 95 A5 B5 C5 D5 E5 F5
  // d3.val[0]: 03 13 23 33 43 53 63 73 83 93 A3 B3 C3 D3 E3 F3
  // d3.val[1]: 07 17 27 37 47 57 67 77 87 97 A7 B7 C7 D7 E7 F7
  const uint32x4x2_t d0 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[0]),
                                    vreinterpretq_u32_u16(c2.val[0]));
  const uint32x4x2_t d1 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[1]),
                                    vreinterpretq_u32_u16(c2.val[1]));
  const uint32x4x2_t d2 = vtrnq_u32(vreinterpretq_u32_u16(c1.val[0]),
                                    vreinterpretq_u32_u16(c3.val[0]));
  const uint32x4x2_t d3 = vtrnq_u32(vreinterpretq_u32_u16(c1.val[1]),
                                    vreinterpretq_u32_u16(c3.val[1]));

  // Output:
  // o0: 00 10 20 30 40 50 60 70 80 90 A0 B0 C0 D0 E0 F0
  // o1: 01 11 21 31 41 51 61 71 81 91 A1 B1 C1 D1 E1 F1
  // o2: 02 12 22 32 42 52 62 72 82 92 A2 B2 C2 D2 E2 F2
  // o3: 03 13 23 33 43 53 63 73 83 93 A3 B3 C3 D3 E3 F3
  // o4: 04 14 24 34 44 54 64 74 84 94 A4 B4 C4 D4 E4 F4
  // o5: 05 15 25 35 45 55 65 75 85 95 A5 B5 C5 D5 E5 F5
  // o6: 06 16 26 36 46 56 66 76 86 96 A6 B6 C6 D6 E6 F6
  // o7: 07 17 27 37 47 57 67 77 87 97 A7 B7 C7 D7 E7 F7
  *o0 = vreinterpretq_u8_u32(d0.val[0]);
  *o1 = vreinterpretq_u8_u32(d2.val[0]);
  *o2 = vreinterpretq_u8_u32(d1.val[0]);
  *o3 = vreinterpretq_u8_u32(d3.val[0]);
  *o4 = vreinterpretq_u8_u32(d0.val[1]);
  *o5 = vreinterpretq_u8_u32(d2.val[1]);
  *o6 = vreinterpretq_u8_u32(d1.val[1]);
  *o7 = vreinterpretq_u8_u32(d3.val[1]);
}

transpose_u8_16x16(const uint8x16_t i0,const uint8x16_t i1,const uint8x16_t i2,const uint8x16_t i3,const uint8x16_t i4,const uint8x16_t i5,const uint8x16_t i6,const uint8x16_t i7,const uint8x16_t i8,const uint8x16_t i9,const uint8x16_t i10,const uint8x16_t i11,const uint8x16_t i12,const uint8x16_t i13,const uint8x16_t i14,const uint8x16_t i15,uint8x16_t * o0,uint8x16_t * o1,uint8x16_t * o2,uint8x16_t * o3,uint8x16_t * o4,uint8x16_t * o5,uint8x16_t * o6,uint8x16_t * o7,uint8x16_t * o8,uint8x16_t * o9,uint8x16_t * o10,uint8x16_t * o11,uint8x16_t * o12,uint8x16_t * o13,uint8x16_t * o14,uint8x16_t * o15)1201 static INLINE void transpose_u8_16x16(
1202 const uint8x16_t i0, const uint8x16_t i1, const uint8x16_t i2,
1203 const uint8x16_t i3, const uint8x16_t i4, const uint8x16_t i5,
1204 const uint8x16_t i6, const uint8x16_t i7, const uint8x16_t i8,
1205 const uint8x16_t i9, const uint8x16_t i10, const uint8x16_t i11,
1206 const uint8x16_t i12, const uint8x16_t i13, const uint8x16_t i14,
1207 const uint8x16_t i15, uint8x16_t *o0, uint8x16_t *o1, uint8x16_t *o2,
1208 uint8x16_t *o3, uint8x16_t *o4, uint8x16_t *o5, uint8x16_t *o6,
1209 uint8x16_t *o7, uint8x16_t *o8, uint8x16_t *o9, uint8x16_t *o10,
1210 uint8x16_t *o11, uint8x16_t *o12, uint8x16_t *o13, uint8x16_t *o14,
1211 uint8x16_t *o15) {
1212 // Swap 8 bit elements. Goes from:
1213 // i0: 00 01 02 03 04 05 06 07 08 09 0A 0B 0C 0D 0E 0F
1214 // i1: 10 11 12 13 14 15 16 17 18 19 1A 1B 1C 1D 1E 1F
1215 // i2: 20 21 22 23 24 25 26 27 28 29 2A 2B 2C 2D 2E 2F
1216 // i3: 30 31 32 33 34 35 36 37 38 39 3A 3B 3C 3D 3E 3F
1217 // i4: 40 41 42 43 44 45 46 47 48 49 4A 4B 4C 4D 4E 4F
1218 // i5: 50 51 52 53 54 55 56 57 58 59 5A 5B 5C 5D 5E 5F
1219 // i6: 60 61 62 63 64 65 66 67 68 69 6A 6B 6C 6D 6E 6F
1220 // i7: 70 71 72 73 74 75 76 77 78 79 7A 7B 7C 7D 7E 7F
1221 // i8: 80 81 82 83 84 85 86 87 88 89 8A 8B 8C 8D 8E 8F
1222 // i9: 90 91 92 93 94 95 96 97 98 99 9A 9B 9C 9D 9E 9F
1223 // i10: A0 A1 A2 A3 A4 A5 A6 A7 A8 A9 AA AB AC AD AE AF
1224 // i11: B0 B1 B2 B3 B4 B5 B6 B7 B8 B9 BA BB BC BD BE BF
1225 // i12: C0 C1 C2 C3 C4 C5 C6 C7 C8 C9 CA CB CC CD CE CF
1226 // i13: D0 D1 D2 D3 D4 D5 D6 D7 D8 D9 DA DB DC DD DE DF
1227 // i14: E0 E1 E2 E3 E4 E5 E6 E7 E8 E9 EA EB EC ED EE EF
1228 // i15: F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 FA FB FC FD FE FF
1229 // to:
1230 // b0.val[0]: 00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E
1231 // b0.val[1]: 01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F
1232 // b1.val[0]: 20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E
1233 // b1.val[1]: 21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F
1234 // b2.val[0]: 40 50 42 52 44 54 46 56 48 58 4A 5A 4C 5C 4E 5E
1235 // b2.val[1]: 41 51 43 53 45 55 47 57 49 59 4B 5B 4D 5D 4F 5F
1236 // b3.val[0]: 60 70 62 72 64 74 66 76 68 78 6A 7A 6C 7C 6E 7E
1237 // b3.val[1]: 61 71 63 73 65 75 67 77 69 79 6B 7B 6D 7D 6F 7F
1238 // b4.val[0]: 80 90 82 92 84 94 86 96 88 98 8A 9A 8C 9C 8E 9E
1239 // b4.val[1]: 81 91 83 93 85 95 87 97 89 99 8B 9B 8D 9D 8F 9F
1240 // b5.val[0]: A0 B0 A2 B2 A4 B4 A6 B6 A8 B8 AA BA AC BC AE BE
1241 // b5.val[1]: A1 B1 A3 B3 A5 B5 A7 B7 A9 B9 AB BB AD BD AF BF
1242 // b6.val[0]: C0 D0 C2 D2 C4 D4 C6 D6 C8 D8 CA DA CC DC CE DE
1243 // b6.val[1]: C1 D1 C3 D3 C5 D5 C7 D7 C9 D9 CB DB CD DD CF DF
1244 // b7.val[0]: E0 F0 E2 F2 E4 F4 E6 F6 E8 F8 EA FA EC FC EE FE
1245 // b7.val[1]: E1 F1 E3 F3 E5 F5 E7 F7 E9 F9 EB FB ED FD EF FF
1246 const uint8x16x2_t b0 = vtrnq_u8(i0, i1);
1247 const uint8x16x2_t b1 = vtrnq_u8(i2, i3);
1248 const uint8x16x2_t b2 = vtrnq_u8(i4, i5);
1249 const uint8x16x2_t b3 = vtrnq_u8(i6, i7);
1250 const uint8x16x2_t b4 = vtrnq_u8(i8, i9);
1251 const uint8x16x2_t b5 = vtrnq_u8(i10, i11);
1252 const uint8x16x2_t b6 = vtrnq_u8(i12, i13);
1253 const uint8x16x2_t b7 = vtrnq_u8(i14, i15);
1254
1255 // Swap 16 bit elements resulting in:
1256 // c0.val[0]: 00 10 20 30 04 14 24 34 08 18 28 38 0C 1C 2C 3C
1257 // c0.val[1]: 02 12 22 32 06 16 26 36 0A 1A 2A 3A 0E 1E 2E 3E
  // c1.val[0]: 01 11 21 31 05 15 25 35 09 19 29 39 0D 1D 2D 3D
  // c1.val[1]: 03 13 23 33 07 17 27 37 0B 1B 2B 3B 0F 1F 2F 3F
  // c2.val[0]: 40 50 60 70 44 54 64 74 48 58 68 78 4C 5C 6C 7C
  // c2.val[1]: 42 52 62 72 46 56 66 76 4A 5A 6A 7A 4E 5E 6E 7E
  // c3.val[0]: 41 51 61 71 45 55 65 75 49 59 69 79 4D 5D 6D 7D
  // c3.val[1]: 43 53 63 73 47 57 67 77 4B 5B 6B 7B 4F 5F 6F 7F
  // c4.val[0]: 80 90 A0 B0 84 94 A4 B4 88 98 A8 B8 8C 9C AC BC
  // c4.val[1]: 82 92 A2 B2 86 96 A6 B6 8A 9A AA BA 8E 9E AE BE
  // c5.val[0]: 81 91 A1 B1 85 95 A5 B5 89 99 A9 B9 8D 9D AD BD
  // c5.val[1]: 83 93 A3 B3 87 97 A7 B7 8B 9B AB BB 8F 9F AF BF
  // c6.val[0]: C0 D0 E0 F0 C4 D4 E4 F4 C8 D8 E8 F8 CC DC EC FC
  // c6.val[1]: C2 D2 E2 F2 C6 D6 E6 F6 CA DA EA FA CE DE EE FE
  // c7.val[0]: C1 D1 E1 F1 C5 D5 E5 F5 C9 D9 E9 F9 CD DD ED FD
  // c7.val[1]: C3 D3 E3 F3 C7 D7 E7 F7 CB DB EB FB CF DF EF FF
  const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]),
                                    vreinterpretq_u16_u8(b1.val[0]));
  const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]),
                                    vreinterpretq_u16_u8(b1.val[1]));
  const uint16x8x2_t c2 = vtrnq_u16(vreinterpretq_u16_u8(b2.val[0]),
                                    vreinterpretq_u16_u8(b3.val[0]));
  const uint16x8x2_t c3 = vtrnq_u16(vreinterpretq_u16_u8(b2.val[1]),
                                    vreinterpretq_u16_u8(b3.val[1]));
  const uint16x8x2_t c4 = vtrnq_u16(vreinterpretq_u16_u8(b4.val[0]),
                                    vreinterpretq_u16_u8(b5.val[0]));
  const uint16x8x2_t c5 = vtrnq_u16(vreinterpretq_u16_u8(b4.val[1]),
                                    vreinterpretq_u16_u8(b5.val[1]));
  const uint16x8x2_t c6 = vtrnq_u16(vreinterpretq_u16_u8(b6.val[0]),
                                    vreinterpretq_u16_u8(b7.val[0]));
  const uint16x8x2_t c7 = vtrnq_u16(vreinterpretq_u16_u8(b6.val[1]),
                                    vreinterpretq_u16_u8(b7.val[1]));

  // Swap 32 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 40 50 60 70 08 18 28 38 48 58 68 78
  // d0.val[1]: 04 14 24 34 44 54 64 74 0C 1C 2C 3C 4C 5C 6C 7C
  // d1.val[0]: 02 12 22 32 42 52 62 72 0A 1A 2A 3A 4A 5A 6A 7A
  // d1.val[1]: 06 16 26 36 46 56 66 76 0E 1E 2E 3E 4E 5E 6E 7E
  // d2.val[0]: 01 11 21 31 41 51 61 71 09 19 29 39 49 59 69 79
  // d2.val[1]: 05 15 25 35 45 55 65 75 0D 1D 2D 3D 4D 5D 6D 7D
  // d3.val[0]: 03 13 23 33 43 53 63 73 0B 1B 2B 3B 4B 5B 6B 7B
  // d3.val[1]: 07 17 27 37 47 57 67 77 0F 1F 2F 3F 4F 5F 6F 7F
  // d4.val[0]: 80 90 A0 B0 C0 D0 E0 F0 88 98 A8 B8 C8 D8 E8 F8
  // d4.val[1]: 84 94 A4 B4 C4 D4 E4 F4 8C 9C AC BC CC DC EC FC
  // d5.val[0]: 82 92 A2 B2 C2 D2 E2 F2 8A 9A AA BA CA DA EA FA
  // d5.val[1]: 86 96 A6 B6 C6 D6 E6 F6 8E 9E AE BE CE DE EE FE
  // d6.val[0]: 81 91 A1 B1 C1 D1 E1 F1 89 99 A9 B9 C9 D9 E9 F9
  // d6.val[1]: 85 95 A5 B5 C5 D5 E5 F5 8D 9D AD BD CD DD ED FD
  // d7.val[0]: 83 93 A3 B3 C3 D3 E3 F3 8B 9B AB BB CB DB EB FB
  // d7.val[1]: 87 97 A7 B7 C7 D7 E7 F7 8F 9F AF BF CF DF EF FF
  const uint32x4x2_t d0 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[0]),
                                    vreinterpretq_u32_u16(c2.val[0]));
  const uint32x4x2_t d1 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[1]),
                                    vreinterpretq_u32_u16(c2.val[1]));
  const uint32x4x2_t d2 = vtrnq_u32(vreinterpretq_u32_u16(c1.val[0]),
                                    vreinterpretq_u32_u16(c3.val[0]));
  const uint32x4x2_t d3 = vtrnq_u32(vreinterpretq_u32_u16(c1.val[1]),
                                    vreinterpretq_u32_u16(c3.val[1]));
  const uint32x4x2_t d4 = vtrnq_u32(vreinterpretq_u32_u16(c4.val[0]),
                                    vreinterpretq_u32_u16(c6.val[0]));
  const uint32x4x2_t d5 = vtrnq_u32(vreinterpretq_u32_u16(c4.val[1]),
                                    vreinterpretq_u32_u16(c6.val[1]));
  const uint32x4x2_t d6 = vtrnq_u32(vreinterpretq_u32_u16(c5.val[0]),
                                    vreinterpretq_u32_u16(c7.val[0]));
  const uint32x4x2_t d7 = vtrnq_u32(vreinterpretq_u32_u16(c5.val[1]),
                                    vreinterpretq_u32_u16(c7.val[1]));

  // Swap 64 bit elements resulting in:
  // e0.val[0]: 00 10 20 30 40 50 60 70 80 90 A0 B0 C0 D0 E0 F0
  // e0.val[1]: 08 18 28 38 48 58 68 78 88 98 A8 B8 C8 D8 E8 F8
  // e1.val[0]: 01 11 21 31 41 51 61 71 81 91 A1 B1 C1 D1 E1 F1
  // e1.val[1]: 09 19 29 39 49 59 69 79 89 99 A9 B9 C9 D9 E9 F9
  // e2.val[0]: 02 12 22 32 42 52 62 72 82 92 A2 B2 C2 D2 E2 F2
  // e2.val[1]: 0A 1A 2A 3A 4A 5A 6A 7A 8A 9A AA BA CA DA EA FA
  // e3.val[0]: 03 13 23 33 43 53 63 73 83 93 A3 B3 C3 D3 E3 F3
  // e3.val[1]: 0B 1B 2B 3B 4B 5B 6B 7B 8B 9B AB BB CB DB EB FB
  // e4.val[0]: 04 14 24 34 44 54 64 74 84 94 A4 B4 C4 D4 E4 F4
  // e4.val[1]: 0C 1C 2C 3C 4C 5C 6C 7C 8C 9C AC BC CC DC EC FC
  // e5.val[0]: 05 15 25 35 45 55 65 75 85 95 A5 B5 C5 D5 E5 F5
  // e5.val[1]: 0D 1D 2D 3D 4D 5D 6D 7D 8D 9D AD BD CD DD ED FD
  // e6.val[0]: 06 16 26 36 46 56 66 76 86 96 A6 B6 C6 D6 E6 F6
  // e6.val[1]: 0E 1E 2E 3E 4E 5E 6E 7E 8E 9E AE BE CE DE EE FE
  // e7.val[0]: 07 17 27 37 47 57 67 77 87 97 A7 B7 C7 D7 E7 F7
  // e7.val[1]: 0F 1F 2F 3F 4F 5F 6F 7F 8F 9F AF BF CF DF EF FF
  const uint8x16x2_t e0 = vpx_vtrnq_u64_to_u8(d0.val[0], d4.val[0]);
  const uint8x16x2_t e1 = vpx_vtrnq_u64_to_u8(d2.val[0], d6.val[0]);
  const uint8x16x2_t e2 = vpx_vtrnq_u64_to_u8(d1.val[0], d5.val[0]);
  const uint8x16x2_t e3 = vpx_vtrnq_u64_to_u8(d3.val[0], d7.val[0]);
  const uint8x16x2_t e4 = vpx_vtrnq_u64_to_u8(d0.val[1], d4.val[1]);
  const uint8x16x2_t e5 = vpx_vtrnq_u64_to_u8(d2.val[1], d6.val[1]);
  const uint8x16x2_t e6 = vpx_vtrnq_u64_to_u8(d1.val[1], d5.val[1]);
  const uint8x16x2_t e7 = vpx_vtrnq_u64_to_u8(d3.val[1], d7.val[1]);

  // Output:
  // o0 : 00 10 20 30 40 50 60 70 80 90 A0 B0 C0 D0 E0 F0
  // o1 : 01 11 21 31 41 51 61 71 81 91 A1 B1 C1 D1 E1 F1
  // o2 : 02 12 22 32 42 52 62 72 82 92 A2 B2 C2 D2 E2 F2
  // o3 : 03 13 23 33 43 53 63 73 83 93 A3 B3 C3 D3 E3 F3
  // o4 : 04 14 24 34 44 54 64 74 84 94 A4 B4 C4 D4 E4 F4
  // o5 : 05 15 25 35 45 55 65 75 85 95 A5 B5 C5 D5 E5 F5
  // o6 : 06 16 26 36 46 56 66 76 86 96 A6 B6 C6 D6 E6 F6
  // o7 : 07 17 27 37 47 57 67 77 87 97 A7 B7 C7 D7 E7 F7
  // o8 : 08 18 28 38 48 58 68 78 88 98 A8 B8 C8 D8 E8 F8
  // o9 : 09 19 29 39 49 59 69 79 89 99 A9 B9 C9 D9 E9 F9
  // o10: 0A 1A 2A 3A 4A 5A 6A 7A 8A 9A AA BA CA DA EA FA
  // o11: 0B 1B 2B 3B 4B 5B 6B 7B 8B 9B AB BB CB DB EB FB
  // o12: 0C 1C 2C 3C 4C 5C 6C 7C 8C 9C AC BC CC DC EC FC
  // o13: 0D 1D 2D 3D 4D 5D 6D 7D 8D 9D AD BD CD DD ED FD
  // o14: 0E 1E 2E 3E 4E 5E 6E 7E 8E 9E AE BE CE DE EE FE
  // o15: 0F 1F 2F 3F 4F 5F 6F 7F 8F 9F AF BF CF DF EF FF
  *o0 = e0.val[0];
  *o1 = e1.val[0];
  *o2 = e2.val[0];
  *o3 = e3.val[0];
  *o4 = e4.val[0];
  *o5 = e5.val[0];
  *o6 = e6.val[0];
  *o7 = e7.val[0];
  *o8 = e0.val[1];
  *o9 = e1.val[1];
  *o10 = e2.val[1];
  *o11 = e3.val[1];
  *o12 = e4.val[1];
  *o13 = e5.val[1];
  *o14 = e6.val[1];
  *o15 = e7.val[1];
}

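// Transpose a 16x16 block of int16_t, assuming in0[i] holds columns 0-7 of
// row i and in1[i] holds columns 8-15 (the layout implied by the quadrant
// swap below). Viewing the block as four 8x8 quadrants, [ A B ; C D ]^T
// equals [ A^T C^T ; B^T D^T ], so swapping the off-diagonal quadrants and
// then transposing each quadrant in place yields the full transpose.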
static INLINE void transpose_s16_16x16(int16x8_t *in0, int16x8_t *in1) {
  int16x8_t t[8];

  // Transpose the four 8x8 quadrants separately, but first swap quadrants 2
  // and 3 (the off-diagonal quadrants in0[8..15] and in1[0..7]).
  t[0] = in0[8];
  t[1] = in0[9];
  t[2] = in0[10];
  t[3] = in0[11];
  t[4] = in0[12];
  t[5] = in0[13];
  t[6] = in0[14];
  t[7] = in0[15];
  in0[8] = in1[0];
  in0[9] = in1[1];
  in0[10] = in1[2];
  in0[11] = in1[3];
  in0[12] = in1[4];
  in0[13] = in1[5];
  in0[14] = in1[6];
  in0[15] = in1[7];
  in1[0] = t[0];
  in1[1] = t[1];
  in1[2] = t[2];
  in1[3] = t[3];
  in1[4] = t[4];
  in1[5] = t[5];
  in1[6] = t[6];
  in1[7] = t[7];

  transpose_s16_8x8(&in0[0], &in0[1], &in0[2], &in0[3], &in0[4], &in0[5],
                    &in0[6], &in0[7]);
  transpose_s16_8x8(&in0[8], &in0[9], &in0[10], &in0[11], &in0[12], &in0[13],
                    &in0[14], &in0[15]);
  transpose_s16_8x8(&in1[0], &in1[1], &in1[2], &in1[3], &in1[4], &in1[5],
                    &in1[6], &in1[7]);
  transpose_s16_8x8(&in1[8], &in1[9], &in1[10], &in1[11], &in1[12], &in1[13],
                    &in1[14], &in1[15]);
}

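// Load eight rows of 8 bytes from |a| at |a_stride| and transpose them,
// keeping only the first four output columns in *a0..*a3. Per the 4x8
// transpose used below, only the first four bytes of each loaded row should
// contribute to the result.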
static INLINE void load_and_transpose_u8_4x8(const uint8_t *a,
                                             const int a_stride, uint8x8_t *a0,
                                             uint8x8_t *a1, uint8x8_t *a2,
                                             uint8x8_t *a3) {
  uint8x8_t a4, a5, a6, a7;
  *a0 = vld1_u8(a);
  a += a_stride;
  *a1 = vld1_u8(a);
  a += a_stride;
  *a2 = vld1_u8(a);
  a += a_stride;
  *a3 = vld1_u8(a);
  a += a_stride;
  a4 = vld1_u8(a);
  a += a_stride;
  a5 = vld1_u8(a);
  a += a_stride;
  a6 = vld1_u8(a);
  a += a_stride;
  a7 = vld1_u8(a);

  transpose_u8_4x8(a0, a1, a2, a3, a4, a5, a6, a7);
}

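// Load an 8x8 block of bytes from |a| at |a_stride| and transpose it in
// registers, so that on return *a0..*a7 hold the columns of the loaded
// block.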
static INLINE void load_and_transpose_u8_8x8(const uint8_t *a,
                                             const int a_stride, uint8x8_t *a0,
                                             uint8x8_t *a1, uint8x8_t *a2,
                                             uint8x8_t *a3, uint8x8_t *a4,
                                             uint8x8_t *a5, uint8x8_t *a6,
                                             uint8x8_t *a7) {
  *a0 = vld1_u8(a);
  a += a_stride;
  *a1 = vld1_u8(a);
  a += a_stride;
  *a2 = vld1_u8(a);
  a += a_stride;
  *a3 = vld1_u8(a);
  a += a_stride;
  *a4 = vld1_u8(a);
  a += a_stride;
  *a5 = vld1_u8(a);
  a += a_stride;
  *a6 = vld1_u8(a);
  a += a_stride;
  *a7 = vld1_u8(a);

  transpose_u8_8x8(a0, a1, a2, a3, a4, a5, a6, a7);
}

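// Transpose the 8x8 block held in a0..a7 and store the result as eight rows
// of 8 bytes at |a| with row pitch |a_stride|.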
static INLINE void transpose_and_store_u8_8x8(uint8_t *a, const int a_stride,
                                              uint8x8_t a0, uint8x8_t a1,
                                              uint8x8_t a2, uint8x8_t a3,
                                              uint8x8_t a4, uint8x8_t a5,
                                              uint8x8_t a6, uint8x8_t a7) {
  transpose_u8_8x8(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);

  vst1_u8(a, a0);
  a += a_stride;
  vst1_u8(a, a1);
  a += a_stride;
  vst1_u8(a, a2);
  a += a_stride;
  vst1_u8(a, a3);
  a += a_stride;
  vst1_u8(a, a4);
  a += a_stride;
  vst1_u8(a, a5);
  a += a_stride;
  vst1_u8(a, a6);
  a += a_stride;
  vst1_u8(a, a7);
}

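// Load an 8x8 block of int16_t from |a| and transpose it in registers.
// Note |a_stride| is measured in elements, not bytes, since it is added
// directly to the int16_t pointer.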
static INLINE void load_and_transpose_s16_8x8(const int16_t *a,
                                              const int a_stride, int16x8_t *a0,
                                              int16x8_t *a1, int16x8_t *a2,
                                              int16x8_t *a3, int16x8_t *a4,
                                              int16x8_t *a5, int16x8_t *a6,
                                              int16x8_t *a7) {
  *a0 = vld1q_s16(a);
  a += a_stride;
  *a1 = vld1q_s16(a);
  a += a_stride;
  *a2 = vld1q_s16(a);
  a += a_stride;
  *a3 = vld1q_s16(a);
  a += a_stride;
  *a4 = vld1q_s16(a);
  a += a_stride;
  *a5 = vld1q_s16(a);
  a += a_stride;
  *a6 = vld1q_s16(a);
  a += a_stride;
  *a7 = vld1q_s16(a);

  transpose_s16_8x8(a0, a1, a2, a3, a4, a5, a6, a7);
}

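// Load an 8x8 block of int32_t from |a| (|a_stride| again in elements) and
// transpose it. An 8-element row of 32-bit values does not fit in one
// 128-bit register, so each of *a0..*a7 carries its row as an int32x4x2_t
// pair.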
static INLINE void load_and_transpose_s32_8x8(
    const int32_t *a, const int a_stride, int32x4x2_t *const a0,
    int32x4x2_t *const a1, int32x4x2_t *const a2, int32x4x2_t *const a3,
    int32x4x2_t *const a4, int32x4x2_t *const a5, int32x4x2_t *const a6,
    int32x4x2_t *const a7) {
  a0->val[0] = vld1q_s32(a);
  a0->val[1] = vld1q_s32(a + 4);
  a += a_stride;
  a1->val[0] = vld1q_s32(a);
  a1->val[1] = vld1q_s32(a + 4);
  a += a_stride;
  a2->val[0] = vld1q_s32(a);
  a2->val[1] = vld1q_s32(a + 4);
  a += a_stride;
  a3->val[0] = vld1q_s32(a);
  a3->val[1] = vld1q_s32(a + 4);
  a += a_stride;
  a4->val[0] = vld1q_s32(a);
  a4->val[1] = vld1q_s32(a + 4);
  a += a_stride;
  a5->val[0] = vld1q_s32(a);
  a5->val[1] = vld1q_s32(a + 4);
  a += a_stride;
  a6->val[0] = vld1q_s32(a);
  a6->val[1] = vld1q_s32(a + 4);
  a += a_stride;
  a7->val[0] = vld1q_s32(a);
  a7->val[1] = vld1q_s32(a + 4);

  transpose_s32_8x8(a0, a1, a2, a3, a4, a5, a6, a7);
}

#endif  // VPX_VPX_DSP_ARM_TRANSPOSE_NEON_H_