xref: /aosp_15_r20/external/libaom/aom_dsp/arm/transpose_neon.h (revision 77c1e3ccc04c968bd2bc212e87364f250e820521)
/*
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
11*77c1e3ccSAndroid Build Coastguard Worker 
12*77c1e3ccSAndroid Build Coastguard Worker #ifndef AOM_AOM_DSP_ARM_TRANSPOSE_NEON_H_
13*77c1e3ccSAndroid Build Coastguard Worker #define AOM_AOM_DSP_ARM_TRANSPOSE_NEON_H_
14*77c1e3ccSAndroid Build Coastguard Worker 
15*77c1e3ccSAndroid Build Coastguard Worker #include <arm_neon.h>
16*77c1e3ccSAndroid Build Coastguard Worker 
17*77c1e3ccSAndroid Build Coastguard Worker #include "aom_dsp/aom_dsp_common.h"  // For AOM_FORCE_INLINE.
18*77c1e3ccSAndroid Build Coastguard Worker #include "config/aom_config.h"
19*77c1e3ccSAndroid Build Coastguard Worker 
// Transposes an 8x8 block of 8-bit elements.
//
// a0..a7 are the eight input rows. On return *oN holds column N of the input,
// i.e. lane i of *oN is lane N of a<i>. The inputs are taken by value, so the
// output pointers may refer to the same objects the inputs were loaded from.
static inline void transpose_elems_u8_8x8(
    uint8x8_t a0, uint8x8_t a1, uint8x8_t a2, uint8x8_t a3, uint8x8_t a4,
    uint8x8_t a5, uint8x8_t a6, uint8x8_t a7, uint8x8_t *o0, uint8x8_t *o1,
    uint8x8_t *o2, uint8x8_t *o3, uint8x8_t *o4, uint8x8_t *o5, uint8x8_t *o6,
    uint8x8_t *o7) {
  // Swap 8 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 10 11 12 13 14 15 16 17
  // a2: 20 21 22 23 24 25 26 27
  // a3: 30 31 32 33 34 35 36 37
  // a4: 40 41 42 43 44 45 46 47
  // a5: 50 51 52 53 54 55 56 57
  // a6: 60 61 62 63 64 65 66 67
  // a7: 70 71 72 73 74 75 76 77
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16  40 50 42 52 44 54 46 56
  // b0.val[1]: 01 11 03 13 05 15 07 17  41 51 43 53 45 55 47 57
  // b1.val[0]: 20 30 22 32 24 34 26 36  60 70 62 72 64 74 66 76
  // b1.val[1]: 21 31 23 33 25 35 27 37  61 71 63 73 65 75 67 77

  // Rows i and i+4 are packed into one 128-bit register so that each
  // transpose step below processes the top and bottom halves together.
  const uint8x16x2_t b0 = vtrnq_u8(vcombine_u8(a0, a4), vcombine_u8(a1, a5));
  const uint8x16x2_t b1 = vtrnq_u8(vcombine_u8(a2, a6), vcombine_u8(a3, a7));

  // Swap 16 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34  40 50 60 70 44 54 64 74
  // c0.val[1]: 02 12 22 32 06 16 26 36  42 52 62 72 46 56 66 76
  // c1.val[0]: 01 11 21 31 05 15 25 35  41 51 61 71 45 55 65 75
  // c1.val[1]: 03 13 23 33 07 17 27 37  43 53 63 73 47 57 67 77

  const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]),
                                    vreinterpretq_u16_u8(b1.val[0]));
  const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]),
                                    vreinterpretq_u16_u8(b1.val[1]));

  // Unzip 32 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 40 50 60 70  01 11 21 31 41 51 61 71
  // d0.val[1]: 04 14 24 34 44 54 64 74  05 15 25 35 45 55 65 75
  // d1.val[0]: 02 12 22 32 42 52 62 72  03 13 23 33 43 53 63 73
  // d1.val[1]: 06 16 26 36 46 56 66 76  07 17 27 37 47 57 67 77
  const uint32x4x2_t d0 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[0]),
                                    vreinterpretq_u32_u16(c1.val[0]));
  const uint32x4x2_t d1 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[1]),
                                    vreinterpretq_u32_u16(c1.val[1]));

  // Each 128-bit register now holds two complete output rows; split them.
  *o0 = vreinterpret_u8_u32(vget_low_u32(d0.val[0]));
  *o1 = vreinterpret_u8_u32(vget_high_u32(d0.val[0]));
  *o2 = vreinterpret_u8_u32(vget_low_u32(d1.val[0]));
  *o3 = vreinterpret_u8_u32(vget_high_u32(d1.val[0]));
  *o4 = vreinterpret_u8_u32(vget_low_u32(d0.val[1]));
  *o5 = vreinterpret_u8_u32(vget_high_u32(d0.val[1]));
  *o6 = vreinterpret_u8_u32(vget_low_u32(d1.val[1]));
  *o7 = vreinterpret_u8_u32(vget_high_u32(d1.val[1]));
}
73*77c1e3ccSAndroid Build Coastguard Worker 
// In-place variant of transpose_elems_u8_8x8(): reads the eight rows through
// a0..a7 and writes the transposed rows back through the same pointers.
static inline void transpose_elems_inplace_u8_8x8(uint8x8_t *a0, uint8x8_t *a1,
                                                  uint8x8_t *a2, uint8x8_t *a3,
                                                  uint8x8_t *a4, uint8x8_t *a5,
                                                  uint8x8_t *a6,
                                                  uint8x8_t *a7) {
  // The inputs are passed by value, so every row is read before any of the
  // outputs is stored.
  transpose_elems_u8_8x8(*a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7, a0, a1, a2, a3,
                         a4, a5, a6, a7);
}
82*77c1e3ccSAndroid Build Coastguard Worker 
// Array variant of transpose_elems_u8_8x8(): reads rows in[0..7] and writes
// the transposed rows to out[0..7].
static inline void transpose_arrays_u8_8x8(const uint8x8_t *in,
                                           uint8x8_t *out) {
  transpose_elems_u8_8x8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7],
                         &out[0], &out[1], &out[2], &out[3], &out[4], &out[5],
                         &out[6], &out[7]);
}
89*77c1e3ccSAndroid Build Coastguard Worker 
// Transposes 16 rows of 8 bytes into 8 rows of 16 bytes.
//
// Reads x[0..15] and writes d[0..7]; lane r of d[c] is lane c of x[r].
static AOM_FORCE_INLINE void transpose_arrays_u8_8x16(const uint8x8_t *x,
                                                      uint8x16_t *d) {
  // Interleave bytes of adjacent rows: rows 0-7 ...
  uint8x8x2_t w0 = vzip_u8(x[0], x[1]);
  uint8x8x2_t w1 = vzip_u8(x[2], x[3]);
  uint8x8x2_t w2 = vzip_u8(x[4], x[5]);
  uint8x8x2_t w3 = vzip_u8(x[6], x[7]);

  // ... and rows 8-15.
  uint8x8x2_t w8 = vzip_u8(x[8], x[9]);
  uint8x8x2_t w9 = vzip_u8(x[10], x[11]);
  uint8x8x2_t w10 = vzip_u8(x[12], x[13]);
  uint8x8x2_t w11 = vzip_u8(x[14], x[15]);

  // Interleave 16-bit pairs so each 32-bit lane holds four consecutive rows
  // of one column (columns 0-3 come from the val[0] halves).
  uint16x4x2_t w4 =
      vzip_u16(vreinterpret_u16_u8(w0.val[0]), vreinterpret_u16_u8(w1.val[0]));
  uint16x4x2_t w5 =
      vzip_u16(vreinterpret_u16_u8(w2.val[0]), vreinterpret_u16_u8(w3.val[0]));
  uint16x4x2_t w12 =
      vzip_u16(vreinterpret_u16_u8(w8.val[0]), vreinterpret_u16_u8(w9.val[0]));
  uint16x4x2_t w13 = vzip_u16(vreinterpret_u16_u8(w10.val[0]),
                              vreinterpret_u16_u8(w11.val[0]));

  // Interleave 32-bit groups so each 64-bit vector holds eight consecutive
  // rows of one column.
  uint32x2x2_t w6 = vzip_u32(vreinterpret_u32_u16(w4.val[0]),
                             vreinterpret_u32_u16(w5.val[0]));
  uint32x2x2_t w7 = vzip_u32(vreinterpret_u32_u16(w4.val[1]),
                             vreinterpret_u32_u16(w5.val[1]));
  uint32x2x2_t w14 = vzip_u32(vreinterpret_u32_u16(w12.val[0]),
                              vreinterpret_u32_u16(w13.val[0]));
  uint32x2x2_t w15 = vzip_u32(vreinterpret_u32_u16(w12.val[1]),
                              vreinterpret_u32_u16(w13.val[1]));

  // Store first 4-line result
  d[0] = vreinterpretq_u8_u32(vcombine_u32(w6.val[0], w14.val[0]));
  d[1] = vreinterpretq_u8_u32(vcombine_u32(w6.val[1], w14.val[1]));
  d[2] = vreinterpretq_u8_u32(vcombine_u32(w7.val[0], w15.val[0]));
  d[3] = vreinterpretq_u8_u32(vcombine_u32(w7.val[1], w15.val[1]));

  // Repeat the 16-bit and 32-bit steps on the val[1] halves (columns 4-7).
  w4 = vzip_u16(vreinterpret_u16_u8(w0.val[1]), vreinterpret_u16_u8(w1.val[1]));
  w5 = vzip_u16(vreinterpret_u16_u8(w2.val[1]), vreinterpret_u16_u8(w3.val[1]));
  w12 =
      vzip_u16(vreinterpret_u16_u8(w8.val[1]), vreinterpret_u16_u8(w9.val[1]));
  w13 = vzip_u16(vreinterpret_u16_u8(w10.val[1]),
                 vreinterpret_u16_u8(w11.val[1]));

  w6 = vzip_u32(vreinterpret_u32_u16(w4.val[0]),
                vreinterpret_u32_u16(w5.val[0]));
  w7 = vzip_u32(vreinterpret_u32_u16(w4.val[1]),
                vreinterpret_u32_u16(w5.val[1]));
  w14 = vzip_u32(vreinterpret_u32_u16(w12.val[0]),
                 vreinterpret_u32_u16(w13.val[0]));
  w15 = vzip_u32(vreinterpret_u32_u16(w12.val[1]),
                 vreinterpret_u32_u16(w13.val[1]));

  // Store second 4-line result
  d[4] = vreinterpretq_u8_u32(vcombine_u32(w6.val[0], w14.val[0]));
  d[5] = vreinterpretq_u8_u32(vcombine_u32(w6.val[1], w14.val[1]));
  d[6] = vreinterpretq_u8_u32(vcombine_u32(w7.val[0], w15.val[0]));
  d[7] = vreinterpretq_u8_u32(vcombine_u32(w7.val[1], w15.val[1]));
}
148*77c1e3ccSAndroid Build Coastguard Worker 
// Transposes 8 rows of 16 bytes into 16 rows of 8 bytes.
//
// Reads x[0..7] and writes d[0..15]; lane r of d[c] is lane c of x[r].
static AOM_FORCE_INLINE void transpose_arrays_u8_16x8(const uint8x16_t *x,
                                                      uint8x8_t *d) {
  // Interleave bytes of adjacent rows; val[0] covers columns 0-7 and val[1]
  // covers columns 8-15.
  uint8x16x2_t w0 = vzipq_u8(x[0], x[1]);
  uint8x16x2_t w1 = vzipq_u8(x[2], x[3]);
  uint8x16x2_t w2 = vzipq_u8(x[4], x[5]);
  uint8x16x2_t w3 = vzipq_u8(x[6], x[7]);

  // Interleave 16-bit pairs so each 32-bit lane holds four consecutive rows
  // of one column.
  uint16x8x2_t w4 = vzipq_u16(vreinterpretq_u16_u8(w0.val[0]),
                              vreinterpretq_u16_u8(w1.val[0]));
  uint16x8x2_t w5 = vzipq_u16(vreinterpretq_u16_u8(w2.val[0]),
                              vreinterpretq_u16_u8(w3.val[0]));
  uint16x8x2_t w6 = vzipq_u16(vreinterpretq_u16_u8(w0.val[1]),
                              vreinterpretq_u16_u8(w1.val[1]));
  uint16x8x2_t w7 = vzipq_u16(vreinterpretq_u16_u8(w2.val[1]),
                              vreinterpretq_u16_u8(w3.val[1]));

  // Interleave 32-bit groups so each 64-bit half holds one full column:
  // w8 = columns 0-3, w10 = columns 4-7, w9 = columns 8-11,
  // w11 = columns 12-15.
  uint32x4x2_t w8 = vzipq_u32(vreinterpretq_u32_u16(w4.val[0]),
                              vreinterpretq_u32_u16(w5.val[0]));
  uint32x4x2_t w9 = vzipq_u32(vreinterpretq_u32_u16(w6.val[0]),
                              vreinterpretq_u32_u16(w7.val[0]));
  uint32x4x2_t w10 = vzipq_u32(vreinterpretq_u32_u16(w4.val[1]),
                               vreinterpretq_u32_u16(w5.val[1]));
  uint32x4x2_t w11 = vzipq_u32(vreinterpretq_u32_u16(w6.val[1]),
                               vreinterpretq_u32_u16(w7.val[1]));

  d[0] = vreinterpret_u8_u32(vget_low_u32(w8.val[0]));
  d[1] = vreinterpret_u8_u32(vget_high_u32(w8.val[0]));
  d[2] = vreinterpret_u8_u32(vget_low_u32(w8.val[1]));
  d[3] = vreinterpret_u8_u32(vget_high_u32(w8.val[1]));
  d[4] = vreinterpret_u8_u32(vget_low_u32(w10.val[0]));
  d[5] = vreinterpret_u8_u32(vget_high_u32(w10.val[0]));
  d[6] = vreinterpret_u8_u32(vget_low_u32(w10.val[1]));
  d[7] = vreinterpret_u8_u32(vget_high_u32(w10.val[1]));
  d[8] = vreinterpret_u8_u32(vget_low_u32(w9.val[0]));
  d[9] = vreinterpret_u8_u32(vget_high_u32(w9.val[0]));
  d[10] = vreinterpret_u8_u32(vget_low_u32(w9.val[1]));
  d[11] = vreinterpret_u8_u32(vget_high_u32(w9.val[1]));
  d[12] = vreinterpret_u8_u32(vget_low_u32(w11.val[0]));
  d[13] = vreinterpret_u8_u32(vget_high_u32(w11.val[0]));
  d[14] = vreinterpret_u8_u32(vget_low_u32(w11.val[1]));
  d[15] = vreinterpret_u8_u32(vget_high_u32(w11.val[1]));
}
191*77c1e3ccSAndroid Build Coastguard Worker 
// Transposes the 64-bit halves of two 128-bit vectors and returns the result
// reinterpreted as 16-bit lanes:
//   val[0] = low 64 bits of a0, followed by low 64 bits of a1
//   val[1] = high 64 bits of a0, followed by high 64 bits of a1
static inline uint16x8x2_t aom_vtrnq_u64_to_u16(uint32x4_t a0, uint32x4_t a1) {
  uint16x8x2_t b0;
#if AOM_ARCH_AARCH64
  b0.val[0] = vreinterpretq_u16_u64(
      vtrn1q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1)));
  b0.val[1] = vreinterpretq_u16_u64(
      vtrn2q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1)));
#else
  // Without the 64-bit transpose intrinsics, build the same result from the
  // low and high halves of the inputs.
  b0.val[0] = vcombine_u16(vreinterpret_u16_u32(vget_low_u32(a0)),
                           vreinterpret_u16_u32(vget_low_u32(a1)));
  b0.val[1] = vcombine_u16(vreinterpret_u16_u32(vget_high_u32(a0)),
                           vreinterpret_u16_u32(vget_high_u32(a1)));
#endif
  return b0;
}
207*77c1e3ccSAndroid Build Coastguard Worker 
// Transposes a 16x16 block of 8-bit elements.
//
// Reads rows x[0..15] and writes d[0..15]; lane r of d[c] is lane c of x[r].
static inline void transpose_arrays_u8_16x16(const uint8x16_t *x,
                                             uint8x16_t *d) {
  // Interleave bytes of adjacent rows; val[0] covers columns 0-7 and val[1]
  // covers columns 8-15. w0-w3 hold rows 0-7, w4-w7 hold rows 8-15.
  uint8x16x2_t w0 = vzipq_u8(x[0], x[1]);
  uint8x16x2_t w1 = vzipq_u8(x[2], x[3]);
  uint8x16x2_t w2 = vzipq_u8(x[4], x[5]);
  uint8x16x2_t w3 = vzipq_u8(x[6], x[7]);

  uint8x16x2_t w4 = vzipq_u8(x[8], x[9]);
  uint8x16x2_t w5 = vzipq_u8(x[10], x[11]);
  uint8x16x2_t w6 = vzipq_u8(x[12], x[13]);
  uint8x16x2_t w7 = vzipq_u8(x[14], x[15]);

  // lower half: columns 0-7, taken from the val[0] results above.
  uint16x8x2_t w8 = vzipq_u16(vreinterpretq_u16_u8(w0.val[0]),
                              vreinterpretq_u16_u8(w1.val[0]));
  uint16x8x2_t w9 = vzipq_u16(vreinterpretq_u16_u8(w2.val[0]),
                              vreinterpretq_u16_u8(w3.val[0]));
  uint16x8x2_t w10 = vzipq_u16(vreinterpretq_u16_u8(w4.val[0]),
                               vreinterpretq_u16_u8(w5.val[0]));
  uint16x8x2_t w11 = vzipq_u16(vreinterpretq_u16_u8(w6.val[0]),
                               vreinterpretq_u16_u8(w7.val[0]));

  // After this step each 64-bit half holds eight consecutive rows of one
  // column: w12/w14 for rows 0-7, w13/w15 for rows 8-15.
  uint32x4x2_t w12 = vzipq_u32(vreinterpretq_u32_u16(w8.val[0]),
                               vreinterpretq_u32_u16(w9.val[0]));
  uint32x4x2_t w13 = vzipq_u32(vreinterpretq_u32_u16(w10.val[0]),
                               vreinterpretq_u32_u16(w11.val[0]));
  uint32x4x2_t w14 = vzipq_u32(vreinterpretq_u32_u16(w8.val[1]),
                               vreinterpretq_u32_u16(w9.val[1]));
  uint32x4x2_t w15 = vzipq_u32(vreinterpretq_u32_u16(w10.val[1]),
                               vreinterpretq_u32_u16(w11.val[1]));

  // Pair the rows 0-7 half of each column with its rows 8-15 half.
  uint16x8x2_t d01 = aom_vtrnq_u64_to_u16(w12.val[0], w13.val[0]);
  d[0] = vreinterpretq_u8_u16(d01.val[0]);
  d[1] = vreinterpretq_u8_u16(d01.val[1]);
  uint16x8x2_t d23 = aom_vtrnq_u64_to_u16(w12.val[1], w13.val[1]);
  d[2] = vreinterpretq_u8_u16(d23.val[0]);
  d[3] = vreinterpretq_u8_u16(d23.val[1]);
  uint16x8x2_t d45 = aom_vtrnq_u64_to_u16(w14.val[0], w15.val[0]);
  d[4] = vreinterpretq_u8_u16(d45.val[0]);
  d[5] = vreinterpretq_u8_u16(d45.val[1]);
  uint16x8x2_t d67 = aom_vtrnq_u64_to_u16(w14.val[1], w15.val[1]);
  d[6] = vreinterpretq_u8_u16(d67.val[0]);
  d[7] = vreinterpretq_u8_u16(d67.val[1]);

  // upper half: columns 8-15, taken from the val[1] results of the byte zip.
  w8 = vzipq_u16(vreinterpretq_u16_u8(w0.val[1]),
                 vreinterpretq_u16_u8(w1.val[1]));
  w9 = vzipq_u16(vreinterpretq_u16_u8(w2.val[1]),
                 vreinterpretq_u16_u8(w3.val[1]));
  w10 = vzipq_u16(vreinterpretq_u16_u8(w4.val[1]),
                  vreinterpretq_u16_u8(w5.val[1]));
  w11 = vzipq_u16(vreinterpretq_u16_u8(w6.val[1]),
                  vreinterpretq_u16_u8(w7.val[1]));

  w12 = vzipq_u32(vreinterpretq_u32_u16(w8.val[0]),
                  vreinterpretq_u32_u16(w9.val[0]));
  w13 = vzipq_u32(vreinterpretq_u32_u16(w10.val[0]),
                  vreinterpretq_u32_u16(w11.val[0]));
  w14 = vzipq_u32(vreinterpretq_u32_u16(w8.val[1]),
                  vreinterpretq_u32_u16(w9.val[1]));
  w15 = vzipq_u32(vreinterpretq_u32_u16(w10.val[1]),
                  vreinterpretq_u32_u16(w11.val[1]));

  d01 = aom_vtrnq_u64_to_u16(w12.val[0], w13.val[0]);
  d[8] = vreinterpretq_u8_u16(d01.val[0]);
  d[9] = vreinterpretq_u8_u16(d01.val[1]);
  d23 = aom_vtrnq_u64_to_u16(w12.val[1], w13.val[1]);
  d[10] = vreinterpretq_u8_u16(d23.val[0]);
  d[11] = vreinterpretq_u8_u16(d23.val[1]);
  d45 = aom_vtrnq_u64_to_u16(w14.val[0], w15.val[0]);
  d[12] = vreinterpretq_u8_u16(d45.val[0]);
  d[13] = vreinterpretq_u8_u16(d45.val[1]);
  d67 = aom_vtrnq_u64_to_u16(w14.val[1], w15.val[1]);
  d[14] = vreinterpretq_u8_u16(d67.val[0]);
  d[15] = vreinterpretq_u8_u16(d67.val[1]);
}
283*77c1e3ccSAndroid Build Coastguard Worker 
// Transposes 16 rows, each supplied as a pair of 16-byte vectors, into 32
// rows of 16 bytes.
//
// Reads x[0..15] and writes d[0..31]: d[0..15] is the 16x16 transpose of the
// val[0] vectors and d[16..31] is the 16x16 transpose of the val[1] vectors.
static AOM_FORCE_INLINE void transpose_arrays_u8_32x16(const uint8x16x2_t *x,
                                                       uint8x16_t *d) {
  // Split the paired input into two contiguous 16-row arrays so that each
  // half can be handed to the 16x16 transpose.
  uint8x16_t x2[32];
  for (int i = 0; i < 16; ++i) {
    x2[i] = x[i].val[0];
    x2[i + 16] = x[i].val[1];
  }
  transpose_arrays_u8_16x16(x2, d);
  transpose_arrays_u8_16x16(x2 + 16, d + 16);
}
294*77c1e3ccSAndroid Build Coastguard Worker 
// Transposes, in place, the two adjacent 4x4 byte blocks held in four rows of
// eight bytes.
//
// On return lanes 0-3 of *aK hold column K of the input rows and lanes 4-7
// hold column K + 4 (see the c0/c1 diagram below).
static inline void transpose_elems_inplace_u8_8x4(uint8x8_t *a0, uint8x8_t *a1,
                                                  uint8x8_t *a2,
                                                  uint8x8_t *a3) {
  // Swap 8 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 10 11 12 13 14 15 16 17
  // a2: 20 21 22 23 24 25 26 27
  // a3: 30 31 32 33 34 35 36 37
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37

  const uint8x8x2_t b0 = vtrn_u8(*a0, *a1);
  const uint8x8x2_t b1 = vtrn_u8(*a2, *a3);

  // Swap 16 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34
  // c0.val[1]: 02 12 22 32 06 16 26 36
  // c1.val[0]: 01 11 21 31 05 15 25 35
  // c1.val[1]: 03 13 23 33 07 17 27 37

  const uint16x4x2_t c0 =
      vtrn_u16(vreinterpret_u16_u8(b0.val[0]), vreinterpret_u16_u8(b1.val[0]));
  const uint16x4x2_t c1 =
      vtrn_u16(vreinterpret_u16_u8(b0.val[1]), vreinterpret_u16_u8(b1.val[1]));

  // c0 carries the even columns and c1 the odd ones, hence the interleaved
  // store order.
  *a0 = vreinterpret_u8_u16(c0.val[0]);
  *a1 = vreinterpret_u8_u16(c1.val[0]);
  *a2 = vreinterpret_u8_u16(c0.val[1]);
  *a3 = vreinterpret_u8_u16(c1.val[1]);
}
328*77c1e3ccSAndroid Build Coastguard Worker 
// Transposes, in place, the four adjacent 4x4 byte blocks held in four rows
// of sixteen bytes.
//
// On return lanes 4*g .. 4*g+3 of *aK hold column K + 4*g of the input rows,
// for g = 0..3 (see the c0/c1 diagram below).
static inline void transpose_elems_inplace_u8_16x4(uint8x16_t *a0,
                                                   uint8x16_t *a1,
                                                   uint8x16_t *a2,
                                                   uint8x16_t *a3) {
  // Swap 8 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07 08 09 010 011 012 013 014 015
  // a1: 10 11 12 13 14 15 16 17 18 19 110 111 112 113 114 115
  // a2: 20 21 22 23 24 25 26 27 28 29 210 211 212 213 214 215
  // a3: 30 31 32 33 34 35 36 37 38 39 310 311 312 313 314 315
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16 08 18 010 110 012 112 014 114
  // b0.val[1]: 01 11 03 13 05 15 07 17 09 19 011 111 013 113 015 115
  // b1.val[0]: 20 30 22 32 24 34 26 36 28 38 210 310 212 312 214 314
  // b1.val[1]: 21 31 23 33 25 35 27 37 29 39 211 311 213 313 215 315

  const uint8x16x2_t b0 = vtrnq_u8(*a0, *a1);
  const uint8x16x2_t b1 = vtrnq_u8(*a2, *a3);

  // Swap 16 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34 08  18  28  38  012 112 212 312
  // c0.val[1]: 02 12 22 32 06 16 26 36 010 110 210 310 014 114 214 314
  // c1.val[0]: 01 11 21 31 05 15 25 35 09  19  29  39  013 113 213 313
  // c1.val[1]: 03 13 23 33 07 17 27 37 011 111 211 311 015 115 215 315

  const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]),
                                    vreinterpretq_u16_u8(b1.val[0]));
  const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]),
                                    vreinterpretq_u16_u8(b1.val[1]));

  // c0 carries the even columns and c1 the odd ones, hence the interleaved
  // store order.
  *a0 = vreinterpretq_u8_u16(c0.val[0]);
  *a1 = vreinterpretq_u8_u16(c1.val[0]);
  *a2 = vreinterpretq_u8_u16(c0.val[1]);
  *a3 = vreinterpretq_u8_u16(c1.val[1]);
}
363*77c1e3ccSAndroid Build Coastguard Worker 
static inline void transpose_elems_inplace_u8_4x4(uint8x8_t *a0,
                                                  uint8x8_t *a1) {
  // Transposes a 4x4 byte block that is packed two rows per vector. Input:
  // a0: 00 01 02 03  10 11 12 13
  // a1: 20 21 22 23  30 31 32 33
  // Output, two columns per vector (columns 0/2 in a0, columns 1/3 in a1):
  // a0: 00 10 20 30  02 12 22 32
  // a1: 01 11 21 31  03 13 23 33

  // Stage 1: interleave 16-bit lanes of the two inputs.
  // pairs.val[0]: 00 01 20 21  10 11 30 31
  // pairs.val[1]: 02 03 22 23  12 13 32 33
  const uint16x4_t rows01 = vreinterpret_u16_u8(*a0);
  const uint16x4_t rows23 = vreinterpret_u16_u8(*a1);
  const uint16x4x2_t pairs = vtrn_u16(rows01, rows23);

  // Stage 2: interleave 32-bit lanes.
  // quads.val[0]: 00 01 20 21  02 03 22 23
  // quads.val[1]: 10 11 30 31  12 13 32 33
  const uint32x2_t quad_lo = vreinterpret_u32_u16(pairs.val[0]);
  const uint32x2_t quad_hi = vreinterpret_u32_u16(pairs.val[1]);
  const uint32x2x2_t quads = vtrn_u32(quad_lo, quad_hi);

  // Stage 3: interleave single bytes.
  // cols.val[0]: 00 10 20 30  02 12 22 32
  // cols.val[1]: 01 11 21 31  03 13 23 33
  const uint8x8x2_t cols = vtrn_u8(vreinterpret_u8_u32(quads.val[0]),
                                   vreinterpret_u8_u32(quads.val[1]));

  *a0 = cols.val[0];
  *a1 = cols.val[1];
}
393*77c1e3ccSAndroid Build Coastguard Worker 
static inline void transpose_elems_u8_4x8(uint8x8_t a0, uint8x8_t a1,
                                          uint8x8_t a2, uint8x8_t a3,
                                          uint8x8_t a4, uint8x8_t a5,
                                          uint8x8_t a6, uint8x8_t a7,
                                          uint8x8_t *o0, uint8x8_t *o1,
                                          uint8x8_t *o2, uint8x8_t *o3) {
  // Transposes eight rows of four bytes into four rows of eight bytes. Only
  // the low four bytes of each input reach the outputs; the high four bytes
  // (XX below) are dropped after the first stage. Input:
  // a0: 00 01 02 03 XX XX XX XX
  // a1: 10 11 12 13 XX XX XX XX
  // a2: 20 21 22 23 XX XX XX XX
  // a3: 30 31 32 33 XX XX XX XX
  // a4: 40 41 42 43 XX XX XX XX
  // a5: 50 51 52 53 XX XX XX XX
  // a6: 60 61 62 63 XX XX XX XX
  // a7: 70 71 72 73 XX XX XX XX
  // Output:
  // o0: 00 10 20 30 40 50 60 70
  // o1: 01 11 21 31 41 51 61 71
  // o2: 02 12 22 32 42 52 62 72
  // o3: 03 13 23 33 43 53 63 73

  // Stage 1: pair row k with row k + 4 using a 32-bit interleave. Only
  // val[0] of each result is consumed below.
  // rows04.val[0]: 00 01 02 03 40 41 42 43
  // rows15.val[0]: 10 11 12 13 50 51 52 53
  // rows26.val[0]: 20 21 22 23 60 61 62 63
  // rows37.val[0]: 30 31 32 33 70 71 72 73
  const uint32x2x2_t rows04 =
      vtrn_u32(vreinterpret_u32_u8(a0), vreinterpret_u32_u8(a4));
  const uint32x2x2_t rows15 =
      vtrn_u32(vreinterpret_u32_u8(a1), vreinterpret_u32_u8(a5));
  const uint32x2x2_t rows26 =
      vtrn_u32(vreinterpret_u32_u8(a2), vreinterpret_u32_u8(a6));
  const uint32x2x2_t rows37 =
      vtrn_u32(vreinterpret_u32_u8(a3), vreinterpret_u32_u8(a7));

  // Stage 2: interleave 16-bit lanes of the even rows and of the odd rows.
  // even_rows.val[0]: 00 01 20 21 40 41 60 61
  // even_rows.val[1]: 02 03 22 23 42 43 62 63
  // odd_rows.val[0]:  10 11 30 31 50 51 70 71
  // odd_rows.val[1]:  12 13 32 33 52 53 72 73
  const uint16x4x2_t even_rows = vtrn_u16(vreinterpret_u16_u32(rows04.val[0]),
                                          vreinterpret_u16_u32(rows26.val[0]));
  const uint16x4x2_t odd_rows = vtrn_u16(vreinterpret_u16_u32(rows15.val[0]),
                                         vreinterpret_u16_u32(rows37.val[0]));

  // Stage 3: interleave single bytes to finish the transpose.
  // cols01.val[0]: 00 10 20 30 40 50 60 70
  // cols01.val[1]: 01 11 21 31 41 51 61 71
  // cols23.val[0]: 02 12 22 32 42 52 62 72
  // cols23.val[1]: 03 13 23 33 43 53 63 73
  const uint8x8x2_t cols01 = vtrn_u8(vreinterpret_u8_u16(even_rows.val[0]),
                                     vreinterpret_u8_u16(odd_rows.val[0]));
  const uint8x8x2_t cols23 = vtrn_u8(vreinterpret_u8_u16(even_rows.val[1]),
                                     vreinterpret_u8_u16(odd_rows.val[1]));

  *o0 = cols01.val[0];
  *o1 = cols01.val[1];
  *o2 = cols23.val[0];
  *o3 = cols23.val[1];
}
451*77c1e3ccSAndroid Build Coastguard Worker 
static inline void transpose_array_inplace_u16_4x4(uint16x4_t a[4]) {
  // Transposes a 4x4 block of 16-bit elements in place.
  // Input:            Output:
  // a[0]: 00 01 02 03   a[0]: 00 10 20 30
  // a[1]: 10 11 12 13   a[1]: 01 11 21 31
  // a[2]: 20 21 22 23   a[2]: 02 12 22 32
  // a[3]: 30 31 32 33   a[3]: 03 13 23 33

  // Stage 1: interleave 16-bit lanes of adjacent row pairs.
  // rows01.val[0]: 00 10 02 12
  // rows01.val[1]: 01 11 03 13
  // rows23.val[0]: 20 30 22 32
  // rows23.val[1]: 21 31 23 33
  const uint16x4x2_t rows01 = vtrn_u16(a[0], a[1]);
  const uint16x4x2_t rows23 = vtrn_u16(a[2], a[3]);

  // Stage 2: interleave 32-bit lanes.
  // even_cols.val[0]: 00 10 20 30
  // even_cols.val[1]: 02 12 22 32
  // odd_cols.val[0]:  01 11 21 31
  // odd_cols.val[1]:  03 13 23 33
  const uint32x2x2_t even_cols = vtrn_u32(vreinterpret_u32_u16(rows01.val[0]),
                                          vreinterpret_u32_u16(rows23.val[0]));
  const uint32x2x2_t odd_cols = vtrn_u32(vreinterpret_u32_u16(rows01.val[1]),
                                         vreinterpret_u32_u16(rows23.val[1]));

  a[0] = vreinterpret_u16_u32(even_cols.val[0]);
  a[1] = vreinterpret_u16_u32(odd_cols.val[0]);
  a[2] = vreinterpret_u16_u32(even_cols.val[1]);
  a[3] = vreinterpret_u16_u32(odd_cols.val[1]);
}
488*77c1e3ccSAndroid Build Coastguard Worker 
static inline void transpose_array_inplace_u16_4x8(uint16x8_t a[4]) {
  // Transposes, in place, the two 4x4 blocks of 16-bit elements held side by
  // side in four 8-lane rows.
  // 4x8 Input:
  // a[0]: 00 01 02 03 04 05 06 07
  // a[1]: 10 11 12 13 14 15 16 17
  // a[2]: 20 21 22 23 24 25 26 27
  // a[3]: 30 31 32 33 34 35 36 37
  // 8x4 Output:
  // a[0]: 00 10 20 30 04 14 24 34
  // a[1]: 01 11 21 31 05 15 25 35
  // a[2]: 02 12 22 32 06 16 26 36
  // a[3]: 03 13 23 33 07 17 27 37

  // Stage 1: interleave 16-bit lanes of adjacent row pairs.
  // rows01.val[0]: 00 10 02 12 04 14 06 16
  // rows01.val[1]: 01 11 03 13 05 15 07 17
  // rows23.val[0]: 20 30 22 32 24 34 26 36
  // rows23.val[1]: 21 31 23 33 25 35 27 37
  const uint16x8x2_t rows01 = vtrnq_u16(a[0], a[1]);
  const uint16x8x2_t rows23 = vtrnq_u16(a[2], a[3]);

  // Stage 2: interleave 32-bit lanes.
  // even_cols.val[0]: 00 10 20 30 04 14 24 34
  // even_cols.val[1]: 02 12 22 32 06 16 26 36
  // odd_cols.val[0]:  01 11 21 31 05 15 25 35
  // odd_cols.val[1]:  03 13 23 33 07 17 27 37
  const uint32x4_t even01 = vreinterpretq_u32_u16(rows01.val[0]);
  const uint32x4_t even23 = vreinterpretq_u32_u16(rows23.val[0]);
  const uint32x4_t odd01 = vreinterpretq_u32_u16(rows01.val[1]);
  const uint32x4_t odd23 = vreinterpretq_u32_u16(rows23.val[1]);
  const uint32x4x2_t even_cols = vtrnq_u32(even01, even23);
  const uint32x4x2_t odd_cols = vtrnq_u32(odd01, odd23);

  a[0] = vreinterpretq_u16_u32(even_cols.val[0]);
  a[1] = vreinterpretq_u16_u32(odd_cols.val[0]);
  a[2] = vreinterpretq_u16_u32(even_cols.val[1]);
  a[3] = vreinterpretq_u16_u32(odd_cols.val[1]);
}
522*77c1e3ccSAndroid Build Coastguard Worker 
523*77c1e3ccSAndroid Build Coastguard Worker // Special transpose for loop filter.
524*77c1e3ccSAndroid Build Coastguard Worker // 4x8 Input:
525*77c1e3ccSAndroid Build Coastguard Worker // p_q:  p3 p2 p1 p0 q0 q1 q2 q3
526*77c1e3ccSAndroid Build Coastguard Worker // a[0]: 00 01 02 03 04 05 06 07
527*77c1e3ccSAndroid Build Coastguard Worker // a[1]: 10 11 12 13 14 15 16 17
528*77c1e3ccSAndroid Build Coastguard Worker // a[2]: 20 21 22 23 24 25 26 27
529*77c1e3ccSAndroid Build Coastguard Worker // a[3]: 30 31 32 33 34 35 36 37
530*77c1e3ccSAndroid Build Coastguard Worker // 8x4 Output:
531*77c1e3ccSAndroid Build Coastguard Worker // a[0]: 03 13 23 33 04 14 24 34  p0q0
532*77c1e3ccSAndroid Build Coastguard Worker // a[1]: 02 12 22 32 05 15 25 35  p1q1
533*77c1e3ccSAndroid Build Coastguard Worker // a[2]: 01 11 21 31 06 16 26 36  p2q2
534*77c1e3ccSAndroid Build Coastguard Worker // a[3]: 00 10 20 30 07 17 27 37  p3q3
535*77c1e3ccSAndroid Build Coastguard Worker // Direct reapplication of the function will reset the high halves, but
536*77c1e3ccSAndroid Build Coastguard Worker // reverse the low halves:
537*77c1e3ccSAndroid Build Coastguard Worker // p_q:  p0 p1 p2 p3 q0 q1 q2 q3
538*77c1e3ccSAndroid Build Coastguard Worker // a[0]: 33 32 31 30 04 05 06 07
539*77c1e3ccSAndroid Build Coastguard Worker // a[1]: 23 22 21 20 14 15 16 17
540*77c1e3ccSAndroid Build Coastguard Worker // a[2]: 13 12 11 10 24 25 26 27
541*77c1e3ccSAndroid Build Coastguard Worker // a[3]: 03 02 01 00 34 35 36 37
542*77c1e3ccSAndroid Build Coastguard Worker // Simply reordering the inputs (3, 2, 1, 0) will reset the low halves, but
543*77c1e3ccSAndroid Build Coastguard Worker // reverse the high halves.
544*77c1e3ccSAndroid Build Coastguard Worker // The standard transpose_u16_4x8q will produce the same reversals, but with the
545*77c1e3ccSAndroid Build Coastguard Worker // order of the low halves also restored relative to the high halves. This is
546*77c1e3ccSAndroid Build Coastguard Worker // preferable because it puts all values from the same source row back together,
547*77c1e3ccSAndroid Build Coastguard Worker // but some post-processing is inevitable.
loop_filter_transpose_u16_4x8q(uint16x8_t a[4])548*77c1e3ccSAndroid Build Coastguard Worker static inline void loop_filter_transpose_u16_4x8q(uint16x8_t a[4]) {
549*77c1e3ccSAndroid Build Coastguard Worker   // b0.val[0]: 00 10 02 12 04 14 06 16
550*77c1e3ccSAndroid Build Coastguard Worker   // b0.val[1]: 01 11 03 13 05 15 07 17
551*77c1e3ccSAndroid Build Coastguard Worker   // b1.val[0]: 20 30 22 32 24 34 26 36
552*77c1e3ccSAndroid Build Coastguard Worker   // b1.val[1]: 21 31 23 33 25 35 27 37
553*77c1e3ccSAndroid Build Coastguard Worker   const uint16x8x2_t b0 = vtrnq_u16(a[0], a[1]);
554*77c1e3ccSAndroid Build Coastguard Worker   const uint16x8x2_t b1 = vtrnq_u16(a[2], a[3]);
555*77c1e3ccSAndroid Build Coastguard Worker 
556*77c1e3ccSAndroid Build Coastguard Worker   // Reverse odd vectors to bring the appropriate items to the front of zips.
557*77c1e3ccSAndroid Build Coastguard Worker   // b0.val[0]: 00 10 02 12 04 14 06 16
558*77c1e3ccSAndroid Build Coastguard Worker   // r0       : 03 13 01 11 07 17 05 15
559*77c1e3ccSAndroid Build Coastguard Worker   // b1.val[0]: 20 30 22 32 24 34 26 36
560*77c1e3ccSAndroid Build Coastguard Worker   // r1       : 23 33 21 31 27 37 25 35
561*77c1e3ccSAndroid Build Coastguard Worker   const uint32x4_t r0 = vrev64q_u32(vreinterpretq_u32_u16(b0.val[1]));
562*77c1e3ccSAndroid Build Coastguard Worker   const uint32x4_t r1 = vrev64q_u32(vreinterpretq_u32_u16(b1.val[1]));
563*77c1e3ccSAndroid Build Coastguard Worker 
564*77c1e3ccSAndroid Build Coastguard Worker   // Zip to complete the halves.
565*77c1e3ccSAndroid Build Coastguard Worker   // c0.val[0]: 00 10 20 30 02 12 22 32  p3p1
566*77c1e3ccSAndroid Build Coastguard Worker   // c0.val[1]: 04 14 24 34 06 16 26 36  q0q2
567*77c1e3ccSAndroid Build Coastguard Worker   // c1.val[0]: 03 13 23 33 01 11 21 31  p0p2
568*77c1e3ccSAndroid Build Coastguard Worker   // c1.val[1]: 07 17 27 37 05 15 25 35  q3q1
569*77c1e3ccSAndroid Build Coastguard Worker   const uint32x4x2_t c0 = vzipq_u32(vreinterpretq_u32_u16(b0.val[0]),
570*77c1e3ccSAndroid Build Coastguard Worker                                     vreinterpretq_u32_u16(b1.val[0]));
571*77c1e3ccSAndroid Build Coastguard Worker   const uint32x4x2_t c1 = vzipq_u32(r0, r1);
572*77c1e3ccSAndroid Build Coastguard Worker 
573*77c1e3ccSAndroid Build Coastguard Worker   // d0.val[0]: 00 10 20 30 07 17 27 37  p3q3
574*77c1e3ccSAndroid Build Coastguard Worker   // d0.val[1]: 02 12 22 32 05 15 25 35  p1q1
575*77c1e3ccSAndroid Build Coastguard Worker   // d1.val[0]: 03 13 23 33 04 14 24 34  p0q0
576*77c1e3ccSAndroid Build Coastguard Worker   // d1.val[1]: 01 11 21 31 06 16 26 36  p2q2
577*77c1e3ccSAndroid Build Coastguard Worker   const uint16x8x2_t d0 = aom_vtrnq_u64_to_u16(c0.val[0], c1.val[1]);
578*77c1e3ccSAndroid Build Coastguard Worker   // The third row of c comes first here to swap p2 with q0.
579*77c1e3ccSAndroid Build Coastguard Worker   const uint16x8x2_t d1 = aom_vtrnq_u64_to_u16(c1.val[0], c0.val[1]);
580*77c1e3ccSAndroid Build Coastguard Worker 
581*77c1e3ccSAndroid Build Coastguard Worker   // 8x4 Output:
582*77c1e3ccSAndroid Build Coastguard Worker   // a[0]: 03 13 23 33 04 14 24 34  p0q0
583*77c1e3ccSAndroid Build Coastguard Worker   // a[1]: 02 12 22 32 05 15 25 35  p1q1
584*77c1e3ccSAndroid Build Coastguard Worker   // a[2]: 01 11 21 31 06 16 26 36  p2q2
585*77c1e3ccSAndroid Build Coastguard Worker   // a[3]: 00 10 20 30 07 17 27 37  p3q3
586*77c1e3ccSAndroid Build Coastguard Worker   a[0] = d1.val[0];  // p0q0
587*77c1e3ccSAndroid Build Coastguard Worker   a[1] = d0.val[1];  // p1q1
588*77c1e3ccSAndroid Build Coastguard Worker   a[2] = d1.val[1];  // p2q2
589*77c1e3ccSAndroid Build Coastguard Worker   a[3] = d0.val[0];  // p3q3
590*77c1e3ccSAndroid Build Coastguard Worker }
591*77c1e3ccSAndroid Build Coastguard Worker 
static inline void transpose_elems_u16_4x8(
    const uint16x4_t a0, const uint16x4_t a1, const uint16x4_t a2,
    const uint16x4_t a3, const uint16x4_t a4, const uint16x4_t a5,
    const uint16x4_t a6, const uint16x4_t a7, uint16x8_t *o0, uint16x8_t *o1,
    uint16x8_t *o2, uint16x8_t *o3) {
  // Transposes eight rows of four 16-bit elements into four rows of eight.
  // Input:
  // a0: 00 01 02 03
  // a1: 10 11 12 13
  // a2: 20 21 22 23
  // a3: 30 31 32 33
  // a4: 40 41 42 43
  // a5: 50 51 52 53
  // a6: 60 61 62 63
  // a7: 70 71 72 73
  // Output:
  // o0: 00 10 20 30 40 50 60 70
  // o1: 01 11 21 31 41 51 61 71
  // o2: 02 12 22 32 42 52 62 72
  // o3: 03 13 23 33 43 53 63 73

  // Stage 1: place row k in the low half and row k + 4 in the high half.
  // rows04: 00 01 02 03 40 41 42 43
  // rows15: 10 11 12 13 50 51 52 53
  // rows26: 20 21 22 23 60 61 62 63
  // rows37: 30 31 32 33 70 71 72 73
  const uint16x8_t rows04 = vcombine_u16(a0, a4);
  const uint16x8_t rows15 = vcombine_u16(a1, a5);
  const uint16x8_t rows26 = vcombine_u16(a2, a6);
  const uint16x8_t rows37 = vcombine_u16(a3, a7);

  // Stage 2: interleave 16-bit lanes.
  // pairs01.val[0]: 00 10 02 12 40 50 42 52
  // pairs01.val[1]: 01 11 03 13 41 51 43 53
  // pairs23.val[0]: 20 30 22 32 60 70 62 72
  // pairs23.val[1]: 21 31 23 33 61 71 63 73
  const uint16x8x2_t pairs01 = vtrnq_u16(rows04, rows15);
  const uint16x8x2_t pairs23 = vtrnq_u16(rows26, rows37);

  // Stage 3: interleave 32-bit lanes.
  // even_cols.val[0]: 00 10 20 30 40 50 60 70
  // even_cols.val[1]: 02 12 22 32 42 52 62 72
  // odd_cols.val[0]:  01 11 21 31 41 51 61 71
  // odd_cols.val[1]:  03 13 23 33 43 53 63 73
  const uint32x4x2_t even_cols =
      vtrnq_u32(vreinterpretq_u32_u16(pairs01.val[0]),
                vreinterpretq_u32_u16(pairs23.val[0]));
  const uint32x4x2_t odd_cols =
      vtrnq_u32(vreinterpretq_u32_u16(pairs01.val[1]),
                vreinterpretq_u32_u16(pairs23.val[1]));

  *o0 = vreinterpretq_u16_u32(even_cols.val[0]);
  *o1 = vreinterpretq_u16_u32(odd_cols.val[0]);
  *o2 = vreinterpretq_u16_u32(even_cols.val[1]);
  *o3 = vreinterpretq_u16_u32(odd_cols.val[1]);
}
642*77c1e3ccSAndroid Build Coastguard Worker 
static inline void transpose_elems_s16_4x8(
    const int16x4_t a0, const int16x4_t a1, const int16x4_t a2,
    const int16x4_t a3, const int16x4_t a4, const int16x4_t a5,
    const int16x4_t a6, const int16x4_t a7, int16x8_t *o0, int16x8_t *o1,
    int16x8_t *o2, int16x8_t *o3) {
  // Signed counterpart of the unsigned 4x8 transpose: eight rows of four
  // 16-bit elements become four rows of eight.
  // Input:
  // a0: 00 01 02 03
  // a1: 10 11 12 13
  // a2: 20 21 22 23
  // a3: 30 31 32 33
  // a4: 40 41 42 43
  // a5: 50 51 52 53
  // a6: 60 61 62 63
  // a7: 70 71 72 73
  // Output:
  // o0: 00 10 20 30 40 50 60 70
  // o1: 01 11 21 31 41 51 61 71
  // o2: 02 12 22 32 42 52 62 72
  // o3: 03 13 23 33 43 53 63 73

  // Stage 1: place row k in the low half and row k + 4 in the high half.
  // rows04: 00 01 02 03 40 41 42 43
  // rows15: 10 11 12 13 50 51 52 53
  // rows26: 20 21 22 23 60 61 62 63
  // rows37: 30 31 32 33 70 71 72 73
  const int16x8_t rows04 = vcombine_s16(a0, a4);
  const int16x8_t rows15 = vcombine_s16(a1, a5);
  const int16x8_t rows26 = vcombine_s16(a2, a6);
  const int16x8_t rows37 = vcombine_s16(a3, a7);

  // Stage 2: interleave 16-bit lanes.
  // pairs01.val[0]: 00 10 02 12 40 50 42 52
  // pairs01.val[1]: 01 11 03 13 41 51 43 53
  // pairs23.val[0]: 20 30 22 32 60 70 62 72
  // pairs23.val[1]: 21 31 23 33 61 71 63 73
  const int16x8x2_t pairs01 = vtrnq_s16(rows04, rows15);
  const int16x8x2_t pairs23 = vtrnq_s16(rows26, rows37);

  // Stage 3: interleave 32-bit lanes.
  // even_cols.val[0]: 00 10 20 30 40 50 60 70
  // even_cols.val[1]: 02 12 22 32 42 52 62 72
  // odd_cols.val[0]:  01 11 21 31 41 51 61 71
  // odd_cols.val[1]:  03 13 23 33 43 53 63 73
  const int32x4x2_t even_cols =
      vtrnq_s32(vreinterpretq_s32_s16(pairs01.val[0]),
                vreinterpretq_s32_s16(pairs23.val[0]));
  const int32x4x2_t odd_cols =
      vtrnq_s32(vreinterpretq_s32_s16(pairs01.val[1]),
                vreinterpretq_s32_s16(pairs23.val[1]));

  *o0 = vreinterpretq_s16_s32(even_cols.val[0]);
  *o1 = vreinterpretq_s16_s32(odd_cols.val[0]);
  *o2 = vreinterpretq_s16_s32(even_cols.val[1]);
  *o3 = vreinterpretq_s16_s32(odd_cols.val[1]);
}
693*77c1e3ccSAndroid Build Coastguard Worker 
transpose_elems_inplace_u16_8x8(uint16x8_t * a0,uint16x8_t * a1,uint16x8_t * a2,uint16x8_t * a3,uint16x8_t * a4,uint16x8_t * a5,uint16x8_t * a6,uint16x8_t * a7)694*77c1e3ccSAndroid Build Coastguard Worker static inline void transpose_elems_inplace_u16_8x8(
695*77c1e3ccSAndroid Build Coastguard Worker     uint16x8_t *a0, uint16x8_t *a1, uint16x8_t *a2, uint16x8_t *a3,
696*77c1e3ccSAndroid Build Coastguard Worker     uint16x8_t *a4, uint16x8_t *a5, uint16x8_t *a6, uint16x8_t *a7) {
697*77c1e3ccSAndroid Build Coastguard Worker   // Swap 16 bit elements. Goes from:
698*77c1e3ccSAndroid Build Coastguard Worker   // a0: 00 01 02 03 04 05 06 07
699*77c1e3ccSAndroid Build Coastguard Worker   // a1: 10 11 12 13 14 15 16 17
700*77c1e3ccSAndroid Build Coastguard Worker   // a2: 20 21 22 23 24 25 26 27
701*77c1e3ccSAndroid Build Coastguard Worker   // a3: 30 31 32 33 34 35 36 37
702*77c1e3ccSAndroid Build Coastguard Worker   // a4: 40 41 42 43 44 45 46 47
703*77c1e3ccSAndroid Build Coastguard Worker   // a5: 50 51 52 53 54 55 56 57
704*77c1e3ccSAndroid Build Coastguard Worker   // a6: 60 61 62 63 64 65 66 67
705*77c1e3ccSAndroid Build Coastguard Worker   // a7: 70 71 72 73 74 75 76 77
706*77c1e3ccSAndroid Build Coastguard Worker   // to:
707*77c1e3ccSAndroid Build Coastguard Worker   // b0.val[0]: 00 10 02 12 04 14 06 16
708*77c1e3ccSAndroid Build Coastguard Worker   // b0.val[1]: 01 11 03 13 05 15 07 17
709*77c1e3ccSAndroid Build Coastguard Worker   // b1.val[0]: 20 30 22 32 24 34 26 36
710*77c1e3ccSAndroid Build Coastguard Worker   // b1.val[1]: 21 31 23 33 25 35 27 37
711*77c1e3ccSAndroid Build Coastguard Worker   // b2.val[0]: 40 50 42 52 44 54 46 56
712*77c1e3ccSAndroid Build Coastguard Worker   // b2.val[1]: 41 51 43 53 45 55 47 57
713*77c1e3ccSAndroid Build Coastguard Worker   // b3.val[0]: 60 70 62 72 64 74 66 76
714*77c1e3ccSAndroid Build Coastguard Worker   // b3.val[1]: 61 71 63 73 65 75 67 77
715*77c1e3ccSAndroid Build Coastguard Worker 
716*77c1e3ccSAndroid Build Coastguard Worker   const uint16x8x2_t b0 = vtrnq_u16(*a0, *a1);
717*77c1e3ccSAndroid Build Coastguard Worker   const uint16x8x2_t b1 = vtrnq_u16(*a2, *a3);
718*77c1e3ccSAndroid Build Coastguard Worker   const uint16x8x2_t b2 = vtrnq_u16(*a4, *a5);
719*77c1e3ccSAndroid Build Coastguard Worker   const uint16x8x2_t b3 = vtrnq_u16(*a6, *a7);
720*77c1e3ccSAndroid Build Coastguard Worker 
721*77c1e3ccSAndroid Build Coastguard Worker   // Swap 32 bit elements resulting in:
722*77c1e3ccSAndroid Build Coastguard Worker   // c0.val[0]: 00 10 20 30 04 14 24 34
723*77c1e3ccSAndroid Build Coastguard Worker   // c0.val[1]: 02 12 22 32 06 16 26 36
724*77c1e3ccSAndroid Build Coastguard Worker   // c1.val[0]: 01 11 21 31 05 15 25 35
725*77c1e3ccSAndroid Build Coastguard Worker   // c1.val[1]: 03 13 23 33 07 17 27 37
726*77c1e3ccSAndroid Build Coastguard Worker   // c2.val[0]: 40 50 60 70 44 54 64 74
727*77c1e3ccSAndroid Build Coastguard Worker   // c2.val[1]: 42 52 62 72 46 56 66 76
728*77c1e3ccSAndroid Build Coastguard Worker   // c3.val[0]: 41 51 61 71 45 55 65 75
729*77c1e3ccSAndroid Build Coastguard Worker   // c3.val[1]: 43 53 63 73 47 57 67 77
730*77c1e3ccSAndroid Build Coastguard Worker 
731*77c1e3ccSAndroid Build Coastguard Worker   const uint32x4x2_t c0 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[0]),
732*77c1e3ccSAndroid Build Coastguard Worker                                     vreinterpretq_u32_u16(b1.val[0]));
733*77c1e3ccSAndroid Build Coastguard Worker   const uint32x4x2_t c1 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[1]),
734*77c1e3ccSAndroid Build Coastguard Worker                                     vreinterpretq_u32_u16(b1.val[1]));
735*77c1e3ccSAndroid Build Coastguard Worker   const uint32x4x2_t c2 = vtrnq_u32(vreinterpretq_u32_u16(b2.val[0]),
736*77c1e3ccSAndroid Build Coastguard Worker                                     vreinterpretq_u32_u16(b3.val[0]));
737*77c1e3ccSAndroid Build Coastguard Worker   const uint32x4x2_t c3 = vtrnq_u32(vreinterpretq_u32_u16(b2.val[1]),
738*77c1e3ccSAndroid Build Coastguard Worker                                     vreinterpretq_u32_u16(b3.val[1]));
739*77c1e3ccSAndroid Build Coastguard Worker 
740*77c1e3ccSAndroid Build Coastguard Worker   // Swap 64 bit elements resulting in:
741*77c1e3ccSAndroid Build Coastguard Worker   // d0.val[0]: 00 10 20 30 40 50 60 70
742*77c1e3ccSAndroid Build Coastguard Worker   // d0.val[1]: 04 14 24 34 44 54 64 74
743*77c1e3ccSAndroid Build Coastguard Worker   // d1.val[0]: 01 11 21 31 41 51 61 71
744*77c1e3ccSAndroid Build Coastguard Worker   // d1.val[1]: 05 15 25 35 45 55 65 75
745*77c1e3ccSAndroid Build Coastguard Worker   // d2.val[0]: 02 12 22 32 42 52 62 72
746*77c1e3ccSAndroid Build Coastguard Worker   // d2.val[1]: 06 16 26 36 46 56 66 76
747*77c1e3ccSAndroid Build Coastguard Worker   // d3.val[0]: 03 13 23 33 43 53 63 73
748*77c1e3ccSAndroid Build Coastguard Worker   // d3.val[1]: 07 17 27 37 47 57 67 77
749*77c1e3ccSAndroid Build Coastguard Worker 
750*77c1e3ccSAndroid Build Coastguard Worker   const uint16x8x2_t d0 = aom_vtrnq_u64_to_u16(c0.val[0], c2.val[0]);
751*77c1e3ccSAndroid Build Coastguard Worker   const uint16x8x2_t d1 = aom_vtrnq_u64_to_u16(c1.val[0], c3.val[0]);
752*77c1e3ccSAndroid Build Coastguard Worker   const uint16x8x2_t d2 = aom_vtrnq_u64_to_u16(c0.val[1], c2.val[1]);
753*77c1e3ccSAndroid Build Coastguard Worker   const uint16x8x2_t d3 = aom_vtrnq_u64_to_u16(c1.val[1], c3.val[1]);
754*77c1e3ccSAndroid Build Coastguard Worker 
755*77c1e3ccSAndroid Build Coastguard Worker   *a0 = d0.val[0];
756*77c1e3ccSAndroid Build Coastguard Worker   *a1 = d1.val[0];
757*77c1e3ccSAndroid Build Coastguard Worker   *a2 = d2.val[0];
758*77c1e3ccSAndroid Build Coastguard Worker   *a3 = d3.val[0];
759*77c1e3ccSAndroid Build Coastguard Worker   *a4 = d0.val[1];
760*77c1e3ccSAndroid Build Coastguard Worker   *a5 = d1.val[1];
761*77c1e3ccSAndroid Build Coastguard Worker   *a6 = d2.val[1];
762*77c1e3ccSAndroid Build Coastguard Worker   *a7 = d3.val[1];
763*77c1e3ccSAndroid Build Coastguard Worker }
764*77c1e3ccSAndroid Build Coastguard Worker 
// Transposes the 64-bit halves of two 128-bit vectors and returns the result
// viewed as vectors of eight 16-bit lanes:
//   val[0] = { low 64 bits of a0,  low 64 bits of a1 }
//   val[1] = { high 64 bits of a0, high 64 bits of a1 }
// The inputs are typed int32x4_t; callers in this file pass the outputs of
// vtrnq_s32() straight in.
static inline int16x8x2_t aom_vtrnq_s64_to_s16(int32x4_t a0, int32x4_t a1) {
  int16x8x2_t b0;
#if AOM_ARCH_AARCH64
  // Use the 64-bit lane transpose intrinsics when building for AArch64.
  const int64x2_t a0_s64 = vreinterpretq_s64_s32(a0);
  const int64x2_t a1_s64 = vreinterpretq_s64_s32(a1);
  b0.val[0] = vreinterpretq_s16_s64(vtrn1q_s64(a0_s64, a1_s64));
  b0.val[1] = vreinterpretq_s16_s64(vtrn2q_s64(a0_s64, a1_s64));
#else
  // Otherwise assemble each result from the matching 64-bit halves.
  const int16x4_t a0_lo = vreinterpret_s16_s32(vget_low_s32(a0));
  const int16x4_t a1_lo = vreinterpret_s16_s32(vget_low_s32(a1));
  const int16x4_t a0_hi = vreinterpret_s16_s32(vget_high_s32(a0));
  const int16x4_t a1_hi = vreinterpret_s16_s32(vget_high_s32(a1));
  b0.val[0] = vcombine_s16(a0_lo, a1_lo);
  b0.val[1] = vcombine_s16(a0_hi, a1_hi);
#endif
  return b0;
}
780*77c1e3ccSAndroid Build Coastguard Worker 
// Transposes an 8x8 block of signed 16-bit elements in place. On entry
// *a0..*a7 hold rows 0..7 of the block; on return they hold columns 0..7.
// In the lane diagrams below, "rc" is the input element at row r, column c.
static inline void transpose_elems_inplace_s16_8x8(int16x8_t *a0, int16x8_t *a1,
                                                   int16x8_t *a2, int16x8_t *a3,
                                                   int16x8_t *a4, int16x8_t *a5,
                                                   int16x8_t *a6,
                                                   int16x8_t *a7) {
  // Stage 1: swap 16 bit elements between adjacent rows. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 10 11 12 13 14 15 16 17
  // a2: 20 21 22 23 24 25 26 27
  // a3: 30 31 32 33 34 35 36 37
  // a4: 40 41 42 43 44 45 46 47
  // a5: 50 51 52 53 54 55 56 57
  // a6: 60 61 62 63 64 65 66 67
  // a7: 70 71 72 73 74 75 76 77
  // to:
  // rows01.val[0]: 00 10 02 12 04 14 06 16
  // rows01.val[1]: 01 11 03 13 05 15 07 17
  // rows23.val[0]: 20 30 22 32 24 34 26 36
  // rows23.val[1]: 21 31 23 33 25 35 27 37
  // rows45.val[0]: 40 50 42 52 44 54 46 56
  // rows45.val[1]: 41 51 43 53 45 55 47 57
  // rows67.val[0]: 60 70 62 72 64 74 66 76
  // rows67.val[1]: 61 71 63 73 65 75 67 77
  const int16x8x2_t rows01 = vtrnq_s16(*a0, *a1);
  const int16x8x2_t rows23 = vtrnq_s16(*a2, *a3);
  const int16x8x2_t rows45 = vtrnq_s16(*a4, *a5);
  const int16x8x2_t rows67 = vtrnq_s16(*a6, *a7);

  // Stage 2: swap 32 bit elements resulting in:
  // even_top.val[0]: 00 10 20 30 04 14 24 34
  // even_top.val[1]: 02 12 22 32 06 16 26 36
  // odd_top.val[0]:  01 11 21 31 05 15 25 35
  // odd_top.val[1]:  03 13 23 33 07 17 27 37
  // even_bot.val[0]: 40 50 60 70 44 54 64 74
  // even_bot.val[1]: 42 52 62 72 46 56 66 76
  // odd_bot.val[0]:  41 51 61 71 45 55 65 75
  // odd_bot.val[1]:  43 53 63 73 47 57 67 77
  const int32x4x2_t even_top = vtrnq_s32(vreinterpretq_s32_s16(rows01.val[0]),
                                         vreinterpretq_s32_s16(rows23.val[0]));
  const int32x4x2_t odd_top = vtrnq_s32(vreinterpretq_s32_s16(rows01.val[1]),
                                        vreinterpretq_s32_s16(rows23.val[1]));
  const int32x4x2_t even_bot = vtrnq_s32(vreinterpretq_s32_s16(rows45.val[0]),
                                         vreinterpretq_s32_s16(rows67.val[0]));
  const int32x4x2_t odd_bot = vtrnq_s32(vreinterpretq_s32_s16(rows45.val[1]),
                                        vreinterpretq_s32_s16(rows67.val[1]));

  // Stage 3: swap 64 bit elements resulting in:
  // cols04.val[0]: 00 10 20 30 40 50 60 70
  // cols04.val[1]: 04 14 24 34 44 54 64 74
  // cols15.val[0]: 01 11 21 31 41 51 61 71
  // cols15.val[1]: 05 15 25 35 45 55 65 75
  // cols26.val[0]: 02 12 22 32 42 52 62 72
  // cols26.val[1]: 06 16 26 36 46 56 66 76
  // cols37.val[0]: 03 13 23 33 43 53 63 73
  // cols37.val[1]: 07 17 27 37 47 57 67 77
  const int16x8x2_t cols04 =
      aom_vtrnq_s64_to_s16(even_top.val[0], even_bot.val[0]);
  const int16x8x2_t cols15 =
      aom_vtrnq_s64_to_s16(odd_top.val[0], odd_bot.val[0]);
  const int16x8x2_t cols26 =
      aom_vtrnq_s64_to_s16(even_top.val[1], even_bot.val[1]);
  const int16x8x2_t cols37 =
      aom_vtrnq_s64_to_s16(odd_top.val[1], odd_bot.val[1]);

  // Write the columns back in order 0..7.
  *a0 = cols04.val[0];
  *a1 = cols15.val[0];
  *a2 = cols26.val[0];
  *a3 = cols37.val[0];
  *a4 = cols04.val[1];
  *a5 = cols15.val[1];
  *a6 = cols26.val[1];
  *a7 = cols37.val[1];
}
853*77c1e3ccSAndroid Build Coastguard Worker 
// Transposes an 8x8 block of signed 16-bit elements. a[0..7] hold rows 0..7
// of the block; out[0..7] receive columns 0..7. All eight inputs are read
// before any output is written. In the lane diagrams below, "rc" is the input
// element at row r, column c.
static inline void transpose_arrays_s16_8x8(const int16x8_t *a,
                                            int16x8_t *out) {
  // Stage 1: swap 16 bit elements between adjacent rows. Goes from:
  // a[0]: 00 01 02 03 04 05 06 07
  // a[1]: 10 11 12 13 14 15 16 17
  // a[2]: 20 21 22 23 24 25 26 27
  // a[3]: 30 31 32 33 34 35 36 37
  // a[4]: 40 41 42 43 44 45 46 47
  // a[5]: 50 51 52 53 54 55 56 57
  // a[6]: 60 61 62 63 64 65 66 67
  // a[7]: 70 71 72 73 74 75 76 77
  // to:
  // rows01.val[0]: 00 10 02 12 04 14 06 16
  // rows01.val[1]: 01 11 03 13 05 15 07 17
  // rows23.val[0]: 20 30 22 32 24 34 26 36
  // rows23.val[1]: 21 31 23 33 25 35 27 37
  // rows45.val[0]: 40 50 42 52 44 54 46 56
  // rows45.val[1]: 41 51 43 53 45 55 47 57
  // rows67.val[0]: 60 70 62 72 64 74 66 76
  // rows67.val[1]: 61 71 63 73 65 75 67 77
  const int16x8x2_t rows01 = vtrnq_s16(a[0], a[1]);
  const int16x8x2_t rows23 = vtrnq_s16(a[2], a[3]);
  const int16x8x2_t rows45 = vtrnq_s16(a[4], a[5]);
  const int16x8x2_t rows67 = vtrnq_s16(a[6], a[7]);

  // Stage 2: swap 32 bit elements resulting in:
  // even_top.val[0]: 00 10 20 30 04 14 24 34
  // even_top.val[1]: 02 12 22 32 06 16 26 36
  // odd_top.val[0]:  01 11 21 31 05 15 25 35
  // odd_top.val[1]:  03 13 23 33 07 17 27 37
  // even_bot.val[0]: 40 50 60 70 44 54 64 74
  // even_bot.val[1]: 42 52 62 72 46 56 66 76
  // odd_bot.val[0]:  41 51 61 71 45 55 65 75
  // odd_bot.val[1]:  43 53 63 73 47 57 67 77
  const int32x4x2_t even_top = vtrnq_s32(vreinterpretq_s32_s16(rows01.val[0]),
                                         vreinterpretq_s32_s16(rows23.val[0]));
  const int32x4x2_t odd_top = vtrnq_s32(vreinterpretq_s32_s16(rows01.val[1]),
                                        vreinterpretq_s32_s16(rows23.val[1]));
  const int32x4x2_t even_bot = vtrnq_s32(vreinterpretq_s32_s16(rows45.val[0]),
                                         vreinterpretq_s32_s16(rows67.val[0]));
  const int32x4x2_t odd_bot = vtrnq_s32(vreinterpretq_s32_s16(rows45.val[1]),
                                        vreinterpretq_s32_s16(rows67.val[1]));

  // Stage 3: swap 64 bit elements resulting in:
  // cols04.val[0]: 00 10 20 30 40 50 60 70
  // cols04.val[1]: 04 14 24 34 44 54 64 74
  // cols15.val[0]: 01 11 21 31 41 51 61 71
  // cols15.val[1]: 05 15 25 35 45 55 65 75
  // cols26.val[0]: 02 12 22 32 42 52 62 72
  // cols26.val[1]: 06 16 26 36 46 56 66 76
  // cols37.val[0]: 03 13 23 33 43 53 63 73
  // cols37.val[1]: 07 17 27 37 47 57 67 77
  const int16x8x2_t cols04 =
      aom_vtrnq_s64_to_s16(even_top.val[0], even_bot.val[0]);
  const int16x8x2_t cols15 =
      aom_vtrnq_s64_to_s16(odd_top.val[0], odd_bot.val[0]);
  const int16x8x2_t cols26 =
      aom_vtrnq_s64_to_s16(even_top.val[1], even_bot.val[1]);
  const int16x8x2_t cols37 =
      aom_vtrnq_s64_to_s16(odd_top.val[1], odd_bot.val[1]);

  // Store the columns in order 0..7.
  out[0] = cols04.val[0];
  out[1] = cols15.val[0];
  out[2] = cols26.val[0];
  out[3] = cols37.val[0];
  out[4] = cols04.val[1];
  out[5] = cols15.val[1];
  out[6] = cols26.val[1];
  out[7] = cols37.val[1];
}
923*77c1e3ccSAndroid Build Coastguard Worker 
// Transposes, in place, the two 4x4 sub-blocks of a 4-row by 8-column block
// of signed 16-bit elements. On entry *a0..*a3 hold rows 0..3. On return each
// vector holds two 4-element columns side by side: column k in the low four
// lanes and column k + 4 in the high four lanes of *ak. In the lane diagrams
// below, "rc" is the input element at row r, column c.
static inline void transpose_elems_inplace_s16_8x4(int16x8_t *a0, int16x8_t *a1,
                                                   int16x8_t *a2,
                                                   int16x8_t *a3) {
  // Stage 1: swap 16 bit elements between adjacent rows. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 10 11 12 13 14 15 16 17
  // a2: 20 21 22 23 24 25 26 27
  // a3: 30 31 32 33 34 35 36 37
  // to:
  // rows01.val[0]: 00 10 02 12 04 14 06 16
  // rows01.val[1]: 01 11 03 13 05 15 07 17
  // rows23.val[0]: 20 30 22 32 24 34 26 36
  // rows23.val[1]: 21 31 23 33 25 35 27 37
  const int16x8x2_t rows01 = vtrnq_s16(*a0, *a1);
  const int16x8x2_t rows23 = vtrnq_s16(*a2, *a3);

  // Stage 2: swap 32 bit elements resulting in:
  // even.val[0]: 00 10 20 30 04 14 24 34
  // even.val[1]: 02 12 22 32 06 16 26 36
  // odd.val[0]:  01 11 21 31 05 15 25 35
  // odd.val[1]:  03 13 23 33 07 17 27 37
  const int32x4x2_t even = vtrnq_s32(vreinterpretq_s32_s16(rows01.val[0]),
                                     vreinterpretq_s32_s16(rows23.val[0]));
  const int32x4x2_t odd = vtrnq_s32(vreinterpretq_s32_s16(rows01.val[1]),
                                    vreinterpretq_s32_s16(rows23.val[1]));

  // Interleave the even and odd results so the outputs are in column order.
  *a0 = vreinterpretq_s16_s32(even.val[0]);
  *a1 = vreinterpretq_s16_s32(odd.val[0]);
  *a2 = vreinterpretq_s16_s32(even.val[1]);
  *a3 = vreinterpretq_s16_s32(odd.val[1]);
}
957*77c1e3ccSAndroid Build Coastguard Worker 
// Transposes a 4x4 block of unsigned 16-bit elements in place. On entry
// *a0..*a3 hold rows 0..3; on return they hold columns 0..3. In the lane
// diagrams below, "rc" is the input element at row r, column c.
static inline void transpose_elems_inplace_u16_4x4(uint16x4_t *a0,
                                                   uint16x4_t *a1,
                                                   uint16x4_t *a2,
                                                   uint16x4_t *a3) {
  // Stage 1: swap 16 bit elements between adjacent rows. Goes from:
  // a0: 00 01 02 03
  // a1: 10 11 12 13
  // a2: 20 21 22 23
  // a3: 30 31 32 33
  // to:
  // rows01.val[0]: 00 10 02 12
  // rows01.val[1]: 01 11 03 13
  // rows23.val[0]: 20 30 22 32
  // rows23.val[1]: 21 31 23 33
  const uint16x4x2_t rows01 = vtrn_u16(*a0, *a1);
  const uint16x4x2_t rows23 = vtrn_u16(*a2, *a3);

  // Stage 2: swap 32 bit elements resulting in:
  // cols02.val[0]: 00 10 20 30
  // cols02.val[1]: 02 12 22 32
  // cols13.val[0]: 01 11 21 31
  // cols13.val[1]: 03 13 23 33
  const uint32x2x2_t cols02 = vtrn_u32(vreinterpret_u32_u16(rows01.val[0]),
                                       vreinterpret_u32_u16(rows23.val[0]));
  const uint32x2x2_t cols13 = vtrn_u32(vreinterpret_u32_u16(rows01.val[1]),
                                       vreinterpret_u32_u16(rows23.val[1]));

  // Write the columns back in order 0..3.
  *a0 = vreinterpret_u16_u32(cols02.val[0]);
  *a1 = vreinterpret_u16_u32(cols13.val[0]);
  *a2 = vreinterpret_u16_u32(cols02.val[1]);
  *a3 = vreinterpret_u16_u32(cols13.val[1]);
}
992*77c1e3ccSAndroid Build Coastguard Worker 
// Transposes a 4x4 block of signed 16-bit elements in place. On entry
// *a0..*a3 hold rows 0..3; on return they hold columns 0..3. In the lane
// diagrams below, "rc" is the input element at row r, column c.
static inline void transpose_elems_inplace_s16_4x4(int16x4_t *a0, int16x4_t *a1,
                                                   int16x4_t *a2,
                                                   int16x4_t *a3) {
  // Stage 1: swap 16 bit elements between adjacent rows. Goes from:
  // a0: 00 01 02 03
  // a1: 10 11 12 13
  // a2: 20 21 22 23
  // a3: 30 31 32 33
  // to:
  // rows01.val[0]: 00 10 02 12
  // rows01.val[1]: 01 11 03 13
  // rows23.val[0]: 20 30 22 32
  // rows23.val[1]: 21 31 23 33
  const int16x4x2_t rows01 = vtrn_s16(*a0, *a1);
  const int16x4x2_t rows23 = vtrn_s16(*a2, *a3);

  // Stage 2: swap 32 bit elements resulting in:
  // cols02.val[0]: 00 10 20 30
  // cols02.val[1]: 02 12 22 32
  // cols13.val[0]: 01 11 21 31
  // cols13.val[1]: 03 13 23 33
  const int32x2x2_t cols02 = vtrn_s32(vreinterpret_s32_s16(rows01.val[0]),
                                      vreinterpret_s32_s16(rows23.val[0]));
  const int32x2x2_t cols13 = vtrn_s32(vreinterpret_s32_s16(rows01.val[1]),
                                      vreinterpret_s32_s16(rows23.val[1]));

  // Write the columns back in order 0..3.
  *a0 = vreinterpret_s16_s32(cols02.val[0]);
  *a1 = vreinterpret_s16_s32(cols13.val[0]);
  *a2 = vreinterpret_s16_s32(cols02.val[1]);
  *a3 = vreinterpret_s16_s32(cols13.val[1]);
}
1026*77c1e3ccSAndroid Build Coastguard Worker 
// Transposes the 64-bit halves of two 128-bit vectors of 32-bit lanes:
//   val[0] = { low 64 bits of a0,  low 64 bits of a1 }
//   val[1] = { high 64 bits of a0, high 64 bits of a1 }
static inline int32x4x2_t aom_vtrnq_s64_to_s32(int32x4_t a0, int32x4_t a1) {
  int32x4x2_t b0;
#if AOM_ARCH_AARCH64
  // Use the 64-bit lane transpose intrinsics when building for AArch64.
  const int64x2_t a0_s64 = vreinterpretq_s64_s32(a0);
  const int64x2_t a1_s64 = vreinterpretq_s64_s32(a1);
  b0.val[0] = vreinterpretq_s32_s64(vtrn1q_s64(a0_s64, a1_s64));
  b0.val[1] = vreinterpretq_s32_s64(vtrn2q_s64(a0_s64, a1_s64));
#else
  // Otherwise assemble each result from the matching 64-bit halves.
  b0.val[0] = vcombine_s32(vget_low_s32(a0), vget_low_s32(a1));
  b0.val[1] = vcombine_s32(vget_high_s32(a0), vget_high_s32(a1));
#endif
  return b0;
}
1040*77c1e3ccSAndroid Build Coastguard Worker 
// Transposes a 4x4 block of signed 32-bit elements. a0..a3 are rows 0..3 of
// the block; *o0..*o3 receive columns 0..3. The rows are passed by value, so
// the outputs may point at the same objects the rows were loaded from. In the
// lane diagrams below, "rc" is the input element at row r, column c.
static inline void transpose_elems_s32_4x4(const int32x4_t a0,
                                           const int32x4_t a1,
                                           const int32x4_t a2,
                                           const int32x4_t a3, int32x4_t *o0,
                                           int32x4_t *o1, int32x4_t *o2,
                                           int32x4_t *o3) {
  // Stage 1: swap 32 bit elements between adjacent rows. Goes from:
  // a0: 00 01 02 03
  // a1: 10 11 12 13
  // a2: 20 21 22 23
  // a3: 30 31 32 33
  // to:
  // rows01.val[0]: 00 10 02 12
  // rows01.val[1]: 01 11 03 13
  // rows23.val[0]: 20 30 22 32
  // rows23.val[1]: 21 31 23 33
  const int32x4x2_t rows01 = vtrnq_s32(a0, a1);
  const int32x4x2_t rows23 = vtrnq_s32(a2, a3);

  // Stage 2: swap 64 bit elements resulting in:
  // cols02.val[0]: 00 10 20 30
  // cols02.val[1]: 02 12 22 32
  // cols13.val[0]: 01 11 21 31
  // cols13.val[1]: 03 13 23 33
  const int32x4x2_t cols02 = aom_vtrnq_s64_to_s32(rows01.val[0], rows23.val[0]);
  const int32x4x2_t cols13 = aom_vtrnq_s64_to_s32(rows01.val[1], rows23.val[1]);

  // Store the columns in order 0..3.
  *o0 = cols02.val[0];
  *o1 = cols13.val[0];
  *o2 = cols02.val[1];
  *o3 = cols13.val[1];
}
1075*77c1e3ccSAndroid Build Coastguard Worker 
// In-place variant of transpose_elems_s32_4x4(): on entry *a0..*a3 hold rows
// 0..3 of a 4x4 block of signed 32-bit elements; on return they hold columns
// 0..3. The rows are passed to the helper by value, so writing the results
// back through the same pointers is safe.
static inline void transpose_elems_inplace_s32_4x4(int32x4_t *a0, int32x4_t *a1,
                                                   int32x4_t *a2,
                                                   int32x4_t *a3) {
  transpose_elems_s32_4x4(*a0, *a1, *a2, *a3, a0, a1, a2, a3);
}
1081*77c1e3ccSAndroid Build Coastguard Worker 
// Array variant of transpose_elems_s32_4x4(): in[0..3] are rows 0..3 of a 4x4
// block of signed 32-bit elements; out[0..3] receive columns 0..3.
static inline void transpose_arrays_s32_4x4(const int32x4_t *in,
                                            int32x4_t *out) {
  transpose_elems_s32_4x4(in[0], in[1], in[2], in[3], &out[0], &out[1], &out[2],
                          &out[3]);
}
1087*77c1e3ccSAndroid Build Coastguard Worker 
// Transposes a block of signed 32-bit elements one 4x4 tile at a time.
// width and height are shifted right by two to get the tile counts, so any
// remainder modulo 4 is ignored. For tile column j and tile row i, the four
// vectors starting at in[j * height + i * 4] are transposed into the four
// vectors starting at out[i * width + j * 4].
static AOM_FORCE_INLINE void transpose_arrays_s32_4nx4n(const int32x4_t *in,
                                                        int32x4_t *out,
                                                        const int width,
                                                        const int height) {
  const int tile_rows = height >> 2;
  const int tile_cols = width >> 2;
  for (int j = 0; j < tile_cols; j++) {
    const int in_base = j * height;
    const int out_base = j * 4;
    for (int i = 0; i < tile_rows; i++) {
      transpose_arrays_s32_4x4(&in[in_base + i * 4],
                               &out[out_base + i * width]);
    }
  }
}
1101*77c1e3ccSAndroid Build Coastguard Worker 
1102*77c1e3ccSAndroid Build Coastguard Worker #define TRANSPOSE_ARRAYS_S32_WXH_NEON(w, h)                    \
1103*77c1e3ccSAndroid Build Coastguard Worker   static AOM_FORCE_INLINE void transpose_arrays_s32_##w##x##h( \
1104*77c1e3ccSAndroid Build Coastguard Worker       const int32x4_t *in, int32x4_t *out) {                   \
1105*77c1e3ccSAndroid Build Coastguard Worker     transpose_arrays_s32_4nx4n(in, out, w, h);                 \
1106*77c1e3ccSAndroid Build Coastguard Worker   }
1107*77c1e3ccSAndroid Build Coastguard Worker 
1108*77c1e3ccSAndroid Build Coastguard Worker TRANSPOSE_ARRAYS_S32_WXH_NEON(4, 8)
1109*77c1e3ccSAndroid Build Coastguard Worker TRANSPOSE_ARRAYS_S32_WXH_NEON(4, 16)
1110*77c1e3ccSAndroid Build Coastguard Worker TRANSPOSE_ARRAYS_S32_WXH_NEON(8, 4)
1111*77c1e3ccSAndroid Build Coastguard Worker TRANSPOSE_ARRAYS_S32_WXH_NEON(8, 8)
1112*77c1e3ccSAndroid Build Coastguard Worker TRANSPOSE_ARRAYS_S32_WXH_NEON(8, 16)
1113*77c1e3ccSAndroid Build Coastguard Worker TRANSPOSE_ARRAYS_S32_WXH_NEON(8, 32)
1114*77c1e3ccSAndroid Build Coastguard Worker TRANSPOSE_ARRAYS_S32_WXH_NEON(16, 8)
1115*77c1e3ccSAndroid Build Coastguard Worker TRANSPOSE_ARRAYS_S32_WXH_NEON(16, 16)
1116*77c1e3ccSAndroid Build Coastguard Worker TRANSPOSE_ARRAYS_S32_WXH_NEON(16, 32)
1117*77c1e3ccSAndroid Build Coastguard Worker TRANSPOSE_ARRAYS_S32_WXH_NEON(16, 64)
1118*77c1e3ccSAndroid Build Coastguard Worker TRANSPOSE_ARRAYS_S32_WXH_NEON(32, 8)
1119*77c1e3ccSAndroid Build Coastguard Worker TRANSPOSE_ARRAYS_S32_WXH_NEON(32, 16)
1120*77c1e3ccSAndroid Build Coastguard Worker TRANSPOSE_ARRAYS_S32_WXH_NEON(32, 32)
1121*77c1e3ccSAndroid Build Coastguard Worker TRANSPOSE_ARRAYS_S32_WXH_NEON(32, 64)
1122*77c1e3ccSAndroid Build Coastguard Worker TRANSPOSE_ARRAYS_S32_WXH_NEON(64, 16)
1123*77c1e3ccSAndroid Build Coastguard Worker TRANSPOSE_ARRAYS_S32_WXH_NEON(64, 32)
1124*77c1e3ccSAndroid Build Coastguard Worker 
1125*77c1e3ccSAndroid Build Coastguard Worker #undef TRANSPOSE_ARRAYS_S32_WXH_NEON
1126*77c1e3ccSAndroid Build Coastguard Worker 
// Returns { a[0], b[0] }: the low 64-bit lane of each input.
// Uses vtrn1q_s64 when AOM_ARCH_AARCH64 is set; otherwise the same result is
// assembled from the low halves of both vectors.
static inline int64x2_t aom_vtrn1q_s64(int64x2_t a, int64x2_t b) {
#if AOM_ARCH_AARCH64
  return vtrn1q_s64(a, b);
#else
  const int64x1_t a_lo = vget_low_s64(a);
  const int64x1_t b_lo = vget_low_s64(b);
  return vcombine_s64(a_lo, b_lo);
#endif
}
1134*77c1e3ccSAndroid Build Coastguard Worker 
// Returns { a[1], b[1] }: the high 64-bit lane of each input.
// Uses vtrn2q_s64 when AOM_ARCH_AARCH64 is set; otherwise the same result is
// assembled from the high halves of both vectors.
static inline int64x2_t aom_vtrn2q_s64(int64x2_t a, int64x2_t b) {
#if AOM_ARCH_AARCH64
  return vtrn2q_s64(a, b);
#else
  const int64x1_t a_hi = vget_high_s64(a);
  const int64x1_t b_hi = vget_high_s64(b);
  return vcombine_s64(a_hi, b_hi);
#endif
}
1142*77c1e3ccSAndroid Build Coastguard Worker 
// Transposes eight 4-element vectors (a0..a7) into four 8-element rows
// (o0..o3), each stored as an int32x4x2_t.
//
// The input is handled as two stacked 4x4 tiles, each transposed with the
// existing 4x4 helper:
//   [ A ]^T => [ A^T B^T ]
//   [ B ]
// so val[0] of every output comes from a0..a3 and val[1] from a4..a7.
static inline void transpose_elems_s32_4x8(int32x4_t a0, int32x4_t a1,
                                           int32x4_t a2, int32x4_t a3,
                                           int32x4_t a4, int32x4_t a5,
                                           int32x4_t a6, int32x4_t a7,
                                           int32x4x2_t *o0, int32x4x2_t *o1,
                                           int32x4x2_t *o2, int32x4x2_t *o3) {
  // The parameters are by-value copies, so they double as scratch space.
  transpose_elems_inplace_s32_4x4(&a0, &a1, &a2, &a3);  // A^T
  transpose_elems_inplace_s32_4x4(&a4, &a5, &a6, &a7);  // B^T

  // Output row k = [ row k of A^T | row k of B^T ].
  o0->val[0] = a0;
  o0->val[1] = a4;

  o1->val[0] = a1;
  o1->val[1] = a5;

  o2->val[0] = a2;
  o2->val[1] = a6;

  o3->val[0] = a3;
  o3->val[1] = a7;
}
1167*77c1e3ccSAndroid Build Coastguard Worker 
// Transposes, in place, an 8x8 block of 32-bit elements held as eight rows
// a0..a7, where each row is an int32x4x2_t (val[0] = columns 0-3,
// val[1] = columns 4-7).
//
// The block is split into four 4x4 quadrants, each transposed with the
// existing 4x4 helper, and the two off-diagonal quadrants trade places:
//   [ A B ]^T => [ A^T C^T ]
//   [ C D ]      [ B^T D^T ]
static inline void transpose_elems_inplace_s32_8x8(
    int32x4x2_t *a0, int32x4x2_t *a1, int32x4x2_t *a2, int32x4x2_t *a3,
    int32x4x2_t *a4, int32x4x2_t *a5, int32x4x2_t *a6, int32x4x2_t *a7) {
  // Pull every quadrant into locals before anything is written back.

  // A: rows 0-3, columns 0-3.
  int32x4_t tl0 = a0->val[0];
  int32x4_t tl1 = a1->val[0];
  int32x4_t tl2 = a2->val[0];
  int32x4_t tl3 = a3->val[0];

  // B: rows 0-3, columns 4-7.
  int32x4_t tr0 = a0->val[1];
  int32x4_t tr1 = a1->val[1];
  int32x4_t tr2 = a2->val[1];
  int32x4_t tr3 = a3->val[1];

  // C: rows 4-7, columns 0-3.
  int32x4_t bl0 = a4->val[0];
  int32x4_t bl1 = a5->val[0];
  int32x4_t bl2 = a6->val[0];
  int32x4_t bl3 = a7->val[0];

  // D: rows 4-7, columns 4-7.
  int32x4_t br0 = a4->val[1];
  int32x4_t br1 = a5->val[1];
  int32x4_t br2 = a6->val[1];
  int32x4_t br3 = a7->val[1];

  transpose_elems_inplace_s32_4x4(&tl0, &tl1, &tl2, &tl3);  // A^T
  transpose_elems_inplace_s32_4x4(&tr0, &tr1, &tr2, &tr3);  // B^T
  transpose_elems_inplace_s32_4x4(&bl0, &bl1, &bl2, &bl3);  // C^T
  transpose_elems_inplace_s32_4x4(&br0, &br1, &br2, &br3);  // D^T

  // Rows 0-3 of the result: [ A^T | C^T ].
  a0->val[0] = tl0;
  a0->val[1] = bl0;
  a1->val[0] = tl1;
  a1->val[1] = bl1;
  a2->val[0] = tl2;
  a2->val[1] = bl2;
  a3->val[0] = tl3;
  a3->val[1] = bl3;

  // Rows 4-7 of the result: [ B^T | D^T ].
  a4->val[0] = tr0;
  a4->val[1] = br0;
  a5->val[0] = tr1;
  a5->val[1] = br1;
  a6->val[0] = tr2;
  a6->val[1] = br2;
  a7->val[0] = tr3;
  a7->val[1] = br3;
}
1221*77c1e3ccSAndroid Build Coastguard Worker 
// Array-based wrapper around transpose_elems_inplace_s16_4x4(): in[0..3] are
// copied into locals, passed through the in-place helper, and the results are
// stored to out[0..3]. All four inputs are read before any output is written.
static inline void transpose_arrays_s16_4x4(const int16x4_t *const in,
                                            int16x4_t *const out) {
  int16x4_t row0 = in[0];
  int16x4_t row1 = in[1];
  int16x4_t row2 = in[2];
  int16x4_t row3 = in[3];

  transpose_elems_inplace_s16_4x4(&row0, &row1, &row2, &row3);

  out[0] = row0;
  out[1] = row1;
  out[2] = row2;
  out[3] = row3;
}
1236*77c1e3ccSAndroid Build Coastguard Worker 
// Transposes eight 4-element vectors in[0..7] into four 8-element vectors
// out[0..3]. With "rc" denoting element c of in[r]:
//   out[0]: 00 10 20 30 40 50 60 70
//   out[1]: 01 11 21 31 41 51 61 71
//   out[2]: 02 12 22 32 42 52 62 72
//   out[3]: 03 13 23 33 43 53 63 73
static inline void transpose_arrays_s16_4x8(const int16x4_t *const in,
                                            int16x8_t *const out) {
  // Step 1: interleave adjacent input pairs at 16-bit granularity, e.g.
  //   p01: 00 10 01 11 02 12 03 13
#if AOM_ARCH_AARCH64
  // Widen each input with a zero upper half; vzip1q only consumes the lower
  // halves of its operands, so the padding never reaches the result.
  const int16x4_t zero = vdup_n_s16(0);
  const int16x8_t p01 =
      vzip1q_s16(vcombine_s16(in[0], zero), vcombine_s16(in[1], zero));
  const int16x8_t p23 =
      vzip1q_s16(vcombine_s16(in[2], zero), vcombine_s16(in[3], zero));
  const int16x8_t p45 =
      vzip1q_s16(vcombine_s16(in[4], zero), vcombine_s16(in[5], zero));
  const int16x8_t p67 =
      vzip1q_s16(vcombine_s16(in[6], zero), vcombine_s16(in[7], zero));
#else
  const int16x4x2_t z01 = vzip_s16(in[0], in[1]);
  const int16x8_t p01 = vcombine_s16(z01.val[0], z01.val[1]);
  const int16x4x2_t z23 = vzip_s16(in[2], in[3]);
  const int16x8_t p23 = vcombine_s16(z23.val[0], z23.val[1]);
  const int16x4x2_t z45 = vzip_s16(in[4], in[5]);
  const int16x8_t p45 = vcombine_s16(z45.val[0], z45.val[1]);
  const int16x4x2_t z67 = vzip_s16(in[6], in[7]);
  const int16x8_t p67 = vcombine_s16(z67.val[0], z67.val[1]);
#endif

  // Step 2: interleave at 32-bit granularity.
  //   q0123.val[0]: 00 10 20 30 01 11 21 31
  //   q0123.val[1]: 02 12 22 32 03 13 23 33
  // and likewise for inputs 4..7 in q4567.
  const int32x4x2_t q0123 =
      vzipq_s32(vreinterpretq_s32_s16(p01), vreinterpretq_s32_s16(p23));
  const int32x4x2_t q4567 =
      vzipq_s32(vreinterpretq_s32_s16(p45), vreinterpretq_s32_s16(p67));

  // Step 3: pair up matching 64-bit halves of q0123 and q4567.
#if AOM_ARCH_AARCH64
  const int64x2_t first_a = vreinterpretq_s64_s32(q0123.val[0]);
  const int64x2_t first_b = vreinterpretq_s64_s32(q4567.val[0]);
  const int64x2_t second_a = vreinterpretq_s64_s32(q0123.val[1]);
  const int64x2_t second_b = vreinterpretq_s64_s32(q4567.val[1]);

  out[0] = vreinterpretq_s16_s64(vzip1q_s64(first_a, first_b));
  out[1] = vreinterpretq_s16_s64(vzip2q_s64(first_a, first_b));
  out[2] = vreinterpretq_s16_s64(vzip1q_s64(second_a, second_b));
  out[3] = vreinterpretq_s16_s64(vzip2q_s64(second_a, second_b));
#else
  // No 64-bit zip here: a pair of vextq_s32 by two lanes selects
  // { low(x), low(y) } for the even outputs and { high(x), high(y) } for the
  // odd ones.
  out[0] = vreinterpretq_s16_s32(
      vextq_s32(vextq_s32(q0123.val[0], q0123.val[0], 2), q4567.val[0], 2));
  out[1] = vreinterpretq_s16_s32(
      vextq_s32(q0123.val[0], vextq_s32(q4567.val[0], q4567.val[0], 2), 2));
  out[2] = vreinterpretq_s16_s32(
      vextq_s32(vextq_s32(q0123.val[1], q0123.val[1], 2), q4567.val[1], 2));
  out[3] = vreinterpretq_s16_s32(
      vextq_s32(q0123.val[1], vextq_s32(q4567.val[1], q4567.val[1], 2), 2));
#endif
}
1285*77c1e3ccSAndroid Build Coastguard Worker 
// Transposes four 8-element vectors in[0..3] into eight 4-element vectors
// out[0..7]. With "rc" denoting element c of in[r], out[c] = 0c 1c 2c 3c.
static inline void transpose_arrays_s16_8x4(const int16x8_t *const in,
                                            int16x4_t *const out) {
  // Stage 1: transpose 16-bit lanes within each pair of inputs.
  //   in[0]: 00 01 02 03 04 05 06 07
  //   in[1]: 10 11 12 13 14 15 16 17
  //   in[2]: 20 21 22 23 24 25 26 27
  //   in[3]: 30 31 32 33 34 35 36 37
  // becomes
  //   t01.val[0]: 00 10 02 12 04 14 06 16
  //   t01.val[1]: 01 11 03 13 05 15 07 17
  //   t23.val[0]: 20 30 22 32 24 34 26 36
  //   t23.val[1]: 21 31 23 33 25 35 27 37
  const int16x8x2_t t01 = vtrnq_s16(in[0], in[1]);
  const int16x8x2_t t23 = vtrnq_s16(in[2], in[3]);

  // Stage 2: transpose 32-bit lanes across the two pairs.
  //   even.val[0]: 00 10 20 30 04 14 24 34
  //   even.val[1]: 02 12 22 32 06 16 26 36
  //   odd.val[0]:  01 11 21 31 05 15 25 35
  //   odd.val[1]:  03 13 23 33 07 17 27 37
  const uint32x4x2_t even = vtrnq_u32(vreinterpretq_u32_s16(t01.val[0]),
                                      vreinterpretq_u32_s16(t23.val[0]));
  const uint32x4x2_t odd = vtrnq_u32(vreinterpretq_u32_s16(t01.val[1]),
                                     vreinterpretq_u32_s16(t23.val[1]));

  // Each 128-bit vector now holds output k in its low half and output k + 4
  // in its high half.
  const int16x8_t cols_0_4 = vreinterpretq_s16_u32(even.val[0]);
  const int16x8_t cols_1_5 = vreinterpretq_s16_u32(odd.val[0]);
  const int16x8_t cols_2_6 = vreinterpretq_s16_u32(even.val[1]);
  const int16x8_t cols_3_7 = vreinterpretq_s16_u32(odd.val[1]);

  // Stage 3: split the 64-bit halves into the eight outputs.
  out[0] = vget_low_s16(cols_0_4);
  out[1] = vget_low_s16(cols_1_5);
  out[2] = vget_low_s16(cols_2_6);
  out[3] = vget_low_s16(cols_3_7);
  out[4] = vget_high_s16(cols_0_4);
  out[5] = vget_high_s16(cols_1_5);
  out[6] = vget_high_s16(cols_2_6);
  out[7] = vget_high_s16(cols_3_7);
}
1332*77c1e3ccSAndroid Build Coastguard Worker 
// Transposes a 4x4 matrix of 64-bit elements. Each matrix row occupies two
// consecutive int64x2_t vectors, on input and on output. With "rc" denoting
// row r, column c of the input:
//
//   in[0] = 00 01   in[1] = 02 03
//   in[2] = 10 11   in[3] = 12 13
//   in[4] = 20 21   in[5] = 22 23
//   in[6] = 30 31   in[7] = 32 33
//
//   out[0] = 00 10  out[1] = 20 30
//   out[2] = 01 11  out[3] = 21 31
//   out[4] = 02 12  out[5] = 22 32
//   out[6] = 03 13  out[7] = 23 33
static inline void transpose_arrays_s64_4x4(const int64x2_t *in,
                                            int64x2_t *out) {
  // half == 0 consumes the even-indexed input vectors (columns 0-1) and
  // produces out[0..3]; half == 1 consumes the odd-indexed ones
  // (columns 2-3) and produces out[4..7].
  for (int half = 0; half < 2; ++half) {
    const int64x2_t *const src = in + half;
    int64x2_t *const dst = out + 4 * half;

    // trn1 gathers the low lanes (the even column of this half), trn2 the
    // high lanes (the odd column).
    dst[0] = aom_vtrn1q_s64(src[0], src[2]);
    dst[1] = aom_vtrn1q_s64(src[4], src[6]);
    dst[2] = aom_vtrn2q_s64(src[0], src[2]);
    dst[3] = aom_vtrn2q_s64(src[4], src[6]);
  }
}
1364*77c1e3ccSAndroid Build Coastguard Worker 
1365*77c1e3ccSAndroid Build Coastguard Worker #endif  // AOM_AOM_DSP_ARM_TRANSPOSE_NEON_H_
1366