xref: /aosp_15_r20/external/libvpx/vpx_dsp/x86/transpose_sse2.h (revision fb1b10ab9aebc7c7068eedab379b749d7e3900be)
1*fb1b10abSAndroid Build Coastguard Worker /*
2*fb1b10abSAndroid Build Coastguard Worker  *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3*fb1b10abSAndroid Build Coastguard Worker  *
4*fb1b10abSAndroid Build Coastguard Worker  *  Use of this source code is governed by a BSD-style license
5*fb1b10abSAndroid Build Coastguard Worker  *  that can be found in the LICENSE file in the root of the source
6*fb1b10abSAndroid Build Coastguard Worker  *  tree. An additional intellectual property rights grant can be found
7*fb1b10abSAndroid Build Coastguard Worker  *  in the file PATENTS.  All contributing project authors may
8*fb1b10abSAndroid Build Coastguard Worker  *  be found in the AUTHORS file in the root of the source tree.
9*fb1b10abSAndroid Build Coastguard Worker  */
10*fb1b10abSAndroid Build Coastguard Worker 
11*fb1b10abSAndroid Build Coastguard Worker #ifndef VPX_VPX_DSP_X86_TRANSPOSE_SSE2_H_
12*fb1b10abSAndroid Build Coastguard Worker #define VPX_VPX_DSP_X86_TRANSPOSE_SSE2_H_
13*fb1b10abSAndroid Build Coastguard Worker 
14*fb1b10abSAndroid Build Coastguard Worker #include <emmintrin.h>  // SSE2
15*fb1b10abSAndroid Build Coastguard Worker 
16*fb1b10abSAndroid Build Coastguard Worker #include "./vpx_config.h"
17*fb1b10abSAndroid Build Coastguard Worker 
// Transposes a 4x4 block of 8-bit values.
//
// in:  four registers, one row each. Only the four lowest bytes of every
//      register contribute to the result.
// Returns the four columns packed back to back in a single register:
//   00 10 20 30  01 11 21 31  02 12 22 32  03 13 23 33
static INLINE __m128i transpose_8bit_4x4(const __m128i *const in) {
  // Unpack 8 bit elements. Goes from:
  // in[0]: 00 01 02 03
  // in[1]: 10 11 12 13
  // in[2]: 20 21 22 23
  // in[3]: 30 31 32 33
  // to:
  // a0:    00 10 01 11  02 12 03 13
  // a1:    20 30 21 31  22 32 23 33
  const __m128i a0 = _mm_unpacklo_epi8(in[0], in[1]);
  const __m128i a1 = _mm_unpacklo_epi8(in[2], in[3]);

  // Unpack 16 bit elements resulting in:
  // 00 10 20 30  01 11 21 31  02 12 22 32  03 13 23 33
  return _mm_unpacklo_epi16(a0, a1);
}
34*fb1b10abSAndroid Build Coastguard Worker 
// Transposes an 8x8 block of 8-bit values.
//
// in:  eight registers, one row each. Only the eight lowest bytes of every
//      register are read.
// out: eight registers, one column each. The column sits in the low eight
//      bytes and is repeated in the high eight bytes, because each result is
//      formed by unpacking a register with itself.
static INLINE void transpose_8bit_8x8(const __m128i *const in,
                                      __m128i *const out) {
  // Unpack 8 bit elements. Goes from:
  // in[0]: 00 01 02 03 04 05 06 07
  // in[1]: 10 11 12 13 14 15 16 17
  // in[2]: 20 21 22 23 24 25 26 27
  // in[3]: 30 31 32 33 34 35 36 37
  // in[4]: 40 41 42 43 44 45 46 47
  // in[5]: 50 51 52 53 54 55 56 57
  // in[6]: 60 61 62 63 64 65 66 67
  // in[7]: 70 71 72 73 74 75 76 77
  // to:
  // a0:    00 10 01 11 02 12 03 13  04 14 05 15 06 16 07 17
  // a1:    20 30 21 31 22 32 23 33  24 34 25 35 26 36 27 37
  // a2:    40 50 41 51 42 52 43 53  44 54 45 55 46 56 47 57
  // a3:    60 70 61 71 62 72 63 73  64 74 65 75 66 76 67 77
  const __m128i a0 = _mm_unpacklo_epi8(in[0], in[1]);
  const __m128i a1 = _mm_unpacklo_epi8(in[2], in[3]);
  const __m128i a2 = _mm_unpacklo_epi8(in[4], in[5]);
  const __m128i a3 = _mm_unpacklo_epi8(in[6], in[7]);

  // Unpack 16 bit elements resulting in:
  // b0: 00 10 20 30 01 11 21 31  02 12 22 32 03 13 23 33
  // b1: 04 14 24 34 05 15 25 35  06 16 26 36 07 17 27 37
  // b2: 40 50 60 70 41 51 61 71  42 52 62 72 43 53 63 73
  // b3: 44 54 64 74 45 55 65 75  46 56 66 76 47 57 67 77
  const __m128i b0 = _mm_unpacklo_epi16(a0, a1);
  const __m128i b1 = _mm_unpackhi_epi16(a0, a1);
  const __m128i b2 = _mm_unpacklo_epi16(a2, a3);
  const __m128i b3 = _mm_unpackhi_epi16(a2, a3);

  // Unpack 32 bit elements resulting in:
  // c0: 00 10 20 30 40 50 60 70  01 11 21 31 41 51 61 71
  // c1: 02 12 22 32 42 52 62 72  03 13 23 33 43 53 63 73
  // c2: 04 14 24 34 44 54 64 74  05 15 25 35 45 55 65 75
  // c3: 06 16 26 36 46 56 66 76  07 17 27 37 47 57 67 77
  const __m128i c0 = _mm_unpacklo_epi32(b0, b2);
  const __m128i c1 = _mm_unpackhi_epi32(b0, b2);
  const __m128i c2 = _mm_unpacklo_epi32(b1, b3);
  const __m128i c3 = _mm_unpackhi_epi32(b1, b3);

  // Unpack 64 bit elements resulting in (low half shown, high half is a copy):
  // out[0]: 00 10 20 30 40 50 60 70
  // out[1]: 01 11 21 31 41 51 61 71
  // out[2]: 02 12 22 32 42 52 62 72
  // out[3]: 03 13 23 33 43 53 63 73
  // out[4]: 04 14 24 34 44 54 64 74
  // out[5]: 05 15 25 35 45 55 65 75
  // out[6]: 06 16 26 36 46 56 66 76
  // out[7]: 07 17 27 37 47 57 67 77
  out[0] = _mm_unpacklo_epi64(c0, c0);
  out[1] = _mm_unpackhi_epi64(c0, c0);
  out[2] = _mm_unpacklo_epi64(c1, c1);
  out[3] = _mm_unpackhi_epi64(c1, c1);
  out[4] = _mm_unpacklo_epi64(c2, c2);
  out[5] = _mm_unpackhi_epi64(c2, c2);
  out[6] = _mm_unpacklo_epi64(c3, c3);
  out[7] = _mm_unpackhi_epi64(c3, c3);
}
94*fb1b10abSAndroid Build Coastguard Worker 
// Transposes a 4x4 block of 16-bit values.
//
// in:  four registers, one row each. Only the four lowest 16-bit lanes of
//      every register are read; the upper lanes (XX below) are ignored.
// out: two registers, each carrying two columns back to back
//      (out[0] = columns 0 and 1, out[1] = columns 2 and 3).
static INLINE void transpose_16bit_4x4(const __m128i *const in,
                                       __m128i *const out) {
  // Unpack 16 bit elements. Goes from:
  // in[0]: 00 01 02 03  XX XX XX XX
  // in[1]: 10 11 12 13  XX XX XX XX
  // in[2]: 20 21 22 23  XX XX XX XX
  // in[3]: 30 31 32 33  XX XX XX XX
  // to:
  // a0:    00 10 01 11  02 12 03 13
  // a1:    20 30 21 31  22 32 23 33
  const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
  const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);

  // Unpack 32 bit elements resulting in:
  // out[0]: 00 10 20 30  01 11 21 31
  // out[1]: 02 12 22 32  03 13 23 33
  out[0] = _mm_unpacklo_epi32(a0, a1);
  out[1] = _mm_unpackhi_epi32(a0, a1);
}
114*fb1b10abSAndroid Build Coastguard Worker 
// Transposes a block of 16-bit values held as eight rows of four elements
// into four rows of eight elements.
//
// in:  eight registers, one row each. Only the four lowest 16-bit lanes of
//      every register are read; the upper lanes (XX below) are ignored.
// out: four registers, each holding one full eight-element column.
static INLINE void transpose_16bit_4x8(const __m128i *const in,
                                       __m128i *const out) {
  // Unpack 16 bit elements. Goes from:
  // in[0]: 00 01 02 03  XX XX XX XX
  // in[1]: 10 11 12 13  XX XX XX XX
  // in[2]: 20 21 22 23  XX XX XX XX
  // in[3]: 30 31 32 33  XX XX XX XX
  // in[4]: 40 41 42 43  XX XX XX XX
  // in[5]: 50 51 52 53  XX XX XX XX
  // in[6]: 60 61 62 63  XX XX XX XX
  // in[7]: 70 71 72 73  XX XX XX XX
  // to:
  // a0:    00 10 01 11  02 12 03 13
  // a1:    20 30 21 31  22 32 23 33
  // a2:    40 50 41 51  42 52 43 53
  // a3:    60 70 61 71  62 72 63 73
  const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
  const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
  const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]);
  const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]);

  // Unpack 32 bit elements resulting in:
  // b0: 00 10 20 30  01 11 21 31
  // b1: 40 50 60 70  41 51 61 71
  // b2: 02 12 22 32  03 13 23 33
  // b3: 42 52 62 72  43 53 63 73
  const __m128i b0 = _mm_unpacklo_epi32(a0, a1);
  const __m128i b1 = _mm_unpacklo_epi32(a2, a3);
  const __m128i b2 = _mm_unpackhi_epi32(a0, a1);
  const __m128i b3 = _mm_unpackhi_epi32(a2, a3);

  // Unpack 64 bit elements resulting in:
  // out[0]: 00 10 20 30  40 50 60 70
  // out[1]: 01 11 21 31  41 51 61 71
  // out[2]: 02 12 22 32  42 52 62 72
  // out[3]: 03 13 23 33  43 53 63 73
  out[0] = _mm_unpacklo_epi64(b0, b1);
  out[1] = _mm_unpackhi_epi64(b0, b1);
  out[2] = _mm_unpacklo_epi64(b2, b3);
  out[3] = _mm_unpackhi_epi64(b2, b3);
}
156*fb1b10abSAndroid Build Coastguard Worker 
// Transposes an 8x8 block of 16-bit values.
//
// in:  eight registers, one row each.
// out: eight registers, one column each.
// Every in[] element is read before the first out[] element is written, so
// passing the same array for in and out transposes in place.
static INLINE void transpose_16bit_8x8(const __m128i *const in,
                                       __m128i *const out) {
  // Unpack 16 bit elements. Goes from:
  // in[0]: 00 01 02 03  04 05 06 07
  // in[1]: 10 11 12 13  14 15 16 17
  // in[2]: 20 21 22 23  24 25 26 27
  // in[3]: 30 31 32 33  34 35 36 37
  // in[4]: 40 41 42 43  44 45 46 47
  // in[5]: 50 51 52 53  54 55 56 57
  // in[6]: 60 61 62 63  64 65 66 67
  // in[7]: 70 71 72 73  74 75 76 77
  // to:
  // a0:    00 10 01 11  02 12 03 13
  // a1:    20 30 21 31  22 32 23 33
  // a2:    40 50 41 51  42 52 43 53
  // a3:    60 70 61 71  62 72 63 73
  // a4:    04 14 05 15  06 16 07 17
  // a5:    24 34 25 35  26 36 27 37
  // a6:    44 54 45 55  46 56 47 57
  // a7:    64 74 65 75  66 76 67 77
  const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
  const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
  const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]);
  const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]);
  const __m128i a4 = _mm_unpackhi_epi16(in[0], in[1]);
  const __m128i a5 = _mm_unpackhi_epi16(in[2], in[3]);
  const __m128i a6 = _mm_unpackhi_epi16(in[4], in[5]);
  const __m128i a7 = _mm_unpackhi_epi16(in[6], in[7]);

  // Unpack 32 bit elements resulting in:
  // b0: 00 10 20 30  01 11 21 31
  // b1: 40 50 60 70  41 51 61 71
  // b2: 04 14 24 34  05 15 25 35
  // b3: 44 54 64 74  45 55 65 75
  // b4: 02 12 22 32  03 13 23 33
  // b5: 42 52 62 72  43 53 63 73
  // b6: 06 16 26 36  07 17 27 37
  // b7: 46 56 66 76  47 57 67 77
  const __m128i b0 = _mm_unpacklo_epi32(a0, a1);
  const __m128i b1 = _mm_unpacklo_epi32(a2, a3);
  const __m128i b2 = _mm_unpacklo_epi32(a4, a5);
  const __m128i b3 = _mm_unpacklo_epi32(a6, a7);
  const __m128i b4 = _mm_unpackhi_epi32(a0, a1);
  const __m128i b5 = _mm_unpackhi_epi32(a2, a3);
  const __m128i b6 = _mm_unpackhi_epi32(a4, a5);
  const __m128i b7 = _mm_unpackhi_epi32(a6, a7);

  // Unpack 64 bit elements resulting in:
  // out[0]: 00 10 20 30  40 50 60 70
  // out[1]: 01 11 21 31  41 51 61 71
  // out[2]: 02 12 22 32  42 52 62 72
  // out[3]: 03 13 23 33  43 53 63 73
  // out[4]: 04 14 24 34  44 54 64 74
  // out[5]: 05 15 25 35  45 55 65 75
  // out[6]: 06 16 26 36  46 56 66 76
  // out[7]: 07 17 27 37  47 57 67 77
  out[0] = _mm_unpacklo_epi64(b0, b1);
  out[1] = _mm_unpackhi_epi64(b0, b1);
  out[2] = _mm_unpacklo_epi64(b4, b5);
  out[3] = _mm_unpackhi_epi64(b4, b5);
  out[4] = _mm_unpacklo_epi64(b2, b3);
  out[5] = _mm_unpackhi_epi64(b2, b3);
  out[6] = _mm_unpacklo_epi64(b6, b7);
  out[7] = _mm_unpackhi_epi64(b6, b7);
}
222*fb1b10abSAndroid Build Coastguard Worker 
// Transpose in-place a 16x16 block of 16-bit values stored as two arrays of
// sixteen registers each (left[0..15] and right[0..15] are all accessed).
// Treating left[i] / right[i] as the first / last eight elements of row i,
// the block is handled as four 8x8 quadrants: the two diagonal quadrants are
// transposed where they are, and the two off-diagonal quadrants are
// transposed and exchanged.
static INLINE void transpose_16bit_16x16(__m128i *const left,
                                         __m128i *const right) {
  // Holds the transposed right[0..7] quadrant while right[0..7] is being
  // overwritten with the transposed left[8..15] quadrant.
  __m128i tbuf[8];
  transpose_16bit_8x8(left, left);
  transpose_16bit_8x8(right, tbuf);
  transpose_16bit_8x8(left + 8, right);
  transpose_16bit_8x8(right + 8, right + 8);

  // left[8..15] has been consumed above, so it can now receive the saved
  // quadrant.
  left[8] = tbuf[0];
  left[9] = tbuf[1];
  left[10] = tbuf[2];
  left[11] = tbuf[3];
  left[12] = tbuf[4];
  left[13] = tbuf[5];
  left[14] = tbuf[6];
  left[15] = tbuf[7];
}
241*fb1b10abSAndroid Build Coastguard Worker 
// Transposes a 4x4 block of 32-bit values.
//
// in:  four registers, one row each.
// out: four registers, one column each.
// Every in[] element is read before the first out[] element is written.
static INLINE void transpose_32bit_4x4(const __m128i *const in,
                                       __m128i *const out) {
  // Unpack 32 bit elements. Goes from:
  // in[0]: 00 01 02 03
  // in[1]: 10 11 12 13
  // in[2]: 20 21 22 23
  // in[3]: 30 31 32 33
  // to:
  // a0:    00 10 01 11
  // a1:    20 30 21 31
  // a2:    02 12 03 13
  // a3:    22 32 23 33
  const __m128i a0 = _mm_unpacklo_epi32(in[0], in[1]);
  const __m128i a1 = _mm_unpacklo_epi32(in[2], in[3]);
  const __m128i a2 = _mm_unpackhi_epi32(in[0], in[1]);
  const __m128i a3 = _mm_unpackhi_epi32(in[2], in[3]);

  // Unpack 64 bit elements resulting in:
  // out[0]: 00 10 20 30
  // out[1]: 01 11 21 31
  // out[2]: 02 12 22 32
  // out[3]: 03 13 23 33
  out[0] = _mm_unpacklo_epi64(a0, a1);
  out[1] = _mm_unpackhi_epi64(a0, a1);
  out[2] = _mm_unpacklo_epi64(a2, a3);
  out[3] = _mm_unpackhi_epi64(a2, a3);
}
270*fb1b10abSAndroid Build Coastguard Worker 
// Transposes two independent 4x4 blocks of 32-bit values in one call.
//
// in:  in[0..3] are the rows of the first block and in[4..7] the rows of the
//      second block (shown below as columns 0-3 and 4-7 of the same four
//      rows).
// out: out[0..3] are the columns of the first block and out[4..7] the
//      columns of the second block.
// Every in[] element is read before the first out[] element is written.
static INLINE void transpose_32bit_4x4x2(const __m128i *const in,
                                         __m128i *const out) {
  // Unpack 32 bit elements. Goes from:
  // in[0]: 00 01 02 03
  // in[1]: 10 11 12 13
  // in[2]: 20 21 22 23
  // in[3]: 30 31 32 33
  // in[4]: 04 05 06 07
  // in[5]: 14 15 16 17
  // in[6]: 24 25 26 27
  // in[7]: 34 35 36 37
  // to:
  // a0:    00 10 01 11
  // a1:    20 30 21 31
  // a2:    02 12 03 13
  // a3:    22 32 23 33
  // a4:    04 14 05 15
  // a5:    24 34 25 35
  // a6:    06 16 07 17
  // a7:    26 36 27 37
  const __m128i a0 = _mm_unpacklo_epi32(in[0], in[1]);
  const __m128i a1 = _mm_unpacklo_epi32(in[2], in[3]);
  const __m128i a2 = _mm_unpackhi_epi32(in[0], in[1]);
  const __m128i a3 = _mm_unpackhi_epi32(in[2], in[3]);
  const __m128i a4 = _mm_unpacklo_epi32(in[4], in[5]);
  const __m128i a5 = _mm_unpacklo_epi32(in[6], in[7]);
  const __m128i a6 = _mm_unpackhi_epi32(in[4], in[5]);
  const __m128i a7 = _mm_unpackhi_epi32(in[6], in[7]);

  // Unpack 64 bit elements resulting in:
  // out[0]: 00 10 20 30
  // out[1]: 01 11 21 31
  // out[2]: 02 12 22 32
  // out[3]: 03 13 23 33
  // out[4]: 04 14 24 34
  // out[5]: 05 15 25 35
  // out[6]: 06 16 26 36
  // out[7]: 07 17 27 37
  out[0] = _mm_unpacklo_epi64(a0, a1);
  out[1] = _mm_unpackhi_epi64(a0, a1);
  out[2] = _mm_unpacklo_epi64(a2, a3);
  out[3] = _mm_unpackhi_epi64(a2, a3);
  out[4] = _mm_unpacklo_epi64(a4, a5);
  out[5] = _mm_unpackhi_epi64(a4, a5);
  out[6] = _mm_unpacklo_epi64(a6, a7);
  out[7] = _mm_unpackhi_epi64(a6, a7);
}
318*fb1b10abSAndroid Build Coastguard Worker 
// Transposes a block of 32-bit values held as four rows of eight elements
// into eight rows of four elements.
//
// in:  eight registers; each row spans two consecutive registers
//      (in[2 * r] holds elements 0-3 of row r, in[2 * r + 1] elements 4-7).
// out: eight registers, one four-element column each.
// Every in[] element is read before the first out[] element is written.
static INLINE void transpose_32bit_8x4(const __m128i *const in,
                                       __m128i *const out) {
  // Unpack 32 bit elements. Goes from:
  // in[0]: 00 01 02 03
  // in[1]: 04 05 06 07
  // in[2]: 10 11 12 13
  // in[3]: 14 15 16 17
  // in[4]: 20 21 22 23
  // in[5]: 24 25 26 27
  // in[6]: 30 31 32 33
  // in[7]: 34 35 36 37
  // to:
  // a0: 00 10 01 11
  // a1: 20 30 21 31
  // a2: 02 12 03 13
  // a3: 22 32 23 33
  // a4: 04 14 05 15
  // a5: 24 34 25 35
  // a6: 06 16 07 17
  // a7: 26 36 27 37
  const __m128i a0 = _mm_unpacklo_epi32(in[0], in[2]);
  const __m128i a1 = _mm_unpacklo_epi32(in[4], in[6]);
  const __m128i a2 = _mm_unpackhi_epi32(in[0], in[2]);
  const __m128i a3 = _mm_unpackhi_epi32(in[4], in[6]);
  const __m128i a4 = _mm_unpacklo_epi32(in[1], in[3]);
  const __m128i a5 = _mm_unpacklo_epi32(in[5], in[7]);
  const __m128i a6 = _mm_unpackhi_epi32(in[1], in[3]);
  const __m128i a7 = _mm_unpackhi_epi32(in[5], in[7]);

  // Unpack 64 bit elements resulting in:
  // out[0]: 00 10 20 30
  // out[1]: 01 11 21 31
  // out[2]: 02 12 22 32
  // out[3]: 03 13 23 33
  // out[4]: 04 14 24 34
  // out[5]: 05 15 25 35
  // out[6]: 06 16 26 36
  // out[7]: 07 17 27 37
  out[0] = _mm_unpacklo_epi64(a0, a1);
  out[1] = _mm_unpackhi_epi64(a0, a1);
  out[2] = _mm_unpacklo_epi64(a2, a3);
  out[3] = _mm_unpackhi_epi64(a2, a3);
  out[4] = _mm_unpacklo_epi64(a4, a5);
  out[5] = _mm_unpackhi_epi64(a4, a5);
  out[6] = _mm_unpacklo_epi64(a6, a7);
  out[7] = _mm_unpackhi_epi64(a6, a7);
}
366*fb1b10abSAndroid Build Coastguard Worker 
367*fb1b10abSAndroid Build Coastguard Worker #endif  // VPX_VPX_DSP_X86_TRANSPOSE_SSE2_H_
368