/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10*fb1b10abSAndroid Build Coastguard Worker
11*fb1b10abSAndroid Build Coastguard Worker #ifndef VPX_VPX_DSP_X86_TRANSPOSE_SSE2_H_
12*fb1b10abSAndroid Build Coastguard Worker #define VPX_VPX_DSP_X86_TRANSPOSE_SSE2_H_
13*fb1b10abSAndroid Build Coastguard Worker
14*fb1b10abSAndroid Build Coastguard Worker #include <emmintrin.h> // SSE2
15*fb1b10abSAndroid Build Coastguard Worker
16*fb1b10abSAndroid Build Coastguard Worker #include "./vpx_config.h"
17*fb1b10abSAndroid Build Coastguard Worker
// Transposes a 4x4 block of 8-bit elements.
//
// in:  four vectors, one row each; only the low 4 bytes of every vector
//      contribute to the result.
// Returns one vector holding the transposed block, column after column:
//   00 10 20 30  01 11 21 31  02 12 22 32  03 13 23 33
static INLINE __m128i transpose_8bit_4x4(const __m128i *const in) {
  // Byte-interleave neighbouring rows:
  //   rows01: 00 10 01 11 02 12 03 13
  //   rows23: 20 30 21 31 22 32 23 33
  const __m128i rows01 = _mm_unpacklo_epi8(in[0], in[1]);
  const __m128i rows23 = _mm_unpacklo_epi8(in[2], in[3]);

  // Interleaving the 16-bit pairs makes each column contiguous.
  const __m128i transposed = _mm_unpacklo_epi16(rows01, rows23);
  return transposed;
}
34*fb1b10abSAndroid Build Coastguard Worker
// Transposes an 8x8 block of 8-bit elements.
//
// in:  eight vectors, one row each; only the low 8 bytes of every vector
//      are used.
// out: eight vectors; out[i] carries column i (00+i 10+i ... 70+i) in its low
//      8 bytes, and the same 8 bytes again in its high half.
// Every in[] element is read before the first store to out[].
static INLINE void transpose_8bit_8x8(const __m128i *const in,
                                      __m128i *const out) {
  // Stage 1: byte-interleave neighbouring rows.
  //   rows01: 00 10 01 11 02 12 03 13  04 14 05 15 06 16 07 17
  //   rows23: 20 30 21 31 22 32 23 33  24 34 25 35 26 36 27 37
  //   rows45: 40 50 41 51 42 52 43 53  44 54 45 55 46 56 47 57
  //   rows67: 60 70 61 71 62 72 63 73  64 74 65 75 66 76 67 77
  const __m128i rows01 = _mm_unpacklo_epi8(in[0], in[1]);
  const __m128i rows23 = _mm_unpacklo_epi8(in[2], in[3]);
  const __m128i rows45 = _mm_unpacklo_epi8(in[4], in[5]);
  const __m128i rows67 = _mm_unpacklo_epi8(in[6], in[7]);

  // Stage 2: interleave 16-bit pairs, giving 4-row column fragments.
  //   top_c0123: 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
  //   top_c4567: 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
  //   bot_c0123: 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
  //   bot_c4567: 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
  const __m128i top_c0123 = _mm_unpacklo_epi16(rows01, rows23);
  const __m128i top_c4567 = _mm_unpackhi_epi16(rows01, rows23);
  const __m128i bot_c0123 = _mm_unpacklo_epi16(rows45, rows67);
  const __m128i bot_c4567 = _mm_unpackhi_epi16(rows45, rows67);

  // Stage 3: interleave 32-bit fragments; each vector now holds two complete
  // columns.
  //   cols01: 00 10 20 30 40 50 60 70  01 11 21 31 41 51 61 71
  //   cols23: 02 12 22 32 42 52 62 72  03 13 23 33 43 53 63 73
  //   cols45: 04 14 24 34 44 54 64 74  05 15 25 35 45 55 65 75
  //   cols67: 06 16 26 36 46 56 66 76  07 17 27 37 47 57 67 77
  const __m128i cols01 = _mm_unpacklo_epi32(top_c0123, bot_c0123);
  const __m128i cols23 = _mm_unpackhi_epi32(top_c0123, bot_c0123);
  const __m128i cols45 = _mm_unpacklo_epi32(top_c4567, bot_c4567);
  const __m128i cols67 = _mm_unpackhi_epi32(top_c4567, bot_c4567);

  // Stage 4: split each column pair; unpacking a vector with itself puts the
  // selected 64-bit column in both halves of the output.
  out[0] = _mm_unpacklo_epi64(cols01, cols01);
  out[1] = _mm_unpackhi_epi64(cols01, cols01);
  out[2] = _mm_unpacklo_epi64(cols23, cols23);
  out[3] = _mm_unpackhi_epi64(cols23, cols23);
  out[4] = _mm_unpacklo_epi64(cols45, cols45);
  out[5] = _mm_unpackhi_epi64(cols45, cols45);
  out[6] = _mm_unpacklo_epi64(cols67, cols67);
  out[7] = _mm_unpackhi_epi64(cols67, cols67);
}
94*fb1b10abSAndroid Build Coastguard Worker
// Transposes a 4x4 block of 16-bit elements.
//
// in:  four vectors, one row each; only the low four 16-bit lanes of every
//      vector are used.
// out: two vectors, each packing two transposed columns:
//        out[0]: 00 10 20 30  01 11 21 31
//        out[1]: 02 12 22 32  03 13 23 33
// Every in[] element is read before the first store to out[].
static INLINE void transpose_16bit_4x4(const __m128i *const in,
                                       __m128i *const out) {
  // Interleave the 16-bit lanes of neighbouring rows:
  //   rows01: 00 10 01 11 02 12 03 13
  //   rows23: 20 30 21 31 22 32 23 33
  const __m128i rows01 = _mm_unpacklo_epi16(in[0], in[1]);
  const __m128i rows23 = _mm_unpacklo_epi16(in[2], in[3]);

  // Interleaving 32-bit pairs completes columns 0/1 (low) and 2/3 (high).
  const __m128i cols01 = _mm_unpacklo_epi32(rows01, rows23);
  const __m128i cols23 = _mm_unpackhi_epi32(rows01, rows23);

  out[0] = cols01;
  out[1] = cols23;
}
114*fb1b10abSAndroid Build Coastguard Worker
// Transposes a block of 16-bit elements that is 4 columns wide and 8 rows
// tall.
//
// in:  eight vectors, one row each; only the low four 16-bit lanes of every
//      vector are used.
// out: four vectors, one column each:
//        out[0]: 00 10 20 30 40 50 60 70
//        out[1]: 01 11 21 31 41 51 61 71
//        out[2]: 02 12 22 32 42 52 62 72
//        out[3]: 03 13 23 33 43 53 63 73
// Every in[] element is read before the first store to out[].
static INLINE void transpose_16bit_4x8(const __m128i *const in,
                                       __m128i *const out) {
  // Stage 1: interleave the 16-bit lanes of neighbouring rows.
  //   rows01: 00 10 01 11 02 12 03 13
  //   rows23: 20 30 21 31 22 32 23 33
  //   rows45: 40 50 41 51 42 52 43 53
  //   rows67: 60 70 61 71 62 72 63 73
  const __m128i rows01 = _mm_unpacklo_epi16(in[0], in[1]);
  const __m128i rows23 = _mm_unpacklo_epi16(in[2], in[3]);
  const __m128i rows45 = _mm_unpacklo_epi16(in[4], in[5]);
  const __m128i rows67 = _mm_unpacklo_epi16(in[6], in[7]);

  // Stage 2: interleave 32-bit pairs, giving half-columns.
  //   top_c01: 00 10 20 30 01 11 21 31
  //   top_c23: 02 12 22 32 03 13 23 33
  //   bot_c01: 40 50 60 70 41 51 61 71
  //   bot_c23: 42 52 62 72 43 53 63 73
  const __m128i top_c01 = _mm_unpacklo_epi32(rows01, rows23);
  const __m128i top_c23 = _mm_unpackhi_epi32(rows01, rows23);
  const __m128i bot_c01 = _mm_unpacklo_epi32(rows45, rows67);
  const __m128i bot_c23 = _mm_unpackhi_epi32(rows45, rows67);

  // Stage 3: join the top and bottom halves of every column.
  out[0] = _mm_unpacklo_epi64(top_c01, bot_c01);
  out[1] = _mm_unpackhi_epi64(top_c01, bot_c01);
  out[2] = _mm_unpacklo_epi64(top_c23, bot_c23);
  out[3] = _mm_unpackhi_epi64(top_c23, bot_c23);
}
156*fb1b10abSAndroid Build Coastguard Worker
// Transposes an 8x8 block of 16-bit elements.
//
// in:  eight vectors, one row each (row r holds r0 r1 ... r7).
// out: eight vectors, one column each (out[c] holds 0c 1c ... 7c).
// Every in[] element is read before the first store to out[], so the same
// array may be passed for both arguments.
static INLINE void transpose_16bit_8x8(const __m128i *const in,
                                       __m128i *const out) {
  // Stage 1: interleave the 16-bit lanes of neighbouring rows. The "lo"
  // vectors cover columns 0-3, the "hi" vectors columns 4-7.
  //   lo01: 00 10 01 11 02 12 03 13    hi01: 04 14 05 15 06 16 07 17
  //   lo23: 20 30 21 31 22 32 23 33    hi23: 24 34 25 35 26 36 27 37
  //   lo45: 40 50 41 51 42 52 43 53    hi45: 44 54 45 55 46 56 47 57
  //   lo67: 60 70 61 71 62 72 63 73    hi67: 64 74 65 75 66 76 67 77
  const __m128i lo01 = _mm_unpacklo_epi16(in[0], in[1]);
  const __m128i hi01 = _mm_unpackhi_epi16(in[0], in[1]);
  const __m128i lo23 = _mm_unpacklo_epi16(in[2], in[3]);
  const __m128i hi23 = _mm_unpackhi_epi16(in[2], in[3]);
  const __m128i lo45 = _mm_unpacklo_epi16(in[4], in[5]);
  const __m128i hi45 = _mm_unpackhi_epi16(in[4], in[5]);
  const __m128i lo67 = _mm_unpacklo_epi16(in[6], in[7]);
  const __m128i hi67 = _mm_unpackhi_epi16(in[6], in[7]);

  // Stage 2: interleave 32-bit pairs, giving half-columns. "top" is rows 0-3,
  // "bot" is rows 4-7; the suffix names the two columns held.
  //   top_c01: 00 10 20 30 01 11 21 31    bot_c01: 40 50 60 70 41 51 61 71
  //   top_c23: 02 12 22 32 03 13 23 33    bot_c23: 42 52 62 72 43 53 63 73
  //   top_c45: 04 14 24 34 05 15 25 35    bot_c45: 44 54 64 74 45 55 65 75
  //   top_c67: 06 16 26 36 07 17 27 37    bot_c67: 46 56 66 76 47 57 67 77
  const __m128i top_c01 = _mm_unpacklo_epi32(lo01, lo23);
  const __m128i top_c23 = _mm_unpackhi_epi32(lo01, lo23);
  const __m128i bot_c01 = _mm_unpacklo_epi32(lo45, lo67);
  const __m128i bot_c23 = _mm_unpackhi_epi32(lo45, lo67);
  const __m128i top_c45 = _mm_unpacklo_epi32(hi01, hi23);
  const __m128i top_c67 = _mm_unpackhi_epi32(hi01, hi23);
  const __m128i bot_c45 = _mm_unpacklo_epi32(hi45, hi67);
  const __m128i bot_c67 = _mm_unpackhi_epi32(hi45, hi67);

  // Stage 3: join the top and bottom halves of every column.
  out[0] = _mm_unpacklo_epi64(top_c01, bot_c01);
  out[1] = _mm_unpackhi_epi64(top_c01, bot_c01);
  out[2] = _mm_unpacklo_epi64(top_c23, bot_c23);
  out[3] = _mm_unpackhi_epi64(top_c23, bot_c23);
  out[4] = _mm_unpacklo_epi64(top_c45, bot_c45);
  out[5] = _mm_unpackhi_epi64(top_c45, bot_c45);
  out[6] = _mm_unpacklo_epi64(top_c67, bot_c67);
  out[7] = _mm_unpackhi_epi64(top_c67, bot_c67);
}
222*fb1b10abSAndroid Build Coastguard Worker
// Transposes in place the data held in left[0..15] and right[0..15], working
// on four 8x8 tiles of 16-bit elements:
//   - left[0..7] and right[8..15] are each transposed onto themselves;
//   - the transpose of right[0..7] ends up in left[8..15];
//   - the transpose of left[8..15] ends up in right[0..7].
// NOTE(review): this matches a 16x16 transpose if left[r] / right[r] hold
// columns 0-7 / 8-15 of row r -- confirm against the callers.
static INLINE void transpose_16bit_16x16(__m128i *const left,
                                         __m128i *const right) {
  // right[0..7] is overwritten below before its transpose can be stored to
  // left[8..15], so park that transpose in a scratch buffer first.
  __m128i saved[8];
  int i;

  transpose_16bit_8x8(left, left);
  transpose_16bit_8x8(right, saved);
  transpose_16bit_8x8(left + 8, right);
  transpose_16bit_8x8(right + 8, right + 8);

  for (i = 0; i < 8; ++i) {
    left[8 + i] = saved[i];
  }
}
241*fb1b10abSAndroid Build Coastguard Worker
// Transposes a 4x4 block of 32-bit elements.
//
// in:  four vectors, one row each (row r holds r0 r1 r2 r3).
// out: four vectors, one column each (out[c] holds 0c 1c 2c 3c).
// Every in[] element is read before the first store to out[].
static INLINE void transpose_32bit_4x4(const __m128i *const in,
                                       __m128i *const out) {
  // Interleave the 32-bit lanes of neighbouring rows:
  //   top_c01: 00 10 01 11    top_c23: 02 12 03 13
  //   bot_c01: 20 30 21 31    bot_c23: 22 32 23 33
  const __m128i top_c01 = _mm_unpacklo_epi32(in[0], in[1]);
  const __m128i top_c23 = _mm_unpackhi_epi32(in[0], in[1]);
  const __m128i bot_c01 = _mm_unpacklo_epi32(in[2], in[3]);
  const __m128i bot_c23 = _mm_unpackhi_epi32(in[2], in[3]);

  // Join the 64-bit halves so every output vector is one full column.
  out[0] = _mm_unpacklo_epi64(top_c01, bot_c01);
  out[1] = _mm_unpackhi_epi64(top_c01, bot_c01);
  out[2] = _mm_unpacklo_epi64(top_c23, bot_c23);
  out[3] = _mm_unpackhi_epi64(top_c23, bot_c23);
}
270*fb1b10abSAndroid Build Coastguard Worker
// Transposes two independent 4x4 blocks of 32-bit elements.
//
// in:  in[0..3] are the rows of the first block, in[4..7] the rows of the
//      second block.
// out: out[0..3] is the transpose of in[0..3], out[4..7] the transpose of
//      in[4..7].
// Every in[] element is read before the first store to out[].
static INLINE void transpose_32bit_4x4x2(const __m128i *const in,
                                         __m128i *const out) {
  // First block, 32-bit interleave of neighbouring rows:
  //   p_top_lo: 00 10 01 11    p_top_hi: 02 12 03 13
  //   p_bot_lo: 20 30 21 31    p_bot_hi: 22 32 23 33
  const __m128i p_top_lo = _mm_unpacklo_epi32(in[0], in[1]);
  const __m128i p_top_hi = _mm_unpackhi_epi32(in[0], in[1]);
  const __m128i p_bot_lo = _mm_unpacklo_epi32(in[2], in[3]);
  const __m128i p_bot_hi = _mm_unpackhi_epi32(in[2], in[3]);

  // Second block, same pattern applied to in[4..7]:
  //   q_top_lo: 04 14 05 15    q_top_hi: 06 16 07 17
  //   q_bot_lo: 24 34 25 35    q_bot_hi: 26 36 27 37
  const __m128i q_top_lo = _mm_unpacklo_epi32(in[4], in[5]);
  const __m128i q_top_hi = _mm_unpackhi_epi32(in[4], in[5]);
  const __m128i q_bot_lo = _mm_unpacklo_epi32(in[6], in[7]);
  const __m128i q_bot_hi = _mm_unpackhi_epi32(in[6], in[7]);

  // Join 64-bit halves: one transposed line per output vector.
  out[0] = _mm_unpacklo_epi64(p_top_lo, p_bot_lo);
  out[1] = _mm_unpackhi_epi64(p_top_lo, p_bot_lo);
  out[2] = _mm_unpacklo_epi64(p_top_hi, p_bot_hi);
  out[3] = _mm_unpackhi_epi64(p_top_hi, p_bot_hi);
  out[4] = _mm_unpacklo_epi64(q_top_lo, q_bot_lo);
  out[5] = _mm_unpackhi_epi64(q_top_lo, q_bot_lo);
  out[6] = _mm_unpacklo_epi64(q_top_hi, q_bot_hi);
  out[7] = _mm_unpackhi_epi64(q_top_hi, q_bot_hi);
}
318*fb1b10abSAndroid Build Coastguard Worker
// Transposes a block of 32-bit elements that is 8 columns wide and 4 rows
// tall.
//
// in:  eight vectors; row r is split across in[2 * r] (columns 0-3) and
//      in[2 * r + 1] (columns 4-7).
// out: eight vectors, one column each (out[c] holds 0c 1c 2c 3c).
// Every in[] element is read before the first store to out[].
static INLINE void transpose_32bit_8x4(const __m128i *const in,
                                       __m128i *const out) {
  // Columns 0-3 live in the even-indexed inputs:
  //   l_top_lo: 00 10 01 11    l_top_hi: 02 12 03 13
  //   l_bot_lo: 20 30 21 31    l_bot_hi: 22 32 23 33
  const __m128i l_top_lo = _mm_unpacklo_epi32(in[0], in[2]);
  const __m128i l_top_hi = _mm_unpackhi_epi32(in[0], in[2]);
  const __m128i l_bot_lo = _mm_unpacklo_epi32(in[4], in[6]);
  const __m128i l_bot_hi = _mm_unpackhi_epi32(in[4], in[6]);

  // Columns 4-7 live in the odd-indexed inputs:
  //   r_top_lo: 04 14 05 15    r_top_hi: 06 16 07 17
  //   r_bot_lo: 24 34 25 35    r_bot_hi: 26 36 27 37
  const __m128i r_top_lo = _mm_unpacklo_epi32(in[1], in[3]);
  const __m128i r_top_hi = _mm_unpackhi_epi32(in[1], in[3]);
  const __m128i r_bot_lo = _mm_unpacklo_epi32(in[5], in[7]);
  const __m128i r_bot_hi = _mm_unpackhi_epi32(in[5], in[7]);

  // Join 64-bit halves: one full column per output vector.
  out[0] = _mm_unpacklo_epi64(l_top_lo, l_bot_lo);
  out[1] = _mm_unpackhi_epi64(l_top_lo, l_bot_lo);
  out[2] = _mm_unpacklo_epi64(l_top_hi, l_bot_hi);
  out[3] = _mm_unpackhi_epi64(l_top_hi, l_bot_hi);
  out[4] = _mm_unpacklo_epi64(r_top_lo, r_bot_lo);
  out[5] = _mm_unpackhi_epi64(r_top_lo, r_bot_lo);
  out[6] = _mm_unpacklo_epi64(r_top_hi, r_bot_hi);
  out[7] = _mm_unpackhi_epi64(r_top_hi, r_bot_hi);
}
366*fb1b10abSAndroid Build Coastguard Worker
367*fb1b10abSAndroid Build Coastguard Worker #endif // VPX_VPX_DSP_X86_TRANSPOSE_SSE2_H_
368