xref: /aosp_15_r20/external/XNNPACK/src/x8-zip/x3-sse2.c (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
// Copyright (c) Facebook, Inc. and its affiliates.
// All rights reserved.
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
8*4bdc9457SAndroid Build Coastguard Worker 
9*4bdc9457SAndroid Build Coastguard Worker #include <emmintrin.h>
10*4bdc9457SAndroid Build Coastguard Worker 
11*4bdc9457SAndroid Build Coastguard Worker #include <xnnpack/zip.h>
12*4bdc9457SAndroid Build Coastguard Worker 
13*4bdc9457SAndroid Build Coastguard Worker 
// Interleaves one group of 16 elements taken from three byte streams.
//
// Reads 16 bytes each from x, y and z (unaligned loads) and writes the 48 bytes
//   x0, y0, z0, x1, y1, z1, ..., x15, y15, z15
// to o (unaligned stores). The lane diagrams below list byte 15 first and
// byte 0 last.
static inline void xnn_x8_zip_x3_sse2_group16(
    const uint8_t* x,
    const uint8_t* y,
    const uint8_t* z,
    uint8_t* o)
{
  // The masks are built here so the helper is self-contained; they do not
  // depend on the inputs, so an optimizing compiler is expected to hoist them
  // out of the caller's loop once this function is inlined.
  const __m128i vmask0x00FF00FF = _mm_set1_epi16(0x00FF);
  const __m128i vmask0x0000FFFF = _mm_set1_epi32(0x0000FFFF);

  // vx  = ( x15, x14, x13, x12, x11, x10,  x9,  x8,  x7,  x6,  x5,  x4, x3, x2, x1, x0 )
  const __m128i vx = _mm_loadu_si128((const __m128i*) x);
  // vy  = ( y15, y14, y13, y12, y11, y10,  y9,  y8,  y7,  y6,  y5,  y4, y3, y2, y1, y0 )
  const __m128i vy = _mm_loadu_si128((const __m128i*) y);
  // vz  = ( z15, z14, z13, z12, z11, z10,  z9,  z8,  z7,  z6,  z5,  z4, z3, z2, z1, z0 )
  const __m128i vz = _mm_loadu_si128((const __m128i*) z);

  // Stage 1: merge at byte granularity inside every 16-bit lane.
  // vxeye     = ( y14, x14, y12, x12, y10, x10,  y8,  x8,  y6,  x6,  y4,  x4,  y2,  x2,  y0,  x0 )
  const __m128i vxeye = _mm_or_si128(_mm_and_si128(vx, vmask0x00FF00FF), _mm_slli_epi16(vy, 8));
  // vyozo     = ( z15, y15, z13, y13, z11, y11,  z9,  y9,  z7,  y7,  z5,  y5,  z3,  y3,  z1,  y1 )
  const __m128i vyozo = _mm_or_si128(_mm_andnot_si128(vmask0x00FF00FF, vz), _mm_srli_epi16(vy, 8));
  // vzexo     = ( x15, z14, x13, z12, x11, z10,  x9,  z8,  x7,  z6,  x5,  z4,  x3,  z2,  x1,  z0 )
  const __m128i vzexo = _mm_or_si128(_mm_and_si128(vz, vmask0x00FF00FF), _mm_andnot_si128(vmask0x00FF00FF, vx));

  // Stage 2: merge at 16-bit granularity inside every 32-bit lane.
  // vxeyezexo = ( x13, z12, y12, x12,  x9,  z8,  y8,  x8,  x5,  z4,  y4,  x4,  x1,  z0,  y0,  x0 )
  const __m128i vxeyezexo = _mm_or_si128(_mm_and_si128(vxeye, vmask0x0000FFFF), _mm_slli_epi32(vzexo, 16));
  // vyozoxeye = ( y14, x14, z13, y13, y10, x10,  z9,  y9,  y6,  x6,  z5,  y5,  y2,  x2,  z1,  y1 )
  const __m128i vyozoxeye = _mm_or_si128(_mm_and_si128(vyozo, vmask0x0000FFFF), _mm_andnot_si128(vmask0x0000FFFF, vxeye));
  // vzexoyozo = ( z15, y15, x15, z14, z11, y11, x11, z10,  z7,  y7,  x7,  z6,  z3,  y3,  x3,  z2 )
  const __m128i vzexoyozo = _mm_or_si128(_mm_andnot_si128(vmask0x0000FFFF, vyozo), _mm_srli_epi32(vzexo, 16));

  // Stage 3: reorder whole 32-bit lanes. SSE2 has no integer shuffle that
  // takes lanes from two sources, so the float shuffle is used on bit-casts.
  // vtemp0    = ( x13, z12, y12, x12,  x5,  z4,  y4,  x4, z11, y11, x11, z10,  z3,  y3,  x3,  z2 )
  const __m128i vtemp0 = _mm_castps_si128(
      _mm_shuffle_ps(_mm_castsi128_ps(vzexoyozo), _mm_castsi128_ps(vxeyezexo), _MM_SHUFFLE(3, 1, 2, 0)));
  // vtemp1    = ( y10, x10,  z9,  y9,  y2,  x2,  z1,  y1,  x9,  z8,  y8,  x8,  x1,  z0,  y0,  x0 )
  const __m128i vtemp1 = _mm_castps_si128(
      _mm_shuffle_ps(_mm_castsi128_ps(vxeyezexo), _mm_castsi128_ps(vyozoxeye), _MM_SHUFFLE(2, 0, 2, 0)));
  // vtemp2    = ( z15, y15, x15, z14,  z7,  y7,  x7,  z6, y14, x14, z13, y13,  y6,  x6,  z5,  y5 )
  const __m128i vtemp2 = _mm_castps_si128(
      _mm_shuffle_ps(_mm_castsi128_ps(vyozoxeye), _mm_castsi128_ps(vzexoyozo), _MM_SHUFFLE(3, 1, 3, 1)));

  // vxyz0     = (  x5,  z4,  y4,  x4,  z3,  y3,  x3,  z2,  y2,  x2,  z1,  y1,  x1,  z0,  y0,  x0 )
  const __m128i vxyz0 = _mm_castps_si128(
      _mm_shuffle_ps(_mm_castsi128_ps(vtemp1), _mm_castsi128_ps(vtemp0), _MM_SHUFFLE(2, 0, 2, 0)));
  // vxyz1     = ( y10, x10,  z9,  y9,  x9,  z8,  y8,  x8,  z7,  y7,  x7,  z6,  y6,  x6,  z5,  y5 )
  const __m128i vxyz1 = _mm_castps_si128(
      _mm_shuffle_ps(_mm_castsi128_ps(vtemp2), _mm_castsi128_ps(vtemp1), _MM_SHUFFLE(3, 1, 2, 0)));
  // vxyz2     = ( z15, y15, x15, z14, y14, x14, z13, y13, x13, z12, y12, x12, z11, y11, x11, z10 )
  const __m128i vxyz2 = _mm_castps_si128(
      _mm_shuffle_ps(_mm_castsi128_ps(vtemp0), _mm_castsi128_ps(vtemp2), _MM_SHUFFLE(3, 1, 3, 1)));

  _mm_storeu_si128((__m128i*) o, vxyz0);
  _mm_storeu_si128((__m128i*) o + 1, vxyz1);
  _mm_storeu_si128((__m128i*) o + 2, vxyz2);
}

// Zips three byte streams into one interleaved stream (SSE2 implementation).
//
// n      - number of elements in each stream.
// input  - 3 * n bytes: stream x at input[0..n), stream y at input[n..2n),
//          stream z at input[2n..3n).
// output - receives 3 * n bytes laid out as x0, y0, z0, x1, y1, z1, ...
//
// n == 0 is a no-op. For n >= 16 the data is processed in groups of 16; a
// trailing partial group is handled by re-processing the last 16 elements, so
// some already-written output bytes are rewritten with the same values. For
// n < 16 a scalar loop is used.
void xnn_x8_zip_x3_ukernel__sse2(
    size_t n,
    const uint8_t* input,
    uint8_t* output)
{
  // Without this guard the scalar do/while below would copy one element and
  // then wrap n around to SIZE_MAX.
  if (n == 0) {
    return;
  }

  const uint8_t* x = input;
  const uint8_t* y = x + n;
  const uint8_t* z = y + n;
  uint8_t* o = output;

  if (n >= 16) {
    do {
      xnn_x8_zip_x3_sse2_group16(x, y, z, o);
      x += 16;
      y += 16;
      z += 16;
      o += 48;
      n -= 16;
    } while (n >= 16);
    if (n != 0) {
      // 1..15 elements remain. Step every pointer back so that the final
      // 16-element group ends exactly at the end of each stream; at least one
      // full group has been consumed already, so the stepped-back addresses
      // stay inside the buffers.
      const size_t overlap = 16 - n;
      xnn_x8_zip_x3_sse2_group16(x - overlap, y - overlap, z - overlap, o - overlap * 3);
    }
  } else {
    do {
      const uint8_t vx = *x++;
      const uint8_t vy = *y++;
      const uint8_t vz = *z++;
      o[0] = vx;
      o[1] = vy;
      o[2] = vz;
      o += 3;
    } while (--n != 0);
  }
}
138