1 // Copyright (c) Facebook, Inc. and its affiliates.
2 // All rights reserved.
3 //
4 // Copyright 2019 Google LLC
5 //
6 // This source code is licensed under the BSD-style license found in the
7 // LICENSE file in the root directory of this source tree.
8
9 #include <emmintrin.h>
10
11 #include <xnnpack/zip.h>
12 #include <xnnpack/unaligned.h>
13
14
// Interleaves ("zips") m input channels of n bytes each, so that
// output[i*m + j] = input[j*n + i]. This is the variable-M (XM) variant,
// used when m >= 4; it processes channels in groups of 4 using SSE2.
//
// n      - number of bytes per channel.
// m      - number of channels (assumed >= 4 by the clamping logic below).
// input  - m contiguous channels of n bytes each.
// output - m*n bytes of interleaved output.
void xnn_x8_zip_xm_ukernel__sse2(
    size_t n,
    size_t m,
    const uint8_t* input,
    uint8_t* output)
{
  const uint8_t* w = input;
  // Streaming a 4-channel group advances w by n bytes, so adding 3*n at the
  // top of each iteration places w on the last channel of the next group.
  const size_t input_increment = n * 3;
  // A group writes n positions at stride m (n*m bytes of motion total); this
  // rewinds output to the start of the next group of 4 interleaved columns.
  const size_t output_increment = 4 - m * n;
  // Start of the final channel; used to clamp the last (possibly overlapping)
  // group when m is not a multiple of 4.
  const uint8_t* last_input = w + n * (m - 1);
  // Output column offset of the last 4 channels, for the same overlap case.
  uint8_t* last_output = (uint8_t*) ((uintptr_t) output + (m - 4));

  if (n >= 8) {
    // Process channels in groups of 4 (x, y, z, w point at 4 consecutive
    // channels, with w the highest-addressed one).
    for (size_t i = 0; i < m; i += 4) {
      size_t k = n;
      w = (const uint8_t*) ((uintptr_t) w + input_increment);
      if (w >= last_input) {
        // Fewer than 4 fresh channels remain: back up so this group overlaps
        // the previous one; the last_output clamp below keeps the stores in
        // bounds (earlier columns are simply rewritten with the same data).
        w = last_input;
      }
      const uint8_t* z = (const uint8_t*) ((uintptr_t) w - n);
      const uint8_t* y = (const uint8_t*) ((uintptr_t) z - n);
      const uint8_t* x = (const uint8_t*) ((uintptr_t) y - n);
      // Main loop: transpose 16 bytes from each of the 4 channels into
      // sixteen (x,y,z,w) byte-quadruples, stored at stride m.
      while (k >= 16) {
        const __m128i vx = _mm_loadu_si128((const __m128i*) x);
        x += 16;
        const __m128i vy = _mm_loadu_si128((const __m128i*) y);
        y += 16;
        const __m128i vz = _mm_loadu_si128((const __m128i*) z);
        z += 16;
        const __m128i vw = _mm_loadu_si128((const __m128i*) w);
        w += 16;
        // Byte-interleave pairs, then 16-bit-interleave to get four vectors,
        // each holding four consecutive (x,y,z,w) quadruples in its lanes.
        const __m128i vxy_lo = _mm_unpacklo_epi8(vx, vy);
        const __m128i vxy_hi = _mm_unpackhi_epi8(vx, vy);
        const __m128i vzw_lo = _mm_unpacklo_epi8(vz, vw);
        const __m128i vzw_hi = _mm_unpackhi_epi8(vz, vw);
        __m128i vxyzw0 = _mm_unpacklo_epi16(vxy_lo, vzw_lo);
        __m128i vxyzw1 = _mm_unpackhi_epi16(vxy_lo, vzw_lo);
        __m128i vxyzw2 = _mm_unpacklo_epi16(vxy_hi, vzw_hi);
        __m128i vxyzw3 = _mm_unpackhi_epi16(vxy_hi, vzw_hi);

        // Store the four 32-bit lanes of each vector at stride m. The
        // shufflelo rotates the next 32-bit lane into position 0, and the
        // unpackhi_epi64 moves the upper half down, so _mm_cvtsi128_si32
        // always extracts lane 0.
        unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw0));
        output = (uint8_t*) ((uintptr_t) output + m);
        vxyzw0 = _mm_shufflelo_epi16(vxyzw0, _MM_SHUFFLE(3, 2, 3, 2));
        unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw0));
        output = (uint8_t*) ((uintptr_t) output + m);
        vxyzw0 = _mm_unpackhi_epi64(vxyzw0, vxyzw0);
        unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw0));
        output = (uint8_t*) ((uintptr_t) output + m);
        vxyzw0 = _mm_shufflelo_epi16(vxyzw0, _MM_SHUFFLE(3, 2, 3, 2));
        unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw0));
        output = (uint8_t*) ((uintptr_t) output + m);

        unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw1));
        output = (uint8_t*) ((uintptr_t) output + m);
        vxyzw1 = _mm_shufflelo_epi16(vxyzw1, _MM_SHUFFLE(3, 2, 3, 2));
        unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw1));
        output = (uint8_t*) ((uintptr_t) output + m);
        vxyzw1 = _mm_unpackhi_epi64(vxyzw1, vxyzw1);
        unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw1));
        output = (uint8_t*) ((uintptr_t) output + m);
        vxyzw1 = _mm_shufflelo_epi16(vxyzw1, _MM_SHUFFLE(3, 2, 3, 2));
        unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw1));
        output = (uint8_t*) ((uintptr_t) output + m);

        unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw2));
        output = (uint8_t*) ((uintptr_t) output + m);
        vxyzw2 = _mm_shufflelo_epi16(vxyzw2, _MM_SHUFFLE(3, 2, 3, 2));
        unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw2));
        output = (uint8_t*) ((uintptr_t) output + m);
        vxyzw2 = _mm_unpackhi_epi64(vxyzw2, vxyzw2);
        unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw2));
        output = (uint8_t*) ((uintptr_t) output + m);
        vxyzw2 = _mm_shufflelo_epi16(vxyzw2, _MM_SHUFFLE(3, 2, 3, 2));
        unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw2));
        output = (uint8_t*) ((uintptr_t) output + m);

        unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw3));
        output = (uint8_t*) ((uintptr_t) output + m);
        vxyzw3 = _mm_shufflelo_epi16(vxyzw3, _MM_SHUFFLE(3, 2, 3, 2));
        unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw3));
        output = (uint8_t*) ((uintptr_t) output + m);
        vxyzw3 = _mm_unpackhi_epi64(vxyzw3, vxyzw3);
        unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw3));
        output = (uint8_t*) ((uintptr_t) output + m);
        vxyzw3 = _mm_shufflelo_epi16(vxyzw3, _MM_SHUFFLE(3, 2, 3, 2));
        unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw3));
        output = (uint8_t*) ((uintptr_t) output + m);
        k -= 16;
      };
      // Half-width step: same transpose for 8 bytes per channel.
      if (k >= 8) {
        const __m128i vx = _mm_loadl_epi64((const __m128i*) x);
        x += 8;
        const __m128i vy = _mm_loadl_epi64((const __m128i*) y);
        y += 8;
        const __m128i vz = _mm_loadl_epi64((const __m128i*) z);
        z += 8;
        const __m128i vw = _mm_loadl_epi64((const __m128i*) w);
        w += 8;
        const __m128i vxy = _mm_unpacklo_epi8(vx, vy);
        const __m128i vzw = _mm_unpacklo_epi8(vz, vw);
        __m128i vxyzw0 = _mm_unpacklo_epi16(vxy, vzw);
        __m128i vxyzw1 = _mm_unpackhi_epi16(vxy, vzw);

        unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw0));
        output = (uint8_t*) ((uintptr_t) output + m);
        vxyzw0 = _mm_shufflelo_epi16(vxyzw0, _MM_SHUFFLE(3, 2, 3, 2));
        unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw0));
        output = (uint8_t*) ((uintptr_t) output + m);
        vxyzw0 = _mm_unpackhi_epi64(vxyzw0, vxyzw0);
        unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw0));
        output = (uint8_t*) ((uintptr_t) output + m);
        vxyzw0 = _mm_shufflelo_epi16(vxyzw0, _MM_SHUFFLE(3, 2, 3, 2));
        unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw0));
        output = (uint8_t*) ((uintptr_t) output + m);

        unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw1));
        output = (uint8_t*) ((uintptr_t) output + m);
        vxyzw1 = _mm_shufflelo_epi16(vxyzw1, _MM_SHUFFLE(3, 2, 3, 2));
        unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw1));
        output = (uint8_t*) ((uintptr_t) output + m);
        vxyzw1 = _mm_unpackhi_epi64(vxyzw1, vxyzw1);
        unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw1));
        output = (uint8_t*) ((uintptr_t) output + m);
        vxyzw1 = _mm_shufflelo_epi16(vxyzw1, _MM_SHUFFLE(3, 2, 3, 2));
        unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw1));
        output = (uint8_t*) ((uintptr_t) output + m);
        k -= 8;
      }
      // Remainder (1..7 bytes): re-read the last 8 bytes of each channel
      // (safe because n >= 8) and right-shift away the 8-k bytes that were
      // already processed, leaving the k fresh bytes in the low lanes.
      if (k != 0) {
        const size_t address_decrement = 8 - k;
        x -= address_decrement;
        y -= address_decrement;
        z -= address_decrement;
        w -= address_decrement;
        const __m128i vshift = _mm_cvtsi32_si128((int) address_decrement * 8);

        const __m128i vx = _mm_srl_epi64(_mm_loadl_epi64((const __m128i*) x), vshift);
        const __m128i vy = _mm_srl_epi64(_mm_loadl_epi64((const __m128i*) y), vshift);
        const __m128i vz = _mm_srl_epi64(_mm_loadl_epi64((const __m128i*) z), vshift);
        const __m128i vw = _mm_srl_epi64(_mm_loadl_epi64((const __m128i*) w), vshift);
        // Advance w past its channel so the next group's pointer math holds.
        w += 8;
        const __m128i vxy = _mm_unpacklo_epi8(vx, vy);
        const __m128i vzw = _mm_unpacklo_epi8(vz, vw);
        __m128i vxyzw0 = _mm_unpacklo_epi16(vxy, vzw);
        __m128i vxyzw1 = _mm_unpackhi_epi16(vxy, vzw);

        // Store only the k remaining quadruples, peeling 4 / 2 / 1 at a time.
        if (k & 4) {
          unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw0));
          output = (uint8_t*) ((uintptr_t) output + m);
          vxyzw0 = _mm_shufflelo_epi16(vxyzw0, _MM_SHUFFLE(3, 2, 3, 2));
          unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw0));
          output = (uint8_t*) ((uintptr_t) output + m);
          vxyzw0 = _mm_unpackhi_epi64(vxyzw0, vxyzw0);
          unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw0));
          output = (uint8_t*) ((uintptr_t) output + m);
          vxyzw0 = _mm_shufflelo_epi16(vxyzw0, _MM_SHUFFLE(3, 2, 3, 2));
          unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw0));
          output = (uint8_t*) ((uintptr_t) output + m);
          vxyzw0 = vxyzw1;
        }

        if (k & 2) {
          unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw0));
          output = (uint8_t*) ((uintptr_t) output + m);
          vxyzw0 = _mm_shufflelo_epi16(vxyzw0, _MM_SHUFFLE(3, 2, 3, 2));
          unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw0));
          output = (uint8_t*) ((uintptr_t) output + m);
          vxyzw0 = _mm_unpackhi_epi64(vxyzw0, vxyzw0);
        }
        if (k & 1) {
          unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw0));
          output = (uint8_t*) ((uintptr_t) output + m);
        }
      }
      // Rewind output to the start of the next group's columns, clamping for
      // the overlapped final group (mirrors the last_input clamp above).
      output = (uint8_t*) ((uintptr_t) output + output_increment);
      if (output > last_output) {
        output = last_output;
      }
    }
  } else {
    // n < 8: plain scalar transpose — for each of the n positions, gather
    // one byte from each of the m channels.
    const uint8_t* i = input;
    uint8_t* o = output;
    size_t k = n;
    do {
      size_t l = m;
      const uint8_t* ii = i++;
      do {
        *o++ = *ii;
        ii += n;
      } while (--l != 0);
    } while (--k != 0);
  }
}
208