// Auto-generated file. Do not edit!
//   Template: src/x32-transposec/sse2.c.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <immintrin.h>

#include <assert.h>

#include <xnnpack/common.h>
#include <xnnpack/math.h>
#include <xnnpack/transpose.h>
#include <xnnpack/unaligned.h>


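// Transposes a block_width x block_height matrix of 8-bit elements from
// `input` to `output`, working in 16x16 tiles: each tile is loaded as 16
// rows, transposed in SSE2 registers with an unpack network, and stored as
// 16 columns.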
void xnn_x8_transposec_ukernel__16x16_reuse_switch_sse2(
    const uint8_t* input,
    uint8_t* output,
    size_t input_stride,
    size_t output_stride,
    size_t block_width,
    size_t block_height) XNN_OOB_READS
{
  assert(output_stride >= block_height * sizeof(uint8_t));
  assert(input_stride >= block_width * sizeof(uint8_t));

  const size_t tile_height = 16;
  const size_t tile_width = 16;
  const size_t tile_hbytes = tile_height * sizeof(uint8_t);
  const size_t tile_wbytes = tile_width * sizeof(uint8_t);
  const size_t input_reset = tile_wbytes - round_down_po2(block_height, tile_height) * input_stride;
  const size_t output_reset = tile_width * output_stride - round_down_po2(block_height, 2) * sizeof(uint8_t);

  const uint8_t* i0 = input;
  uint8_t* o = (uint8_t*) output;
  const size_t minus_output_stride = -output_stride;

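  // Process the matrix one strip of 16 columns at a time. `rem` is the
  // highest input-column index (0..15) that is valid within this strip, and
  // `oN_stride` offsets the output pointer to the corresponding output row;
  // the store switches below fall through from that row back to row 0.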
  do {
    const size_t rem = min(block_width - 1, 15);
    const size_t oN_stride = rem * output_stride;
    size_t bh = block_height;
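    // Full tiles: transpose and store 16 complete rows per iteration.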
    for (; bh >= 16; bh -= 16) {
      const __m128i v4_0 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const __m128i v4_1 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const __m128i v4_2 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const __m128i v4_3 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const __m128i v4_4 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const __m128i v4_5 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const __m128i v4_6 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const __m128i v4_7 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const __m128i v4_8 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const __m128i v4_9 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const __m128i v4_10 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const __m128i v4_11 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const __m128i v4_12 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const __m128i v4_13 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const __m128i v4_14 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const __m128i v4_15 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);

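      // 16x16 byte transpose via four rounds of interleaves: 8-bit unpacks
      // pair adjacent rows, then 16-bit, 32-bit, and 64-bit unpacks merge
      // pairs of pairs. Afterwards v0_N holds column N of the tile.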
      const __m128i v3_0 = _mm_unpacklo_epi8(v4_0, v4_1);
      const __m128i v3_1 = _mm_unpackhi_epi8(v4_0, v4_1);
      const __m128i v3_2 = _mm_unpacklo_epi8(v4_2, v4_3);
      const __m128i v3_3 = _mm_unpackhi_epi8(v4_2, v4_3);
      const __m128i v3_4 = _mm_unpacklo_epi8(v4_4, v4_5);
      const __m128i v3_5 = _mm_unpackhi_epi8(v4_4, v4_5);
      const __m128i v3_6 = _mm_unpacklo_epi8(v4_6, v4_7);
      const __m128i v3_7 = _mm_unpackhi_epi8(v4_6, v4_7);
      const __m128i v3_8 = _mm_unpacklo_epi8(v4_8, v4_9);
      const __m128i v3_9 = _mm_unpackhi_epi8(v4_8, v4_9);
      const __m128i v3_10 = _mm_unpacklo_epi8(v4_10, v4_11);
      const __m128i v3_11 = _mm_unpackhi_epi8(v4_10, v4_11);
      const __m128i v3_12 = _mm_unpacklo_epi8(v4_12, v4_13);
      const __m128i v3_13 = _mm_unpackhi_epi8(v4_12, v4_13);
      const __m128i v3_14 = _mm_unpacklo_epi8(v4_14, v4_15);
      const __m128i v3_15 = _mm_unpackhi_epi8(v4_14, v4_15);

      const __m128i v2_0 = _mm_unpacklo_epi16(v3_0, v3_2);
      const __m128i v2_1 = _mm_unpackhi_epi16(v3_0, v3_2);
      const __m128i v2_2 = _mm_unpacklo_epi16(v3_1, v3_3);
      const __m128i v2_3 = _mm_unpackhi_epi16(v3_1, v3_3);
      const __m128i v2_4 = _mm_unpacklo_epi16(v3_4, v3_6);
      const __m128i v2_5 = _mm_unpackhi_epi16(v3_4, v3_6);
      const __m128i v2_6 = _mm_unpacklo_epi16(v3_5, v3_7);
      const __m128i v2_7 = _mm_unpackhi_epi16(v3_5, v3_7);
      const __m128i v2_8 = _mm_unpacklo_epi16(v3_8, v3_10);
      const __m128i v2_9 = _mm_unpackhi_epi16(v3_8, v3_10);
      const __m128i v2_10 = _mm_unpacklo_epi16(v3_9, v3_11);
      const __m128i v2_11 = _mm_unpackhi_epi16(v3_9, v3_11);
      const __m128i v2_12 = _mm_unpacklo_epi16(v3_12, v3_14);
      const __m128i v2_13 = _mm_unpackhi_epi16(v3_12, v3_14);
      const __m128i v2_14 = _mm_unpacklo_epi16(v3_13, v3_15);
      const __m128i v2_15 = _mm_unpackhi_epi16(v3_13, v3_15);

      const __m128i v1_0 = _mm_unpacklo_epi32(v2_0, v2_4);
      const __m128i v1_1 = _mm_unpackhi_epi32(v2_0, v2_4);
      const __m128i v1_2 = _mm_unpacklo_epi32(v2_1, v2_5);
      const __m128i v1_3 = _mm_unpackhi_epi32(v2_1, v2_5);
      const __m128i v1_4 = _mm_unpacklo_epi32(v2_2, v2_6);
      const __m128i v1_5 = _mm_unpackhi_epi32(v2_2, v2_6);
      const __m128i v1_6 = _mm_unpacklo_epi32(v2_3, v2_7);
      const __m128i v1_7 = _mm_unpackhi_epi32(v2_3, v2_7);
      const __m128i v1_8 = _mm_unpacklo_epi32(v2_8, v2_12);
      const __m128i v1_9 = _mm_unpackhi_epi32(v2_8, v2_12);
      const __m128i v1_10 = _mm_unpacklo_epi32(v2_9, v2_13);
      const __m128i v1_11 = _mm_unpackhi_epi32(v2_9, v2_13);
      const __m128i v1_12 = _mm_unpacklo_epi32(v2_10, v2_14);
      const __m128i v1_13 = _mm_unpackhi_epi32(v2_10, v2_14);
      const __m128i v1_14 = _mm_unpacklo_epi32(v2_11, v2_15);
      const __m128i v1_15 = _mm_unpackhi_epi32(v2_11, v2_15);

      const __m128i v0_0 = _mm_unpacklo_epi64(v1_0, v1_8);
      const __m128i v0_1 = _mm_unpackhi_epi64(v1_0, v1_8);
      const __m128i v0_2 = _mm_unpacklo_epi64(v1_1, v1_9);
      const __m128i v0_3 = _mm_unpackhi_epi64(v1_1, v1_9);
      const __m128i v0_4 = _mm_unpacklo_epi64(v1_2, v1_10);
      const __m128i v0_5 = _mm_unpackhi_epi64(v1_2, v1_10);
      const __m128i v0_6 = _mm_unpacklo_epi64(v1_3, v1_11);
      const __m128i v0_7 = _mm_unpackhi_epi64(v1_3, v1_11);
      const __m128i v0_8 = _mm_unpacklo_epi64(v1_4, v1_12);
      const __m128i v0_9 = _mm_unpackhi_epi64(v1_4, v1_12);
      const __m128i v0_10 = _mm_unpacklo_epi64(v1_5, v1_13);
      const __m128i v0_11 = _mm_unpackhi_epi64(v1_5, v1_13);
      const __m128i v0_12 = _mm_unpacklo_epi64(v1_6, v1_14);
      const __m128i v0_13 = _mm_unpackhi_epi64(v1_6, v1_14);
      const __m128i v0_14 = _mm_unpacklo_epi64(v1_7, v1_15);
      const __m128i v0_15 = _mm_unpackhi_epi64(v1_7, v1_15);

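      // Store the 16 transposed columns. The switch falls through from the
      // highest valid output row down to row 0, stepping back one output
      // row (`minus_output_stride`) after each 16-byte store.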
      uint8_t* oN = (uint8_t*) ((uintptr_t) o + oN_stride);
      switch (rem) {
        case 15:
          _mm_storeu_si128((__m128i*) oN, v0_15);
          oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
        case 14:
          _mm_storeu_si128((__m128i*) oN, v0_14);
          oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
        case 13:
          _mm_storeu_si128((__m128i*) oN, v0_13);
          oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
        case 12:
          _mm_storeu_si128((__m128i*) oN, v0_12);
          oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
        case 11:
          _mm_storeu_si128((__m128i*) oN, v0_11);
          oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
        case 10:
          _mm_storeu_si128((__m128i*) oN, v0_10);
          oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
        case 9:
          _mm_storeu_si128((__m128i*) oN, v0_9);
          oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
        case 8:
          _mm_storeu_si128((__m128i*) oN, v0_8);
          oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
        case 7:
          _mm_storeu_si128((__m128i*) oN, v0_7);
          oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
        case 6:
          _mm_storeu_si128((__m128i*) oN, v0_6);
          oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
        case 5:
          _mm_storeu_si128((__m128i*) oN, v0_5);
          oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
        case 4:
          _mm_storeu_si128((__m128i*) oN, v0_4);
          oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
        case 3:
          _mm_storeu_si128((__m128i*) oN, v0_3);
          oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
        case 2:
          _mm_storeu_si128((__m128i*) oN, v0_2);
          oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
        case 1:
          _mm_storeu_si128((__m128i*) oN, v0_1);
        case 0:
          _mm_storeu_si128((__m128i*) o, v0_0);
          o = (uint8_t*) ((uintptr_t) o + tile_hbytes);
          break;
        default:
          XNN_UNREACHABLE;
      }
    }
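    // Tail: fewer than 16 rows remain. Clamp each per-row input pointer so
    // rows beyond `bh` alias the last valid row; the extra lanes are
    // computed but never stored.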
    if (bh != 0) {
      const __m128i v4_0 = _mm_loadu_si128((const __m128i*) i0);
      const uint8_t *i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
      if XNN_UNPREDICTABLE(bh < 2) {
        i1 = i0;
      }
      const __m128i v4_1 = _mm_loadu_si128((const __m128i*) i1);
      const uint8_t *i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
      if XNN_UNPREDICTABLE(bh <= 2) {
        i2 = i1;
      }
      const __m128i v4_2 = _mm_loadu_si128((const __m128i*) i2);
      const uint8_t *i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
      if XNN_UNPREDICTABLE(bh < 4) {
        i3 = i2;
      }
      const __m128i v4_3 = _mm_loadu_si128((const __m128i*) i3);
      const uint8_t *i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
      if XNN_UNPREDICTABLE(bh <= 4) {
        i4 = i3;
      }
      const __m128i v4_4 = _mm_loadu_si128((const __m128i*) i4);
      const uint8_t *i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
      if XNN_UNPREDICTABLE(bh < 6) {
        i5 = i4;
      }
      const __m128i v4_5 = _mm_loadu_si128((const __m128i*) i5);
      const uint8_t *i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
      if XNN_UNPREDICTABLE(bh <= 6) {
        i6 = i5;
      }
      const __m128i v4_6 = _mm_loadu_si128((const __m128i*) i6);
      const uint8_t *i7 = (const uint8_t*) ((uintptr_t) i6 + input_stride);
      if XNN_UNPREDICTABLE(bh < 8) {
        i7 = i6;
      }
      const __m128i v4_7 = _mm_loadu_si128((const __m128i*) i7);
      const uint8_t *i8 = (const uint8_t*) ((uintptr_t) i7 + input_stride);
      if XNN_UNPREDICTABLE(bh <= 8) {
        i8 = i7;
      }
      const __m128i v4_8 = _mm_loadu_si128((const __m128i*) i8);
      const uint8_t *i9 = (const uint8_t*) ((uintptr_t) i8 + input_stride);
      if XNN_UNPREDICTABLE(bh < 10) {
        i9 = i8;
      }
      const __m128i v4_9 = _mm_loadu_si128((const __m128i*) i9);
      const uint8_t *i10 = (const uint8_t*) ((uintptr_t) i9 + input_stride);
      if XNN_UNPREDICTABLE(bh <= 10) {
        i10 = i9;
      }
      const __m128i v4_10 = _mm_loadu_si128((const __m128i*) i10);
      const uint8_t *i11 = (const uint8_t*) ((uintptr_t) i10 + input_stride);
      if XNN_UNPREDICTABLE(bh < 12) {
        i11 = i10;
      }
      const __m128i v4_11 = _mm_loadu_si128((const __m128i*) i11);
      const uint8_t *i12 = (const uint8_t*) ((uintptr_t) i11 + input_stride);
      if XNN_UNPREDICTABLE(bh <= 12) {
        i12 = i11;
      }
      const __m128i v4_12 = _mm_loadu_si128((const __m128i*) i12);
      const uint8_t *i13 = (const uint8_t*) ((uintptr_t) i12 + input_stride);
      if XNN_UNPREDICTABLE(bh < 14) {
        i13 = i12;
      }
      const __m128i v4_13 = _mm_loadu_si128((const __m128i*) i13);
      const uint8_t *i14 = (const uint8_t*) ((uintptr_t) i13 + input_stride);
      if XNN_UNPREDICTABLE(bh <= 14) {
        i14 = i13;
      }
      const __m128i v4_14 = _mm_loadu_si128((const __m128i*) i14);
      const __m128i v4_15 = _mm_undefined_si128();

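      // Same four-round unpack network as the full-tile path; v4_15 is
      // undefined since at most 15 rows are live here.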
      const __m128i v3_0 = _mm_unpacklo_epi8(v4_0, v4_1);
      const __m128i v3_1 = _mm_unpackhi_epi8(v4_0, v4_1);
      const __m128i v3_2 = _mm_unpacklo_epi8(v4_2, v4_3);
      const __m128i v3_3 = _mm_unpackhi_epi8(v4_2, v4_3);
      const __m128i v3_4 = _mm_unpacklo_epi8(v4_4, v4_5);
      const __m128i v3_5 = _mm_unpackhi_epi8(v4_4, v4_5);
      const __m128i v3_6 = _mm_unpacklo_epi8(v4_6, v4_7);
      const __m128i v3_7 = _mm_unpackhi_epi8(v4_6, v4_7);
      const __m128i v3_8 = _mm_unpacklo_epi8(v4_8, v4_9);
      const __m128i v3_9 = _mm_unpackhi_epi8(v4_8, v4_9);
      const __m128i v3_10 = _mm_unpacklo_epi8(v4_10, v4_11);
      const __m128i v3_11 = _mm_unpackhi_epi8(v4_10, v4_11);
      const __m128i v3_12 = _mm_unpacklo_epi8(v4_12, v4_13);
      const __m128i v3_13 = _mm_unpackhi_epi8(v4_12, v4_13);
      const __m128i v3_14 = _mm_unpacklo_epi8(v4_14, v4_15);
      const __m128i v3_15 = _mm_unpackhi_epi8(v4_14, v4_15);

      const __m128i v2_0 = _mm_unpacklo_epi16(v3_0, v3_2);
      const __m128i v2_1 = _mm_unpackhi_epi16(v3_0, v3_2);
      const __m128i v2_2 = _mm_unpacklo_epi16(v3_1, v3_3);
      const __m128i v2_3 = _mm_unpackhi_epi16(v3_1, v3_3);
      const __m128i v2_4 = _mm_unpacklo_epi16(v3_4, v3_6);
      const __m128i v2_5 = _mm_unpackhi_epi16(v3_4, v3_6);
      const __m128i v2_6 = _mm_unpacklo_epi16(v3_5, v3_7);
      const __m128i v2_7 = _mm_unpackhi_epi16(v3_5, v3_7);
      const __m128i v2_8 = _mm_unpacklo_epi16(v3_8, v3_10);
      const __m128i v2_9 = _mm_unpackhi_epi16(v3_8, v3_10);
      const __m128i v2_10 = _mm_unpacklo_epi16(v3_9, v3_11);
      const __m128i v2_11 = _mm_unpackhi_epi16(v3_9, v3_11);
      const __m128i v2_12 = _mm_unpacklo_epi16(v3_12, v3_14);
      const __m128i v2_13 = _mm_unpackhi_epi16(v3_12, v3_14);
      const __m128i v2_14 = _mm_unpacklo_epi16(v3_13, v3_15);
      const __m128i v2_15 = _mm_unpackhi_epi16(v3_13, v3_15);

      const __m128i v1_0 = _mm_unpacklo_epi32(v2_0, v2_4);
      const __m128i v1_1 = _mm_unpackhi_epi32(v2_0, v2_4);
      const __m128i v1_2 = _mm_unpacklo_epi32(v2_1, v2_5);
      const __m128i v1_3 = _mm_unpackhi_epi32(v2_1, v2_5);
      const __m128i v1_4 = _mm_unpacklo_epi32(v2_2, v2_6);
      const __m128i v1_5 = _mm_unpackhi_epi32(v2_2, v2_6);
      const __m128i v1_6 = _mm_unpacklo_epi32(v2_3, v2_7);
      const __m128i v1_7 = _mm_unpackhi_epi32(v2_3, v2_7);
      const __m128i v1_8 = _mm_unpacklo_epi32(v2_8, v2_12);
      const __m128i v1_9 = _mm_unpackhi_epi32(v2_8, v2_12);
      const __m128i v1_10 = _mm_unpacklo_epi32(v2_9, v2_13);
      const __m128i v1_11 = _mm_unpackhi_epi32(v2_9, v2_13);
      const __m128i v1_12 = _mm_unpacklo_epi32(v2_10, v2_14);
      const __m128i v1_13 = _mm_unpackhi_epi32(v2_10, v2_14);
      const __m128i v1_14 = _mm_unpacklo_epi32(v2_11, v2_15);
      const __m128i v1_15 = _mm_unpackhi_epi32(v2_11, v2_15);

      __m128i v0_0 = _mm_unpacklo_epi64(v1_0, v1_8);
      __m128i v0_1 = _mm_unpackhi_epi64(v1_0, v1_8);
      __m128i v0_2 = _mm_unpacklo_epi64(v1_1, v1_9);
      __m128i v0_3 = _mm_unpackhi_epi64(v1_1, v1_9);
      __m128i v0_4 = _mm_unpacklo_epi64(v1_2, v1_10);
      __m128i v0_5 = _mm_unpackhi_epi64(v1_2, v1_10);
      __m128i v0_6 = _mm_unpacklo_epi64(v1_3, v1_11);
      __m128i v0_7 = _mm_unpackhi_epi64(v1_3, v1_11);
      __m128i v0_8 = _mm_unpacklo_epi64(v1_4, v1_12);
      __m128i v0_9 = _mm_unpackhi_epi64(v1_4, v1_12);
      __m128i v0_10 = _mm_unpacklo_epi64(v1_5, v1_13);
      __m128i v0_11 = _mm_unpackhi_epi64(v1_5, v1_13);
      __m128i v0_12 = _mm_unpacklo_epi64(v1_6, v1_14);
      __m128i v0_13 = _mm_unpackhi_epi64(v1_6, v1_14);
      __m128i v0_14 = _mm_unpacklo_epi64(v1_7, v1_15);
      __m128i v0_15 = _mm_unpackhi_epi64(v1_7, v1_15);

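      // Store the remaining `bh` elements of each column piecewise: the set
      // bits of bh select 8-, 4-, 2-, and 1-byte stores, and after each pass
      // the vectors are shifted so the next piece sits in the low lane.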
      if (bh & 8) {
        uint8_t* oN = (uint8_t*) ((uintptr_t) o + oN_stride);
        switch (rem) {
          case 15:
            _mm_storel_epi64((__m128i*) oN, v0_15);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 14:
            _mm_storel_epi64((__m128i*) oN, v0_14);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 13:
            _mm_storel_epi64((__m128i*) oN, v0_13);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 12:
            _mm_storel_epi64((__m128i*) oN, v0_12);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 11:
            _mm_storel_epi64((__m128i*) oN, v0_11);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 10:
            _mm_storel_epi64((__m128i*) oN, v0_10);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 9:
            _mm_storel_epi64((__m128i*) oN, v0_9);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 8:
            _mm_storel_epi64((__m128i*) oN, v0_8);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 7:
            _mm_storel_epi64((__m128i*) oN, v0_7);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 6:
            _mm_storel_epi64((__m128i*) oN, v0_6);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 5:
            _mm_storel_epi64((__m128i*) oN, v0_5);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 4:
            _mm_storel_epi64((__m128i*) oN, v0_4);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 3:
            _mm_storel_epi64((__m128i*) oN, v0_3);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 2:
            _mm_storel_epi64((__m128i*) oN, v0_2);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 1:
            _mm_storel_epi64((__m128i*) oN, v0_1);
          case 0:
            _mm_storel_epi64((__m128i*) o, v0_0);
            break;
          default:
            XNN_UNREACHABLE;
        }
        o += 8;
        v0_0 = _mm_unpackhi_epi64(v0_0, v0_0);
        v0_1 = _mm_unpackhi_epi64(v0_1, v0_1);
        v0_2 = _mm_unpackhi_epi64(v0_2, v0_2);
        v0_3 = _mm_unpackhi_epi64(v0_3, v0_3);
        v0_4 = _mm_unpackhi_epi64(v0_4, v0_4);
        v0_5 = _mm_unpackhi_epi64(v0_5, v0_5);
        v0_6 = _mm_unpackhi_epi64(v0_6, v0_6);
        v0_7 = _mm_unpackhi_epi64(v0_7, v0_7);
        v0_8 = _mm_unpackhi_epi64(v0_8, v0_8);
        v0_9 = _mm_unpackhi_epi64(v0_9, v0_9);
        v0_10 = _mm_unpackhi_epi64(v0_10, v0_10);
        v0_11 = _mm_unpackhi_epi64(v0_11, v0_11);
        v0_12 = _mm_unpackhi_epi64(v0_12, v0_12);
        v0_13 = _mm_unpackhi_epi64(v0_13, v0_13);
        v0_14 = _mm_unpackhi_epi64(v0_14, v0_14);
        v0_15 = _mm_unpackhi_epi64(v0_15, v0_15);
      }

      if (bh & 4) {
        uint8_t* oN = (uint8_t*) ((uintptr_t) o + oN_stride);
        switch (rem) {
          case 15:
            unaligned_store_u32(oN, (uint32_t) _mm_cvtsi128_si32(v0_15));
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 14:
            unaligned_store_u32(oN, (uint32_t) _mm_cvtsi128_si32(v0_14));
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 13:
            unaligned_store_u32(oN, (uint32_t) _mm_cvtsi128_si32(v0_13));
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 12:
            unaligned_store_u32(oN, (uint32_t) _mm_cvtsi128_si32(v0_12));
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 11:
            unaligned_store_u32(oN, (uint32_t) _mm_cvtsi128_si32(v0_11));
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 10:
            unaligned_store_u32(oN, (uint32_t) _mm_cvtsi128_si32(v0_10));
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 9:
            unaligned_store_u32(oN, (uint32_t) _mm_cvtsi128_si32(v0_9));
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 8:
            unaligned_store_u32(oN, (uint32_t) _mm_cvtsi128_si32(v0_8));
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 7:
            unaligned_store_u32(oN, (uint32_t) _mm_cvtsi128_si32(v0_7));
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 6:
            unaligned_store_u32(oN, (uint32_t) _mm_cvtsi128_si32(v0_6));
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 5:
            unaligned_store_u32(oN, (uint32_t) _mm_cvtsi128_si32(v0_5));
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 4:
            unaligned_store_u32(oN, (uint32_t) _mm_cvtsi128_si32(v0_4));
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 3:
            unaligned_store_u32(oN, (uint32_t) _mm_cvtsi128_si32(v0_3));
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 2:
            unaligned_store_u32(oN, (uint32_t) _mm_cvtsi128_si32(v0_2));
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 1:
            unaligned_store_u32(oN, (uint32_t) _mm_cvtsi128_si32(v0_1));
          case 0:
            unaligned_store_u32(o, (uint32_t) _mm_cvtsi128_si32(v0_0));
            break;
          default:
            XNN_UNREACHABLE;
        }
        o += 4;
        v0_0 = _mm_srli_epi64(v0_0, 32);
        v0_1 = _mm_srli_epi64(v0_1, 32);
        v0_2 = _mm_srli_epi64(v0_2, 32);
        v0_3 = _mm_srli_epi64(v0_3, 32);
        v0_4 = _mm_srli_epi64(v0_4, 32);
        v0_5 = _mm_srli_epi64(v0_5, 32);
        v0_6 = _mm_srli_epi64(v0_6, 32);
        v0_7 = _mm_srli_epi64(v0_7, 32);
        v0_8 = _mm_srli_epi64(v0_8, 32);
        v0_9 = _mm_srli_epi64(v0_9, 32);
        v0_10 = _mm_srli_epi64(v0_10, 32);
        v0_11 = _mm_srli_epi64(v0_11, 32);
        v0_12 = _mm_srli_epi64(v0_12, 32);
        v0_13 = _mm_srli_epi64(v0_13, 32);
        v0_14 = _mm_srli_epi64(v0_14, 32);
        v0_15 = _mm_srli_epi64(v0_15, 32);
      }
      if (bh & 2) {
        uint8_t* oN = (uint8_t*) ((uintptr_t) o + oN_stride);
        switch (rem) {
          case 15:
            unaligned_store_u16(oN, (uint16_t) _mm_cvtsi128_si32(v0_15));
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 14:
            unaligned_store_u16(oN, (uint16_t) _mm_cvtsi128_si32(v0_14));
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 13:
            unaligned_store_u16(oN, (uint16_t) _mm_cvtsi128_si32(v0_13));
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 12:
            unaligned_store_u16(oN, (uint16_t) _mm_cvtsi128_si32(v0_12));
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 11:
            unaligned_store_u16(oN, (uint16_t) _mm_cvtsi128_si32(v0_11));
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 10:
            unaligned_store_u16(oN, (uint16_t) _mm_cvtsi128_si32(v0_10));
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 9:
            unaligned_store_u16(oN, (uint16_t) _mm_cvtsi128_si32(v0_9));
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 8:
            unaligned_store_u16(oN, (uint16_t) _mm_cvtsi128_si32(v0_8));
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 7:
            unaligned_store_u16(oN, (uint16_t) _mm_cvtsi128_si32(v0_7));
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 6:
            unaligned_store_u16(oN, (uint16_t) _mm_cvtsi128_si32(v0_6));
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 5:
            unaligned_store_u16(oN, (uint16_t) _mm_cvtsi128_si32(v0_5));
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 4:
            unaligned_store_u16(oN, (uint16_t) _mm_cvtsi128_si32(v0_4));
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 3:
            unaligned_store_u16(oN, (uint16_t) _mm_cvtsi128_si32(v0_3));
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 2:
            unaligned_store_u16(oN, (uint16_t) _mm_cvtsi128_si32(v0_2));
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 1:
            unaligned_store_u16(oN, (uint16_t) _mm_cvtsi128_si32(v0_1));
          case 0:
            unaligned_store_u16(o, (uint16_t) _mm_cvtsi128_si32(v0_0));
            break;
          default:
            XNN_UNREACHABLE;
        }
        o += 2;
        v0_0 = _mm_srli_epi32(v0_0, 16);
        v0_1 = _mm_srli_epi32(v0_1, 16);
        v0_2 = _mm_srli_epi32(v0_2, 16);
        v0_3 = _mm_srli_epi32(v0_3, 16);
        v0_4 = _mm_srli_epi32(v0_4, 16);
        v0_5 = _mm_srli_epi32(v0_5, 16);
        v0_6 = _mm_srli_epi32(v0_6, 16);
        v0_7 = _mm_srli_epi32(v0_7, 16);
        v0_8 = _mm_srli_epi32(v0_8, 16);
        v0_9 = _mm_srli_epi32(v0_9, 16);
        v0_10 = _mm_srli_epi32(v0_10, 16);
        v0_11 = _mm_srli_epi32(v0_11, 16);
        v0_12 = _mm_srli_epi32(v0_12, 16);
        v0_13 = _mm_srli_epi32(v0_13, 16);
        v0_14 = _mm_srli_epi32(v0_14, 16);
        v0_15 = _mm_srli_epi32(v0_15, 16);
      }
      if (bh & 1) {
        uint8_t* oN = (uint8_t*) ((uintptr_t) o + oN_stride);
        switch (rem) {
          case 15:
            *oN = (uint8_t) _mm_cvtsi128_si32(v0_15);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 14:
            *oN = (uint8_t) _mm_cvtsi128_si32(v0_14);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 13:
            *oN = (uint8_t) _mm_cvtsi128_si32(v0_13);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 12:
            *oN = (uint8_t) _mm_cvtsi128_si32(v0_12);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 11:
            *oN = (uint8_t) _mm_cvtsi128_si32(v0_11);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 10:
            *oN = (uint8_t) _mm_cvtsi128_si32(v0_10);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 9:
            *oN = (uint8_t) _mm_cvtsi128_si32(v0_9);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 8:
            *oN = (uint8_t) _mm_cvtsi128_si32(v0_8);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 7:
            *oN = (uint8_t) _mm_cvtsi128_si32(v0_7);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 6:
            *oN = (uint8_t) _mm_cvtsi128_si32(v0_6);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 5:
            *oN = (uint8_t) _mm_cvtsi128_si32(v0_5);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 4:
            *oN = (uint8_t) _mm_cvtsi128_si32(v0_4);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 3:
            *oN = (uint8_t) _mm_cvtsi128_si32(v0_3);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 2:
            *oN = (uint8_t) _mm_cvtsi128_si32(v0_2);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 1:
            *oN = (uint8_t) _mm_cvtsi128_si32(v0_1);
          case 0:
            *o = (uint8_t) _mm_cvtsi128_si32(v0_0);
            break;
          default:
            XNN_UNREACHABLE;
        }
      }
    }

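    // Advance to the next 16-column strip: rewind the input to the top row
    // plus 16 bytes, and move the output pointer down 16 rows, back to the
    // strip's first column.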
    i0 = (const uint8_t*) ((uintptr_t) i0 + input_reset);
    o = (uint8_t*) ((uintptr_t) o + output_reset);
    block_width = doz(block_width, tile_width);
  } while (block_width != 0);
}