// Auto-generated file. Do not edit!
//   Template: src/x32-transposec/sse2.c.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <immintrin.h>

#include <assert.h>

#include <xnnpack/common.h>
#include <xnnpack/math.h>
#include <xnnpack/transpose.h>
#include <xnnpack/unaligned.h>

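// Transposes a block_height x block_width matrix of 8-bit elements in
// 16x16 tiles held in SSE2 registers. In this variant a single input
// pointer is reused and advanced row by row ("reuse"), and the output row
// pointer is updated through conditional moves ("mov").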
void xnn_x8_transposec_ukernel__16x16_reuse_mov_sse2(
    const uint8_t* input,
    uint8_t* output,
    size_t input_stride,
    size_t output_stride,
    size_t block_width,
    size_t block_height) XNN_OOB_READS
{
  assert(output_stride >= block_height * sizeof(uint8_t));
  assert(input_stride >= block_width * sizeof(uint8_t));

  const size_t tile_height = 16;
  const size_t tile_width = 16;
  const size_t tile_hbytes = tile_height * sizeof(uint8_t);
  const size_t tile_wbytes = tile_width * sizeof(uint8_t);
  const size_t input_reset = tile_wbytes - round_down_po2(block_height, tile_height) * input_stride;
  const size_t output_reset = tile_width * output_stride - round_down_po2(block_height, 2) * sizeof(uint8_t) - tile_hbytes;

  const uint8_t* i0 = input;
  uint8_t* o = (uint8_t*) ((uintptr_t) output - tile_hbytes);
  const size_t minus_output_stride = -output_stride;

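  // Outer loop: one iteration per 16-column strip of the input, i.e. per
  // 16-row strip of the output.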
  do {
    const size_t rem = min(block_width - 1, 15);
    const size_t oN_stride = rem * output_stride;
    const size_t oN_offset = oN_stride + tile_hbytes;
    size_t bh = block_height;
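    // Process full 16x16 tiles while at least 16 input rows remain.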
    for (; bh >= 16; bh -= 16) {
      const __m128i v4_0 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const __m128i v4_1 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const __m128i v4_2 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const __m128i v4_3 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const __m128i v4_4 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const __m128i v4_5 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const __m128i v4_6 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const __m128i v4_7 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const __m128i v4_8 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const __m128i v4_9 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const __m128i v4_10 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const __m128i v4_11 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const __m128i v4_12 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const __m128i v4_13 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const __m128i v4_14 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const __m128i v4_15 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);

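      // Transpose the 16x16 tile in registers with a 4-stage interleave
      // network: unpack bytes, then 16-, 32-, and 64-bit lanes.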
      const __m128i v3_0 = _mm_unpacklo_epi8(v4_0, v4_1);
      const __m128i v3_1 = _mm_unpackhi_epi8(v4_0, v4_1);
      const __m128i v3_2 = _mm_unpacklo_epi8(v4_2, v4_3);
      const __m128i v3_3 = _mm_unpackhi_epi8(v4_2, v4_3);
      const __m128i v3_4 = _mm_unpacklo_epi8(v4_4, v4_5);
      const __m128i v3_5 = _mm_unpackhi_epi8(v4_4, v4_5);
      const __m128i v3_6 = _mm_unpacklo_epi8(v4_6, v4_7);
      const __m128i v3_7 = _mm_unpackhi_epi8(v4_6, v4_7);
      const __m128i v3_8 = _mm_unpacklo_epi8(v4_8, v4_9);
      const __m128i v3_9 = _mm_unpackhi_epi8(v4_8, v4_9);
      const __m128i v3_10 = _mm_unpacklo_epi8(v4_10, v4_11);
      const __m128i v3_11 = _mm_unpackhi_epi8(v4_10, v4_11);
      const __m128i v3_12 = _mm_unpacklo_epi8(v4_12, v4_13);
      const __m128i v3_13 = _mm_unpackhi_epi8(v4_12, v4_13);
      const __m128i v3_14 = _mm_unpacklo_epi8(v4_14, v4_15);
      const __m128i v3_15 = _mm_unpackhi_epi8(v4_14, v4_15);

      const __m128i v2_0 = _mm_unpacklo_epi16(v3_0, v3_2);
      const __m128i v2_1 = _mm_unpackhi_epi16(v3_0, v3_2);
      const __m128i v2_2 = _mm_unpacklo_epi16(v3_1, v3_3);
      const __m128i v2_3 = _mm_unpackhi_epi16(v3_1, v3_3);
      const __m128i v2_4 = _mm_unpacklo_epi16(v3_4, v3_6);
      const __m128i v2_5 = _mm_unpackhi_epi16(v3_4, v3_6);
      const __m128i v2_6 = _mm_unpacklo_epi16(v3_5, v3_7);
      const __m128i v2_7 = _mm_unpackhi_epi16(v3_5, v3_7);
      const __m128i v2_8 = _mm_unpacklo_epi16(v3_8, v3_10);
      const __m128i v2_9 = _mm_unpackhi_epi16(v3_8, v3_10);
      const __m128i v2_10 = _mm_unpacklo_epi16(v3_9, v3_11);
      const __m128i v2_11 = _mm_unpackhi_epi16(v3_9, v3_11);
      const __m128i v2_12 = _mm_unpacklo_epi16(v3_12, v3_14);
      const __m128i v2_13 = _mm_unpackhi_epi16(v3_12, v3_14);
      const __m128i v2_14 = _mm_unpacklo_epi16(v3_13, v3_15);
      const __m128i v2_15 = _mm_unpackhi_epi16(v3_13, v3_15);

      const __m128i v1_0 = _mm_unpacklo_epi32(v2_0, v2_4);
      const __m128i v1_1 = _mm_unpackhi_epi32(v2_0, v2_4);
      const __m128i v1_2 = _mm_unpacklo_epi32(v2_1, v2_5);
      const __m128i v1_3 = _mm_unpackhi_epi32(v2_1, v2_5);
      const __m128i v1_4 = _mm_unpacklo_epi32(v2_2, v2_6);
      const __m128i v1_5 = _mm_unpackhi_epi32(v2_2, v2_6);
      const __m128i v1_6 = _mm_unpacklo_epi32(v2_3, v2_7);
      const __m128i v1_7 = _mm_unpackhi_epi32(v2_3, v2_7);
      const __m128i v1_8 = _mm_unpacklo_epi32(v2_8, v2_12);
      const __m128i v1_9 = _mm_unpackhi_epi32(v2_8, v2_12);
      const __m128i v1_10 = _mm_unpacklo_epi32(v2_9, v2_13);
      const __m128i v1_11 = _mm_unpackhi_epi32(v2_9, v2_13);
      const __m128i v1_12 = _mm_unpacklo_epi32(v2_10, v2_14);
      const __m128i v1_13 = _mm_unpackhi_epi32(v2_10, v2_14);
      const __m128i v1_14 = _mm_unpacklo_epi32(v2_11, v2_15);
      const __m128i v1_15 = _mm_unpackhi_epi32(v2_11, v2_15);

      const __m128i v0_0 = _mm_unpacklo_epi64(v1_0, v1_8);
      const __m128i v0_1 = _mm_unpackhi_epi64(v1_0, v1_8);
      const __m128i v0_2 = _mm_unpacklo_epi64(v1_1, v1_9);
      const __m128i v0_3 = _mm_unpackhi_epi64(v1_1, v1_9);
      const __m128i v0_4 = _mm_unpacklo_epi64(v1_2, v1_10);
      const __m128i v0_5 = _mm_unpackhi_epi64(v1_2, v1_10);
      const __m128i v0_6 = _mm_unpacklo_epi64(v1_3, v1_11);
      const __m128i v0_7 = _mm_unpackhi_epi64(v1_3, v1_11);
      const __m128i v0_8 = _mm_unpacklo_epi64(v1_4, v1_12);
      const __m128i v0_9 = _mm_unpackhi_epi64(v1_4, v1_12);
      const __m128i v0_10 = _mm_unpacklo_epi64(v1_5, v1_13);
      const __m128i v0_11 = _mm_unpackhi_epi64(v1_5, v1_13);
      const __m128i v0_12 = _mm_unpacklo_epi64(v1_6, v1_14);
      const __m128i v0_13 = _mm_unpackhi_epi64(v1_6, v1_14);
      const __m128i v0_14 = _mm_unpacklo_epi64(v1_7, v1_15);
      const __m128i v0_15 = _mm_unpackhi_epi64(v1_7, v1_15);

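      // Store the 16 transposed rows from last to first, stepping o back by
      // output_stride after each store. The step is taken only while the
      // block is wide enough, so for narrow blocks the early stores land on
      // the same row and are overwritten by the later, in-range ones.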
      o = (uint8_t*) ((uintptr_t) o + oN_offset);
      _mm_storeu_si128((__m128i*) o, v0_15);
      uint8_t *oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
      if XNN_UNPREDICTABLE(block_width > 15) {
        o = oN;
      }
      _mm_storeu_si128((__m128i*) o, v0_14);
      oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
      if XNN_UNPREDICTABLE(block_width >= 15) {
        o = oN;
      }
      _mm_storeu_si128((__m128i*) o, v0_13);
      oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
      if XNN_UNPREDICTABLE(block_width > 13) {
        o = oN;
      }
      _mm_storeu_si128((__m128i*) o, v0_12);
      oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
      if XNN_UNPREDICTABLE(block_width >= 13) {
        o = oN;
      }
      _mm_storeu_si128((__m128i*) o, v0_11);
      oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
      if XNN_UNPREDICTABLE(block_width > 11) {
        o = oN;
      }
      _mm_storeu_si128((__m128i*) o, v0_10);
      oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
      if XNN_UNPREDICTABLE(block_width >= 11) {
        o = oN;
      }
      _mm_storeu_si128((__m128i*) o, v0_9);
      oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
      if XNN_UNPREDICTABLE(block_width > 9) {
        o = oN;
      }
      _mm_storeu_si128((__m128i*) o, v0_8);
      oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
      if XNN_UNPREDICTABLE(block_width >= 9) {
        o = oN;
      }
      _mm_storeu_si128((__m128i*) o, v0_7);
      oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
      if XNN_UNPREDICTABLE(block_width > 7) {
        o = oN;
      }
      _mm_storeu_si128((__m128i*) o, v0_6);
      oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
      if XNN_UNPREDICTABLE(block_width >= 7) {
        o = oN;
      }
      _mm_storeu_si128((__m128i*) o, v0_5);
      oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
      if XNN_UNPREDICTABLE(block_width > 5) {
        o = oN;
      }
      _mm_storeu_si128((__m128i*) o, v0_4);
      oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
      if XNN_UNPREDICTABLE(block_width >= 5) {
        o = oN;
      }
      _mm_storeu_si128((__m128i*) o, v0_3);
      oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
      if XNN_UNPREDICTABLE(block_width > 3) {
        o = oN;
      }
      _mm_storeu_si128((__m128i*) o, v0_2);
      oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
      if XNN_UNPREDICTABLE(block_width >= 3) {
        o = oN;
      }
      _mm_storeu_si128((__m128i*) o, v0_1);
      oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
      if XNN_UNPREDICTABLE(block_width > 1) {
        o = oN;
      }
      _mm_storeu_si128((__m128i*) o, v0_0);
    }
    o = (uint8_t*) ((uintptr_t) o + tile_hbytes);
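    // Tail: transpose the remaining bh (1..15) rows. Row pointers past the
    // end of the block are clamped to the previous row, so the extra loads
    // stay within the input.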
    if (bh != 0) {
      const __m128i v4_0 = _mm_loadu_si128((const __m128i*) i0);
      const uint8_t *i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
      if XNN_UNPREDICTABLE(bh < 2) {
        i1 = i0;
      }
      const __m128i v4_1 = _mm_loadu_si128((const __m128i*) i1);
      const uint8_t *i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
      if XNN_UNPREDICTABLE(bh <= 2) {
        i2 = i1;
      }
      const __m128i v4_2 = _mm_loadu_si128((const __m128i*) i2);
      const uint8_t *i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
      if XNN_UNPREDICTABLE(bh < 4) {
        i3 = i2;
      }
      const __m128i v4_3 = _mm_loadu_si128((const __m128i*) i3);
      const uint8_t *i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
      if XNN_UNPREDICTABLE(bh <= 4) {
        i4 = i3;
      }
      const __m128i v4_4 = _mm_loadu_si128((const __m128i*) i4);
      const uint8_t *i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
      if XNN_UNPREDICTABLE(bh < 6) {
        i5 = i4;
      }
      const __m128i v4_5 = _mm_loadu_si128((const __m128i*) i5);
      const uint8_t *i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
      if XNN_UNPREDICTABLE(bh <= 6) {
        i6 = i5;
      }
      const __m128i v4_6 = _mm_loadu_si128((const __m128i*) i6);
      const uint8_t *i7 = (const uint8_t*) ((uintptr_t) i6 + input_stride);
      if XNN_UNPREDICTABLE(bh < 8) {
        i7 = i6;
      }
      const __m128i v4_7 = _mm_loadu_si128((const __m128i*) i7);
      const uint8_t *i8 = (const uint8_t*) ((uintptr_t) i7 + input_stride);
      if XNN_UNPREDICTABLE(bh <= 8) {
        i8 = i7;
      }
      const __m128i v4_8 = _mm_loadu_si128((const __m128i*) i8);
      const uint8_t *i9 = (const uint8_t*) ((uintptr_t) i8 + input_stride);
      if XNN_UNPREDICTABLE(bh < 10) {
        i9 = i8;
      }
      const __m128i v4_9 = _mm_loadu_si128((const __m128i*) i9);
      const uint8_t *i10 = (const uint8_t*) ((uintptr_t) i9 + input_stride);
      if XNN_UNPREDICTABLE(bh <= 10) {
        i10 = i9;
      }
      const __m128i v4_10 = _mm_loadu_si128((const __m128i*) i10);
      const uint8_t *i11 = (const uint8_t*) ((uintptr_t) i10 + input_stride);
      if XNN_UNPREDICTABLE(bh < 12) {
        i11 = i10;
      }
      const __m128i v4_11 = _mm_loadu_si128((const __m128i*) i11);
      const uint8_t *i12 = (const uint8_t*) ((uintptr_t) i11 + input_stride);
      if XNN_UNPREDICTABLE(bh <= 12) {
        i12 = i11;
      }
      const __m128i v4_12 = _mm_loadu_si128((const __m128i*) i12);
      const uint8_t *i13 = (const uint8_t*) ((uintptr_t) i12 + input_stride);
      if XNN_UNPREDICTABLE(bh < 14) {
        i13 = i12;
      }
      const __m128i v4_13 = _mm_loadu_si128((const __m128i*) i13);
      const uint8_t *i14 = (const uint8_t*) ((uintptr_t) i13 + input_stride);
      if XNN_UNPREDICTABLE(bh <= 14) {
        i14 = i13;
      }
      const __m128i v4_14 = _mm_loadu_si128((const __m128i*) i14);
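      // Row 15 never exists in the tail (bh < 16), so its register is left
      // undefined; byte 15 of each transposed row is never stored below.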
      const __m128i v4_15 = _mm_undefined_si128();

      const __m128i v3_0 = _mm_unpacklo_epi8(v4_0, v4_1);
      const __m128i v3_1 = _mm_unpackhi_epi8(v4_0, v4_1);
      const __m128i v3_2 = _mm_unpacklo_epi8(v4_2, v4_3);
      const __m128i v3_3 = _mm_unpackhi_epi8(v4_2, v4_3);
      const __m128i v3_4 = _mm_unpacklo_epi8(v4_4, v4_5);
      const __m128i v3_5 = _mm_unpackhi_epi8(v4_4, v4_5);
      const __m128i v3_6 = _mm_unpacklo_epi8(v4_6, v4_7);
      const __m128i v3_7 = _mm_unpackhi_epi8(v4_6, v4_7);
      const __m128i v3_8 = _mm_unpacklo_epi8(v4_8, v4_9);
      const __m128i v3_9 = _mm_unpackhi_epi8(v4_8, v4_9);
      const __m128i v3_10 = _mm_unpacklo_epi8(v4_10, v4_11);
      const __m128i v3_11 = _mm_unpackhi_epi8(v4_10, v4_11);
      const __m128i v3_12 = _mm_unpacklo_epi8(v4_12, v4_13);
      const __m128i v3_13 = _mm_unpackhi_epi8(v4_12, v4_13);
      const __m128i v3_14 = _mm_unpacklo_epi8(v4_14, v4_15);
      const __m128i v3_15 = _mm_unpackhi_epi8(v4_14, v4_15);

      const __m128i v2_0 = _mm_unpacklo_epi16(v3_0, v3_2);
      const __m128i v2_1 = _mm_unpackhi_epi16(v3_0, v3_2);
      const __m128i v2_2 = _mm_unpacklo_epi16(v3_1, v3_3);
      const __m128i v2_3 = _mm_unpackhi_epi16(v3_1, v3_3);
      const __m128i v2_4 = _mm_unpacklo_epi16(v3_4, v3_6);
      const __m128i v2_5 = _mm_unpackhi_epi16(v3_4, v3_6);
      const __m128i v2_6 = _mm_unpacklo_epi16(v3_5, v3_7);
      const __m128i v2_7 = _mm_unpackhi_epi16(v3_5, v3_7);
      const __m128i v2_8 = _mm_unpacklo_epi16(v3_8, v3_10);
      const __m128i v2_9 = _mm_unpackhi_epi16(v3_8, v3_10);
      const __m128i v2_10 = _mm_unpacklo_epi16(v3_9, v3_11);
      const __m128i v2_11 = _mm_unpackhi_epi16(v3_9, v3_11);
      const __m128i v2_12 = _mm_unpacklo_epi16(v3_12, v3_14);
      const __m128i v2_13 = _mm_unpackhi_epi16(v3_12, v3_14);
      const __m128i v2_14 = _mm_unpacklo_epi16(v3_13, v3_15);
      const __m128i v2_15 = _mm_unpackhi_epi16(v3_13, v3_15);

      const __m128i v1_0 = _mm_unpacklo_epi32(v2_0, v2_4);
      const __m128i v1_1 = _mm_unpackhi_epi32(v2_0, v2_4);
      const __m128i v1_2 = _mm_unpacklo_epi32(v2_1, v2_5);
      const __m128i v1_3 = _mm_unpackhi_epi32(v2_1, v2_5);
      const __m128i v1_4 = _mm_unpacklo_epi32(v2_2, v2_6);
      const __m128i v1_5 = _mm_unpackhi_epi32(v2_2, v2_6);
      const __m128i v1_6 = _mm_unpacklo_epi32(v2_3, v2_7);
      const __m128i v1_7 = _mm_unpackhi_epi32(v2_3, v2_7);
      const __m128i v1_8 = _mm_unpacklo_epi32(v2_8, v2_12);
      const __m128i v1_9 = _mm_unpackhi_epi32(v2_8, v2_12);
      const __m128i v1_10 = _mm_unpacklo_epi32(v2_9, v2_13);
      const __m128i v1_11 = _mm_unpackhi_epi32(v2_9, v2_13);
      const __m128i v1_12 = _mm_unpacklo_epi32(v2_10, v2_14);
      const __m128i v1_13 = _mm_unpackhi_epi32(v2_10, v2_14);
      const __m128i v1_14 = _mm_unpacklo_epi32(v2_11, v2_15);
      const __m128i v1_15 = _mm_unpackhi_epi32(v2_11, v2_15);

      __m128i v0_0 = _mm_unpacklo_epi64(v1_0, v1_8);
      __m128i v0_1 = _mm_unpackhi_epi64(v1_0, v1_8);
      __m128i v0_2 = _mm_unpacklo_epi64(v1_1, v1_9);
      __m128i v0_3 = _mm_unpackhi_epi64(v1_1, v1_9);
      __m128i v0_4 = _mm_unpacklo_epi64(v1_2, v1_10);
      __m128i v0_5 = _mm_unpackhi_epi64(v1_2, v1_10);
      __m128i v0_6 = _mm_unpacklo_epi64(v1_3, v1_11);
      __m128i v0_7 = _mm_unpackhi_epi64(v1_3, v1_11);
      __m128i v0_8 = _mm_unpacklo_epi64(v1_4, v1_12);
      __m128i v0_9 = _mm_unpackhi_epi64(v1_4, v1_12);
      __m128i v0_10 = _mm_unpacklo_epi64(v1_5, v1_13);
      __m128i v0_11 = _mm_unpackhi_epi64(v1_5, v1_13);
      __m128i v0_12 = _mm_unpacklo_epi64(v1_6, v1_14);
      __m128i v0_13 = _mm_unpackhi_epi64(v1_6, v1_14);
      __m128i v0_14 = _mm_unpacklo_epi64(v1_7, v1_15);
      __m128i v0_15 = _mm_unpackhi_epi64(v1_7, v1_15);

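      // If 8 or more rows remain, store the low 8 bytes of each transposed
      // row, then shift the high halves down for the narrower stores below.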
      if (bh & 8) {
        o = (uint8_t*) ((uintptr_t) o + oN_stride);
        _mm_storel_epi64((__m128i*) o, v0_15);
        uint8_t *oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 15) {
          o = oN;
        }
        _mm_storel_epi64((__m128i*) o, v0_14);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 15) {
          o = oN;
        }
        _mm_storel_epi64((__m128i*) o, v0_13);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 13) {
          o = oN;
        }
        _mm_storel_epi64((__m128i*) o, v0_12);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 13) {
          o = oN;
        }
        _mm_storel_epi64((__m128i*) o, v0_11);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 11) {
          o = oN;
        }
        _mm_storel_epi64((__m128i*) o, v0_10);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 11) {
          o = oN;
        }
        _mm_storel_epi64((__m128i*) o, v0_9);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 9) {
          o = oN;
        }
        _mm_storel_epi64((__m128i*) o, v0_8);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 9) {
          o = oN;
        }
        _mm_storel_epi64((__m128i*) o, v0_7);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 7) {
          o = oN;
        }
        _mm_storel_epi64((__m128i*) o, v0_6);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 7) {
          o = oN;
        }
        _mm_storel_epi64((__m128i*) o, v0_5);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 5) {
          o = oN;
        }
        _mm_storel_epi64((__m128i*) o, v0_4);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 5) {
          o = oN;
        }
        _mm_storel_epi64((__m128i*) o, v0_3);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 3) {
          o = oN;
        }
        _mm_storel_epi64((__m128i*) o, v0_2);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 3) {
          o = oN;
        }
        _mm_storel_epi64((__m128i*) o, v0_1);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 1) {
          o = oN;
        }
        _mm_storel_epi64((__m128i*) o, v0_0);
        o += 8;
        v0_0 = _mm_unpackhi_epi64(v0_0, v0_0);
        v0_1 = _mm_unpackhi_epi64(v0_1, v0_1);
        v0_2 = _mm_unpackhi_epi64(v0_2, v0_2);
        v0_3 = _mm_unpackhi_epi64(v0_3, v0_3);
        v0_4 = _mm_unpackhi_epi64(v0_4, v0_4);
        v0_5 = _mm_unpackhi_epi64(v0_5, v0_5);
        v0_6 = _mm_unpackhi_epi64(v0_6, v0_6);
        v0_7 = _mm_unpackhi_epi64(v0_7, v0_7);
        v0_8 = _mm_unpackhi_epi64(v0_8, v0_8);
        v0_9 = _mm_unpackhi_epi64(v0_9, v0_9);
        v0_10 = _mm_unpackhi_epi64(v0_10, v0_10);
        v0_11 = _mm_unpackhi_epi64(v0_11, v0_11);
        v0_12 = _mm_unpackhi_epi64(v0_12, v0_12);
        v0_13 = _mm_unpackhi_epi64(v0_13, v0_13);
        v0_14 = _mm_unpackhi_epi64(v0_14, v0_14);
        v0_15 = _mm_unpackhi_epi64(v0_15, v0_15);
      }

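      // If 4 or more rows remain, store 4 bytes per row, then shift each
      // 64-bit lane right by 32 bits.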
      if (bh & 4) {
        o = (uint8_t*) ((uintptr_t) o + oN_stride);
        unaligned_store_u32(o, (uint32_t) _mm_cvtsi128_si32(v0_15));
        uint8_t *oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 15) {
          o = oN;
        }
        unaligned_store_u32(o, (uint32_t) _mm_cvtsi128_si32(v0_14));
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 15) {
          o = oN;
        }
        unaligned_store_u32(o, (uint32_t) _mm_cvtsi128_si32(v0_13));
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 13) {
          o = oN;
        }
        unaligned_store_u32(o, (uint32_t) _mm_cvtsi128_si32(v0_12));
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 13) {
          o = oN;
        }
        unaligned_store_u32(o, (uint32_t) _mm_cvtsi128_si32(v0_11));
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 11) {
          o = oN;
        }
        unaligned_store_u32(o, (uint32_t) _mm_cvtsi128_si32(v0_10));
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 11) {
          o = oN;
        }
        unaligned_store_u32(o, (uint32_t) _mm_cvtsi128_si32(v0_9));
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 9) {
          o = oN;
        }
        unaligned_store_u32(o, (uint32_t) _mm_cvtsi128_si32(v0_8));
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 9) {
          o = oN;
        }
        unaligned_store_u32(o, (uint32_t) _mm_cvtsi128_si32(v0_7));
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 7) {
          o = oN;
        }
        unaligned_store_u32(o, (uint32_t) _mm_cvtsi128_si32(v0_6));
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 7) {
          o = oN;
        }
        unaligned_store_u32(o, (uint32_t) _mm_cvtsi128_si32(v0_5));
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 5) {
          o = oN;
        }
        unaligned_store_u32(o, (uint32_t) _mm_cvtsi128_si32(v0_4));
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 5) {
          o = oN;
        }
        unaligned_store_u32(o, (uint32_t) _mm_cvtsi128_si32(v0_3));
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 3) {
          o = oN;
        }
        unaligned_store_u32(o, (uint32_t) _mm_cvtsi128_si32(v0_2));
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 3) {
          o = oN;
        }
        unaligned_store_u32(o, (uint32_t) _mm_cvtsi128_si32(v0_1));
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 1) {
          o = oN;
        }
        unaligned_store_u32(o, (uint32_t) _mm_cvtsi128_si32(v0_0));
        o += 4;
        v0_0 = _mm_srli_epi64(v0_0, 32);
        v0_1 = _mm_srli_epi64(v0_1, 32);
        v0_2 = _mm_srli_epi64(v0_2, 32);
        v0_3 = _mm_srli_epi64(v0_3, 32);
        v0_4 = _mm_srli_epi64(v0_4, 32);
        v0_5 = _mm_srli_epi64(v0_5, 32);
        v0_6 = _mm_srli_epi64(v0_6, 32);
        v0_7 = _mm_srli_epi64(v0_7, 32);
        v0_8 = _mm_srli_epi64(v0_8, 32);
        v0_9 = _mm_srli_epi64(v0_9, 32);
        v0_10 = _mm_srli_epi64(v0_10, 32);
        v0_11 = _mm_srli_epi64(v0_11, 32);
        v0_12 = _mm_srli_epi64(v0_12, 32);
        v0_13 = _mm_srli_epi64(v0_13, 32);
        v0_14 = _mm_srli_epi64(v0_14, 32);
        v0_15 = _mm_srli_epi64(v0_15, 32);
      }
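      // If 2 or more rows remain, store 2 bytes per row, then shift each
      // 32-bit lane right by 16 bits.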
      if (bh & 2) {
        o = (uint8_t*) ((uintptr_t) o + oN_stride);
        unaligned_store_u16(o, (uint16_t) _mm_cvtsi128_si32(v0_15));
        uint8_t* oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 15) {
          o = oN;
        }
        unaligned_store_u16(o, (uint16_t) _mm_cvtsi128_si32(v0_14));
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 15) {
          o = oN;
        }
        unaligned_store_u16(o, (uint16_t) _mm_cvtsi128_si32(v0_13));
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 13) {
          o = oN;
        }
        unaligned_store_u16(o, (uint16_t) _mm_cvtsi128_si32(v0_12));
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 13) {
          o = oN;
        }
        unaligned_store_u16(o, (uint16_t) _mm_cvtsi128_si32(v0_11));
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 11) {
          o = oN;
        }
        unaligned_store_u16(o, (uint16_t) _mm_cvtsi128_si32(v0_10));
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 11) {
          o = oN;
        }
        unaligned_store_u16(o, (uint16_t) _mm_cvtsi128_si32(v0_9));
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 9) {
          o = oN;
        }
        unaligned_store_u16(o, (uint16_t) _mm_cvtsi128_si32(v0_8));
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 9) {
          o = oN;
        }
        unaligned_store_u16(o, (uint16_t) _mm_cvtsi128_si32(v0_7));
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 7) {
          o = oN;
        }
        unaligned_store_u16(o, (uint16_t) _mm_cvtsi128_si32(v0_6));
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 7) {
          o = oN;
        }
        unaligned_store_u16(o, (uint16_t) _mm_cvtsi128_si32(v0_5));
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 5) {
          o = oN;
        }
        unaligned_store_u16(o, (uint16_t) _mm_cvtsi128_si32(v0_4));
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 5) {
          o = oN;
        }
        unaligned_store_u16(o, (uint16_t) _mm_cvtsi128_si32(v0_3));
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 3) {
          o = oN;
        }
        unaligned_store_u16(o, (uint16_t) _mm_cvtsi128_si32(v0_2));
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 3) {
          o = oN;
        }
        unaligned_store_u16(o, (uint16_t) _mm_cvtsi128_si32(v0_1));
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 1) {
          o = oN;
        }
        unaligned_store_u16(o, (uint16_t) _mm_cvtsi128_si32(v0_0));
        o += 2;
        v0_0 = _mm_srli_epi32(v0_0, 16);
        v0_1 = _mm_srli_epi32(v0_1, 16);
        v0_2 = _mm_srli_epi32(v0_2, 16);
        v0_3 = _mm_srli_epi32(v0_3, 16);
        v0_4 = _mm_srli_epi32(v0_4, 16);
        v0_5 = _mm_srli_epi32(v0_5, 16);
        v0_6 = _mm_srli_epi32(v0_6, 16);
        v0_7 = _mm_srli_epi32(v0_7, 16);
        v0_8 = _mm_srli_epi32(v0_8, 16);
        v0_9 = _mm_srli_epi32(v0_9, 16);
        v0_10 = _mm_srli_epi32(v0_10, 16);
        v0_11 = _mm_srli_epi32(v0_11, 16);
        v0_12 = _mm_srli_epi32(v0_12, 16);
        v0_13 = _mm_srli_epi32(v0_13, 16);
        v0_14 = _mm_srli_epi32(v0_14, 16);
        v0_15 = _mm_srli_epi32(v0_15, 16);
      }
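      // If one row remains, store the final byte of each transposed row.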
      if (bh & 1) {
        o = (uint8_t*) ((uintptr_t) o + oN_stride);
        *o = (uint8_t) _mm_cvtsi128_si32(v0_15);
        uint8_t* oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 15) {
          o = oN;
        }
        *o = (uint8_t) _mm_cvtsi128_si32(v0_14);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 15) {
          o = oN;
        }
        *o = (uint8_t) _mm_cvtsi128_si32(v0_13);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 13) {
          o = oN;
        }
        *o = (uint8_t) _mm_cvtsi128_si32(v0_12);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 13) {
          o = oN;
        }
        *o = (uint8_t) _mm_cvtsi128_si32(v0_11);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 11) {
          o = oN;
        }
        *o = (uint8_t) _mm_cvtsi128_si32(v0_10);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 11) {
          o = oN;
        }
        *o = (uint8_t) _mm_cvtsi128_si32(v0_9);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 9) {
          o = oN;
        }
        *o = (uint8_t) _mm_cvtsi128_si32(v0_8);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 9) {
          o = oN;
        }
        *o = (uint8_t) _mm_cvtsi128_si32(v0_7);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 7) {
          o = oN;
        }
        *o = (uint8_t) _mm_cvtsi128_si32(v0_6);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 7) {
          o = oN;
        }
        *o = (uint8_t) _mm_cvtsi128_si32(v0_5);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 5) {
          o = oN;
        }
        *o = (uint8_t) _mm_cvtsi128_si32(v0_4);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 5) {
          o = oN;
        }
        *o = (uint8_t) _mm_cvtsi128_si32(v0_3);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 3) {
          o = oN;
        }
        *o = (uint8_t) _mm_cvtsi128_si32(v0_2);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 3) {
          o = oN;
        }
        *o = (uint8_t) _mm_cvtsi128_si32(v0_1);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 1) {
          o = oN;
        }
        *o = (uint8_t) _mm_cvtsi128_si32(v0_0);
      }
    }

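    // Advance i0 to the next 16-column strip of the input and o to the
    // matching 16-row strip of the output.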
    i0 = (const uint8_t*) ((uintptr_t) i0 + input_reset);
    o = (uint8_t*) ((uintptr_t) o + output_reset);
    block_width = doz(block_width, tile_width);
  } while (block_width != 0);
}
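
// A minimal usage sketch (not part of the generated source): transposing a
// hypothetical 20x24 uint8_t matrix with contiguous rows. Strides are in
// bytes; the input has block_height rows of block_width elements and the
// output receives the block_width x block_height transpose. Note the
// XNN_OOB_READS attribute above: the kernel may read up to a full 16-byte
// vector past the nominal block, so buffers must tolerate such reads.
//
//   uint8_t in[20][24];
//   uint8_t out[24][20];
//   xnn_x8_transposec_ukernel__16x16_reuse_mov_sse2(
//       &in[0][0], &out[0][0],
//       /*input_stride=*/24, /*output_stride=*/20,
//       /*block_width=*/24, /*block_height=*/20);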