// Auto-generated file. Do not edit!
//   Template: src/x32-transposec/sse2.c.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <immintrin.h>

#include <assert.h>

#include <xnnpack/common.h>
#include <xnnpack/math.h>
#include <xnnpack/transpose.h>
#include <xnnpack/unaligned.h>


void xnn_x8_transposec_ukernel__16x16_reuse_mov_sse2(
    const uint8_t* input,
    uint8_t* output,
    size_t input_stride,
    size_t output_stride,
    size_t block_width,
    size_t block_height) XNN_OOB_READS
{
  assert(output_stride >= block_height * sizeof(uint8_t));
  assert(input_stride >= block_width * sizeof(uint8_t));

  const size_t tile_height = 16;
  const size_t tile_width = 16;
  const size_t tile_hbytes = tile_height * sizeof(uint8_t);
  const size_t tile_wbytes = tile_width * sizeof(uint8_t);
  const size_t input_reset = tile_wbytes - round_down_po2(block_height, tile_height) * input_stride;
  const size_t output_reset = tile_width * output_stride - round_down_po2(block_height, 2) * sizeof(uint8_t) - tile_hbytes;

  const uint8_t* i0 = input;
  uint8_t* o = (uint8_t*) ((uintptr_t) output - tile_hbytes);
  const size_t minus_output_stride = -output_stride;

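  // Walk the block one tile of 16 output rows (16 input columns) at a time.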
  do {
    const size_t rem = min(block_width - 1, 15);
    const size_t oN_stride = rem * output_stride;
    const size_t oN_offset = oN_stride + tile_hbytes;
    size_t bh = block_height;
    for (; bh >= 16; bh -= 16) {
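      // Load 16 rows of 16 bytes from the input tile, advancing by input_stride after each row.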
      const __m128i v4_0 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const __m128i v4_1 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const __m128i v4_2 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const __m128i v4_3 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const __m128i v4_4 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const __m128i v4_5 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const __m128i v4_6 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const __m128i v4_7 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const __m128i v4_8 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const __m128i v4_9 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const __m128i v4_10 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const __m128i v4_11 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const __m128i v4_12 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const __m128i v4_13 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const __m128i v4_14 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const __m128i v4_15 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);

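      // Transpose stage 1 of 4: interleave the bytes of adjacent rows.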
      const __m128i v3_0 = _mm_unpacklo_epi8(v4_0, v4_1);
      const __m128i v3_1 = _mm_unpackhi_epi8(v4_0, v4_1);
      const __m128i v3_2 = _mm_unpacklo_epi8(v4_2, v4_3);
      const __m128i v3_3 = _mm_unpackhi_epi8(v4_2, v4_3);
      const __m128i v3_4 = _mm_unpacklo_epi8(v4_4, v4_5);
      const __m128i v3_5 = _mm_unpackhi_epi8(v4_4, v4_5);
      const __m128i v3_6 = _mm_unpacklo_epi8(v4_6, v4_7);
      const __m128i v3_7 = _mm_unpackhi_epi8(v4_6, v4_7);
      const __m128i v3_8 = _mm_unpacklo_epi8(v4_8, v4_9);
      const __m128i v3_9 = _mm_unpackhi_epi8(v4_8, v4_9);
      const __m128i v3_10 = _mm_unpacklo_epi8(v4_10, v4_11);
      const __m128i v3_11 = _mm_unpackhi_epi8(v4_10, v4_11);
      const __m128i v3_12 = _mm_unpacklo_epi8(v4_12, v4_13);
      const __m128i v3_13 = _mm_unpackhi_epi8(v4_12, v4_13);
      const __m128i v3_14 = _mm_unpacklo_epi8(v4_14, v4_15);
      const __m128i v3_15 = _mm_unpackhi_epi8(v4_14, v4_15);

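      // Transpose stage 2 of 4: interleave 16-bit lanes.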
      const __m128i v2_0 = _mm_unpacklo_epi16(v3_0, v3_2);
      const __m128i v2_1 = _mm_unpackhi_epi16(v3_0, v3_2);
      const __m128i v2_2 = _mm_unpacklo_epi16(v3_1, v3_3);
      const __m128i v2_3 = _mm_unpackhi_epi16(v3_1, v3_3);
      const __m128i v2_4 = _mm_unpacklo_epi16(v3_4, v3_6);
      const __m128i v2_5 = _mm_unpackhi_epi16(v3_4, v3_6);
      const __m128i v2_6 = _mm_unpacklo_epi16(v3_5, v3_7);
      const __m128i v2_7 = _mm_unpackhi_epi16(v3_5, v3_7);
      const __m128i v2_8 = _mm_unpacklo_epi16(v3_8, v3_10);
      const __m128i v2_9 = _mm_unpackhi_epi16(v3_8, v3_10);
      const __m128i v2_10 = _mm_unpacklo_epi16(v3_9, v3_11);
      const __m128i v2_11 = _mm_unpackhi_epi16(v3_9, v3_11);
      const __m128i v2_12 = _mm_unpacklo_epi16(v3_12, v3_14);
      const __m128i v2_13 = _mm_unpackhi_epi16(v3_12, v3_14);
      const __m128i v2_14 = _mm_unpacklo_epi16(v3_13, v3_15);
      const __m128i v2_15 = _mm_unpackhi_epi16(v3_13, v3_15);

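      // Transpose stage 3 of 4: interleave 32-bit lanes.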
      const __m128i v1_0 = _mm_unpacklo_epi32(v2_0, v2_4);
      const __m128i v1_1 = _mm_unpackhi_epi32(v2_0, v2_4);
      const __m128i v1_2 = _mm_unpacklo_epi32(v2_1, v2_5);
      const __m128i v1_3 = _mm_unpackhi_epi32(v2_1, v2_5);
      const __m128i v1_4 = _mm_unpacklo_epi32(v2_2, v2_6);
      const __m128i v1_5 = _mm_unpackhi_epi32(v2_2, v2_6);
      const __m128i v1_6 = _mm_unpacklo_epi32(v2_3, v2_7);
      const __m128i v1_7 = _mm_unpackhi_epi32(v2_3, v2_7);
      const __m128i v1_8 = _mm_unpacklo_epi32(v2_8, v2_12);
      const __m128i v1_9 = _mm_unpackhi_epi32(v2_8, v2_12);
      const __m128i v1_10 = _mm_unpacklo_epi32(v2_9, v2_13);
      const __m128i v1_11 = _mm_unpackhi_epi32(v2_9, v2_13);
      const __m128i v1_12 = _mm_unpacklo_epi32(v2_10, v2_14);
      const __m128i v1_13 = _mm_unpackhi_epi32(v2_10, v2_14);
      const __m128i v1_14 = _mm_unpacklo_epi32(v2_11, v2_15);
      const __m128i v1_15 = _mm_unpackhi_epi32(v2_11, v2_15);

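      // Transpose stage 4 of 4: interleave 64-bit lanes; v0_0..v0_15 now hold the transposed columns.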
      const __m128i v0_0 = _mm_unpacklo_epi64(v1_0, v1_8);
      const __m128i v0_1 = _mm_unpackhi_epi64(v1_0, v1_8);
      const __m128i v0_2 = _mm_unpacklo_epi64(v1_1, v1_9);
      const __m128i v0_3 = _mm_unpackhi_epi64(v1_1, v1_9);
      const __m128i v0_4 = _mm_unpacklo_epi64(v1_2, v1_10);
      const __m128i v0_5 = _mm_unpackhi_epi64(v1_2, v1_10);
      const __m128i v0_6 = _mm_unpacklo_epi64(v1_3, v1_11);
      const __m128i v0_7 = _mm_unpackhi_epi64(v1_3, v1_11);
      const __m128i v0_8 = _mm_unpacklo_epi64(v1_4, v1_12);
      const __m128i v0_9 = _mm_unpackhi_epi64(v1_4, v1_12);
      const __m128i v0_10 = _mm_unpacklo_epi64(v1_5, v1_13);
      const __m128i v0_11 = _mm_unpackhi_epi64(v1_5, v1_13);
      const __m128i v0_12 = _mm_unpacklo_epi64(v1_6, v1_14);
      const __m128i v0_13 = _mm_unpackhi_epi64(v1_6, v1_14);
      const __m128i v0_14 = _mm_unpacklo_epi64(v1_7, v1_15);
      const __m128i v0_15 = _mm_unpackhi_epi64(v1_7, v1_15);

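      // Store v0_15..v0_0 to successive output rows (highest index first); when block_width < 16
      // the pointer stops advancing, so the extra vectors simply overwrite the last valid row.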
      o = (uint8_t*) ((uintptr_t) o + oN_offset);
      _mm_storeu_si128((__m128i*) o, v0_15);
      uint8_t *oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
      if XNN_UNPREDICTABLE(block_width > 15) {
        o = oN;
      }
      _mm_storeu_si128((__m128i*) o, v0_14);
      oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
      if XNN_UNPREDICTABLE(block_width >= 15) {
        o = oN;
      }
      _mm_storeu_si128((__m128i*) o, v0_13);
      oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
      if XNN_UNPREDICTABLE(block_width > 13) {
        o = oN;
      }
      _mm_storeu_si128((__m128i*) o, v0_12);
      oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
      if XNN_UNPREDICTABLE(block_width >= 13) {
        o = oN;
      }
      _mm_storeu_si128((__m128i*) o, v0_11);
      oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
      if XNN_UNPREDICTABLE(block_width > 11) {
        o = oN;
      }
      _mm_storeu_si128((__m128i*) o, v0_10);
      oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
      if XNN_UNPREDICTABLE(block_width >= 11) {
        o = oN;
      }
      _mm_storeu_si128((__m128i*) o, v0_9);
      oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
      if XNN_UNPREDICTABLE(block_width > 9) {
        o = oN;
      }
      _mm_storeu_si128((__m128i*) o, v0_8);
      oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
      if XNN_UNPREDICTABLE(block_width >= 9) {
        o = oN;
      }
      _mm_storeu_si128((__m128i*) o, v0_7);
      oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
      if XNN_UNPREDICTABLE(block_width > 7) {
        o = oN;
      }
      _mm_storeu_si128((__m128i*) o, v0_6);
      oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
      if XNN_UNPREDICTABLE(block_width >= 7) {
        o = oN;
      }
      _mm_storeu_si128((__m128i*) o, v0_5);
      oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
      if XNN_UNPREDICTABLE(block_width > 5) {
        o = oN;
      }
      _mm_storeu_si128((__m128i*) o, v0_4);
      oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
      if XNN_UNPREDICTABLE(block_width >= 5) {
        o = oN;
      }
      _mm_storeu_si128((__m128i*) o, v0_3);
      oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
      if XNN_UNPREDICTABLE(block_width > 3) {
        o = oN;
      }
      _mm_storeu_si128((__m128i*) o, v0_2);
      oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
      if XNN_UNPREDICTABLE(block_width >= 3) {
        o = oN;
      }
      _mm_storeu_si128((__m128i*) o, v0_1);
      oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
      if XNN_UNPREDICTABLE(block_width > 1) {
        o = oN;
      }
      _mm_storeu_si128((__m128i*) o, v0_0);
    }
    o = (uint8_t*) ((uintptr_t) o + tile_hbytes);
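    // Handle the remaining 1-15 rows of this column tile.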
    if (bh != 0) {
      const __m128i v4_0 = _mm_loadu_si128((const __m128i*) i0);
      const uint8_t *i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
      if XNN_UNPREDICTABLE(bh < 2) {
        i1 = i0;
      }
      const __m128i v4_1 = _mm_loadu_si128((const __m128i*) i1);
      const uint8_t *i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
      if XNN_UNPREDICTABLE(bh <= 2) {
        i2 = i1;
      }
      const __m128i v4_2 = _mm_loadu_si128((const __m128i*) i2);
      const uint8_t *i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
      if XNN_UNPREDICTABLE(bh < 4) {
        i3 = i2;
      }
      const __m128i v4_3 = _mm_loadu_si128((const __m128i*) i3);
      const uint8_t *i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
      if XNN_UNPREDICTABLE(bh <= 4) {
        i4 = i3;
      }
      const __m128i v4_4 = _mm_loadu_si128((const __m128i*) i4);
      const uint8_t *i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
      if XNN_UNPREDICTABLE(bh < 6) {
        i5 = i4;
      }
      const __m128i v4_5 = _mm_loadu_si128((const __m128i*) i5);
      const uint8_t *i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
      if XNN_UNPREDICTABLE(bh <= 6) {
        i6 = i5;
      }
      const __m128i v4_6 = _mm_loadu_si128((const __m128i*) i6);
      const uint8_t *i7 = (const uint8_t*) ((uintptr_t) i6 + input_stride);
      if XNN_UNPREDICTABLE(bh < 8) {
        i7 = i6;
      }
      const __m128i v4_7 = _mm_loadu_si128((const __m128i*) i7);
      const uint8_t *i8 = (const uint8_t*) ((uintptr_t) i7 + input_stride);
      if XNN_UNPREDICTABLE(bh <= 8) {
        i8 = i7;
      }
      const __m128i v4_8 = _mm_loadu_si128((const __m128i*) i8);
      const uint8_t *i9 = (const uint8_t*) ((uintptr_t) i8 + input_stride);
      if XNN_UNPREDICTABLE(bh < 10) {
        i9 = i8;
      }
      const __m128i v4_9 = _mm_loadu_si128((const __m128i*) i9);
      const uint8_t *i10 = (const uint8_t*) ((uintptr_t) i9 + input_stride);
      if XNN_UNPREDICTABLE(bh <= 10) {
        i10 = i9;
      }
      const __m128i v4_10 = _mm_loadu_si128((const __m128i*) i10);
      const uint8_t *i11 = (const uint8_t*) ((uintptr_t) i10 + input_stride);
      if XNN_UNPREDICTABLE(bh < 12) {
        i11 = i10;
      }
      const __m128i v4_11 = _mm_loadu_si128((const __m128i*) i11);
      const uint8_t *i12 = (const uint8_t*) ((uintptr_t) i11 + input_stride);
      if XNN_UNPREDICTABLE(bh <= 12) {
        i12 = i11;
      }
      const __m128i v4_12 = _mm_loadu_si128((const __m128i*) i12);
      const uint8_t *i13 = (const uint8_t*) ((uintptr_t) i12 + input_stride);
      if XNN_UNPREDICTABLE(bh < 14) {
        i13 = i12;
      }
      const __m128i v4_13 = _mm_loadu_si128((const __m128i*) i13);
      const uint8_t *i14 = (const uint8_t*) ((uintptr_t) i13 + input_stride);
      if XNN_UNPREDICTABLE(bh <= 14) {
        i14 = i13;
      }
      const __m128i v4_14 = _mm_loadu_si128((const __m128i*) i14);
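      // Fewer than 16 rows remain, so the 16th row is never loaded; its lanes never reach a store
      // below, which makes an undefined register safe here.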
      const __m128i v4_15 = _mm_undefined_si128();

      const __m128i v3_0 = _mm_unpacklo_epi8(v4_0, v4_1);
      const __m128i v3_1 = _mm_unpackhi_epi8(v4_0, v4_1);
      const __m128i v3_2 = _mm_unpacklo_epi8(v4_2, v4_3);
      const __m128i v3_3 = _mm_unpackhi_epi8(v4_2, v4_3);
      const __m128i v3_4 = _mm_unpacklo_epi8(v4_4, v4_5);
      const __m128i v3_5 = _mm_unpackhi_epi8(v4_4, v4_5);
      const __m128i v3_6 = _mm_unpacklo_epi8(v4_6, v4_7);
      const __m128i v3_7 = _mm_unpackhi_epi8(v4_6, v4_7);
      const __m128i v3_8 = _mm_unpacklo_epi8(v4_8, v4_9);
      const __m128i v3_9 = _mm_unpackhi_epi8(v4_8, v4_9);
      const __m128i v3_10 = _mm_unpacklo_epi8(v4_10, v4_11);
      const __m128i v3_11 = _mm_unpackhi_epi8(v4_10, v4_11);
      const __m128i v3_12 = _mm_unpacklo_epi8(v4_12, v4_13);
      const __m128i v3_13 = _mm_unpackhi_epi8(v4_12, v4_13);
      const __m128i v3_14 = _mm_unpacklo_epi8(v4_14, v4_15);
      const __m128i v3_15 = _mm_unpackhi_epi8(v4_14, v4_15);

      const __m128i v2_0 = _mm_unpacklo_epi16(v3_0, v3_2);
      const __m128i v2_1 = _mm_unpackhi_epi16(v3_0, v3_2);
      const __m128i v2_2 = _mm_unpacklo_epi16(v3_1, v3_3);
      const __m128i v2_3 = _mm_unpackhi_epi16(v3_1, v3_3);
      const __m128i v2_4 = _mm_unpacklo_epi16(v3_4, v3_6);
      const __m128i v2_5 = _mm_unpackhi_epi16(v3_4, v3_6);
      const __m128i v2_6 = _mm_unpacklo_epi16(v3_5, v3_7);
      const __m128i v2_7 = _mm_unpackhi_epi16(v3_5, v3_7);
      const __m128i v2_8 = _mm_unpacklo_epi16(v3_8, v3_10);
      const __m128i v2_9 = _mm_unpackhi_epi16(v3_8, v3_10);
      const __m128i v2_10 = _mm_unpacklo_epi16(v3_9, v3_11);
      const __m128i v2_11 = _mm_unpackhi_epi16(v3_9, v3_11);
      const __m128i v2_12 = _mm_unpacklo_epi16(v3_12, v3_14);
      const __m128i v2_13 = _mm_unpackhi_epi16(v3_12, v3_14);
      const __m128i v2_14 = _mm_unpacklo_epi16(v3_13, v3_15);
      const __m128i v2_15 = _mm_unpackhi_epi16(v3_13, v3_15);

      const __m128i v1_0 = _mm_unpacklo_epi32(v2_0, v2_4);
      const __m128i v1_1 = _mm_unpackhi_epi32(v2_0, v2_4);
      const __m128i v1_2 = _mm_unpacklo_epi32(v2_1, v2_5);
      const __m128i v1_3 = _mm_unpackhi_epi32(v2_1, v2_5);
      const __m128i v1_4 = _mm_unpacklo_epi32(v2_2, v2_6);
      const __m128i v1_5 = _mm_unpackhi_epi32(v2_2, v2_6);
      const __m128i v1_6 = _mm_unpacklo_epi32(v2_3, v2_7);
      const __m128i v1_7 = _mm_unpackhi_epi32(v2_3, v2_7);
      const __m128i v1_8 = _mm_unpacklo_epi32(v2_8, v2_12);
      const __m128i v1_9 = _mm_unpackhi_epi32(v2_8, v2_12);
      const __m128i v1_10 = _mm_unpacklo_epi32(v2_9, v2_13);
      const __m128i v1_11 = _mm_unpackhi_epi32(v2_9, v2_13);
      const __m128i v1_12 = _mm_unpacklo_epi32(v2_10, v2_14);
      const __m128i v1_13 = _mm_unpackhi_epi32(v2_10, v2_14);
      const __m128i v1_14 = _mm_unpacklo_epi32(v2_11, v2_15);
      const __m128i v1_15 = _mm_unpackhi_epi32(v2_11, v2_15);

      __m128i v0_0 = _mm_unpacklo_epi64(v1_0, v1_8);
      __m128i v0_1 = _mm_unpackhi_epi64(v1_0, v1_8);
      __m128i v0_2 = _mm_unpacklo_epi64(v1_1, v1_9);
      __m128i v0_3 = _mm_unpackhi_epi64(v1_1, v1_9);
      __m128i v0_4 = _mm_unpacklo_epi64(v1_2, v1_10);
      __m128i v0_5 = _mm_unpackhi_epi64(v1_2, v1_10);
      __m128i v0_6 = _mm_unpacklo_epi64(v1_3, v1_11);
      __m128i v0_7 = _mm_unpackhi_epi64(v1_3, v1_11);
      __m128i v0_8 = _mm_unpacklo_epi64(v1_4, v1_12);
      __m128i v0_9 = _mm_unpackhi_epi64(v1_4, v1_12);
      __m128i v0_10 = _mm_unpacklo_epi64(v1_5, v1_13);
      __m128i v0_11 = _mm_unpackhi_epi64(v1_5, v1_13);
      __m128i v0_12 = _mm_unpacklo_epi64(v1_6, v1_14);
      __m128i v0_13 = _mm_unpackhi_epi64(v1_6, v1_14);
      __m128i v0_14 = _mm_unpacklo_epi64(v1_7, v1_15);
      __m128i v0_15 = _mm_unpackhi_epi64(v1_7, v1_15);

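      // Write out the bh remaining bytes of each transposed column in 8/4/2/1-byte pieces.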
      if (bh & 8) {
        o = (uint8_t*) ((uintptr_t) o + oN_stride);
        _mm_storel_epi64((__m128i*) o, v0_15);
        uint8_t *oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 15) {
          o = oN;
        }
        _mm_storel_epi64((__m128i*) o, v0_14);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 15) {
          o = oN;
        }
        _mm_storel_epi64((__m128i*) o, v0_13);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 13) {
          o = oN;
        }
        _mm_storel_epi64((__m128i*) o, v0_12);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 13) {
          o = oN;
        }
        _mm_storel_epi64((__m128i*) o, v0_11);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 11) {
          o = oN;
        }
        _mm_storel_epi64((__m128i*) o, v0_10);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 11) {
          o = oN;
        }
        _mm_storel_epi64((__m128i*) o, v0_9);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 9) {
          o = oN;
        }
        _mm_storel_epi64((__m128i*) o, v0_8);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 9) {
          o = oN;
        }
        _mm_storel_epi64((__m128i*) o, v0_7);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 7) {
          o = oN;
        }
        _mm_storel_epi64((__m128i*) o, v0_6);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 7) {
          o = oN;
        }
        _mm_storel_epi64((__m128i*) o, v0_5);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 5) {
          o = oN;
        }
        _mm_storel_epi64((__m128i*) o, v0_4);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 5) {
          o = oN;
        }
        _mm_storel_epi64((__m128i*) o, v0_3);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 3) {
          o = oN;
        }
        _mm_storel_epi64((__m128i*) o, v0_2);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 3) {
          o = oN;
        }
        _mm_storel_epi64((__m128i*) o, v0_1);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 1) {
          o = oN;
        }
        _mm_storel_epi64((__m128i*) o, v0_0);
        o += 8;
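        // Shift the already-stored low 8 bytes out so the narrower stores below write the next bytes.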
        v0_0 = _mm_unpackhi_epi64(v0_0, v0_0);
        v0_1 = _mm_unpackhi_epi64(v0_1, v0_1);
        v0_2 = _mm_unpackhi_epi64(v0_2, v0_2);
        v0_3 = _mm_unpackhi_epi64(v0_3, v0_3);
        v0_4 = _mm_unpackhi_epi64(v0_4, v0_4);
        v0_5 = _mm_unpackhi_epi64(v0_5, v0_5);
        v0_6 = _mm_unpackhi_epi64(v0_6, v0_6);
        v0_7 = _mm_unpackhi_epi64(v0_7, v0_7);
        v0_8 = _mm_unpackhi_epi64(v0_8, v0_8);
        v0_9 = _mm_unpackhi_epi64(v0_9, v0_9);
        v0_10 = _mm_unpackhi_epi64(v0_10, v0_10);
        v0_11 = _mm_unpackhi_epi64(v0_11, v0_11);
        v0_12 = _mm_unpackhi_epi64(v0_12, v0_12);
        v0_13 = _mm_unpackhi_epi64(v0_13, v0_13);
        v0_14 = _mm_unpackhi_epi64(v0_14, v0_14);
        v0_15 = _mm_unpackhi_epi64(v0_15, v0_15);
      }

      if (bh & 4) {
        o = (uint8_t*) ((uintptr_t) o + oN_stride);
        unaligned_store_u32(o, (uint32_t) _mm_cvtsi128_si32(v0_15));
        uint8_t *oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 15) {
          o = oN;
        }
        unaligned_store_u32(o, (uint32_t) _mm_cvtsi128_si32(v0_14));
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 15) {
          o = oN;
        }
        unaligned_store_u32(o, (uint32_t) _mm_cvtsi128_si32(v0_13));
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 13) {
          o = oN;
        }
        unaligned_store_u32(o, (uint32_t) _mm_cvtsi128_si32(v0_12));
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 13) {
          o = oN;
        }
        unaligned_store_u32(o, (uint32_t) _mm_cvtsi128_si32(v0_11));
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 11) {
          o = oN;
        }
        unaligned_store_u32(o, (uint32_t) _mm_cvtsi128_si32(v0_10));
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 11) {
          o = oN;
        }
        unaligned_store_u32(o, (uint32_t) _mm_cvtsi128_si32(v0_9));
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 9) {
          o = oN;
        }
        unaligned_store_u32(o, (uint32_t) _mm_cvtsi128_si32(v0_8));
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 9) {
          o = oN;
        }
        unaligned_store_u32(o, (uint32_t) _mm_cvtsi128_si32(v0_7));
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 7) {
          o = oN;
        }
        unaligned_store_u32(o, (uint32_t) _mm_cvtsi128_si32(v0_6));
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 7) {
          o = oN;
        }
        unaligned_store_u32(o, (uint32_t) _mm_cvtsi128_si32(v0_5));
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 5) {
          o = oN;
        }
        unaligned_store_u32(o, (uint32_t) _mm_cvtsi128_si32(v0_4));
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 5) {
          o = oN;
        }
        unaligned_store_u32(o, (uint32_t) _mm_cvtsi128_si32(v0_3));
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 3) {
          o = oN;
        }
        unaligned_store_u32(o, (uint32_t) _mm_cvtsi128_si32(v0_2));
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 3) {
          o = oN;
        }
        unaligned_store_u32(o, (uint32_t) _mm_cvtsi128_si32(v0_1));
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 1) {
          o = oN;
        }
        unaligned_store_u32(o, (uint32_t) _mm_cvtsi128_si32(v0_0));
        o += 4;
        v0_0 = _mm_srli_epi64(v0_0, 32);
        v0_1 = _mm_srli_epi64(v0_1, 32);
        v0_2 = _mm_srli_epi64(v0_2, 32);
        v0_3 = _mm_srli_epi64(v0_3, 32);
        v0_4 = _mm_srli_epi64(v0_4, 32);
        v0_5 = _mm_srli_epi64(v0_5, 32);
        v0_6 = _mm_srli_epi64(v0_6, 32);
        v0_7 = _mm_srli_epi64(v0_7, 32);
        v0_8 = _mm_srli_epi64(v0_8, 32);
        v0_9 = _mm_srli_epi64(v0_9, 32);
        v0_10 = _mm_srli_epi64(v0_10, 32);
        v0_11 = _mm_srli_epi64(v0_11, 32);
        v0_12 = _mm_srli_epi64(v0_12, 32);
        v0_13 = _mm_srli_epi64(v0_13, 32);
        v0_14 = _mm_srli_epi64(v0_14, 32);
        v0_15 = _mm_srli_epi64(v0_15, 32);
      }
      if (bh & 2) {
        o = (uint8_t*) ((uintptr_t) o + oN_stride);
        unaligned_store_u16(o, (uint16_t) _mm_cvtsi128_si32(v0_15));
        uint8_t* oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 15) {
          o = oN;
        }
        unaligned_store_u16(o, (uint16_t) _mm_cvtsi128_si32(v0_14));
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 15) {
          o = oN;
        }
        unaligned_store_u16(o, (uint16_t) _mm_cvtsi128_si32(v0_13));
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 13) {
          o = oN;
        }
        unaligned_store_u16(o, (uint16_t) _mm_cvtsi128_si32(v0_12));
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 13) {
          o = oN;
        }
        unaligned_store_u16(o, (uint16_t) _mm_cvtsi128_si32(v0_11));
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 11) {
          o = oN;
        }
        unaligned_store_u16(o, (uint16_t) _mm_cvtsi128_si32(v0_10));
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 11) {
          o = oN;
        }
        unaligned_store_u16(o, (uint16_t) _mm_cvtsi128_si32(v0_9));
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 9) {
          o = oN;
        }
        unaligned_store_u16(o, (uint16_t) _mm_cvtsi128_si32(v0_8));
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 9) {
          o = oN;
        }
        unaligned_store_u16(o, (uint16_t) _mm_cvtsi128_si32(v0_7));
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 7) {
          o = oN;
        }
        unaligned_store_u16(o, (uint16_t) _mm_cvtsi128_si32(v0_6));
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 7) {
          o = oN;
        }
        unaligned_store_u16(o, (uint16_t) _mm_cvtsi128_si32(v0_5));
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 5) {
          o = oN;
        }
        unaligned_store_u16(o, (uint16_t) _mm_cvtsi128_si32(v0_4));
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 5) {
          o = oN;
        }
        unaligned_store_u16(o, (uint16_t) _mm_cvtsi128_si32(v0_3));
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 3) {
          o = oN;
        }
        unaligned_store_u16(o, (uint16_t) _mm_cvtsi128_si32(v0_2));
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 3) {
          o = oN;
        }
        unaligned_store_u16(o, (uint16_t) _mm_cvtsi128_si32(v0_1));
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 1) {
          o = oN;
        }
        unaligned_store_u16(o, (uint16_t) _mm_cvtsi128_si32(v0_0));
        o += 2;
        v0_0 = _mm_srli_epi32(v0_0, 16);
        v0_1 = _mm_srli_epi32(v0_1, 16);
        v0_2 = _mm_srli_epi32(v0_2, 16);
        v0_3 = _mm_srli_epi32(v0_3, 16);
        v0_4 = _mm_srli_epi32(v0_4, 16);
        v0_5 = _mm_srli_epi32(v0_5, 16);
        v0_6 = _mm_srli_epi32(v0_6, 16);
        v0_7 = _mm_srli_epi32(v0_7, 16);
        v0_8 = _mm_srli_epi32(v0_8, 16);
        v0_9 = _mm_srli_epi32(v0_9, 16);
        v0_10 = _mm_srli_epi32(v0_10, 16);
        v0_11 = _mm_srli_epi32(v0_11, 16);
        v0_12 = _mm_srli_epi32(v0_12, 16);
        v0_13 = _mm_srli_epi32(v0_13, 16);
        v0_14 = _mm_srli_epi32(v0_14, 16);
        v0_15 = _mm_srli_epi32(v0_15, 16);
      }
      if (bh & 1) {
        o = (uint8_t*) ((uintptr_t) o + oN_stride);
        *o = (uint8_t) _mm_cvtsi128_si32(v0_15);
        uint8_t* oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 15) {
          o = oN;
        }
        *o = (uint8_t) _mm_cvtsi128_si32(v0_14);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 15) {
          o = oN;
        }
        *o = (uint8_t) _mm_cvtsi128_si32(v0_13);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 13) {
          o = oN;
        }
        *o = (uint8_t) _mm_cvtsi128_si32(v0_12);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 13) {
          o = oN;
        }
        *o = (uint8_t) _mm_cvtsi128_si32(v0_11);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 11) {
          o = oN;
        }
        *o = (uint8_t) _mm_cvtsi128_si32(v0_10);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 11) {
          o = oN;
        }
        *o = (uint8_t) _mm_cvtsi128_si32(v0_9);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 9) {
          o = oN;
        }
        *o = (uint8_t) _mm_cvtsi128_si32(v0_8);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 9) {
          o = oN;
        }
        *o = (uint8_t) _mm_cvtsi128_si32(v0_7);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 7) {
          o = oN;
        }
        *o = (uint8_t) _mm_cvtsi128_si32(v0_6);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 7) {
          o = oN;
        }
        *o = (uint8_t) _mm_cvtsi128_si32(v0_5);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 5) {
          o = oN;
        }
        *o = (uint8_t) _mm_cvtsi128_si32(v0_4);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 5) {
          o = oN;
        }
        *o = (uint8_t) _mm_cvtsi128_si32(v0_3);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 3) {
          o = oN;
        }
        *o = (uint8_t) _mm_cvtsi128_si32(v0_2);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 3) {
          o = oN;
        }
        *o = (uint8_t) _mm_cvtsi128_si32(v0_1);
        oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 1) {
          o = oN;
        }
        *o = (uint8_t) _mm_cvtsi128_si32(v0_0);
      }
    }

    i0 = (const uint8_t*) ((uintptr_t) i0 + input_reset);
    o = (uint8_t*) ((uintptr_t) o + output_reset);
    block_width = doz(block_width, tile_width);
  } while (block_width != 0);
}