// xref: /aosp_15_r20/external/XNNPACK/src/x8-transposec/gen/16x16-reuse-switch-zip-neon.c (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
// Auto-generated file. Do not edit!
//   Template: src/x32-transposec/neon-zip.c.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <arm_neon.h>

#include <assert.h>

#include <xnnpack/common.h>
#include <xnnpack/math.h>
#include <xnnpack/transpose.h>
// Transposes a block_width x block_height matrix of uint8_t elements from
// `input` to `output`, processing the input in 16x16 tiles that are
// transposed entirely in NEON registers via four rounds of vzipq_u8.
//
//   input         - source matrix; consecutive rows are input_stride bytes apart
//   output        - destination matrix; consecutive rows are output_stride bytes apart
//   block_width   - number of source columns (= number of destination rows)
//   block_height  - number of source rows (= number of destination columns)
//
// XNN_OOB_READS: input rows are always loaded with full 16-byte vld1q_u8, so
// up to 15 bytes beyond block_width may be read from each input row; the
// caller is responsible for making such reads safe.
void xnn_x8_transposec_ukernel__16x16_reuse_switch_zip_neon(
    const uint8_t* input,
    uint8_t* output,
    size_t input_stride,
    size_t output_stride,
    size_t block_width,
    size_t block_height) XNN_OOB_READS
{
  assert(output_stride >= block_height * sizeof(uint8_t));
  assert(input_stride >= block_width * sizeof(uint8_t));

  const size_t tile_height = 16;
  const size_t tile_width = 16;
  const size_t tile_hbytes = tile_height * sizeof(uint8_t);
  const size_t tile_wbytes = tile_width * sizeof(uint8_t);
  // Pointer adjustments applied after each full pass down block_height:
  // rewind the input to the top of the next 16-column strip, and advance the
  // output past the (up to) 16 destination rows just written.
  const size_t input_reset = tile_wbytes - round_down_po2(block_height, tile_height) * input_stride;
  const size_t output_reset = tile_width * output_stride - round_down_po2(block_height, 2) * sizeof(uint8_t);

  const uint8_t* i0 = input;
  uint8_t* o = (uint8_t*) output;
  // Unsigned negation wraps modulo 2^N; adding this steps one output row back.
  const size_t minus_output_stride = -output_stride;

  do {
    // rem = index of the highest valid destination row in this strip (0..15).
    const size_t rem = min(block_width - 1, 15);
    const size_t oN_stride = rem * output_stride;
    size_t bh = block_height;
    for (; bh >= 16; bh -= 16) {
      // Load a full 16x16 tile: 16 consecutive input rows of 16 bytes each.
      const uint8x16_t v4_0 = vld1q_u8(i0); i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const uint8x16_t v4_1 = vld1q_u8(i0); i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const uint8x16_t v4_2 = vld1q_u8(i0); i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const uint8x16_t v4_3 = vld1q_u8(i0); i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const uint8x16_t v4_4 = vld1q_u8(i0); i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const uint8x16_t v4_5 = vld1q_u8(i0); i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const uint8x16_t v4_6 = vld1q_u8(i0); i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const uint8x16_t v4_7 = vld1q_u8(i0); i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const uint8x16_t v4_8 = vld1q_u8(i0); i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const uint8x16_t v4_9 = vld1q_u8(i0); i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const uint8x16_t v4_10 = vld1q_u8(i0); i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const uint8x16_t v4_11 = vld1q_u8(i0); i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const uint8x16_t v4_12 = vld1q_u8(i0); i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const uint8x16_t v4_13 = vld1q_u8(i0); i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const uint8x16_t v4_14 = vld1q_u8(i0); i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const uint8x16_t v4_15 = vld1q_u8(i0); i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);

      // Transpose the tile in registers: four rounds of zips pairing rows
      // that are 8, then 4, then 2, then 1 apart in the original tile.
      const uint8x16x2_t v3_0 = vzipq_u8(v4_0, v4_8);
      const uint8x16x2_t v3_1 = vzipq_u8(v4_1, v4_9);
      const uint8x16x2_t v3_2 = vzipq_u8(v4_2, v4_10);
      const uint8x16x2_t v3_3 = vzipq_u8(v4_3, v4_11);
      const uint8x16x2_t v3_4 = vzipq_u8(v4_4, v4_12);
      const uint8x16x2_t v3_5 = vzipq_u8(v4_5, v4_13);
      const uint8x16x2_t v3_6 = vzipq_u8(v4_6, v4_14);
      const uint8x16x2_t v3_7 = vzipq_u8(v4_7, v4_15);

      const uint8x16x2_t v2_0 = vzipq_u8(v3_0.val[0], v3_4.val[0]);
      const uint8x16x2_t v2_1 = vzipq_u8(v3_0.val[1], v3_4.val[1]);
      const uint8x16x2_t v2_2 = vzipq_u8(v3_1.val[0], v3_5.val[0]);
      const uint8x16x2_t v2_3 = vzipq_u8(v3_1.val[1], v3_5.val[1]);
      const uint8x16x2_t v2_4 = vzipq_u8(v3_2.val[0], v3_6.val[0]);
      const uint8x16x2_t v2_5 = vzipq_u8(v3_2.val[1], v3_6.val[1]);
      const uint8x16x2_t v2_6 = vzipq_u8(v3_3.val[0], v3_7.val[0]);
      const uint8x16x2_t v2_7 = vzipq_u8(v3_3.val[1], v3_7.val[1]);
      const uint8x16x2_t v1_0 = vzipq_u8(v2_0.val[0], v2_4.val[0]);
      const uint8x16x2_t v1_1 = vzipq_u8(v2_0.val[1], v2_4.val[1]);
      const uint8x16x2_t v1_2 = vzipq_u8(v2_1.val[0], v2_5.val[0]);
      const uint8x16x2_t v1_3 = vzipq_u8(v2_1.val[1], v2_5.val[1]);
      const uint8x16x2_t v1_4 = vzipq_u8(v2_2.val[0], v2_6.val[0]);
      const uint8x16x2_t v1_5 = vzipq_u8(v2_2.val[1], v2_6.val[1]);
      const uint8x16x2_t v1_6 = vzipq_u8(v2_3.val[0], v2_7.val[0]);
      const uint8x16x2_t v1_7 = vzipq_u8(v2_3.val[1], v2_7.val[1]);
      const uint8x16x2_t v0_0 = vzipq_u8(v1_0.val[0], v1_4.val[0]);
      const uint8x16x2_t v0_1 = vzipq_u8(v1_0.val[1], v1_4.val[1]);
      const uint8x16x2_t v0_2 = vzipq_u8(v1_1.val[0], v1_5.val[0]);
      const uint8x16x2_t v0_3 = vzipq_u8(v1_1.val[1], v1_5.val[1]);
      const uint8x16x2_t v0_4 = vzipq_u8(v1_2.val[0], v1_6.val[0]);
      const uint8x16x2_t v0_5 = vzipq_u8(v1_2.val[1], v1_6.val[1]);
      const uint8x16x2_t v0_6 = vzipq_u8(v1_3.val[0], v1_7.val[0]);
      const uint8x16x2_t v0_7 = vzipq_u8(v1_3.val[1], v1_7.val[1]);

      // Destination row k of this strip is v0_{k>>1}.val[k&1].  Store rows
      // rem..1 walking backward from the highest row, then row 0 via `o`.
      // All cases intentionally fall through.
      uint8_t *oN = (uint8_t*) ((uintptr_t) o + oN_stride);
      switch (rem) {
        case 15:
          vst1q_u8(oN, v0_7.val[1]); oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
        case 14:
          vst1q_u8(oN, v0_7.val[0]); oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
        case 13:
          vst1q_u8(oN, v0_6.val[1]); oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
        case 12:
          vst1q_u8(oN, v0_6.val[0]); oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
        case 11:
          vst1q_u8(oN, v0_5.val[1]); oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
        case 10:
          vst1q_u8(oN, v0_5.val[0]); oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
        case 9:
          vst1q_u8(oN, v0_4.val[1]); oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
        case 8:
          vst1q_u8(oN, v0_4.val[0]); oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
        case 7:
          vst1q_u8(oN, v0_3.val[1]); oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
        case 6:
          vst1q_u8(oN, v0_3.val[0]); oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
        case 5:
          vst1q_u8(oN, v0_2.val[1]); oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
        case 4:
          vst1q_u8(oN, v0_2.val[0]); oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
        case 3:
          vst1q_u8(oN, v0_1.val[1]); oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
        case 2:
          vst1q_u8(oN, v0_1.val[0]); oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
        case 1:
          vst1q_u8(oN, v0_0.val[1]);
        case 0:
          vst1q_u8(o, v0_0.val[0]); o = (uint8_t*) ((uintptr_t) o + tile_hbytes);
          break;
        default:
          XNN_UNREACHABLE;
      }
    }

    if (bh != 0) {
      // Tail: 1..15 input rows remain.  Row pointers past the end are clamped
      // to the previous row; those lanes are loaded but never stored.
      const uint8x16_t v4_0 = vld1q_u8(i0);
      const uint8_t *i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
      if XNN_UNPREDICTABLE(bh < 2) {
        i1 = i0;
      }
      const uint8x16_t v4_1 = vld1q_u8(i1);
      const uint8_t *i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
      if XNN_UNPREDICTABLE(bh <= 2) {
        i2 = i1;
      }
      const uint8x16_t v4_2 = vld1q_u8(i2);
      const uint8_t *i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
      if XNN_UNPREDICTABLE(bh < 4) {
        i3 = i2;
      }
      const uint8x16_t v4_3 = vld1q_u8(i3);
      const uint8_t *i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
      if XNN_UNPREDICTABLE(bh <= 4) {
        i4 = i3;
      }
      const uint8x16_t v4_4 = vld1q_u8(i4);
      const uint8_t *i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
      if XNN_UNPREDICTABLE(bh < 6) {
        i5 = i4;
      }
      const uint8x16_t v4_5 = vld1q_u8(i5);
      const uint8_t *i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
      if XNN_UNPREDICTABLE(bh <= 6) {
        i6 = i5;
      }
      const uint8x16_t v4_6 = vld1q_u8(i6);
      const uint8_t *i7 = (const uint8_t*) ((uintptr_t) i6 + input_stride);
      if XNN_UNPREDICTABLE(bh < 8) {
        i7 = i6;
      }
      const uint8x16_t v4_7 = vld1q_u8(i7);
      const uint8_t *i8 = (const uint8_t*) ((uintptr_t) i7 + input_stride);
      if XNN_UNPREDICTABLE(bh <= 8) {
        i8 = i7;
      }
      const uint8x16_t v4_8 = vld1q_u8(i8);
      const uint8_t *i9 = (const uint8_t*) ((uintptr_t) i8 + input_stride);
      if XNN_UNPREDICTABLE(bh < 10) {
        i9 = i8;
      }
      const uint8x16_t v4_9 = vld1q_u8(i9);
      const uint8_t *i10 = (const uint8_t*) ((uintptr_t) i9 + input_stride);
      if XNN_UNPREDICTABLE(bh <= 10) {
        i10 = i9;
      }
      const uint8x16_t v4_10 = vld1q_u8(i10);
      const uint8_t *i11 = (const uint8_t*) ((uintptr_t) i10 + input_stride);
      if XNN_UNPREDICTABLE(bh < 12) {
        i11 = i10;
      }
      const uint8x16_t v4_11 = vld1q_u8(i11);
      const uint8_t *i12 = (const uint8_t*) ((uintptr_t) i11 + input_stride);
      if XNN_UNPREDICTABLE(bh <= 12) {
        i12 = i11;
      }
      const uint8x16_t v4_12 = vld1q_u8(i12);
      const uint8_t *i13 = (const uint8_t*) ((uintptr_t) i12 + input_stride);
      if XNN_UNPREDICTABLE(bh < 14) {
        i13 = i12;
      }
      const uint8x16_t v4_13 = vld1q_u8(i13);
      const uint8_t *i14 = (const uint8_t*) ((uintptr_t) i13 + input_stride);
      if XNN_UNPREDICTABLE(bh <= 14) {
        i14 = i13;
      }
      const uint8x16_t v4_14 = vld1q_u8(i14);
      // Row 15 can never be valid in the tail (bh <= 15): substitute zeroes.
      const uint8x16_t v4_15 = vmovq_n_u8(0);

      // Same four-round zip transpose as the main loop.
      const uint8x16x2_t v3_0 = vzipq_u8(v4_0, v4_8);
      const uint8x16x2_t v3_1 = vzipq_u8(v4_1, v4_9);
      const uint8x16x2_t v3_2 = vzipq_u8(v4_2, v4_10);
      const uint8x16x2_t v3_3 = vzipq_u8(v4_3, v4_11);
      const uint8x16x2_t v3_4 = vzipq_u8(v4_4, v4_12);
      const uint8x16x2_t v3_5 = vzipq_u8(v4_5, v4_13);
      const uint8x16x2_t v3_6 = vzipq_u8(v4_6, v4_14);
      const uint8x16x2_t v3_7 = vzipq_u8(v4_7, v4_15);

      const uint8x16x2_t v2_0 = vzipq_u8(v3_0.val[0], v3_4.val[0]);
      const uint8x16x2_t v2_1 = vzipq_u8(v3_0.val[1], v3_4.val[1]);
      const uint8x16x2_t v2_2 = vzipq_u8(v3_1.val[0], v3_5.val[0]);
      const uint8x16x2_t v2_3 = vzipq_u8(v3_1.val[1], v3_5.val[1]);
      const uint8x16x2_t v2_4 = vzipq_u8(v3_2.val[0], v3_6.val[0]);
      const uint8x16x2_t v2_5 = vzipq_u8(v3_2.val[1], v3_6.val[1]);
      const uint8x16x2_t v2_6 = vzipq_u8(v3_3.val[0], v3_7.val[0]);
      const uint8x16x2_t v2_7 = vzipq_u8(v3_3.val[1], v3_7.val[1]);
      const uint8x16x2_t v1_0 = vzipq_u8(v2_0.val[0], v2_4.val[0]);
      const uint8x16x2_t v1_1 = vzipq_u8(v2_0.val[1], v2_4.val[1]);
      const uint8x16x2_t v1_2 = vzipq_u8(v2_1.val[0], v2_5.val[0]);
      const uint8x16x2_t v1_3 = vzipq_u8(v2_1.val[1], v2_5.val[1]);
      const uint8x16x2_t v1_4 = vzipq_u8(v2_2.val[0], v2_6.val[0]);
      const uint8x16x2_t v1_5 = vzipq_u8(v2_2.val[1], v2_6.val[1]);
      const uint8x16x2_t v1_6 = vzipq_u8(v2_3.val[0], v2_7.val[0]);
      const uint8x16x2_t v1_7 = vzipq_u8(v2_3.val[1], v2_7.val[1]);
      const uint8x16x2_t v0_0 = vzipq_u8(v1_0.val[0], v1_4.val[0]);
      const uint8x16x2_t v0_1 = vzipq_u8(v1_0.val[1], v1_4.val[1]);
      const uint8x16x2_t v0_2 = vzipq_u8(v1_1.val[0], v1_5.val[0]);
      const uint8x16x2_t v0_3 = vzipq_u8(v1_1.val[1], v1_5.val[1]);
      const uint8x16x2_t v0_4 = vzipq_u8(v1_2.val[0], v1_6.val[0]);
      const uint8x16x2_t v0_5 = vzipq_u8(v1_2.val[1], v1_6.val[1]);
      const uint8x16x2_t v0_6 = vzipq_u8(v1_3.val[0], v1_7.val[0]);
      const uint8x16x2_t v0_7 = vzipq_u8(v1_3.val[1], v1_7.val[1]);

      // Split each transposed destination row into 8-byte halves; the partial
      // stores below consume v{k}_low for destination row k.
      uint8x8_t v0_low = vget_low_u8(v0_0.val[0]);
      uint8x8_t v1_low = vget_low_u8(v0_0.val[1]);
      uint8x8_t v2_low = vget_low_u8(v0_1.val[0]);
      uint8x8_t v3_low = vget_low_u8(v0_1.val[1]);
      uint8x8_t v4_low = vget_low_u8(v0_2.val[0]);
      uint8x8_t v5_low = vget_low_u8(v0_2.val[1]);
      uint8x8_t v6_low = vget_low_u8(v0_3.val[0]);
      uint8x8_t v7_low = vget_low_u8(v0_3.val[1]);
      uint8x8_t v8_low = vget_low_u8(v0_4.val[0]);
      uint8x8_t v9_low = vget_low_u8(v0_4.val[1]);
      uint8x8_t v10_low = vget_low_u8(v0_5.val[0]);
      uint8x8_t v11_low = vget_low_u8(v0_5.val[1]);
      uint8x8_t v12_low = vget_low_u8(v0_6.val[0]);
      uint8x8_t v13_low = vget_low_u8(v0_6.val[1]);
      uint8x8_t v14_low = vget_low_u8(v0_7.val[0]);
      uint8x8_t v15_low = vget_low_u8(v0_7.val[1]);

      // Store the remaining bh elements of every destination row in chunks of
      // 8, 4, 2, 1; each switch falls through like the main loop's.
      if (bh & 8) {
        uint8_t* oN = (uint8_t*) ((uintptr_t) o + oN_stride);
        switch (rem) {
          case 15:
            vst1_u8(oN, v15_low); oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 14:
            vst1_u8(oN, v14_low); oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 13:
            vst1_u8(oN, v13_low); oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 12:
            vst1_u8(oN, v12_low); oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 11:
            vst1_u8(oN, v11_low); oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 10:
            vst1_u8(oN, v10_low); oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 9:
            vst1_u8(oN, v9_low); oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 8:
            vst1_u8(oN, v8_low); oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 7:
            vst1_u8(oN, v7_low); oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 6:
            vst1_u8(oN, v6_low); oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 5:
            vst1_u8(oN, v5_low); oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 4:
            vst1_u8(oN, v4_low); oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 3:
            vst1_u8(oN, v3_low); oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 2:
            vst1_u8(oN, v2_low); oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 1:
            vst1_u8(oN, v1_low);
          case 0:
            vst1_u8(o, v0_low); o += 8;
            break;
          default:
            XNN_UNREACHABLE;
        }
        // First 8 elements stored: continue with the high halves.
        v0_low = vget_high_u8(v0_0.val[0]);
        v1_low = vget_high_u8(v0_0.val[1]);
        v2_low = vget_high_u8(v0_1.val[0]);
        v3_low = vget_high_u8(v0_1.val[1]);
        v4_low = vget_high_u8(v0_2.val[0]);
        v5_low = vget_high_u8(v0_2.val[1]);
        v6_low = vget_high_u8(v0_3.val[0]);
        v7_low = vget_high_u8(v0_3.val[1]);
        v8_low = vget_high_u8(v0_4.val[0]);
        v9_low = vget_high_u8(v0_4.val[1]);
        v10_low = vget_high_u8(v0_5.val[0]);
        v11_low = vget_high_u8(v0_5.val[1]);
        v12_low = vget_high_u8(v0_6.val[0]);
        v13_low = vget_high_u8(v0_6.val[1]);
        v14_low = vget_high_u8(v0_7.val[0]);
        v15_low = vget_high_u8(v0_7.val[1]);
      }

      if (bh & 4) {
        uint8_t* oN = (uint8_t*) ((uintptr_t) o + oN_stride);
        switch (rem) {
          case 15:
            vst1_lane_u32((void*) oN, vreinterpret_u32_u8(v15_low), 0); oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 14:
            vst1_lane_u32((void*) oN, vreinterpret_u32_u8(v14_low), 0); oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 13:
            vst1_lane_u32((void*) oN, vreinterpret_u32_u8(v13_low), 0); oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 12:
            vst1_lane_u32((void*) oN, vreinterpret_u32_u8(v12_low), 0); oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 11:
            vst1_lane_u32((void*) oN, vreinterpret_u32_u8(v11_low), 0); oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 10:
            vst1_lane_u32((void*) oN, vreinterpret_u32_u8(v10_low), 0); oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 9:
            vst1_lane_u32((void*) oN, vreinterpret_u32_u8(v9_low), 0); oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 8:
            vst1_lane_u32((void*) oN, vreinterpret_u32_u8(v8_low), 0); oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 7:
            vst1_lane_u32((void*) oN, vreinterpret_u32_u8(v7_low), 0); oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 6:
            vst1_lane_u32((void*) oN, vreinterpret_u32_u8(v6_low), 0); oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 5:
            vst1_lane_u32((void*) oN, vreinterpret_u32_u8(v5_low), 0); oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 4:
            vst1_lane_u32((void*) oN, vreinterpret_u32_u8(v4_low), 0); oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 3:
            vst1_lane_u32((void*) oN, vreinterpret_u32_u8(v3_low), 0); oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 2:
            vst1_lane_u32((void*) oN, vreinterpret_u32_u8(v2_low), 0); oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 1:
            vst1_lane_u32((void*) oN, vreinterpret_u32_u8(v1_low), 0);
          case 0:
            vst1_lane_u32((void*) o, vreinterpret_u32_u8(v0_low), 0); o += 4;
            break;
          default:
            XNN_UNREACHABLE;
        }
        // Rotate each half-vector by 4 bytes so the next bytes to store are
        // in lane 0.
        v0_low = vext_u8(v0_low, v0_low, 4);
        v1_low = vext_u8(v1_low, v1_low, 4);
        v2_low = vext_u8(v2_low, v2_low, 4);
        v3_low = vext_u8(v3_low, v3_low, 4);
        v4_low = vext_u8(v4_low, v4_low, 4);
        v5_low = vext_u8(v5_low, v5_low, 4);
        v6_low = vext_u8(v6_low, v6_low, 4);
        v7_low = vext_u8(v7_low, v7_low, 4);
        v8_low = vext_u8(v8_low, v8_low, 4);
        v9_low = vext_u8(v9_low, v9_low, 4);
        v10_low = vext_u8(v10_low, v10_low, 4);
        v11_low = vext_u8(v11_low, v11_low, 4);
        v12_low = vext_u8(v12_low, v12_low, 4);
        v13_low = vext_u8(v13_low, v13_low, 4);
        v14_low = vext_u8(v14_low, v14_low, 4);
        v15_low = vext_u8(v15_low, v15_low, 4);
      }
      if (bh & 2) {
        uint8_t* oN = (uint8_t*) ((uintptr_t) o + oN_stride);
        switch (rem) {
          case 15:
            vst1_lane_u16((void*) oN, vreinterpret_u16_u8(v15_low), 0); oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 14:
            vst1_lane_u16((void*) oN, vreinterpret_u16_u8(v14_low), 0); oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 13:
            vst1_lane_u16((void*) oN, vreinterpret_u16_u8(v13_low), 0); oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 12:
            vst1_lane_u16((void*) oN, vreinterpret_u16_u8(v12_low), 0); oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 11:
            vst1_lane_u16((void*) oN, vreinterpret_u16_u8(v11_low), 0); oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 10:
            vst1_lane_u16((void*) oN, vreinterpret_u16_u8(v10_low), 0); oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 9:
            vst1_lane_u16((void*) oN, vreinterpret_u16_u8(v9_low), 0); oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 8:
            vst1_lane_u16((void*) oN, vreinterpret_u16_u8(v8_low), 0); oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 7:
            vst1_lane_u16((void*) oN, vreinterpret_u16_u8(v7_low), 0); oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 6:
            vst1_lane_u16((void*) oN, vreinterpret_u16_u8(v6_low), 0); oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 5:
            vst1_lane_u16((void*) oN, vreinterpret_u16_u8(v5_low), 0); oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 4:
            vst1_lane_u16((void*) oN, vreinterpret_u16_u8(v4_low), 0); oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 3:
            vst1_lane_u16((void*) oN, vreinterpret_u16_u8(v3_low), 0); oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 2:
            vst1_lane_u16((void*) oN, vreinterpret_u16_u8(v2_low), 0); oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 1:
            vst1_lane_u16((void*) oN, vreinterpret_u16_u8(v1_low), 0);
          case 0:
            vst1_lane_u16((void*) o, vreinterpret_u16_u8(v0_low), 0); o += 2;
            break;
          default:
            XNN_UNREACHABLE;
        }
        // Rotate by 2 bytes for the final single-byte stores.
        v0_low = vext_u8(v0_low, v0_low, 2);
        v1_low = vext_u8(v1_low, v1_low, 2);
        v2_low = vext_u8(v2_low, v2_low, 2);
        v3_low = vext_u8(v3_low, v3_low, 2);
        v4_low = vext_u8(v4_low, v4_low, 2);
        v5_low = vext_u8(v5_low, v5_low, 2);
        v6_low = vext_u8(v6_low, v6_low, 2);
        v7_low = vext_u8(v7_low, v7_low, 2);
        v8_low = vext_u8(v8_low, v8_low, 2);
        v9_low = vext_u8(v9_low, v9_low, 2);
        v10_low = vext_u8(v10_low, v10_low, 2);
        v11_low = vext_u8(v11_low, v11_low, 2);
        v12_low = vext_u8(v12_low, v12_low, 2);
        v13_low = vext_u8(v13_low, v13_low, 2);
        v14_low = vext_u8(v14_low, v14_low, 2);
        v15_low = vext_u8(v15_low, v15_low, 2);
      }
      if (bh & 1) {
        uint8_t* oN = (uint8_t*) ((uintptr_t) o + oN_stride);
        switch (rem) {
          case 15:
            vst1_lane_u8(oN, v15_low, 0); oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 14:
            vst1_lane_u8(oN, v14_low, 0); oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 13:
            vst1_lane_u8(oN, v13_low, 0); oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 12:
            vst1_lane_u8(oN, v12_low, 0); oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 11:
            vst1_lane_u8(oN, v11_low, 0); oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 10:
            vst1_lane_u8(oN, v10_low, 0); oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 9:
            vst1_lane_u8(oN, v9_low, 0); oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 8:
            vst1_lane_u8(oN, v8_low, 0); oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 7:
            vst1_lane_u8(oN, v7_low, 0); oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 6:
            vst1_lane_u8(oN, v6_low, 0); oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 5:
            vst1_lane_u8(oN, v5_low, 0); oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 4:
            vst1_lane_u8(oN, v4_low, 0); oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 3:
            vst1_lane_u8(oN, v3_low, 0); oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 2:
            vst1_lane_u8(oN, v2_low, 0); oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 1:
            vst1_lane_u8(oN, v1_low, 0);
          case 0:
            vst1_lane_u8(o, v0_low, 0);
            break;
          default:
            XNN_UNREACHABLE;
        }
      }
    }

    // Advance to the next 16-column strip of the input.
    i0 = (const uint8_t*) ((uintptr_t) i0 + input_reset);
    o = (uint8_t*) ((uintptr_t) o + output_reset);
    block_width = doz(block_width, tile_width);  // doz: saturating (floor-at-zero) subtract
  } while (block_width != 0);
}
477