xref: /aosp_15_r20/external/XNNPACK/src/x8-transposec/gen/16x16-reuse-mov-wasmsimd.c (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1 // Auto-generated file. Do not edit!
2 //   Template: src/x32-transposec/wasmsimd.c.in
3 //   Generator: tools/xngen
4 //
5 // Copyright 2021 Google LLC
6 //
7 // This source code is licensed under the BSD-style license found in the
8 // LICENSE file in the root directory of this source tree.
9 
10 #include <wasm_simd128.h>
11 
12 #include <assert.h>
13 
14 #include <xnnpack/common.h>
15 #include <xnnpack/math.h>
16 #include <xnnpack/transpose.h>
17 
18 void xnn_x8_transposec_ukernel__16x16_reuse_mov_wasmsimd(
19     const uint8_t* input,
20     uint8_t* output,
21     size_t input_stride,
22     size_t output_stride,
23     size_t block_width,
24     size_t block_height) XNN_OOB_READS
25 {
26   assert(output_stride >= block_height * sizeof(uint8_t));
27   assert(input_stride >= block_width * sizeof(uint8_t));
28 
29   const size_t tile_height = 16;
30   const size_t tile_width = 16;
31   const size_t tile_hbytes = tile_height * sizeof(uint8_t);
32   const size_t tile_wbytes = tile_width * sizeof(uint8_t);
33   const size_t input_reset = tile_wbytes - round_down_po2(block_height, tile_height) * input_stride;
34   const size_t output_reset = tile_width * output_stride - round_down_po2(block_height, 2) * sizeof(uint8_t) - tile_hbytes;
35 
36   const uint8_t* i0 = input;
37   uint8_t* o = (uint8_t*) ((uintptr_t) output - tile_hbytes);
38   const size_t minus_output_stride = -output_stride;
39 
40   do {
41     const size_t rem = min(block_width - 1, 15);
42     const size_t oN_stride = rem * output_stride;
43     const size_t oN_offset = oN_stride + tile_hbytes;
44     size_t bh = block_height;
45     for (; bh >= 16; bh -= 16) {
46       const v128_t v4_0 = wasm_v128_load(i0);
47       i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
48       const v128_t v4_1 = wasm_v128_load(i0);
49       i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
50       const v128_t v4_2 = wasm_v128_load(i0);
51       i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
52       const v128_t v4_3 = wasm_v128_load(i0);
53       i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
54       const v128_t v4_4 = wasm_v128_load(i0);
55       i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
56       const v128_t v4_5 = wasm_v128_load(i0);
57       i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
58       const v128_t v4_6 = wasm_v128_load(i0);
59       i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
60       const v128_t v4_7 = wasm_v128_load(i0);
61       i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
62       const v128_t v4_8 = wasm_v128_load(i0);
63       i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
64       const v128_t v4_9 = wasm_v128_load(i0);
65       i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
66       const v128_t v4_10 = wasm_v128_load(i0);
67       i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
68       const v128_t v4_11 = wasm_v128_load(i0);
69       i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
70       const v128_t v4_12 = wasm_v128_load(i0);
71       i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
72       const v128_t v4_13 = wasm_v128_load(i0);
73       i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
74       const v128_t v4_14 = wasm_v128_load(i0);
75       i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
76       const v128_t v4_15 = wasm_v128_load(i0);
77       i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
78 
79       const v128_t v3_0 = wasm_v8x16_shuffle(v4_0, v4_8, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
80       const v128_t v3_1 = wasm_v8x16_shuffle(v4_0, v4_8, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
81       const v128_t v3_2 = wasm_v8x16_shuffle(v4_1, v4_9, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
82       const v128_t v3_3 = wasm_v8x16_shuffle(v4_1, v4_9, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
83       const v128_t v3_4 = wasm_v8x16_shuffle(v4_2, v4_10, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
84       const v128_t v3_5 = wasm_v8x16_shuffle(v4_2, v4_10, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
85       const v128_t v3_6 = wasm_v8x16_shuffle(v4_3, v4_11, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
86       const v128_t v3_7 = wasm_v8x16_shuffle(v4_3, v4_11, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
87       const v128_t v3_8 = wasm_v8x16_shuffle(v4_4, v4_12, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
88       const v128_t v3_9 = wasm_v8x16_shuffle(v4_4, v4_12, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
89       const v128_t v3_10 = wasm_v8x16_shuffle(v4_5, v4_13, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
90       const v128_t v3_11 = wasm_v8x16_shuffle(v4_5, v4_13, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
91       const v128_t v3_12 = wasm_v8x16_shuffle(v4_6, v4_14, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
92       const v128_t v3_13 = wasm_v8x16_shuffle(v4_6, v4_14, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
93       const v128_t v3_14 = wasm_v8x16_shuffle(v4_7, v4_15, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
94       const v128_t v3_15 = wasm_v8x16_shuffle(v4_7, v4_15, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
95       const v128_t v2_0 = wasm_v8x16_shuffle(v3_0, v3_8, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
96       const v128_t v2_1 = wasm_v8x16_shuffle(v3_0, v3_8, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
97       const v128_t v2_2 = wasm_v8x16_shuffle(v3_1, v3_9, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
98       const v128_t v2_3 = wasm_v8x16_shuffle(v3_1, v3_9, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
99       const v128_t v2_4 = wasm_v8x16_shuffle(v3_2, v3_10, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
100       const v128_t v2_5 = wasm_v8x16_shuffle(v3_2, v3_10, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
101       const v128_t v2_6 = wasm_v8x16_shuffle(v3_3, v3_11, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
102       const v128_t v2_7 = wasm_v8x16_shuffle(v3_3, v3_11, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
103       const v128_t v2_8 = wasm_v8x16_shuffle(v3_4, v3_12, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
104       const v128_t v2_9 = wasm_v8x16_shuffle(v3_4, v3_12, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
105       const v128_t v2_10 = wasm_v8x16_shuffle(v3_5, v3_13, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
106       const v128_t v2_11 = wasm_v8x16_shuffle(v3_5, v3_13, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
107       const v128_t v2_12 = wasm_v8x16_shuffle(v3_6, v3_14, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
108       const v128_t v2_13 = wasm_v8x16_shuffle(v3_6, v3_14, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
109       const v128_t v2_14 = wasm_v8x16_shuffle(v3_7, v3_15, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
110       const v128_t v2_15 = wasm_v8x16_shuffle(v3_7, v3_15, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
111       const v128_t v1_0 = wasm_v8x16_shuffle(v2_0, v2_8, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
112       const v128_t v1_1 = wasm_v8x16_shuffle(v2_0, v2_8, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
113       const v128_t v1_2 = wasm_v8x16_shuffle(v2_1, v2_9, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
114       const v128_t v1_3 = wasm_v8x16_shuffle(v2_1, v2_9, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
115       const v128_t v1_4 = wasm_v8x16_shuffle(v2_2, v2_10, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
116       const v128_t v1_5 = wasm_v8x16_shuffle(v2_2, v2_10, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
117       const v128_t v1_6 = wasm_v8x16_shuffle(v2_3, v2_11, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
118       const v128_t v1_7 = wasm_v8x16_shuffle(v2_3, v2_11, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
119       const v128_t v1_8 = wasm_v8x16_shuffle(v2_4, v2_12, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
120       const v128_t v1_9 = wasm_v8x16_shuffle(v2_4, v2_12, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
121       const v128_t v1_10 = wasm_v8x16_shuffle(v2_5, v2_13, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
122       const v128_t v1_11 = wasm_v8x16_shuffle(v2_5, v2_13, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
123       const v128_t v1_12 = wasm_v8x16_shuffle(v2_6, v2_14, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
124       const v128_t v1_13 = wasm_v8x16_shuffle(v2_6, v2_14, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
125       const v128_t v1_14 = wasm_v8x16_shuffle(v2_7, v2_15, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
126       const v128_t v1_15 = wasm_v8x16_shuffle(v2_7, v2_15, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
127       const v128_t v0_0 = wasm_v8x16_shuffle(v1_0, v1_8, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
128       const v128_t v0_1 = wasm_v8x16_shuffle(v1_0, v1_8, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
129       const v128_t v0_2 = wasm_v8x16_shuffle(v1_1, v1_9, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
130       const v128_t v0_3 = wasm_v8x16_shuffle(v1_1, v1_9, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
131       const v128_t v0_4 = wasm_v8x16_shuffle(v1_2, v1_10, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
132       const v128_t v0_5 = wasm_v8x16_shuffle(v1_2, v1_10, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
133       const v128_t v0_6 = wasm_v8x16_shuffle(v1_3, v1_11, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
134       const v128_t v0_7 = wasm_v8x16_shuffle(v1_3, v1_11, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
135       const v128_t v0_8 = wasm_v8x16_shuffle(v1_4, v1_12, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
136       const v128_t v0_9 = wasm_v8x16_shuffle(v1_4, v1_12, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
137       const v128_t v0_10 = wasm_v8x16_shuffle(v1_5, v1_13, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
138       const v128_t v0_11 = wasm_v8x16_shuffle(v1_5, v1_13, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
139       const v128_t v0_12 = wasm_v8x16_shuffle(v1_6, v1_14, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
140       const v128_t v0_13 = wasm_v8x16_shuffle(v1_6, v1_14, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
141       const v128_t v0_14 = wasm_v8x16_shuffle(v1_7, v1_15, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
142       const v128_t v0_15 = wasm_v8x16_shuffle(v1_7, v1_15, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
143 
144       o = (uint8_t*) ((uintptr_t) o + oN_offset);
145       wasm_v128_store(o, v0_15);
146       uint8_t *oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
147       if XNN_UNPREDICTABLE(block_width > 15) {
148         o = oN;
149       }
150       wasm_v128_store(o, v0_14);
151       oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
152       if XNN_UNPREDICTABLE(block_width >= 15) {
153         o = oN;
154       }
155       wasm_v128_store(o, v0_13);
156       oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
157       if XNN_UNPREDICTABLE(block_width > 13) {
158         o = oN;
159       }
160       wasm_v128_store(o, v0_12);
161       oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
162       if XNN_UNPREDICTABLE(block_width >= 13) {
163         o = oN;
164       }
165       wasm_v128_store(o, v0_11);
166       oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
167       if XNN_UNPREDICTABLE(block_width > 11) {
168         o = oN;
169       }
170       wasm_v128_store(o, v0_10);
171       oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
172       if XNN_UNPREDICTABLE(block_width >= 11) {
173         o = oN;
174       }
175       wasm_v128_store(o, v0_9);
176       oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
177       if XNN_UNPREDICTABLE(block_width > 9) {
178         o = oN;
179       }
180       wasm_v128_store(o, v0_8);
181       oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
182       if XNN_UNPREDICTABLE(block_width >= 9) {
183         o = oN;
184       }
185       wasm_v128_store(o, v0_7);
186       oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
187       if XNN_UNPREDICTABLE(block_width > 7) {
188         o = oN;
189       }
190       wasm_v128_store(o, v0_6);
191       oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
192       if XNN_UNPREDICTABLE(block_width >= 7) {
193         o = oN;
194       }
195       wasm_v128_store(o, v0_5);
196       oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
197       if XNN_UNPREDICTABLE(block_width > 5) {
198         o = oN;
199       }
200       wasm_v128_store(o, v0_4);
201       oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
202       if XNN_UNPREDICTABLE(block_width >= 5) {
203         o = oN;
204       }
205       wasm_v128_store(o, v0_3);
206       oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
207       if XNN_UNPREDICTABLE(block_width > 3) {
208         o = oN;
209       }
210       wasm_v128_store(o, v0_2);
211       oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
212       if XNN_UNPREDICTABLE(block_width >= 3) {
213         o = oN;
214       }
215       wasm_v128_store(o, v0_1);
216       oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
217       if XNN_UNPREDICTABLE(block_width > 1) {
218         o = oN;
219       }
220       wasm_v128_store(o, v0_0);
221     }
222     o = (uint8_t*) ((uintptr_t) o + tile_hbytes);
223 
224     if (bh != 0) {
225       const v128_t v4_0 = wasm_v128_load(i0);
226       const uint8_t *i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
227       if XNN_UNPREDICTABLE(bh < 2) {
228         i1 = i0;
229       }
230       const v128_t v4_1 = wasm_v128_load(i1);
231       const uint8_t *i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
232       if XNN_UNPREDICTABLE(bh <= 2) {
233         i2 = i1;
234       }
235       const v128_t v4_2 = wasm_v128_load(i2);
236       const uint8_t *i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
237       if XNN_UNPREDICTABLE(bh < 4) {
238         i3 = i2;
239       }
240       const v128_t v4_3 = wasm_v128_load(i3);
241       const uint8_t *i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
242       if XNN_UNPREDICTABLE(bh <= 4) {
243         i4 = i3;
244       }
245       const v128_t v4_4 = wasm_v128_load(i4);
246       const uint8_t *i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
247       if XNN_UNPREDICTABLE(bh < 6) {
248         i5 = i4;
249       }
250       const v128_t v4_5 = wasm_v128_load(i5);
251       const uint8_t *i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
252       if XNN_UNPREDICTABLE(bh <= 6) {
253         i6 = i5;
254       }
255       const v128_t v4_6 = wasm_v128_load(i6);
256       const uint8_t *i7 = (const uint8_t*) ((uintptr_t) i6 + input_stride);
257       if XNN_UNPREDICTABLE(bh < 8) {
258         i7 = i6;
259       }
260       const v128_t v4_7 = wasm_v128_load(i7);
261       const uint8_t *i8 = (const uint8_t*) ((uintptr_t) i7 + input_stride);
262       if XNN_UNPREDICTABLE(bh <= 8) {
263         i8 = i7;
264       }
265       const v128_t v4_8 = wasm_v128_load(i8);
266       const uint8_t *i9 = (const uint8_t*) ((uintptr_t) i8 + input_stride);
267       if XNN_UNPREDICTABLE(bh < 10) {
268         i9 = i8;
269       }
270       const v128_t v4_9 = wasm_v128_load(i9);
271       const uint8_t *i10 = (const uint8_t*) ((uintptr_t) i9 + input_stride);
272       if XNN_UNPREDICTABLE(bh <= 10) {
273         i10 = i9;
274       }
275       const v128_t v4_10 = wasm_v128_load(i10);
276       const uint8_t *i11 = (const uint8_t*) ((uintptr_t) i10 + input_stride);
277       if XNN_UNPREDICTABLE(bh < 12) {
278         i11 = i10;
279       }
280       const v128_t v4_11 = wasm_v128_load(i11);
281       const uint8_t *i12 = (const uint8_t*) ((uintptr_t) i11 + input_stride);
282       if XNN_UNPREDICTABLE(bh <= 12) {
283         i12 = i11;
284       }
285       const v128_t v4_12 = wasm_v128_load(i12);
286       const uint8_t *i13 = (const uint8_t*) ((uintptr_t) i12 + input_stride);
287       if XNN_UNPREDICTABLE(bh < 14) {
288         i13 = i12;
289       }
290       const v128_t v4_13 = wasm_v128_load(i13);
291       const uint8_t *i14 = (const uint8_t*) ((uintptr_t) i13 + input_stride);
292       if XNN_UNPREDICTABLE(bh <= 14) {
293         i14 = i13;
294       }
295       const v128_t v4_14 = wasm_v128_load(i14);
296       const v128_t v4_15 = wasm_v128_xor(v4_0, v4_0);
297 
298       const v128_t v3_0 = wasm_v8x16_shuffle(v4_0, v4_8, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
299       const v128_t v3_1 = wasm_v8x16_shuffle(v4_0, v4_8, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
300       const v128_t v3_2 = wasm_v8x16_shuffle(v4_1, v4_9, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
301       const v128_t v3_3 = wasm_v8x16_shuffle(v4_1, v4_9, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
302       const v128_t v3_4 = wasm_v8x16_shuffle(v4_2, v4_10, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
303       const v128_t v3_5 = wasm_v8x16_shuffle(v4_2, v4_10, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
304       const v128_t v3_6 = wasm_v8x16_shuffle(v4_3, v4_11, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
305       const v128_t v3_7 = wasm_v8x16_shuffle(v4_3, v4_11, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
306       const v128_t v3_8 = wasm_v8x16_shuffle(v4_4, v4_12, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
307       const v128_t v3_9 = wasm_v8x16_shuffle(v4_4, v4_12, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
308       const v128_t v3_10 = wasm_v8x16_shuffle(v4_5, v4_13, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
309       const v128_t v3_11 = wasm_v8x16_shuffle(v4_5, v4_13, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
310       const v128_t v3_12 = wasm_v8x16_shuffle(v4_6, v4_14, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
311       const v128_t v3_13 = wasm_v8x16_shuffle(v4_6, v4_14, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
312       const v128_t v3_14 = wasm_v8x16_shuffle(v4_7, v4_15, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
313       const v128_t v3_15 = wasm_v8x16_shuffle(v4_7, v4_15, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
314       const v128_t v2_0 = wasm_v8x16_shuffle(v3_0, v3_8, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
315       const v128_t v2_1 = wasm_v8x16_shuffle(v3_0, v3_8, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
316       const v128_t v2_2 = wasm_v8x16_shuffle(v3_1, v3_9, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
317       const v128_t v2_3 = wasm_v8x16_shuffle(v3_1, v3_9, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
318       const v128_t v2_4 = wasm_v8x16_shuffle(v3_2, v3_10, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
319       const v128_t v2_5 = wasm_v8x16_shuffle(v3_2, v3_10, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
320       const v128_t v2_6 = wasm_v8x16_shuffle(v3_3, v3_11, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
321       const v128_t v2_7 = wasm_v8x16_shuffle(v3_3, v3_11, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
322       const v128_t v2_8 = wasm_v8x16_shuffle(v3_4, v3_12, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
323       const v128_t v2_9 = wasm_v8x16_shuffle(v3_4, v3_12, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
324       const v128_t v2_10 = wasm_v8x16_shuffle(v3_5, v3_13, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
325       const v128_t v2_11 = wasm_v8x16_shuffle(v3_5, v3_13, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
326       const v128_t v2_12 = wasm_v8x16_shuffle(v3_6, v3_14, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
327       const v128_t v2_13 = wasm_v8x16_shuffle(v3_6, v3_14, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
328       const v128_t v2_14 = wasm_v8x16_shuffle(v3_7, v3_15, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
329       const v128_t v2_15 = wasm_v8x16_shuffle(v3_7, v3_15, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
330       const v128_t v1_0 = wasm_v8x16_shuffle(v2_0, v2_8, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
331       const v128_t v1_1 = wasm_v8x16_shuffle(v2_0, v2_8, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
332       const v128_t v1_2 = wasm_v8x16_shuffle(v2_1, v2_9, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
333       const v128_t v1_3 = wasm_v8x16_shuffle(v2_1, v2_9, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
334       const v128_t v1_4 = wasm_v8x16_shuffle(v2_2, v2_10, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
335       const v128_t v1_5 = wasm_v8x16_shuffle(v2_2, v2_10, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
336       const v128_t v1_6 = wasm_v8x16_shuffle(v2_3, v2_11, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
337       const v128_t v1_7 = wasm_v8x16_shuffle(v2_3, v2_11, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
338       const v128_t v1_8 = wasm_v8x16_shuffle(v2_4, v2_12, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
339       const v128_t v1_9 = wasm_v8x16_shuffle(v2_4, v2_12, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
340       const v128_t v1_10 = wasm_v8x16_shuffle(v2_5, v2_13, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
341       const v128_t v1_11 = wasm_v8x16_shuffle(v2_5, v2_13, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
342       const v128_t v1_12 = wasm_v8x16_shuffle(v2_6, v2_14, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
343       const v128_t v1_13 = wasm_v8x16_shuffle(v2_6, v2_14, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
344       const v128_t v1_14 = wasm_v8x16_shuffle(v2_7, v2_15, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
345       const v128_t v1_15 = wasm_v8x16_shuffle(v2_7, v2_15, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
346 
347       v128_t v0_0 = wasm_v8x16_shuffle(v1_0, v1_8, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
348       v128_t v0_1 = wasm_v8x16_shuffle(v1_0, v1_8, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
349       v128_t v0_2 = wasm_v8x16_shuffle(v1_1, v1_9, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
350       v128_t v0_3 = wasm_v8x16_shuffle(v1_1, v1_9, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
351       v128_t v0_4 = wasm_v8x16_shuffle(v1_2, v1_10, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
352       v128_t v0_5 = wasm_v8x16_shuffle(v1_2, v1_10, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
353       v128_t v0_6 = wasm_v8x16_shuffle(v1_3, v1_11, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
354       v128_t v0_7 = wasm_v8x16_shuffle(v1_3, v1_11, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
355       v128_t v0_8 = wasm_v8x16_shuffle(v1_4, v1_12, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
356       v128_t v0_9 = wasm_v8x16_shuffle(v1_4, v1_12, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
357       v128_t v0_10 = wasm_v8x16_shuffle(v1_5, v1_13, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
358       v128_t v0_11 = wasm_v8x16_shuffle(v1_5, v1_13, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
359       v128_t v0_12 = wasm_v8x16_shuffle(v1_6, v1_14, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
360       v128_t v0_13 = wasm_v8x16_shuffle(v1_6, v1_14, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
361       v128_t v0_14 = wasm_v8x16_shuffle(v1_7, v1_15, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
362       v128_t v0_15 = wasm_v8x16_shuffle(v1_7, v1_15, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
363 
364       if (bh & 8) {
365         o = (uint8_t*) ((uintptr_t) o + oN_stride);
366         *((double*) o) = wasm_f64x2_extract_lane(v0_15, 0);
367         uint8_t *oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
368         if XNN_UNPREDICTABLE(block_width > 15) {
369           o = oN;
370         }
371         *((double*) o) = wasm_f64x2_extract_lane(v0_14, 0);
372         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
373         if XNN_UNPREDICTABLE(block_width >= 15) {
374           o = oN;
375         }
376         *((double*) o) = wasm_f64x2_extract_lane(v0_13, 0);
377         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
378         if XNN_UNPREDICTABLE(block_width > 13) {
379           o = oN;
380         }
381         *((double*) o) = wasm_f64x2_extract_lane(v0_12, 0);
382         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
383         if XNN_UNPREDICTABLE(block_width >= 13) {
384           o = oN;
385         }
386         *((double*) o) = wasm_f64x2_extract_lane(v0_11, 0);
387         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
388         if XNN_UNPREDICTABLE(block_width > 11) {
389           o = oN;
390         }
391         *((double*) o) = wasm_f64x2_extract_lane(v0_10, 0);
392         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
393         if XNN_UNPREDICTABLE(block_width >= 11) {
394           o = oN;
395         }
396         *((double*) o) = wasm_f64x2_extract_lane(v0_9, 0);
397         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
398         if XNN_UNPREDICTABLE(block_width > 9) {
399           o = oN;
400         }
401         *((double*) o) = wasm_f64x2_extract_lane(v0_8, 0);
402         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
403         if XNN_UNPREDICTABLE(block_width >= 9) {
404           o = oN;
405         }
406         *((double*) o) = wasm_f64x2_extract_lane(v0_7, 0);
407         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
408         if XNN_UNPREDICTABLE(block_width > 7) {
409           o = oN;
410         }
411         *((double*) o) = wasm_f64x2_extract_lane(v0_6, 0);
412         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
413         if XNN_UNPREDICTABLE(block_width >= 7) {
414           o = oN;
415         }
416         *((double*) o) = wasm_f64x2_extract_lane(v0_5, 0);
417         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
418         if XNN_UNPREDICTABLE(block_width > 5) {
419           o = oN;
420         }
421         *((double*) o) = wasm_f64x2_extract_lane(v0_4, 0);
422         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
423         if XNN_UNPREDICTABLE(block_width >= 5) {
424           o = oN;
425         }
426         *((double*) o) = wasm_f64x2_extract_lane(v0_3, 0);
427         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
428         if XNN_UNPREDICTABLE(block_width > 3) {
429           o = oN;
430         }
431         *((double*) o) = wasm_f64x2_extract_lane(v0_2, 0);
432         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
433         if XNN_UNPREDICTABLE(block_width >= 3) {
434           o = oN;
435         }
436         *((double*) o) = wasm_f64x2_extract_lane(v0_1, 0);
437         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
438         if XNN_UNPREDICTABLE(block_width > 1) {
439           o = oN;
440         }
441         *((double*) o) = wasm_f64x2_extract_lane(v0_0, 0);
442         o += 8;
443         v0_0 = wasm_v64x2_shuffle(v0_0, v0_0, 1, 1);
444         v0_1 = wasm_v64x2_shuffle(v0_1, v0_1, 1, 1);
445         v0_2 = wasm_v64x2_shuffle(v0_2, v0_2, 1, 1);
446         v0_3 = wasm_v64x2_shuffle(v0_3, v0_3, 1, 1);
447         v0_4 = wasm_v64x2_shuffle(v0_4, v0_4, 1, 1);
448         v0_5 = wasm_v64x2_shuffle(v0_5, v0_5, 1, 1);
449         v0_6 = wasm_v64x2_shuffle(v0_6, v0_6, 1, 1);
450         v0_7 = wasm_v64x2_shuffle(v0_7, v0_7, 1, 1);
451         v0_8 = wasm_v64x2_shuffle(v0_8, v0_8, 1, 1);
452         v0_9 = wasm_v64x2_shuffle(v0_9, v0_9, 1, 1);
453         v0_10 = wasm_v64x2_shuffle(v0_10, v0_10, 1, 1);
454         v0_11 = wasm_v64x2_shuffle(v0_11, v0_11, 1, 1);
455         v0_12 = wasm_v64x2_shuffle(v0_12, v0_12, 1, 1);
456         v0_13 = wasm_v64x2_shuffle(v0_13, v0_13, 1, 1);
457         v0_14 = wasm_v64x2_shuffle(v0_14, v0_14, 1, 1);
458         v0_15 = wasm_v64x2_shuffle(v0_15, v0_15, 1, 1);
459       }
460 
461       if (bh & 4) {
462         o = (uint8_t*) ((uintptr_t) o + oN_stride);
463         *((float*) o) = wasm_f32x4_extract_lane(v0_15, 0);
464         uint8_t *oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
465         if XNN_UNPREDICTABLE(block_width > 15) {
466           o = oN;
467         }
468         *((float*) o) = wasm_f32x4_extract_lane(v0_14, 0);
469         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
470         if XNN_UNPREDICTABLE(block_width >= 15) {
471           o = oN;
472         }
473         *((float*) o) = wasm_f32x4_extract_lane(v0_13, 0);
474         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
475         if XNN_UNPREDICTABLE(block_width > 13) {
476           o = oN;
477         }
478         *((float*) o) = wasm_f32x4_extract_lane(v0_12, 0);
479         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
480         if XNN_UNPREDICTABLE(block_width >= 13) {
481           o = oN;
482         }
483         *((float*) o) = wasm_f32x4_extract_lane(v0_11, 0);
484         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
485         if XNN_UNPREDICTABLE(block_width > 11) {
486           o = oN;
487         }
488         *((float*) o) = wasm_f32x4_extract_lane(v0_10, 0);
489         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
490         if XNN_UNPREDICTABLE(block_width >= 11) {
491           o = oN;
492         }
493         *((float*) o) = wasm_f32x4_extract_lane(v0_9, 0);
494         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
495         if XNN_UNPREDICTABLE(block_width > 9) {
496           o = oN;
497         }
498         *((float*) o) = wasm_f32x4_extract_lane(v0_8, 0);
499         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
500         if XNN_UNPREDICTABLE(block_width >= 9) {
501           o = oN;
502         }
503         *((float*) o) = wasm_f32x4_extract_lane(v0_7, 0);
504         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
505         if XNN_UNPREDICTABLE(block_width > 7) {
506           o = oN;
507         }
508         *((float*) o) = wasm_f32x4_extract_lane(v0_6, 0);
509         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
510         if XNN_UNPREDICTABLE(block_width >= 7) {
511           o = oN;
512         }
513         *((float*) o) = wasm_f32x4_extract_lane(v0_5, 0);
514         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
515         if XNN_UNPREDICTABLE(block_width > 5) {
516           o = oN;
517         }
518         *((float*) o) = wasm_f32x4_extract_lane(v0_4, 0);
519         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
520         if XNN_UNPREDICTABLE(block_width >= 5) {
521           o = oN;
522         }
523         *((float*) o) = wasm_f32x4_extract_lane(v0_3, 0);
524         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
525         if XNN_UNPREDICTABLE(block_width > 3) {
526           o = oN;
527         }
528         *((float*) o) = wasm_f32x4_extract_lane(v0_2, 0);
529         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
530         if XNN_UNPREDICTABLE(block_width >= 3) {
531           o = oN;
532         }
533         *((float*) o) = wasm_f32x4_extract_lane(v0_1, 0);
534         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
535         if XNN_UNPREDICTABLE(block_width > 1) {
536           o = oN;
537         }
538         *((float*) o) = wasm_f32x4_extract_lane(v0_0, 0);
539         o += 4;
540         v0_0 = wasm_u64x2_shr(v0_0, 32);
541         v0_1 = wasm_u64x2_shr(v0_1, 32);
542         v0_2 = wasm_u64x2_shr(v0_2, 32);
543         v0_3 = wasm_u64x2_shr(v0_3, 32);
544         v0_4 = wasm_u64x2_shr(v0_4, 32);
545         v0_5 = wasm_u64x2_shr(v0_5, 32);
546         v0_6 = wasm_u64x2_shr(v0_6, 32);
547         v0_7 = wasm_u64x2_shr(v0_7, 32);
548         v0_8 = wasm_u64x2_shr(v0_8, 32);
549         v0_9 = wasm_u64x2_shr(v0_9, 32);
550         v0_10 = wasm_u64x2_shr(v0_10, 32);
551         v0_11 = wasm_u64x2_shr(v0_11, 32);
552         v0_12 = wasm_u64x2_shr(v0_12, 32);
553         v0_13 = wasm_u64x2_shr(v0_13, 32);
554         v0_14 = wasm_u64x2_shr(v0_14, 32);
555         v0_15 = wasm_u64x2_shr(v0_15, 32);
556       }
557       if (bh & 2) {
558         o = (uint8_t*) ((uintptr_t) o + oN_stride);
559         *((uint16_t*) o) = wasm_i16x8_extract_lane(v0_15, 0);
560         uint8_t *oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
561         if XNN_UNPREDICTABLE(block_width > 15) {
562           o = oN;
563         }
564         *((uint16_t*) o) = wasm_i16x8_extract_lane(v0_14, 0);
565         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
566         if XNN_UNPREDICTABLE(block_width >= 15) {
567           o = oN;
568         }
569         *((uint16_t*) o) = wasm_i16x8_extract_lane(v0_13, 0);
570         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
571         if XNN_UNPREDICTABLE(block_width > 13) {
572           o = oN;
573         }
574         *((uint16_t*) o) = wasm_i16x8_extract_lane(v0_12, 0);
575         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
576         if XNN_UNPREDICTABLE(block_width >= 13) {
577           o = oN;
578         }
579         *((uint16_t*) o) = wasm_i16x8_extract_lane(v0_11, 0);
580         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
581         if XNN_UNPREDICTABLE(block_width > 11) {
582           o = oN;
583         }
584         *((uint16_t*) o) = wasm_i16x8_extract_lane(v0_10, 0);
585         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
586         if XNN_UNPREDICTABLE(block_width >= 11) {
587           o = oN;
588         }
589         *((uint16_t*) o) = wasm_i16x8_extract_lane(v0_9, 0);
590         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
591         if XNN_UNPREDICTABLE(block_width > 9) {
592           o = oN;
593         }
594         *((uint16_t*) o) = wasm_i16x8_extract_lane(v0_8, 0);
595         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
596         if XNN_UNPREDICTABLE(block_width >= 9) {
597           o = oN;
598         }
599         *((uint16_t*) o) = wasm_i16x8_extract_lane(v0_7, 0);
600         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
601         if XNN_UNPREDICTABLE(block_width > 7) {
602           o = oN;
603         }
604         *((uint16_t*) o) = wasm_i16x8_extract_lane(v0_6, 0);
605         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
606         if XNN_UNPREDICTABLE(block_width >= 7) {
607           o = oN;
608         }
609         *((uint16_t*) o) = wasm_i16x8_extract_lane(v0_5, 0);
610         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
611         if XNN_UNPREDICTABLE(block_width > 5) {
612           o = oN;
613         }
614         *((uint16_t*) o) = wasm_i16x8_extract_lane(v0_4, 0);
615         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
616         if XNN_UNPREDICTABLE(block_width >= 5) {
617           o = oN;
618         }
619         *((uint16_t*) o) = wasm_i16x8_extract_lane(v0_3, 0);
620         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
621         if XNN_UNPREDICTABLE(block_width > 3) {
622           o = oN;
623         }
624         *((uint16_t*) o) = wasm_i16x8_extract_lane(v0_2, 0);
625         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
626         if XNN_UNPREDICTABLE(block_width >= 3) {
627           o = oN;
628         }
629         *((uint16_t*) o) = wasm_i16x8_extract_lane(v0_1, 0);
630         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
631         if XNN_UNPREDICTABLE(block_width > 1) {
632           o = oN;
633         }
634         *((uint16_t*) o) = wasm_i16x8_extract_lane(v0_0, 0);
635         o += 2;
636         v0_0 = wasm_u32x4_shr(v0_0, 16);
637         v0_1 = wasm_u32x4_shr(v0_1, 16);
638         v0_2 = wasm_u32x4_shr(v0_2, 16);
639         v0_3 = wasm_u32x4_shr(v0_3, 16);
640         v0_4 = wasm_u32x4_shr(v0_4, 16);
641         v0_5 = wasm_u32x4_shr(v0_5, 16);
642         v0_6 = wasm_u32x4_shr(v0_6, 16);
643         v0_7 = wasm_u32x4_shr(v0_7, 16);
644         v0_8 = wasm_u32x4_shr(v0_8, 16);
645         v0_9 = wasm_u32x4_shr(v0_9, 16);
646         v0_10 = wasm_u32x4_shr(v0_10, 16);
647         v0_11 = wasm_u32x4_shr(v0_11, 16);
648         v0_12 = wasm_u32x4_shr(v0_12, 16);
649         v0_13 = wasm_u32x4_shr(v0_13, 16);
650         v0_14 = wasm_u32x4_shr(v0_14, 16);
651         v0_15 = wasm_u32x4_shr(v0_15, 16);
652       }
653       if (bh & 1) {
654         o = (uint8_t*) ((uintptr_t) o + oN_stride);
655         *o = wasm_i8x16_extract_lane(v0_15, 0);
656         uint8_t *oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
657         if XNN_UNPREDICTABLE(block_width > 15) {
658           o = oN;
659         }
660         *o = wasm_i8x16_extract_lane(v0_14, 0);
661         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
662         if XNN_UNPREDICTABLE(block_width >= 15) {
663           o = oN;
664         }
665         *o = wasm_i8x16_extract_lane(v0_13, 0);
666         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
667         if XNN_UNPREDICTABLE(block_width > 13) {
668           o = oN;
669         }
670         *o = wasm_i8x16_extract_lane(v0_12, 0);
671         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
672         if XNN_UNPREDICTABLE(block_width >= 13) {
673           o = oN;
674         }
675         *o = wasm_i8x16_extract_lane(v0_11, 0);
676         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
677         if XNN_UNPREDICTABLE(block_width > 11) {
678           o = oN;
679         }
680         *o = wasm_i8x16_extract_lane(v0_10, 0);
681         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
682         if XNN_UNPREDICTABLE(block_width >= 11) {
683           o = oN;
684         }
685         *o = wasm_i8x16_extract_lane(v0_9, 0);
686         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
687         if XNN_UNPREDICTABLE(block_width > 9) {
688           o = oN;
689         }
690         *o = wasm_i8x16_extract_lane(v0_8, 0);
691         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
692         if XNN_UNPREDICTABLE(block_width >= 9) {
693           o = oN;
694         }
695         *o = wasm_i8x16_extract_lane(v0_7, 0);
696         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
697         if XNN_UNPREDICTABLE(block_width > 7) {
698           o = oN;
699         }
700         *o = wasm_i8x16_extract_lane(v0_6, 0);
701         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
702         if XNN_UNPREDICTABLE(block_width >= 7) {
703           o = oN;
704         }
705         *o = wasm_i8x16_extract_lane(v0_5, 0);
706         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
707         if XNN_UNPREDICTABLE(block_width > 5) {
708           o = oN;
709         }
710         *o = wasm_i8x16_extract_lane(v0_4, 0);
711         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
712         if XNN_UNPREDICTABLE(block_width >= 5) {
713           o = oN;
714         }
715         *o = wasm_i8x16_extract_lane(v0_3, 0);
716         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
717         if XNN_UNPREDICTABLE(block_width > 3) {
718           o = oN;
719         }
720         *o = wasm_i8x16_extract_lane(v0_2, 0);
721         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
722         if XNN_UNPREDICTABLE(block_width >= 3) {
723           o = oN;
724         }
725         *o = wasm_i8x16_extract_lane(v0_1, 0);
726         oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
727         if XNN_UNPREDICTABLE(block_width > 1) {
728           o = oN;
729         }
730         *o = wasm_i8x16_extract_lane(v0_0, 0);
731       }
732     }
733 
734     i0 = (const uint8_t*) ((uintptr_t) i0 + input_reset);
735     o = (uint8_t*) ((uintptr_t) o + output_reset);
736     block_width = doz(block_width, tile_width);
737   } while (block_width != 0);
738 }
739