1 // Auto-generated file. Do not edit!
2 // Template: src/x32-transposec/wasmsimd.c.in
3 // Generator: tools/xngen
4 //
5 // Copyright 2021 Google LLC
6 //
7 // This source code is licensed under the BSD-style license found in the
8 // LICENSE file in the root directory of this source tree.
9
10 #include <wasm_simd128.h>
11
12 #include <assert.h>
13
14 #include <xnnpack/common.h>
15 #include <xnnpack/math.h>
16 #include <xnnpack/transpose.h>
17
xnn_x8_transposec_ukernel__16x16_reuse_mov_wasmsimd(const uint8_t * input,uint8_t * output,size_t input_stride,size_t output_stride,size_t block_width,size_t block_height)18 void xnn_x8_transposec_ukernel__16x16_reuse_mov_wasmsimd(
19 const uint8_t* input,
20 uint8_t* output,
21 size_t input_stride,
22 size_t output_stride,
23 size_t block_width,
24 size_t block_height) XNN_OOB_READS
25 {
26 assert(output_stride >= block_height * sizeof(uint8_t));
27 assert(input_stride >= block_width * sizeof(uint8_t));
28
29 const size_t tile_height = 16;
30 const size_t tile_width = 16;
31 const size_t tile_hbytes = tile_height * sizeof(uint8_t);
32 const size_t tile_wbytes = tile_width * sizeof(uint8_t);
33 const size_t input_reset = tile_wbytes - round_down_po2(block_height, tile_height) * input_stride;
34 const size_t output_reset = tile_width * output_stride - round_down_po2(block_height, 2) * sizeof(uint8_t) - tile_hbytes;
35
36 const uint8_t* i0 = input;
37 uint8_t* o = (uint8_t*) ((uintptr_t) output - tile_hbytes);
38 const size_t minus_output_stride = -output_stride;
39
40 do {
41 const size_t rem = min(block_width - 1, 15);
42 const size_t oN_stride = rem * output_stride;
43 const size_t oN_offset = oN_stride + tile_hbytes;
44 size_t bh = block_height;
45 for (; bh >= 16; bh -= 16) {
46 const v128_t v4_0 = wasm_v128_load(i0);
47 i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
48 const v128_t v4_1 = wasm_v128_load(i0);
49 i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
50 const v128_t v4_2 = wasm_v128_load(i0);
51 i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
52 const v128_t v4_3 = wasm_v128_load(i0);
53 i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
54 const v128_t v4_4 = wasm_v128_load(i0);
55 i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
56 const v128_t v4_5 = wasm_v128_load(i0);
57 i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
58 const v128_t v4_6 = wasm_v128_load(i0);
59 i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
60 const v128_t v4_7 = wasm_v128_load(i0);
61 i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
62 const v128_t v4_8 = wasm_v128_load(i0);
63 i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
64 const v128_t v4_9 = wasm_v128_load(i0);
65 i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
66 const v128_t v4_10 = wasm_v128_load(i0);
67 i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
68 const v128_t v4_11 = wasm_v128_load(i0);
69 i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
70 const v128_t v4_12 = wasm_v128_load(i0);
71 i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
72 const v128_t v4_13 = wasm_v128_load(i0);
73 i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
74 const v128_t v4_14 = wasm_v128_load(i0);
75 i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
76 const v128_t v4_15 = wasm_v128_load(i0);
77 i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
78
79 const v128_t v3_0 = wasm_v8x16_shuffle(v4_0, v4_8, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
80 const v128_t v3_1 = wasm_v8x16_shuffle(v4_0, v4_8, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
81 const v128_t v3_2 = wasm_v8x16_shuffle(v4_1, v4_9, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
82 const v128_t v3_3 = wasm_v8x16_shuffle(v4_1, v4_9, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
83 const v128_t v3_4 = wasm_v8x16_shuffle(v4_2, v4_10, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
84 const v128_t v3_5 = wasm_v8x16_shuffle(v4_2, v4_10, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
85 const v128_t v3_6 = wasm_v8x16_shuffle(v4_3, v4_11, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
86 const v128_t v3_7 = wasm_v8x16_shuffle(v4_3, v4_11, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
87 const v128_t v3_8 = wasm_v8x16_shuffle(v4_4, v4_12, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
88 const v128_t v3_9 = wasm_v8x16_shuffle(v4_4, v4_12, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
89 const v128_t v3_10 = wasm_v8x16_shuffle(v4_5, v4_13, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
90 const v128_t v3_11 = wasm_v8x16_shuffle(v4_5, v4_13, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
91 const v128_t v3_12 = wasm_v8x16_shuffle(v4_6, v4_14, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
92 const v128_t v3_13 = wasm_v8x16_shuffle(v4_6, v4_14, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
93 const v128_t v3_14 = wasm_v8x16_shuffle(v4_7, v4_15, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
94 const v128_t v3_15 = wasm_v8x16_shuffle(v4_7, v4_15, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
95 const v128_t v2_0 = wasm_v8x16_shuffle(v3_0, v3_8, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
96 const v128_t v2_1 = wasm_v8x16_shuffle(v3_0, v3_8, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
97 const v128_t v2_2 = wasm_v8x16_shuffle(v3_1, v3_9, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
98 const v128_t v2_3 = wasm_v8x16_shuffle(v3_1, v3_9, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
99 const v128_t v2_4 = wasm_v8x16_shuffle(v3_2, v3_10, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
100 const v128_t v2_5 = wasm_v8x16_shuffle(v3_2, v3_10, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
101 const v128_t v2_6 = wasm_v8x16_shuffle(v3_3, v3_11, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
102 const v128_t v2_7 = wasm_v8x16_shuffle(v3_3, v3_11, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
103 const v128_t v2_8 = wasm_v8x16_shuffle(v3_4, v3_12, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
104 const v128_t v2_9 = wasm_v8x16_shuffle(v3_4, v3_12, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
105 const v128_t v2_10 = wasm_v8x16_shuffle(v3_5, v3_13, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
106 const v128_t v2_11 = wasm_v8x16_shuffle(v3_5, v3_13, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
107 const v128_t v2_12 = wasm_v8x16_shuffle(v3_6, v3_14, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
108 const v128_t v2_13 = wasm_v8x16_shuffle(v3_6, v3_14, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
109 const v128_t v2_14 = wasm_v8x16_shuffle(v3_7, v3_15, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
110 const v128_t v2_15 = wasm_v8x16_shuffle(v3_7, v3_15, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
111 const v128_t v1_0 = wasm_v8x16_shuffle(v2_0, v2_8, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
112 const v128_t v1_1 = wasm_v8x16_shuffle(v2_0, v2_8, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
113 const v128_t v1_2 = wasm_v8x16_shuffle(v2_1, v2_9, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
114 const v128_t v1_3 = wasm_v8x16_shuffle(v2_1, v2_9, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
115 const v128_t v1_4 = wasm_v8x16_shuffle(v2_2, v2_10, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
116 const v128_t v1_5 = wasm_v8x16_shuffle(v2_2, v2_10, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
117 const v128_t v1_6 = wasm_v8x16_shuffle(v2_3, v2_11, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
118 const v128_t v1_7 = wasm_v8x16_shuffle(v2_3, v2_11, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
119 const v128_t v1_8 = wasm_v8x16_shuffle(v2_4, v2_12, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
120 const v128_t v1_9 = wasm_v8x16_shuffle(v2_4, v2_12, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
121 const v128_t v1_10 = wasm_v8x16_shuffle(v2_5, v2_13, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
122 const v128_t v1_11 = wasm_v8x16_shuffle(v2_5, v2_13, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
123 const v128_t v1_12 = wasm_v8x16_shuffle(v2_6, v2_14, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
124 const v128_t v1_13 = wasm_v8x16_shuffle(v2_6, v2_14, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
125 const v128_t v1_14 = wasm_v8x16_shuffle(v2_7, v2_15, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
126 const v128_t v1_15 = wasm_v8x16_shuffle(v2_7, v2_15, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
127 const v128_t v0_0 = wasm_v8x16_shuffle(v1_0, v1_8, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
128 const v128_t v0_1 = wasm_v8x16_shuffle(v1_0, v1_8, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
129 const v128_t v0_2 = wasm_v8x16_shuffle(v1_1, v1_9, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
130 const v128_t v0_3 = wasm_v8x16_shuffle(v1_1, v1_9, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
131 const v128_t v0_4 = wasm_v8x16_shuffle(v1_2, v1_10, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
132 const v128_t v0_5 = wasm_v8x16_shuffle(v1_2, v1_10, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
133 const v128_t v0_6 = wasm_v8x16_shuffle(v1_3, v1_11, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
134 const v128_t v0_7 = wasm_v8x16_shuffle(v1_3, v1_11, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
135 const v128_t v0_8 = wasm_v8x16_shuffle(v1_4, v1_12, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
136 const v128_t v0_9 = wasm_v8x16_shuffle(v1_4, v1_12, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
137 const v128_t v0_10 = wasm_v8x16_shuffle(v1_5, v1_13, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
138 const v128_t v0_11 = wasm_v8x16_shuffle(v1_5, v1_13, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
139 const v128_t v0_12 = wasm_v8x16_shuffle(v1_6, v1_14, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
140 const v128_t v0_13 = wasm_v8x16_shuffle(v1_6, v1_14, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
141 const v128_t v0_14 = wasm_v8x16_shuffle(v1_7, v1_15, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
142 const v128_t v0_15 = wasm_v8x16_shuffle(v1_7, v1_15, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
143
144 o = (uint8_t*) ((uintptr_t) o + oN_offset);
145 wasm_v128_store(o, v0_15);
146 uint8_t *oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
147 if XNN_UNPREDICTABLE(block_width > 15) {
148 o = oN;
149 }
150 wasm_v128_store(o, v0_14);
151 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
152 if XNN_UNPREDICTABLE(block_width >= 15) {
153 o = oN;
154 }
155 wasm_v128_store(o, v0_13);
156 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
157 if XNN_UNPREDICTABLE(block_width > 13) {
158 o = oN;
159 }
160 wasm_v128_store(o, v0_12);
161 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
162 if XNN_UNPREDICTABLE(block_width >= 13) {
163 o = oN;
164 }
165 wasm_v128_store(o, v0_11);
166 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
167 if XNN_UNPREDICTABLE(block_width > 11) {
168 o = oN;
169 }
170 wasm_v128_store(o, v0_10);
171 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
172 if XNN_UNPREDICTABLE(block_width >= 11) {
173 o = oN;
174 }
175 wasm_v128_store(o, v0_9);
176 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
177 if XNN_UNPREDICTABLE(block_width > 9) {
178 o = oN;
179 }
180 wasm_v128_store(o, v0_8);
181 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
182 if XNN_UNPREDICTABLE(block_width >= 9) {
183 o = oN;
184 }
185 wasm_v128_store(o, v0_7);
186 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
187 if XNN_UNPREDICTABLE(block_width > 7) {
188 o = oN;
189 }
190 wasm_v128_store(o, v0_6);
191 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
192 if XNN_UNPREDICTABLE(block_width >= 7) {
193 o = oN;
194 }
195 wasm_v128_store(o, v0_5);
196 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
197 if XNN_UNPREDICTABLE(block_width > 5) {
198 o = oN;
199 }
200 wasm_v128_store(o, v0_4);
201 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
202 if XNN_UNPREDICTABLE(block_width >= 5) {
203 o = oN;
204 }
205 wasm_v128_store(o, v0_3);
206 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
207 if XNN_UNPREDICTABLE(block_width > 3) {
208 o = oN;
209 }
210 wasm_v128_store(o, v0_2);
211 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
212 if XNN_UNPREDICTABLE(block_width >= 3) {
213 o = oN;
214 }
215 wasm_v128_store(o, v0_1);
216 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
217 if XNN_UNPREDICTABLE(block_width > 1) {
218 o = oN;
219 }
220 wasm_v128_store(o, v0_0);
221 }
222 o = (uint8_t*) ((uintptr_t) o + tile_hbytes);
223
224 if (bh != 0) {
225 const v128_t v4_0 = wasm_v128_load(i0);
226 const uint8_t *i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
227 if XNN_UNPREDICTABLE(bh < 2) {
228 i1 = i0;
229 }
230 const v128_t v4_1 = wasm_v128_load(i1);
231 const uint8_t *i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
232 if XNN_UNPREDICTABLE(bh <= 2) {
233 i2 = i1;
234 }
235 const v128_t v4_2 = wasm_v128_load(i2);
236 const uint8_t *i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
237 if XNN_UNPREDICTABLE(bh < 4) {
238 i3 = i2;
239 }
240 const v128_t v4_3 = wasm_v128_load(i3);
241 const uint8_t *i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
242 if XNN_UNPREDICTABLE(bh <= 4) {
243 i4 = i3;
244 }
245 const v128_t v4_4 = wasm_v128_load(i4);
246 const uint8_t *i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
247 if XNN_UNPREDICTABLE(bh < 6) {
248 i5 = i4;
249 }
250 const v128_t v4_5 = wasm_v128_load(i5);
251 const uint8_t *i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
252 if XNN_UNPREDICTABLE(bh <= 6) {
253 i6 = i5;
254 }
255 const v128_t v4_6 = wasm_v128_load(i6);
256 const uint8_t *i7 = (const uint8_t*) ((uintptr_t) i6 + input_stride);
257 if XNN_UNPREDICTABLE(bh < 8) {
258 i7 = i6;
259 }
260 const v128_t v4_7 = wasm_v128_load(i7);
261 const uint8_t *i8 = (const uint8_t*) ((uintptr_t) i7 + input_stride);
262 if XNN_UNPREDICTABLE(bh <= 8) {
263 i8 = i7;
264 }
265 const v128_t v4_8 = wasm_v128_load(i8);
266 const uint8_t *i9 = (const uint8_t*) ((uintptr_t) i8 + input_stride);
267 if XNN_UNPREDICTABLE(bh < 10) {
268 i9 = i8;
269 }
270 const v128_t v4_9 = wasm_v128_load(i9);
271 const uint8_t *i10 = (const uint8_t*) ((uintptr_t) i9 + input_stride);
272 if XNN_UNPREDICTABLE(bh <= 10) {
273 i10 = i9;
274 }
275 const v128_t v4_10 = wasm_v128_load(i10);
276 const uint8_t *i11 = (const uint8_t*) ((uintptr_t) i10 + input_stride);
277 if XNN_UNPREDICTABLE(bh < 12) {
278 i11 = i10;
279 }
280 const v128_t v4_11 = wasm_v128_load(i11);
281 const uint8_t *i12 = (const uint8_t*) ((uintptr_t) i11 + input_stride);
282 if XNN_UNPREDICTABLE(bh <= 12) {
283 i12 = i11;
284 }
285 const v128_t v4_12 = wasm_v128_load(i12);
286 const uint8_t *i13 = (const uint8_t*) ((uintptr_t) i12 + input_stride);
287 if XNN_UNPREDICTABLE(bh < 14) {
288 i13 = i12;
289 }
290 const v128_t v4_13 = wasm_v128_load(i13);
291 const uint8_t *i14 = (const uint8_t*) ((uintptr_t) i13 + input_stride);
292 if XNN_UNPREDICTABLE(bh <= 14) {
293 i14 = i13;
294 }
295 const v128_t v4_14 = wasm_v128_load(i14);
296 const v128_t v4_15 = wasm_v128_xor(v4_0, v4_0);
297
298 const v128_t v3_0 = wasm_v8x16_shuffle(v4_0, v4_8, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
299 const v128_t v3_1 = wasm_v8x16_shuffle(v4_0, v4_8, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
300 const v128_t v3_2 = wasm_v8x16_shuffle(v4_1, v4_9, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
301 const v128_t v3_3 = wasm_v8x16_shuffle(v4_1, v4_9, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
302 const v128_t v3_4 = wasm_v8x16_shuffle(v4_2, v4_10, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
303 const v128_t v3_5 = wasm_v8x16_shuffle(v4_2, v4_10, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
304 const v128_t v3_6 = wasm_v8x16_shuffle(v4_3, v4_11, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
305 const v128_t v3_7 = wasm_v8x16_shuffle(v4_3, v4_11, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
306 const v128_t v3_8 = wasm_v8x16_shuffle(v4_4, v4_12, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
307 const v128_t v3_9 = wasm_v8x16_shuffle(v4_4, v4_12, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
308 const v128_t v3_10 = wasm_v8x16_shuffle(v4_5, v4_13, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
309 const v128_t v3_11 = wasm_v8x16_shuffle(v4_5, v4_13, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
310 const v128_t v3_12 = wasm_v8x16_shuffle(v4_6, v4_14, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
311 const v128_t v3_13 = wasm_v8x16_shuffle(v4_6, v4_14, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
312 const v128_t v3_14 = wasm_v8x16_shuffle(v4_7, v4_15, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
313 const v128_t v3_15 = wasm_v8x16_shuffle(v4_7, v4_15, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
314 const v128_t v2_0 = wasm_v8x16_shuffle(v3_0, v3_8, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
315 const v128_t v2_1 = wasm_v8x16_shuffle(v3_0, v3_8, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
316 const v128_t v2_2 = wasm_v8x16_shuffle(v3_1, v3_9, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
317 const v128_t v2_3 = wasm_v8x16_shuffle(v3_1, v3_9, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
318 const v128_t v2_4 = wasm_v8x16_shuffle(v3_2, v3_10, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
319 const v128_t v2_5 = wasm_v8x16_shuffle(v3_2, v3_10, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
320 const v128_t v2_6 = wasm_v8x16_shuffle(v3_3, v3_11, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
321 const v128_t v2_7 = wasm_v8x16_shuffle(v3_3, v3_11, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
322 const v128_t v2_8 = wasm_v8x16_shuffle(v3_4, v3_12, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
323 const v128_t v2_9 = wasm_v8x16_shuffle(v3_4, v3_12, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
324 const v128_t v2_10 = wasm_v8x16_shuffle(v3_5, v3_13, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
325 const v128_t v2_11 = wasm_v8x16_shuffle(v3_5, v3_13, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
326 const v128_t v2_12 = wasm_v8x16_shuffle(v3_6, v3_14, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
327 const v128_t v2_13 = wasm_v8x16_shuffle(v3_6, v3_14, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
328 const v128_t v2_14 = wasm_v8x16_shuffle(v3_7, v3_15, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
329 const v128_t v2_15 = wasm_v8x16_shuffle(v3_7, v3_15, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
330 const v128_t v1_0 = wasm_v8x16_shuffle(v2_0, v2_8, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
331 const v128_t v1_1 = wasm_v8x16_shuffle(v2_0, v2_8, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
332 const v128_t v1_2 = wasm_v8x16_shuffle(v2_1, v2_9, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
333 const v128_t v1_3 = wasm_v8x16_shuffle(v2_1, v2_9, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
334 const v128_t v1_4 = wasm_v8x16_shuffle(v2_2, v2_10, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
335 const v128_t v1_5 = wasm_v8x16_shuffle(v2_2, v2_10, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
336 const v128_t v1_6 = wasm_v8x16_shuffle(v2_3, v2_11, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
337 const v128_t v1_7 = wasm_v8x16_shuffle(v2_3, v2_11, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
338 const v128_t v1_8 = wasm_v8x16_shuffle(v2_4, v2_12, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
339 const v128_t v1_9 = wasm_v8x16_shuffle(v2_4, v2_12, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
340 const v128_t v1_10 = wasm_v8x16_shuffle(v2_5, v2_13, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
341 const v128_t v1_11 = wasm_v8x16_shuffle(v2_5, v2_13, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
342 const v128_t v1_12 = wasm_v8x16_shuffle(v2_6, v2_14, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
343 const v128_t v1_13 = wasm_v8x16_shuffle(v2_6, v2_14, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
344 const v128_t v1_14 = wasm_v8x16_shuffle(v2_7, v2_15, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
345 const v128_t v1_15 = wasm_v8x16_shuffle(v2_7, v2_15, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
346
347 v128_t v0_0 = wasm_v8x16_shuffle(v1_0, v1_8, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
348 v128_t v0_1 = wasm_v8x16_shuffle(v1_0, v1_8, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
349 v128_t v0_2 = wasm_v8x16_shuffle(v1_1, v1_9, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
350 v128_t v0_3 = wasm_v8x16_shuffle(v1_1, v1_9, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
351 v128_t v0_4 = wasm_v8x16_shuffle(v1_2, v1_10, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
352 v128_t v0_5 = wasm_v8x16_shuffle(v1_2, v1_10, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
353 v128_t v0_6 = wasm_v8x16_shuffle(v1_3, v1_11, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
354 v128_t v0_7 = wasm_v8x16_shuffle(v1_3, v1_11, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
355 v128_t v0_8 = wasm_v8x16_shuffle(v1_4, v1_12, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
356 v128_t v0_9 = wasm_v8x16_shuffle(v1_4, v1_12, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
357 v128_t v0_10 = wasm_v8x16_shuffle(v1_5, v1_13, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
358 v128_t v0_11 = wasm_v8x16_shuffle(v1_5, v1_13, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
359 v128_t v0_12 = wasm_v8x16_shuffle(v1_6, v1_14, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
360 v128_t v0_13 = wasm_v8x16_shuffle(v1_6, v1_14, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
361 v128_t v0_14 = wasm_v8x16_shuffle(v1_7, v1_15, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
362 v128_t v0_15 = wasm_v8x16_shuffle(v1_7, v1_15, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
363
364 if (bh & 8) {
365 o = (uint8_t*) ((uintptr_t) o + oN_stride);
366 *((double*) o) = wasm_f64x2_extract_lane(v0_15, 0);
367 uint8_t *oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
368 if XNN_UNPREDICTABLE(block_width > 15) {
369 o = oN;
370 }
371 *((double*) o) = wasm_f64x2_extract_lane(v0_14, 0);
372 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
373 if XNN_UNPREDICTABLE(block_width >= 15) {
374 o = oN;
375 }
376 *((double*) o) = wasm_f64x2_extract_lane(v0_13, 0);
377 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
378 if XNN_UNPREDICTABLE(block_width > 13) {
379 o = oN;
380 }
381 *((double*) o) = wasm_f64x2_extract_lane(v0_12, 0);
382 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
383 if XNN_UNPREDICTABLE(block_width >= 13) {
384 o = oN;
385 }
386 *((double*) o) = wasm_f64x2_extract_lane(v0_11, 0);
387 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
388 if XNN_UNPREDICTABLE(block_width > 11) {
389 o = oN;
390 }
391 *((double*) o) = wasm_f64x2_extract_lane(v0_10, 0);
392 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
393 if XNN_UNPREDICTABLE(block_width >= 11) {
394 o = oN;
395 }
396 *((double*) o) = wasm_f64x2_extract_lane(v0_9, 0);
397 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
398 if XNN_UNPREDICTABLE(block_width > 9) {
399 o = oN;
400 }
401 *((double*) o) = wasm_f64x2_extract_lane(v0_8, 0);
402 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
403 if XNN_UNPREDICTABLE(block_width >= 9) {
404 o = oN;
405 }
406 *((double*) o) = wasm_f64x2_extract_lane(v0_7, 0);
407 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
408 if XNN_UNPREDICTABLE(block_width > 7) {
409 o = oN;
410 }
411 *((double*) o) = wasm_f64x2_extract_lane(v0_6, 0);
412 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
413 if XNN_UNPREDICTABLE(block_width >= 7) {
414 o = oN;
415 }
416 *((double*) o) = wasm_f64x2_extract_lane(v0_5, 0);
417 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
418 if XNN_UNPREDICTABLE(block_width > 5) {
419 o = oN;
420 }
421 *((double*) o) = wasm_f64x2_extract_lane(v0_4, 0);
422 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
423 if XNN_UNPREDICTABLE(block_width >= 5) {
424 o = oN;
425 }
426 *((double*) o) = wasm_f64x2_extract_lane(v0_3, 0);
427 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
428 if XNN_UNPREDICTABLE(block_width > 3) {
429 o = oN;
430 }
431 *((double*) o) = wasm_f64x2_extract_lane(v0_2, 0);
432 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
433 if XNN_UNPREDICTABLE(block_width >= 3) {
434 o = oN;
435 }
436 *((double*) o) = wasm_f64x2_extract_lane(v0_1, 0);
437 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
438 if XNN_UNPREDICTABLE(block_width > 1) {
439 o = oN;
440 }
441 *((double*) o) = wasm_f64x2_extract_lane(v0_0, 0);
442 o += 8;
443 v0_0 = wasm_v64x2_shuffle(v0_0, v0_0, 1, 1);
444 v0_1 = wasm_v64x2_shuffle(v0_1, v0_1, 1, 1);
445 v0_2 = wasm_v64x2_shuffle(v0_2, v0_2, 1, 1);
446 v0_3 = wasm_v64x2_shuffle(v0_3, v0_3, 1, 1);
447 v0_4 = wasm_v64x2_shuffle(v0_4, v0_4, 1, 1);
448 v0_5 = wasm_v64x2_shuffle(v0_5, v0_5, 1, 1);
449 v0_6 = wasm_v64x2_shuffle(v0_6, v0_6, 1, 1);
450 v0_7 = wasm_v64x2_shuffle(v0_7, v0_7, 1, 1);
451 v0_8 = wasm_v64x2_shuffle(v0_8, v0_8, 1, 1);
452 v0_9 = wasm_v64x2_shuffle(v0_9, v0_9, 1, 1);
453 v0_10 = wasm_v64x2_shuffle(v0_10, v0_10, 1, 1);
454 v0_11 = wasm_v64x2_shuffle(v0_11, v0_11, 1, 1);
455 v0_12 = wasm_v64x2_shuffle(v0_12, v0_12, 1, 1);
456 v0_13 = wasm_v64x2_shuffle(v0_13, v0_13, 1, 1);
457 v0_14 = wasm_v64x2_shuffle(v0_14, v0_14, 1, 1);
458 v0_15 = wasm_v64x2_shuffle(v0_15, v0_15, 1, 1);
459 }
460
461 if (bh & 4) {
462 o = (uint8_t*) ((uintptr_t) o + oN_stride);
463 *((float*) o) = wasm_f32x4_extract_lane(v0_15, 0);
464 uint8_t *oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
465 if XNN_UNPREDICTABLE(block_width > 15) {
466 o = oN;
467 }
468 *((float*) o) = wasm_f32x4_extract_lane(v0_14, 0);
469 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
470 if XNN_UNPREDICTABLE(block_width >= 15) {
471 o = oN;
472 }
473 *((float*) o) = wasm_f32x4_extract_lane(v0_13, 0);
474 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
475 if XNN_UNPREDICTABLE(block_width > 13) {
476 o = oN;
477 }
478 *((float*) o) = wasm_f32x4_extract_lane(v0_12, 0);
479 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
480 if XNN_UNPREDICTABLE(block_width >= 13) {
481 o = oN;
482 }
483 *((float*) o) = wasm_f32x4_extract_lane(v0_11, 0);
484 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
485 if XNN_UNPREDICTABLE(block_width > 11) {
486 o = oN;
487 }
488 *((float*) o) = wasm_f32x4_extract_lane(v0_10, 0);
489 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
490 if XNN_UNPREDICTABLE(block_width >= 11) {
491 o = oN;
492 }
493 *((float*) o) = wasm_f32x4_extract_lane(v0_9, 0);
494 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
495 if XNN_UNPREDICTABLE(block_width > 9) {
496 o = oN;
497 }
498 *((float*) o) = wasm_f32x4_extract_lane(v0_8, 0);
499 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
500 if XNN_UNPREDICTABLE(block_width >= 9) {
501 o = oN;
502 }
503 *((float*) o) = wasm_f32x4_extract_lane(v0_7, 0);
504 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
505 if XNN_UNPREDICTABLE(block_width > 7) {
506 o = oN;
507 }
508 *((float*) o) = wasm_f32x4_extract_lane(v0_6, 0);
509 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
510 if XNN_UNPREDICTABLE(block_width >= 7) {
511 o = oN;
512 }
513 *((float*) o) = wasm_f32x4_extract_lane(v0_5, 0);
514 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
515 if XNN_UNPREDICTABLE(block_width > 5) {
516 o = oN;
517 }
518 *((float*) o) = wasm_f32x4_extract_lane(v0_4, 0);
519 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
520 if XNN_UNPREDICTABLE(block_width >= 5) {
521 o = oN;
522 }
523 *((float*) o) = wasm_f32x4_extract_lane(v0_3, 0);
524 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
525 if XNN_UNPREDICTABLE(block_width > 3) {
526 o = oN;
527 }
528 *((float*) o) = wasm_f32x4_extract_lane(v0_2, 0);
529 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
530 if XNN_UNPREDICTABLE(block_width >= 3) {
531 o = oN;
532 }
533 *((float*) o) = wasm_f32x4_extract_lane(v0_1, 0);
534 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
535 if XNN_UNPREDICTABLE(block_width > 1) {
536 o = oN;
537 }
538 *((float*) o) = wasm_f32x4_extract_lane(v0_0, 0);
539 o += 4;
540 v0_0 = wasm_u64x2_shr(v0_0, 32);
541 v0_1 = wasm_u64x2_shr(v0_1, 32);
542 v0_2 = wasm_u64x2_shr(v0_2, 32);
543 v0_3 = wasm_u64x2_shr(v0_3, 32);
544 v0_4 = wasm_u64x2_shr(v0_4, 32);
545 v0_5 = wasm_u64x2_shr(v0_5, 32);
546 v0_6 = wasm_u64x2_shr(v0_6, 32);
547 v0_7 = wasm_u64x2_shr(v0_7, 32);
548 v0_8 = wasm_u64x2_shr(v0_8, 32);
549 v0_9 = wasm_u64x2_shr(v0_9, 32);
550 v0_10 = wasm_u64x2_shr(v0_10, 32);
551 v0_11 = wasm_u64x2_shr(v0_11, 32);
552 v0_12 = wasm_u64x2_shr(v0_12, 32);
553 v0_13 = wasm_u64x2_shr(v0_13, 32);
554 v0_14 = wasm_u64x2_shr(v0_14, 32);
555 v0_15 = wasm_u64x2_shr(v0_15, 32);
556 }
557 if (bh & 2) {
558 o = (uint8_t*) ((uintptr_t) o + oN_stride);
559 *((uint16_t*) o) = wasm_i16x8_extract_lane(v0_15, 0);
560 uint8_t *oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
561 if XNN_UNPREDICTABLE(block_width > 15) {
562 o = oN;
563 }
564 *((uint16_t*) o) = wasm_i16x8_extract_lane(v0_14, 0);
565 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
566 if XNN_UNPREDICTABLE(block_width >= 15) {
567 o = oN;
568 }
569 *((uint16_t*) o) = wasm_i16x8_extract_lane(v0_13, 0);
570 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
571 if XNN_UNPREDICTABLE(block_width > 13) {
572 o = oN;
573 }
574 *((uint16_t*) o) = wasm_i16x8_extract_lane(v0_12, 0);
575 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
576 if XNN_UNPREDICTABLE(block_width >= 13) {
577 o = oN;
578 }
579 *((uint16_t*) o) = wasm_i16x8_extract_lane(v0_11, 0);
580 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
581 if XNN_UNPREDICTABLE(block_width > 11) {
582 o = oN;
583 }
584 *((uint16_t*) o) = wasm_i16x8_extract_lane(v0_10, 0);
585 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
586 if XNN_UNPREDICTABLE(block_width >= 11) {
587 o = oN;
588 }
589 *((uint16_t*) o) = wasm_i16x8_extract_lane(v0_9, 0);
590 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
591 if XNN_UNPREDICTABLE(block_width > 9) {
592 o = oN;
593 }
594 *((uint16_t*) o) = wasm_i16x8_extract_lane(v0_8, 0);
595 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
596 if XNN_UNPREDICTABLE(block_width >= 9) {
597 o = oN;
598 }
599 *((uint16_t*) o) = wasm_i16x8_extract_lane(v0_7, 0);
600 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
601 if XNN_UNPREDICTABLE(block_width > 7) {
602 o = oN;
603 }
604 *((uint16_t*) o) = wasm_i16x8_extract_lane(v0_6, 0);
605 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
606 if XNN_UNPREDICTABLE(block_width >= 7) {
607 o = oN;
608 }
609 *((uint16_t*) o) = wasm_i16x8_extract_lane(v0_5, 0);
610 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
611 if XNN_UNPREDICTABLE(block_width > 5) {
612 o = oN;
613 }
614 *((uint16_t*) o) = wasm_i16x8_extract_lane(v0_4, 0);
615 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
616 if XNN_UNPREDICTABLE(block_width >= 5) {
617 o = oN;
618 }
619 *((uint16_t*) o) = wasm_i16x8_extract_lane(v0_3, 0);
620 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
621 if XNN_UNPREDICTABLE(block_width > 3) {
622 o = oN;
623 }
624 *((uint16_t*) o) = wasm_i16x8_extract_lane(v0_2, 0);
625 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
626 if XNN_UNPREDICTABLE(block_width >= 3) {
627 o = oN;
628 }
629 *((uint16_t*) o) = wasm_i16x8_extract_lane(v0_1, 0);
630 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
631 if XNN_UNPREDICTABLE(block_width > 1) {
632 o = oN;
633 }
634 *((uint16_t*) o) = wasm_i16x8_extract_lane(v0_0, 0);
635 o += 2;
636 v0_0 = wasm_u32x4_shr(v0_0, 16);
637 v0_1 = wasm_u32x4_shr(v0_1, 16);
638 v0_2 = wasm_u32x4_shr(v0_2, 16);
639 v0_3 = wasm_u32x4_shr(v0_3, 16);
640 v0_4 = wasm_u32x4_shr(v0_4, 16);
641 v0_5 = wasm_u32x4_shr(v0_5, 16);
642 v0_6 = wasm_u32x4_shr(v0_6, 16);
643 v0_7 = wasm_u32x4_shr(v0_7, 16);
644 v0_8 = wasm_u32x4_shr(v0_8, 16);
645 v0_9 = wasm_u32x4_shr(v0_9, 16);
646 v0_10 = wasm_u32x4_shr(v0_10, 16);
647 v0_11 = wasm_u32x4_shr(v0_11, 16);
648 v0_12 = wasm_u32x4_shr(v0_12, 16);
649 v0_13 = wasm_u32x4_shr(v0_13, 16);
650 v0_14 = wasm_u32x4_shr(v0_14, 16);
651 v0_15 = wasm_u32x4_shr(v0_15, 16);
652 }
653 if (bh & 1) {
654 o = (uint8_t*) ((uintptr_t) o + oN_stride);
655 *o = wasm_i8x16_extract_lane(v0_15, 0);
656 uint8_t *oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
657 if XNN_UNPREDICTABLE(block_width > 15) {
658 o = oN;
659 }
660 *o = wasm_i8x16_extract_lane(v0_14, 0);
661 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
662 if XNN_UNPREDICTABLE(block_width >= 15) {
663 o = oN;
664 }
665 *o = wasm_i8x16_extract_lane(v0_13, 0);
666 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
667 if XNN_UNPREDICTABLE(block_width > 13) {
668 o = oN;
669 }
670 *o = wasm_i8x16_extract_lane(v0_12, 0);
671 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
672 if XNN_UNPREDICTABLE(block_width >= 13) {
673 o = oN;
674 }
675 *o = wasm_i8x16_extract_lane(v0_11, 0);
676 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
677 if XNN_UNPREDICTABLE(block_width > 11) {
678 o = oN;
679 }
680 *o = wasm_i8x16_extract_lane(v0_10, 0);
681 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
682 if XNN_UNPREDICTABLE(block_width >= 11) {
683 o = oN;
684 }
685 *o = wasm_i8x16_extract_lane(v0_9, 0);
686 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
687 if XNN_UNPREDICTABLE(block_width > 9) {
688 o = oN;
689 }
690 *o = wasm_i8x16_extract_lane(v0_8, 0);
691 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
692 if XNN_UNPREDICTABLE(block_width >= 9) {
693 o = oN;
694 }
695 *o = wasm_i8x16_extract_lane(v0_7, 0);
696 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
697 if XNN_UNPREDICTABLE(block_width > 7) {
698 o = oN;
699 }
700 *o = wasm_i8x16_extract_lane(v0_6, 0);
701 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
702 if XNN_UNPREDICTABLE(block_width >= 7) {
703 o = oN;
704 }
705 *o = wasm_i8x16_extract_lane(v0_5, 0);
706 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
707 if XNN_UNPREDICTABLE(block_width > 5) {
708 o = oN;
709 }
710 *o = wasm_i8x16_extract_lane(v0_4, 0);
711 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
712 if XNN_UNPREDICTABLE(block_width >= 5) {
713 o = oN;
714 }
715 *o = wasm_i8x16_extract_lane(v0_3, 0);
716 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
717 if XNN_UNPREDICTABLE(block_width > 3) {
718 o = oN;
719 }
720 *o = wasm_i8x16_extract_lane(v0_2, 0);
721 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
722 if XNN_UNPREDICTABLE(block_width >= 3) {
723 o = oN;
724 }
725 *o = wasm_i8x16_extract_lane(v0_1, 0);
726 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
727 if XNN_UNPREDICTABLE(block_width > 1) {
728 o = oN;
729 }
730 *o = wasm_i8x16_extract_lane(v0_0, 0);
731 }
732 }
733
734 i0 = (const uint8_t*) ((uintptr_t) i0 + input_reset);
735 o = (uint8_t*) ((uintptr_t) o + output_reset);
736 block_width = doz(block_width, tile_width);
737 } while (block_width != 0);
738 }
739