xref: /aosp_15_r20/external/XNNPACK/src/qu8-dwconv/gen/up24x25-minmax-rndnu-neon-mul8.c (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1 // Auto-generated file. Do not edit!
2 //   Template: src/qu8-dwconv/unipass-neon-mul8.c.in
3 //   Generator: tools/xngen
4 //
5 // Copyright 2020 Google LLC
6 //
7 // This source code is licensed under the BSD-style license found in the
8 // LICENSE file in the root directory of this source tree.
9 
10 #include <assert.h>
11 
12 #include <arm_neon.h>
13 
14 #include <xnnpack/dwconv.h>
15 
16 
17 void xnn_qu8_dwconv_minmax_rndnu_ukernel_up24x25__neon_mul8(
18     size_t channels,
19     size_t output_width,
20     const uint8_t** input,
21     const void* weights,
22     uint8_t* output,
23     size_t input_stride,
24     size_t output_increment,
25     size_t input_offset,
26     const uint8_t* zero,
27     const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
28 {
29   assert(channels != 0);
30   assert(output_width != 0);
31 
32   const uint8x8_t vkernel_zero_point = vld1_dup_u8(params->rndnu_neon.kernel_zero_point);
33   const uint16x8_t vkernel_zero_point16 = vmovl_u8(vkernel_zero_point);
34   const int32x4_t vright_pre_shift = vld1q_dup_s32(&params->rndnu_neon.right_pre_shift);
35   const int32x4_t vmultiplier = vld1q_dup_s32(&params->rndnu_neon.multiplier);
36   const int32x4_t vright_post_shift = vld1q_dup_s32(&params->rndnu_neon.right_post_shift);
37   const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->rndnu_neon.output_zero_point);
38   const uint8x16_t voutput_min = vld1q_dup_u8(&params->rndnu_neon.output_min);
39   const uint8x16_t voutput_max = vld1q_dup_u8(&params->rndnu_neon.output_max);
40   do {
41     const uint8_t* i0 = input[0];
42     assert(i0 != NULL);
43     if XNN_UNPREDICTABLE(i0 != zero) {
44       i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset);
45     }
46     const uint8_t* i1 = input[1];
47     assert(i1 != NULL);
48     if XNN_UNPREDICTABLE(i1 != zero) {
49       i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset);
50     }
51     const uint8_t* i2 = input[2];
52     assert(i2 != NULL);
53     if XNN_UNPREDICTABLE(i2 != zero) {
54       i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset);
55     }
56     const uint8_t* i3 = input[3];
57     assert(i3 != NULL);
58     if XNN_UNPREDICTABLE(i3 != zero) {
59       i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset);
60     }
61     const uint8_t* i4 = input[4];
62     assert(i4 != NULL);
63     if XNN_UNPREDICTABLE(i4 != zero) {
64       i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset);
65     }
66     const uint8_t* i5 = input[5];
67     assert(i5 != NULL);
68     if XNN_UNPREDICTABLE(i5 != zero) {
69       i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset);
70     }
71     const uint8_t* i6 = input[6];
72     assert(i6 != NULL);
73     if XNN_UNPREDICTABLE(i6 != zero) {
74       i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset);
75     }
76     const uint8_t* i7 = input[7];
77     assert(i7 != NULL);
78     if XNN_UNPREDICTABLE(i7 != zero) {
79       i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset);
80     }
81     const uint8_t* i8 = input[8];
82     assert(i8 != NULL);
83     if XNN_UNPREDICTABLE(i8 != zero) {
84       i8 = (const uint8_t*) ((uintptr_t) i8 + input_offset);
85     }
86     const uint8_t* i9 = input[9];
87     assert(i9 != NULL);
88     if XNN_UNPREDICTABLE(i9 != zero) {
89       i9 = (const uint8_t*) ((uintptr_t) i9 + input_offset);
90     }
91     const uint8_t* i10 = input[10];
92     assert(i10 != NULL);
93     if XNN_UNPREDICTABLE(i10 != zero) {
94       i10 = (const uint8_t*) ((uintptr_t) i10 + input_offset);
95     }
96     const uint8_t* i11 = input[11];
97     assert(i11 != NULL);
98     if XNN_UNPREDICTABLE(i11 != zero) {
99       i11 = (const uint8_t*) ((uintptr_t) i11 + input_offset);
100     }
101     const uint8_t* i12 = input[12];
102     assert(i12 != NULL);
103     if XNN_UNPREDICTABLE(i12 != zero) {
104       i12 = (const uint8_t*) ((uintptr_t) i12 + input_offset);
105     }
106     const uint8_t* i13 = input[13];
107     assert(i13 != NULL);
108     if XNN_UNPREDICTABLE(i13 != zero) {
109       i13 = (const uint8_t*) ((uintptr_t) i13 + input_offset);
110     }
111     const uint8_t* i14 = input[14];
112     assert(i14 != NULL);
113     if XNN_UNPREDICTABLE(i14 != zero) {
114       i14 = (const uint8_t*) ((uintptr_t) i14 + input_offset);
115     }
116     const uint8_t* i15 = input[15];
117     assert(i15 != NULL);
118     if XNN_UNPREDICTABLE(i15 != zero) {
119       i15 = (const uint8_t*) ((uintptr_t) i15 + input_offset);
120     }
121     const uint8_t* i16 = input[16];
122     assert(i16 != NULL);
123     if XNN_UNPREDICTABLE(i16 != zero) {
124       i16 = (const uint8_t*) ((uintptr_t) i16 + input_offset);
125     }
126     const uint8_t* i17 = input[17];
127     assert(i17 != NULL);
128     if XNN_UNPREDICTABLE(i17 != zero) {
129       i17 = (const uint8_t*) ((uintptr_t) i17 + input_offset);
130     }
131     const uint8_t* i18 = input[18];
132     assert(i18 != NULL);
133     if XNN_UNPREDICTABLE(i18 != zero) {
134       i18 = (const uint8_t*) ((uintptr_t) i18 + input_offset);
135     }
136     const uint8_t* i19 = input[19];
137     assert(i19 != NULL);
138     if XNN_UNPREDICTABLE(i19 != zero) {
139       i19 = (const uint8_t*) ((uintptr_t) i19 + input_offset);
140     }
141     const uint8_t* i20 = input[20];
142     assert(i20 != NULL);
143     if XNN_UNPREDICTABLE(i20 != zero) {
144       i20 = (const uint8_t*) ((uintptr_t) i20 + input_offset);
145     }
146     const uint8_t* i21 = input[21];
147     assert(i21 != NULL);
148     if XNN_UNPREDICTABLE(i21 != zero) {
149       i21 = (const uint8_t*) ((uintptr_t) i21 + input_offset);
150     }
151     const uint8_t* i22 = input[22];
152     assert(i22 != NULL);
153     if XNN_UNPREDICTABLE(i22 != zero) {
154       i22 = (const uint8_t*) ((uintptr_t) i22 + input_offset);
155     }
156     const uint8_t* i23 = input[23];
157     assert(i23 != NULL);
158     if XNN_UNPREDICTABLE(i23 != zero) {
159       i23 = (const uint8_t*) ((uintptr_t) i23 + input_offset);
160     }
161     const uint8_t* i24 = input[24];
162     assert(i24 != NULL);
163     if XNN_UNPREDICTABLE(i24 != zero) {
164       i24 = (const uint8_t*) ((uintptr_t) i24 + input_offset);
165     }
166     input = (const uint8_t**) ((uintptr_t) input + input_stride);
167 
168 
169     size_t c = channels;
170     const void* w = weights;
171     for (; c >= 24; c -= 24) {
172       int32x4_t vacc0123 = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4);
173       int32x4_t vacc4567 = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4);
174       int32x4_t vacc89AB = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4);
175       int32x4_t vaccCDEF = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4);
176       int32x4_t vaccGHIJ = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4);
177       int32x4_t vaccKLMN = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4);
178 
179 
180       const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8;
181       const uint8x8_t vk0x01234567 = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
182       const uint8x8_t vi0x89ABCDEF = vld1_u8(i0); i0 += 8;
183       const uint8x8_t vk0x89ABCDEF = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
184       const uint8x8_t vi0xGHIJKLMN = vld1_u8(i0); i0 += 8;
185       const uint8x8_t vk0xGHIJKLMN = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
186 
187       uint16x8_t vprod01234567 = vmull_u8(vi0x01234567, vk0x01234567);
188       uint16x8_t vprod89ABCDEF = vmull_u8(vi0x89ABCDEF, vk0x89ABCDEF);
189       uint16x8_t vprodGHIJKLMN = vmull_u8(vi0xGHIJKLMN, vk0xGHIJKLMN);
190 
191       vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
192       vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
193       vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
194       vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));
195       vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vprodGHIJKLMN)));
196       vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vprodGHIJKLMN)));
197       const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8;
198       const uint8x8_t vk1x01234567 = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
199       const uint8x8_t vi1x89ABCDEF = vld1_u8(i1); i1 += 8;
200       const uint8x8_t vk1x89ABCDEF = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
201       const uint8x8_t vi1xGHIJKLMN = vld1_u8(i1); i1 += 8;
202       const uint8x8_t vk1xGHIJKLMN = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
203 
204       vprod01234567 = vmull_u8(vi1x01234567, vk1x01234567);
205       uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567);
206       vprod89ABCDEF = vmull_u8(vi1x89ABCDEF, vk1x89ABCDEF);
207       uint16x8_t vsum89ABCDEF = vaddl_u8(vi0x89ABCDEF, vi1x89ABCDEF);
208       vprodGHIJKLMN = vmull_u8(vi1xGHIJKLMN, vk1xGHIJKLMN);
209       uint16x8_t vsumGHIJKLMN = vaddl_u8(vi0xGHIJKLMN, vi1xGHIJKLMN);
210 
211       vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
212       vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
213       vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
214       vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));
215       vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vprodGHIJKLMN)));
216       vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vprodGHIJKLMN)));
217       const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8;
218       const uint8x8_t vk2x01234567 = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
219       const uint8x8_t vi2x89ABCDEF = vld1_u8(i2); i2 += 8;
220       const uint8x8_t vk2x89ABCDEF = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
221       const uint8x8_t vi2xGHIJKLMN = vld1_u8(i2); i2 += 8;
222       const uint8x8_t vk2xGHIJKLMN = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
223 
224       vprod01234567 = vmull_u8(vi2x01234567, vk2x01234567);
225       vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567);
226       vprod89ABCDEF = vmull_u8(vi2x89ABCDEF, vk2x89ABCDEF);
227       vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi2x89ABCDEF);
228       vprodGHIJKLMN = vmull_u8(vi2xGHIJKLMN, vk2xGHIJKLMN);
229       vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi2xGHIJKLMN);
230 
231       vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
232       vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
233       vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
234       vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));
235       vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vprodGHIJKLMN)));
236       vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vprodGHIJKLMN)));
237       const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8;
238       const uint8x8_t vk3x01234567 = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
239       const uint8x8_t vi3x89ABCDEF = vld1_u8(i3); i3 += 8;
240       const uint8x8_t vk3x89ABCDEF = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
241       const uint8x8_t vi3xGHIJKLMN = vld1_u8(i3); i3 += 8;
242       const uint8x8_t vk3xGHIJKLMN = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
243 
244       vprod01234567 = vmull_u8(vi3x01234567, vk3x01234567);
245       vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567);
246       vprod89ABCDEF = vmull_u8(vi3x89ABCDEF, vk3x89ABCDEF);
247       vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi3x89ABCDEF);
248       vprodGHIJKLMN = vmull_u8(vi3xGHIJKLMN, vk3xGHIJKLMN);
249       vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi3xGHIJKLMN);
250 
251       vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
252       vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
253       vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
254       vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));
255       vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vprodGHIJKLMN)));
256       vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vprodGHIJKLMN)));
257       const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8;
258       const uint8x8_t vk4x01234567 = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
259       const uint8x8_t vi4x89ABCDEF = vld1_u8(i4); i4 += 8;
260       const uint8x8_t vk4x89ABCDEF = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
261       const uint8x8_t vi4xGHIJKLMN = vld1_u8(i4); i4 += 8;
262       const uint8x8_t vk4xGHIJKLMN = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
263 
264       vprod01234567 = vmull_u8(vi4x01234567, vk4x01234567);
265       vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567);
266       vprod89ABCDEF = vmull_u8(vi4x89ABCDEF, vk4x89ABCDEF);
267       vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi4x89ABCDEF);
268       vprodGHIJKLMN = vmull_u8(vi4xGHIJKLMN, vk4xGHIJKLMN);
269       vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi4xGHIJKLMN);
270 
271       vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
272       vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
273       vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
274       vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));
275       vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vprodGHIJKLMN)));
276       vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vprodGHIJKLMN)));
277       const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8;
278       const uint8x8_t vk5x01234567 = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
279       const uint8x8_t vi5x89ABCDEF = vld1_u8(i5); i5 += 8;
280       const uint8x8_t vk5x89ABCDEF = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
281       const uint8x8_t vi5xGHIJKLMN = vld1_u8(i5); i5 += 8;
282       const uint8x8_t vk5xGHIJKLMN = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
283 
284       vprod01234567 = vmull_u8(vi5x01234567, vk5x01234567);
285       vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567);
286       vprod89ABCDEF = vmull_u8(vi5x89ABCDEF, vk5x89ABCDEF);
287       vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi5x89ABCDEF);
288       vprodGHIJKLMN = vmull_u8(vi5xGHIJKLMN, vk5xGHIJKLMN);
289       vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi5xGHIJKLMN);
290 
291       vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
292       vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
293       vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
294       vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));
295       vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vprodGHIJKLMN)));
296       vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vprodGHIJKLMN)));
297       const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8;
298       const uint8x8_t vk6x01234567 = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
299       const uint8x8_t vi6x89ABCDEF = vld1_u8(i6); i6 += 8;
300       const uint8x8_t vk6x89ABCDEF = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
301       const uint8x8_t vi6xGHIJKLMN = vld1_u8(i6); i6 += 8;
302       const uint8x8_t vk6xGHIJKLMN = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
303 
304       vprod01234567 = vmull_u8(vi6x01234567, vk6x01234567);
305       vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567);
306       vprod89ABCDEF = vmull_u8(vi6x89ABCDEF, vk6x89ABCDEF);
307       vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi6x89ABCDEF);
308       vprodGHIJKLMN = vmull_u8(vi6xGHIJKLMN, vk6xGHIJKLMN);
309       vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi6xGHIJKLMN);
310 
311       vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
312       vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
313       vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
314       vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));
315       vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vprodGHIJKLMN)));
316       vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vprodGHIJKLMN)));
317       const uint8x8_t vi7x01234567 = vld1_u8(i7); i7 += 8;
318       const uint8x8_t vk7x01234567 = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
319       const uint8x8_t vi7x89ABCDEF = vld1_u8(i7); i7 += 8;
320       const uint8x8_t vk7x89ABCDEF = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
321       const uint8x8_t vi7xGHIJKLMN = vld1_u8(i7); i7 += 8;
322       const uint8x8_t vk7xGHIJKLMN = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
323 
324       vprod01234567 = vmull_u8(vi7x01234567, vk7x01234567);
325       vsum01234567 = vaddw_u8(vsum01234567, vi7x01234567);
326       vprod89ABCDEF = vmull_u8(vi7x89ABCDEF, vk7x89ABCDEF);
327       vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi7x89ABCDEF);
328       vprodGHIJKLMN = vmull_u8(vi7xGHIJKLMN, vk7xGHIJKLMN);
329       vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi7xGHIJKLMN);
330 
331       vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
332       vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
333       vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
334       vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));
335       vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vprodGHIJKLMN)));
336       vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vprodGHIJKLMN)));
337       const uint8x8_t vi8x01234567 = vld1_u8(i8); i8 += 8;
338       const uint8x8_t vk8x01234567 = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
339       const uint8x8_t vi8x89ABCDEF = vld1_u8(i8); i8 += 8;
340       const uint8x8_t vk8x89ABCDEF = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
341       const uint8x8_t vi8xGHIJKLMN = vld1_u8(i8); i8 += 8;
342       const uint8x8_t vk8xGHIJKLMN = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
343 
344       vprod01234567 = vmull_u8(vi8x01234567, vk8x01234567);
345       vsum01234567 = vaddw_u8(vsum01234567, vi8x01234567);
346       vprod89ABCDEF = vmull_u8(vi8x89ABCDEF, vk8x89ABCDEF);
347       vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi8x89ABCDEF);
348       vprodGHIJKLMN = vmull_u8(vi8xGHIJKLMN, vk8xGHIJKLMN);
349       vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi8xGHIJKLMN);
350 
351       vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
352       vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
353       vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
354       vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));
355       vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vprodGHIJKLMN)));
356       vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vprodGHIJKLMN)));
357       const uint8x8_t vi9x01234567 = vld1_u8(i9); i9 += 8;
358       const uint8x8_t vk9x01234567 = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
359       const uint8x8_t vi9x89ABCDEF = vld1_u8(i9); i9 += 8;
360       const uint8x8_t vk9x89ABCDEF = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
361       const uint8x8_t vi9xGHIJKLMN = vld1_u8(i9); i9 += 8;
362       const uint8x8_t vk9xGHIJKLMN = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
363 
364       vprod01234567 = vmull_u8(vi9x01234567, vk9x01234567);
365       vsum01234567 = vaddw_u8(vsum01234567, vi9x01234567);
366       vprod89ABCDEF = vmull_u8(vi9x89ABCDEF, vk9x89ABCDEF);
367       vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi9x89ABCDEF);
368       vprodGHIJKLMN = vmull_u8(vi9xGHIJKLMN, vk9xGHIJKLMN);
369       vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi9xGHIJKLMN);
370 
371       vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
372       vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
373       vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
374       vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));
375       vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vprodGHIJKLMN)));
376       vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vprodGHIJKLMN)));
377       const uint8x8_t vi10x01234567 = vld1_u8(i10); i10 += 8;
378       const uint8x8_t vk10x01234567 = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
379       const uint8x8_t vi10x89ABCDEF = vld1_u8(i10); i10 += 8;
380       const uint8x8_t vk10x89ABCDEF = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
381       const uint8x8_t vi10xGHIJKLMN = vld1_u8(i10); i10 += 8;
382       const uint8x8_t vk10xGHIJKLMN = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
383 
384       vprod01234567 = vmull_u8(vi10x01234567, vk10x01234567);
385       vsum01234567 = vaddw_u8(vsum01234567, vi10x01234567);
386       vprod89ABCDEF = vmull_u8(vi10x89ABCDEF, vk10x89ABCDEF);
387       vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi10x89ABCDEF);
388       vprodGHIJKLMN = vmull_u8(vi10xGHIJKLMN, vk10xGHIJKLMN);
389       vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi10xGHIJKLMN);
390 
391       vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
392       vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
393       vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
394       vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));
395       vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vprodGHIJKLMN)));
396       vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vprodGHIJKLMN)));
397       const uint8x8_t vi11x01234567 = vld1_u8(i11); i11 += 8;
398       const uint8x8_t vk11x01234567 = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
399       const uint8x8_t vi11x89ABCDEF = vld1_u8(i11); i11 += 8;
400       const uint8x8_t vk11x89ABCDEF = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
401       const uint8x8_t vi11xGHIJKLMN = vld1_u8(i11); i11 += 8;
402       const uint8x8_t vk11xGHIJKLMN = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
403 
404       vprod01234567 = vmull_u8(vi11x01234567, vk11x01234567);
405       vsum01234567 = vaddw_u8(vsum01234567, vi11x01234567);
406       vprod89ABCDEF = vmull_u8(vi11x89ABCDEF, vk11x89ABCDEF);
407       vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi11x89ABCDEF);
408       vprodGHIJKLMN = vmull_u8(vi11xGHIJKLMN, vk11xGHIJKLMN);
409       vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi11xGHIJKLMN);
410 
411       vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
412       vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
413       vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
414       vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));
415       vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vprodGHIJKLMN)));
416       vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vprodGHIJKLMN)));
417       const uint8x8_t vi12x01234567 = vld1_u8(i12); i12 += 8;
418       const uint8x8_t vk12x01234567 = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
419       const uint8x8_t vi12x89ABCDEF = vld1_u8(i12); i12 += 8;
420       const uint8x8_t vk12x89ABCDEF = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
421       const uint8x8_t vi12xGHIJKLMN = vld1_u8(i12); i12 += 8;
422       const uint8x8_t vk12xGHIJKLMN = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
423 
424       vprod01234567 = vmull_u8(vi12x01234567, vk12x01234567);
425       vsum01234567 = vaddw_u8(vsum01234567, vi12x01234567);
426       vprod89ABCDEF = vmull_u8(vi12x89ABCDEF, vk12x89ABCDEF);
427       vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi12x89ABCDEF);
428       vprodGHIJKLMN = vmull_u8(vi12xGHIJKLMN, vk12xGHIJKLMN);
429       vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi12xGHIJKLMN);
430 
431       vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
432       vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
433       vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
434       vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));
435       vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vprodGHIJKLMN)));
436       vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vprodGHIJKLMN)));
437       const uint8x8_t vi13x01234567 = vld1_u8(i13); i13 += 8;
438       const uint8x8_t vk13x01234567 = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
439       const uint8x8_t vi13x89ABCDEF = vld1_u8(i13); i13 += 8;
440       const uint8x8_t vk13x89ABCDEF = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
441       const uint8x8_t vi13xGHIJKLMN = vld1_u8(i13); i13 += 8;
442       const uint8x8_t vk13xGHIJKLMN = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
443 
444       vprod01234567 = vmull_u8(vi13x01234567, vk13x01234567);
445       vsum01234567 = vaddw_u8(vsum01234567, vi13x01234567);
446       vprod89ABCDEF = vmull_u8(vi13x89ABCDEF, vk13x89ABCDEF);
447       vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi13x89ABCDEF);
448       vprodGHIJKLMN = vmull_u8(vi13xGHIJKLMN, vk13xGHIJKLMN);
449       vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi13xGHIJKLMN);
450 
451       vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
452       vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
453       vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
454       vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));
455       vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vprodGHIJKLMN)));
456       vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vprodGHIJKLMN)));
457       const uint8x8_t vi14x01234567 = vld1_u8(i14); i14 += 8;
458       const uint8x8_t vk14x01234567 = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
459       const uint8x8_t vi14x89ABCDEF = vld1_u8(i14); i14 += 8;
460       const uint8x8_t vk14x89ABCDEF = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
461       const uint8x8_t vi14xGHIJKLMN = vld1_u8(i14); i14 += 8;
462       const uint8x8_t vk14xGHIJKLMN = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
463 
464       vprod01234567 = vmull_u8(vi14x01234567, vk14x01234567);
465       vsum01234567 = vaddw_u8(vsum01234567, vi14x01234567);
466       vprod89ABCDEF = vmull_u8(vi14x89ABCDEF, vk14x89ABCDEF);
467       vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi14x89ABCDEF);
468       vprodGHIJKLMN = vmull_u8(vi14xGHIJKLMN, vk14xGHIJKLMN);
469       vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi14xGHIJKLMN);
470 
471       vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
472       vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
473       vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
474       vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));
475       vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vprodGHIJKLMN)));
476       vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vprodGHIJKLMN)));
477       const uint8x8_t vi15x01234567 = vld1_u8(i15); i15 += 8;
478       const uint8x8_t vk15x01234567 = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
479       const uint8x8_t vi15x89ABCDEF = vld1_u8(i15); i15 += 8;
480       const uint8x8_t vk15x89ABCDEF = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
481       const uint8x8_t vi15xGHIJKLMN = vld1_u8(i15); i15 += 8;
482       const uint8x8_t vk15xGHIJKLMN = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
483 
484       vprod01234567 = vmull_u8(vi15x01234567, vk15x01234567);
485       vsum01234567 = vaddw_u8(vsum01234567, vi15x01234567);
486       vprod89ABCDEF = vmull_u8(vi15x89ABCDEF, vk15x89ABCDEF);
487       vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi15x89ABCDEF);
488       vprodGHIJKLMN = vmull_u8(vi15xGHIJKLMN, vk15xGHIJKLMN);
489       vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi15xGHIJKLMN);
490 
491       vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
492       vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
493       vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
494       vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));
495       vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vprodGHIJKLMN)));
496       vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vprodGHIJKLMN)));
497       const uint8x8_t vi16x01234567 = vld1_u8(i16); i16 += 8;
498       const uint8x8_t vk16x01234567 = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
499       const uint8x8_t vi16x89ABCDEF = vld1_u8(i16); i16 += 8;
500       const uint8x8_t vk16x89ABCDEF = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
501       const uint8x8_t vi16xGHIJKLMN = vld1_u8(i16); i16 += 8;
502       const uint8x8_t vk16xGHIJKLMN = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
503 
504       vprod01234567 = vmull_u8(vi16x01234567, vk16x01234567);
505       vsum01234567 = vaddw_u8(vsum01234567, vi16x01234567);
506       vprod89ABCDEF = vmull_u8(vi16x89ABCDEF, vk16x89ABCDEF);
507       vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi16x89ABCDEF);
508       vprodGHIJKLMN = vmull_u8(vi16xGHIJKLMN, vk16xGHIJKLMN);
509       vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi16xGHIJKLMN);
510 
511       vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
512       vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
513       vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
514       vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));
515       vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vprodGHIJKLMN)));
516       vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vprodGHIJKLMN)));
517       const uint8x8_t vi17x01234567 = vld1_u8(i17); i17 += 8;
518       const uint8x8_t vk17x01234567 = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
519       const uint8x8_t vi17x89ABCDEF = vld1_u8(i17); i17 += 8;
520       const uint8x8_t vk17x89ABCDEF = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
521       const uint8x8_t vi17xGHIJKLMN = vld1_u8(i17); i17 += 8;
522       const uint8x8_t vk17xGHIJKLMN = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
523 
524       vprod01234567 = vmull_u8(vi17x01234567, vk17x01234567);
525       vsum01234567 = vaddw_u8(vsum01234567, vi17x01234567);
526       vprod89ABCDEF = vmull_u8(vi17x89ABCDEF, vk17x89ABCDEF);
527       vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi17x89ABCDEF);
528       vprodGHIJKLMN = vmull_u8(vi17xGHIJKLMN, vk17xGHIJKLMN);
529       vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi17xGHIJKLMN);
530 
531       vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
532       vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
533       vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
534       vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));
535       vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vprodGHIJKLMN)));
536       vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vprodGHIJKLMN)));
537       const uint8x8_t vi18x01234567 = vld1_u8(i18); i18 += 8;
538       const uint8x8_t vk18x01234567 = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
539       const uint8x8_t vi18x89ABCDEF = vld1_u8(i18); i18 += 8;
540       const uint8x8_t vk18x89ABCDEF = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
541       const uint8x8_t vi18xGHIJKLMN = vld1_u8(i18); i18 += 8;
542       const uint8x8_t vk18xGHIJKLMN = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
543 
544       vprod01234567 = vmull_u8(vi18x01234567, vk18x01234567);
545       vsum01234567 = vaddw_u8(vsum01234567, vi18x01234567);
546       vprod89ABCDEF = vmull_u8(vi18x89ABCDEF, vk18x89ABCDEF);
547       vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi18x89ABCDEF);
548       vprodGHIJKLMN = vmull_u8(vi18xGHIJKLMN, vk18xGHIJKLMN);
549       vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi18xGHIJKLMN);
550 
551       vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
552       vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
553       vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
554       vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));
555       vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vprodGHIJKLMN)));
556       vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vprodGHIJKLMN)));
557       const uint8x8_t vi19x01234567 = vld1_u8(i19); i19 += 8;
558       const uint8x8_t vk19x01234567 = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
559       const uint8x8_t vi19x89ABCDEF = vld1_u8(i19); i19 += 8;
560       const uint8x8_t vk19x89ABCDEF = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
561       const uint8x8_t vi19xGHIJKLMN = vld1_u8(i19); i19 += 8;
562       const uint8x8_t vk19xGHIJKLMN = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
563 
564       vprod01234567 = vmull_u8(vi19x01234567, vk19x01234567);
565       vsum01234567 = vaddw_u8(vsum01234567, vi19x01234567);
566       vprod89ABCDEF = vmull_u8(vi19x89ABCDEF, vk19x89ABCDEF);
567       vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi19x89ABCDEF);
568       vprodGHIJKLMN = vmull_u8(vi19xGHIJKLMN, vk19xGHIJKLMN);
569       vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi19xGHIJKLMN);
570 
571       vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
572       vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
573       vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
574       vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));
575       vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vprodGHIJKLMN)));
576       vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vprodGHIJKLMN)));
577       const uint8x8_t vi20x01234567 = vld1_u8(i20); i20 += 8;
578       const uint8x8_t vk20x01234567 = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
579       const uint8x8_t vi20x89ABCDEF = vld1_u8(i20); i20 += 8;
580       const uint8x8_t vk20x89ABCDEF = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
581       const uint8x8_t vi20xGHIJKLMN = vld1_u8(i20); i20 += 8;
582       const uint8x8_t vk20xGHIJKLMN = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
583 
584       vprod01234567 = vmull_u8(vi20x01234567, vk20x01234567);
585       vsum01234567 = vaddw_u8(vsum01234567, vi20x01234567);
586       vprod89ABCDEF = vmull_u8(vi20x89ABCDEF, vk20x89ABCDEF);
587       vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi20x89ABCDEF);
588       vprodGHIJKLMN = vmull_u8(vi20xGHIJKLMN, vk20xGHIJKLMN);
589       vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi20xGHIJKLMN);
590 
591       vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
592       vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
593       vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
594       vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));
595       vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vprodGHIJKLMN)));
596       vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vprodGHIJKLMN)));
597       const uint8x8_t vi21x01234567 = vld1_u8(i21); i21 += 8;
598       const uint8x8_t vk21x01234567 = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
599       const uint8x8_t vi21x89ABCDEF = vld1_u8(i21); i21 += 8;
600       const uint8x8_t vk21x89ABCDEF = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
601       const uint8x8_t vi21xGHIJKLMN = vld1_u8(i21); i21 += 8;
602       const uint8x8_t vk21xGHIJKLMN = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
603 
604       vprod01234567 = vmull_u8(vi21x01234567, vk21x01234567);
605       vsum01234567 = vaddw_u8(vsum01234567, vi21x01234567);
606       vprod89ABCDEF = vmull_u8(vi21x89ABCDEF, vk21x89ABCDEF);
607       vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi21x89ABCDEF);
608       vprodGHIJKLMN = vmull_u8(vi21xGHIJKLMN, vk21xGHIJKLMN);
609       vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi21xGHIJKLMN);
610 
611       vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
612       vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
613       vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
614       vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));
615       vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vprodGHIJKLMN)));
616       vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vprodGHIJKLMN)));
617       const uint8x8_t vi22x01234567 = vld1_u8(i22); i22 += 8;
618       const uint8x8_t vk22x01234567 = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
619       const uint8x8_t vi22x89ABCDEF = vld1_u8(i22); i22 += 8;
620       const uint8x8_t vk22x89ABCDEF = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
621       const uint8x8_t vi22xGHIJKLMN = vld1_u8(i22); i22 += 8;
622       const uint8x8_t vk22xGHIJKLMN = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
623 
624       vprod01234567 = vmull_u8(vi22x01234567, vk22x01234567);
625       vsum01234567 = vaddw_u8(vsum01234567, vi22x01234567);
626       vprod89ABCDEF = vmull_u8(vi22x89ABCDEF, vk22x89ABCDEF);
627       vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi22x89ABCDEF);
628       vprodGHIJKLMN = vmull_u8(vi22xGHIJKLMN, vk22xGHIJKLMN);
629       vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi22xGHIJKLMN);
630 
631       vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
632       vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
633       vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
634       vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));
635       vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vprodGHIJKLMN)));
636       vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vprodGHIJKLMN)));
637       const uint8x8_t vi23x01234567 = vld1_u8(i23); i23 += 8;
638       const uint8x8_t vk23x01234567 = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
639       const uint8x8_t vi23x89ABCDEF = vld1_u8(i23); i23 += 8;
640       const uint8x8_t vk23x89ABCDEF = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
641       const uint8x8_t vi23xGHIJKLMN = vld1_u8(i23); i23 += 8;
642       const uint8x8_t vk23xGHIJKLMN = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
643 
644       vprod01234567 = vmull_u8(vi23x01234567, vk23x01234567);
645       vsum01234567 = vaddw_u8(vsum01234567, vi23x01234567);
646       vprod89ABCDEF = vmull_u8(vi23x89ABCDEF, vk23x89ABCDEF);
647       vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi23x89ABCDEF);
648       vprodGHIJKLMN = vmull_u8(vi23xGHIJKLMN, vk23xGHIJKLMN);
649       vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi23xGHIJKLMN);
650 
651       vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
652       vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
653       vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
654       vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));
655       vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vprodGHIJKLMN)));
656       vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vprodGHIJKLMN)));
657       const uint8x8_t vi24x01234567 = vld1_u8(i24); i24 += 8;
658       const uint8x8_t vk24x01234567 = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
659       const uint8x8_t vi24x89ABCDEF = vld1_u8(i24); i24 += 8;
660       const uint8x8_t vk24x89ABCDEF = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
661       const uint8x8_t vi24xGHIJKLMN = vld1_u8(i24); i24 += 8;
662       const uint8x8_t vk24xGHIJKLMN = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
663 
664       vprod01234567 = vmull_u8(vi24x01234567, vk24x01234567);
665       vsum01234567 = vaddw_u8(vsum01234567, vi24x01234567);
666       vprod89ABCDEF = vmull_u8(vi24x89ABCDEF, vk24x89ABCDEF);
667       vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi24x89ABCDEF);
668       vprodGHIJKLMN = vmull_u8(vi24xGHIJKLMN, vk24xGHIJKLMN);
669       vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi24xGHIJKLMN);
670 
671       vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
672       vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
673       vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
674       vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));
675       vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vprodGHIJKLMN)));
676       vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vprodGHIJKLMN)));
677 
678       vacc0123 = vreinterpretq_s32_u32(vmlsl_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vsum01234567), vget_low_u16(vkernel_zero_point16)));
679       vacc4567 = vreinterpretq_s32_u32(vmlsl_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vsum01234567), vget_high_u16(vkernel_zero_point16)));
680       vacc89AB = vreinterpretq_s32_u32(vmlsl_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vsum89ABCDEF), vget_low_u16(vkernel_zero_point16)));
681       vaccCDEF = vreinterpretq_s32_u32(vmlsl_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vsum89ABCDEF), vget_high_u16(vkernel_zero_point16)));
682       vaccGHIJ = vreinterpretq_s32_u32(vmlsl_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vsumGHIJKLMN), vget_low_u16(vkernel_zero_point16)));
683       vaccKLMN = vreinterpretq_s32_u32(vmlsl_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vsumGHIJKLMN), vget_high_u16(vkernel_zero_point16)));
684 
685       vacc0123 = vshlq_s32(vacc0123, vright_pre_shift);
686       vacc4567 = vshlq_s32(vacc4567, vright_pre_shift);
687       vacc89AB = vshlq_s32(vacc89AB, vright_pre_shift);
688       vaccCDEF = vshlq_s32(vaccCDEF, vright_pre_shift);
689       vaccGHIJ = vshlq_s32(vaccGHIJ, vright_pre_shift);
690       vaccKLMN = vshlq_s32(vaccKLMN, vright_pre_shift);
691 
692       vacc0123 = vqdmulhq_s32(vacc0123, vmultiplier);
693       vacc4567 = vqdmulhq_s32(vacc4567, vmultiplier);
694       vacc89AB = vqdmulhq_s32(vacc89AB, vmultiplier);
695       vaccCDEF = vqdmulhq_s32(vaccCDEF, vmultiplier);
696       vaccGHIJ = vqdmulhq_s32(vaccGHIJ, vmultiplier);
697       vaccKLMN = vqdmulhq_s32(vaccKLMN, vmultiplier);
698 
699       vacc0123 = vrshlq_s32(vacc0123, vright_post_shift);
700       vacc4567 = vrshlq_s32(vacc4567, vright_post_shift);
701       vacc89AB = vrshlq_s32(vacc89AB, vright_post_shift);
702       vaccCDEF = vrshlq_s32(vaccCDEF, vright_post_shift);
703       vaccGHIJ = vrshlq_s32(vaccGHIJ, vright_post_shift);
704       vaccKLMN = vrshlq_s32(vaccKLMN, vright_post_shift);
705 
706 #if XNN_ARCH_ARM64
707       const int16x8_t vacc01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567), voutput_zero_point);
708       const int16x8_t vacc89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF), voutput_zero_point);
709       const int16x8_t vaccGHIJKLMN = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vaccGHIJ), vaccKLMN), voutput_zero_point);
710 
711       uint8x16_t vout0123456789ABCDEF = vqmovun_high_s16(vqmovun_s16(vacc01234567), vacc89ABCDEF);
712       uint8x8_t voutGHIJKLMN = vqmovun_s16(vaccGHIJKLMN);
713 #else
714       const int16x8_t vacc01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)), voutput_zero_point);
715       const int16x8_t vacc89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF)), voutput_zero_point);
716       const int16x8_t vaccGHIJKLMN = vqaddq_s16(vcombine_s16(vqmovn_s32(vaccGHIJ), vqmovn_s32(vaccKLMN)), voutput_zero_point);
717 
718       uint8x16_t vout0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc01234567), vqmovun_s16(vacc89ABCDEF));
719       uint8x8_t voutGHIJKLMN = vqmovun_s16(vaccGHIJKLMN);
720 #endif
721 
722       vout0123456789ABCDEF = vmaxq_u8(vout0123456789ABCDEF, voutput_min);
723       voutGHIJKLMN = vmax_u8(voutGHIJKLMN, vget_low_u8(voutput_min));
724 
725       vout0123456789ABCDEF = vminq_u8(vout0123456789ABCDEF, voutput_max);
726       voutGHIJKLMN = vmin_u8(voutGHIJKLMN, vget_low_u8(voutput_max));
727 
728       vst1q_u8(output, vout0123456789ABCDEF); output += 16;
729       vst1_u8(output, voutGHIJKLMN); output += 8;
730     }
731     if XNN_UNLIKELY(c != 0) {
732       const uint8_t* k = (const uint8_t*) ((const int32_t*) w + 24);
733       do {
734         int32x4_t vacc0123 = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4);
735         int32x4_t vacc4567 = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4);
736 
737         const int16x8_t vi0x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i0))); i0 += 8;
738         const int16x8_t vk0x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8(k), vkernel_zero_point)); k += 8;
739 
740         vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi0x01234567), vget_low_s16(vk0x01234567));
741         vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi0x01234567), vget_high_s16(vk0x01234567));
742         const int16x8_t vi1x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i1))); i1 += 8;
743         const int16x8_t vk1x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) (k + 16)), vkernel_zero_point));
744 
745         vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi1x01234567), vget_low_s16(vk1x01234567));
746         vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi1x01234567), vget_high_s16(vk1x01234567));
747         const int16x8_t vi2x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i2))); i2 += 8;
748         const int16x8_t vk2x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) (k + 40)), vkernel_zero_point));
749 
750         vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi2x01234567), vget_low_s16(vk2x01234567));
751         vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi2x01234567), vget_high_s16(vk2x01234567));
752         const int16x8_t vi3x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i3))); i3 += 8;
753         const int16x8_t vk3x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) (k + 64)), vkernel_zero_point));
754 
755         vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi3x01234567), vget_low_s16(vk3x01234567));
756         vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi3x01234567), vget_high_s16(vk3x01234567));
757         const int16x8_t vi4x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i4))); i4 += 8;
758         const int16x8_t vk4x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) (k + 88)), vkernel_zero_point));
759 
760         vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi4x01234567), vget_low_s16(vk4x01234567));
761         vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi4x01234567), vget_high_s16(vk4x01234567));
762         const int16x8_t vi5x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i5))); i5 += 8;
763         const int16x8_t vk5x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) (k + 112)), vkernel_zero_point));
764 
765         vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi5x01234567), vget_low_s16(vk5x01234567));
766         vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi5x01234567), vget_high_s16(vk5x01234567));
767         const int16x8_t vi6x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i6))); i6 += 8;
768         const int16x8_t vk6x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) (k + 136)), vkernel_zero_point));
769 
770         vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi6x01234567), vget_low_s16(vk6x01234567));
771         vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi6x01234567), vget_high_s16(vk6x01234567));
772         const int16x8_t vi7x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i7))); i7 += 8;
773         const int16x8_t vk7x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) (k + 160)), vkernel_zero_point));
774 
775         vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi7x01234567), vget_low_s16(vk7x01234567));
776         vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi7x01234567), vget_high_s16(vk7x01234567));
777         const int16x8_t vi8x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i8))); i8 += 8;
778         const int16x8_t vk8x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) (k + 184)), vkernel_zero_point));
779 
780         vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi8x01234567), vget_low_s16(vk8x01234567));
781         vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi8x01234567), vget_high_s16(vk8x01234567));
782         const int16x8_t vi9x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i9))); i9 += 8;
783         const int16x8_t vk9x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) (k + 208)), vkernel_zero_point));
784 
785         vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi9x01234567), vget_low_s16(vk9x01234567));
786         vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi9x01234567), vget_high_s16(vk9x01234567));
787         const int16x8_t vi10x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i10))); i10 += 8;
788         const int16x8_t vk10x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) (k + 232)), vkernel_zero_point));
789 
790         vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi10x01234567), vget_low_s16(vk10x01234567));
791         vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi10x01234567), vget_high_s16(vk10x01234567));
792         const int16x8_t vi11x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i11))); i11 += 8;
793         const int16x8_t vk11x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) (k + 256)), vkernel_zero_point));
794 
795         vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi11x01234567), vget_low_s16(vk11x01234567));
796         vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi11x01234567), vget_high_s16(vk11x01234567));
797         const int16x8_t vi12x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i12))); i12 += 8;
798         const int16x8_t vk12x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) (k + 280)), vkernel_zero_point));
799 
800         vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi12x01234567), vget_low_s16(vk12x01234567));
801         vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi12x01234567), vget_high_s16(vk12x01234567));
802         const int16x8_t vi13x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i13))); i13 += 8;
803         const int16x8_t vk13x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) (k + 304)), vkernel_zero_point));
804 
805         vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi13x01234567), vget_low_s16(vk13x01234567));
806         vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi13x01234567), vget_high_s16(vk13x01234567));
807         const int16x8_t vi14x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i14))); i14 += 8;
808         const int16x8_t vk14x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) (k + 328)), vkernel_zero_point));
809 
810         vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi14x01234567), vget_low_s16(vk14x01234567));
811         vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi14x01234567), vget_high_s16(vk14x01234567));
812         const int16x8_t vi15x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i15))); i15 += 8;
813         const int16x8_t vk15x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) (k + 352)), vkernel_zero_point));
814 
815         vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi15x01234567), vget_low_s16(vk15x01234567));
816         vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi15x01234567), vget_high_s16(vk15x01234567));
817         const int16x8_t vi16x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i16))); i16 += 8;
818         const int16x8_t vk16x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) (k + 376)), vkernel_zero_point));
819 
820         vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi16x01234567), vget_low_s16(vk16x01234567));
821         vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi16x01234567), vget_high_s16(vk16x01234567));
822         const int16x8_t vi17x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i17))); i17 += 8;
823         const int16x8_t vk17x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) (k + 400)), vkernel_zero_point));
824 
825         vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi17x01234567), vget_low_s16(vk17x01234567));
826         vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi17x01234567), vget_high_s16(vk17x01234567));
827         const int16x8_t vi18x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i18))); i18 += 8;
828         const int16x8_t vk18x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) (k + 424)), vkernel_zero_point));
829 
830         vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi18x01234567), vget_low_s16(vk18x01234567));
831         vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi18x01234567), vget_high_s16(vk18x01234567));
832         const int16x8_t vi19x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i19))); i19 += 8;
833         const int16x8_t vk19x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) (k + 448)), vkernel_zero_point));
834 
835         vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi19x01234567), vget_low_s16(vk19x01234567));
836         vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi19x01234567), vget_high_s16(vk19x01234567));
837         const int16x8_t vi20x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i20))); i20 += 8;
838         const int16x8_t vk20x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) (k + 472)), vkernel_zero_point));
839 
840         vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi20x01234567), vget_low_s16(vk20x01234567));
841         vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi20x01234567), vget_high_s16(vk20x01234567));
842         const int16x8_t vi21x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i21))); i21 += 8;
843         const int16x8_t vk21x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) (k + 496)), vkernel_zero_point));
844 
845         vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi21x01234567), vget_low_s16(vk21x01234567));
846         vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi21x01234567), vget_high_s16(vk21x01234567));
847         const int16x8_t vi22x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i22))); i22 += 8;
848         const int16x8_t vk22x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) (k + 520)), vkernel_zero_point));
849 
850         vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi22x01234567), vget_low_s16(vk22x01234567));
851         vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi22x01234567), vget_high_s16(vk22x01234567));
852         const int16x8_t vi23x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i23))); i23 += 8;
853         const int16x8_t vk23x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) (k + 544)), vkernel_zero_point));
854 
855         vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi23x01234567), vget_low_s16(vk23x01234567));
856         vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi23x01234567), vget_high_s16(vk23x01234567));
857         const int16x8_t vi24x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i24))); i24 += 8;
858         const int16x8_t vk24x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) (k + 568)), vkernel_zero_point));
859 
860         vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi24x01234567), vget_low_s16(vk24x01234567));
861         vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi24x01234567), vget_high_s16(vk24x01234567));
862 
863         vacc0123 = vrshlq_s32(vacc0123, vright_pre_shift);
864         vacc4567 = vrshlq_s32(vacc4567, vright_pre_shift);
865 
866         vacc0123 = vqdmulhq_s32(vacc0123, vmultiplier);
867         vacc4567 = vqdmulhq_s32(vacc4567, vmultiplier);
868 
869         vacc0123 = vrshlq_s32(vacc0123, vright_post_shift);
870         vacc4567 = vrshlq_s32(vacc4567, vright_post_shift);
871 
872 #if XNN_ARCH_ARM64
873         const int16x8_t vacc01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567), voutput_zero_point);
874         uint8x8_t vout01234567 = vqmovun_s16(vacc01234567);
875 #else
876         const int16x8_t vacc01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)), voutput_zero_point);
877         uint8x8_t vout01234567 = vqmovun_s16(vacc01234567);
878 #endif
879 
880         vout01234567 = vmax_u8(vout01234567, vget_low_u8(voutput_min));
881         vout01234567 = vmin_u8(vout01234567, vget_low_u8(voutput_max));
882 
883         if XNN_LIKELY(c >= 8) {
884           vst1_u8(output, vout01234567); output += 8;
885           c -= 8;
886         } else {
887           if (c & 4) {
888             vst1_lane_u32((void*) output, vreinterpret_u32_u8(vout01234567), 0); output += 4;
889             vout01234567 = vext_u8(vout01234567, vout01234567, 4);
890           }
891           if (c & 2) {
892             vst1_lane_u16((void*) output, vreinterpret_u16_u8(vout01234567), 0); output += 2;
893             vout01234567 = vext_u8(vout01234567, vout01234567, 2);
894           }
895           if (c & 1) {
896             vst1_lane_u8(output, vout01234567, 0); output += 1;
897           }
898           c = 0;
899         }
900       } while (c != 0);
901     }
902 
903     output = (uint8_t*) ((uintptr_t) output + output_increment);
904   } while (--output_width != 0);
905 }
906