xref: /aosp_15_r20/external/XNNPACK/src/qu8-dwconv/gen/up16x25-minmax-rndnu-neon-mul8.c (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1 // Auto-generated file. Do not edit!
2 //   Template: src/qu8-dwconv/unipass-neon-mul8.c.in
3 //   Generator: tools/xngen
4 //
5 // Copyright 2020 Google LLC
6 //
7 // This source code is licensed under the BSD-style license found in the
8 // LICENSE file in the root directory of this source tree.
9 
10 #include <assert.h>
11 
12 #include <arm_neon.h>
13 
14 #include <xnnpack/dwconv.h>
15 
16 
17 void xnn_qu8_dwconv_minmax_rndnu_ukernel_up16x25__neon_mul8(
18     size_t channels,
19     size_t output_width,
20     const uint8_t** input,
21     const void* weights,
22     uint8_t* output,
23     size_t input_stride,
24     size_t output_increment,
25     size_t input_offset,
26     const uint8_t* zero,
27     const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
28 {
29   assert(channels != 0);
30   assert(output_width != 0);
31 
32   const uint8x8_t vkernel_zero_point = vld1_dup_u8(params->rndnu_neon.kernel_zero_point);
33   const uint16x8_t vkernel_zero_point16 = vmovl_u8(vkernel_zero_point);
34   const int32x4_t vright_pre_shift = vld1q_dup_s32(&params->rndnu_neon.right_pre_shift);
35   const int32x4_t vmultiplier = vld1q_dup_s32(&params->rndnu_neon.multiplier);
36   const int32x4_t vright_post_shift = vld1q_dup_s32(&params->rndnu_neon.right_post_shift);
37   const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->rndnu_neon.output_zero_point);
38   const uint8x16_t voutput_min = vld1q_dup_u8(&params->rndnu_neon.output_min);
39   const uint8x16_t voutput_max = vld1q_dup_u8(&params->rndnu_neon.output_max);
40   do {
41     const uint8_t* i0 = input[0];
42     assert(i0 != NULL);
43     if XNN_UNPREDICTABLE(i0 != zero) {
44       i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset);
45     }
46     const uint8_t* i1 = input[1];
47     assert(i1 != NULL);
48     if XNN_UNPREDICTABLE(i1 != zero) {
49       i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset);
50     }
51     const uint8_t* i2 = input[2];
52     assert(i2 != NULL);
53     if XNN_UNPREDICTABLE(i2 != zero) {
54       i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset);
55     }
56     const uint8_t* i3 = input[3];
57     assert(i3 != NULL);
58     if XNN_UNPREDICTABLE(i3 != zero) {
59       i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset);
60     }
61     const uint8_t* i4 = input[4];
62     assert(i4 != NULL);
63     if XNN_UNPREDICTABLE(i4 != zero) {
64       i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset);
65     }
66     const uint8_t* i5 = input[5];
67     assert(i5 != NULL);
68     if XNN_UNPREDICTABLE(i5 != zero) {
69       i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset);
70     }
71     const uint8_t* i6 = input[6];
72     assert(i6 != NULL);
73     if XNN_UNPREDICTABLE(i6 != zero) {
74       i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset);
75     }
76     const uint8_t* i7 = input[7];
77     assert(i7 != NULL);
78     if XNN_UNPREDICTABLE(i7 != zero) {
79       i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset);
80     }
81     const uint8_t* i8 = input[8];
82     assert(i8 != NULL);
83     if XNN_UNPREDICTABLE(i8 != zero) {
84       i8 = (const uint8_t*) ((uintptr_t) i8 + input_offset);
85     }
86     const uint8_t* i9 = input[9];
87     assert(i9 != NULL);
88     if XNN_UNPREDICTABLE(i9 != zero) {
89       i9 = (const uint8_t*) ((uintptr_t) i9 + input_offset);
90     }
91     const uint8_t* i10 = input[10];
92     assert(i10 != NULL);
93     if XNN_UNPREDICTABLE(i10 != zero) {
94       i10 = (const uint8_t*) ((uintptr_t) i10 + input_offset);
95     }
96     const uint8_t* i11 = input[11];
97     assert(i11 != NULL);
98     if XNN_UNPREDICTABLE(i11 != zero) {
99       i11 = (const uint8_t*) ((uintptr_t) i11 + input_offset);
100     }
101     const uint8_t* i12 = input[12];
102     assert(i12 != NULL);
103     if XNN_UNPREDICTABLE(i12 != zero) {
104       i12 = (const uint8_t*) ((uintptr_t) i12 + input_offset);
105     }
106     const uint8_t* i13 = input[13];
107     assert(i13 != NULL);
108     if XNN_UNPREDICTABLE(i13 != zero) {
109       i13 = (const uint8_t*) ((uintptr_t) i13 + input_offset);
110     }
111     const uint8_t* i14 = input[14];
112     assert(i14 != NULL);
113     if XNN_UNPREDICTABLE(i14 != zero) {
114       i14 = (const uint8_t*) ((uintptr_t) i14 + input_offset);
115     }
116     const uint8_t* i15 = input[15];
117     assert(i15 != NULL);
118     if XNN_UNPREDICTABLE(i15 != zero) {
119       i15 = (const uint8_t*) ((uintptr_t) i15 + input_offset);
120     }
121     const uint8_t* i16 = input[16];
122     assert(i16 != NULL);
123     if XNN_UNPREDICTABLE(i16 != zero) {
124       i16 = (const uint8_t*) ((uintptr_t) i16 + input_offset);
125     }
126     const uint8_t* i17 = input[17];
127     assert(i17 != NULL);
128     if XNN_UNPREDICTABLE(i17 != zero) {
129       i17 = (const uint8_t*) ((uintptr_t) i17 + input_offset);
130     }
131     const uint8_t* i18 = input[18];
132     assert(i18 != NULL);
133     if XNN_UNPREDICTABLE(i18 != zero) {
134       i18 = (const uint8_t*) ((uintptr_t) i18 + input_offset);
135     }
136     const uint8_t* i19 = input[19];
137     assert(i19 != NULL);
138     if XNN_UNPREDICTABLE(i19 != zero) {
139       i19 = (const uint8_t*) ((uintptr_t) i19 + input_offset);
140     }
141     const uint8_t* i20 = input[20];
142     assert(i20 != NULL);
143     if XNN_UNPREDICTABLE(i20 != zero) {
144       i20 = (const uint8_t*) ((uintptr_t) i20 + input_offset);
145     }
146     const uint8_t* i21 = input[21];
147     assert(i21 != NULL);
148     if XNN_UNPREDICTABLE(i21 != zero) {
149       i21 = (const uint8_t*) ((uintptr_t) i21 + input_offset);
150     }
151     const uint8_t* i22 = input[22];
152     assert(i22 != NULL);
153     if XNN_UNPREDICTABLE(i22 != zero) {
154       i22 = (const uint8_t*) ((uintptr_t) i22 + input_offset);
155     }
156     const uint8_t* i23 = input[23];
157     assert(i23 != NULL);
158     if XNN_UNPREDICTABLE(i23 != zero) {
159       i23 = (const uint8_t*) ((uintptr_t) i23 + input_offset);
160     }
161     const uint8_t* i24 = input[24];
162     assert(i24 != NULL);
163     if XNN_UNPREDICTABLE(i24 != zero) {
164       i24 = (const uint8_t*) ((uintptr_t) i24 + input_offset);
165     }
166     input = (const uint8_t**) ((uintptr_t) input + input_stride);
167 
168 
169     size_t c = channels;
170     const void* w = weights;
171     for (; c >= 16; c -= 16) {
172       int32x4_t vacc0123 = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4);
173       int32x4_t vacc4567 = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4);
174       int32x4_t vacc89AB = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4);
175       int32x4_t vaccCDEF = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4);
176 
177 
178       const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8;
179       const uint8x8_t vk0x01234567 = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
180       const uint8x8_t vi0x89ABCDEF = vld1_u8(i0); i0 += 8;
181       const uint8x8_t vk0x89ABCDEF = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
182 
183       uint16x8_t vprod01234567 = vmull_u8(vi0x01234567, vk0x01234567);
184       uint16x8_t vprod89ABCDEF = vmull_u8(vi0x89ABCDEF, vk0x89ABCDEF);
185 
186       vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
187       vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
188       vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
189       vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));
190       const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8;
191       const uint8x8_t vk1x01234567 = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
192       const uint8x8_t vi1x89ABCDEF = vld1_u8(i1); i1 += 8;
193       const uint8x8_t vk1x89ABCDEF = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
194 
195       vprod01234567 = vmull_u8(vi1x01234567, vk1x01234567);
196       uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567);
197       vprod89ABCDEF = vmull_u8(vi1x89ABCDEF, vk1x89ABCDEF);
198       uint16x8_t vsum89ABCDEF = vaddl_u8(vi0x89ABCDEF, vi1x89ABCDEF);
199 
200       vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
201       vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
202       vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
203       vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));
204       const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8;
205       const uint8x8_t vk2x01234567 = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
206       const uint8x8_t vi2x89ABCDEF = vld1_u8(i2); i2 += 8;
207       const uint8x8_t vk2x89ABCDEF = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
208 
209       vprod01234567 = vmull_u8(vi2x01234567, vk2x01234567);
210       vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567);
211       vprod89ABCDEF = vmull_u8(vi2x89ABCDEF, vk2x89ABCDEF);
212       vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi2x89ABCDEF);
213 
214       vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
215       vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
216       vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
217       vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));
218       const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8;
219       const uint8x8_t vk3x01234567 = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
220       const uint8x8_t vi3x89ABCDEF = vld1_u8(i3); i3 += 8;
221       const uint8x8_t vk3x89ABCDEF = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
222 
223       vprod01234567 = vmull_u8(vi3x01234567, vk3x01234567);
224       vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567);
225       vprod89ABCDEF = vmull_u8(vi3x89ABCDEF, vk3x89ABCDEF);
226       vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi3x89ABCDEF);
227 
228       vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
229       vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
230       vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
231       vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));
232       const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8;
233       const uint8x8_t vk4x01234567 = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
234       const uint8x8_t vi4x89ABCDEF = vld1_u8(i4); i4 += 8;
235       const uint8x8_t vk4x89ABCDEF = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
236 
237       vprod01234567 = vmull_u8(vi4x01234567, vk4x01234567);
238       vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567);
239       vprod89ABCDEF = vmull_u8(vi4x89ABCDEF, vk4x89ABCDEF);
240       vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi4x89ABCDEF);
241 
242       vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
243       vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
244       vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
245       vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));
246       const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8;
247       const uint8x8_t vk5x01234567 = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
248       const uint8x8_t vi5x89ABCDEF = vld1_u8(i5); i5 += 8;
249       const uint8x8_t vk5x89ABCDEF = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
250 
251       vprod01234567 = vmull_u8(vi5x01234567, vk5x01234567);
252       vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567);
253       vprod89ABCDEF = vmull_u8(vi5x89ABCDEF, vk5x89ABCDEF);
254       vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi5x89ABCDEF);
255 
256       vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
257       vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
258       vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
259       vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));
260       const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8;
261       const uint8x8_t vk6x01234567 = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
262       const uint8x8_t vi6x89ABCDEF = vld1_u8(i6); i6 += 8;
263       const uint8x8_t vk6x89ABCDEF = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
264 
265       vprod01234567 = vmull_u8(vi6x01234567, vk6x01234567);
266       vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567);
267       vprod89ABCDEF = vmull_u8(vi6x89ABCDEF, vk6x89ABCDEF);
268       vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi6x89ABCDEF);
269 
270       vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
271       vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
272       vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
273       vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));
274       const uint8x8_t vi7x01234567 = vld1_u8(i7); i7 += 8;
275       const uint8x8_t vk7x01234567 = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
276       const uint8x8_t vi7x89ABCDEF = vld1_u8(i7); i7 += 8;
277       const uint8x8_t vk7x89ABCDEF = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
278 
279       vprod01234567 = vmull_u8(vi7x01234567, vk7x01234567);
280       vsum01234567 = vaddw_u8(vsum01234567, vi7x01234567);
281       vprod89ABCDEF = vmull_u8(vi7x89ABCDEF, vk7x89ABCDEF);
282       vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi7x89ABCDEF);
283 
284       vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
285       vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
286       vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
287       vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));
288       const uint8x8_t vi8x01234567 = vld1_u8(i8); i8 += 8;
289       const uint8x8_t vk8x01234567 = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
290       const uint8x8_t vi8x89ABCDEF = vld1_u8(i8); i8 += 8;
291       const uint8x8_t vk8x89ABCDEF = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
292 
293       vprod01234567 = vmull_u8(vi8x01234567, vk8x01234567);
294       vsum01234567 = vaddw_u8(vsum01234567, vi8x01234567);
295       vprod89ABCDEF = vmull_u8(vi8x89ABCDEF, vk8x89ABCDEF);
296       vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi8x89ABCDEF);
297 
298       vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
299       vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
300       vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
301       vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));
302       const uint8x8_t vi9x01234567 = vld1_u8(i9); i9 += 8;
303       const uint8x8_t vk9x01234567 = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
304       const uint8x8_t vi9x89ABCDEF = vld1_u8(i9); i9 += 8;
305       const uint8x8_t vk9x89ABCDEF = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
306 
307       vprod01234567 = vmull_u8(vi9x01234567, vk9x01234567);
308       vsum01234567 = vaddw_u8(vsum01234567, vi9x01234567);
309       vprod89ABCDEF = vmull_u8(vi9x89ABCDEF, vk9x89ABCDEF);
310       vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi9x89ABCDEF);
311 
312       vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
313       vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
314       vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
315       vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));
316       const uint8x8_t vi10x01234567 = vld1_u8(i10); i10 += 8;
317       const uint8x8_t vk10x01234567 = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
318       const uint8x8_t vi10x89ABCDEF = vld1_u8(i10); i10 += 8;
319       const uint8x8_t vk10x89ABCDEF = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
320 
321       vprod01234567 = vmull_u8(vi10x01234567, vk10x01234567);
322       vsum01234567 = vaddw_u8(vsum01234567, vi10x01234567);
323       vprod89ABCDEF = vmull_u8(vi10x89ABCDEF, vk10x89ABCDEF);
324       vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi10x89ABCDEF);
325 
326       vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
327       vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
328       vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
329       vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));
330       const uint8x8_t vi11x01234567 = vld1_u8(i11); i11 += 8;
331       const uint8x8_t vk11x01234567 = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
332       const uint8x8_t vi11x89ABCDEF = vld1_u8(i11); i11 += 8;
333       const uint8x8_t vk11x89ABCDEF = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
334 
335       vprod01234567 = vmull_u8(vi11x01234567, vk11x01234567);
336       vsum01234567 = vaddw_u8(vsum01234567, vi11x01234567);
337       vprod89ABCDEF = vmull_u8(vi11x89ABCDEF, vk11x89ABCDEF);
338       vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi11x89ABCDEF);
339 
340       vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
341       vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
342       vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
343       vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));
344       const uint8x8_t vi12x01234567 = vld1_u8(i12); i12 += 8;
345       const uint8x8_t vk12x01234567 = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
346       const uint8x8_t vi12x89ABCDEF = vld1_u8(i12); i12 += 8;
347       const uint8x8_t vk12x89ABCDEF = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
348 
349       vprod01234567 = vmull_u8(vi12x01234567, vk12x01234567);
350       vsum01234567 = vaddw_u8(vsum01234567, vi12x01234567);
351       vprod89ABCDEF = vmull_u8(vi12x89ABCDEF, vk12x89ABCDEF);
352       vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi12x89ABCDEF);
353 
354       vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
355       vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
356       vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
357       vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));
358       const uint8x8_t vi13x01234567 = vld1_u8(i13); i13 += 8;
359       const uint8x8_t vk13x01234567 = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
360       const uint8x8_t vi13x89ABCDEF = vld1_u8(i13); i13 += 8;
361       const uint8x8_t vk13x89ABCDEF = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
362 
363       vprod01234567 = vmull_u8(vi13x01234567, vk13x01234567);
364       vsum01234567 = vaddw_u8(vsum01234567, vi13x01234567);
365       vprod89ABCDEF = vmull_u8(vi13x89ABCDEF, vk13x89ABCDEF);
366       vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi13x89ABCDEF);
367 
368       vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
369       vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
370       vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
371       vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));
372       const uint8x8_t vi14x01234567 = vld1_u8(i14); i14 += 8;
373       const uint8x8_t vk14x01234567 = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
374       const uint8x8_t vi14x89ABCDEF = vld1_u8(i14); i14 += 8;
375       const uint8x8_t vk14x89ABCDEF = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
376 
377       vprod01234567 = vmull_u8(vi14x01234567, vk14x01234567);
378       vsum01234567 = vaddw_u8(vsum01234567, vi14x01234567);
379       vprod89ABCDEF = vmull_u8(vi14x89ABCDEF, vk14x89ABCDEF);
380       vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi14x89ABCDEF);
381 
382       vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
383       vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
384       vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
385       vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));
386       const uint8x8_t vi15x01234567 = vld1_u8(i15); i15 += 8;
387       const uint8x8_t vk15x01234567 = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
388       const uint8x8_t vi15x89ABCDEF = vld1_u8(i15); i15 += 8;
389       const uint8x8_t vk15x89ABCDEF = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
390 
391       vprod01234567 = vmull_u8(vi15x01234567, vk15x01234567);
392       vsum01234567 = vaddw_u8(vsum01234567, vi15x01234567);
393       vprod89ABCDEF = vmull_u8(vi15x89ABCDEF, vk15x89ABCDEF);
394       vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi15x89ABCDEF);
395 
396       vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
397       vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
398       vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
399       vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));
400       const uint8x8_t vi16x01234567 = vld1_u8(i16); i16 += 8;
401       const uint8x8_t vk16x01234567 = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
402       const uint8x8_t vi16x89ABCDEF = vld1_u8(i16); i16 += 8;
403       const uint8x8_t vk16x89ABCDEF = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
404 
405       vprod01234567 = vmull_u8(vi16x01234567, vk16x01234567);
406       vsum01234567 = vaddw_u8(vsum01234567, vi16x01234567);
407       vprod89ABCDEF = vmull_u8(vi16x89ABCDEF, vk16x89ABCDEF);
408       vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi16x89ABCDEF);
409 
410       vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
411       vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
412       vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
413       vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));
414       const uint8x8_t vi17x01234567 = vld1_u8(i17); i17 += 8;
415       const uint8x8_t vk17x01234567 = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
416       const uint8x8_t vi17x89ABCDEF = vld1_u8(i17); i17 += 8;
417       const uint8x8_t vk17x89ABCDEF = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
418 
419       vprod01234567 = vmull_u8(vi17x01234567, vk17x01234567);
420       vsum01234567 = vaddw_u8(vsum01234567, vi17x01234567);
421       vprod89ABCDEF = vmull_u8(vi17x89ABCDEF, vk17x89ABCDEF);
422       vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi17x89ABCDEF);
423 
424       vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
425       vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
426       vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
427       vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));
428       const uint8x8_t vi18x01234567 = vld1_u8(i18); i18 += 8;
429       const uint8x8_t vk18x01234567 = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
430       const uint8x8_t vi18x89ABCDEF = vld1_u8(i18); i18 += 8;
431       const uint8x8_t vk18x89ABCDEF = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
432 
433       vprod01234567 = vmull_u8(vi18x01234567, vk18x01234567);
434       vsum01234567 = vaddw_u8(vsum01234567, vi18x01234567);
435       vprod89ABCDEF = vmull_u8(vi18x89ABCDEF, vk18x89ABCDEF);
436       vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi18x89ABCDEF);
437 
438       vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
439       vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
440       vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
441       vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));
442       const uint8x8_t vi19x01234567 = vld1_u8(i19); i19 += 8;
443       const uint8x8_t vk19x01234567 = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
444       const uint8x8_t vi19x89ABCDEF = vld1_u8(i19); i19 += 8;
445       const uint8x8_t vk19x89ABCDEF = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
446 
447       vprod01234567 = vmull_u8(vi19x01234567, vk19x01234567);
448       vsum01234567 = vaddw_u8(vsum01234567, vi19x01234567);
449       vprod89ABCDEF = vmull_u8(vi19x89ABCDEF, vk19x89ABCDEF);
450       vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi19x89ABCDEF);
451 
452       vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
453       vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
454       vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
455       vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));
456       const uint8x8_t vi20x01234567 = vld1_u8(i20); i20 += 8;
457       const uint8x8_t vk20x01234567 = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
458       const uint8x8_t vi20x89ABCDEF = vld1_u8(i20); i20 += 8;
459       const uint8x8_t vk20x89ABCDEF = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
460 
461       vprod01234567 = vmull_u8(vi20x01234567, vk20x01234567);
462       vsum01234567 = vaddw_u8(vsum01234567, vi20x01234567);
463       vprod89ABCDEF = vmull_u8(vi20x89ABCDEF, vk20x89ABCDEF);
464       vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi20x89ABCDEF);
465 
466       vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
467       vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
468       vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
469       vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));
470       const uint8x8_t vi21x01234567 = vld1_u8(i21); i21 += 8;
471       const uint8x8_t vk21x01234567 = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
472       const uint8x8_t vi21x89ABCDEF = vld1_u8(i21); i21 += 8;
473       const uint8x8_t vk21x89ABCDEF = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
474 
475       vprod01234567 = vmull_u8(vi21x01234567, vk21x01234567);
476       vsum01234567 = vaddw_u8(vsum01234567, vi21x01234567);
477       vprod89ABCDEF = vmull_u8(vi21x89ABCDEF, vk21x89ABCDEF);
478       vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi21x89ABCDEF);
479 
480       vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
481       vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
482       vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
483       vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));
484       const uint8x8_t vi22x01234567 = vld1_u8(i22); i22 += 8;
485       const uint8x8_t vk22x01234567 = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
486       const uint8x8_t vi22x89ABCDEF = vld1_u8(i22); i22 += 8;
487       const uint8x8_t vk22x89ABCDEF = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
488 
489       vprod01234567 = vmull_u8(vi22x01234567, vk22x01234567);
490       vsum01234567 = vaddw_u8(vsum01234567, vi22x01234567);
491       vprod89ABCDEF = vmull_u8(vi22x89ABCDEF, vk22x89ABCDEF);
492       vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi22x89ABCDEF);
493 
494       vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
495       vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
496       vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
497       vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));
498       const uint8x8_t vi23x01234567 = vld1_u8(i23); i23 += 8;
499       const uint8x8_t vk23x01234567 = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
500       const uint8x8_t vi23x89ABCDEF = vld1_u8(i23); i23 += 8;
501       const uint8x8_t vk23x89ABCDEF = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
502 
503       vprod01234567 = vmull_u8(vi23x01234567, vk23x01234567);
504       vsum01234567 = vaddw_u8(vsum01234567, vi23x01234567);
505       vprod89ABCDEF = vmull_u8(vi23x89ABCDEF, vk23x89ABCDEF);
506       vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi23x89ABCDEF);
507 
508       vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
509       vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
510       vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
511       vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));
512       const uint8x8_t vi24x01234567 = vld1_u8(i24); i24 += 8;
513       const uint8x8_t vk24x01234567 = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
514       const uint8x8_t vi24x89ABCDEF = vld1_u8(i24); i24 += 8;
515       const uint8x8_t vk24x89ABCDEF = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
516 
517       vprod01234567 = vmull_u8(vi24x01234567, vk24x01234567);
518       vsum01234567 = vaddw_u8(vsum01234567, vi24x01234567);
519       vprod89ABCDEF = vmull_u8(vi24x89ABCDEF, vk24x89ABCDEF);
520       vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi24x89ABCDEF);
521 
522       vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
523       vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
524       vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
525       vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));
526 
527       vacc0123 = vreinterpretq_s32_u32(vmlsl_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vsum01234567), vget_low_u16(vkernel_zero_point16)));
528       vacc4567 = vreinterpretq_s32_u32(vmlsl_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vsum01234567), vget_high_u16(vkernel_zero_point16)));
529       vacc89AB = vreinterpretq_s32_u32(vmlsl_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vsum89ABCDEF), vget_low_u16(vkernel_zero_point16)));
530       vaccCDEF = vreinterpretq_s32_u32(vmlsl_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vsum89ABCDEF), vget_high_u16(vkernel_zero_point16)));
531 
532       vacc0123 = vshlq_s32(vacc0123, vright_pre_shift);
533       vacc4567 = vshlq_s32(vacc4567, vright_pre_shift);
534       vacc89AB = vshlq_s32(vacc89AB, vright_pre_shift);
535       vaccCDEF = vshlq_s32(vaccCDEF, vright_pre_shift);
536 
537       vacc0123 = vqdmulhq_s32(vacc0123, vmultiplier);
538       vacc4567 = vqdmulhq_s32(vacc4567, vmultiplier);
539       vacc89AB = vqdmulhq_s32(vacc89AB, vmultiplier);
540       vaccCDEF = vqdmulhq_s32(vaccCDEF, vmultiplier);
541 
542       vacc0123 = vrshlq_s32(vacc0123, vright_post_shift);
543       vacc4567 = vrshlq_s32(vacc4567, vright_post_shift);
544       vacc89AB = vrshlq_s32(vacc89AB, vright_post_shift);
545       vaccCDEF = vrshlq_s32(vaccCDEF, vright_post_shift);
546 
547 #if XNN_ARCH_ARM64
548       const int16x8_t vacc01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567), voutput_zero_point);
549       const int16x8_t vacc89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF), voutput_zero_point);
550 
551       uint8x16_t vout0123456789ABCDEF = vqmovun_high_s16(vqmovun_s16(vacc01234567), vacc89ABCDEF);
552 #else
553       const int16x8_t vacc01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)), voutput_zero_point);
554       const int16x8_t vacc89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF)), voutput_zero_point);
555 
556       uint8x16_t vout0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc01234567), vqmovun_s16(vacc89ABCDEF));
557 #endif
558 
559       vout0123456789ABCDEF = vmaxq_u8(vout0123456789ABCDEF, voutput_min);
560 
561       vout0123456789ABCDEF = vminq_u8(vout0123456789ABCDEF, voutput_max);
562 
563       vst1q_u8(output, vout0123456789ABCDEF); output += 16;
564     }
565     if XNN_UNLIKELY(c != 0) {
566       const uint8_t* k = (const uint8_t*) ((const int32_t*) w + 16);
567       do {
568         int32x4_t vacc0123 = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4);
569         int32x4_t vacc4567 = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4);
570 
571         const int16x8_t vi0x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i0))); i0 += 8;
572         const int16x8_t vk0x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8(k), vkernel_zero_point)); k += 8;
573 
574         vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi0x01234567), vget_low_s16(vk0x01234567));
575         vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi0x01234567), vget_high_s16(vk0x01234567));
576         const int16x8_t vi1x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i1))); i1 += 8;
577         const int16x8_t vk1x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) (k + 8)), vkernel_zero_point));
578 
579         vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi1x01234567), vget_low_s16(vk1x01234567));
580         vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi1x01234567), vget_high_s16(vk1x01234567));
581         const int16x8_t vi2x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i2))); i2 += 8;
582         const int16x8_t vk2x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) (k + 24)), vkernel_zero_point));
583 
584         vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi2x01234567), vget_low_s16(vk2x01234567));
585         vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi2x01234567), vget_high_s16(vk2x01234567));
586         const int16x8_t vi3x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i3))); i3 += 8;
587         const int16x8_t vk3x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) (k + 40)), vkernel_zero_point));
588 
589         vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi3x01234567), vget_low_s16(vk3x01234567));
590         vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi3x01234567), vget_high_s16(vk3x01234567));
591         const int16x8_t vi4x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i4))); i4 += 8;
592         const int16x8_t vk4x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) (k + 56)), vkernel_zero_point));
593 
594         vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi4x01234567), vget_low_s16(vk4x01234567));
595         vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi4x01234567), vget_high_s16(vk4x01234567));
596         const int16x8_t vi5x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i5))); i5 += 8;
597         const int16x8_t vk5x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) (k + 72)), vkernel_zero_point));
598 
599         vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi5x01234567), vget_low_s16(vk5x01234567));
600         vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi5x01234567), vget_high_s16(vk5x01234567));
601         const int16x8_t vi6x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i6))); i6 += 8;
602         const int16x8_t vk6x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) (k + 88)), vkernel_zero_point));
603 
604         vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi6x01234567), vget_low_s16(vk6x01234567));
605         vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi6x01234567), vget_high_s16(vk6x01234567));
606         const int16x8_t vi7x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i7))); i7 += 8;
607         const int16x8_t vk7x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) (k + 104)), vkernel_zero_point));
608 
609         vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi7x01234567), vget_low_s16(vk7x01234567));
610         vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi7x01234567), vget_high_s16(vk7x01234567));
611         const int16x8_t vi8x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i8))); i8 += 8;
612         const int16x8_t vk8x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) (k + 120)), vkernel_zero_point));
613 
614         vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi8x01234567), vget_low_s16(vk8x01234567));
615         vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi8x01234567), vget_high_s16(vk8x01234567));
616         const int16x8_t vi9x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i9))); i9 += 8;
617         const int16x8_t vk9x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) (k + 136)), vkernel_zero_point));
618 
619         vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi9x01234567), vget_low_s16(vk9x01234567));
620         vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi9x01234567), vget_high_s16(vk9x01234567));
621         const int16x8_t vi10x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i10))); i10 += 8;
622         const int16x8_t vk10x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) (k + 152)), vkernel_zero_point));
623 
624         vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi10x01234567), vget_low_s16(vk10x01234567));
625         vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi10x01234567), vget_high_s16(vk10x01234567));
626         const int16x8_t vi11x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i11))); i11 += 8;
627         const int16x8_t vk11x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) (k + 168)), vkernel_zero_point));
628 
629         vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi11x01234567), vget_low_s16(vk11x01234567));
630         vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi11x01234567), vget_high_s16(vk11x01234567));
631         const int16x8_t vi12x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i12))); i12 += 8;
632         const int16x8_t vk12x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) (k + 184)), vkernel_zero_point));
633 
634         vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi12x01234567), vget_low_s16(vk12x01234567));
635         vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi12x01234567), vget_high_s16(vk12x01234567));
636         const int16x8_t vi13x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i13))); i13 += 8;
637         const int16x8_t vk13x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) (k + 200)), vkernel_zero_point));
638 
639         vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi13x01234567), vget_low_s16(vk13x01234567));
640         vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi13x01234567), vget_high_s16(vk13x01234567));
641         const int16x8_t vi14x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i14))); i14 += 8;
642         const int16x8_t vk14x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) (k + 216)), vkernel_zero_point));
643 
644         vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi14x01234567), vget_low_s16(vk14x01234567));
645         vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi14x01234567), vget_high_s16(vk14x01234567));
646         const int16x8_t vi15x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i15))); i15 += 8;
647         const int16x8_t vk15x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) (k + 232)), vkernel_zero_point));
648 
649         vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi15x01234567), vget_low_s16(vk15x01234567));
650         vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi15x01234567), vget_high_s16(vk15x01234567));
651         const int16x8_t vi16x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i16))); i16 += 8;
652         const int16x8_t vk16x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) (k + 248)), vkernel_zero_point));
653 
654         vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi16x01234567), vget_low_s16(vk16x01234567));
655         vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi16x01234567), vget_high_s16(vk16x01234567));
656         const int16x8_t vi17x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i17))); i17 += 8;
657         const int16x8_t vk17x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) (k + 264)), vkernel_zero_point));
658 
659         vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi17x01234567), vget_low_s16(vk17x01234567));
660         vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi17x01234567), vget_high_s16(vk17x01234567));
661         const int16x8_t vi18x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i18))); i18 += 8;
662         const int16x8_t vk18x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) (k + 280)), vkernel_zero_point));
663 
664         vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi18x01234567), vget_low_s16(vk18x01234567));
665         vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi18x01234567), vget_high_s16(vk18x01234567));
666         const int16x8_t vi19x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i19))); i19 += 8;
667         const int16x8_t vk19x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) (k + 296)), vkernel_zero_point));
668 
669         vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi19x01234567), vget_low_s16(vk19x01234567));
670         vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi19x01234567), vget_high_s16(vk19x01234567));
671         const int16x8_t vi20x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i20))); i20 += 8;
672         const int16x8_t vk20x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) (k + 312)), vkernel_zero_point));
673 
674         vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi20x01234567), vget_low_s16(vk20x01234567));
675         vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi20x01234567), vget_high_s16(vk20x01234567));
676         const int16x8_t vi21x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i21))); i21 += 8;
677         const int16x8_t vk21x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) (k + 328)), vkernel_zero_point));
678 
679         vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi21x01234567), vget_low_s16(vk21x01234567));
680         vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi21x01234567), vget_high_s16(vk21x01234567));
681         const int16x8_t vi22x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i22))); i22 += 8;
682         const int16x8_t vk22x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) (k + 344)), vkernel_zero_point));
683 
684         vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi22x01234567), vget_low_s16(vk22x01234567));
685         vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi22x01234567), vget_high_s16(vk22x01234567));
686         const int16x8_t vi23x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i23))); i23 += 8;
687         const int16x8_t vk23x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) (k + 360)), vkernel_zero_point));
688 
689         vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi23x01234567), vget_low_s16(vk23x01234567));
690         vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi23x01234567), vget_high_s16(vk23x01234567));
691         const int16x8_t vi24x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i24))); i24 += 8;
692         const int16x8_t vk24x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) (k + 376)), vkernel_zero_point));
693 
694         vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi24x01234567), vget_low_s16(vk24x01234567));
695         vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi24x01234567), vget_high_s16(vk24x01234567));
696 
697         vacc0123 = vrshlq_s32(vacc0123, vright_pre_shift);
698         vacc4567 = vrshlq_s32(vacc4567, vright_pre_shift);
699 
700         vacc0123 = vqdmulhq_s32(vacc0123, vmultiplier);
701         vacc4567 = vqdmulhq_s32(vacc4567, vmultiplier);
702 
703         vacc0123 = vrshlq_s32(vacc0123, vright_post_shift);
704         vacc4567 = vrshlq_s32(vacc4567, vright_post_shift);
705 
706 #if XNN_ARCH_ARM64
707         const int16x8_t vacc01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567), voutput_zero_point);
708         uint8x8_t vout01234567 = vqmovun_s16(vacc01234567);
709 #else
710         const int16x8_t vacc01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)), voutput_zero_point);
711         uint8x8_t vout01234567 = vqmovun_s16(vacc01234567);
712 #endif
713 
714         vout01234567 = vmax_u8(vout01234567, vget_low_u8(voutput_min));
715         vout01234567 = vmin_u8(vout01234567, vget_low_u8(voutput_max));
716 
717         if XNN_LIKELY(c >= 8) {
718           vst1_u8(output, vout01234567); output += 8;
719           c -= 8;
720         } else {
721           if (c & 4) {
722             vst1_lane_u32((void*) output, vreinterpret_u32_u8(vout01234567), 0); output += 4;
723             vout01234567 = vext_u8(vout01234567, vout01234567, 4);
724           }
725           if (c & 2) {
726             vst1_lane_u16((void*) output, vreinterpret_u16_u8(vout01234567), 0); output += 2;
727             vout01234567 = vext_u8(vout01234567, vout01234567, 2);
728           }
729           if (c & 1) {
730             vst1_lane_u8(output, vout01234567, 0); output += 1;
731           }
732           c = 0;
733         }
734       } while (c != 0);
735     }
736 
737     output = (uint8_t*) ((uintptr_t) output + output_increment);
738   } while (--output_width != 0);
739 }
740