1 // Auto-generated file. Do not edit!
2 // Template: src/qu8-dwconv/unipass-neon-mul8.c.in
3 // Generator: tools/xngen
4 //
5 // Copyright 2020 Google LLC
6 //
7 // This source code is licensed under the BSD-style license found in the
8 // LICENSE file in the root directory of this source tree.
9
10 #include <assert.h>
11
12 #include <arm_neon.h>
13
14 #include <xnnpack/dwconv.h>
15
16
xnn_qu8_dwconv_minmax_rndnu_ukernel_up24x25__neon_mul8(size_t channels,size_t output_width,const uint8_t ** input,const void * weights,uint8_t * output,size_t input_stride,size_t output_increment,size_t input_offset,const uint8_t * zero,const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])17 void xnn_qu8_dwconv_minmax_rndnu_ukernel_up24x25__neon_mul8(
18 size_t channels,
19 size_t output_width,
20 const uint8_t** input,
21 const void* weights,
22 uint8_t* output,
23 size_t input_stride,
24 size_t output_increment,
25 size_t input_offset,
26 const uint8_t* zero,
27 const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
28 {
29 assert(channels != 0);
30 assert(output_width != 0);
31
32 const uint8x8_t vkernel_zero_point = vld1_dup_u8(params->rndnu_neon.kernel_zero_point);
33 const uint16x8_t vkernel_zero_point16 = vmovl_u8(vkernel_zero_point);
34 const int32x4_t vright_pre_shift = vld1q_dup_s32(¶ms->rndnu_neon.right_pre_shift);
35 const int32x4_t vmultiplier = vld1q_dup_s32(¶ms->rndnu_neon.multiplier);
36 const int32x4_t vright_post_shift = vld1q_dup_s32(¶ms->rndnu_neon.right_post_shift);
37 const int16x8_t voutput_zero_point = vld1q_dup_s16(¶ms->rndnu_neon.output_zero_point);
38 const uint8x16_t voutput_min = vld1q_dup_u8(¶ms->rndnu_neon.output_min);
39 const uint8x16_t voutput_max = vld1q_dup_u8(¶ms->rndnu_neon.output_max);
40 do {
41 const uint8_t* i0 = input[0];
42 assert(i0 != NULL);
43 if XNN_UNPREDICTABLE(i0 != zero) {
44 i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset);
45 }
46 const uint8_t* i1 = input[1];
47 assert(i1 != NULL);
48 if XNN_UNPREDICTABLE(i1 != zero) {
49 i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset);
50 }
51 const uint8_t* i2 = input[2];
52 assert(i2 != NULL);
53 if XNN_UNPREDICTABLE(i2 != zero) {
54 i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset);
55 }
56 const uint8_t* i3 = input[3];
57 assert(i3 != NULL);
58 if XNN_UNPREDICTABLE(i3 != zero) {
59 i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset);
60 }
61 const uint8_t* i4 = input[4];
62 assert(i4 != NULL);
63 if XNN_UNPREDICTABLE(i4 != zero) {
64 i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset);
65 }
66 const uint8_t* i5 = input[5];
67 assert(i5 != NULL);
68 if XNN_UNPREDICTABLE(i5 != zero) {
69 i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset);
70 }
71 const uint8_t* i6 = input[6];
72 assert(i6 != NULL);
73 if XNN_UNPREDICTABLE(i6 != zero) {
74 i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset);
75 }
76 const uint8_t* i7 = input[7];
77 assert(i7 != NULL);
78 if XNN_UNPREDICTABLE(i7 != zero) {
79 i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset);
80 }
81 const uint8_t* i8 = input[8];
82 assert(i8 != NULL);
83 if XNN_UNPREDICTABLE(i8 != zero) {
84 i8 = (const uint8_t*) ((uintptr_t) i8 + input_offset);
85 }
86 const uint8_t* i9 = input[9];
87 assert(i9 != NULL);
88 if XNN_UNPREDICTABLE(i9 != zero) {
89 i9 = (const uint8_t*) ((uintptr_t) i9 + input_offset);
90 }
91 const uint8_t* i10 = input[10];
92 assert(i10 != NULL);
93 if XNN_UNPREDICTABLE(i10 != zero) {
94 i10 = (const uint8_t*) ((uintptr_t) i10 + input_offset);
95 }
96 const uint8_t* i11 = input[11];
97 assert(i11 != NULL);
98 if XNN_UNPREDICTABLE(i11 != zero) {
99 i11 = (const uint8_t*) ((uintptr_t) i11 + input_offset);
100 }
101 const uint8_t* i12 = input[12];
102 assert(i12 != NULL);
103 if XNN_UNPREDICTABLE(i12 != zero) {
104 i12 = (const uint8_t*) ((uintptr_t) i12 + input_offset);
105 }
106 const uint8_t* i13 = input[13];
107 assert(i13 != NULL);
108 if XNN_UNPREDICTABLE(i13 != zero) {
109 i13 = (const uint8_t*) ((uintptr_t) i13 + input_offset);
110 }
111 const uint8_t* i14 = input[14];
112 assert(i14 != NULL);
113 if XNN_UNPREDICTABLE(i14 != zero) {
114 i14 = (const uint8_t*) ((uintptr_t) i14 + input_offset);
115 }
116 const uint8_t* i15 = input[15];
117 assert(i15 != NULL);
118 if XNN_UNPREDICTABLE(i15 != zero) {
119 i15 = (const uint8_t*) ((uintptr_t) i15 + input_offset);
120 }
121 const uint8_t* i16 = input[16];
122 assert(i16 != NULL);
123 if XNN_UNPREDICTABLE(i16 != zero) {
124 i16 = (const uint8_t*) ((uintptr_t) i16 + input_offset);
125 }
126 const uint8_t* i17 = input[17];
127 assert(i17 != NULL);
128 if XNN_UNPREDICTABLE(i17 != zero) {
129 i17 = (const uint8_t*) ((uintptr_t) i17 + input_offset);
130 }
131 const uint8_t* i18 = input[18];
132 assert(i18 != NULL);
133 if XNN_UNPREDICTABLE(i18 != zero) {
134 i18 = (const uint8_t*) ((uintptr_t) i18 + input_offset);
135 }
136 const uint8_t* i19 = input[19];
137 assert(i19 != NULL);
138 if XNN_UNPREDICTABLE(i19 != zero) {
139 i19 = (const uint8_t*) ((uintptr_t) i19 + input_offset);
140 }
141 const uint8_t* i20 = input[20];
142 assert(i20 != NULL);
143 if XNN_UNPREDICTABLE(i20 != zero) {
144 i20 = (const uint8_t*) ((uintptr_t) i20 + input_offset);
145 }
146 const uint8_t* i21 = input[21];
147 assert(i21 != NULL);
148 if XNN_UNPREDICTABLE(i21 != zero) {
149 i21 = (const uint8_t*) ((uintptr_t) i21 + input_offset);
150 }
151 const uint8_t* i22 = input[22];
152 assert(i22 != NULL);
153 if XNN_UNPREDICTABLE(i22 != zero) {
154 i22 = (const uint8_t*) ((uintptr_t) i22 + input_offset);
155 }
156 const uint8_t* i23 = input[23];
157 assert(i23 != NULL);
158 if XNN_UNPREDICTABLE(i23 != zero) {
159 i23 = (const uint8_t*) ((uintptr_t) i23 + input_offset);
160 }
161 const uint8_t* i24 = input[24];
162 assert(i24 != NULL);
163 if XNN_UNPREDICTABLE(i24 != zero) {
164 i24 = (const uint8_t*) ((uintptr_t) i24 + input_offset);
165 }
166 input = (const uint8_t**) ((uintptr_t) input + input_stride);
167
168
169 size_t c = channels;
170 const void* w = weights;
171 for (; c >= 24; c -= 24) {
172 int32x4_t vacc0123 = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4);
173 int32x4_t vacc4567 = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4);
174 int32x4_t vacc89AB = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4);
175 int32x4_t vaccCDEF = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4);
176 int32x4_t vaccGHIJ = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4);
177 int32x4_t vaccKLMN = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4);
178
179
180 const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8;
181 const uint8x8_t vk0x01234567 = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
182 const uint8x8_t vi0x89ABCDEF = vld1_u8(i0); i0 += 8;
183 const uint8x8_t vk0x89ABCDEF = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
184 const uint8x8_t vi0xGHIJKLMN = vld1_u8(i0); i0 += 8;
185 const uint8x8_t vk0xGHIJKLMN = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
186
187 uint16x8_t vprod01234567 = vmull_u8(vi0x01234567, vk0x01234567);
188 uint16x8_t vprod89ABCDEF = vmull_u8(vi0x89ABCDEF, vk0x89ABCDEF);
189 uint16x8_t vprodGHIJKLMN = vmull_u8(vi0xGHIJKLMN, vk0xGHIJKLMN);
190
191 vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
192 vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
193 vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
194 vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));
195 vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vprodGHIJKLMN)));
196 vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vprodGHIJKLMN)));
197 const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8;
198 const uint8x8_t vk1x01234567 = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
199 const uint8x8_t vi1x89ABCDEF = vld1_u8(i1); i1 += 8;
200 const uint8x8_t vk1x89ABCDEF = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
201 const uint8x8_t vi1xGHIJKLMN = vld1_u8(i1); i1 += 8;
202 const uint8x8_t vk1xGHIJKLMN = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
203
204 vprod01234567 = vmull_u8(vi1x01234567, vk1x01234567);
205 uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567);
206 vprod89ABCDEF = vmull_u8(vi1x89ABCDEF, vk1x89ABCDEF);
207 uint16x8_t vsum89ABCDEF = vaddl_u8(vi0x89ABCDEF, vi1x89ABCDEF);
208 vprodGHIJKLMN = vmull_u8(vi1xGHIJKLMN, vk1xGHIJKLMN);
209 uint16x8_t vsumGHIJKLMN = vaddl_u8(vi0xGHIJKLMN, vi1xGHIJKLMN);
210
211 vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
212 vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
213 vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
214 vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));
215 vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vprodGHIJKLMN)));
216 vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vprodGHIJKLMN)));
217 const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8;
218 const uint8x8_t vk2x01234567 = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
219 const uint8x8_t vi2x89ABCDEF = vld1_u8(i2); i2 += 8;
220 const uint8x8_t vk2x89ABCDEF = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
221 const uint8x8_t vi2xGHIJKLMN = vld1_u8(i2); i2 += 8;
222 const uint8x8_t vk2xGHIJKLMN = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
223
224 vprod01234567 = vmull_u8(vi2x01234567, vk2x01234567);
225 vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567);
226 vprod89ABCDEF = vmull_u8(vi2x89ABCDEF, vk2x89ABCDEF);
227 vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi2x89ABCDEF);
228 vprodGHIJKLMN = vmull_u8(vi2xGHIJKLMN, vk2xGHIJKLMN);
229 vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi2xGHIJKLMN);
230
231 vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
232 vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
233 vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
234 vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));
235 vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vprodGHIJKLMN)));
236 vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vprodGHIJKLMN)));
237 const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8;
238 const uint8x8_t vk3x01234567 = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
239 const uint8x8_t vi3x89ABCDEF = vld1_u8(i3); i3 += 8;
240 const uint8x8_t vk3x89ABCDEF = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
241 const uint8x8_t vi3xGHIJKLMN = vld1_u8(i3); i3 += 8;
242 const uint8x8_t vk3xGHIJKLMN = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
243
244 vprod01234567 = vmull_u8(vi3x01234567, vk3x01234567);
245 vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567);
246 vprod89ABCDEF = vmull_u8(vi3x89ABCDEF, vk3x89ABCDEF);
247 vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi3x89ABCDEF);
248 vprodGHIJKLMN = vmull_u8(vi3xGHIJKLMN, vk3xGHIJKLMN);
249 vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi3xGHIJKLMN);
250
251 vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
252 vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
253 vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
254 vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));
255 vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vprodGHIJKLMN)));
256 vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vprodGHIJKLMN)));
257 const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8;
258 const uint8x8_t vk4x01234567 = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
259 const uint8x8_t vi4x89ABCDEF = vld1_u8(i4); i4 += 8;
260 const uint8x8_t vk4x89ABCDEF = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
261 const uint8x8_t vi4xGHIJKLMN = vld1_u8(i4); i4 += 8;
262 const uint8x8_t vk4xGHIJKLMN = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
263
264 vprod01234567 = vmull_u8(vi4x01234567, vk4x01234567);
265 vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567);
266 vprod89ABCDEF = vmull_u8(vi4x89ABCDEF, vk4x89ABCDEF);
267 vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi4x89ABCDEF);
268 vprodGHIJKLMN = vmull_u8(vi4xGHIJKLMN, vk4xGHIJKLMN);
269 vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi4xGHIJKLMN);
270
271 vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
272 vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
273 vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
274 vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));
275 vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vprodGHIJKLMN)));
276 vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vprodGHIJKLMN)));
277 const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8;
278 const uint8x8_t vk5x01234567 = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
279 const uint8x8_t vi5x89ABCDEF = vld1_u8(i5); i5 += 8;
280 const uint8x8_t vk5x89ABCDEF = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
281 const uint8x8_t vi5xGHIJKLMN = vld1_u8(i5); i5 += 8;
282 const uint8x8_t vk5xGHIJKLMN = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
283
284 vprod01234567 = vmull_u8(vi5x01234567, vk5x01234567);
285 vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567);
286 vprod89ABCDEF = vmull_u8(vi5x89ABCDEF, vk5x89ABCDEF);
287 vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi5x89ABCDEF);
288 vprodGHIJKLMN = vmull_u8(vi5xGHIJKLMN, vk5xGHIJKLMN);
289 vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi5xGHIJKLMN);
290
291 vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
292 vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
293 vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
294 vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));
295 vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vprodGHIJKLMN)));
296 vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vprodGHIJKLMN)));
297 const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8;
298 const uint8x8_t vk6x01234567 = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
299 const uint8x8_t vi6x89ABCDEF = vld1_u8(i6); i6 += 8;
300 const uint8x8_t vk6x89ABCDEF = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
301 const uint8x8_t vi6xGHIJKLMN = vld1_u8(i6); i6 += 8;
302 const uint8x8_t vk6xGHIJKLMN = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
303
304 vprod01234567 = vmull_u8(vi6x01234567, vk6x01234567);
305 vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567);
306 vprod89ABCDEF = vmull_u8(vi6x89ABCDEF, vk6x89ABCDEF);
307 vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi6x89ABCDEF);
308 vprodGHIJKLMN = vmull_u8(vi6xGHIJKLMN, vk6xGHIJKLMN);
309 vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi6xGHIJKLMN);
310
311 vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
312 vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
313 vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
314 vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));
315 vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vprodGHIJKLMN)));
316 vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vprodGHIJKLMN)));
317 const uint8x8_t vi7x01234567 = vld1_u8(i7); i7 += 8;
318 const uint8x8_t vk7x01234567 = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
319 const uint8x8_t vi7x89ABCDEF = vld1_u8(i7); i7 += 8;
320 const uint8x8_t vk7x89ABCDEF = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
321 const uint8x8_t vi7xGHIJKLMN = vld1_u8(i7); i7 += 8;
322 const uint8x8_t vk7xGHIJKLMN = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
323
324 vprod01234567 = vmull_u8(vi7x01234567, vk7x01234567);
325 vsum01234567 = vaddw_u8(vsum01234567, vi7x01234567);
326 vprod89ABCDEF = vmull_u8(vi7x89ABCDEF, vk7x89ABCDEF);
327 vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi7x89ABCDEF);
328 vprodGHIJKLMN = vmull_u8(vi7xGHIJKLMN, vk7xGHIJKLMN);
329 vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi7xGHIJKLMN);
330
331 vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
332 vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
333 vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
334 vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));
335 vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vprodGHIJKLMN)));
336 vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vprodGHIJKLMN)));
337 const uint8x8_t vi8x01234567 = vld1_u8(i8); i8 += 8;
338 const uint8x8_t vk8x01234567 = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
339 const uint8x8_t vi8x89ABCDEF = vld1_u8(i8); i8 += 8;
340 const uint8x8_t vk8x89ABCDEF = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
341 const uint8x8_t vi8xGHIJKLMN = vld1_u8(i8); i8 += 8;
342 const uint8x8_t vk8xGHIJKLMN = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
343
344 vprod01234567 = vmull_u8(vi8x01234567, vk8x01234567);
345 vsum01234567 = vaddw_u8(vsum01234567, vi8x01234567);
346 vprod89ABCDEF = vmull_u8(vi8x89ABCDEF, vk8x89ABCDEF);
347 vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi8x89ABCDEF);
348 vprodGHIJKLMN = vmull_u8(vi8xGHIJKLMN, vk8xGHIJKLMN);
349 vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi8xGHIJKLMN);
350
351 vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
352 vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
353 vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
354 vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));
355 vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vprodGHIJKLMN)));
356 vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vprodGHIJKLMN)));
357 const uint8x8_t vi9x01234567 = vld1_u8(i9); i9 += 8;
358 const uint8x8_t vk9x01234567 = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
359 const uint8x8_t vi9x89ABCDEF = vld1_u8(i9); i9 += 8;
360 const uint8x8_t vk9x89ABCDEF = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
361 const uint8x8_t vi9xGHIJKLMN = vld1_u8(i9); i9 += 8;
362 const uint8x8_t vk9xGHIJKLMN = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
363
364 vprod01234567 = vmull_u8(vi9x01234567, vk9x01234567);
365 vsum01234567 = vaddw_u8(vsum01234567, vi9x01234567);
366 vprod89ABCDEF = vmull_u8(vi9x89ABCDEF, vk9x89ABCDEF);
367 vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi9x89ABCDEF);
368 vprodGHIJKLMN = vmull_u8(vi9xGHIJKLMN, vk9xGHIJKLMN);
369 vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi9xGHIJKLMN);
370
371 vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
372 vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
373 vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
374 vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));
375 vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vprodGHIJKLMN)));
376 vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vprodGHIJKLMN)));
377 const uint8x8_t vi10x01234567 = vld1_u8(i10); i10 += 8;
378 const uint8x8_t vk10x01234567 = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
379 const uint8x8_t vi10x89ABCDEF = vld1_u8(i10); i10 += 8;
380 const uint8x8_t vk10x89ABCDEF = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
381 const uint8x8_t vi10xGHIJKLMN = vld1_u8(i10); i10 += 8;
382 const uint8x8_t vk10xGHIJKLMN = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
383
384 vprod01234567 = vmull_u8(vi10x01234567, vk10x01234567);
385 vsum01234567 = vaddw_u8(vsum01234567, vi10x01234567);
386 vprod89ABCDEF = vmull_u8(vi10x89ABCDEF, vk10x89ABCDEF);
387 vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi10x89ABCDEF);
388 vprodGHIJKLMN = vmull_u8(vi10xGHIJKLMN, vk10xGHIJKLMN);
389 vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi10xGHIJKLMN);
390
391 vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
392 vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
393 vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
394 vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));
395 vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vprodGHIJKLMN)));
396 vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vprodGHIJKLMN)));
397 const uint8x8_t vi11x01234567 = vld1_u8(i11); i11 += 8;
398 const uint8x8_t vk11x01234567 = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
399 const uint8x8_t vi11x89ABCDEF = vld1_u8(i11); i11 += 8;
400 const uint8x8_t vk11x89ABCDEF = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
401 const uint8x8_t vi11xGHIJKLMN = vld1_u8(i11); i11 += 8;
402 const uint8x8_t vk11xGHIJKLMN = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
403
404 vprod01234567 = vmull_u8(vi11x01234567, vk11x01234567);
405 vsum01234567 = vaddw_u8(vsum01234567, vi11x01234567);
406 vprod89ABCDEF = vmull_u8(vi11x89ABCDEF, vk11x89ABCDEF);
407 vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi11x89ABCDEF);
408 vprodGHIJKLMN = vmull_u8(vi11xGHIJKLMN, vk11xGHIJKLMN);
409 vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi11xGHIJKLMN);
410
411 vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
412 vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
413 vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
414 vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));
415 vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vprodGHIJKLMN)));
416 vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vprodGHIJKLMN)));
417 const uint8x8_t vi12x01234567 = vld1_u8(i12); i12 += 8;
418 const uint8x8_t vk12x01234567 = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
419 const uint8x8_t vi12x89ABCDEF = vld1_u8(i12); i12 += 8;
420 const uint8x8_t vk12x89ABCDEF = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
421 const uint8x8_t vi12xGHIJKLMN = vld1_u8(i12); i12 += 8;
422 const uint8x8_t vk12xGHIJKLMN = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
423
424 vprod01234567 = vmull_u8(vi12x01234567, vk12x01234567);
425 vsum01234567 = vaddw_u8(vsum01234567, vi12x01234567);
426 vprod89ABCDEF = vmull_u8(vi12x89ABCDEF, vk12x89ABCDEF);
427 vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi12x89ABCDEF);
428 vprodGHIJKLMN = vmull_u8(vi12xGHIJKLMN, vk12xGHIJKLMN);
429 vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi12xGHIJKLMN);
430
431 vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
432 vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
433 vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
434 vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));
435 vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vprodGHIJKLMN)));
436 vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vprodGHIJKLMN)));
437 const uint8x8_t vi13x01234567 = vld1_u8(i13); i13 += 8;
438 const uint8x8_t vk13x01234567 = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
439 const uint8x8_t vi13x89ABCDEF = vld1_u8(i13); i13 += 8;
440 const uint8x8_t vk13x89ABCDEF = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
441 const uint8x8_t vi13xGHIJKLMN = vld1_u8(i13); i13 += 8;
442 const uint8x8_t vk13xGHIJKLMN = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
443
444 vprod01234567 = vmull_u8(vi13x01234567, vk13x01234567);
445 vsum01234567 = vaddw_u8(vsum01234567, vi13x01234567);
446 vprod89ABCDEF = vmull_u8(vi13x89ABCDEF, vk13x89ABCDEF);
447 vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi13x89ABCDEF);
448 vprodGHIJKLMN = vmull_u8(vi13xGHIJKLMN, vk13xGHIJKLMN);
449 vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi13xGHIJKLMN);
450
451 vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
452 vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
453 vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
454 vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));
455 vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vprodGHIJKLMN)));
456 vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vprodGHIJKLMN)));
457 const uint8x8_t vi14x01234567 = vld1_u8(i14); i14 += 8;
458 const uint8x8_t vk14x01234567 = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
459 const uint8x8_t vi14x89ABCDEF = vld1_u8(i14); i14 += 8;
460 const uint8x8_t vk14x89ABCDEF = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
461 const uint8x8_t vi14xGHIJKLMN = vld1_u8(i14); i14 += 8;
462 const uint8x8_t vk14xGHIJKLMN = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
463
464 vprod01234567 = vmull_u8(vi14x01234567, vk14x01234567);
465 vsum01234567 = vaddw_u8(vsum01234567, vi14x01234567);
466 vprod89ABCDEF = vmull_u8(vi14x89ABCDEF, vk14x89ABCDEF);
467 vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi14x89ABCDEF);
468 vprodGHIJKLMN = vmull_u8(vi14xGHIJKLMN, vk14xGHIJKLMN);
469 vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi14xGHIJKLMN);
470
471 vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
472 vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
473 vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
474 vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));
475 vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vprodGHIJKLMN)));
476 vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vprodGHIJKLMN)));
477 const uint8x8_t vi15x01234567 = vld1_u8(i15); i15 += 8;
478 const uint8x8_t vk15x01234567 = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
479 const uint8x8_t vi15x89ABCDEF = vld1_u8(i15); i15 += 8;
480 const uint8x8_t vk15x89ABCDEF = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
481 const uint8x8_t vi15xGHIJKLMN = vld1_u8(i15); i15 += 8;
482 const uint8x8_t vk15xGHIJKLMN = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
483
484 vprod01234567 = vmull_u8(vi15x01234567, vk15x01234567);
485 vsum01234567 = vaddw_u8(vsum01234567, vi15x01234567);
486 vprod89ABCDEF = vmull_u8(vi15x89ABCDEF, vk15x89ABCDEF);
487 vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi15x89ABCDEF);
488 vprodGHIJKLMN = vmull_u8(vi15xGHIJKLMN, vk15xGHIJKLMN);
489 vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi15xGHIJKLMN);
490
491 vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
492 vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
493 vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
494 vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));
495 vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vprodGHIJKLMN)));
496 vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vprodGHIJKLMN)));
497 const uint8x8_t vi16x01234567 = vld1_u8(i16); i16 += 8;
498 const uint8x8_t vk16x01234567 = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
499 const uint8x8_t vi16x89ABCDEF = vld1_u8(i16); i16 += 8;
500 const uint8x8_t vk16x89ABCDEF = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
501 const uint8x8_t vi16xGHIJKLMN = vld1_u8(i16); i16 += 8;
502 const uint8x8_t vk16xGHIJKLMN = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
503
504 vprod01234567 = vmull_u8(vi16x01234567, vk16x01234567);
505 vsum01234567 = vaddw_u8(vsum01234567, vi16x01234567);
506 vprod89ABCDEF = vmull_u8(vi16x89ABCDEF, vk16x89ABCDEF);
507 vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi16x89ABCDEF);
508 vprodGHIJKLMN = vmull_u8(vi16xGHIJKLMN, vk16xGHIJKLMN);
509 vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi16xGHIJKLMN);
510
511 vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
512 vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
513 vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
514 vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));
515 vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vprodGHIJKLMN)));
516 vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vprodGHIJKLMN)));
517 const uint8x8_t vi17x01234567 = vld1_u8(i17); i17 += 8;
518 const uint8x8_t vk17x01234567 = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
519 const uint8x8_t vi17x89ABCDEF = vld1_u8(i17); i17 += 8;
520 const uint8x8_t vk17x89ABCDEF = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
521 const uint8x8_t vi17xGHIJKLMN = vld1_u8(i17); i17 += 8;
522 const uint8x8_t vk17xGHIJKLMN = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
523
524 vprod01234567 = vmull_u8(vi17x01234567, vk17x01234567);
525 vsum01234567 = vaddw_u8(vsum01234567, vi17x01234567);
526 vprod89ABCDEF = vmull_u8(vi17x89ABCDEF, vk17x89ABCDEF);
527 vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi17x89ABCDEF);
528 vprodGHIJKLMN = vmull_u8(vi17xGHIJKLMN, vk17xGHIJKLMN);
529 vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi17xGHIJKLMN);
530
531 vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
532 vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
533 vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
534 vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));
535 vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vprodGHIJKLMN)));
536 vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vprodGHIJKLMN)));
537 const uint8x8_t vi18x01234567 = vld1_u8(i18); i18 += 8;
538 const uint8x8_t vk18x01234567 = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
539 const uint8x8_t vi18x89ABCDEF = vld1_u8(i18); i18 += 8;
540 const uint8x8_t vk18x89ABCDEF = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
541 const uint8x8_t vi18xGHIJKLMN = vld1_u8(i18); i18 += 8;
542 const uint8x8_t vk18xGHIJKLMN = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
543
544 vprod01234567 = vmull_u8(vi18x01234567, vk18x01234567);
545 vsum01234567 = vaddw_u8(vsum01234567, vi18x01234567);
546 vprod89ABCDEF = vmull_u8(vi18x89ABCDEF, vk18x89ABCDEF);
547 vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi18x89ABCDEF);
548 vprodGHIJKLMN = vmull_u8(vi18xGHIJKLMN, vk18xGHIJKLMN);
549 vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi18xGHIJKLMN);
550
551 vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
552 vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
553 vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
554 vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));
555 vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vprodGHIJKLMN)));
556 vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vprodGHIJKLMN)));
557 const uint8x8_t vi19x01234567 = vld1_u8(i19); i19 += 8;
558 const uint8x8_t vk19x01234567 = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
559 const uint8x8_t vi19x89ABCDEF = vld1_u8(i19); i19 += 8;
560 const uint8x8_t vk19x89ABCDEF = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
561 const uint8x8_t vi19xGHIJKLMN = vld1_u8(i19); i19 += 8;
562 const uint8x8_t vk19xGHIJKLMN = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
563
564 vprod01234567 = vmull_u8(vi19x01234567, vk19x01234567);
565 vsum01234567 = vaddw_u8(vsum01234567, vi19x01234567);
566 vprod89ABCDEF = vmull_u8(vi19x89ABCDEF, vk19x89ABCDEF);
567 vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi19x89ABCDEF);
568 vprodGHIJKLMN = vmull_u8(vi19xGHIJKLMN, vk19xGHIJKLMN);
569 vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi19xGHIJKLMN);
570
571 vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
572 vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
573 vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
574 vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));
575 vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vprodGHIJKLMN)));
576 vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vprodGHIJKLMN)));
577 const uint8x8_t vi20x01234567 = vld1_u8(i20); i20 += 8;
578 const uint8x8_t vk20x01234567 = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
579 const uint8x8_t vi20x89ABCDEF = vld1_u8(i20); i20 += 8;
580 const uint8x8_t vk20x89ABCDEF = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
581 const uint8x8_t vi20xGHIJKLMN = vld1_u8(i20); i20 += 8;
582 const uint8x8_t vk20xGHIJKLMN = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
583
584 vprod01234567 = vmull_u8(vi20x01234567, vk20x01234567);
585 vsum01234567 = vaddw_u8(vsum01234567, vi20x01234567);
586 vprod89ABCDEF = vmull_u8(vi20x89ABCDEF, vk20x89ABCDEF);
587 vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi20x89ABCDEF);
588 vprodGHIJKLMN = vmull_u8(vi20xGHIJKLMN, vk20xGHIJKLMN);
589 vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi20xGHIJKLMN);
590
591 vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
592 vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
593 vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
594 vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));
595 vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vprodGHIJKLMN)));
596 vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vprodGHIJKLMN)));
597 const uint8x8_t vi21x01234567 = vld1_u8(i21); i21 += 8;
598 const uint8x8_t vk21x01234567 = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
599 const uint8x8_t vi21x89ABCDEF = vld1_u8(i21); i21 += 8;
600 const uint8x8_t vk21x89ABCDEF = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
601 const uint8x8_t vi21xGHIJKLMN = vld1_u8(i21); i21 += 8;
602 const uint8x8_t vk21xGHIJKLMN = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
603
604 vprod01234567 = vmull_u8(vi21x01234567, vk21x01234567);
605 vsum01234567 = vaddw_u8(vsum01234567, vi21x01234567);
606 vprod89ABCDEF = vmull_u8(vi21x89ABCDEF, vk21x89ABCDEF);
607 vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi21x89ABCDEF);
608 vprodGHIJKLMN = vmull_u8(vi21xGHIJKLMN, vk21xGHIJKLMN);
609 vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi21xGHIJKLMN);
610
611 vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
612 vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
613 vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
614 vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));
615 vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vprodGHIJKLMN)));
616 vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vprodGHIJKLMN)));
617 const uint8x8_t vi22x01234567 = vld1_u8(i22); i22 += 8;
618 const uint8x8_t vk22x01234567 = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
619 const uint8x8_t vi22x89ABCDEF = vld1_u8(i22); i22 += 8;
620 const uint8x8_t vk22x89ABCDEF = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
621 const uint8x8_t vi22xGHIJKLMN = vld1_u8(i22); i22 += 8;
622 const uint8x8_t vk22xGHIJKLMN = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
623
624 vprod01234567 = vmull_u8(vi22x01234567, vk22x01234567);
625 vsum01234567 = vaddw_u8(vsum01234567, vi22x01234567);
626 vprod89ABCDEF = vmull_u8(vi22x89ABCDEF, vk22x89ABCDEF);
627 vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi22x89ABCDEF);
628 vprodGHIJKLMN = vmull_u8(vi22xGHIJKLMN, vk22xGHIJKLMN);
629 vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi22xGHIJKLMN);
630
631 vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
632 vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
633 vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
634 vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));
635 vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vprodGHIJKLMN)));
636 vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vprodGHIJKLMN)));
637 const uint8x8_t vi23x01234567 = vld1_u8(i23); i23 += 8;
638 const uint8x8_t vk23x01234567 = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
639 const uint8x8_t vi23x89ABCDEF = vld1_u8(i23); i23 += 8;
640 const uint8x8_t vk23x89ABCDEF = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
641 const uint8x8_t vi23xGHIJKLMN = vld1_u8(i23); i23 += 8;
642 const uint8x8_t vk23xGHIJKLMN = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
643
644 vprod01234567 = vmull_u8(vi23x01234567, vk23x01234567);
645 vsum01234567 = vaddw_u8(vsum01234567, vi23x01234567);
646 vprod89ABCDEF = vmull_u8(vi23x89ABCDEF, vk23x89ABCDEF);
647 vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi23x89ABCDEF);
648 vprodGHIJKLMN = vmull_u8(vi23xGHIJKLMN, vk23xGHIJKLMN);
649 vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi23xGHIJKLMN);
650
651 vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
652 vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
653 vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
654 vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));
655 vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vprodGHIJKLMN)));
656 vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vprodGHIJKLMN)));
657 const uint8x8_t vi24x01234567 = vld1_u8(i24); i24 += 8;
658 const uint8x8_t vk24x01234567 = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
659 const uint8x8_t vi24x89ABCDEF = vld1_u8(i24); i24 += 8;
660 const uint8x8_t vk24x89ABCDEF = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
661 const uint8x8_t vi24xGHIJKLMN = vld1_u8(i24); i24 += 8;
662 const uint8x8_t vk24xGHIJKLMN = vld1_u8(w); w = (const void*) ((const int8_t*) w + 8);
663
664 vprod01234567 = vmull_u8(vi24x01234567, vk24x01234567);
665 vsum01234567 = vaddw_u8(vsum01234567, vi24x01234567);
666 vprod89ABCDEF = vmull_u8(vi24x89ABCDEF, vk24x89ABCDEF);
667 vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi24x89ABCDEF);
668 vprodGHIJKLMN = vmull_u8(vi24xGHIJKLMN, vk24xGHIJKLMN);
669 vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi24xGHIJKLMN);
670
671 vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vprod01234567)));
672 vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vprod01234567)));
673 vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vprod89ABCDEF)));
674 vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vprod89ABCDEF)));
675 vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vprodGHIJKLMN)));
676 vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vprodGHIJKLMN)));
677
678 vacc0123 = vreinterpretq_s32_u32(vmlsl_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vsum01234567), vget_low_u16(vkernel_zero_point16)));
679 vacc4567 = vreinterpretq_s32_u32(vmlsl_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vsum01234567), vget_high_u16(vkernel_zero_point16)));
680 vacc89AB = vreinterpretq_s32_u32(vmlsl_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vsum89ABCDEF), vget_low_u16(vkernel_zero_point16)));
681 vaccCDEF = vreinterpretq_s32_u32(vmlsl_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vsum89ABCDEF), vget_high_u16(vkernel_zero_point16)));
682 vaccGHIJ = vreinterpretq_s32_u32(vmlsl_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vsumGHIJKLMN), vget_low_u16(vkernel_zero_point16)));
683 vaccKLMN = vreinterpretq_s32_u32(vmlsl_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vsumGHIJKLMN), vget_high_u16(vkernel_zero_point16)));
684
685 vacc0123 = vshlq_s32(vacc0123, vright_pre_shift);
686 vacc4567 = vshlq_s32(vacc4567, vright_pre_shift);
687 vacc89AB = vshlq_s32(vacc89AB, vright_pre_shift);
688 vaccCDEF = vshlq_s32(vaccCDEF, vright_pre_shift);
689 vaccGHIJ = vshlq_s32(vaccGHIJ, vright_pre_shift);
690 vaccKLMN = vshlq_s32(vaccKLMN, vright_pre_shift);
691
692 vacc0123 = vqdmulhq_s32(vacc0123, vmultiplier);
693 vacc4567 = vqdmulhq_s32(vacc4567, vmultiplier);
694 vacc89AB = vqdmulhq_s32(vacc89AB, vmultiplier);
695 vaccCDEF = vqdmulhq_s32(vaccCDEF, vmultiplier);
696 vaccGHIJ = vqdmulhq_s32(vaccGHIJ, vmultiplier);
697 vaccKLMN = vqdmulhq_s32(vaccKLMN, vmultiplier);
698
699 vacc0123 = vrshlq_s32(vacc0123, vright_post_shift);
700 vacc4567 = vrshlq_s32(vacc4567, vright_post_shift);
701 vacc89AB = vrshlq_s32(vacc89AB, vright_post_shift);
702 vaccCDEF = vrshlq_s32(vaccCDEF, vright_post_shift);
703 vaccGHIJ = vrshlq_s32(vaccGHIJ, vright_post_shift);
704 vaccKLMN = vrshlq_s32(vaccKLMN, vright_post_shift);
705
706 #if XNN_ARCH_ARM64
707 const int16x8_t vacc01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567), voutput_zero_point);
708 const int16x8_t vacc89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF), voutput_zero_point);
709 const int16x8_t vaccGHIJKLMN = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vaccGHIJ), vaccKLMN), voutput_zero_point);
710
711 uint8x16_t vout0123456789ABCDEF = vqmovun_high_s16(vqmovun_s16(vacc01234567), vacc89ABCDEF);
712 uint8x8_t voutGHIJKLMN = vqmovun_s16(vaccGHIJKLMN);
713 #else
714 const int16x8_t vacc01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)), voutput_zero_point);
715 const int16x8_t vacc89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF)), voutput_zero_point);
716 const int16x8_t vaccGHIJKLMN = vqaddq_s16(vcombine_s16(vqmovn_s32(vaccGHIJ), vqmovn_s32(vaccKLMN)), voutput_zero_point);
717
718 uint8x16_t vout0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc01234567), vqmovun_s16(vacc89ABCDEF));
719 uint8x8_t voutGHIJKLMN = vqmovun_s16(vaccGHIJKLMN);
720 #endif
721
722 vout0123456789ABCDEF = vmaxq_u8(vout0123456789ABCDEF, voutput_min);
723 voutGHIJKLMN = vmax_u8(voutGHIJKLMN, vget_low_u8(voutput_min));
724
725 vout0123456789ABCDEF = vminq_u8(vout0123456789ABCDEF, voutput_max);
726 voutGHIJKLMN = vmin_u8(voutGHIJKLMN, vget_low_u8(voutput_max));
727
728 vst1q_u8(output, vout0123456789ABCDEF); output += 16;
729 vst1_u8(output, voutGHIJKLMN); output += 8;
730 }
731 if XNN_UNLIKELY(c != 0) {
732 const uint8_t* k = (const uint8_t*) ((const int32_t*) w + 24);
733 do {
734 int32x4_t vacc0123 = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4);
735 int32x4_t vacc4567 = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4);
736
737 const int16x8_t vi0x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i0))); i0 += 8;
738 const int16x8_t vk0x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8(k), vkernel_zero_point)); k += 8;
739
740 vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi0x01234567), vget_low_s16(vk0x01234567));
741 vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi0x01234567), vget_high_s16(vk0x01234567));
742 const int16x8_t vi1x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i1))); i1 += 8;
743 const int16x8_t vk1x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) (k + 16)), vkernel_zero_point));
744
745 vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi1x01234567), vget_low_s16(vk1x01234567));
746 vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi1x01234567), vget_high_s16(vk1x01234567));
747 const int16x8_t vi2x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i2))); i2 += 8;
748 const int16x8_t vk2x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) (k + 40)), vkernel_zero_point));
749
750 vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi2x01234567), vget_low_s16(vk2x01234567));
751 vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi2x01234567), vget_high_s16(vk2x01234567));
752 const int16x8_t vi3x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i3))); i3 += 8;
753 const int16x8_t vk3x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) (k + 64)), vkernel_zero_point));
754
755 vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi3x01234567), vget_low_s16(vk3x01234567));
756 vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi3x01234567), vget_high_s16(vk3x01234567));
757 const int16x8_t vi4x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i4))); i4 += 8;
758 const int16x8_t vk4x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) (k + 88)), vkernel_zero_point));
759
760 vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi4x01234567), vget_low_s16(vk4x01234567));
761 vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi4x01234567), vget_high_s16(vk4x01234567));
762 const int16x8_t vi5x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i5))); i5 += 8;
763 const int16x8_t vk5x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) (k + 112)), vkernel_zero_point));
764
765 vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi5x01234567), vget_low_s16(vk5x01234567));
766 vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi5x01234567), vget_high_s16(vk5x01234567));
767 const int16x8_t vi6x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i6))); i6 += 8;
768 const int16x8_t vk6x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) (k + 136)), vkernel_zero_point));
769
770 vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi6x01234567), vget_low_s16(vk6x01234567));
771 vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi6x01234567), vget_high_s16(vk6x01234567));
772 const int16x8_t vi7x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i7))); i7 += 8;
773 const int16x8_t vk7x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) (k + 160)), vkernel_zero_point));
774
775 vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi7x01234567), vget_low_s16(vk7x01234567));
776 vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi7x01234567), vget_high_s16(vk7x01234567));
777 const int16x8_t vi8x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i8))); i8 += 8;
778 const int16x8_t vk8x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) (k + 184)), vkernel_zero_point));
779
780 vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi8x01234567), vget_low_s16(vk8x01234567));
781 vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi8x01234567), vget_high_s16(vk8x01234567));
782 const int16x8_t vi9x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i9))); i9 += 8;
783 const int16x8_t vk9x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) (k + 208)), vkernel_zero_point));
784
785 vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi9x01234567), vget_low_s16(vk9x01234567));
786 vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi9x01234567), vget_high_s16(vk9x01234567));
787 const int16x8_t vi10x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i10))); i10 += 8;
788 const int16x8_t vk10x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) (k + 232)), vkernel_zero_point));
789
790 vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi10x01234567), vget_low_s16(vk10x01234567));
791 vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi10x01234567), vget_high_s16(vk10x01234567));
792 const int16x8_t vi11x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i11))); i11 += 8;
793 const int16x8_t vk11x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) (k + 256)), vkernel_zero_point));
794
795 vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi11x01234567), vget_low_s16(vk11x01234567));
796 vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi11x01234567), vget_high_s16(vk11x01234567));
797 const int16x8_t vi12x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i12))); i12 += 8;
798 const int16x8_t vk12x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) (k + 280)), vkernel_zero_point));
799
800 vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi12x01234567), vget_low_s16(vk12x01234567));
801 vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi12x01234567), vget_high_s16(vk12x01234567));
802 const int16x8_t vi13x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i13))); i13 += 8;
803 const int16x8_t vk13x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) (k + 304)), vkernel_zero_point));
804
805 vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi13x01234567), vget_low_s16(vk13x01234567));
806 vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi13x01234567), vget_high_s16(vk13x01234567));
807 const int16x8_t vi14x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i14))); i14 += 8;
808 const int16x8_t vk14x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) (k + 328)), vkernel_zero_point));
809
810 vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi14x01234567), vget_low_s16(vk14x01234567));
811 vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi14x01234567), vget_high_s16(vk14x01234567));
812 const int16x8_t vi15x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i15))); i15 += 8;
813 const int16x8_t vk15x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) (k + 352)), vkernel_zero_point));
814
815 vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi15x01234567), vget_low_s16(vk15x01234567));
816 vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi15x01234567), vget_high_s16(vk15x01234567));
817 const int16x8_t vi16x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i16))); i16 += 8;
818 const int16x8_t vk16x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) (k + 376)), vkernel_zero_point));
819
820 vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi16x01234567), vget_low_s16(vk16x01234567));
821 vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi16x01234567), vget_high_s16(vk16x01234567));
822 const int16x8_t vi17x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i17))); i17 += 8;
823 const int16x8_t vk17x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) (k + 400)), vkernel_zero_point));
824
825 vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi17x01234567), vget_low_s16(vk17x01234567));
826 vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi17x01234567), vget_high_s16(vk17x01234567));
827 const int16x8_t vi18x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i18))); i18 += 8;
828 const int16x8_t vk18x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) (k + 424)), vkernel_zero_point));
829
830 vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi18x01234567), vget_low_s16(vk18x01234567));
831 vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi18x01234567), vget_high_s16(vk18x01234567));
832 const int16x8_t vi19x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i19))); i19 += 8;
833 const int16x8_t vk19x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) (k + 448)), vkernel_zero_point));
834
835 vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi19x01234567), vget_low_s16(vk19x01234567));
836 vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi19x01234567), vget_high_s16(vk19x01234567));
837 const int16x8_t vi20x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i20))); i20 += 8;
838 const int16x8_t vk20x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) (k + 472)), vkernel_zero_point));
839
840 vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi20x01234567), vget_low_s16(vk20x01234567));
841 vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi20x01234567), vget_high_s16(vk20x01234567));
842 const int16x8_t vi21x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i21))); i21 += 8;
843 const int16x8_t vk21x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) (k + 496)), vkernel_zero_point));
844
845 vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi21x01234567), vget_low_s16(vk21x01234567));
846 vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi21x01234567), vget_high_s16(vk21x01234567));
847 const int16x8_t vi22x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i22))); i22 += 8;
848 const int16x8_t vk22x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) (k + 520)), vkernel_zero_point));
849
850 vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi22x01234567), vget_low_s16(vk22x01234567));
851 vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi22x01234567), vget_high_s16(vk22x01234567));
852 const int16x8_t vi23x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i23))); i23 += 8;
853 const int16x8_t vk23x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) (k + 544)), vkernel_zero_point));
854
855 vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi23x01234567), vget_low_s16(vk23x01234567));
856 vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi23x01234567), vget_high_s16(vk23x01234567));
857 const int16x8_t vi24x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i24))); i24 += 8;
858 const int16x8_t vk24x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) (k + 568)), vkernel_zero_point));
859
860 vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi24x01234567), vget_low_s16(vk24x01234567));
861 vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi24x01234567), vget_high_s16(vk24x01234567));
862
863 vacc0123 = vrshlq_s32(vacc0123, vright_pre_shift);
864 vacc4567 = vrshlq_s32(vacc4567, vright_pre_shift);
865
866 vacc0123 = vqdmulhq_s32(vacc0123, vmultiplier);
867 vacc4567 = vqdmulhq_s32(vacc4567, vmultiplier);
868
869 vacc0123 = vrshlq_s32(vacc0123, vright_post_shift);
870 vacc4567 = vrshlq_s32(vacc4567, vright_post_shift);
871
872 #if XNN_ARCH_ARM64
873 const int16x8_t vacc01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567), voutput_zero_point);
874 uint8x8_t vout01234567 = vqmovun_s16(vacc01234567);
875 #else
876 const int16x8_t vacc01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)), voutput_zero_point);
877 uint8x8_t vout01234567 = vqmovun_s16(vacc01234567);
878 #endif
879
880 vout01234567 = vmax_u8(vout01234567, vget_low_u8(voutput_min));
881 vout01234567 = vmin_u8(vout01234567, vget_low_u8(voutput_max));
882
883 if XNN_LIKELY(c >= 8) {
884 vst1_u8(output, vout01234567); output += 8;
885 c -= 8;
886 } else {
887 if (c & 4) {
888 vst1_lane_u32((void*) output, vreinterpret_u32_u8(vout01234567), 0); output += 4;
889 vout01234567 = vext_u8(vout01234567, vout01234567, 4);
890 }
891 if (c & 2) {
892 vst1_lane_u16((void*) output, vreinterpret_u16_u8(vout01234567), 0); output += 2;
893 vout01234567 = vext_u8(vout01234567, vout01234567, 2);
894 }
895 if (c & 1) {
896 vst1_lane_u8(output, vout01234567, 0); output += 1;
897 }
898 c = 0;
899 }
900 } while (c != 0);
901 }
902
903 output = (uint8_t*) ((uintptr_t) output + output_increment);
904 } while (--output_width != 0);
905 }
906