1 // Auto-generated file. Do not edit!
2 // Template: src/qs8-dwconv/unipass-neon-mul8.c.in
3 // Generator: tools/xngen
4 //
5 // Copyright 2020 Google LLC
6 //
7 // This source code is licensed under the BSD-style license found in the
8 // LICENSE file in the root directory of this source tree.
9
10 #include <assert.h>
11
12 #include <arm_neon.h>
13
14 #include <xnnpack/dwconv.h>
15
16
// Signed 8-bit depthwise-convolution microkernel: 25 kernel taps, 8 channels
// per loop iteration, 64-bit (8 x int8) NEON loads.
//
// For each of `output_width` output pixels the kernel:
//   1. reads 25 input row pointers from `input`; every pointer that is not
//      equal to `zero` is advanced by `input_offset` bytes;
//   2. advances `input` by `input_stride` bytes;
//   3. walks `channels` in groups of 8, producing 8 int8 outputs per group;
//   4. advances `output` by `output_increment` bytes after the last channel.
//
// `weights` is consumed, per group of 8 channels, as 8 int32 initial
// accumulator values followed by 25 groups of 8 int8 kernel taps.
//
// Requantization of the int32 accumulators uses the `rndnu_neon` member of
// `params`: saturating shift by `right_pre_shift`, saturating doubling
// multiply-high by `multiplier`, rounding shift by `right_post_shift`, then
// saturating narrowing, adding `output_zero_point`, and clamping to
// [`output_min`, `output_max`].
//
// Two taps at a time are multiplied and summed in int16 lanes (vmull_s8 +
// vmlal_s8) before being widened into the int32 accumulators.
// NOTE(review): that int16 sum wraps if both products are (-128)*(-128);
// presumably the caller guarantees this cannot happen - confirm.
//
// The remainder path (channels % 8 != 0) still loads full 8-byte vectors from
// the inputs and the weights and only stores the valid lanes, so it reads
// past the last valid channel.
void xnn_qs8_dwconv_minmax_rndnu_ukernel_up8x25__neon_mla8_ld64(
    size_t channels,
    size_t output_width,
    const int8_t** input,
    const void* weights,
    int8_t* output,
    size_t input_stride,
    size_t output_increment,
    size_t input_offset,
    const int8_t* zero,
    const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(channels != 0);
  assert(output_width != 0);

  // Broadcast the requantization and clamping parameters to all vector lanes.
  const int32x4_t vright_pre_shift = vld1q_dup_s32(&params->rndnu_neon.right_pre_shift);
  const int32x4_t vmultiplier = vld1q_dup_s32(&params->rndnu_neon.multiplier);
  const int32x4_t vright_post_shift = vld1q_dup_s32(&params->rndnu_neon.right_post_shift);
  const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->rndnu_neon.output_zero_point);
  const int8x8_t voutput_min = vld1_dup_s8(&params->rndnu_neon.output_min);
  const int8x8_t voutput_max = vld1_dup_s8(&params->rndnu_neon.output_max);
  do {
    // Set up the 25 input row pointers for this output pixel. Rows that point
    // at the shared `zero` buffer are left as is; all others are offset.
    const int8_t* i0 = input[0];
    assert(i0 != NULL);
    if XNN_UNPREDICTABLE(i0 != zero) {
      i0 = (const int8_t*) ((uintptr_t) i0 + input_offset);
    }
    const int8_t* i1 = input[1];
    assert(i1 != NULL);
    if XNN_UNPREDICTABLE(i1 != zero) {
      i1 = (const int8_t*) ((uintptr_t) i1 + input_offset);
    }
    const int8_t* i2 = input[2];
    assert(i2 != NULL);
    if XNN_UNPREDICTABLE(i2 != zero) {
      i2 = (const int8_t*) ((uintptr_t) i2 + input_offset);
    }
    const int8_t* i3 = input[3];
    assert(i3 != NULL);
    if XNN_UNPREDICTABLE(i3 != zero) {
      i3 = (const int8_t*) ((uintptr_t) i3 + input_offset);
    }
    const int8_t* i4 = input[4];
    assert(i4 != NULL);
    if XNN_UNPREDICTABLE(i4 != zero) {
      i4 = (const int8_t*) ((uintptr_t) i4 + input_offset);
    }
    const int8_t* i5 = input[5];
    assert(i5 != NULL);
    if XNN_UNPREDICTABLE(i5 != zero) {
      i5 = (const int8_t*) ((uintptr_t) i5 + input_offset);
    }
    const int8_t* i6 = input[6];
    assert(i6 != NULL);
    if XNN_UNPREDICTABLE(i6 != zero) {
      i6 = (const int8_t*) ((uintptr_t) i6 + input_offset);
    }
    const int8_t* i7 = input[7];
    assert(i7 != NULL);
    if XNN_UNPREDICTABLE(i7 != zero) {
      i7 = (const int8_t*) ((uintptr_t) i7 + input_offset);
    }
    const int8_t* i8 = input[8];
    assert(i8 != NULL);
    if XNN_UNPREDICTABLE(i8 != zero) {
      i8 = (const int8_t*) ((uintptr_t) i8 + input_offset);
    }
    const int8_t* i9 = input[9];
    assert(i9 != NULL);
    if XNN_UNPREDICTABLE(i9 != zero) {
      i9 = (const int8_t*) ((uintptr_t) i9 + input_offset);
    }
    const int8_t* i10 = input[10];
    assert(i10 != NULL);
    if XNN_UNPREDICTABLE(i10 != zero) {
      i10 = (const int8_t*) ((uintptr_t) i10 + input_offset);
    }
    const int8_t* i11 = input[11];
    assert(i11 != NULL);
    if XNN_UNPREDICTABLE(i11 != zero) {
      i11 = (const int8_t*) ((uintptr_t) i11 + input_offset);
    }
    const int8_t* i12 = input[12];
    assert(i12 != NULL);
    if XNN_UNPREDICTABLE(i12 != zero) {
      i12 = (const int8_t*) ((uintptr_t) i12 + input_offset);
    }
    const int8_t* i13 = input[13];
    assert(i13 != NULL);
    if XNN_UNPREDICTABLE(i13 != zero) {
      i13 = (const int8_t*) ((uintptr_t) i13 + input_offset);
    }
    const int8_t* i14 = input[14];
    assert(i14 != NULL);
    if XNN_UNPREDICTABLE(i14 != zero) {
      i14 = (const int8_t*) ((uintptr_t) i14 + input_offset);
    }
    const int8_t* i15 = input[15];
    assert(i15 != NULL);
    if XNN_UNPREDICTABLE(i15 != zero) {
      i15 = (const int8_t*) ((uintptr_t) i15 + input_offset);
    }
    const int8_t* i16 = input[16];
    assert(i16 != NULL);
    if XNN_UNPREDICTABLE(i16 != zero) {
      i16 = (const int8_t*) ((uintptr_t) i16 + input_offset);
    }
    const int8_t* i17 = input[17];
    assert(i17 != NULL);
    if XNN_UNPREDICTABLE(i17 != zero) {
      i17 = (const int8_t*) ((uintptr_t) i17 + input_offset);
    }
    const int8_t* i18 = input[18];
    assert(i18 != NULL);
    if XNN_UNPREDICTABLE(i18 != zero) {
      i18 = (const int8_t*) ((uintptr_t) i18 + input_offset);
    }
    const int8_t* i19 = input[19];
    assert(i19 != NULL);
    if XNN_UNPREDICTABLE(i19 != zero) {
      i19 = (const int8_t*) ((uintptr_t) i19 + input_offset);
    }
    const int8_t* i20 = input[20];
    assert(i20 != NULL);
    if XNN_UNPREDICTABLE(i20 != zero) {
      i20 = (const int8_t*) ((uintptr_t) i20 + input_offset);
    }
    const int8_t* i21 = input[21];
    assert(i21 != NULL);
    if XNN_UNPREDICTABLE(i21 != zero) {
      i21 = (const int8_t*) ((uintptr_t) i21 + input_offset);
    }
    const int8_t* i22 = input[22];
    assert(i22 != NULL);
    if XNN_UNPREDICTABLE(i22 != zero) {
      i22 = (const int8_t*) ((uintptr_t) i22 + input_offset);
    }
    const int8_t* i23 = input[23];
    assert(i23 != NULL);
    if XNN_UNPREDICTABLE(i23 != zero) {
      i23 = (const int8_t*) ((uintptr_t) i23 + input_offset);
    }
    const int8_t* i24 = input[24];
    assert(i24 != NULL);
    if XNN_UNPREDICTABLE(i24 != zero) {
      i24 = (const int8_t*) ((uintptr_t) i24 + input_offset);
    }
    // `input_stride` is in bytes, hence the integer round-trip.
    input = (const int8_t**) ((uintptr_t) input + input_stride);

    size_t c = channels;
    const void* w = weights;
    // Main loop: 8 full channels per iteration.
    for (; c >= 8; c -= 8) {
      // The accumulators start from the 8 int32 values stored ahead of the taps.
      int32x4_t vacc0123 = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4);
      int32x4_t vacc4567 = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4);

      // Taps 0 and 1: multiply, multiply-accumulate in int16, then widen into int32.
      const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8;
      const int8x8_t vk0x01234567 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8);

      int16x8_t vprod01234567 = vmull_s8(vi0x01234567, vk0x01234567);

      const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8;
      const int8x8_t vk1x01234567 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8);

      vprod01234567 = vmlal_s8(vprod01234567, vi1x01234567, vk1x01234567);

      vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567));
      vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567));

      // Taps 2 and 3.
      const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8;
      const int8x8_t vk2x01234567 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8);

      vprod01234567 = vmull_s8(vi2x01234567, vk2x01234567);

      const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8;
      const int8x8_t vk3x01234567 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8);

      vprod01234567 = vmlal_s8(vprod01234567, vi3x01234567, vk3x01234567);

      vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567));
      vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567));

      // Taps 4 and 5.
      const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8;
      const int8x8_t vk4x01234567 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8);

      vprod01234567 = vmull_s8(vi4x01234567, vk4x01234567);

      const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8;
      const int8x8_t vk5x01234567 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8);

      vprod01234567 = vmlal_s8(vprod01234567, vi5x01234567, vk5x01234567);

      vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567));
      vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567));

      // Taps 6 and 7.
      const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8;
      const int8x8_t vk6x01234567 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8);

      vprod01234567 = vmull_s8(vi6x01234567, vk6x01234567);

      const int8x8_t vi7x01234567 = vld1_s8(i7); i7 += 8;
      const int8x8_t vk7x01234567 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8);

      vprod01234567 = vmlal_s8(vprod01234567, vi7x01234567, vk7x01234567);

      vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567));
      vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567));

      // Taps 8 and 9.
      const int8x8_t vi8x01234567 = vld1_s8(i8); i8 += 8;
      const int8x8_t vk8x01234567 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8);

      vprod01234567 = vmull_s8(vi8x01234567, vk8x01234567);

      const int8x8_t vi9x01234567 = vld1_s8(i9); i9 += 8;
      const int8x8_t vk9x01234567 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8);

      vprod01234567 = vmlal_s8(vprod01234567, vi9x01234567, vk9x01234567);

      vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567));
      vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567));

      // Taps 10 and 11.
      const int8x8_t vi10x01234567 = vld1_s8(i10); i10 += 8;
      const int8x8_t vk10x01234567 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8);

      vprod01234567 = vmull_s8(vi10x01234567, vk10x01234567);

      const int8x8_t vi11x01234567 = vld1_s8(i11); i11 += 8;
      const int8x8_t vk11x01234567 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8);

      vprod01234567 = vmlal_s8(vprod01234567, vi11x01234567, vk11x01234567);

      vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567));
      vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567));

      // Taps 12 and 13.
      const int8x8_t vi12x01234567 = vld1_s8(i12); i12 += 8;
      const int8x8_t vk12x01234567 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8);

      vprod01234567 = vmull_s8(vi12x01234567, vk12x01234567);

      const int8x8_t vi13x01234567 = vld1_s8(i13); i13 += 8;
      const int8x8_t vk13x01234567 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8);

      vprod01234567 = vmlal_s8(vprod01234567, vi13x01234567, vk13x01234567);

      vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567));
      vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567));

      // Taps 14 and 15.
      const int8x8_t vi14x01234567 = vld1_s8(i14); i14 += 8;
      const int8x8_t vk14x01234567 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8);

      vprod01234567 = vmull_s8(vi14x01234567, vk14x01234567);

      const int8x8_t vi15x01234567 = vld1_s8(i15); i15 += 8;
      const int8x8_t vk15x01234567 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8);

      vprod01234567 = vmlal_s8(vprod01234567, vi15x01234567, vk15x01234567);

      vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567));
      vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567));

      // Taps 16 and 17.
      const int8x8_t vi16x01234567 = vld1_s8(i16); i16 += 8;
      const int8x8_t vk16x01234567 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8);

      vprod01234567 = vmull_s8(vi16x01234567, vk16x01234567);

      const int8x8_t vi17x01234567 = vld1_s8(i17); i17 += 8;
      const int8x8_t vk17x01234567 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8);

      vprod01234567 = vmlal_s8(vprod01234567, vi17x01234567, vk17x01234567);

      vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567));
      vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567));

      // Taps 18 and 19.
      const int8x8_t vi18x01234567 = vld1_s8(i18); i18 += 8;
      const int8x8_t vk18x01234567 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8);

      vprod01234567 = vmull_s8(vi18x01234567, vk18x01234567);

      const int8x8_t vi19x01234567 = vld1_s8(i19); i19 += 8;
      const int8x8_t vk19x01234567 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8);

      vprod01234567 = vmlal_s8(vprod01234567, vi19x01234567, vk19x01234567);

      vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567));
      vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567));

      // Taps 20 and 21.
      const int8x8_t vi20x01234567 = vld1_s8(i20); i20 += 8;
      const int8x8_t vk20x01234567 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8);

      vprod01234567 = vmull_s8(vi20x01234567, vk20x01234567);

      const int8x8_t vi21x01234567 = vld1_s8(i21); i21 += 8;
      const int8x8_t vk21x01234567 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8);

      vprod01234567 = vmlal_s8(vprod01234567, vi21x01234567, vk21x01234567);

      vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567));
      vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567));

      // Taps 22 and 23.
      const int8x8_t vi22x01234567 = vld1_s8(i22); i22 += 8;
      const int8x8_t vk22x01234567 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8);

      vprod01234567 = vmull_s8(vi22x01234567, vk22x01234567);

      const int8x8_t vi23x01234567 = vld1_s8(i23); i23 += 8;
      const int8x8_t vk23x01234567 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8);

      vprod01234567 = vmlal_s8(vprod01234567, vi23x01234567, vk23x01234567);

      vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567));
      vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567));

      // Tap 24 has no partner, so it is a plain widening multiply.
      const int8x8_t vi24x01234567 = vld1_s8(i24); i24 += 8;
      const int8x8_t vk24x01234567 = vld1_s8(w); w = (const void*) ((const int8_t*) w + 8);

      vprod01234567 = vmull_s8(vi24x01234567, vk24x01234567);

      vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567));
      vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567));

      // Requantize: saturating shift, saturating doubling multiply-high,
      // rounding shift. The shift intrinsics take a signed per-lane amount;
      // NOTE(review): the "right_*_shift" values are presumably stored
      // negated so that these act as right shifts - confirm in params init.
      vacc0123 = vqshlq_s32(vacc0123, vright_pre_shift);
      vacc4567 = vqshlq_s32(vacc4567, vright_pre_shift);

      vacc0123 = vqdmulhq_s32(vacc0123, vmultiplier);
      vacc4567 = vqdmulhq_s32(vacc4567, vmultiplier);

      vacc0123 = vrshlq_s32(vacc0123, vright_post_shift);
      vacc4567 = vrshlq_s32(vacc4567, vright_post_shift);

      // Saturating narrow int32 -> int16, add the zero point, narrow to int8.
#if XNN_ARCH_ARM64
      int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);

      vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point);

      int8x8_t vout01234567 = vqmovn_s16(vacc01234567);
#else  // !XNN_ARCH_ARM64
      int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));

      vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point);

      int8x8_t vout01234567 = vqmovn_s16(vacc01234567);
#endif  // !XNN_ARCH_ARM64

      // Clamp to the requested output range.
      vout01234567 = vmax_s8(vout01234567, voutput_min);

      vout01234567 = vmin_s8(vout01234567, voutput_max);

      vst1_s8(output, vout01234567); output += 8;
    }
    // Remainder: 1 to 7 channels left. A full 8-lane result is computed from
    // full 8-byte loads; only the valid lanes are stored below. The input and
    // weight pointers are not advanced because this is the final group.
    if XNN_UNLIKELY(c != 0) {
      {
        int32x4_t vacc0123 = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4);
        int32x4_t vacc4567 = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4);

        // Tap k is read at byte offset 8 * k from `w`.
        const int8x8_t vi0x01234567 = vld1_s8(i0);
        const int8x8_t vk0x01234567 = vld1_s8(w);

        int16x8_t vprod01234567 = vmull_s8(vi0x01234567, vk0x01234567);

        const int8x8_t vi1x01234567 = vld1_s8(i1);
        const int8x8_t vk1x01234567 = vld1_s8((const void*) ((const int8_t*) w + 8));

        vprod01234567 = vmlal_s8(vprod01234567, vi1x01234567, vk1x01234567);

        vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567));
        vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567));

        const int8x8_t vi2x01234567 = vld1_s8(i2);
        const int8x8_t vk2x01234567 = vld1_s8((const void*) ((const int8_t*) w + 16));

        vprod01234567 = vmull_s8(vi2x01234567, vk2x01234567);

        const int8x8_t vi3x01234567 = vld1_s8(i3);
        const int8x8_t vk3x01234567 = vld1_s8((const void*) ((const int8_t*) w + 24));

        vprod01234567 = vmlal_s8(vprod01234567, vi3x01234567, vk3x01234567);

        vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567));
        vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567));

        const int8x8_t vi4x01234567 = vld1_s8(i4);
        const int8x8_t vk4x01234567 = vld1_s8((const void*) ((const int8_t*) w + 32));

        vprod01234567 = vmull_s8(vi4x01234567, vk4x01234567);

        const int8x8_t vi5x01234567 = vld1_s8(i5);
        const int8x8_t vk5x01234567 = vld1_s8((const void*) ((const int8_t*) w + 40));

        vprod01234567 = vmlal_s8(vprod01234567, vi5x01234567, vk5x01234567);

        vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567));
        vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567));

        const int8x8_t vi6x01234567 = vld1_s8(i6);
        const int8x8_t vk6x01234567 = vld1_s8((const void*) ((const int8_t*) w + 48));

        vprod01234567 = vmull_s8(vi6x01234567, vk6x01234567);

        const int8x8_t vi7x01234567 = vld1_s8(i7);
        const int8x8_t vk7x01234567 = vld1_s8((const void*) ((const int8_t*) w + 56));

        vprod01234567 = vmlal_s8(vprod01234567, vi7x01234567, vk7x01234567);

        vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567));
        vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567));

        const int8x8_t vi8x01234567 = vld1_s8(i8);
        const int8x8_t vk8x01234567 = vld1_s8((const void*) ((const int8_t*) w + 64));

        vprod01234567 = vmull_s8(vi8x01234567, vk8x01234567);

        const int8x8_t vi9x01234567 = vld1_s8(i9);
        const int8x8_t vk9x01234567 = vld1_s8((const void*) ((const int8_t*) w + 72));

        vprod01234567 = vmlal_s8(vprod01234567, vi9x01234567, vk9x01234567);

        vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567));
        vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567));

        const int8x8_t vi10x01234567 = vld1_s8(i10);
        const int8x8_t vk10x01234567 = vld1_s8((const void*) ((const int8_t*) w + 80));

        vprod01234567 = vmull_s8(vi10x01234567, vk10x01234567);

        const int8x8_t vi11x01234567 = vld1_s8(i11);
        const int8x8_t vk11x01234567 = vld1_s8((const void*) ((const int8_t*) w + 88));

        vprod01234567 = vmlal_s8(vprod01234567, vi11x01234567, vk11x01234567);

        vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567));
        vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567));

        const int8x8_t vi12x01234567 = vld1_s8(i12);
        const int8x8_t vk12x01234567 = vld1_s8((const void*) ((const int8_t*) w + 96));

        vprod01234567 = vmull_s8(vi12x01234567, vk12x01234567);

        const int8x8_t vi13x01234567 = vld1_s8(i13);
        const int8x8_t vk13x01234567 = vld1_s8((const void*) ((const int8_t*) w + 104));

        vprod01234567 = vmlal_s8(vprod01234567, vi13x01234567, vk13x01234567);

        vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567));
        vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567));

        const int8x8_t vi14x01234567 = vld1_s8(i14);
        const int8x8_t vk14x01234567 = vld1_s8((const void*) ((const int8_t*) w + 112));

        vprod01234567 = vmull_s8(vi14x01234567, vk14x01234567);

        const int8x8_t vi15x01234567 = vld1_s8(i15);
        const int8x8_t vk15x01234567 = vld1_s8((const void*) ((const int8_t*) w + 120));

        vprod01234567 = vmlal_s8(vprod01234567, vi15x01234567, vk15x01234567);

        vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567));
        vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567));

        const int8x8_t vi16x01234567 = vld1_s8(i16);
        const int8x8_t vk16x01234567 = vld1_s8((const void*) ((const int8_t*) w + 128));

        vprod01234567 = vmull_s8(vi16x01234567, vk16x01234567);

        const int8x8_t vi17x01234567 = vld1_s8(i17);
        const int8x8_t vk17x01234567 = vld1_s8((const void*) ((const int8_t*) w + 136));

        vprod01234567 = vmlal_s8(vprod01234567, vi17x01234567, vk17x01234567);

        vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567));
        vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567));

        const int8x8_t vi18x01234567 = vld1_s8(i18);
        const int8x8_t vk18x01234567 = vld1_s8((const void*) ((const int8_t*) w + 144));

        vprod01234567 = vmull_s8(vi18x01234567, vk18x01234567);

        const int8x8_t vi19x01234567 = vld1_s8(i19);
        const int8x8_t vk19x01234567 = vld1_s8((const void*) ((const int8_t*) w + 152));

        vprod01234567 = vmlal_s8(vprod01234567, vi19x01234567, vk19x01234567);

        vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567));
        vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567));

        const int8x8_t vi20x01234567 = vld1_s8(i20);
        const int8x8_t vk20x01234567 = vld1_s8((const void*) ((const int8_t*) w + 160));

        vprod01234567 = vmull_s8(vi20x01234567, vk20x01234567);

        const int8x8_t vi21x01234567 = vld1_s8(i21);
        const int8x8_t vk21x01234567 = vld1_s8((const void*) ((const int8_t*) w + 168));

        vprod01234567 = vmlal_s8(vprod01234567, vi21x01234567, vk21x01234567);

        vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567));
        vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567));

        const int8x8_t vi22x01234567 = vld1_s8(i22);
        const int8x8_t vk22x01234567 = vld1_s8((const void*) ((const int8_t*) w + 176));

        vprod01234567 = vmull_s8(vi22x01234567, vk22x01234567);

        const int8x8_t vi23x01234567 = vld1_s8(i23);
        const int8x8_t vk23x01234567 = vld1_s8((const void*) ((const int8_t*) w + 184));

        vprod01234567 = vmlal_s8(vprod01234567, vi23x01234567, vk23x01234567);

        vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567));
        vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567));

        const int8x8_t vi24x01234567 = vld1_s8(i24);
        const int8x8_t vk24x01234567 = vld1_s8((const void*) ((const int8_t*) w + 192));

        vprod01234567 = vmull_s8(vi24x01234567, vk24x01234567);

        vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vprod01234567));
        vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vprod01234567));

        // Same requantization sequence as in the main loop.
        vacc0123 = vqshlq_s32(vacc0123, vright_pre_shift);
        vacc4567 = vqshlq_s32(vacc4567, vright_pre_shift);

        vacc0123 = vqdmulhq_s32(vacc0123, vmultiplier);
        vacc4567 = vqdmulhq_s32(vacc4567, vmultiplier);

        vacc0123 = vrshlq_s32(vacc0123, vright_post_shift);
        vacc4567 = vrshlq_s32(vacc4567, vright_post_shift);

#if XNN_ARCH_ARM64
        int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
#else
        int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
#endif
        vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point);

        int8x8_t vout01234567 = vqmovn_s16(vacc01234567);
        vout01234567 = vmax_s8(vout01234567, voutput_min);
        vout01234567 = vmin_s8(vout01234567, voutput_max);

        // Store 4, 2 and 1 lanes according to the bits of `c`; after each
        // partial store the vector is rotated so the next unstored lane
        // becomes lane 0.
        if (c & 4) {
          vst1_lane_u32((void*) output, vreinterpret_u32_s8(vout01234567), 0); output += 4;
          vout01234567 = vext_s8(vout01234567, vout01234567, 4);
        }
        if (c & 2) {
          vst1_lane_u16((void*) output, vreinterpret_u16_s8(vout01234567), 0); output += 2;
          vout01234567 = vext_s8(vout01234567, vout01234567, 2);
        }
        if (c & 1) {
          vst1_lane_s8(output, vout01234567, 0); output += 1;
        }
      }
    }

    // `output_increment` is in bytes and is applied once per output pixel.
    output = (int8_t*) ((uintptr_t) output + output_increment);
  } while (--output_width != 0);
}
547