xref: /aosp_15_r20/external/XNNPACK/src/qs8-gemm/gen/4x16c2-minmax-rndnu-neon-mlal-ld2r.c (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1 // Auto-generated file. Do not edit!
2 //   Template: src/qs8-gemm/c2-neon-mull-dup.c.in
3 //   Generator: tools/xngen
4 //
5 // Copyright 2021 Google LLC
6 //
7 // This source code is licensed under the BSD-style license found in the
8 // LICENSE file in the root directory of this source tree.
9 
10 #include <assert.h>
11 
12 #include <arm_neon.h>
13 
14 #include <xnnpack/gemm.h>
15 #include <xnnpack/math.h>
16 
17 void xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld2r(
18     size_t mr,
19     size_t nc,
20     size_t kc,
21     const int8_t* restrict a,
22     size_t a_stride,
23     const void* restrict w,
24     int8_t* restrict c,
25     size_t cm_stride,
26     size_t cn_stride,
27     const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
28 {
29   assert(mr != 0);
30   assert(mr <= 4);
31   assert(nc != 0);
32   assert(kc != 0);
33   assert(kc % sizeof(int8_t) == 0);
34   assert(a != NULL);
35   assert(w != NULL);
36   assert(c != NULL);
37 
38   kc = round_up_po2(kc, 2 * sizeof(int8_t));
39   const int8_t* a0 = a;
40   int8_t* c0 = c;
41   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
42   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
43   if XNN_UNPREDICTABLE(mr < 2) {
44     a1 = a0;
45     c1 = c0;
46   }
47   const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
48   int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
49   if XNN_UNPREDICTABLE(mr <= 2) {
50     a2 = a1;
51     c2 = c1;
52   }
53   const int8_t* a3 = (const int8_t*) ((uintptr_t) a2 + a_stride);
54   int8_t* c3 = (int8_t*) ((uintptr_t) c2 + cm_stride);
55   if XNN_UNPREDICTABLE(mr != 4) {
56     a3 = a2;
57     c3 = c2;
58   }
59 
60   do {
61     int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
62     int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
63     int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
64     int32x4_t vacc0xCDEF = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
65     int32x4_t vacc1x0123 = vacc0x0123;
66     int32x4_t vacc1x4567 = vacc0x4567;
67     int32x4_t vacc1x89AB = vacc0x89AB;
68     int32x4_t vacc1xCDEF = vacc0xCDEF;
69     int32x4_t vacc2x0123 = vacc0x0123;
70     int32x4_t vacc2x4567 = vacc0x4567;
71     int32x4_t vacc2x89AB = vacc0x89AB;
72     int32x4_t vacc2xCDEF = vacc0xCDEF;
73     int32x4_t vacc3x0123 = vacc0x0123;
74     int32x4_t vacc3x4567 = vacc0x4567;
75     int32x4_t vacc3x89AB = vacc0x89AB;
76     int32x4_t vacc3xCDEF = vacc0xCDEF;
77 
78     size_t k = kc;
79 
80     while (k >= 16 * sizeof(int8_t)) {
81       const int16x4x2_t va00x0 = vld2_dup_s16((const void*)a0);
82       const int16x4x2_t va01x0 = vld2_dup_s16((const void*)(a0 + 4)); a0 += 8;
83       const int16x4x2_t va00x1 = vld2_dup_s16((const void*)a0);
84       const int16x4x2_t va01x1 = vld2_dup_s16((const void*)(a0 + 4)); a0 += 8;
85       const int16x4x2_t va10x0 = vld2_dup_s16((const void*)a1);
86       const int16x4x2_t va11x0 = vld2_dup_s16((const void*)(a1 + 4)); a1 += 8;
87       const int16x4x2_t va10x1 = vld2_dup_s16((const void*)a1);
88       const int16x4x2_t va11x1 = vld2_dup_s16((const void*)(a1 + 4)); a1 += 8;
89       const int16x4x2_t va20x0 = vld2_dup_s16((const void*)a2);
90       const int16x4x2_t va21x0 = vld2_dup_s16((const void*)(a2 + 4)); a2 += 8;
91       const int16x4x2_t va20x1 = vld2_dup_s16((const void*)a2);
92       const int16x4x2_t va21x1 = vld2_dup_s16((const void*)(a2 + 4)); a2 += 8;
93       const int16x4x2_t va30x0 = vld2_dup_s16((const void*)a3);
94       const int16x4x2_t va31x0 = vld2_dup_s16((const void*)(a3 + 4)); a3 += 8;
95       const int16x4x2_t va30x1 = vld2_dup_s16((const void*)a3);
96       const int16x4x2_t va31x1 = vld2_dup_s16((const void*)(a3 + 4)); a3 += 8;
97       const int8x8_t vb0123c0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
98       const int8x8_t vb4567c0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
99       const int8x8_t vb89ABc0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
100       const int8x8_t vbCDEFc0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
101       const int8x8_t vb0123c1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
102       const int8x8_t vb4567c1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
103       const int8x8_t vb89ABc1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
104       const int8x8_t vbCDEFc1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
105       const int8x8_t vb0123c2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
106       const int8x8_t vb4567c2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
107       const int8x8_t vb89ABc2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
108       const int8x8_t vbCDEFc2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
109       const int8x8_t vb0123c3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
110       const int8x8_t vb4567c3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
111       const int8x8_t vb89ABc3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
112       const int8x8_t vbCDEFc3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
113 
114       const int8x8_t va0c0x0 = vreinterpret_s8_s16(va00x0.val[0]);
115       const int8x8_t va0c0x1 = vreinterpret_s8_s16(va00x1.val[0]);
116       const int8x8_t va1c0x0 = vreinterpret_s8_s16(va10x0.val[0]);
117       const int8x8_t va1c0x1 = vreinterpret_s8_s16(va10x1.val[0]);
118       const int8x8_t va2c0x0 = vreinterpret_s8_s16(va20x0.val[0]);
119       const int8x8_t va2c0x1 = vreinterpret_s8_s16(va20x1.val[0]);
120       const int8x8_t va3c0x0 = vreinterpret_s8_s16(va30x0.val[0]);
121       const int8x8_t va3c0x1 = vreinterpret_s8_s16(va30x1.val[0]);
122 
123       int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0x0, va0c0x0);
124       int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0x0, va1c0x0);
125       int16x8_t vprod2x0123c0 = vmull_s8(vb0123c0x0, va2c0x0);
126       int16x8_t vprod3x0123c0 = vmull_s8(vb0123c0x0, va3c0x0);
127       const int8x8_t vb0123c0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
128       vprod0x0123c0 = vmlal_s8(vprod0x0123c0, vb0123c0x1, va0c0x1);
129       vprod1x0123c0 = vmlal_s8(vprod1x0123c0, vb0123c0x1, va1c0x1);
130       vprod2x0123c0 = vmlal_s8(vprod2x0123c0, vb0123c0x1, va2c0x1);
131       vprod3x0123c0 = vmlal_s8(vprod3x0123c0, vb0123c0x1, va3c0x1);
132       vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
133       vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
134       vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c0);
135       vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c0);
136       int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0x0, va0c0x0);
137       int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0x0, va1c0x0);
138       int16x8_t vprod2x4567c0 = vmull_s8(vb4567c0x0, va2c0x0);
139       int16x8_t vprod3x4567c0 = vmull_s8(vb4567c0x0, va3c0x0);
140       const int8x8_t vb4567c0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
141       vprod0x4567c0 = vmlal_s8(vprod0x4567c0, vb4567c0x1, va0c0x1);
142       vprod1x4567c0 = vmlal_s8(vprod1x4567c0, vb4567c0x1, va1c0x1);
143       vprod2x4567c0 = vmlal_s8(vprod2x4567c0, vb4567c0x1, va2c0x1);
144       vprod3x4567c0 = vmlal_s8(vprod3x4567c0, vb4567c0x1, va3c0x1);
145       vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
146       vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
147       vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c0);
148       vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c0);
149       int16x8_t vprod0x89ABc0 = vmull_s8(vb89ABc0x0, va0c0x0);
150       int16x8_t vprod1x89ABc0 = vmull_s8(vb89ABc0x0, va1c0x0);
151       int16x8_t vprod2x89ABc0 = vmull_s8(vb89ABc0x0, va2c0x0);
152       int16x8_t vprod3x89ABc0 = vmull_s8(vb89ABc0x0, va3c0x0);
153       const int8x8_t vb89ABc0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
154       vprod0x89ABc0 = vmlal_s8(vprod0x89ABc0, vb89ABc0x1, va0c0x1);
155       vprod1x89ABc0 = vmlal_s8(vprod1x89ABc0, vb89ABc0x1, va1c0x1);
156       vprod2x89ABc0 = vmlal_s8(vprod2x89ABc0, vb89ABc0x1, va2c0x1);
157       vprod3x89ABc0 = vmlal_s8(vprod3x89ABc0, vb89ABc0x1, va3c0x1);
158       vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0);
159       vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc0);
160       vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc0);
161       vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc0);
162       int16x8_t vprod0xCDEFc0 = vmull_s8(vbCDEFc0x0, va0c0x0);
163       int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0x0, va1c0x0);
164       int16x8_t vprod2xCDEFc0 = vmull_s8(vbCDEFc0x0, va2c0x0);
165       int16x8_t vprod3xCDEFc0 = vmull_s8(vbCDEFc0x0, va3c0x0);
166       const int8x8_t vbCDEFc0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
167       vprod0xCDEFc0 = vmlal_s8(vprod0xCDEFc0, vbCDEFc0x1, va0c0x1);
168       vprod1xCDEFc0 = vmlal_s8(vprod1xCDEFc0, vbCDEFc0x1, va1c0x1);
169       vprod2xCDEFc0 = vmlal_s8(vprod2xCDEFc0, vbCDEFc0x1, va2c0x1);
170       vprod3xCDEFc0 = vmlal_s8(vprod3xCDEFc0, vbCDEFc0x1, va3c0x1);
171       vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc0);
172       vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0);
173       vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc0);
174       vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc0);
175       const int8x8_t va0c1x0 = vreinterpret_s8_s16(va00x0.val[1]);
176       const int8x8_t va0c1x1 = vreinterpret_s8_s16(va00x1.val[1]);
177       const int8x8_t va1c1x0 = vreinterpret_s8_s16(va10x0.val[1]);
178       const int8x8_t va1c1x1 = vreinterpret_s8_s16(va10x1.val[1]);
179       const int8x8_t va2c1x0 = vreinterpret_s8_s16(va20x0.val[1]);
180       const int8x8_t va2c1x1 = vreinterpret_s8_s16(va20x1.val[1]);
181       const int8x8_t va3c1x0 = vreinterpret_s8_s16(va30x0.val[1]);
182       const int8x8_t va3c1x1 = vreinterpret_s8_s16(va30x1.val[1]);
183 
184       int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1x0, va0c1x0);
185       int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1x0, va1c1x0);
186       int16x8_t vprod2x0123c1 = vmull_s8(vb0123c1x0, va2c1x0);
187       int16x8_t vprod3x0123c1 = vmull_s8(vb0123c1x0, va3c1x0);
188       const int8x8_t vb0123c1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
189       vprod0x0123c1 = vmlal_s8(vprod0x0123c1, vb0123c1x1, va0c1x1);
190       vprod1x0123c1 = vmlal_s8(vprod1x0123c1, vb0123c1x1, va1c1x1);
191       vprod2x0123c1 = vmlal_s8(vprod2x0123c1, vb0123c1x1, va2c1x1);
192       vprod3x0123c1 = vmlal_s8(vprod3x0123c1, vb0123c1x1, va3c1x1);
193       vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
194       vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
195       vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c1);
196       vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c1);
197       int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1x0, va0c1x0);
198       int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1x0, va1c1x0);
199       int16x8_t vprod2x4567c1 = vmull_s8(vb4567c1x0, va2c1x0);
200       int16x8_t vprod3x4567c1 = vmull_s8(vb4567c1x0, va3c1x0);
201       const int8x8_t vb4567c1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
202       vprod0x4567c1 = vmlal_s8(vprod0x4567c1, vb4567c1x1, va0c1x1);
203       vprod1x4567c1 = vmlal_s8(vprod1x4567c1, vb4567c1x1, va1c1x1);
204       vprod2x4567c1 = vmlal_s8(vprod2x4567c1, vb4567c1x1, va2c1x1);
205       vprod3x4567c1 = vmlal_s8(vprod3x4567c1, vb4567c1x1, va3c1x1);
206       vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
207       vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
208       vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c1);
209       vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c1);
210       int16x8_t vprod0x89ABc1 = vmull_s8(vb89ABc1x0, va0c1x0);
211       int16x8_t vprod1x89ABc1 = vmull_s8(vb89ABc1x0, va1c1x0);
212       int16x8_t vprod2x89ABc1 = vmull_s8(vb89ABc1x0, va2c1x0);
213       int16x8_t vprod3x89ABc1 = vmull_s8(vb89ABc1x0, va3c1x0);
214       const int8x8_t vb89ABc1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
215       vprod0x89ABc1 = vmlal_s8(vprod0x89ABc1, vb89ABc1x1, va0c1x1);
216       vprod1x89ABc1 = vmlal_s8(vprod1x89ABc1, vb89ABc1x1, va1c1x1);
217       vprod2x89ABc1 = vmlal_s8(vprod2x89ABc1, vb89ABc1x1, va2c1x1);
218       vprod3x89ABc1 = vmlal_s8(vprod3x89ABc1, vb89ABc1x1, va3c1x1);
219       vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc1);
220       vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc1);
221       vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc1);
222       vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc1);
223       int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1x0, va0c1x0);
224       int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1x0, va1c1x0);
225       int16x8_t vprod2xCDEFc1 = vmull_s8(vbCDEFc1x0, va2c1x0);
226       int16x8_t vprod3xCDEFc1 = vmull_s8(vbCDEFc1x0, va3c1x0);
227       const int8x8_t vbCDEFc1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
228       vprod0xCDEFc1 = vmlal_s8(vprod0xCDEFc1, vbCDEFc1x1, va0c1x1);
229       vprod1xCDEFc1 = vmlal_s8(vprod1xCDEFc1, vbCDEFc1x1, va1c1x1);
230       vprod2xCDEFc1 = vmlal_s8(vprod2xCDEFc1, vbCDEFc1x1, va2c1x1);
231       vprod3xCDEFc1 = vmlal_s8(vprod3xCDEFc1, vbCDEFc1x1, va3c1x1);
232       vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc1);
233       vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1);
234       vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc1);
235       vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc1);
236       const int8x8_t va0c2x0 = vreinterpret_s8_s16(va01x0.val[0]);
237       const int8x8_t va0c2x1 = vreinterpret_s8_s16(va01x1.val[0]);
238       const int8x8_t va1c2x0 = vreinterpret_s8_s16(va11x0.val[0]);
239       const int8x8_t va1c2x1 = vreinterpret_s8_s16(va11x1.val[0]);
240       const int8x8_t va2c2x0 = vreinterpret_s8_s16(va21x0.val[0]);
241       const int8x8_t va2c2x1 = vreinterpret_s8_s16(va21x1.val[0]);
242       const int8x8_t va3c2x0 = vreinterpret_s8_s16(va31x0.val[0]);
243       const int8x8_t va3c2x1 = vreinterpret_s8_s16(va31x1.val[0]);
244 
245       int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2x0, va0c2x0);
246       int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2x0, va1c2x0);
247       int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2x0, va2c2x0);
248       int16x8_t vprod3x0123c2 = vmull_s8(vb0123c2x0, va3c2x0);
249       const int8x8_t vb0123c2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
250       vprod0x0123c2 = vmlal_s8(vprod0x0123c2, vb0123c2x1, va0c2x1);
251       vprod1x0123c2 = vmlal_s8(vprod1x0123c2, vb0123c2x1, va1c2x1);
252       vprod2x0123c2 = vmlal_s8(vprod2x0123c2, vb0123c2x1, va2c2x1);
253       vprod3x0123c2 = vmlal_s8(vprod3x0123c2, vb0123c2x1, va3c2x1);
254       vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
255       vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
256       vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c2);
257       vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c2);
258       int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2x0, va0c2x0);
259       int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2x0, va1c2x0);
260       int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2x0, va2c2x0);
261       int16x8_t vprod3x4567c2 = vmull_s8(vb4567c2x0, va3c2x0);
262       const int8x8_t vb4567c2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
263       vprod0x4567c2 = vmlal_s8(vprod0x4567c2, vb4567c2x1, va0c2x1);
264       vprod1x4567c2 = vmlal_s8(vprod1x4567c2, vb4567c2x1, va1c2x1);
265       vprod2x4567c2 = vmlal_s8(vprod2x4567c2, vb4567c2x1, va2c2x1);
266       vprod3x4567c2 = vmlal_s8(vprod3x4567c2, vb4567c2x1, va3c2x1);
267       vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
268       vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
269       vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c2);
270       vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c2);
271       int16x8_t vprod0x89ABc2 = vmull_s8(vb89ABc2x0, va0c2x0);
272       int16x8_t vprod1x89ABc2 = vmull_s8(vb89ABc2x0, va1c2x0);
273       int16x8_t vprod2x89ABc2 = vmull_s8(vb89ABc2x0, va2c2x0);
274       int16x8_t vprod3x89ABc2 = vmull_s8(vb89ABc2x0, va3c2x0);
275       const int8x8_t vb89ABc2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
276       vprod0x89ABc2 = vmlal_s8(vprod0x89ABc2, vb89ABc2x1, va0c2x1);
277       vprod1x89ABc2 = vmlal_s8(vprod1x89ABc2, vb89ABc2x1, va1c2x1);
278       vprod2x89ABc2 = vmlal_s8(vprod2x89ABc2, vb89ABc2x1, va2c2x1);
279       vprod3x89ABc2 = vmlal_s8(vprod3x89ABc2, vb89ABc2x1, va3c2x1);
280       vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2);
281       vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc2);
282       vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc2);
283       vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc2);
284       int16x8_t vprod0xCDEFc2 = vmull_s8(vbCDEFc2x0, va0c2x0);
285       int16x8_t vprod1xCDEFc2 = vmull_s8(vbCDEFc2x0, va1c2x0);
286       int16x8_t vprod2xCDEFc2 = vmull_s8(vbCDEFc2x0, va2c2x0);
287       int16x8_t vprod3xCDEFc2 = vmull_s8(vbCDEFc2x0, va3c2x0);
288       const int8x8_t vbCDEFc2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
289       vprod0xCDEFc2 = vmlal_s8(vprod0xCDEFc2, vbCDEFc2x1, va0c2x1);
290       vprod1xCDEFc2 = vmlal_s8(vprod1xCDEFc2, vbCDEFc2x1, va1c2x1);
291       vprod2xCDEFc2 = vmlal_s8(vprod2xCDEFc2, vbCDEFc2x1, va2c2x1);
292       vprod3xCDEFc2 = vmlal_s8(vprod3xCDEFc2, vbCDEFc2x1, va3c2x1);
293       vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc2);
294       vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc2);
295       vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc2);
296       vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc2);
297       const int8x8_t va0c3x0 = vreinterpret_s8_s16(va01x0.val[1]);
298       const int8x8_t va0c3x1 = vreinterpret_s8_s16(va01x1.val[1]);
299       const int8x8_t va1c3x0 = vreinterpret_s8_s16(va11x0.val[1]);
300       const int8x8_t va1c3x1 = vreinterpret_s8_s16(va11x1.val[1]);
301       const int8x8_t va2c3x0 = vreinterpret_s8_s16(va21x0.val[1]);
302       const int8x8_t va2c3x1 = vreinterpret_s8_s16(va21x1.val[1]);
303       const int8x8_t va3c3x0 = vreinterpret_s8_s16(va31x0.val[1]);
304       const int8x8_t va3c3x1 = vreinterpret_s8_s16(va31x1.val[1]);
305 
306       int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3x0, va0c3x0);
307       int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3x0, va1c3x0);
308       int16x8_t vprod2x0123c3 = vmull_s8(vb0123c3x0, va2c3x0);
309       int16x8_t vprod3x0123c3 = vmull_s8(vb0123c3x0, va3c3x0);
310       const int8x8_t vb0123c3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
311       vprod0x0123c3 = vmlal_s8(vprod0x0123c3, vb0123c3x1, va0c3x1);
312       vprod1x0123c3 = vmlal_s8(vprod1x0123c3, vb0123c3x1, va1c3x1);
313       vprod2x0123c3 = vmlal_s8(vprod2x0123c3, vb0123c3x1, va2c3x1);
314       vprod3x0123c3 = vmlal_s8(vprod3x0123c3, vb0123c3x1, va3c3x1);
315       vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
316       vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
317       vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c3);
318       vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c3);
319       int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3x0, va0c3x0);
320       int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3x0, va1c3x0);
321       int16x8_t vprod2x4567c3 = vmull_s8(vb4567c3x0, va2c3x0);
322       int16x8_t vprod3x4567c3 = vmull_s8(vb4567c3x0, va3c3x0);
323       const int8x8_t vb4567c3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
324       vprod0x4567c3 = vmlal_s8(vprod0x4567c3, vb4567c3x1, va0c3x1);
325       vprod1x4567c3 = vmlal_s8(vprod1x4567c3, vb4567c3x1, va1c3x1);
326       vprod2x4567c3 = vmlal_s8(vprod2x4567c3, vb4567c3x1, va2c3x1);
327       vprod3x4567c3 = vmlal_s8(vprod3x4567c3, vb4567c3x1, va3c3x1);
328       vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
329       vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
330       vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c3);
331       vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c3);
332       int16x8_t vprod0x89ABc3 = vmull_s8(vb89ABc3x0, va0c3x0);
333       int16x8_t vprod1x89ABc3 = vmull_s8(vb89ABc3x0, va1c3x0);
334       int16x8_t vprod2x89ABc3 = vmull_s8(vb89ABc3x0, va2c3x0);
335       int16x8_t vprod3x89ABc3 = vmull_s8(vb89ABc3x0, va3c3x0);
336       const int8x8_t vb89ABc3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
337       vprod0x89ABc3 = vmlal_s8(vprod0x89ABc3, vb89ABc3x1, va0c3x1);
338       vprod1x89ABc3 = vmlal_s8(vprod1x89ABc3, vb89ABc3x1, va1c3x1);
339       vprod2x89ABc3 = vmlal_s8(vprod2x89ABc3, vb89ABc3x1, va2c3x1);
340       vprod3x89ABc3 = vmlal_s8(vprod3x89ABc3, vb89ABc3x1, va3c3x1);
341       vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc3);
342       vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc3);
343       vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc3);
344       vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc3);
345       int16x8_t vprod0xCDEFc3 = vmull_s8(vbCDEFc3x0, va0c3x0);
346       int16x8_t vprod1xCDEFc3 = vmull_s8(vbCDEFc3x0, va1c3x0);
347       int16x8_t vprod2xCDEFc3 = vmull_s8(vbCDEFc3x0, va2c3x0);
348       int16x8_t vprod3xCDEFc3 = vmull_s8(vbCDEFc3x0, va3c3x0);
349       const int8x8_t vbCDEFc3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
350       vprod0xCDEFc3 = vmlal_s8(vprod0xCDEFc3, vbCDEFc3x1, va0c3x1);
351       vprod1xCDEFc3 = vmlal_s8(vprod1xCDEFc3, vbCDEFc3x1, va1c3x1);
352       vprod2xCDEFc3 = vmlal_s8(vprod2xCDEFc3, vbCDEFc3x1, va2c3x1);
353       vprod3xCDEFc3 = vmlal_s8(vprod3xCDEFc3, vbCDEFc3x1, va3c3x1);
354       vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc3);
355       vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc3);
356       vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc3);
357       vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc3);
358 
359       k -= 16 * sizeof(int8_t);
360     }
361 
362     if (k >= 8 * sizeof(int8_t)) {
363       const int16x4x2_t va00 = vld2_dup_s16((const void*)a0);
364       const int16x4x2_t va01 = vld2_dup_s16((const void*)(a0 + 4)); a0 += 8;
365       const int16x4x2_t va10 = vld2_dup_s16((const void*)a1);
366       const int16x4x2_t va11 = vld2_dup_s16((const void*)(a1 + 4)); a1 += 8;
367       const int16x4x2_t va20 = vld2_dup_s16((const void*)a2);
368       const int16x4x2_t va21 = vld2_dup_s16((const void*)(a2 + 4)); a2 += 8;
369       const int16x4x2_t va30 = vld2_dup_s16((const void*)a3);
370       const int16x4x2_t va31 = vld2_dup_s16((const void*)(a3 + 4)); a3 += 8;
371 
372       const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
373       const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
374       const int8x8_t vb89ABc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
375       const int8x8_t vbCDEFc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
376       const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
377       const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
378       const int8x8_t vb89ABc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
379       const int8x8_t vbCDEFc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
380       const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
381       const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
382       const int8x8_t vb89ABc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
383       const int8x8_t vbCDEFc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
384       const int8x8_t vb0123c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
385       const int8x8_t vb4567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
386       const int8x8_t vb89ABc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
387       const int8x8_t vbCDEFc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
388 
389       const int8x8_t va0c0 = vreinterpret_s8_s16(va00.val[0]);
390       const int8x8_t va1c0 = vreinterpret_s8_s16(va10.val[0]);
391       const int8x8_t va2c0 = vreinterpret_s8_s16(va20.val[0]);
392       const int8x8_t va3c0 = vreinterpret_s8_s16(va30.val[0]);
393 
394       const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, va0c0);
395       const int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0, va1c0);
396       const int16x8_t vprod2x0123c0 = vmull_s8(vb0123c0, va2c0);
397       const int16x8_t vprod3x0123c0 = vmull_s8(vb0123c0, va3c0);
398       vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
399       vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
400       vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c0);
401       vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c0);
402       const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, va0c0);
403       const int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0, va1c0);
404       const int16x8_t vprod2x4567c0 = vmull_s8(vb4567c0, va2c0);
405       const int16x8_t vprod3x4567c0 = vmull_s8(vb4567c0, va3c0);
406       vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
407       vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
408       vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c0);
409       vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c0);
410       const int16x8_t vprod0x89ABc0 = vmull_s8(vb89ABc0, va0c0);
411       const int16x8_t vprod1x89ABc0 = vmull_s8(vb89ABc0, va1c0);
412       const int16x8_t vprod2x89ABc0 = vmull_s8(vb89ABc0, va2c0);
413       const int16x8_t vprod3x89ABc0 = vmull_s8(vb89ABc0, va3c0);
414       vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0);
415       vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc0);
416       vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc0);
417       vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc0);
418       const int16x8_t vprod0xCDEFc0 = vmull_s8(vbCDEFc0, va0c0);
419       const int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0, va1c0);
420       const int16x8_t vprod2xCDEFc0 = vmull_s8(vbCDEFc0, va2c0);
421       const int16x8_t vprod3xCDEFc0 = vmull_s8(vbCDEFc0, va3c0);
422       vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc0);
423       vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0);
424       vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc0);
425       vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc0);
426       const int8x8_t va0c1 = vreinterpret_s8_s16(va00.val[1]);
427       const int8x8_t va1c1 = vreinterpret_s8_s16(va10.val[1]);
428       const int8x8_t va2c1 = vreinterpret_s8_s16(va20.val[1]);
429       const int8x8_t va3c1 = vreinterpret_s8_s16(va30.val[1]);
430 
431       const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, va0c1);
432       const int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1, va1c1);
433       const int16x8_t vprod2x0123c1 = vmull_s8(vb0123c1, va2c1);
434       const int16x8_t vprod3x0123c1 = vmull_s8(vb0123c1, va3c1);
435       vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
436       vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
437       vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c1);
438       vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c1);
439       const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, va0c1);
440       const int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1, va1c1);
441       const int16x8_t vprod2x4567c1 = vmull_s8(vb4567c1, va2c1);
442       const int16x8_t vprod3x4567c1 = vmull_s8(vb4567c1, va3c1);
443       vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
444       vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
445       vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c1);
446       vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c1);
447       const int16x8_t vprod0x89ABc1 = vmull_s8(vb89ABc1, va0c1);
448       const int16x8_t vprod1x89ABc1 = vmull_s8(vb89ABc1, va1c1);
449       const int16x8_t vprod2x89ABc1 = vmull_s8(vb89ABc1, va2c1);
450       const int16x8_t vprod3x89ABc1 = vmull_s8(vb89ABc1, va3c1);
451       vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc1);
452       vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc1);
453       vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc1);
454       vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc1);
455       const int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1, va0c1);
456       const int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1, va1c1);
457       const int16x8_t vprod2xCDEFc1 = vmull_s8(vbCDEFc1, va2c1);
458       const int16x8_t vprod3xCDEFc1 = vmull_s8(vbCDEFc1, va3c1);
459       vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc1);
460       vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1);
461       vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc1);
462       vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc1);
463       const int8x8_t va0c2 = vreinterpret_s8_s16(va01.val[0]);
464       const int8x8_t va1c2 = vreinterpret_s8_s16(va11.val[0]);
465       const int8x8_t va2c2 = vreinterpret_s8_s16(va21.val[0]);
466       const int8x8_t va3c2 = vreinterpret_s8_s16(va31.val[0]);
467 
468       const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, va0c2);
469       const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, va1c2);
470       const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, va2c2);
471       const int16x8_t vprod3x0123c2 = vmull_s8(vb0123c2, va3c2);
472       vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
473       vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
474       vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c2);
475       vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c2);
476       const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, va0c2);
477       const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, va1c2);
478       const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, va2c2);
479       const int16x8_t vprod3x4567c2 = vmull_s8(vb4567c2, va3c2);
480       vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
481       vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
482       vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c2);
483       vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c2);
484       const int16x8_t vprod0x89ABc2 = vmull_s8(vb89ABc2, va0c2);
485       const int16x8_t vprod1x89ABc2 = vmull_s8(vb89ABc2, va1c2);
486       const int16x8_t vprod2x89ABc2 = vmull_s8(vb89ABc2, va2c2);
487       const int16x8_t vprod3x89ABc2 = vmull_s8(vb89ABc2, va3c2);
488       vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2);
489       vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc2);
490       vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc2);
491       vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc2);
492       const int16x8_t vprod0xCDEFc2 = vmull_s8(vbCDEFc2, va0c2);
493       const int16x8_t vprod1xCDEFc2 = vmull_s8(vbCDEFc2, va1c2);
494       const int16x8_t vprod2xCDEFc2 = vmull_s8(vbCDEFc2, va2c2);
495       const int16x8_t vprod3xCDEFc2 = vmull_s8(vbCDEFc2, va3c2);
496       vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc2);
497       vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc2);
498       vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc2);
499       vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc2);
500       const int8x8_t va0c3 = vreinterpret_s8_s16(va01.val[1]);
501       const int8x8_t va1c3 = vreinterpret_s8_s16(va11.val[1]);
502       const int8x8_t va2c3 = vreinterpret_s8_s16(va21.val[1]);
503       const int8x8_t va3c3 = vreinterpret_s8_s16(va31.val[1]);
504 
505       const int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3, va0c3);
506       const int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3, va1c3);
507       const int16x8_t vprod2x0123c3 = vmull_s8(vb0123c3, va2c3);
508       const int16x8_t vprod3x0123c3 = vmull_s8(vb0123c3, va3c3);
509       vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
510       vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
511       vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c3);
512       vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c3);
513       const int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3, va0c3);
514       const int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3, va1c3);
515       const int16x8_t vprod2x4567c3 = vmull_s8(vb4567c3, va2c3);
516       const int16x8_t vprod3x4567c3 = vmull_s8(vb4567c3, va3c3);
517       vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
518       vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
519       vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c3);
520       vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c3);
521       const int16x8_t vprod0x89ABc3 = vmull_s8(vb89ABc3, va0c3);
522       const int16x8_t vprod1x89ABc3 = vmull_s8(vb89ABc3, va1c3);
523       const int16x8_t vprod2x89ABc3 = vmull_s8(vb89ABc3, va2c3);
524       const int16x8_t vprod3x89ABc3 = vmull_s8(vb89ABc3, va3c3);
525       vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc3);
526       vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc3);
527       vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc3);
528       vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc3);
529       const int16x8_t vprod0xCDEFc3 = vmull_s8(vbCDEFc3, va0c3);
530       const int16x8_t vprod1xCDEFc3 = vmull_s8(vbCDEFc3, va1c3);
531       const int16x8_t vprod2xCDEFc3 = vmull_s8(vbCDEFc3, va2c3);
532       const int16x8_t vprod3xCDEFc3 = vmull_s8(vbCDEFc3, va3c3);
533       vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc3);
534       vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc3);
535       vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc3);
536       vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc3);
537 
538       k -= 8 * sizeof(int8_t);
539     }
540 
541     if XNN_UNLIKELY(k != 0) {
542       const int8x8_t va0 = vld1_s8(a0); a0 = (const int8_t*) ((uintptr_t) a0 + k);
543       const int8x8_t va1 = vld1_s8(a1); a1 = (const int8_t*) ((uintptr_t) a1 + k);
544       const int8x8_t va2 = vld1_s8(a2); a2 = (const int8_t*) ((uintptr_t) a2 + k);
545       const int8x8_t va3 = vld1_s8(a3); a3 = (const int8_t*) ((uintptr_t) a3 + k);
546 
547       const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
548       const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
549       const int8x8_t vb89ABc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
550       const int8x8_t vbCDEFc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
551 
552       const int8x8_t va0c0 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0));
553       const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, va0c0);
554       vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
555       const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, va0c0);
556       vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
557       const int16x8_t vprod0x89ABc0 = vmull_s8(vb89ABc0, va0c0);
558       vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0);
559       const int16x8_t vprod0xCDEFc0 = vmull_s8(vbCDEFc0, va0c0);
560       vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc0);
561       const int8x8_t va1c0 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0));
562       const int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0, va1c0);
563       vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
564       const int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0, va1c0);
565       vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
566       const int16x8_t vprod1x89ABc0 = vmull_s8(vb89ABc0, va1c0);
567       vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc0);
568       const int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0, va1c0);
569       vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0);
570       const int8x8_t va2c0 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0));
571       const int16x8_t vprod2x0123c0 = vmull_s8(vb0123c0, va2c0);
572       vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c0);
573       const int16x8_t vprod2x4567c0 = vmull_s8(vb4567c0, va2c0);
574       vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c0);
575       const int16x8_t vprod2x89ABc0 = vmull_s8(vb89ABc0, va2c0);
576       vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc0);
577       const int16x8_t vprod2xCDEFc0 = vmull_s8(vbCDEFc0, va2c0);
578       vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc0);
579       const int8x8_t va3c0 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 0));
580       const int16x8_t vprod3x0123c0 = vmull_s8(vb0123c0, va3c0);
581       vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c0);
582       const int16x8_t vprod3x4567c0 = vmull_s8(vb4567c0, va3c0);
583       vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c0);
584       const int16x8_t vprod3x89ABc0 = vmull_s8(vb89ABc0, va3c0);
585       vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc0);
586       const int16x8_t vprod3xCDEFc0 = vmull_s8(vbCDEFc0, va3c0);
587       vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc0);
588 
589       if (k > 2 * sizeof(int8_t)) {
590         const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
591         const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
592         const int8x8_t vb89ABc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
593         const int8x8_t vbCDEFc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
594 
595         const int8x8_t va0c1 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1));
596         const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, va0c1);
597         vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
598         const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, va0c1);
599         vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
600         const int16x8_t vprod0x89ABc1 = vmull_s8(vb89ABc1, va0c1);
601         vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc1);
602         const int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1, va0c1);
603         vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc1);
604         const int8x8_t va1c1 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1));
605         const int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1, va1c1);
606         vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
607         const int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1, va1c1);
608         vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
609         const int16x8_t vprod1x89ABc1 = vmull_s8(vb89ABc1, va1c1);
610         vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc1);
611         const int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1, va1c1);
612         vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1);
613         const int8x8_t va2c1 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1));
614         const int16x8_t vprod2x0123c1 = vmull_s8(vb0123c1, va2c1);
615         vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c1);
616         const int16x8_t vprod2x4567c1 = vmull_s8(vb4567c1, va2c1);
617         vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c1);
618         const int16x8_t vprod2x89ABc1 = vmull_s8(vb89ABc1, va2c1);
619         vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc1);
620         const int16x8_t vprod2xCDEFc1 = vmull_s8(vbCDEFc1, va2c1);
621         vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc1);
622         const int8x8_t va3c1 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 1));
623         const int16x8_t vprod3x0123c1 = vmull_s8(vb0123c1, va3c1);
624         vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c1);
625         const int16x8_t vprod3x4567c1 = vmull_s8(vb4567c1, va3c1);
626         vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c1);
627         const int16x8_t vprod3x89ABc1 = vmull_s8(vb89ABc1, va3c1);
628         vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc1);
629         const int16x8_t vprod3xCDEFc1 = vmull_s8(vbCDEFc1, va3c1);
630         vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc1);
631 
632         if (k > 4 * sizeof(int8_t)) {
633           const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
634           const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
635           const int8x8_t vb89ABc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
636           const int8x8_t vbCDEFc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
637 
638           const int8x8_t va0c2 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2));
639           const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, va0c2);
640           vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
641           const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, va0c2);
642           vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
643           const int16x8_t vprod0x89ABc2 = vmull_s8(vb89ABc2, va0c2);
644           vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2);
645           const int16x8_t vprod0xCDEFc2 = vmull_s8(vbCDEFc2, va0c2);
646           vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc2);
647           const int8x8_t va1c2 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2));
648           const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, va1c2);
649           vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
650           const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, va1c2);
651           vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
652           const int16x8_t vprod1x89ABc2 = vmull_s8(vb89ABc2, va1c2);
653           vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc2);
654           const int16x8_t vprod1xCDEFc2 = vmull_s8(vbCDEFc2, va1c2);
655           vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc2);
656           const int8x8_t va2c2 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2));
657           const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, va2c2);
658           vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c2);
659           const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, va2c2);
660           vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c2);
661           const int16x8_t vprod2x89ABc2 = vmull_s8(vb89ABc2, va2c2);
662           vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc2);
663           const int16x8_t vprod2xCDEFc2 = vmull_s8(vbCDEFc2, va2c2);
664           vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc2);
665           const int8x8_t va3c2 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 2));
666           const int16x8_t vprod3x0123c2 = vmull_s8(vb0123c2, va3c2);
667           vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c2);
668           const int16x8_t vprod3x4567c2 = vmull_s8(vb4567c2, va3c2);
669           vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c2);
670           const int16x8_t vprod3x89ABc2 = vmull_s8(vb89ABc2, va3c2);
671           vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc2);
672           const int16x8_t vprod3xCDEFc2 = vmull_s8(vbCDEFc2, va3c2);
673           vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc2);
674         }
675       }
676     }
677 
678     const int32x4_t vright_pre_shift = vld1q_dup_s32(&params->rndnu_neon.right_pre_shift);
679     const int32x4_t vmultiplier = vld1q_dup_s32(&params->rndnu_neon.multiplier);
680     const int32x4_t vright_post_shift = vld1q_dup_s32(&params->rndnu_neon.right_post_shift);
681 
682     vacc0x0123 = vqshlq_s32(vacc0x0123, vright_pre_shift);
683     vacc0x4567 = vqshlq_s32(vacc0x4567, vright_pre_shift);
684     vacc0x89AB = vqshlq_s32(vacc0x89AB, vright_pre_shift);
685     vacc0xCDEF = vqshlq_s32(vacc0xCDEF, vright_pre_shift);
686     vacc1x0123 = vqshlq_s32(vacc1x0123, vright_pre_shift);
687     vacc1x4567 = vqshlq_s32(vacc1x4567, vright_pre_shift);
688     vacc1x89AB = vqshlq_s32(vacc1x89AB, vright_pre_shift);
689     vacc1xCDEF = vqshlq_s32(vacc1xCDEF, vright_pre_shift);
690     vacc2x0123 = vqshlq_s32(vacc2x0123, vright_pre_shift);
691     vacc2x4567 = vqshlq_s32(vacc2x4567, vright_pre_shift);
692     vacc2x89AB = vqshlq_s32(vacc2x89AB, vright_pre_shift);
693     vacc2xCDEF = vqshlq_s32(vacc2xCDEF, vright_pre_shift);
694     vacc3x0123 = vqshlq_s32(vacc3x0123, vright_pre_shift);
695     vacc3x4567 = vqshlq_s32(vacc3x4567, vright_pre_shift);
696     vacc3x89AB = vqshlq_s32(vacc3x89AB, vright_pre_shift);
697     vacc3xCDEF = vqshlq_s32(vacc3xCDEF, vright_pre_shift);
698 
699     vacc0x0123 = vqdmulhq_s32(vacc0x0123, vmultiplier);
700     vacc0x4567 = vqdmulhq_s32(vacc0x4567, vmultiplier);
701     vacc0x89AB = vqdmulhq_s32(vacc0x89AB, vmultiplier);
702     vacc0xCDEF = vqdmulhq_s32(vacc0xCDEF, vmultiplier);
703     vacc1x0123 = vqdmulhq_s32(vacc1x0123, vmultiplier);
704     vacc1x4567 = vqdmulhq_s32(vacc1x4567, vmultiplier);
705     vacc1x89AB = vqdmulhq_s32(vacc1x89AB, vmultiplier);
706     vacc1xCDEF = vqdmulhq_s32(vacc1xCDEF, vmultiplier);
707     vacc2x0123 = vqdmulhq_s32(vacc2x0123, vmultiplier);
708     vacc2x4567 = vqdmulhq_s32(vacc2x4567, vmultiplier);
709     vacc2x89AB = vqdmulhq_s32(vacc2x89AB, vmultiplier);
710     vacc2xCDEF = vqdmulhq_s32(vacc2xCDEF, vmultiplier);
711     vacc3x0123 = vqdmulhq_s32(vacc3x0123, vmultiplier);
712     vacc3x4567 = vqdmulhq_s32(vacc3x4567, vmultiplier);
713     vacc3x89AB = vqdmulhq_s32(vacc3x89AB, vmultiplier);
714     vacc3xCDEF = vqdmulhq_s32(vacc3xCDEF, vmultiplier);
715 
716     vacc0x0123 = vrshlq_s32(vacc0x0123, vright_post_shift);
717     vacc0x4567 = vrshlq_s32(vacc0x4567, vright_post_shift);
718     vacc0x89AB = vrshlq_s32(vacc0x89AB, vright_post_shift);
719     vacc0xCDEF = vrshlq_s32(vacc0xCDEF, vright_post_shift);
720     vacc1x0123 = vrshlq_s32(vacc1x0123, vright_post_shift);
721     vacc1x4567 = vrshlq_s32(vacc1x4567, vright_post_shift);
722     vacc1x89AB = vrshlq_s32(vacc1x89AB, vright_post_shift);
723     vacc1xCDEF = vrshlq_s32(vacc1xCDEF, vright_post_shift);
724     vacc2x0123 = vrshlq_s32(vacc2x0123, vright_post_shift);
725     vacc2x4567 = vrshlq_s32(vacc2x4567, vright_post_shift);
726     vacc2x89AB = vrshlq_s32(vacc2x89AB, vright_post_shift);
727     vacc2xCDEF = vrshlq_s32(vacc2xCDEF, vright_post_shift);
728     vacc3x0123 = vrshlq_s32(vacc3x0123, vright_post_shift);
729     vacc3x4567 = vrshlq_s32(vacc3x4567, vright_post_shift);
730     vacc3x89AB = vrshlq_s32(vacc3x89AB, vright_post_shift);
731     vacc3xCDEF = vrshlq_s32(vacc3xCDEF, vright_post_shift);
732 
733     const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->rndnu_neon.output_zero_point);
734 #if XNN_ARCH_ARM64
735     int16x8_t vacc0x01234567 = vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567);
736     int16x8_t vacc0x89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc0x89AB), vacc0xCDEF);
737     int16x8_t vacc1x01234567 = vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567);
738     int16x8_t vacc1x89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc1x89AB), vacc1xCDEF);
739     int16x8_t vacc2x01234567 = vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567);
740     int16x8_t vacc2x89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc2x89AB), vacc2xCDEF);
741     int16x8_t vacc3x01234567 = vqmovn_high_s32(vqmovn_s32(vacc3x0123), vacc3x4567);
742     int16x8_t vacc3x89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc3x89AB), vacc3xCDEF);
743 
744     vacc0x01234567 = vqaddq_s16(vacc0x01234567, voutput_zero_point);
745     vacc0x89ABCDEF = vqaddq_s16(vacc0x89ABCDEF, voutput_zero_point);
746     vacc1x01234567 = vqaddq_s16(vacc1x01234567, voutput_zero_point);
747     vacc1x89ABCDEF = vqaddq_s16(vacc1x89ABCDEF, voutput_zero_point);
748     vacc2x01234567 = vqaddq_s16(vacc2x01234567, voutput_zero_point);
749     vacc2x89ABCDEF = vqaddq_s16(vacc2x89ABCDEF, voutput_zero_point);
750     vacc3x01234567 = vqaddq_s16(vacc3x01234567, voutput_zero_point);
751     vacc3x89ABCDEF = vqaddq_s16(vacc3x89ABCDEF, voutput_zero_point);
752 
753     int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF);
754     int8x16_t vout1x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc1x01234567), vacc1x89ABCDEF);
755     int8x16_t vout2x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc2x01234567), vacc2x89ABCDEF);
756     int8x16_t vout3x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc3x01234567), vacc3x89ABCDEF);
757 #else
758     int16x8_t vacc0x01234567 = vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567));
759     int16x8_t vacc0x89ABCDEF = vcombine_s16(vqmovn_s32(vacc0x89AB), vqmovn_s32(vacc0xCDEF));
760     int16x8_t vacc1x01234567 = vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567));
761     int16x8_t vacc1x89ABCDEF = vcombine_s16(vqmovn_s32(vacc1x89AB), vqmovn_s32(vacc1xCDEF));
762     int16x8_t vacc2x01234567 = vcombine_s16(vqmovn_s32(vacc2x0123), vqmovn_s32(vacc2x4567));
763     int16x8_t vacc2x89ABCDEF = vcombine_s16(vqmovn_s32(vacc2x89AB), vqmovn_s32(vacc2xCDEF));
764     int16x8_t vacc3x01234567 = vcombine_s16(vqmovn_s32(vacc3x0123), vqmovn_s32(vacc3x4567));
765     int16x8_t vacc3x89ABCDEF = vcombine_s16(vqmovn_s32(vacc3x89AB), vqmovn_s32(vacc3xCDEF));
766 
767     vacc0x01234567 = vqaddq_s16(vacc0x01234567, voutput_zero_point);
768     vacc0x89ABCDEF = vqaddq_s16(vacc0x89ABCDEF, voutput_zero_point);
769     vacc1x01234567 = vqaddq_s16(vacc1x01234567, voutput_zero_point);
770     vacc1x89ABCDEF = vqaddq_s16(vacc1x89ABCDEF, voutput_zero_point);
771     vacc2x01234567 = vqaddq_s16(vacc2x01234567, voutput_zero_point);
772     vacc2x89ABCDEF = vqaddq_s16(vacc2x89ABCDEF, voutput_zero_point);
773     vacc3x01234567 = vqaddq_s16(vacc3x01234567, voutput_zero_point);
774     vacc3x89ABCDEF = vqaddq_s16(vacc3x89ABCDEF, voutput_zero_point);
775 
776     int8x16_t vout0x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc0x89ABCDEF));
777     int8x16_t vout1x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc1x01234567), vqmovn_s16(vacc1x89ABCDEF));
778     int8x16_t vout2x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc2x01234567), vqmovn_s16(vacc2x89ABCDEF));
779     int8x16_t vout3x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc3x01234567), vqmovn_s16(vacc3x89ABCDEF));
780 #endif
781 
782     const int8x16_t voutput_min = vld1q_dup_s8(&params->rndnu_neon.output_min);
783     vout0x0123456789ABCDEF = vmaxq_s8(vout0x0123456789ABCDEF, voutput_min);
784     vout1x0123456789ABCDEF = vmaxq_s8(vout1x0123456789ABCDEF, voutput_min);
785     vout2x0123456789ABCDEF = vmaxq_s8(vout2x0123456789ABCDEF, voutput_min);
786     vout3x0123456789ABCDEF = vmaxq_s8(vout3x0123456789ABCDEF, voutput_min);
787 
788     const int8x16_t voutput_max = vld1q_dup_s8(&params->rndnu_neon.output_max);
789     vout0x0123456789ABCDEF = vminq_s8(vout0x0123456789ABCDEF, voutput_max);
790     vout1x0123456789ABCDEF = vminq_s8(vout1x0123456789ABCDEF, voutput_max);
791     vout2x0123456789ABCDEF = vminq_s8(vout2x0123456789ABCDEF, voutput_max);
792     vout3x0123456789ABCDEF = vminq_s8(vout3x0123456789ABCDEF, voutput_max);
793 
794     if (nc >= 16) {
795       vst1q_s8(c0 + 0, vout0x0123456789ABCDEF);
796       vst1q_s8(c1 + 0, vout1x0123456789ABCDEF);
797       vst1q_s8(c2 + 0, vout2x0123456789ABCDEF);
798       vst1q_s8(c3 + 0, vout3x0123456789ABCDEF);
799 
800       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
801       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
802       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
803       c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
804 
805       a0 = (const int8_t*) ((uintptr_t) a0 - kc);
806       a1 = (const int8_t*) ((uintptr_t) a1 - kc);
807       a2 = (const int8_t*) ((uintptr_t) a2 - kc);
808       a3 = (const int8_t*) ((uintptr_t) a3 - kc);
809 
810       nc -= 16;
811     } else {
812       // Final case where not all of the 16 columns fit in the destination.
813       int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vget_low_s8(vout0x0123456789ABCDEF), vget_low_s8(vout1x0123456789ABCDEF));
814       int8x16_t vout2x01234567_3x01234567 = vcombine_s8(vget_low_s8(vout2x0123456789ABCDEF), vget_low_s8(vout3x0123456789ABCDEF));
815       if (nc & 8) {
816         vst1_s8(c0, vget_low_s8(vout0x01234567_1x01234567)); c0 += 8;
817         vst1_s8(c1, vget_high_s8(vout0x01234567_1x01234567)); c1 += 8;
818         vst1_s8(c2, vget_low_s8(vout2x01234567_3x01234567)); c2 += 8;
819         vst1_s8(c3, vget_high_s8(vout2x01234567_3x01234567)); c3 += 8;
820         vout0x01234567_1x01234567 = vcombine_s8(vget_high_s8(vout0x0123456789ABCDEF), vget_high_s8(vout1x0123456789ABCDEF));
821         vout2x01234567_3x01234567 = vcombine_s8(vget_high_s8(vout2x0123456789ABCDEF), vget_high_s8(vout3x0123456789ABCDEF));
822       }
823       if (nc & 4) {
824         vst1q_lane_u32((void*) c0, vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
825         vst1q_lane_u32((void*) c1, vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
826         vst1q_lane_u32((void*) c2, vreinterpretq_u32_s8(vout2x01234567_3x01234567), 0); c2 += 4;
827         vst1q_lane_u32((void*) c3, vreinterpretq_u32_s8(vout2x01234567_3x01234567), 2); c3 += 4;
828         vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
829         vout2x01234567_3x01234567 = vextq_s8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 4);
830       }
831       if (nc & 2) {
832         vst1q_lane_u16((void*) c0, vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
833         vst1q_lane_u16((void*) c1, vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
834         vst1q_lane_u16((void*) c2, vreinterpretq_u16_s8(vout2x01234567_3x01234567), 0); c2 += 2;
835         vst1q_lane_u16((void*) c3, vreinterpretq_u16_s8(vout2x01234567_3x01234567), 4); c3 += 2;
836         vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
837         vout2x01234567_3x01234567 = vextq_s8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 2);
838       }
839       if (nc & 1) {
840         vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
841         vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
842         vst1q_lane_s8(c2, vout2x01234567_3x01234567, 0);
843         vst1q_lane_s8(c3, vout2x01234567_3x01234567, 8);
844       }
845 
846       nc = 0;
847     }
848   } while (nc != 0);
849 }
850