// Auto-generated file. Do not edit!
//   Template: src/qu8-igemm/c4-neondot.c.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>

#include <arm_neon.h>

#include <xnnpack/igemm.h>
#include <xnnpack/math.h>


void xnn_qu8_igemm_minmax_rndnu_ukernel_8x16c4__neondot(
    size_t mr,
    size_t nc,
    size_t kc,
    size_t ks,
    const uint8_t** restrict a,
    const void* restrict w,
    uint8_t* restrict c,
    size_t cm_stride,
    size_t cn_stride,
    size_t a_offset,
    const uint8_t* zero,
    const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(mr != 0);
  assert(mr <= 8);
  assert(nc != 0);
  assert(kc != 0);
  assert(ks != 0);
  assert(ks % (8 * sizeof(void*)) == 0);
  assert(a_offset % sizeof(uint8_t) == 0);
  assert(a != NULL);
  assert(w != NULL);
  assert(c != NULL);

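  // Round kc up to a multiple of 4 bytes: the c4 weight layout packs the K
  // dimension in groups of 4 so that each dot-product instruction consumes
  // 4 bytes per lane.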
  kc = round_up_po2(kc, 4 * sizeof(uint8_t));
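  // Set up the 8 output row pointers. When mr < 8, pointers for the unused
  // rows are clamped to the previous row, so their stores alias an already
  // valid row instead of writing out of bounds.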
  uint8_t* c0 = c;
  uint8_t* c1 = (uint8_t*) ((uintptr_t) c0 + cm_stride);
  if XNN_UNPREDICTABLE(mr < 2) {
    c1 = c0;
  }
  uint8_t* c2 = (uint8_t*) ((uintptr_t) c1 + cm_stride);
  if XNN_UNPREDICTABLE(mr <= 2) {
    c2 = c1;
  }
  uint8_t* c3 = (uint8_t*) ((uintptr_t) c2 + cm_stride);
  if XNN_UNPREDICTABLE(mr < 4) {
    c3 = c2;
  }
  uint8_t* c4 = (uint8_t*) ((uintptr_t) c3 + cm_stride);
  if XNN_UNPREDICTABLE(mr <= 4) {
    c4 = c3;
  }
  uint8_t* c5 = (uint8_t*) ((uintptr_t) c4 + cm_stride);
  if XNN_UNPREDICTABLE(mr < 6) {
    c5 = c4;
  }
  uint8_t* c6 = (uint8_t*) ((uintptr_t) c5 + cm_stride);
  if XNN_UNPREDICTABLE(mr <= 6) {
    c6 = c5;
  }
  uint8_t* c7 = (uint8_t*) ((uintptr_t) c6 + cm_stride);
  if XNN_UNPREDICTABLE(mr != 8) {
    c7 = c6;
  }

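  // Note: despite its name, va_zero_point holds the *kernel* zero point,
  // broadcast to all 8 lanes. Dotting it with the activations builds the
  // correction term kernel_zero_point * sum(a), using the identity
  //   sum(a * (b - kzp)) == sum(a * b) - kzp * sum(a).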
  const uint8x8_t va_zero_point = vld1_dup_u8(&params->rndnu_neon.kernel_zero_point[0]);

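  // Outer loop over blocks of 16 output columns; each iteration computes an
  // 8x16 output tile.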
  do {
    // Initialize accumulators with bias. 16 bias values are loaded from the
    // weight matrix, at the start of the group of 16 columns.
    uint32x4_t vpacc0x0123 = vld1q_u32(w); w = (const void*) ((const uint32_t*) w + 4);
    uint32x4_t vpacc0x4567 = vld1q_u32(w); w = (const void*) ((const uint32_t*) w + 4);
    uint32x4_t vpacc0x89AB = vld1q_u32(w); w = (const void*) ((const uint32_t*) w + 4);
    uint32x4_t vpacc0xCDEF = vld1q_u32(w); w = (const void*) ((const uint32_t*) w + 4);
    uint32x4_t vpacc1x0123 = vpacc0x0123;
    uint32x4_t vpacc1x4567 = vpacc0x4567;
    uint32x4_t vpacc1x89AB = vpacc0x89AB;
    uint32x4_t vpacc1xCDEF = vpacc0xCDEF;
    uint32x4_t vpacc2x0123 = vpacc0x0123;
    uint32x4_t vpacc2x4567 = vpacc0x4567;
    uint32x4_t vpacc2x89AB = vpacc0x89AB;
    uint32x4_t vpacc2xCDEF = vpacc0xCDEF;
    uint32x4_t vpacc3x0123 = vpacc0x0123;
    uint32x4_t vpacc3x4567 = vpacc0x4567;
    uint32x4_t vpacc3x89AB = vpacc0x89AB;
    uint32x4_t vpacc3xCDEF = vpacc0xCDEF;
    uint32x4_t vpacc4x0123 = vpacc0x0123;
    uint32x4_t vpacc4x4567 = vpacc0x4567;
    uint32x4_t vpacc4x89AB = vpacc0x89AB;
    uint32x4_t vpacc4xCDEF = vpacc0xCDEF;
    uint32x4_t vpacc5x0123 = vpacc0x0123;
    uint32x4_t vpacc5x4567 = vpacc0x4567;
    uint32x4_t vpacc5x89AB = vpacc0x89AB;
    uint32x4_t vpacc5xCDEF = vpacc0xCDEF;
    uint32x4_t vpacc6x0123 = vpacc0x0123;
    uint32x4_t vpacc6x4567 = vpacc0x4567;
    uint32x4_t vpacc6x89AB = vpacc0x89AB;
    uint32x4_t vpacc6xCDEF = vpacc0xCDEF;
    uint32x4_t vpacc7x0123 = vpacc0x0123;
    uint32x4_t vpacc7x4567 = vpacc0x4567;
    uint32x4_t vpacc7x89AB = vpacc0x89AB;
    uint32x4_t vpacc7xCDEF = vpacc0xCDEF;
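    // vnacc0..vnacc7 accumulate the per-row correction term
    // kernel_zero_point * sum(a); it is subtracted from the packed
    // accumulators once the K loop is done.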
    uint32x2_t vnacc0 = vmov_n_u32(0);
    uint32x2_t vnacc1 = vmov_n_u32(0);
    uint32x2_t vnacc2 = vmov_n_u32(0);
    uint32x2_t vnacc3 = vmov_n_u32(0);
    uint32x2_t vnacc4 = vmov_n_u32(0);
    uint32x2_t vnacc5 = vmov_n_u32(0);
    uint32x2_t vnacc6 = vmov_n_u32(0);
    uint32x2_t vnacc7 = vmov_n_u32(0);

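    // Loop over the ks convolution taps. `a` is an indirection buffer holding
    // 8 row pointers per tap; entries equal to `zero` select the shared
    // padding buffer and are deliberately not offset by a_offset.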
    size_t p = ks;
    do {
      const uint8_t* restrict a0 = a[0];
      if XNN_UNPREDICTABLE(a0 != zero) {
        a0 = (const uint8_t*) ((uintptr_t) a0 + a_offset);
      }
      const uint8_t* restrict a1 = a[1];
      if XNN_UNPREDICTABLE(a1 != zero) {
        a1 = (const uint8_t*) ((uintptr_t) a1 + a_offset);
      }
      const uint8_t* restrict a2 = a[2];
      if XNN_UNPREDICTABLE(a2 != zero) {
        a2 = (const uint8_t*) ((uintptr_t) a2 + a_offset);
      }
      const uint8_t* restrict a3 = a[3];
      if XNN_UNPREDICTABLE(a3 != zero) {
        a3 = (const uint8_t*) ((uintptr_t) a3 + a_offset);
      }
      const uint8_t* restrict a4 = a[4];
      if XNN_UNPREDICTABLE(a4 != zero) {
        a4 = (const uint8_t*) ((uintptr_t) a4 + a_offset);
      }
      const uint8_t* restrict a5 = a[5];
      if XNN_UNPREDICTABLE(a5 != zero) {
        a5 = (const uint8_t*) ((uintptr_t) a5 + a_offset);
      }
      const uint8_t* restrict a6 = a[6];
      if XNN_UNPREDICTABLE(a6 != zero) {
        a6 = (const uint8_t*) ((uintptr_t) a6 + a_offset);
      }
      const uint8_t* restrict a7 = a[7];
      if XNN_UNPREDICTABLE(a7 != zero) {
        a7 = (const uint8_t*) ((uintptr_t) a7 + a_offset);
      }
      a += 8;

      // Inner accumulation loop along the K dimension (kc), accumulating
      // into the 16 output columns.
      size_t k = kc;
      // 2x partially unrolled loop, loading 8 bytes of activations at a time.
      while (k >= 8 * sizeof(uint8_t)) {
        // Load an 8x8 block of activations.
        const uint8x8_t va0x01234567 = vld1_u8(a0); a0 += 8;
        const uint8x8_t va1x01234567 = vld1_u8(a1); a1 += 8;
        const uint8x8_t va2x01234567 = vld1_u8(a2); a2 += 8;
        const uint8x8_t va3x01234567 = vld1_u8(a3); a3 += 8;
        const uint8x8_t va4x01234567 = vld1_u8(a4); a4 += 8;
        const uint8x8_t va5x01234567 = vld1_u8(a5); a5 += 8;
        const uint8x8_t va6x01234567 = vld1_u8(a6); a6 += 8;
        const uint8x8_t va7x01234567 = vld1_u8(a7); a7 += 8;

        // Load an 8x16 block of weights.
        const uint8x16_t vb0123x0123 = vld1q_u8(w); w = (const void*) ((const uint8_t*) w + 16);
        const uint8x16_t vb0123x4567 = vld1q_u8(w); w = (const void*) ((const uint8_t*) w + 16);
        const uint8x16_t vb0123x89AB = vld1q_u8(w); w = (const void*) ((const uint8_t*) w + 16);
        const uint8x16_t vb0123xCDEF = vld1q_u8(w); w = (const void*) ((const uint8_t*) w + 16);
        const uint8x16_t vb4567x0123 = vld1q_u8(w); w = (const void*) ((const uint8_t*) w + 16);
        const uint8x16_t vb4567x4567 = vld1q_u8(w); w = (const void*) ((const uint8_t*) w + 16);
        const uint8x16_t vb4567x89AB = vld1q_u8(w); w = (const void*) ((const uint8_t*) w + 16);
        const uint8x16_t vb4567xCDEF = vld1q_u8(w); w = (const void*) ((const uint8_t*) w + 16);

        // Multiply-accumulate: 8x8 * 8x16 --> 8x16.
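        // Each vdotq_lane_u32 dots the selected 4-byte lane of the activation
        // vector against four 4-byte groups of a weight vector, accumulating
        // four uint32 sums: lane 0 covers a[0..3], lane 1 covers a[4..7].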
        vnacc0 = vdot_u32(vnacc0, va_zero_point, va0x01234567);
        vpacc0x0123 = vdotq_lane_u32(vpacc0x0123, vb0123x0123, va0x01234567, 0);
        vpacc0x4567 = vdotq_lane_u32(vpacc0x4567, vb0123x4567, va0x01234567, 0);
        vpacc0x89AB = vdotq_lane_u32(vpacc0x89AB, vb0123x89AB, va0x01234567, 0);
        vpacc0xCDEF = vdotq_lane_u32(vpacc0xCDEF, vb0123xCDEF, va0x01234567, 0);
        vpacc0x0123 = vdotq_lane_u32(vpacc0x0123, vb4567x0123, va0x01234567, 1);
        vpacc0x4567 = vdotq_lane_u32(vpacc0x4567, vb4567x4567, va0x01234567, 1);
        vpacc0x89AB = vdotq_lane_u32(vpacc0x89AB, vb4567x89AB, va0x01234567, 1);
        vpacc0xCDEF = vdotq_lane_u32(vpacc0xCDEF, vb4567xCDEF, va0x01234567, 1);
        vnacc1 = vdot_u32(vnacc1, va_zero_point, va1x01234567);
        vpacc1x0123 = vdotq_lane_u32(vpacc1x0123, vb0123x0123, va1x01234567, 0);
        vpacc1x4567 = vdotq_lane_u32(vpacc1x4567, vb0123x4567, va1x01234567, 0);
        vpacc1x89AB = vdotq_lane_u32(vpacc1x89AB, vb0123x89AB, va1x01234567, 0);
        vpacc1xCDEF = vdotq_lane_u32(vpacc1xCDEF, vb0123xCDEF, va1x01234567, 0);
        vpacc1x0123 = vdotq_lane_u32(vpacc1x0123, vb4567x0123, va1x01234567, 1);
        vpacc1x4567 = vdotq_lane_u32(vpacc1x4567, vb4567x4567, va1x01234567, 1);
        vpacc1x89AB = vdotq_lane_u32(vpacc1x89AB, vb4567x89AB, va1x01234567, 1);
        vpacc1xCDEF = vdotq_lane_u32(vpacc1xCDEF, vb4567xCDEF, va1x01234567, 1);
        vnacc2 = vdot_u32(vnacc2, va_zero_point, va2x01234567);
        vpacc2x0123 = vdotq_lane_u32(vpacc2x0123, vb0123x0123, va2x01234567, 0);
        vpacc2x4567 = vdotq_lane_u32(vpacc2x4567, vb0123x4567, va2x01234567, 0);
        vpacc2x89AB = vdotq_lane_u32(vpacc2x89AB, vb0123x89AB, va2x01234567, 0);
        vpacc2xCDEF = vdotq_lane_u32(vpacc2xCDEF, vb0123xCDEF, va2x01234567, 0);
        vpacc2x0123 = vdotq_lane_u32(vpacc2x0123, vb4567x0123, va2x01234567, 1);
        vpacc2x4567 = vdotq_lane_u32(vpacc2x4567, vb4567x4567, va2x01234567, 1);
        vpacc2x89AB = vdotq_lane_u32(vpacc2x89AB, vb4567x89AB, va2x01234567, 1);
        vpacc2xCDEF = vdotq_lane_u32(vpacc2xCDEF, vb4567xCDEF, va2x01234567, 1);
        vnacc3 = vdot_u32(vnacc3, va_zero_point, va3x01234567);
        vpacc3x0123 = vdotq_lane_u32(vpacc3x0123, vb0123x0123, va3x01234567, 0);
        vpacc3x4567 = vdotq_lane_u32(vpacc3x4567, vb0123x4567, va3x01234567, 0);
        vpacc3x89AB = vdotq_lane_u32(vpacc3x89AB, vb0123x89AB, va3x01234567, 0);
        vpacc3xCDEF = vdotq_lane_u32(vpacc3xCDEF, vb0123xCDEF, va3x01234567, 0);
        vpacc3x0123 = vdotq_lane_u32(vpacc3x0123, vb4567x0123, va3x01234567, 1);
        vpacc3x4567 = vdotq_lane_u32(vpacc3x4567, vb4567x4567, va3x01234567, 1);
        vpacc3x89AB = vdotq_lane_u32(vpacc3x89AB, vb4567x89AB, va3x01234567, 1);
        vpacc3xCDEF = vdotq_lane_u32(vpacc3xCDEF, vb4567xCDEF, va3x01234567, 1);
        vnacc4 = vdot_u32(vnacc4, va_zero_point, va4x01234567);
        vpacc4x0123 = vdotq_lane_u32(vpacc4x0123, vb0123x0123, va4x01234567, 0);
        vpacc4x4567 = vdotq_lane_u32(vpacc4x4567, vb0123x4567, va4x01234567, 0);
        vpacc4x89AB = vdotq_lane_u32(vpacc4x89AB, vb0123x89AB, va4x01234567, 0);
        vpacc4xCDEF = vdotq_lane_u32(vpacc4xCDEF, vb0123xCDEF, va4x01234567, 0);
        vpacc4x0123 = vdotq_lane_u32(vpacc4x0123, vb4567x0123, va4x01234567, 1);
        vpacc4x4567 = vdotq_lane_u32(vpacc4x4567, vb4567x4567, va4x01234567, 1);
        vpacc4x89AB = vdotq_lane_u32(vpacc4x89AB, vb4567x89AB, va4x01234567, 1);
        vpacc4xCDEF = vdotq_lane_u32(vpacc4xCDEF, vb4567xCDEF, va4x01234567, 1);
        vnacc5 = vdot_u32(vnacc5, va_zero_point, va5x01234567);
        vpacc5x0123 = vdotq_lane_u32(vpacc5x0123, vb0123x0123, va5x01234567, 0);
        vpacc5x4567 = vdotq_lane_u32(vpacc5x4567, vb0123x4567, va5x01234567, 0);
        vpacc5x89AB = vdotq_lane_u32(vpacc5x89AB, vb0123x89AB, va5x01234567, 0);
        vpacc5xCDEF = vdotq_lane_u32(vpacc5xCDEF, vb0123xCDEF, va5x01234567, 0);
        vpacc5x0123 = vdotq_lane_u32(vpacc5x0123, vb4567x0123, va5x01234567, 1);
        vpacc5x4567 = vdotq_lane_u32(vpacc5x4567, vb4567x4567, va5x01234567, 1);
        vpacc5x89AB = vdotq_lane_u32(vpacc5x89AB, vb4567x89AB, va5x01234567, 1);
        vpacc5xCDEF = vdotq_lane_u32(vpacc5xCDEF, vb4567xCDEF, va5x01234567, 1);
        vnacc6 = vdot_u32(vnacc6, va_zero_point, va6x01234567);
        vpacc6x0123 = vdotq_lane_u32(vpacc6x0123, vb0123x0123, va6x01234567, 0);
        vpacc6x4567 = vdotq_lane_u32(vpacc6x4567, vb0123x4567, va6x01234567, 0);
        vpacc6x89AB = vdotq_lane_u32(vpacc6x89AB, vb0123x89AB, va6x01234567, 0);
        vpacc6xCDEF = vdotq_lane_u32(vpacc6xCDEF, vb0123xCDEF, va6x01234567, 0);
        vpacc6x0123 = vdotq_lane_u32(vpacc6x0123, vb4567x0123, va6x01234567, 1);
        vpacc6x4567 = vdotq_lane_u32(vpacc6x4567, vb4567x4567, va6x01234567, 1);
        vpacc6x89AB = vdotq_lane_u32(vpacc6x89AB, vb4567x89AB, va6x01234567, 1);
        vpacc6xCDEF = vdotq_lane_u32(vpacc6xCDEF, vb4567xCDEF, va6x01234567, 1);
        vnacc7 = vdot_u32(vnacc7, va_zero_point, va7x01234567);
        vpacc7x0123 = vdotq_lane_u32(vpacc7x0123, vb0123x0123, va7x01234567, 0);
        vpacc7x4567 = vdotq_lane_u32(vpacc7x4567, vb0123x4567, va7x01234567, 0);
        vpacc7x89AB = vdotq_lane_u32(vpacc7x89AB, vb0123x89AB, va7x01234567, 0);
        vpacc7xCDEF = vdotq_lane_u32(vpacc7xCDEF, vb0123xCDEF, va7x01234567, 0);
        vpacc7x0123 = vdotq_lane_u32(vpacc7x0123, vb4567x0123, va7x01234567, 1);
        vpacc7x4567 = vdotq_lane_u32(vpacc7x4567, vb4567x4567, va7x01234567, 1);
        vpacc7x89AB = vdotq_lane_u32(vpacc7x89AB, vb4567x89AB, va7x01234567, 1);
        vpacc7xCDEF = vdotq_lane_u32(vpacc7xCDEF, vb4567xCDEF, va7x01234567, 1);

        k -= 8 * sizeof(uint8_t);
      }
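      // Because kc was rounded up to a multiple of 4, any remainder here is
      // exactly 4 bytes per row.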
      // Handle up to 4 final positions of `k`
      if XNN_UNLIKELY(k != 0) {
        // Load an 8x4 block of activations.
        const uint8x8_t va0x01234567 = vreinterpret_u8_u32(vld1_lane_u32((const void*) a0, vmov_n_u32(0), 0)); a0 += 4;
        const uint8x8_t va1x01234567 = vreinterpret_u8_u32(vld1_lane_u32((const void*) a1, vmov_n_u32(0), 0)); a1 += 4;
        const uint8x8_t va2x01234567 = vreinterpret_u8_u32(vld1_lane_u32((const void*) a2, vmov_n_u32(0), 0)); a2 += 4;
        const uint8x8_t va3x01234567 = vreinterpret_u8_u32(vld1_lane_u32((const void*) a3, vmov_n_u32(0), 0)); a3 += 4;
        const uint8x8_t va4x01234567 = vreinterpret_u8_u32(vld1_lane_u32((const void*) a4, vmov_n_u32(0), 0)); a4 += 4;
        const uint8x8_t va5x01234567 = vreinterpret_u8_u32(vld1_lane_u32((const void*) a5, vmov_n_u32(0), 0)); a5 += 4;
        const uint8x8_t va6x01234567 = vreinterpret_u8_u32(vld1_lane_u32((const void*) a6, vmov_n_u32(0), 0)); a6 += 4;
        const uint8x8_t va7x01234567 = vreinterpret_u8_u32(vld1_lane_u32((const void*) a7, vmov_n_u32(0), 0)); a7 += 4;

        // Load a 4x16 block of weights.
        const uint8x16_t vb0123x0123 = vld1q_u8(w); w = (const void*) ((const uint8_t*) w + 16);
        const uint8x16_t vb0123x4567 = vld1q_u8(w); w = (const void*) ((const uint8_t*) w + 16);
        const uint8x16_t vb0123x89AB = vld1q_u8(w); w = (const void*) ((const uint8_t*) w + 16);
        const uint8x16_t vb0123xCDEF = vld1q_u8(w); w = (const void*) ((const uint8_t*) w + 16);

        // Multiply-accumulate: 8x4 * 4x16 --> 8x16.
        vnacc0 = vdot_u32(vnacc0, va_zero_point, va0x01234567);
        vpacc0x0123 = vdotq_lane_u32(vpacc0x0123, vb0123x0123, va0x01234567, 0);
        vpacc0x4567 = vdotq_lane_u32(vpacc0x4567, vb0123x4567, va0x01234567, 0);
        vpacc0x89AB = vdotq_lane_u32(vpacc0x89AB, vb0123x89AB, va0x01234567, 0);
        vpacc0xCDEF = vdotq_lane_u32(vpacc0xCDEF, vb0123xCDEF, va0x01234567, 0);
        vnacc1 = vdot_u32(vnacc1, va_zero_point, va1x01234567);
        vpacc1x0123 = vdotq_lane_u32(vpacc1x0123, vb0123x0123, va1x01234567, 0);
        vpacc1x4567 = vdotq_lane_u32(vpacc1x4567, vb0123x4567, va1x01234567, 0);
        vpacc1x89AB = vdotq_lane_u32(vpacc1x89AB, vb0123x89AB, va1x01234567, 0);
        vpacc1xCDEF = vdotq_lane_u32(vpacc1xCDEF, vb0123xCDEF, va1x01234567, 0);
        vnacc2 = vdot_u32(vnacc2, va_zero_point, va2x01234567);
        vpacc2x0123 = vdotq_lane_u32(vpacc2x0123, vb0123x0123, va2x01234567, 0);
        vpacc2x4567 = vdotq_lane_u32(vpacc2x4567, vb0123x4567, va2x01234567, 0);
        vpacc2x89AB = vdotq_lane_u32(vpacc2x89AB, vb0123x89AB, va2x01234567, 0);
        vpacc2xCDEF = vdotq_lane_u32(vpacc2xCDEF, vb0123xCDEF, va2x01234567, 0);
        vnacc3 = vdot_u32(vnacc3, va_zero_point, va3x01234567);
        vpacc3x0123 = vdotq_lane_u32(vpacc3x0123, vb0123x0123, va3x01234567, 0);
        vpacc3x4567 = vdotq_lane_u32(vpacc3x4567, vb0123x4567, va3x01234567, 0);
        vpacc3x89AB = vdotq_lane_u32(vpacc3x89AB, vb0123x89AB, va3x01234567, 0);
        vpacc3xCDEF = vdotq_lane_u32(vpacc3xCDEF, vb0123xCDEF, va3x01234567, 0);
        vnacc4 = vdot_u32(vnacc4, va_zero_point, va4x01234567);
        vpacc4x0123 = vdotq_lane_u32(vpacc4x0123, vb0123x0123, va4x01234567, 0);
        vpacc4x4567 = vdotq_lane_u32(vpacc4x4567, vb0123x4567, va4x01234567, 0);
        vpacc4x89AB = vdotq_lane_u32(vpacc4x89AB, vb0123x89AB, va4x01234567, 0);
        vpacc4xCDEF = vdotq_lane_u32(vpacc4xCDEF, vb0123xCDEF, va4x01234567, 0);
        vnacc5 = vdot_u32(vnacc5, va_zero_point, va5x01234567);
        vpacc5x0123 = vdotq_lane_u32(vpacc5x0123, vb0123x0123, va5x01234567, 0);
        vpacc5x4567 = vdotq_lane_u32(vpacc5x4567, vb0123x4567, va5x01234567, 0);
        vpacc5x89AB = vdotq_lane_u32(vpacc5x89AB, vb0123x89AB, va5x01234567, 0);
        vpacc5xCDEF = vdotq_lane_u32(vpacc5xCDEF, vb0123xCDEF, va5x01234567, 0);
        vnacc6 = vdot_u32(vnacc6, va_zero_point, va6x01234567);
        vpacc6x0123 = vdotq_lane_u32(vpacc6x0123, vb0123x0123, va6x01234567, 0);
        vpacc6x4567 = vdotq_lane_u32(vpacc6x4567, vb0123x4567, va6x01234567, 0);
        vpacc6x89AB = vdotq_lane_u32(vpacc6x89AB, vb0123x89AB, va6x01234567, 0);
        vpacc6xCDEF = vdotq_lane_u32(vpacc6xCDEF, vb0123xCDEF, va6x01234567, 0);
        vnacc7 = vdot_u32(vnacc7, va_zero_point, va7x01234567);
        vpacc7x0123 = vdotq_lane_u32(vpacc7x0123, vb0123x0123, va7x01234567, 0);
        vpacc7x4567 = vdotq_lane_u32(vpacc7x4567, vb0123x4567, va7x01234567, 0);
        vpacc7x89AB = vdotq_lane_u32(vpacc7x89AB, vb0123x89AB, va7x01234567, 0);
        vpacc7xCDEF = vdotq_lane_u32(vpacc7xCDEF, vb0123xCDEF, va7x01234567, 0);
      }
      p -= 8 * sizeof(void*);
    } while (p != 0);

    // Subtract the zero-point correction (kernel_zero_point * sum(a)) from the accumulators.
    vnacc0 = vpadd_u32(vnacc0, vnacc0);
    const uint32x4_t vnacc0x0123 = vcombine_u32(vnacc0, vnacc0);
    int32x4_t vacc0x0123 = vreinterpretq_s32_u32(vsubq_u32(vpacc0x0123, vnacc0x0123));
    int32x4_t vacc0x4567 = vreinterpretq_s32_u32(vsubq_u32(vpacc0x4567, vnacc0x0123));
    int32x4_t vacc0x89AB = vreinterpretq_s32_u32(vsubq_u32(vpacc0x89AB, vnacc0x0123));
    int32x4_t vacc0xCDEF = vreinterpretq_s32_u32(vsubq_u32(vpacc0xCDEF, vnacc0x0123));
    vnacc1 = vpadd_u32(vnacc1, vnacc1);
    const uint32x4_t vnacc1x0123 = vcombine_u32(vnacc1, vnacc1);
    int32x4_t vacc1x0123 = vreinterpretq_s32_u32(vsubq_u32(vpacc1x0123, vnacc1x0123));
    int32x4_t vacc1x4567 = vreinterpretq_s32_u32(vsubq_u32(vpacc1x4567, vnacc1x0123));
    int32x4_t vacc1x89AB = vreinterpretq_s32_u32(vsubq_u32(vpacc1x89AB, vnacc1x0123));
    int32x4_t vacc1xCDEF = vreinterpretq_s32_u32(vsubq_u32(vpacc1xCDEF, vnacc1x0123));
    vnacc2 = vpadd_u32(vnacc2, vnacc2);
    const uint32x4_t vnacc2x0123 = vcombine_u32(vnacc2, vnacc2);
    int32x4_t vacc2x0123 = vreinterpretq_s32_u32(vsubq_u32(vpacc2x0123, vnacc2x0123));
    int32x4_t vacc2x4567 = vreinterpretq_s32_u32(vsubq_u32(vpacc2x4567, vnacc2x0123));
    int32x4_t vacc2x89AB = vreinterpretq_s32_u32(vsubq_u32(vpacc2x89AB, vnacc2x0123));
    int32x4_t vacc2xCDEF = vreinterpretq_s32_u32(vsubq_u32(vpacc2xCDEF, vnacc2x0123));
    vnacc3 = vpadd_u32(vnacc3, vnacc3);
    const uint32x4_t vnacc3x0123 = vcombine_u32(vnacc3, vnacc3);
    int32x4_t vacc3x0123 = vreinterpretq_s32_u32(vsubq_u32(vpacc3x0123, vnacc3x0123));
    int32x4_t vacc3x4567 = vreinterpretq_s32_u32(vsubq_u32(vpacc3x4567, vnacc3x0123));
    int32x4_t vacc3x89AB = vreinterpretq_s32_u32(vsubq_u32(vpacc3x89AB, vnacc3x0123));
    int32x4_t vacc3xCDEF = vreinterpretq_s32_u32(vsubq_u32(vpacc3xCDEF, vnacc3x0123));
    vnacc4 = vpadd_u32(vnacc4, vnacc4);
    const uint32x4_t vnacc4x0123 = vcombine_u32(vnacc4, vnacc4);
    int32x4_t vacc4x0123 = vreinterpretq_s32_u32(vsubq_u32(vpacc4x0123, vnacc4x0123));
    int32x4_t vacc4x4567 = vreinterpretq_s32_u32(vsubq_u32(vpacc4x4567, vnacc4x0123));
    int32x4_t vacc4x89AB = vreinterpretq_s32_u32(vsubq_u32(vpacc4x89AB, vnacc4x0123));
    int32x4_t vacc4xCDEF = vreinterpretq_s32_u32(vsubq_u32(vpacc4xCDEF, vnacc4x0123));
    vnacc5 = vpadd_u32(vnacc5, vnacc5);
    const uint32x4_t vnacc5x0123 = vcombine_u32(vnacc5, vnacc5);
    int32x4_t vacc5x0123 = vreinterpretq_s32_u32(vsubq_u32(vpacc5x0123, vnacc5x0123));
    int32x4_t vacc5x4567 = vreinterpretq_s32_u32(vsubq_u32(vpacc5x4567, vnacc5x0123));
    int32x4_t vacc5x89AB = vreinterpretq_s32_u32(vsubq_u32(vpacc5x89AB, vnacc5x0123));
    int32x4_t vacc5xCDEF = vreinterpretq_s32_u32(vsubq_u32(vpacc5xCDEF, vnacc5x0123));
    vnacc6 = vpadd_u32(vnacc6, vnacc6);
    const uint32x4_t vnacc6x0123 = vcombine_u32(vnacc6, vnacc6);
    int32x4_t vacc6x0123 = vreinterpretq_s32_u32(vsubq_u32(vpacc6x0123, vnacc6x0123));
    int32x4_t vacc6x4567 = vreinterpretq_s32_u32(vsubq_u32(vpacc6x4567, vnacc6x0123));
    int32x4_t vacc6x89AB = vreinterpretq_s32_u32(vsubq_u32(vpacc6x89AB, vnacc6x0123));
    int32x4_t vacc6xCDEF = vreinterpretq_s32_u32(vsubq_u32(vpacc6xCDEF, vnacc6x0123));
    vnacc7 = vpadd_u32(vnacc7, vnacc7);
    const uint32x4_t vnacc7x0123 = vcombine_u32(vnacc7, vnacc7);
    int32x4_t vacc7x0123 = vreinterpretq_s32_u32(vsubq_u32(vpacc7x0123, vnacc7x0123));
    int32x4_t vacc7x4567 = vreinterpretq_s32_u32(vsubq_u32(vpacc7x4567, vnacc7x0123));
    int32x4_t vacc7x89AB = vreinterpretq_s32_u32(vsubq_u32(vpacc7x89AB, vnacc7x0123));
    int32x4_t vacc7xCDEF = vreinterpretq_s32_u32(vsubq_u32(vpacc7xCDEF, vnacc7x0123));

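    // rndnu requantization: an arithmetic pre-shift, a saturating doubling
    // multiply returning the high half (vqdmulhq), then a rounding
    // post-shift. The shift counts are presumably stored negated, so that
    // vshlq/vrshlq (left-shift intrinsics) perform right shifts.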
    const int32x4_t vright_pre_shift = vld1q_dup_s32(&params->rndnu_neon.right_pre_shift);
    const int32x4_t vmultiplier = vld1q_dup_s32(&params->rndnu_neon.multiplier);
    const int32x4_t vright_post_shift = vld1q_dup_s32(&params->rndnu_neon.right_post_shift);

    vacc0x0123 = vshlq_s32(vacc0x0123, vright_pre_shift);
    vacc0x4567 = vshlq_s32(vacc0x4567, vright_pre_shift);
    vacc0x89AB = vshlq_s32(vacc0x89AB, vright_pre_shift);
    vacc0xCDEF = vshlq_s32(vacc0xCDEF, vright_pre_shift);
    vacc1x0123 = vshlq_s32(vacc1x0123, vright_pre_shift);
    vacc1x4567 = vshlq_s32(vacc1x4567, vright_pre_shift);
    vacc1x89AB = vshlq_s32(vacc1x89AB, vright_pre_shift);
    vacc1xCDEF = vshlq_s32(vacc1xCDEF, vright_pre_shift);
    vacc2x0123 = vshlq_s32(vacc2x0123, vright_pre_shift);
    vacc2x4567 = vshlq_s32(vacc2x4567, vright_pre_shift);
    vacc2x89AB = vshlq_s32(vacc2x89AB, vright_pre_shift);
    vacc2xCDEF = vshlq_s32(vacc2xCDEF, vright_pre_shift);
    vacc3x0123 = vshlq_s32(vacc3x0123, vright_pre_shift);
    vacc3x4567 = vshlq_s32(vacc3x4567, vright_pre_shift);
    vacc3x89AB = vshlq_s32(vacc3x89AB, vright_pre_shift);
    vacc3xCDEF = vshlq_s32(vacc3xCDEF, vright_pre_shift);
    vacc4x0123 = vshlq_s32(vacc4x0123, vright_pre_shift);
    vacc4x4567 = vshlq_s32(vacc4x4567, vright_pre_shift);
    vacc4x89AB = vshlq_s32(vacc4x89AB, vright_pre_shift);
    vacc4xCDEF = vshlq_s32(vacc4xCDEF, vright_pre_shift);
    vacc5x0123 = vshlq_s32(vacc5x0123, vright_pre_shift);
    vacc5x4567 = vshlq_s32(vacc5x4567, vright_pre_shift);
    vacc5x89AB = vshlq_s32(vacc5x89AB, vright_pre_shift);
    vacc5xCDEF = vshlq_s32(vacc5xCDEF, vright_pre_shift);
    vacc6x0123 = vshlq_s32(vacc6x0123, vright_pre_shift);
    vacc6x4567 = vshlq_s32(vacc6x4567, vright_pre_shift);
    vacc6x89AB = vshlq_s32(vacc6x89AB, vright_pre_shift);
    vacc6xCDEF = vshlq_s32(vacc6xCDEF, vright_pre_shift);
    vacc7x0123 = vshlq_s32(vacc7x0123, vright_pre_shift);
    vacc7x4567 = vshlq_s32(vacc7x4567, vright_pre_shift);
    vacc7x89AB = vshlq_s32(vacc7x89AB, vright_pre_shift);
    vacc7xCDEF = vshlq_s32(vacc7xCDEF, vright_pre_shift);

    vacc0x0123 = vqdmulhq_s32(vacc0x0123, vmultiplier);
    vacc0x4567 = vqdmulhq_s32(vacc0x4567, vmultiplier);
    vacc0x89AB = vqdmulhq_s32(vacc0x89AB, vmultiplier);
    vacc0xCDEF = vqdmulhq_s32(vacc0xCDEF, vmultiplier);
    vacc1x0123 = vqdmulhq_s32(vacc1x0123, vmultiplier);
    vacc1x4567 = vqdmulhq_s32(vacc1x4567, vmultiplier);
    vacc1x89AB = vqdmulhq_s32(vacc1x89AB, vmultiplier);
    vacc1xCDEF = vqdmulhq_s32(vacc1xCDEF, vmultiplier);
    vacc2x0123 = vqdmulhq_s32(vacc2x0123, vmultiplier);
    vacc2x4567 = vqdmulhq_s32(vacc2x4567, vmultiplier);
    vacc2x89AB = vqdmulhq_s32(vacc2x89AB, vmultiplier);
    vacc2xCDEF = vqdmulhq_s32(vacc2xCDEF, vmultiplier);
    vacc3x0123 = vqdmulhq_s32(vacc3x0123, vmultiplier);
    vacc3x4567 = vqdmulhq_s32(vacc3x4567, vmultiplier);
    vacc3x89AB = vqdmulhq_s32(vacc3x89AB, vmultiplier);
    vacc3xCDEF = vqdmulhq_s32(vacc3xCDEF, vmultiplier);
    vacc4x0123 = vqdmulhq_s32(vacc4x0123, vmultiplier);
    vacc4x4567 = vqdmulhq_s32(vacc4x4567, vmultiplier);
    vacc4x89AB = vqdmulhq_s32(vacc4x89AB, vmultiplier);
    vacc4xCDEF = vqdmulhq_s32(vacc4xCDEF, vmultiplier);
    vacc5x0123 = vqdmulhq_s32(vacc5x0123, vmultiplier);
    vacc5x4567 = vqdmulhq_s32(vacc5x4567, vmultiplier);
    vacc5x89AB = vqdmulhq_s32(vacc5x89AB, vmultiplier);
    vacc5xCDEF = vqdmulhq_s32(vacc5xCDEF, vmultiplier);
    vacc6x0123 = vqdmulhq_s32(vacc6x0123, vmultiplier);
    vacc6x4567 = vqdmulhq_s32(vacc6x4567, vmultiplier);
    vacc6x89AB = vqdmulhq_s32(vacc6x89AB, vmultiplier);
    vacc6xCDEF = vqdmulhq_s32(vacc6xCDEF, vmultiplier);
    vacc7x0123 = vqdmulhq_s32(vacc7x0123, vmultiplier);
    vacc7x4567 = vqdmulhq_s32(vacc7x4567, vmultiplier);
    vacc7x89AB = vqdmulhq_s32(vacc7x89AB, vmultiplier);
    vacc7xCDEF = vqdmulhq_s32(vacc7xCDEF, vmultiplier);

    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_post_shift);
    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_post_shift);
    vacc0x89AB = vrshlq_s32(vacc0x89AB, vright_post_shift);
    vacc0xCDEF = vrshlq_s32(vacc0xCDEF, vright_post_shift);
    vacc1x0123 = vrshlq_s32(vacc1x0123, vright_post_shift);
    vacc1x4567 = vrshlq_s32(vacc1x4567, vright_post_shift);
    vacc1x89AB = vrshlq_s32(vacc1x89AB, vright_post_shift);
    vacc1xCDEF = vrshlq_s32(vacc1xCDEF, vright_post_shift);
    vacc2x0123 = vrshlq_s32(vacc2x0123, vright_post_shift);
    vacc2x4567 = vrshlq_s32(vacc2x4567, vright_post_shift);
    vacc2x89AB = vrshlq_s32(vacc2x89AB, vright_post_shift);
    vacc2xCDEF = vrshlq_s32(vacc2xCDEF, vright_post_shift);
    vacc3x0123 = vrshlq_s32(vacc3x0123, vright_post_shift);
    vacc3x4567 = vrshlq_s32(vacc3x4567, vright_post_shift);
    vacc3x89AB = vrshlq_s32(vacc3x89AB, vright_post_shift);
    vacc3xCDEF = vrshlq_s32(vacc3xCDEF, vright_post_shift);
    vacc4x0123 = vrshlq_s32(vacc4x0123, vright_post_shift);
    vacc4x4567 = vrshlq_s32(vacc4x4567, vright_post_shift);
    vacc4x89AB = vrshlq_s32(vacc4x89AB, vright_post_shift);
    vacc4xCDEF = vrshlq_s32(vacc4xCDEF, vright_post_shift);
    vacc5x0123 = vrshlq_s32(vacc5x0123, vright_post_shift);
    vacc5x4567 = vrshlq_s32(vacc5x4567, vright_post_shift);
    vacc5x89AB = vrshlq_s32(vacc5x89AB, vright_post_shift);
    vacc5xCDEF = vrshlq_s32(vacc5xCDEF, vright_post_shift);
    vacc6x0123 = vrshlq_s32(vacc6x0123, vright_post_shift);
    vacc6x4567 = vrshlq_s32(vacc6x4567, vright_post_shift);
    vacc6x89AB = vrshlq_s32(vacc6x89AB, vright_post_shift);
    vacc6xCDEF = vrshlq_s32(vacc6xCDEF, vright_post_shift);
    vacc7x0123 = vrshlq_s32(vacc7x0123, vright_post_shift);
    vacc7x4567 = vrshlq_s32(vacc7x4567, vright_post_shift);
    vacc7x89AB = vrshlq_s32(vacc7x89AB, vright_post_shift);
    vacc7xCDEF = vrshlq_s32(vacc7xCDEF, vright_post_shift);

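    // Convert to uint8: saturating-narrow the 32-bit accumulators to 16 bits,
    // add the output zero point, then saturating-narrow to unsigned 8 bits.
    // AArch64 uses the *_high narrowing forms; the fallback path combines
    // the halves with vcombine instead.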
    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->rndnu_neon.output_zero_point);
#if XNN_ARCH_ARM64
    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x89AB), vacc0xCDEF), voutput_zero_point);
    const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
    const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x89AB), vacc1xCDEF), voutput_zero_point);
    const int16x8_t vacc2x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567), voutput_zero_point);
    const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x89AB), vacc2xCDEF), voutput_zero_point);
    const int16x8_t vacc3x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x0123), vacc3x4567), voutput_zero_point);
    const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x89AB), vacc3xCDEF), voutput_zero_point);
    const int16x8_t vacc4x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc4x0123), vacc4x4567), voutput_zero_point);
    const int16x8_t vacc4x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc4x89AB), vacc4xCDEF), voutput_zero_point);
    const int16x8_t vacc5x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc5x0123), vacc5x4567), voutput_zero_point);
    const int16x8_t vacc5x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc5x89AB), vacc5xCDEF), voutput_zero_point);
    const int16x8_t vacc6x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc6x0123), vacc6x4567), voutput_zero_point);
    const int16x8_t vacc6x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc6x89AB), vacc6xCDEF), voutput_zero_point);
    const int16x8_t vacc7x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc7x0123), vacc7x4567), voutput_zero_point);
    const int16x8_t vacc7x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc7x89AB), vacc7xCDEF), voutput_zero_point);

    uint8x16_t vout0x0123456789ABCDEF = vqmovun_high_s16(vqmovun_s16(vacc0x01234567), vacc0x89ABCDEF);
    uint8x16_t vout1x0123456789ABCDEF = vqmovun_high_s16(vqmovun_s16(vacc1x01234567), vacc1x89ABCDEF);
    uint8x16_t vout2x0123456789ABCDEF = vqmovun_high_s16(vqmovun_s16(vacc2x01234567), vacc2x89ABCDEF);
    uint8x16_t vout3x0123456789ABCDEF = vqmovun_high_s16(vqmovun_s16(vacc3x01234567), vacc3x89ABCDEF);
    uint8x16_t vout4x0123456789ABCDEF = vqmovun_high_s16(vqmovun_s16(vacc4x01234567), vacc4x89ABCDEF);
    uint8x16_t vout5x0123456789ABCDEF = vqmovun_high_s16(vqmovun_s16(vacc5x01234567), vacc5x89ABCDEF);
    uint8x16_t vout6x0123456789ABCDEF = vqmovun_high_s16(vqmovun_s16(vacc6x01234567), vacc6x89ABCDEF);
    uint8x16_t vout7x0123456789ABCDEF = vqmovun_high_s16(vqmovun_s16(vacc7x01234567), vacc7x89ABCDEF);
#else
    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x89AB), vqmovn_s32(vacc0xCDEF)), voutput_zero_point);
    const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
    const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x89AB), vqmovn_s32(vacc1xCDEF)), voutput_zero_point);
    const int16x8_t vacc2x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x0123), vqmovn_s32(vacc2x4567)), voutput_zero_point);
    const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x89AB), vqmovn_s32(vacc2xCDEF)), voutput_zero_point);
    const int16x8_t vacc3x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x0123), vqmovn_s32(vacc3x4567)), voutput_zero_point);
    const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x89AB), vqmovn_s32(vacc3xCDEF)), voutput_zero_point);
    const int16x8_t vacc4x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc4x0123), vqmovn_s32(vacc4x4567)), voutput_zero_point);
    const int16x8_t vacc4x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc4x89AB), vqmovn_s32(vacc4xCDEF)), voutput_zero_point);
    const int16x8_t vacc5x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc5x0123), vqmovn_s32(vacc5x4567)), voutput_zero_point);
    const int16x8_t vacc5x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc5x89AB), vqmovn_s32(vacc5xCDEF)), voutput_zero_point);
    const int16x8_t vacc6x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc6x0123), vqmovn_s32(vacc6x4567)), voutput_zero_point);
    const int16x8_t vacc6x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc6x89AB), vqmovn_s32(vacc6xCDEF)), voutput_zero_point);
    const int16x8_t vacc7x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc7x0123), vqmovn_s32(vacc7x4567)), voutput_zero_point);
    const int16x8_t vacc7x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc7x89AB), vqmovn_s32(vacc7xCDEF)), voutput_zero_point);

    uint8x16_t vout0x0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc0x01234567), vqmovun_s16(vacc0x89ABCDEF));
    uint8x16_t vout1x0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc1x01234567), vqmovun_s16(vacc1x89ABCDEF));
    uint8x16_t vout2x0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc2x01234567), vqmovun_s16(vacc2x89ABCDEF));
    uint8x16_t vout3x0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc3x01234567), vqmovun_s16(vacc3x89ABCDEF));
    uint8x16_t vout4x0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc4x01234567), vqmovun_s16(vacc4x89ABCDEF));
    uint8x16_t vout5x0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc5x01234567), vqmovun_s16(vacc5x89ABCDEF));
    uint8x16_t vout6x0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc6x01234567), vqmovun_s16(vacc6x89ABCDEF));
    uint8x16_t vout7x0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc7x01234567), vqmovun_s16(vacc7x89ABCDEF));
#endif
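    // Clamp the 8-bit results to the requested [output_min, output_max] range.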
    const uint8x16_t voutput_min = vld1q_dup_u8(&params->rndnu_neon.output_min);
    const uint8x16_t voutput_max = vld1q_dup_u8(&params->rndnu_neon.output_max);

    vout0x0123456789ABCDEF = vmaxq_u8(vout0x0123456789ABCDEF, voutput_min);
    vout1x0123456789ABCDEF = vmaxq_u8(vout1x0123456789ABCDEF, voutput_min);
    vout2x0123456789ABCDEF = vmaxq_u8(vout2x0123456789ABCDEF, voutput_min);
    vout3x0123456789ABCDEF = vmaxq_u8(vout3x0123456789ABCDEF, voutput_min);
    vout4x0123456789ABCDEF = vmaxq_u8(vout4x0123456789ABCDEF, voutput_min);
    vout5x0123456789ABCDEF = vmaxq_u8(vout5x0123456789ABCDEF, voutput_min);
    vout6x0123456789ABCDEF = vmaxq_u8(vout6x0123456789ABCDEF, voutput_min);
    vout7x0123456789ABCDEF = vmaxq_u8(vout7x0123456789ABCDEF, voutput_min);

    vout0x0123456789ABCDEF = vminq_u8(vout0x0123456789ABCDEF, voutput_max);
    vout1x0123456789ABCDEF = vminq_u8(vout1x0123456789ABCDEF, voutput_max);
    vout2x0123456789ABCDEF = vminq_u8(vout2x0123456789ABCDEF, voutput_max);
    vout3x0123456789ABCDEF = vminq_u8(vout3x0123456789ABCDEF, voutput_max);
    vout4x0123456789ABCDEF = vminq_u8(vout4x0123456789ABCDEF, voutput_max);
    vout5x0123456789ABCDEF = vminq_u8(vout5x0123456789ABCDEF, voutput_max);
    vout6x0123456789ABCDEF = vminq_u8(vout6x0123456789ABCDEF, voutput_max);
    vout7x0123456789ABCDEF = vminq_u8(vout7x0123456789ABCDEF, voutput_max);

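    // Full-tile path: store all 16 columns of each row, advance the output
    // pointers by cn_stride, and rewind `a` by ks taps for the next block of
    // columns.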
    if (nc >= 16) {
      vst1q_u8(c7 + 0, vout7x0123456789ABCDEF);
      vst1q_u8(c6 + 0, vout6x0123456789ABCDEF);
      vst1q_u8(c5 + 0, vout5x0123456789ABCDEF);
      vst1q_u8(c4 + 0, vout4x0123456789ABCDEF);
      vst1q_u8(c3 + 0, vout3x0123456789ABCDEF);
      vst1q_u8(c2 + 0, vout2x0123456789ABCDEF);
      vst1q_u8(c1 + 0, vout1x0123456789ABCDEF);
      vst1q_u8(c0 + 0, vout0x0123456789ABCDEF);

      c7 = (uint8_t*) ((uintptr_t) c7 + cn_stride);
      c6 = (uint8_t*) ((uintptr_t) c6 + cn_stride);
      c5 = (uint8_t*) ((uintptr_t) c5 + cn_stride);
      c4 = (uint8_t*) ((uintptr_t) c4 + cn_stride);
      c3 = (uint8_t*) ((uintptr_t) c3 + cn_stride);
      c2 = (uint8_t*) ((uintptr_t) c2 + cn_stride);
      c1 = (uint8_t*) ((uintptr_t) c1 + cn_stride);
      c0 = (uint8_t*) ((uintptr_t) c0 + cn_stride);

      a = (const uint8_t**restrict) ((uintptr_t) a - ks);

      nc -= 16;
    } else {
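      // Remainder path for the last 1..15 columns: pack pairs of rows into
      // 16-byte vectors, then store 8, 4, 2, and 1 columns according to the
      // bits of nc, shifting the leftover bytes down after each store.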
      uint8x16_t vout6x01234567_7x01234567 = vcombine_u8(vget_low_u8(vout6x0123456789ABCDEF), vget_low_u8(vout7x0123456789ABCDEF));
      uint8x16_t vout4x01234567_5x01234567 = vcombine_u8(vget_low_u8(vout4x0123456789ABCDEF), vget_low_u8(vout5x0123456789ABCDEF));
      uint8x16_t vout2x01234567_3x01234567 = vcombine_u8(vget_low_u8(vout2x0123456789ABCDEF), vget_low_u8(vout3x0123456789ABCDEF));
      uint8x16_t vout0x01234567_1x01234567 = vcombine_u8(vget_low_u8(vout0x0123456789ABCDEF), vget_low_u8(vout1x0123456789ABCDEF));
      if (nc & 8) {
        vst1_u8(c7, vget_high_u8(vout6x01234567_7x01234567)); c7 += 8;
        vst1_u8(c6, vget_low_u8(vout6x01234567_7x01234567)); c6 += 8;
        vst1_u8(c5, vget_high_u8(vout4x01234567_5x01234567)); c5 += 8;
        vst1_u8(c4, vget_low_u8(vout4x01234567_5x01234567)); c4 += 8;
        vst1_u8(c3, vget_high_u8(vout2x01234567_3x01234567)); c3 += 8;
        vst1_u8(c2, vget_low_u8(vout2x01234567_3x01234567)); c2 += 8;
        vst1_u8(c1, vget_high_u8(vout0x01234567_1x01234567)); c1 += 8;
        vst1_u8(c0, vget_low_u8(vout0x01234567_1x01234567)); c0 += 8;
        vout6x01234567_7x01234567 = vcombine_u8(vget_high_u8(vout6x0123456789ABCDEF), vget_high_u8(vout7x0123456789ABCDEF));
        vout4x01234567_5x01234567 = vcombine_u8(vget_high_u8(vout4x0123456789ABCDEF), vget_high_u8(vout5x0123456789ABCDEF));
        vout2x01234567_3x01234567 = vcombine_u8(vget_high_u8(vout2x0123456789ABCDEF), vget_high_u8(vout3x0123456789ABCDEF));
        vout0x01234567_1x01234567 = vcombine_u8(vget_high_u8(vout0x0123456789ABCDEF), vget_high_u8(vout1x0123456789ABCDEF));
      }
      if (nc & 4) {
        vst1q_lane_u32((void*) c7, vreinterpretq_u32_u8(vout6x01234567_7x01234567), 2); c7 += 4;
        vst1q_lane_u32((void*) c6, vreinterpretq_u32_u8(vout6x01234567_7x01234567), 0); c6 += 4;
        vst1q_lane_u32((void*) c5, vreinterpretq_u32_u8(vout4x01234567_5x01234567), 2); c5 += 4;
        vst1q_lane_u32((void*) c4, vreinterpretq_u32_u8(vout4x01234567_5x01234567), 0); c4 += 4;
        vst1q_lane_u32((void*) c3, vreinterpretq_u32_u8(vout2x01234567_3x01234567), 2); c3 += 4;
        vst1q_lane_u32((void*) c2, vreinterpretq_u32_u8(vout2x01234567_3x01234567), 0); c2 += 4;
        vst1q_lane_u32((void*) c1, vreinterpretq_u32_u8(vout0x01234567_1x01234567), 2); c1 += 4;
        vst1q_lane_u32((void*) c0, vreinterpretq_u32_u8(vout0x01234567_1x01234567), 0); c0 += 4;
        vout6x01234567_7x01234567 = vextq_u8(vout6x01234567_7x01234567, vout6x01234567_7x01234567, 4);
        vout4x01234567_5x01234567 = vextq_u8(vout4x01234567_5x01234567, vout4x01234567_5x01234567, 4);
        vout2x01234567_3x01234567 = vextq_u8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 4);
        vout0x01234567_1x01234567 = vextq_u8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
      }
      if (nc & 2) {
        vst1q_lane_u16((void*) c7, vreinterpretq_u16_u8(vout6x01234567_7x01234567), 4); c7 += 2;
        vst1q_lane_u16((void*) c6, vreinterpretq_u16_u8(vout6x01234567_7x01234567), 0); c6 += 2;
        vst1q_lane_u16((void*) c5, vreinterpretq_u16_u8(vout4x01234567_5x01234567), 4); c5 += 2;
        vst1q_lane_u16((void*) c4, vreinterpretq_u16_u8(vout4x01234567_5x01234567), 0); c4 += 2;
        vst1q_lane_u16((void*) c3, vreinterpretq_u16_u8(vout2x01234567_3x01234567), 4); c3 += 2;
        vst1q_lane_u16((void*) c2, vreinterpretq_u16_u8(vout2x01234567_3x01234567), 0); c2 += 2;
        vst1q_lane_u16((void*) c1, vreinterpretq_u16_u8(vout0x01234567_1x01234567), 4); c1 += 2;
        vst1q_lane_u16((void*) c0, vreinterpretq_u16_u8(vout0x01234567_1x01234567), 0); c0 += 2;
        vout6x01234567_7x01234567 = vextq_u8(vout6x01234567_7x01234567, vout6x01234567_7x01234567, 2);
        vout4x01234567_5x01234567 = vextq_u8(vout4x01234567_5x01234567, vout4x01234567_5x01234567, 2);
        vout2x01234567_3x01234567 = vextq_u8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 2);
        vout0x01234567_1x01234567 = vextq_u8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
      }
      if (nc & 1) {
        vst1q_lane_u8(c7, vout6x01234567_7x01234567, 8);
        vst1q_lane_u8(c6, vout6x01234567_7x01234567, 0);
        vst1q_lane_u8(c5, vout4x01234567_5x01234567, 8);
        vst1q_lane_u8(c4, vout4x01234567_5x01234567, 0);
        vst1q_lane_u8(c3, vout2x01234567_3x01234567, 8);
        vst1q_lane_u8(c2, vout2x01234567_3x01234567, 0);
        vst1q_lane_u8(c1, vout0x01234567_1x01234567, 8);
        vst1q_lane_u8(c0, vout0x01234567_1x01234567, 0);
      }

      nc = 0;
    }
  } while (nc != 0);
}