// xref: /aosp_15_r20/external/XNNPACK/src/qs8-igemm/gen/4x16c2s4-minmax-rndnu-neon-mull.c (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
// Auto-generated file. Do not edit!
//   Template: src/qs8-igemm/c2-neon-mull-shuffle.c.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>

#include <arm_neon.h>

#include <xnnpack/gemm.h>
#include <xnnpack/math.h>

xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull(size_t mr,size_t nc,size_t kc,size_t ks,const int8_t ** restrict a,const void * restrict w,int8_t * restrict c,size_t cm_stride,size_t cn_stride,size_t a_offset,const int8_t * zero,const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])18 void xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull(
19     size_t mr,
20     size_t nc,
21     size_t kc,
22     size_t ks,
23     const int8_t** restrict a,
24     const void* restrict w,
25     int8_t* restrict c,
26     size_t cm_stride,
27     size_t cn_stride,
28     size_t a_offset,
29     const int8_t* zero,
30     const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
31 {
32   assert(mr != 0);
33   assert(mr <= 4);
34   assert(nc != 0);
35   assert(kc != 0);
36   assert(ks != 0);
37   assert(ks % (4 * sizeof(void*)) == 0);
38   assert(a_offset % sizeof(int8_t) == 0);
39   assert(a != NULL);
40   assert(w != NULL);
41   assert(c != NULL);
42 
43   int8_t* c0 = c;
44   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
45   if XNN_UNPREDICTABLE(mr < 2) {
46     c1 = c0;
47   }
48   int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
49   if XNN_UNPREDICTABLE(mr <= 2) {
50     c2 = c1;
51   }
52   int8_t* c3 = (int8_t*) ((uintptr_t) c2 + cm_stride);
53   if XNN_UNPREDICTABLE(mr != 4) {
54     c3 = c2;
55   }
56 
57   kc = round_up_po2(kc, 8 * sizeof(int8_t));
58   do {
59     int32x4_t vacc0x0123 = vld1q_s32(w); w = (const int32_t*) w + 4;
60     int32x4_t vacc0x4567 = vld1q_s32(w); w = (const int32_t*) w + 4;
61     int32x4_t vacc0x89AB = vld1q_s32(w); w = (const int32_t*) w + 4;
62     int32x4_t vacc0xCDEF = vld1q_s32(w); w = (const int32_t*) w + 4;
63     int32x4_t vacc1x0123 = vacc0x0123;
64     int32x4_t vacc1x4567 = vacc0x4567;
65     int32x4_t vacc1x89AB = vacc0x89AB;
66     int32x4_t vacc1xCDEF = vacc0xCDEF;
67     int32x4_t vacc2x0123 = vacc0x0123;
68     int32x4_t vacc2x4567 = vacc0x4567;
69     int32x4_t vacc2x89AB = vacc0x89AB;
70     int32x4_t vacc2xCDEF = vacc0xCDEF;
71     int32x4_t vacc3x0123 = vacc0x0123;
72     int32x4_t vacc3x4567 = vacc0x4567;
73     int32x4_t vacc3x89AB = vacc0x89AB;
74     int32x4_t vacc3xCDEF = vacc0xCDEF;
75 
76     size_t p = ks;
77     do {
78       const int8_t* restrict a0 = a[0];
79       if XNN_UNPREDICTABLE(a0 != zero) {
80         a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
81       }
82       const int8_t* restrict a1 = a[1];
83       if XNN_UNPREDICTABLE(a1 != zero) {
84         a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
85       }
86       const int8_t* restrict a2 = a[2];
87       if XNN_UNPREDICTABLE(a2 != zero) {
88         a2 = (const int8_t*) ((uintptr_t) a2 + a_offset);
89       }
90       const int8_t* restrict a3 = a[3];
91       if XNN_UNPREDICTABLE(a3 != zero) {
92         a3 = (const int8_t*) ((uintptr_t) a3 + a_offset);
93       }
94       a += 4;
95 
96       size_t k = kc;
97       do {
98         int8x8_t va0x0 = vld1_s8(a0); a0 += 8;
99         int8x8_t va1x0 = vld1_s8(a1); a1 += 8;
100         int8x8_t va2x0 = vld1_s8(a2); a2 += 8;
101         int8x8_t va3x0 = vld1_s8(a3); a3 += 8;
102 
103         const int8x8_t vb0123c0x0 = vld1_s8(w); w = (const int8_t*) w + 8;
104         const int8x8_t vb4567c0x0 = vld1_s8(w); w = (const int8_t*) w + 8;
105         const int8x8_t vb89ABc0x0 = vld1_s8(w); w = (const int8_t*) w + 8;
106         const int8x8_t vbCDEFc0x0 = vld1_s8(w); w = (const int8_t*) w + 8;
107         const int8x8_t vb0123c1x0 = vld1_s8(w); w = (const int8_t*) w + 8;
108         const int8x8_t vb4567c1x0 = vld1_s8(w); w = (const int8_t*) w + 8;
109         const int8x8_t vb89ABc1x0 = vld1_s8(w); w = (const int8_t*) w + 8;
110         const int8x8_t vbCDEFc1x0 = vld1_s8(w); w = (const int8_t*) w + 8;
111         const int8x8_t vb0123c2x0 = vld1_s8(w); w = (const int8_t*) w + 8;
112         const int8x8_t vb4567c2x0 = vld1_s8(w); w = (const int8_t*) w + 8;
113         const int8x8_t vb89ABc2x0 = vld1_s8(w); w = (const int8_t*) w + 8;
114         const int8x8_t vbCDEFc2x0 = vld1_s8(w); w = (const int8_t*) w + 8;
115         const int8x8_t vb0123c3x0 = vld1_s8(w); w = (const int8_t*) w + 8;
116         const int8x8_t vb4567c3x0 = vld1_s8(w); w = (const int8_t*) w + 8;
117         const int8x8_t vb89ABc3x0 = vld1_s8(w); w = (const int8_t*) w + 8;
118         const int8x8_t vbCDEFc3x0 = vld1_s8(w); w = (const int8_t*) w + 8;
119 
120         int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0x0, va0x0);
121         int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0x0, va1x0);
122         int16x8_t vprod2x0123c0 = vmull_s8(vb0123c0x0, va2x0);
123         int16x8_t vprod3x0123c0 = vmull_s8(vb0123c0x0, va3x0);
124         vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
125         vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
126         vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c0);
127         vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c0);
128         int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0x0, va0x0);
129         int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0x0, va1x0);
130         int16x8_t vprod2x4567c0 = vmull_s8(vb4567c0x0, va2x0);
131         int16x8_t vprod3x4567c0 = vmull_s8(vb4567c0x0, va3x0);
132         vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
133         vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
134         vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c0);
135         vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c0);
136         int16x8_t vprod0x89ABc0 = vmull_s8(vb89ABc0x0, va0x0);
137         int16x8_t vprod1x89ABc0 = vmull_s8(vb89ABc0x0, va1x0);
138         int16x8_t vprod2x89ABc0 = vmull_s8(vb89ABc0x0, va2x0);
139         int16x8_t vprod3x89ABc0 = vmull_s8(vb89ABc0x0, va3x0);
140         vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0);
141         vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc0);
142         vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc0);
143         vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc0);
144         int16x8_t vprod0xCDEFc0 = vmull_s8(vbCDEFc0x0, va0x0);
145         int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0x0, va1x0);
146         int16x8_t vprod2xCDEFc0 = vmull_s8(vbCDEFc0x0, va2x0);
147         int16x8_t vprod3xCDEFc0 = vmull_s8(vbCDEFc0x0, va3x0);
148         vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc0);
149         vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0);
150         vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc0);
151         vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc0);
152         va0x0 = vext_s8(va0x0, va0x0, 2);
153         va1x0 = vext_s8(va1x0, va1x0, 2);
154         va2x0 = vext_s8(va2x0, va2x0, 2);
155         va3x0 = vext_s8(va3x0, va3x0, 2);
156         int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1x0, va0x0);
157         int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1x0, va1x0);
158         int16x8_t vprod2x0123c1 = vmull_s8(vb0123c1x0, va2x0);
159         int16x8_t vprod3x0123c1 = vmull_s8(vb0123c1x0, va3x0);
160         vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
161         vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
162         vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c1);
163         vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c1);
164         int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1x0, va0x0);
165         int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1x0, va1x0);
166         int16x8_t vprod2x4567c1 = vmull_s8(vb4567c1x0, va2x0);
167         int16x8_t vprod3x4567c1 = vmull_s8(vb4567c1x0, va3x0);
168         vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
169         vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
170         vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c1);
171         vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c1);
172         int16x8_t vprod0x89ABc1 = vmull_s8(vb89ABc1x0, va0x0);
173         int16x8_t vprod1x89ABc1 = vmull_s8(vb89ABc1x0, va1x0);
174         int16x8_t vprod2x89ABc1 = vmull_s8(vb89ABc1x0, va2x0);
175         int16x8_t vprod3x89ABc1 = vmull_s8(vb89ABc1x0, va3x0);
176         vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc1);
177         vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc1);
178         vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc1);
179         vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc1);
180         int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1x0, va0x0);
181         int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1x0, va1x0);
182         int16x8_t vprod2xCDEFc1 = vmull_s8(vbCDEFc1x0, va2x0);
183         int16x8_t vprod3xCDEFc1 = vmull_s8(vbCDEFc1x0, va3x0);
184         vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc1);
185         vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1);
186         vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc1);
187         vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc1);
188         va0x0 = vext_s8(va0x0, va0x0, 2);
189         va1x0 = vext_s8(va1x0, va1x0, 2);
190         va2x0 = vext_s8(va2x0, va2x0, 2);
191         va3x0 = vext_s8(va3x0, va3x0, 2);
192         int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2x0, va0x0);
193         int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2x0, va1x0);
194         int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2x0, va2x0);
195         int16x8_t vprod3x0123c2 = vmull_s8(vb0123c2x0, va3x0);
196         vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
197         vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
198         vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c2);
199         vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c2);
200         int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2x0, va0x0);
201         int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2x0, va1x0);
202         int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2x0, va2x0);
203         int16x8_t vprod3x4567c2 = vmull_s8(vb4567c2x0, va3x0);
204         vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
205         vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
206         vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c2);
207         vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c2);
208         int16x8_t vprod0x89ABc2 = vmull_s8(vb89ABc2x0, va0x0);
209         int16x8_t vprod1x89ABc2 = vmull_s8(vb89ABc2x0, va1x0);
210         int16x8_t vprod2x89ABc2 = vmull_s8(vb89ABc2x0, va2x0);
211         int16x8_t vprod3x89ABc2 = vmull_s8(vb89ABc2x0, va3x0);
212         vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2);
213         vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc2);
214         vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc2);
215         vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc2);
216         int16x8_t vprod0xCDEFc2 = vmull_s8(vbCDEFc2x0, va0x0);
217         int16x8_t vprod1xCDEFc2 = vmull_s8(vbCDEFc2x0, va1x0);
218         int16x8_t vprod2xCDEFc2 = vmull_s8(vbCDEFc2x0, va2x0);
219         int16x8_t vprod3xCDEFc2 = vmull_s8(vbCDEFc2x0, va3x0);
220         vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc2);
221         vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc2);
222         vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc2);
223         vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc2);
224         va0x0 = vext_s8(va0x0, va0x0, 2);
225         va1x0 = vext_s8(va1x0, va1x0, 2);
226         va2x0 = vext_s8(va2x0, va2x0, 2);
227         va3x0 = vext_s8(va3x0, va3x0, 2);
228         int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3x0, va0x0);
229         int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3x0, va1x0);
230         int16x8_t vprod2x0123c3 = vmull_s8(vb0123c3x0, va2x0);
231         int16x8_t vprod3x0123c3 = vmull_s8(vb0123c3x0, va3x0);
232         vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
233         vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
234         vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c3);
235         vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c3);
236         int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3x0, va0x0);
237         int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3x0, va1x0);
238         int16x8_t vprod2x4567c3 = vmull_s8(vb4567c3x0, va2x0);
239         int16x8_t vprod3x4567c3 = vmull_s8(vb4567c3x0, va3x0);
240         vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
241         vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
242         vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c3);
243         vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c3);
244         int16x8_t vprod0x89ABc3 = vmull_s8(vb89ABc3x0, va0x0);
245         int16x8_t vprod1x89ABc3 = vmull_s8(vb89ABc3x0, va1x0);
246         int16x8_t vprod2x89ABc3 = vmull_s8(vb89ABc3x0, va2x0);
247         int16x8_t vprod3x89ABc3 = vmull_s8(vb89ABc3x0, va3x0);
248         vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc3);
249         vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc3);
250         vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc3);
251         vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc3);
252         int16x8_t vprod0xCDEFc3 = vmull_s8(vbCDEFc3x0, va0x0);
253         int16x8_t vprod1xCDEFc3 = vmull_s8(vbCDEFc3x0, va1x0);
254         int16x8_t vprod2xCDEFc3 = vmull_s8(vbCDEFc3x0, va2x0);
255         int16x8_t vprod3xCDEFc3 = vmull_s8(vbCDEFc3x0, va3x0);
256         vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc3);
257         vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc3);
258         vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc3);
259         vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc3);
260 
261         k -= 8 * sizeof(int8_t);
262       } while (k != 0);
263 
264       p -= 4 * sizeof(void*);
265     } while (p != 0);
266 
267     const int32x4_t vright_pre_shift = vld1q_dup_s32(&params->rndnu_neon.right_pre_shift);
268     const int32x4_t vmultiplier = vld1q_dup_s32(&params->rndnu_neon.multiplier);
269     const int32x4_t vright_post_shift = vld1q_dup_s32(&params->rndnu_neon.right_post_shift);
270 
271     vacc0x0123 = vqshlq_s32(vacc0x0123, vright_pre_shift);
272     vacc0x4567 = vqshlq_s32(vacc0x4567, vright_pre_shift);
273     vacc0x89AB = vqshlq_s32(vacc0x89AB, vright_pre_shift);
274     vacc0xCDEF = vqshlq_s32(vacc0xCDEF, vright_pre_shift);
275     vacc1x0123 = vqshlq_s32(vacc1x0123, vright_pre_shift);
276     vacc1x4567 = vqshlq_s32(vacc1x4567, vright_pre_shift);
277     vacc1x89AB = vqshlq_s32(vacc1x89AB, vright_pre_shift);
278     vacc1xCDEF = vqshlq_s32(vacc1xCDEF, vright_pre_shift);
279     vacc2x0123 = vqshlq_s32(vacc2x0123, vright_pre_shift);
280     vacc2x4567 = vqshlq_s32(vacc2x4567, vright_pre_shift);
281     vacc2x89AB = vqshlq_s32(vacc2x89AB, vright_pre_shift);
282     vacc2xCDEF = vqshlq_s32(vacc2xCDEF, vright_pre_shift);
283     vacc3x0123 = vqshlq_s32(vacc3x0123, vright_pre_shift);
284     vacc3x4567 = vqshlq_s32(vacc3x4567, vright_pre_shift);
285     vacc3x89AB = vqshlq_s32(vacc3x89AB, vright_pre_shift);
286     vacc3xCDEF = vqshlq_s32(vacc3xCDEF, vright_pre_shift);
287 
288     vacc0x0123 = vqdmulhq_s32(vacc0x0123, vmultiplier);
289     vacc0x4567 = vqdmulhq_s32(vacc0x4567, vmultiplier);
290     vacc0x89AB = vqdmulhq_s32(vacc0x89AB, vmultiplier);
291     vacc0xCDEF = vqdmulhq_s32(vacc0xCDEF, vmultiplier);
292     vacc1x0123 = vqdmulhq_s32(vacc1x0123, vmultiplier);
293     vacc1x4567 = vqdmulhq_s32(vacc1x4567, vmultiplier);
294     vacc1x89AB = vqdmulhq_s32(vacc1x89AB, vmultiplier);
295     vacc1xCDEF = vqdmulhq_s32(vacc1xCDEF, vmultiplier);
296     vacc2x0123 = vqdmulhq_s32(vacc2x0123, vmultiplier);
297     vacc2x4567 = vqdmulhq_s32(vacc2x4567, vmultiplier);
298     vacc2x89AB = vqdmulhq_s32(vacc2x89AB, vmultiplier);
299     vacc2xCDEF = vqdmulhq_s32(vacc2xCDEF, vmultiplier);
300     vacc3x0123 = vqdmulhq_s32(vacc3x0123, vmultiplier);
301     vacc3x4567 = vqdmulhq_s32(vacc3x4567, vmultiplier);
302     vacc3x89AB = vqdmulhq_s32(vacc3x89AB, vmultiplier);
303     vacc3xCDEF = vqdmulhq_s32(vacc3xCDEF, vmultiplier);
304 
305     vacc0x0123 = vrshlq_s32(vacc0x0123, vright_post_shift);
306     vacc0x4567 = vrshlq_s32(vacc0x4567, vright_post_shift);
307     vacc0x89AB = vrshlq_s32(vacc0x89AB, vright_post_shift);
308     vacc0xCDEF = vrshlq_s32(vacc0xCDEF, vright_post_shift);
309     vacc1x0123 = vrshlq_s32(vacc1x0123, vright_post_shift);
310     vacc1x4567 = vrshlq_s32(vacc1x4567, vright_post_shift);
311     vacc1x89AB = vrshlq_s32(vacc1x89AB, vright_post_shift);
312     vacc1xCDEF = vrshlq_s32(vacc1xCDEF, vright_post_shift);
313     vacc2x0123 = vrshlq_s32(vacc2x0123, vright_post_shift);
314     vacc2x4567 = vrshlq_s32(vacc2x4567, vright_post_shift);
315     vacc2x89AB = vrshlq_s32(vacc2x89AB, vright_post_shift);
316     vacc2xCDEF = vrshlq_s32(vacc2xCDEF, vright_post_shift);
317     vacc3x0123 = vrshlq_s32(vacc3x0123, vright_post_shift);
318     vacc3x4567 = vrshlq_s32(vacc3x4567, vright_post_shift);
319     vacc3x89AB = vrshlq_s32(vacc3x89AB, vright_post_shift);
320     vacc3xCDEF = vrshlq_s32(vacc3xCDEF, vright_post_shift);
321 
322     const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->rndnu_neon.output_zero_point);
323 #if XNN_ARCH_ARM64
324     int16x8_t vacc0x01234567 = vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567);
325     int16x8_t vacc0x89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc0x89AB), vacc0xCDEF);
326     int16x8_t vacc1x01234567 = vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567);
327     int16x8_t vacc1x89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc1x89AB), vacc1xCDEF);
328     int16x8_t vacc2x01234567 = vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567);
329     int16x8_t vacc2x89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc2x89AB), vacc2xCDEF);
330     int16x8_t vacc3x01234567 = vqmovn_high_s32(vqmovn_s32(vacc3x0123), vacc3x4567);
331     int16x8_t vacc3x89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc3x89AB), vacc3xCDEF);
332 
333     vacc0x01234567 = vqaddq_s16(vacc0x01234567, voutput_zero_point);
334     vacc0x89ABCDEF = vqaddq_s16(vacc0x89ABCDEF, voutput_zero_point);
335     vacc1x01234567 = vqaddq_s16(vacc1x01234567, voutput_zero_point);
336     vacc1x89ABCDEF = vqaddq_s16(vacc1x89ABCDEF, voutput_zero_point);
337     vacc2x01234567 = vqaddq_s16(vacc2x01234567, voutput_zero_point);
338     vacc2x89ABCDEF = vqaddq_s16(vacc2x89ABCDEF, voutput_zero_point);
339     vacc3x01234567 = vqaddq_s16(vacc3x01234567, voutput_zero_point);
340     vacc3x89ABCDEF = vqaddq_s16(vacc3x89ABCDEF, voutput_zero_point);
341 
342     int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF);
343     int8x16_t vout1x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc1x01234567), vacc1x89ABCDEF);
344     int8x16_t vout2x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc2x01234567), vacc2x89ABCDEF);
345     int8x16_t vout3x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc3x01234567), vacc3x89ABCDEF);
346 #else
347     int16x8_t vacc0x01234567 = vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567));
348     int16x8_t vacc0x89ABCDEF = vcombine_s16(vqmovn_s32(vacc0x89AB), vqmovn_s32(vacc0xCDEF));
349     int16x8_t vacc1x01234567 = vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567));
350     int16x8_t vacc1x89ABCDEF = vcombine_s16(vqmovn_s32(vacc1x89AB), vqmovn_s32(vacc1xCDEF));
351     int16x8_t vacc2x01234567 = vcombine_s16(vqmovn_s32(vacc2x0123), vqmovn_s32(vacc2x4567));
352     int16x8_t vacc2x89ABCDEF = vcombine_s16(vqmovn_s32(vacc2x89AB), vqmovn_s32(vacc2xCDEF));
353     int16x8_t vacc3x01234567 = vcombine_s16(vqmovn_s32(vacc3x0123), vqmovn_s32(vacc3x4567));
354     int16x8_t vacc3x89ABCDEF = vcombine_s16(vqmovn_s32(vacc3x89AB), vqmovn_s32(vacc3xCDEF));
355 
356     vacc0x01234567 = vqaddq_s16(vacc0x01234567, voutput_zero_point);
357     vacc0x89ABCDEF = vqaddq_s16(vacc0x89ABCDEF, voutput_zero_point);
358     vacc1x01234567 = vqaddq_s16(vacc1x01234567, voutput_zero_point);
359     vacc1x89ABCDEF = vqaddq_s16(vacc1x89ABCDEF, voutput_zero_point);
360     vacc2x01234567 = vqaddq_s16(vacc2x01234567, voutput_zero_point);
361     vacc2x89ABCDEF = vqaddq_s16(vacc2x89ABCDEF, voutput_zero_point);
362     vacc3x01234567 = vqaddq_s16(vacc3x01234567, voutput_zero_point);
363     vacc3x89ABCDEF = vqaddq_s16(vacc3x89ABCDEF, voutput_zero_point);
364 
365     int8x16_t vout0x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc0x89ABCDEF));
366     int8x16_t vout1x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc1x01234567), vqmovn_s16(vacc1x89ABCDEF));
367     int8x16_t vout2x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc2x01234567), vqmovn_s16(vacc2x89ABCDEF));
368     int8x16_t vout3x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc3x01234567), vqmovn_s16(vacc3x89ABCDEF));
369 #endif
370 
371     const int8x16_t voutput_min = vld1q_dup_s8(&params->rndnu_neon.output_min);
372     vout0x0123456789ABCDEF = vmaxq_s8(vout0x0123456789ABCDEF, voutput_min);
373     vout1x0123456789ABCDEF = vmaxq_s8(vout1x0123456789ABCDEF, voutput_min);
374     vout2x0123456789ABCDEF = vmaxq_s8(vout2x0123456789ABCDEF, voutput_min);
375     vout3x0123456789ABCDEF = vmaxq_s8(vout3x0123456789ABCDEF, voutput_min);
376 
377     const int8x16_t voutput_max = vld1q_dup_s8(&params->rndnu_neon.output_max);
378     vout0x0123456789ABCDEF = vminq_s8(vout0x0123456789ABCDEF, voutput_max);
379     vout1x0123456789ABCDEF = vminq_s8(vout1x0123456789ABCDEF, voutput_max);
380     vout2x0123456789ABCDEF = vminq_s8(vout2x0123456789ABCDEF, voutput_max);
381     vout3x0123456789ABCDEF = vminq_s8(vout3x0123456789ABCDEF, voutput_max);
382 
383     if (nc >= 16) {
384       vst1q_s8(c3 + 0, vout3x0123456789ABCDEF);
385       vst1q_s8(c2 + 0, vout2x0123456789ABCDEF);
386       vst1q_s8(c1 + 0, vout1x0123456789ABCDEF);
387       vst1q_s8(c0 + 0, vout0x0123456789ABCDEF);
388 
389       c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
390       c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
391       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
392       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
393 
394       a = (const int8_t**restrict) ((uintptr_t) a - ks);
395 
396       nc -= 16;
397     } else {
398       int8x16_t vout2x01234567_3x01234567 = vcombine_s8(vget_low_s8(vout2x0123456789ABCDEF), vget_low_s8(vout3x0123456789ABCDEF));
399       int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vget_low_s8(vout0x0123456789ABCDEF), vget_low_s8(vout1x0123456789ABCDEF));
400       if (nc & 8) {
401         vst1_s8(c3, vget_high_s8(vout2x01234567_3x01234567)); c3 += 8;
402         vst1_s8(c2, vget_low_s8(vout2x01234567_3x01234567)); c2 += 8;
403         vst1_s8(c1, vget_high_s8(vout0x01234567_1x01234567)); c1 += 8;
404         vst1_s8(c0, vget_low_s8(vout0x01234567_1x01234567)); c0 += 8;
405         vout2x01234567_3x01234567 = vcombine_s8(vget_high_s8(vout2x0123456789ABCDEF), vget_high_s8(vout3x0123456789ABCDEF));
406         vout0x01234567_1x01234567 = vcombine_s8(vget_high_s8(vout0x0123456789ABCDEF), vget_high_s8(vout1x0123456789ABCDEF));
407       }
408       if (nc & 4) {
409         vst1q_lane_u32((void*) c3, vreinterpretq_u32_s8(vout2x01234567_3x01234567), 2); c3 += 4;
410         vst1q_lane_u32((void*) c2, vreinterpretq_u32_s8(vout2x01234567_3x01234567), 0); c2 += 4;
411         vst1q_lane_u32((void*) c1, vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
412         vst1q_lane_u32((void*) c0, vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
413         vout2x01234567_3x01234567 = vextq_s8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 4);
414         vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
415       }
416       if (nc & 2) {
417         vst1q_lane_u16((void*) c3, vreinterpretq_u16_s8(vout2x01234567_3x01234567), 4); c3 += 2;
418         vst1q_lane_u16((void*) c2, vreinterpretq_u16_s8(vout2x01234567_3x01234567), 0); c2 += 2;
419         vst1q_lane_u16((void*) c1, vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
420         vst1q_lane_u16((void*) c0, vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
421         vout2x01234567_3x01234567 = vextq_s8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 2);
422         vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
423       }
424       if (nc & 1) {
425         vst1q_lane_s8(c3, vout2x01234567_3x01234567, 8);
426         vst1q_lane_s8(c2, vout2x01234567_3x01234567, 0);
427         vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
428         vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
429       }
430 
431       nc = 0;
432     }
433   } while (nc != 0);
434 }
435