1 // Auto-generated file. Do not edit!
2 // Template: src/qs8-gemm/neon-mull-addw-dup.c.in
3 // Generator: tools/xngen
4 //
5 // Copyright 2020 Google LLC
6 //
7 // This source code is licensed under the BSD-style license found in the
8 // LICENSE file in the root directory of this source tree.
9
10 #include <assert.h>
11
12 #include <arm_neon.h>
13
14 #include <xnnpack/common.h>
15 #include <xnnpack/gemm.h>
16
17
xnn_qs8_gemm_minmax_rndnu_ukernel_3x16__neon_mull_addw_dup(size_t mr,size_t nc,size_t kc,const int8_t * restrict a,size_t a_stride,const void * restrict w,int8_t * restrict c,size_t cm_stride,size_t cn_stride,const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])18 void xnn_qs8_gemm_minmax_rndnu_ukernel_3x16__neon_mull_addw_dup(
19 size_t mr,
20 size_t nc,
21 size_t kc,
22 const int8_t* restrict a,
23 size_t a_stride,
24 const void* restrict w,
25 int8_t* restrict c,
26 size_t cm_stride,
27 size_t cn_stride,
28 const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
29 {
30 assert(mr != 0);
31 assert(mr <= 3);
32 assert(nc != 0);
33 assert(kc != 0);
34 assert(kc % sizeof(int8_t) == 0);
35 assert(a != NULL);
36 assert(w != NULL);
37 assert(c != NULL);
38
39 const int8_t* a0 = a;
40 int8_t* c0 = c;
41 const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
42 int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
43 if XNN_UNPREDICTABLE(mr < 2) {
44 a1 = a0;
45 c1 = c0;
46 }
47 const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
48 int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
49 if XNN_UNPREDICTABLE(mr <= 2) {
50 a2 = a1;
51 c2 = c1;
52 }
53
54 do {
55 int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
56 int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
57 int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
58 int32x4_t vacc0xCDEF = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
59 int32x4_t vacc1x0123 = vacc0x0123;
60 int32x4_t vacc1x4567 = vacc0x4567;
61 int32x4_t vacc1x89AB = vacc0x89AB;
62 int32x4_t vacc1xCDEF = vacc0xCDEF;
63 int32x4_t vacc2x0123 = vacc0x0123;
64 int32x4_t vacc2x4567 = vacc0x4567;
65 int32x4_t vacc2x89AB = vacc0x89AB;
66 int32x4_t vacc2xCDEF = vacc0xCDEF;
67
68 size_t k = kc;
69 while (k >= 8 * sizeof(int8_t)) {
70 const int8x8_t va0 = vld1_s8(a0); a0 += 8;
71 const int8x8_t va1 = vld1_s8(a1); a1 += 8;
72 const int8x8_t va2 = vld1_s8(a2); a2 += 8;
73
74 const int8x8_t vb01234567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
75
76 const int16x8_t vprod0x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va0, 0));
77 vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c0));
78 vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c0));
79 const int16x8_t vprod1x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va1, 0));
80 vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c0));
81 vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c0));
82 const int16x8_t vprod2x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va2, 0));
83 vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c0));
84 vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c0));
85 const int8x8_t vb89ABCDEFc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
86
87 const int16x8_t vprod0x89ABCDEFc0 = vmull_s8(vb89ABCDEFc0, vdup_lane_s8(va0, 0));
88 vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc0));
89 vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc0));
90 const int16x8_t vprod1x89ABCDEFc0 = vmull_s8(vb89ABCDEFc0, vdup_lane_s8(va1, 0));
91 vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc0));
92 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc0));
93 const int16x8_t vprod2x89ABCDEFc0 = vmull_s8(vb89ABCDEFc0, vdup_lane_s8(va2, 0));
94 vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc0));
95 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc0));
96 const int8x8_t vb01234567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
97
98 const int16x8_t vprod0x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va0, 1));
99 vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c1));
100 vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c1));
101 const int16x8_t vprod1x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va1, 1));
102 vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c1));
103 vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c1));
104 const int16x8_t vprod2x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va2, 1));
105 vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c1));
106 vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c1));
107 const int8x8_t vb89ABCDEFc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
108
109 const int16x8_t vprod0x89ABCDEFc1 = vmull_s8(vb89ABCDEFc1, vdup_lane_s8(va0, 1));
110 vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc1));
111 vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc1));
112 const int16x8_t vprod1x89ABCDEFc1 = vmull_s8(vb89ABCDEFc1, vdup_lane_s8(va1, 1));
113 vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc1));
114 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc1));
115 const int16x8_t vprod2x89ABCDEFc1 = vmull_s8(vb89ABCDEFc1, vdup_lane_s8(va2, 1));
116 vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc1));
117 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc1));
118 const int8x8_t vb01234567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
119
120 const int16x8_t vprod0x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va0, 2));
121 vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c2));
122 vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c2));
123 const int16x8_t vprod1x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va1, 2));
124 vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c2));
125 vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c2));
126 const int16x8_t vprod2x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va2, 2));
127 vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c2));
128 vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c2));
129 const int8x8_t vb89ABCDEFc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
130
131 const int16x8_t vprod0x89ABCDEFc2 = vmull_s8(vb89ABCDEFc2, vdup_lane_s8(va0, 2));
132 vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc2));
133 vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc2));
134 const int16x8_t vprod1x89ABCDEFc2 = vmull_s8(vb89ABCDEFc2, vdup_lane_s8(va1, 2));
135 vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc2));
136 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc2));
137 const int16x8_t vprod2x89ABCDEFc2 = vmull_s8(vb89ABCDEFc2, vdup_lane_s8(va2, 2));
138 vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc2));
139 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc2));
140 const int8x8_t vb01234567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
141
142 const int16x8_t vprod0x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va0, 3));
143 vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c3));
144 vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c3));
145 const int16x8_t vprod1x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va1, 3));
146 vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c3));
147 vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c3));
148 const int16x8_t vprod2x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va2, 3));
149 vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c3));
150 vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c3));
151 const int8x8_t vb89ABCDEFc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
152
153 const int16x8_t vprod0x89ABCDEFc3 = vmull_s8(vb89ABCDEFc3, vdup_lane_s8(va0, 3));
154 vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc3));
155 vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc3));
156 const int16x8_t vprod1x89ABCDEFc3 = vmull_s8(vb89ABCDEFc3, vdup_lane_s8(va1, 3));
157 vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc3));
158 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc3));
159 const int16x8_t vprod2x89ABCDEFc3 = vmull_s8(vb89ABCDEFc3, vdup_lane_s8(va2, 3));
160 vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc3));
161 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc3));
162 const int8x8_t vb01234567c4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
163
164 const int16x8_t vprod0x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va0, 4));
165 vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c4));
166 vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c4));
167 const int16x8_t vprod1x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va1, 4));
168 vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c4));
169 vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c4));
170 const int16x8_t vprod2x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va2, 4));
171 vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c4));
172 vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c4));
173 const int8x8_t vb89ABCDEFc4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
174
175 const int16x8_t vprod0x89ABCDEFc4 = vmull_s8(vb89ABCDEFc4, vdup_lane_s8(va0, 4));
176 vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc4));
177 vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc4));
178 const int16x8_t vprod1x89ABCDEFc4 = vmull_s8(vb89ABCDEFc4, vdup_lane_s8(va1, 4));
179 vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc4));
180 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc4));
181 const int16x8_t vprod2x89ABCDEFc4 = vmull_s8(vb89ABCDEFc4, vdup_lane_s8(va2, 4));
182 vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc4));
183 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc4));
184 const int8x8_t vb01234567c5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
185
186 const int16x8_t vprod0x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va0, 5));
187 vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c5));
188 vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c5));
189 const int16x8_t vprod1x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va1, 5));
190 vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c5));
191 vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c5));
192 const int16x8_t vprod2x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va2, 5));
193 vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c5));
194 vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c5));
195 const int8x8_t vb89ABCDEFc5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
196
197 const int16x8_t vprod0x89ABCDEFc5 = vmull_s8(vb89ABCDEFc5, vdup_lane_s8(va0, 5));
198 vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc5));
199 vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc5));
200 const int16x8_t vprod1x89ABCDEFc5 = vmull_s8(vb89ABCDEFc5, vdup_lane_s8(va1, 5));
201 vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc5));
202 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc5));
203 const int16x8_t vprod2x89ABCDEFc5 = vmull_s8(vb89ABCDEFc5, vdup_lane_s8(va2, 5));
204 vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc5));
205 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc5));
206 const int8x8_t vb01234567c6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
207
208 const int16x8_t vprod0x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va0, 6));
209 vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c6));
210 vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c6));
211 const int16x8_t vprod1x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va1, 6));
212 vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c6));
213 vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c6));
214 const int16x8_t vprod2x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va2, 6));
215 vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c6));
216 vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c6));
217 const int8x8_t vb89ABCDEFc6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
218
219 const int16x8_t vprod0x89ABCDEFc6 = vmull_s8(vb89ABCDEFc6, vdup_lane_s8(va0, 6));
220 vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc6));
221 vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc6));
222 const int16x8_t vprod1x89ABCDEFc6 = vmull_s8(vb89ABCDEFc6, vdup_lane_s8(va1, 6));
223 vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc6));
224 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc6));
225 const int16x8_t vprod2x89ABCDEFc6 = vmull_s8(vb89ABCDEFc6, vdup_lane_s8(va2, 6));
226 vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc6));
227 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc6));
228 const int8x8_t vb01234567c7 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
229
230 const int16x8_t vprod0x01234567c7 = vmull_s8(vb01234567c7, vdup_lane_s8(va0, 7));
231 vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c7));
232 vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c7));
233 const int16x8_t vprod1x01234567c7 = vmull_s8(vb01234567c7, vdup_lane_s8(va1, 7));
234 vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c7));
235 vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c7));
236 const int16x8_t vprod2x01234567c7 = vmull_s8(vb01234567c7, vdup_lane_s8(va2, 7));
237 vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c7));
238 vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c7));
239 const int8x8_t vb89ABCDEFc7 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
240
241 const int16x8_t vprod0x89ABCDEFc7 = vmull_s8(vb89ABCDEFc7, vdup_lane_s8(va0, 7));
242 vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc7));
243 vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc7));
244 const int16x8_t vprod1x89ABCDEFc7 = vmull_s8(vb89ABCDEFc7, vdup_lane_s8(va1, 7));
245 vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc7));
246 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc7));
247 const int16x8_t vprod2x89ABCDEFc7 = vmull_s8(vb89ABCDEFc7, vdup_lane_s8(va2, 7));
248 vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc7));
249 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc7));
250
251 k -= 8 * sizeof(int8_t);
252 }
253 if XNN_UNLIKELY(k != 0) {
254 const int8x8_t va0 = vld1_s8(a0); a0 = (const int8_t*) ((uintptr_t) a0 + k);
255 const int8x8_t va1 = vld1_s8(a1); a1 = (const int8_t*) ((uintptr_t) a1 + k);
256 const int8x8_t va2 = vld1_s8(a2); a2 = (const int8_t*) ((uintptr_t) a2 + k);
257
258 const int8x8_t vb01234567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
259 const int8x8_t vb89ABCDEFc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
260
261 const int16x8_t vprod0x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va0, 0));
262 vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c0));
263 vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c0));
264 const int16x8_t vprod0x89ABCDEFc0 = vmull_s8(vb89ABCDEFc0, vdup_lane_s8(va0, 0));
265 vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc0));
266 vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc0));
267 const int16x8_t vprod1x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va1, 0));
268 vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c0));
269 vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c0));
270 const int16x8_t vprod1x89ABCDEFc0 = vmull_s8(vb89ABCDEFc0, vdup_lane_s8(va1, 0));
271 vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc0));
272 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc0));
273 const int16x8_t vprod2x01234567c0 = vmull_s8(vb01234567c0, vdup_lane_s8(va2, 0));
274 vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c0));
275 vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c0));
276 const int16x8_t vprod2x89ABCDEFc0 = vmull_s8(vb89ABCDEFc0, vdup_lane_s8(va2, 0));
277 vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc0));
278 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc0));
279
280 if (k >= 2 * sizeof(int8_t)) {
281 const int8x8_t vb01234567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
282 const int8x8_t vb89ABCDEFc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
283
284 const int16x8_t vprod0x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va0, 1));
285 vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c1));
286 vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c1));
287 const int16x8_t vprod0x89ABCDEFc1 = vmull_s8(vb89ABCDEFc1, vdup_lane_s8(va0, 1));
288 vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc1));
289 vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc1));
290 const int16x8_t vprod1x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va1, 1));
291 vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c1));
292 vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c1));
293 const int16x8_t vprod1x89ABCDEFc1 = vmull_s8(vb89ABCDEFc1, vdup_lane_s8(va1, 1));
294 vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc1));
295 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc1));
296 const int16x8_t vprod2x01234567c1 = vmull_s8(vb01234567c1, vdup_lane_s8(va2, 1));
297 vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c1));
298 vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c1));
299 const int16x8_t vprod2x89ABCDEFc1 = vmull_s8(vb89ABCDEFc1, vdup_lane_s8(va2, 1));
300 vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc1));
301 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc1));
302
303 if (k > 2 * sizeof(int8_t)) {
304 const int8x8_t vb01234567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
305 const int8x8_t vb89ABCDEFc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
306
307 const int16x8_t vprod0x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va0, 2));
308 vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c2));
309 vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c2));
310 const int16x8_t vprod0x89ABCDEFc2 = vmull_s8(vb89ABCDEFc2, vdup_lane_s8(va0, 2));
311 vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc2));
312 vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc2));
313 const int16x8_t vprod1x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va1, 2));
314 vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c2));
315 vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c2));
316 const int16x8_t vprod1x89ABCDEFc2 = vmull_s8(vb89ABCDEFc2, vdup_lane_s8(va1, 2));
317 vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc2));
318 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc2));
319 const int16x8_t vprod2x01234567c2 = vmull_s8(vb01234567c2, vdup_lane_s8(va2, 2));
320 vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c2));
321 vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c2));
322 const int16x8_t vprod2x89ABCDEFc2 = vmull_s8(vb89ABCDEFc2, vdup_lane_s8(va2, 2));
323 vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc2));
324 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc2));
325
326 if (k >= 4 * sizeof(int8_t)) {
327 const int8x8_t vb01234567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
328 const int8x8_t vb89ABCDEFc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
329
330 const int16x8_t vprod0x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va0, 3));
331 vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c3));
332 vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c3));
333 const int16x8_t vprod0x89ABCDEFc3 = vmull_s8(vb89ABCDEFc3, vdup_lane_s8(va0, 3));
334 vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc3));
335 vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc3));
336 const int16x8_t vprod1x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va1, 3));
337 vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c3));
338 vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c3));
339 const int16x8_t vprod1x89ABCDEFc3 = vmull_s8(vb89ABCDEFc3, vdup_lane_s8(va1, 3));
340 vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc3));
341 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc3));
342 const int16x8_t vprod2x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va2, 3));
343 vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c3));
344 vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c3));
345 const int16x8_t vprod2x89ABCDEFc3 = vmull_s8(vb89ABCDEFc3, vdup_lane_s8(va2, 3));
346 vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc3));
347 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc3));
348
349 if (k > 4 * sizeof(int8_t)) {
350 const int8x8_t vb01234567c4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
351 const int8x8_t vb89ABCDEFc4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
352
353 const int16x8_t vprod0x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va0, 4));
354 vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c4));
355 vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c4));
356 const int16x8_t vprod0x89ABCDEFc4 = vmull_s8(vb89ABCDEFc4, vdup_lane_s8(va0, 4));
357 vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc4));
358 vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc4));
359 const int16x8_t vprod1x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va1, 4));
360 vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c4));
361 vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c4));
362 const int16x8_t vprod1x89ABCDEFc4 = vmull_s8(vb89ABCDEFc4, vdup_lane_s8(va1, 4));
363 vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc4));
364 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc4));
365 const int16x8_t vprod2x01234567c4 = vmull_s8(vb01234567c4, vdup_lane_s8(va2, 4));
366 vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c4));
367 vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c4));
368 const int16x8_t vprod2x89ABCDEFc4 = vmull_s8(vb89ABCDEFc4, vdup_lane_s8(va2, 4));
369 vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc4));
370 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc4));
371
372 if (k >= 6 * sizeof(int8_t)) {
373 const int8x8_t vb01234567c5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
374 const int8x8_t vb89ABCDEFc5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
375
376 const int16x8_t vprod0x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va0, 5));
377 vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c5));
378 vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c5));
379 const int16x8_t vprod0x89ABCDEFc5 = vmull_s8(vb89ABCDEFc5, vdup_lane_s8(va0, 5));
380 vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc5));
381 vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc5));
382 const int16x8_t vprod1x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va1, 5));
383 vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c5));
384 vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c5));
385 const int16x8_t vprod1x89ABCDEFc5 = vmull_s8(vb89ABCDEFc5, vdup_lane_s8(va1, 5));
386 vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc5));
387 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc5));
388 const int16x8_t vprod2x01234567c5 = vmull_s8(vb01234567c5, vdup_lane_s8(va2, 5));
389 vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c5));
390 vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c5));
391 const int16x8_t vprod2x89ABCDEFc5 = vmull_s8(vb89ABCDEFc5, vdup_lane_s8(va2, 5));
392 vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc5));
393 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc5));
394
395 if (k > 6 * sizeof(int8_t)) {
396 const int8x8_t vb01234567c6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
397 const int8x8_t vb89ABCDEFc6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
398
399 const int16x8_t vprod0x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va0, 6));
400 vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c6));
401 vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c6));
402 const int16x8_t vprod0x89ABCDEFc6 = vmull_s8(vb89ABCDEFc6, vdup_lane_s8(va0, 6));
403 vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc6));
404 vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc6));
405 const int16x8_t vprod1x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va1, 6));
406 vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c6));
407 vacc1x4567 = vaddw_s16(vacc1x4567, vget_high_s16(vprod1x01234567c6));
408 const int16x8_t vprod1x89ABCDEFc6 = vmull_s8(vb89ABCDEFc6, vdup_lane_s8(va1, 6));
409 vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc6));
410 vacc1xCDEF = vaddw_s16(vacc1xCDEF, vget_high_s16(vprod1x89ABCDEFc6));
411 const int16x8_t vprod2x01234567c6 = vmull_s8(vb01234567c6, vdup_lane_s8(va2, 6));
412 vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c6));
413 vacc2x4567 = vaddw_s16(vacc2x4567, vget_high_s16(vprod2x01234567c6));
414 const int16x8_t vprod2x89ABCDEFc6 = vmull_s8(vb89ABCDEFc6, vdup_lane_s8(va2, 6));
415 vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc6));
416 vacc2xCDEF = vaddw_s16(vacc2xCDEF, vget_high_s16(vprod2x89ABCDEFc6));
417 }
418 }
419 }
420 }
421 }
422 }
423 }
424
425 // Post-accumulation work
426 const int32x4_t vright_pre_shift = vld1q_dup_s32(¶ms->rndnu_neon.right_pre_shift);
427 const int32x4_t vmultiplier = vld1q_dup_s32(¶ms->rndnu_neon.multiplier);
428 const int32x4_t vright_post_shift = vld1q_dup_s32(¶ms->rndnu_neon.right_post_shift);
429
430 vacc0x0123 = vqshlq_s32(vacc0x0123, vright_pre_shift);
431 vacc0x4567 = vqshlq_s32(vacc0x4567, vright_pre_shift);
432 vacc0x89AB = vqshlq_s32(vacc0x89AB, vright_pre_shift);
433 vacc0xCDEF = vqshlq_s32(vacc0xCDEF, vright_pre_shift);
434 vacc1x0123 = vqshlq_s32(vacc1x0123, vright_pre_shift);
435 vacc1x4567 = vqshlq_s32(vacc1x4567, vright_pre_shift);
436 vacc1x89AB = vqshlq_s32(vacc1x89AB, vright_pre_shift);
437 vacc1xCDEF = vqshlq_s32(vacc1xCDEF, vright_pre_shift);
438 vacc2x0123 = vqshlq_s32(vacc2x0123, vright_pre_shift);
439 vacc2x4567 = vqshlq_s32(vacc2x4567, vright_pre_shift);
440 vacc2x89AB = vqshlq_s32(vacc2x89AB, vright_pre_shift);
441 vacc2xCDEF = vqshlq_s32(vacc2xCDEF, vright_pre_shift);
442
443 vacc0x0123 = vqdmulhq_s32(vacc0x0123, vmultiplier);
444 vacc0x4567 = vqdmulhq_s32(vacc0x4567, vmultiplier);
445 vacc0x89AB = vqdmulhq_s32(vacc0x89AB, vmultiplier);
446 vacc0xCDEF = vqdmulhq_s32(vacc0xCDEF, vmultiplier);
447 vacc1x0123 = vqdmulhq_s32(vacc1x0123, vmultiplier);
448 vacc1x4567 = vqdmulhq_s32(vacc1x4567, vmultiplier);
449 vacc1x89AB = vqdmulhq_s32(vacc1x89AB, vmultiplier);
450 vacc1xCDEF = vqdmulhq_s32(vacc1xCDEF, vmultiplier);
451 vacc2x0123 = vqdmulhq_s32(vacc2x0123, vmultiplier);
452 vacc2x4567 = vqdmulhq_s32(vacc2x4567, vmultiplier);
453 vacc2x89AB = vqdmulhq_s32(vacc2x89AB, vmultiplier);
454 vacc2xCDEF = vqdmulhq_s32(vacc2xCDEF, vmultiplier);
455
456 vacc0x0123 = vrshlq_s32(vacc0x0123, vright_post_shift);
457 vacc0x4567 = vrshlq_s32(vacc0x4567, vright_post_shift);
458 vacc0x89AB = vrshlq_s32(vacc0x89AB, vright_post_shift);
459 vacc0xCDEF = vrshlq_s32(vacc0xCDEF, vright_post_shift);
460 vacc1x0123 = vrshlq_s32(vacc1x0123, vright_post_shift);
461 vacc1x4567 = vrshlq_s32(vacc1x4567, vright_post_shift);
462 vacc1x89AB = vrshlq_s32(vacc1x89AB, vright_post_shift);
463 vacc1xCDEF = vrshlq_s32(vacc1xCDEF, vright_post_shift);
464 vacc2x0123 = vrshlq_s32(vacc2x0123, vright_post_shift);
465 vacc2x4567 = vrshlq_s32(vacc2x4567, vright_post_shift);
466 vacc2x89AB = vrshlq_s32(vacc2x89AB, vright_post_shift);
467 vacc2xCDEF = vrshlq_s32(vacc2xCDEF, vright_post_shift);
468
469 const int16x8_t voutput_zero_point = vld1q_dup_s16(¶ms->rndnu_neon.output_zero_point);
470 #if XNN_ARCH_ARM64
471 const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
472 const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x89AB), vacc0xCDEF), voutput_zero_point);
473 const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
474 const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x89AB), vacc1xCDEF), voutput_zero_point);
475 const int16x8_t vacc2x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567), voutput_zero_point);
476 const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x89AB), vacc2xCDEF), voutput_zero_point);
477
478 int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF);
479 int8x16_t vout1x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc1x01234567), vacc1x89ABCDEF);
480 int8x16_t vout2x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc2x01234567), vacc2x89ABCDEF);
481 #else
482 const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
483 const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x89AB), vqmovn_s32(vacc0xCDEF)), voutput_zero_point);
484 const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
485 const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x89AB), vqmovn_s32(vacc1xCDEF)), voutput_zero_point);
486 const int16x8_t vacc2x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x0123), vqmovn_s32(vacc2x4567)), voutput_zero_point);
487 const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x89AB), vqmovn_s32(vacc2xCDEF)), voutput_zero_point);
488
489 int8x16_t vout0x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc0x89ABCDEF));
490 int8x16_t vout1x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc1x01234567), vqmovn_s16(vacc1x89ABCDEF));
491 int8x16_t vout2x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc2x01234567), vqmovn_s16(vacc2x89ABCDEF));
492 #endif
493 const int8x16_t voutput_min = vld1q_dup_s8(¶ms->rndnu_neon.output_min);
494 const int8x16_t voutput_max = vld1q_dup_s8(¶ms->rndnu_neon.output_max);
495
496 vout0x0123456789ABCDEF = vmaxq_s8(vout0x0123456789ABCDEF, voutput_min);
497 vout1x0123456789ABCDEF = vmaxq_s8(vout1x0123456789ABCDEF, voutput_min);
498 vout2x0123456789ABCDEF = vmaxq_s8(vout2x0123456789ABCDEF, voutput_min);
499
500 vout0x0123456789ABCDEF = vminq_s8(vout0x0123456789ABCDEF, voutput_max);
501 vout1x0123456789ABCDEF = vminq_s8(vout1x0123456789ABCDEF, voutput_max);
502 vout2x0123456789ABCDEF = vminq_s8(vout2x0123456789ABCDEF, voutput_max);
503
504 if (nc >= 16) {
505 // Main case where there the 16 columns fit in the destination.
506 vst1q_s8(c0 + 0, vout0x0123456789ABCDEF);
507 vst1q_s8(c1 + 0, vout1x0123456789ABCDEF);
508 vst1q_s8(c2 + 0, vout2x0123456789ABCDEF);
509
510 // Advance to the next 16 columns.
511 c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
512 c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
513 c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
514
515 a0 = (const int8_t*) ((uintptr_t) a0 - kc);
516 a1 = (const int8_t*) ((uintptr_t) a1 - kc);
517 a2 = (const int8_t*) ((uintptr_t) a2 - kc);
518
519 nc -= 16;
520 } else {
521 // Final case where not all of the 16 columns fit in the destination.
522 int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vget_low_s8(vout0x0123456789ABCDEF), vget_low_s8(vout1x0123456789ABCDEF));
523 int8x8_t vout2x01234567 = vget_low_s8(vout2x0123456789ABCDEF);
524 if (nc & 8) {
525 vst1_s8(c0, vget_low_s8(vout0x01234567_1x01234567)); c0 += 8;
526 vst1_s8(c1, vget_high_s8(vout0x01234567_1x01234567)); c1 += 8;
527 vst1_s8(c2, vout2x01234567); c2 += 8;
528 vout0x01234567_1x01234567 = vcombine_s8(vget_high_s8(vout0x0123456789ABCDEF), vget_high_s8(vout1x0123456789ABCDEF));
529 vout2x01234567 = vget_high_s8(vout2x0123456789ABCDEF);
530 }
531 if (nc & 4) {
532 vst1q_lane_u32((void*) c0, vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
533 vst1q_lane_u32((void*) c1, vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
534 vst1_lane_u32((void*) c2, vreinterpret_u32_s8(vout2x01234567), 0); c2 += 4;
535 vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
536 vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 4);
537 }
538 if (nc & 2) {
539 vst1q_lane_u16((void*) c0, vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
540 vst1q_lane_u16((void*) c1, vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
541 vst1_lane_u16((void*) c2, vreinterpret_u16_s8(vout2x01234567), 0); c2 += 2;
542 vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
543 vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 2);
544 }
545 if (nc & 1) {
546 vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
547 vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
548 vst1_lane_s8(c2, vout2x01234567, 0);
549 }
550
551 nc = 0;
552 }
553 } while (nc != 0);
554 }
555