1 /*
2 * Copyright (c) Facebook, Inc. and its affiliates.
3 * All rights reserved.
4 *
5 * This source code is licensed under the BSD-style license found in the
6 * LICENSE file in the root directory of this source tree.
7 */
8
9 #include <arm_neon.h>
10
11 #include <qnnpack/q8conv.h>
12 #include <requantization/runtime-neon.h>
13
void pytorch_q8conv_ukernel_8x8__neon(
15 size_t mr,
16 size_t nr,
17 size_t kc,
18 size_t ks,
19 const uint8_t** restrict a,
20 const void* restrict w,
21 uint8_t* restrict c,
22 size_t c_stride,
23 size_t output_channel_index,
24 const union pytorch_qnnp_conv_quantization_params
25 quantization_params[restrict static 1]) {
26 const uint8x8_t va_zero_point =
27 vld1_dup_u8((const uint8_t*)&quantization_params->neon.input_zero_point);
28 const uint8x8_t vb_zero_point =
29 vld1_u8((const uint8_t*)&quantization_params->neon.kernel_zero_points
30 [output_channel_index]);
31
32 int32x4_t vacc0x0123 = vld1q_s32(w);
33 w = (void*)((uintptr_t)w + sizeof(int32x4_t));
34 int32x4_t vacc0x4567 = vld1q_s32(w);
35 w = (void*)((uintptr_t)w + sizeof(int32x4_t));
36 int32x4_t vacc1x0123 = vacc0x0123;
37 int32x4_t vacc1x4567 = vacc0x4567;
38 int32x4_t vacc2x0123 = vacc0x0123;
39 int32x4_t vacc2x4567 = vacc0x4567;
40 int32x4_t vacc3x0123 = vacc0x0123;
41 int32x4_t vacc3x4567 = vacc0x4567;
42 int32x4_t vacc4x0123 = vacc0x0123;
43 int32x4_t vacc4x4567 = vacc0x4567;
44 int32x4_t vacc5x0123 = vacc0x0123;
45 int32x4_t vacc5x4567 = vacc0x4567;
46 int32x4_t vacc6x0123 = vacc0x0123;
47 int32x4_t vacc6x4567 = vacc0x4567;
48 int32x4_t vacc7x0123 = vacc0x0123;
49 int32x4_t vacc7x4567 = vacc0x4567;
50
51 do {
52 const uint8_t* restrict a0 = *a++;
53 const uint8_t* restrict a1 = *a++;
54 const uint8_t* restrict a2 = *a++;
55 const uint8_t* restrict a3 = *a++;
56 const uint8_t* restrict a4 = *a++;
57 const uint8_t* restrict a5 = *a++;
58 const uint8_t* restrict a6 = *a++;
59 const uint8_t* restrict a7 = *a++;
60
61 size_t k = kc;
62 for (; k >= 8; k -= 8) {
63 const uint8x8_t va0 = vld1_u8(a0);
64 a0 += 8;
65 const uint8x8_t va1 = vld1_u8(a1);
66 a1 += 8;
67 const uint8x8_t va2 = vld1_u8(a2);
68 a2 += 8;
69 const uint8x8_t va3 = vld1_u8(a3);
70 a3 += 8;
71 const uint8x8_t va4 = vld1_u8(a4);
72 a4 += 8;
73 const uint8x8_t va5 = vld1_u8(a5);
74 a5 += 8;
75 const uint8x8_t va6 = vld1_u8(a6);
76 a6 += 8;
77 const uint8x8_t va7 = vld1_u8(a7);
78 a7 += 8;
79 const int16x8_t vxa0 =
80 vreinterpretq_s16_u16(sub_zero_point(va0, va_zero_point));
81 const int16x8_t vxa1 =
82 vreinterpretq_s16_u16(sub_zero_point(va1, va_zero_point));
83 const int16x8_t vxa2 =
84 vreinterpretq_s16_u16(sub_zero_point(va2, va_zero_point));
85 const int16x8_t vxa3 =
86 vreinterpretq_s16_u16(sub_zero_point(va3, va_zero_point));
87 const int16x8_t vxa4 =
88 vreinterpretq_s16_u16(sub_zero_point(va4, va_zero_point));
89 const int16x8_t vxa5 =
90 vreinterpretq_s16_u16(sub_zero_point(va5, va_zero_point));
91 const int16x8_t vxa6 =
92 vreinterpretq_s16_u16(sub_zero_point(va6, va_zero_point));
93 const int16x8_t vxa7 =
94 vreinterpretq_s16_u16(sub_zero_point(va7, va_zero_point));
95
96 {
97 const uint8x8_t vb01234567 = vld1_u8(w);
98 w = (void*)((uintptr_t)w + sizeof(uint8x8_t));
99 const int16x8_t vxb01234567 =
100 vreinterpretq_s16_u16(vsubl_u8(vb01234567, vb_zero_point));
101
102 vacc0x0123 = vmlal_lane_s16(
103 vacc0x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa0), 0);
104 vacc0x4567 = vmlal_lane_s16(
105 vacc0x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa0), 0);
106 vacc1x0123 = vmlal_lane_s16(
107 vacc1x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa1), 0);
108 vacc1x4567 = vmlal_lane_s16(
109 vacc1x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa1), 0);
110 vacc2x0123 = vmlal_lane_s16(
111 vacc2x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa2), 0);
112 vacc2x4567 = vmlal_lane_s16(
113 vacc2x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa2), 0);
114 vacc3x0123 = vmlal_lane_s16(
115 vacc3x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa3), 0);
116 vacc3x4567 = vmlal_lane_s16(
117 vacc3x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa3), 0);
118 vacc4x0123 = vmlal_lane_s16(
119 vacc4x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa4), 0);
120 vacc4x4567 = vmlal_lane_s16(
121 vacc4x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa4), 0);
122 vacc5x0123 = vmlal_lane_s16(
123 vacc5x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa5), 0);
124 vacc5x4567 = vmlal_lane_s16(
125 vacc5x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa5), 0);
126 vacc6x0123 = vmlal_lane_s16(
127 vacc6x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa6), 0);
128 vacc6x4567 = vmlal_lane_s16(
129 vacc6x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa6), 0);
130 vacc7x0123 = vmlal_lane_s16(
131 vacc7x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa7), 0);
132 vacc7x4567 = vmlal_lane_s16(
133 vacc7x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa7), 0);
134 }
135
136 {
137 const uint8x8_t vb01234567 = vld1_u8(w);
138 w = (void*)((uintptr_t)w + sizeof(uint8x8_t));
139 const int16x8_t vxb01234567 =
140 vreinterpretq_s16_u16(vsubl_u8(vb01234567, vb_zero_point));
141
142 vacc0x0123 = vmlal_lane_s16(
143 vacc0x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa0), 1);
144 vacc0x4567 = vmlal_lane_s16(
145 vacc0x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa0), 1);
146 vacc1x0123 = vmlal_lane_s16(
147 vacc1x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa1), 1);
148 vacc1x4567 = vmlal_lane_s16(
149 vacc1x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa1), 1);
150 vacc2x0123 = vmlal_lane_s16(
151 vacc2x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa2), 1);
152 vacc2x4567 = vmlal_lane_s16(
153 vacc2x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa2), 1);
154 vacc3x0123 = vmlal_lane_s16(
155 vacc3x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa3), 1);
156 vacc3x4567 = vmlal_lane_s16(
157 vacc3x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa3), 1);
158 vacc4x0123 = vmlal_lane_s16(
159 vacc4x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa4), 1);
160 vacc4x4567 = vmlal_lane_s16(
161 vacc4x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa4), 1);
162 vacc5x0123 = vmlal_lane_s16(
163 vacc5x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa5), 1);
164 vacc5x4567 = vmlal_lane_s16(
165 vacc5x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa5), 1);
166 vacc6x0123 = vmlal_lane_s16(
167 vacc6x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa6), 1);
168 vacc6x4567 = vmlal_lane_s16(
169 vacc6x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa6), 1);
170 vacc7x0123 = vmlal_lane_s16(
171 vacc7x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa7), 1);
172 vacc7x4567 = vmlal_lane_s16(
173 vacc7x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa7), 1);
174 }
175
176 {
177 const uint8x8_t vb01234567 = vld1_u8(w);
178 w = (void*)((uintptr_t)w + sizeof(uint8x8_t));
179 const int16x8_t vxb01234567 =
180 vreinterpretq_s16_u16(vsubl_u8(vb01234567, vb_zero_point));
181
182 vacc0x0123 = vmlal_lane_s16(
183 vacc0x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa0), 2);
184 vacc0x4567 = vmlal_lane_s16(
185 vacc0x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa0), 2);
186 vacc1x0123 = vmlal_lane_s16(
187 vacc1x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa1), 2);
188 vacc1x4567 = vmlal_lane_s16(
189 vacc1x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa1), 2);
190 vacc2x0123 = vmlal_lane_s16(
191 vacc2x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa2), 2);
192 vacc2x4567 = vmlal_lane_s16(
193 vacc2x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa2), 2);
194 vacc3x0123 = vmlal_lane_s16(
195 vacc3x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa3), 2);
196 vacc3x4567 = vmlal_lane_s16(
197 vacc3x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa3), 2);
198 vacc4x0123 = vmlal_lane_s16(
199 vacc4x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa4), 2);
200 vacc4x4567 = vmlal_lane_s16(
201 vacc4x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa4), 2);
202 vacc5x0123 = vmlal_lane_s16(
203 vacc5x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa5), 2);
204 vacc5x4567 = vmlal_lane_s16(
205 vacc5x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa5), 2);
206 vacc6x0123 = vmlal_lane_s16(
207 vacc6x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa6), 2);
208 vacc6x4567 = vmlal_lane_s16(
209 vacc6x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa6), 2);
210 vacc7x0123 = vmlal_lane_s16(
211 vacc7x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa7), 2);
212 vacc7x4567 = vmlal_lane_s16(
213 vacc7x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa7), 2);
214 }
215
216 {
217 const uint8x8_t vb01234567 = vld1_u8(w);
218 w = (void*)((uintptr_t)w + sizeof(uint8x8_t));
219 const int16x8_t vxb01234567 =
220 vreinterpretq_s16_u16(vsubl_u8(vb01234567, vb_zero_point));
221
222 vacc0x0123 = vmlal_lane_s16(
223 vacc0x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa0), 3);
224 vacc0x4567 = vmlal_lane_s16(
225 vacc0x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa0), 3);
226 vacc1x0123 = vmlal_lane_s16(
227 vacc1x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa1), 3);
228 vacc1x4567 = vmlal_lane_s16(
229 vacc1x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa1), 3);
230 vacc2x0123 = vmlal_lane_s16(
231 vacc2x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa2), 3);
232 vacc2x4567 = vmlal_lane_s16(
233 vacc2x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa2), 3);
234 vacc3x0123 = vmlal_lane_s16(
235 vacc3x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa3), 3);
236 vacc3x4567 = vmlal_lane_s16(
237 vacc3x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa3), 3);
238 vacc4x0123 = vmlal_lane_s16(
239 vacc4x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa4), 3);
240 vacc4x4567 = vmlal_lane_s16(
241 vacc4x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa4), 3);
242 vacc5x0123 = vmlal_lane_s16(
243 vacc5x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa5), 3);
244 vacc5x4567 = vmlal_lane_s16(
245 vacc5x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa5), 3);
246 vacc6x0123 = vmlal_lane_s16(
247 vacc6x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa6), 3);
248 vacc6x4567 = vmlal_lane_s16(
249 vacc6x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa6), 3);
250 vacc7x0123 = vmlal_lane_s16(
251 vacc7x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa7), 3);
252 vacc7x4567 = vmlal_lane_s16(
253 vacc7x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa7), 3);
254 }
255
256 {
257 const uint8x8_t vb01234567 = vld1_u8(w);
258 w = (void*)((uintptr_t)w + sizeof(uint8x8_t));
259 const int16x8_t vxb01234567 =
260 vreinterpretq_s16_u16(vsubl_u8(vb01234567, vb_zero_point));
261
262 vacc0x0123 = vmlal_lane_s16(
263 vacc0x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa0), 0);
264 vacc0x4567 = vmlal_lane_s16(
265 vacc0x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa0), 0);
266 vacc1x0123 = vmlal_lane_s16(
267 vacc1x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa1), 0);
268 vacc1x4567 = vmlal_lane_s16(
269 vacc1x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa1), 0);
270 vacc2x0123 = vmlal_lane_s16(
271 vacc2x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa2), 0);
272 vacc2x4567 = vmlal_lane_s16(
273 vacc2x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa2), 0);
274 vacc3x0123 = vmlal_lane_s16(
275 vacc3x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa3), 0);
276 vacc3x4567 = vmlal_lane_s16(
277 vacc3x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa3), 0);
278 vacc4x0123 = vmlal_lane_s16(
279 vacc4x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa4), 0);
280 vacc4x4567 = vmlal_lane_s16(
281 vacc4x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa4), 0);
282 vacc5x0123 = vmlal_lane_s16(
283 vacc5x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa5), 0);
284 vacc5x4567 = vmlal_lane_s16(
285 vacc5x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa5), 0);
286 vacc6x0123 = vmlal_lane_s16(
287 vacc6x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa6), 0);
288 vacc6x4567 = vmlal_lane_s16(
289 vacc6x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa6), 0);
290 vacc7x0123 = vmlal_lane_s16(
291 vacc7x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa7), 0);
292 vacc7x4567 = vmlal_lane_s16(
293 vacc7x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa7), 0);
294 }
295
296 {
297 const uint8x8_t vb01234567 = vld1_u8(w);
298 w = (void*)((uintptr_t)w + sizeof(uint8x8_t));
299 const int16x8_t vxb01234567 =
300 vreinterpretq_s16_u16(vsubl_u8(vb01234567, vb_zero_point));
301
302 vacc0x0123 = vmlal_lane_s16(
303 vacc0x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa0), 1);
304 vacc0x4567 = vmlal_lane_s16(
305 vacc0x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa0), 1);
306 vacc1x0123 = vmlal_lane_s16(
307 vacc1x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa1), 1);
308 vacc1x4567 = vmlal_lane_s16(
309 vacc1x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa1), 1);
310 vacc2x0123 = vmlal_lane_s16(
311 vacc2x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa2), 1);
312 vacc2x4567 = vmlal_lane_s16(
313 vacc2x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa2), 1);
314 vacc3x0123 = vmlal_lane_s16(
315 vacc3x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa3), 1);
316 vacc3x4567 = vmlal_lane_s16(
317 vacc3x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa3), 1);
318 vacc4x0123 = vmlal_lane_s16(
319 vacc4x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa4), 1);
320 vacc4x4567 = vmlal_lane_s16(
321 vacc4x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa4), 1);
322 vacc5x0123 = vmlal_lane_s16(
323 vacc5x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa5), 1);
324 vacc5x4567 = vmlal_lane_s16(
325 vacc5x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa5), 1);
326 vacc6x0123 = vmlal_lane_s16(
327 vacc6x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa6), 1);
328 vacc6x4567 = vmlal_lane_s16(
329 vacc6x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa6), 1);
330 vacc7x0123 = vmlal_lane_s16(
331 vacc7x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa7), 1);
332 vacc7x4567 = vmlal_lane_s16(
333 vacc7x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa7), 1);
334 }
335
336 {
337 const uint8x8_t vb01234567 = vld1_u8(w);
338 w = (void*)((uintptr_t)w + sizeof(uint8x8_t));
339 const int16x8_t vxb01234567 =
340 vreinterpretq_s16_u16(vsubl_u8(vb01234567, vb_zero_point));
341
342 vacc0x0123 = vmlal_lane_s16(
343 vacc0x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa0), 2);
344 vacc0x4567 = vmlal_lane_s16(
345 vacc0x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa0), 2);
346 vacc1x0123 = vmlal_lane_s16(
347 vacc1x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa1), 2);
348 vacc1x4567 = vmlal_lane_s16(
349 vacc1x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa1), 2);
350 vacc2x0123 = vmlal_lane_s16(
351 vacc2x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa2), 2);
352 vacc2x4567 = vmlal_lane_s16(
353 vacc2x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa2), 2);
354 vacc3x0123 = vmlal_lane_s16(
355 vacc3x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa3), 2);
356 vacc3x4567 = vmlal_lane_s16(
357 vacc3x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa3), 2);
358 vacc4x0123 = vmlal_lane_s16(
359 vacc4x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa4), 2);
360 vacc4x4567 = vmlal_lane_s16(
361 vacc4x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa4), 2);
362 vacc5x0123 = vmlal_lane_s16(
363 vacc5x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa5), 2);
364 vacc5x4567 = vmlal_lane_s16(
365 vacc5x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa5), 2);
366 vacc6x0123 = vmlal_lane_s16(
367 vacc6x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa6), 2);
368 vacc6x4567 = vmlal_lane_s16(
369 vacc6x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa6), 2);
370 vacc7x0123 = vmlal_lane_s16(
371 vacc7x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa7), 2);
372 vacc7x4567 = vmlal_lane_s16(
373 vacc7x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa7), 2);
374 }
375
376 {
377 const uint8x8_t vb01234567 = vld1_u8(w);
378 w = (void*)((uintptr_t)w + sizeof(uint8x8_t));
379 const int16x8_t vxb01234567 =
380 vreinterpretq_s16_u16(vsubl_u8(vb01234567, vb_zero_point));
381
382 vacc0x0123 = vmlal_lane_s16(
383 vacc0x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa0), 3);
384 vacc0x4567 = vmlal_lane_s16(
385 vacc0x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa0), 3);
386 vacc1x0123 = vmlal_lane_s16(
387 vacc1x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa1), 3);
388 vacc1x4567 = vmlal_lane_s16(
389 vacc1x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa1), 3);
390 vacc2x0123 = vmlal_lane_s16(
391 vacc2x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa2), 3);
392 vacc2x4567 = vmlal_lane_s16(
393 vacc2x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa2), 3);
394 vacc3x0123 = vmlal_lane_s16(
395 vacc3x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa3), 3);
396 vacc3x4567 = vmlal_lane_s16(
397 vacc3x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa3), 3);
398 vacc4x0123 = vmlal_lane_s16(
399 vacc4x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa4), 3);
400 vacc4x4567 = vmlal_lane_s16(
401 vacc4x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa4), 3);
402 vacc5x0123 = vmlal_lane_s16(
403 vacc5x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa5), 3);
404 vacc5x4567 = vmlal_lane_s16(
405 vacc5x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa5), 3);
406 vacc6x0123 = vmlal_lane_s16(
407 vacc6x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa6), 3);
408 vacc6x4567 = vmlal_lane_s16(
409 vacc6x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa6), 3);
410 vacc7x0123 = vmlal_lane_s16(
411 vacc7x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa7), 3);
412 vacc7x4567 = vmlal_lane_s16(
413 vacc7x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa7), 3);
414 }
415 }
416 if (k != 0) {
417 const size_t a_predecrement = 8 - k;
418 const int64x1_t va_shift = vmov_n_s64(-8 * a_predecrement);
419 const uint8x8_t va0 = vreinterpret_u8_u64(vshl_u64(
420 vreinterpret_u64_u8(vld1_u8(a0 - a_predecrement)), va_shift));
421 const uint8x8_t va1 = vreinterpret_u8_u64(vshl_u64(
422 vreinterpret_u64_u8(vld1_u8(a1 - a_predecrement)), va_shift));
423 const uint8x8_t va2 = vreinterpret_u8_u64(vshl_u64(
424 vreinterpret_u64_u8(vld1_u8(a2 - a_predecrement)), va_shift));
425 const uint8x8_t va3 = vreinterpret_u8_u64(vshl_u64(
426 vreinterpret_u64_u8(vld1_u8(a3 - a_predecrement)), va_shift));
427 const uint8x8_t va4 = vreinterpret_u8_u64(vshl_u64(
428 vreinterpret_u64_u8(vld1_u8(a4 - a_predecrement)), va_shift));
429 const uint8x8_t va5 = vreinterpret_u8_u64(vshl_u64(
430 vreinterpret_u64_u8(vld1_u8(a5 - a_predecrement)), va_shift));
431 const uint8x8_t va6 = vreinterpret_u8_u64(vshl_u64(
432 vreinterpret_u64_u8(vld1_u8(a6 - a_predecrement)), va_shift));
433 const uint8x8_t va7 = vreinterpret_u8_u64(vshl_u64(
434 vreinterpret_u64_u8(vld1_u8(a7 - a_predecrement)), va_shift));
435 const int16x8_t vxa0 =
436 vreinterpretq_s16_u16(sub_zero_point(va0, va_zero_point));
437 const int16x8_t vxa1 =
438 vreinterpretq_s16_u16(sub_zero_point(va1, va_zero_point));
439 const int16x8_t vxa2 =
440 vreinterpretq_s16_u16(sub_zero_point(va2, va_zero_point));
441 const int16x8_t vxa3 =
442 vreinterpretq_s16_u16(sub_zero_point(va3, va_zero_point));
443 const int16x8_t vxa4 =
444 vreinterpretq_s16_u16(sub_zero_point(va4, va_zero_point));
445 const int16x8_t vxa5 =
446 vreinterpretq_s16_u16(sub_zero_point(va5, va_zero_point));
447 const int16x8_t vxa6 =
448 vreinterpretq_s16_u16(sub_zero_point(va6, va_zero_point));
449 const int16x8_t vxa7 =
450 vreinterpretq_s16_u16(sub_zero_point(va7, va_zero_point));
451
452 {
453 const uint8x8_t vb01234567 = vld1_u8(w);
454 w = (void*)((uintptr_t)w + sizeof(uint8x8_t));
455 const int16x8_t vxb01234567 =
456 vreinterpretq_s16_u16(vsubl_u8(vb01234567, vb_zero_point));
457
458 vacc0x0123 = vmlal_lane_s16(
459 vacc0x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa0), 0);
460 vacc0x4567 = vmlal_lane_s16(
461 vacc0x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa0), 0);
462 vacc1x0123 = vmlal_lane_s16(
463 vacc1x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa1), 0);
464 vacc1x4567 = vmlal_lane_s16(
465 vacc1x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa1), 0);
466 vacc2x0123 = vmlal_lane_s16(
467 vacc2x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa2), 0);
468 vacc2x4567 = vmlal_lane_s16(
469 vacc2x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa2), 0);
470 vacc3x0123 = vmlal_lane_s16(
471 vacc3x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa3), 0);
472 vacc3x4567 = vmlal_lane_s16(
473 vacc3x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa3), 0);
474 vacc4x0123 = vmlal_lane_s16(
475 vacc4x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa4), 0);
476 vacc4x4567 = vmlal_lane_s16(
477 vacc4x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa4), 0);
478 vacc5x0123 = vmlal_lane_s16(
479 vacc5x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa5), 0);
480 vacc5x4567 = vmlal_lane_s16(
481 vacc5x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa5), 0);
482 vacc6x0123 = vmlal_lane_s16(
483 vacc6x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa6), 0);
484 vacc6x4567 = vmlal_lane_s16(
485 vacc6x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa6), 0);
486 vacc7x0123 = vmlal_lane_s16(
487 vacc7x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa7), 0);
488 vacc7x4567 = vmlal_lane_s16(
489 vacc7x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa7), 0);
490 }
491
492 if (k >= 2) {
493 const uint8x8_t vb01234567 = vld1_u8(w);
494 w = (void*)((uintptr_t)w + sizeof(uint8x8_t));
495 const int16x8_t vxb01234567 =
496 vreinterpretq_s16_u16(vsubl_u8(vb01234567, vb_zero_point));
497
498 vacc0x0123 = vmlal_lane_s16(
499 vacc0x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa0), 1);
500 vacc0x4567 = vmlal_lane_s16(
501 vacc0x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa0), 1);
502 vacc1x0123 = vmlal_lane_s16(
503 vacc1x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa1), 1);
504 vacc1x4567 = vmlal_lane_s16(
505 vacc1x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa1), 1);
506 vacc2x0123 = vmlal_lane_s16(
507 vacc2x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa2), 1);
508 vacc2x4567 = vmlal_lane_s16(
509 vacc2x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa2), 1);
510 vacc3x0123 = vmlal_lane_s16(
511 vacc3x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa3), 1);
512 vacc3x4567 = vmlal_lane_s16(
513 vacc3x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa3), 1);
514 vacc4x0123 = vmlal_lane_s16(
515 vacc4x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa4), 1);
516 vacc4x4567 = vmlal_lane_s16(
517 vacc4x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa4), 1);
518 vacc5x0123 = vmlal_lane_s16(
519 vacc5x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa5), 1);
520 vacc5x4567 = vmlal_lane_s16(
521 vacc5x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa5), 1);
522 vacc6x0123 = vmlal_lane_s16(
523 vacc6x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa6), 1);
524 vacc6x4567 = vmlal_lane_s16(
525 vacc6x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa6), 1);
526 vacc7x0123 = vmlal_lane_s16(
527 vacc7x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa7), 1);
528 vacc7x4567 = vmlal_lane_s16(
529 vacc7x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa7), 1);
530
531 if (k > 2) {
532 const uint8x8_t vb01234567 = vld1_u8(w);
533 w = (void*)((uintptr_t)w + sizeof(uint8x8_t));
534 const int16x8_t vxb01234567 =
535 vreinterpretq_s16_u16(vsubl_u8(vb01234567, vb_zero_point));
536
537 vacc0x0123 = vmlal_lane_s16(
538 vacc0x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa0), 2);
539 vacc0x4567 = vmlal_lane_s16(
540 vacc0x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa0), 2);
541 vacc1x0123 = vmlal_lane_s16(
542 vacc1x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa1), 2);
543 vacc1x4567 = vmlal_lane_s16(
544 vacc1x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa1), 2);
545 vacc2x0123 = vmlal_lane_s16(
546 vacc2x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa2), 2);
547 vacc2x4567 = vmlal_lane_s16(
548 vacc2x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa2), 2);
549 vacc3x0123 = vmlal_lane_s16(
550 vacc3x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa3), 2);
551 vacc3x4567 = vmlal_lane_s16(
552 vacc3x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa3), 2);
553 vacc4x0123 = vmlal_lane_s16(
554 vacc4x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa4), 2);
555 vacc4x4567 = vmlal_lane_s16(
556 vacc4x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa4), 2);
557 vacc5x0123 = vmlal_lane_s16(
558 vacc5x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa5), 2);
559 vacc5x4567 = vmlal_lane_s16(
560 vacc5x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa5), 2);
561 vacc6x0123 = vmlal_lane_s16(
562 vacc6x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa6), 2);
563 vacc6x4567 = vmlal_lane_s16(
564 vacc6x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa6), 2);
565 vacc7x0123 = vmlal_lane_s16(
566 vacc7x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa7), 2);
567 vacc7x4567 = vmlal_lane_s16(
568 vacc7x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa7), 2);
569
570 if (k >= 4) {
571 const uint8x8_t vb01234567 = vld1_u8(w);
572 w = (void*)((uintptr_t)w + sizeof(uint8x8_t));
573 const int16x8_t vxb01234567 =
574 vreinterpretq_s16_u16(vsubl_u8(vb01234567, vb_zero_point));
575
576 vacc0x0123 = vmlal_lane_s16(
577 vacc0x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa0), 3);
578 vacc0x4567 = vmlal_lane_s16(
579 vacc0x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa0), 3);
580 vacc1x0123 = vmlal_lane_s16(
581 vacc1x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa1), 3);
582 vacc1x4567 = vmlal_lane_s16(
583 vacc1x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa1), 3);
584 vacc2x0123 = vmlal_lane_s16(
585 vacc2x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa2), 3);
586 vacc2x4567 = vmlal_lane_s16(
587 vacc2x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa2), 3);
588 vacc3x0123 = vmlal_lane_s16(
589 vacc3x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa3), 3);
590 vacc3x4567 = vmlal_lane_s16(
591 vacc3x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa3), 3);
592 vacc4x0123 = vmlal_lane_s16(
593 vacc4x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa4), 3);
594 vacc4x4567 = vmlal_lane_s16(
595 vacc4x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa4), 3);
596 vacc5x0123 = vmlal_lane_s16(
597 vacc5x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa5), 3);
598 vacc5x4567 = vmlal_lane_s16(
599 vacc5x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa5), 3);
600 vacc6x0123 = vmlal_lane_s16(
601 vacc6x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa6), 3);
602 vacc6x4567 = vmlal_lane_s16(
603 vacc6x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa6), 3);
604 vacc7x0123 = vmlal_lane_s16(
605 vacc7x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa7), 3);
606 vacc7x4567 = vmlal_lane_s16(
607 vacc7x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa7), 3);
608
609 if (k > 4) {
610 const uint8x8_t vb01234567 = vld1_u8(w);
611 w = (void*)((uintptr_t)w + sizeof(uint8x8_t));
612 const int16x8_t vxb01234567 =
613 vreinterpretq_s16_u16(vsubl_u8(vb01234567, vb_zero_point));
614
615 vacc0x0123 = vmlal_lane_s16(
616 vacc0x0123,
617 vget_low_s16(vxb01234567),
618 vget_high_s16(vxa0),
619 0);
620 vacc0x4567 = vmlal_lane_s16(
621 vacc0x4567,
622 vget_high_s16(vxb01234567),
623 vget_high_s16(vxa0),
624 0);
625 vacc1x0123 = vmlal_lane_s16(
626 vacc1x0123,
627 vget_low_s16(vxb01234567),
628 vget_high_s16(vxa1),
629 0);
630 vacc1x4567 = vmlal_lane_s16(
631 vacc1x4567,
632 vget_high_s16(vxb01234567),
633 vget_high_s16(vxa1),
634 0);
635 vacc2x0123 = vmlal_lane_s16(
636 vacc2x0123,
637 vget_low_s16(vxb01234567),
638 vget_high_s16(vxa2),
639 0);
640 vacc2x4567 = vmlal_lane_s16(
641 vacc2x4567,
642 vget_high_s16(vxb01234567),
643 vget_high_s16(vxa2),
644 0);
645 vacc3x0123 = vmlal_lane_s16(
646 vacc3x0123,
647 vget_low_s16(vxb01234567),
648 vget_high_s16(vxa3),
649 0);
650 vacc3x4567 = vmlal_lane_s16(
651 vacc3x4567,
652 vget_high_s16(vxb01234567),
653 vget_high_s16(vxa3),
654 0);
655 vacc4x0123 = vmlal_lane_s16(
656 vacc4x0123,
657 vget_low_s16(vxb01234567),
658 vget_high_s16(vxa4),
659 0);
660 vacc4x4567 = vmlal_lane_s16(
661 vacc4x4567,
662 vget_high_s16(vxb01234567),
663 vget_high_s16(vxa4),
664 0);
665 vacc5x0123 = vmlal_lane_s16(
666 vacc5x0123,
667 vget_low_s16(vxb01234567),
668 vget_high_s16(vxa5),
669 0);
670 vacc5x4567 = vmlal_lane_s16(
671 vacc5x4567,
672 vget_high_s16(vxb01234567),
673 vget_high_s16(vxa5),
674 0);
675 vacc6x0123 = vmlal_lane_s16(
676 vacc6x0123,
677 vget_low_s16(vxb01234567),
678 vget_high_s16(vxa6),
679 0);
680 vacc6x4567 = vmlal_lane_s16(
681 vacc6x4567,
682 vget_high_s16(vxb01234567),
683 vget_high_s16(vxa6),
684 0);
685 vacc7x0123 = vmlal_lane_s16(
686 vacc7x0123,
687 vget_low_s16(vxb01234567),
688 vget_high_s16(vxa7),
689 0);
690 vacc7x4567 = vmlal_lane_s16(
691 vacc7x4567,
692 vget_high_s16(vxb01234567),
693 vget_high_s16(vxa7),
694 0);
695
696 if (k >= 6) {
697 const uint8x8_t vb01234567 = vld1_u8(w);
698 w = (void*)((uintptr_t)w + sizeof(uint8x8_t));
699 const int16x8_t vxb01234567 =
700 vreinterpretq_s16_u16(vsubl_u8(vb01234567, vb_zero_point));
701
702 vacc0x0123 = vmlal_lane_s16(
703 vacc0x0123,
704 vget_low_s16(vxb01234567),
705 vget_high_s16(vxa0),
706 1);
707 vacc0x4567 = vmlal_lane_s16(
708 vacc0x4567,
709 vget_high_s16(vxb01234567),
710 vget_high_s16(vxa0),
711 1);
712 vacc1x0123 = vmlal_lane_s16(
713 vacc1x0123,
714 vget_low_s16(vxb01234567),
715 vget_high_s16(vxa1),
716 1);
717 vacc1x4567 = vmlal_lane_s16(
718 vacc1x4567,
719 vget_high_s16(vxb01234567),
720 vget_high_s16(vxa1),
721 1);
722 vacc2x0123 = vmlal_lane_s16(
723 vacc2x0123,
724 vget_low_s16(vxb01234567),
725 vget_high_s16(vxa2),
726 1);
727 vacc2x4567 = vmlal_lane_s16(
728 vacc2x4567,
729 vget_high_s16(vxb01234567),
730 vget_high_s16(vxa2),
731 1);
732 vacc3x0123 = vmlal_lane_s16(
733 vacc3x0123,
734 vget_low_s16(vxb01234567),
735 vget_high_s16(vxa3),
736 1);
737 vacc3x4567 = vmlal_lane_s16(
738 vacc3x4567,
739 vget_high_s16(vxb01234567),
740 vget_high_s16(vxa3),
741 1);
742 vacc4x0123 = vmlal_lane_s16(
743 vacc4x0123,
744 vget_low_s16(vxb01234567),
745 vget_high_s16(vxa4),
746 1);
747 vacc4x4567 = vmlal_lane_s16(
748 vacc4x4567,
749 vget_high_s16(vxb01234567),
750 vget_high_s16(vxa4),
751 1);
752 vacc5x0123 = vmlal_lane_s16(
753 vacc5x0123,
754 vget_low_s16(vxb01234567),
755 vget_high_s16(vxa5),
756 1);
757 vacc5x4567 = vmlal_lane_s16(
758 vacc5x4567,
759 vget_high_s16(vxb01234567),
760 vget_high_s16(vxa5),
761 1);
762 vacc6x0123 = vmlal_lane_s16(
763 vacc6x0123,
764 vget_low_s16(vxb01234567),
765 vget_high_s16(vxa6),
766 1);
767 vacc6x4567 = vmlal_lane_s16(
768 vacc6x4567,
769 vget_high_s16(vxb01234567),
770 vget_high_s16(vxa6),
771 1);
772 vacc7x0123 = vmlal_lane_s16(
773 vacc7x0123,
774 vget_low_s16(vxb01234567),
775 vget_high_s16(vxa7),
776 1);
777 vacc7x4567 = vmlal_lane_s16(
778 vacc7x4567,
779 vget_high_s16(vxb01234567),
780 vget_high_s16(vxa7),
781 1);
782
783 if (k > 6) {
784 const uint8x8_t vb01234567 = vld1_u8(w);
785 w = (void*)((uintptr_t)w + sizeof(uint8x8_t));
786 const int16x8_t vxb01234567 = vreinterpretq_s16_u16(
787 vsubl_u8(vb01234567, vb_zero_point));
788
789 vacc0x0123 = vmlal_lane_s16(
790 vacc0x0123,
791 vget_low_s16(vxb01234567),
792 vget_high_s16(vxa0),
793 2);
794 vacc0x4567 = vmlal_lane_s16(
795 vacc0x4567,
796 vget_high_s16(vxb01234567),
797 vget_high_s16(vxa0),
798 2);
799 vacc1x0123 = vmlal_lane_s16(
800 vacc1x0123,
801 vget_low_s16(vxb01234567),
802 vget_high_s16(vxa1),
803 2);
804 vacc1x4567 = vmlal_lane_s16(
805 vacc1x4567,
806 vget_high_s16(vxb01234567),
807 vget_high_s16(vxa1),
808 2);
809 vacc2x0123 = vmlal_lane_s16(
810 vacc2x0123,
811 vget_low_s16(vxb01234567),
812 vget_high_s16(vxa2),
813 2);
814 vacc2x4567 = vmlal_lane_s16(
815 vacc2x4567,
816 vget_high_s16(vxb01234567),
817 vget_high_s16(vxa2),
818 2);
819 vacc3x0123 = vmlal_lane_s16(
820 vacc3x0123,
821 vget_low_s16(vxb01234567),
822 vget_high_s16(vxa3),
823 2);
824 vacc3x4567 = vmlal_lane_s16(
825 vacc3x4567,
826 vget_high_s16(vxb01234567),
827 vget_high_s16(vxa3),
828 2);
829 vacc4x0123 = vmlal_lane_s16(
830 vacc4x0123,
831 vget_low_s16(vxb01234567),
832 vget_high_s16(vxa4),
833 2);
834 vacc4x4567 = vmlal_lane_s16(
835 vacc4x4567,
836 vget_high_s16(vxb01234567),
837 vget_high_s16(vxa4),
838 2);
839 vacc5x0123 = vmlal_lane_s16(
840 vacc5x0123,
841 vget_low_s16(vxb01234567),
842 vget_high_s16(vxa5),
843 2);
844 vacc5x4567 = vmlal_lane_s16(
845 vacc5x4567,
846 vget_high_s16(vxb01234567),
847 vget_high_s16(vxa5),
848 2);
849 vacc6x0123 = vmlal_lane_s16(
850 vacc6x0123,
851 vget_low_s16(vxb01234567),
852 vget_high_s16(vxa6),
853 2);
854 vacc6x4567 = vmlal_lane_s16(
855 vacc6x4567,
856 vget_high_s16(vxb01234567),
857 vget_high_s16(vxa6),
858 2);
859 vacc7x0123 = vmlal_lane_s16(
860 vacc7x0123,
861 vget_low_s16(vxb01234567),
862 vget_high_s16(vxa7),
863 2);
864 vacc7x4567 = vmlal_lane_s16(
865 vacc7x4567,
866 vget_high_s16(vxb01234567),
867 vget_high_s16(vxa7),
868 2);
869 }
870 }
871 }
872 }
873 }
874 }
875 }
876 } while (--ks != 0);
877
878 const float32x4_t requantization_scale_c0123 =
879 vld1q_f32(
880 &quantization_params->neon.requantization_scales[output_channel_index]
881 );
882 const float32x4_t requantization_scale_c4567 =
883 vld1q_f32(
884 &quantization_params->neon.requantization_scales[
885 output_channel_index + 4]);
886
887 const float32x4_t vacc0x0123_f =
888 vmulq_f32(vcvtq_f32_s32(vacc0x0123), requantization_scale_c0123);
889 const float32x4_t vacc1x0123_f =
890 vmulq_f32(vcvtq_f32_s32(vacc1x0123), requantization_scale_c0123);
891 const float32x4_t vacc2x0123_f =
892 vmulq_f32(vcvtq_f32_s32(vacc2x0123), requantization_scale_c0123);
893 const float32x4_t vacc3x0123_f =
894 vmulq_f32(vcvtq_f32_s32(vacc3x0123), requantization_scale_c0123);
895 const float32x4_t vacc0x4567_f =
896 vmulq_f32(vcvtq_f32_s32(vacc0x4567), requantization_scale_c4567);
897 const float32x4_t vacc1x4567_f =
898 vmulq_f32(vcvtq_f32_s32(vacc1x4567), requantization_scale_c4567);
899 const float32x4_t vacc2x4567_f =
900 vmulq_f32(vcvtq_f32_s32(vacc2x4567), requantization_scale_c4567);
901 const float32x4_t vacc3x4567_f =
902 vmulq_f32(vcvtq_f32_s32(vacc3x4567), requantization_scale_c4567);
903 const float32x4_t vacc4x0123_f =
904 vmulq_f32(vcvtq_f32_s32(vacc4x0123), requantization_scale_c0123);
905 const float32x4_t vacc5x0123_f =
906 vmulq_f32(vcvtq_f32_s32(vacc5x0123), requantization_scale_c0123);
907 const float32x4_t vacc6x0123_f =
908 vmulq_f32(vcvtq_f32_s32(vacc6x0123), requantization_scale_c0123);
909 const float32x4_t vacc7x0123_f =
910 vmulq_f32(vcvtq_f32_s32(vacc7x0123), requantization_scale_c0123);
911 const float32x4_t vacc4x4567_f =
912 vmulq_f32(vcvtq_f32_s32(vacc4x4567), requantization_scale_c4567);
913 const float32x4_t vacc5x4567_f =
914 vmulq_f32(vcvtq_f32_s32(vacc5x4567), requantization_scale_c4567);
915 const float32x4_t vacc6x4567_f =
916 vmulq_f32(vcvtq_f32_s32(vacc6x4567), requantization_scale_c4567);
917 const float32x4_t vacc7x4567_f =
918 vmulq_f32(vcvtq_f32_s32(vacc7x4567), requantization_scale_c4567);
919
920 #ifdef __aarch64__
921 const int16x8_t voutput_zero_point =
922 vld1q_dup_s16(&quantization_params->neon.output_zero_point);
923
924 vacc0x0123 = vcvtnq_s32_f32(vacc0x0123_f);
925 vacc1x0123 = vcvtnq_s32_f32(vacc1x0123_f);
926 vacc2x0123 = vcvtnq_s32_f32(vacc2x0123_f);
927 vacc3x0123 = vcvtnq_s32_f32(vacc3x0123_f);
928 vacc0x4567 = vcvtnq_s32_f32(vacc0x4567_f);
929 vacc1x4567 = vcvtnq_s32_f32(vacc1x4567_f);
930 vacc2x4567 = vcvtnq_s32_f32(vacc2x4567_f);
931 vacc3x4567 = vcvtnq_s32_f32(vacc3x4567_f);
932 vacc4x0123 = vcvtnq_s32_f32(vacc4x0123_f);
933 vacc5x0123 = vcvtnq_s32_f32(vacc5x0123_f);
934 vacc6x0123 = vcvtnq_s32_f32(vacc6x0123_f);
935 vacc7x0123 = vcvtnq_s32_f32(vacc7x0123_f);
936 vacc4x4567 = vcvtnq_s32_f32(vacc4x4567_f);
937 vacc5x4567 = vcvtnq_s32_f32(vacc5x4567_f);
938 vacc6x4567 = vcvtnq_s32_f32(vacc6x4567_f);
939 vacc7x4567 = vcvtnq_s32_f32(vacc7x4567_f);
940
941 const int16x8_t vacc0x01234567 = vqaddq_s16(
942 vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
943 const int16x8_t vacc1x01234567 = vqaddq_s16(
944 vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
945 const int16x8_t vacc2x01234567 = vqaddq_s16(
946 vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567), voutput_zero_point);
947 const int16x8_t vacc3x01234567 = vqaddq_s16(
948 vqmovn_high_s32(vqmovn_s32(vacc3x0123), vacc3x4567), voutput_zero_point);
949 const int16x8_t vacc4x01234567 = vqaddq_s16(
950 vqmovn_high_s32(vqmovn_s32(vacc4x0123), vacc4x4567), voutput_zero_point);
951 const int16x8_t vacc5x01234567 = vqaddq_s16(
952 vqmovn_high_s32(vqmovn_s32(vacc5x0123), vacc5x4567), voutput_zero_point);
953 const int16x8_t vacc6x01234567 = vqaddq_s16(
954 vqmovn_high_s32(vqmovn_s32(vacc6x0123), vacc6x4567), voutput_zero_point);
955 const int16x8_t vacc7x01234567 = vqaddq_s16(
956 vqmovn_high_s32(vqmovn_s32(vacc7x0123), vacc7x4567), voutput_zero_point);
957
958 uint8x16_t vout0x01234567_1x01234567 =
959 vqmovun_high_s16(vqmovun_s16(vacc0x01234567), vacc1x01234567);
960 uint8x16_t vout2x01234567_3x01234567 =
961 vqmovun_high_s16(vqmovun_s16(vacc2x01234567), vacc3x01234567);
962 uint8x16_t vout4x01234567_5x01234567 =
963 vqmovun_high_s16(vqmovun_s16(vacc4x01234567), vacc5x01234567);
964 uint8x16_t vout6x01234567_7x01234567 =
965 vqmovun_high_s16(vqmovun_s16(vacc6x01234567), vacc7x01234567);
966
967 const uint8x16_t voutput_min =
968 vld1q_dup_u8(&quantization_params->neon.output_min);
969 const uint8x16_t voutput_max =
970 vld1q_dup_u8(&quantization_params->neon.output_max);
971
972 vout0x01234567_1x01234567 = vmaxq_u8(vout0x01234567_1x01234567, voutput_min);
973 vout2x01234567_3x01234567 = vmaxq_u8(vout2x01234567_3x01234567, voutput_min);
974 vout4x01234567_5x01234567 = vmaxq_u8(vout4x01234567_5x01234567, voutput_min);
975 vout6x01234567_7x01234567 = vmaxq_u8(vout6x01234567_7x01234567, voutput_min);
976 vout0x01234567_1x01234567 = vminq_u8(vout0x01234567_1x01234567, voutput_max);
977 vout2x01234567_3x01234567 = vminq_u8(vout2x01234567_3x01234567, voutput_max);
978 vout4x01234567_5x01234567 = vminq_u8(vout4x01234567_5x01234567, voutput_max);
979 vout6x01234567_7x01234567 = vminq_u8(vout6x01234567_7x01234567, voutput_max);
980 #else
981 const float32x4_t vfmin = vdupq_n_f32(quantization_params->neon.vfmin);
982 const float32x4_t vfmax = vdupq_n_f32(quantization_params->neon.vfmax);
983 const float32x4_t vfmagic = vdupq_n_f32(quantization_params->neon.vfmagic);
984 const int32x4_t vimagic = vdupq_n_s32(quantization_params->neon.vimagic);
985
986 const float32x4_t vacc0x0123_f_clamped =
987 vminq_f32(vmaxq_f32(vacc0x0123_f, vfmin), vfmax);
988 const float32x4_t vacc1x0123_f_clamped =
989 vminq_f32(vmaxq_f32(vacc1x0123_f, vfmin), vfmax);
990 const float32x4_t vacc2x0123_f_clamped =
991 vminq_f32(vmaxq_f32(vacc2x0123_f, vfmin), vfmax);
992 const float32x4_t vacc3x0123_f_clamped =
993 vminq_f32(vmaxq_f32(vacc3x0123_f, vfmin), vfmax);
994 const float32x4_t vacc0x4567_f_clamped =
995 vminq_f32(vmaxq_f32(vacc0x4567_f, vfmin), vfmax);
996 const float32x4_t vacc1x4567_f_clamped =
997 vminq_f32(vmaxq_f32(vacc1x4567_f, vfmin), vfmax);
998 const float32x4_t vacc2x4567_f_clamped =
999 vminq_f32(vmaxq_f32(vacc2x4567_f, vfmin), vfmax);
1000 const float32x4_t vacc3x4567_f_clamped =
1001 vminq_f32(vmaxq_f32(vacc3x4567_f, vfmin), vfmax);
1002 const float32x4_t vacc4x0123_f_clamped =
1003 vminq_f32(vmaxq_f32(vacc4x0123_f, vfmin), vfmax);
1004 const float32x4_t vacc5x0123_f_clamped =
1005 vminq_f32(vmaxq_f32(vacc5x0123_f, vfmin), vfmax);
1006 const float32x4_t vacc6x0123_f_clamped =
1007 vminq_f32(vmaxq_f32(vacc6x0123_f, vfmin), vfmax);
1008 const float32x4_t vacc7x0123_f_clamped =
1009 vminq_f32(vmaxq_f32(vacc7x0123_f, vfmin), vfmax);
1010 const float32x4_t vacc4x4567_f_clamped =
1011 vminq_f32(vmaxq_f32(vacc4x4567_f, vfmin), vfmax);
1012 const float32x4_t vacc5x4567_f_clamped =
1013 vminq_f32(vmaxq_f32(vacc5x4567_f, vfmin), vfmax);
1014 const float32x4_t vacc6x4567_f_clamped =
1015 vminq_f32(vmaxq_f32(vacc6x4567_f, vfmin), vfmax);
1016 const float32x4_t vacc7x4567_f_clamped =
1017 vminq_f32(vmaxq_f32(vacc7x4567_f, vfmin), vfmax);
1018
1019 vacc0x0123 = vsubq_s32(
1020 vreinterpretq_s32_f32(vaddq_f32(vacc0x0123_f_clamped, vfmagic)), vimagic);
1021 vacc1x0123 = vsubq_s32(
1022 vreinterpretq_s32_f32(vaddq_f32(vacc1x0123_f_clamped, vfmagic)), vimagic);
1023 vacc2x0123 = vsubq_s32(
1024 vreinterpretq_s32_f32(vaddq_f32(vacc2x0123_f_clamped, vfmagic)), vimagic);
1025 vacc3x0123 = vsubq_s32(
1026 vreinterpretq_s32_f32(vaddq_f32(vacc3x0123_f_clamped, vfmagic)), vimagic);
1027 vacc0x4567 = vsubq_s32(
1028 vreinterpretq_s32_f32(vaddq_f32(vacc0x4567_f_clamped, vfmagic)), vimagic);
1029 vacc1x4567 = vsubq_s32(
1030 vreinterpretq_s32_f32(vaddq_f32(vacc1x4567_f_clamped, vfmagic)), vimagic);
1031 vacc2x4567 = vsubq_s32(
1032 vreinterpretq_s32_f32(vaddq_f32(vacc2x4567_f_clamped, vfmagic)), vimagic);
1033 vacc3x4567 = vsubq_s32(
1034 vreinterpretq_s32_f32(vaddq_f32(vacc3x4567_f_clamped, vfmagic)), vimagic);
1035 vacc4x0123 = vsubq_s32(
1036 vreinterpretq_s32_f32(vaddq_f32(vacc4x0123_f_clamped, vfmagic)), vimagic);
1037 vacc5x0123 = vsubq_s32(
1038 vreinterpretq_s32_f32(vaddq_f32(vacc5x0123_f_clamped, vfmagic)), vimagic);
1039 vacc6x0123 = vsubq_s32(
1040 vreinterpretq_s32_f32(vaddq_f32(vacc6x0123_f_clamped, vfmagic)), vimagic);
1041 vacc7x0123 = vsubq_s32(
1042 vreinterpretq_s32_f32(vaddq_f32(vacc7x0123_f_clamped, vfmagic)), vimagic);
1043 vacc4x4567 = vsubq_s32(
1044 vreinterpretq_s32_f32(vaddq_f32(vacc4x4567_f_clamped, vfmagic)), vimagic);
1045 vacc5x4567 = vsubq_s32(
1046 vreinterpretq_s32_f32(vaddq_f32(vacc5x4567_f_clamped, vfmagic)), vimagic);
1047 vacc6x4567 = vsubq_s32(
1048 vreinterpretq_s32_f32(vaddq_f32(vacc6x4567_f_clamped, vfmagic)), vimagic);
1049 vacc7x4567 = vsubq_s32(
1050 vreinterpretq_s32_f32(vaddq_f32(vacc7x4567_f_clamped, vfmagic)), vimagic);
1051
1052 const int16x8_t vacc0x01234567 =
1053 vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567));
1054 const int16x8_t vacc1x01234567 =
1055 vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567));
1056 const int16x8_t vacc2x01234567 =
1057 vcombine_s16(vqmovn_s32(vacc2x0123), vqmovn_s32(vacc2x4567));
1058 const int16x8_t vacc3x01234567 =
1059 vcombine_s16(vqmovn_s32(vacc3x0123), vqmovn_s32(vacc3x4567));
1060 const int16x8_t vacc4x01234567 =
1061 vcombine_s16(vqmovn_s32(vacc4x0123), vqmovn_s32(vacc4x4567));
1062 const int16x8_t vacc5x01234567 =
1063 vcombine_s16(vqmovn_s32(vacc5x0123), vqmovn_s32(vacc5x4567));
1064 const int16x8_t vacc6x01234567 =
1065 vcombine_s16(vqmovn_s32(vacc6x0123), vqmovn_s32(vacc6x4567));
1066 const int16x8_t vacc7x01234567 =
1067 vcombine_s16(vqmovn_s32(vacc7x0123), vqmovn_s32(vacc7x4567));
1068
1069 uint8x16_t vout0x01234567_1x01234567 =
1070 vcombine_u8(vqmovun_s16(vacc0x01234567), vqmovun_s16(vacc1x01234567));
1071 uint8x16_t vout2x01234567_3x01234567 =
1072 vcombine_u8(vqmovun_s16(vacc2x01234567), vqmovun_s16(vacc3x01234567));
1073 uint8x16_t vout4x01234567_5x01234567 =
1074 vcombine_u8(vqmovun_s16(vacc4x01234567), vqmovun_s16(vacc5x01234567));
1075 uint8x16_t vout6x01234567_7x01234567 =
1076 vcombine_u8(vqmovun_s16(vacc6x01234567), vqmovun_s16(vacc7x01234567));
1077 #endif
1078
1079 uint8_t* c0 = c;
1080 uint8_t* c1 = (uint8_t*)((uintptr_t)c0 + c_stride);
1081 if (mr < 2) {
1082 c1 = c0;
1083 }
1084 uint8_t* c2 = (uint8_t*)((uintptr_t)c1 + c_stride);
1085 if (mr <= 2) {
1086 c2 = c1;
1087 }
1088 uint8_t* c3 = (uint8_t*)((uintptr_t)c2 + c_stride);
1089 if (mr < 4) {
1090 c3 = c2;
1091 }
1092 uint8_t* c4 = (uint8_t*)((uintptr_t)c3 + c_stride);
1093 if (mr <= 4) {
1094 c4 = c3;
1095 }
1096 uint8_t* c5 = (uint8_t*)((uintptr_t)c4 + c_stride);
1097 if (mr < 6) {
1098 c5 = c4;
1099 }
1100 uint8_t* c6 = (uint8_t*)((uintptr_t)c5 + c_stride);
1101 if (mr <= 6) {
1102 c6 = c5;
1103 }
1104 uint8_t* c7 = (uint8_t*)((uintptr_t)c6 + c_stride);
1105 if (mr != 8) {
1106 c7 = c6;
1107 }
1108 if (nr == 8) {
1109 vst1_u8(c0, vget_low_u8(vout0x01234567_1x01234567));
1110 vst1_u8(c1, vget_high_u8(vout0x01234567_1x01234567));
1111 vst1_u8(c2, vget_low_u8(vout2x01234567_3x01234567));
1112 vst1_u8(c3, vget_high_u8(vout2x01234567_3x01234567));
1113 vst1_u8(c4, vget_low_u8(vout4x01234567_5x01234567));
1114 vst1_u8(c5, vget_high_u8(vout4x01234567_5x01234567));
1115 vst1_u8(c6, vget_low_u8(vout6x01234567_7x01234567));
1116 vst1_u8(c7, vget_high_u8(vout6x01234567_7x01234567));
1117 } else {
1118 if (nr >= 4) {
1119 vst1q_lane_u32(
1120 __builtin_assume_aligned(c0, 1),
1121 vreinterpretq_u32_u8(vout0x01234567_1x01234567),
1122 0);
1123 c0 += 4;
1124 vst1q_lane_u32(
1125 __builtin_assume_aligned(c1, 1),
1126 vreinterpretq_u32_u8(vout0x01234567_1x01234567),
1127 2);
1128 c1 += 4;
1129 vst1q_lane_u32(
1130 __builtin_assume_aligned(c2, 1),
1131 vreinterpretq_u32_u8(vout2x01234567_3x01234567),
1132 0);
1133 c2 += 4;
1134 vst1q_lane_u32(
1135 __builtin_assume_aligned(c3, 1),
1136 vreinterpretq_u32_u8(vout2x01234567_3x01234567),
1137 2);
1138 c3 += 4;
1139 vst1q_lane_u32(
1140 __builtin_assume_aligned(c4, 1),
1141 vreinterpretq_u32_u8(vout4x01234567_5x01234567),
1142 0);
1143 c4 += 4;
1144 vst1q_lane_u32(
1145 __builtin_assume_aligned(c5, 1),
1146 vreinterpretq_u32_u8(vout4x01234567_5x01234567),
1147 2);
1148 c5 += 4;
1149 vst1q_lane_u32(
1150 __builtin_assume_aligned(c6, 1),
1151 vreinterpretq_u32_u8(vout6x01234567_7x01234567),
1152 0);
1153 c6 += 4;
1154 vst1q_lane_u32(
1155 __builtin_assume_aligned(c7, 1),
1156 vreinterpretq_u32_u8(vout6x01234567_7x01234567),
1157 2);
1158 c7 += 4;
1159 vout0x01234567_1x01234567 =
1160 vextq_u8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
1161 vout2x01234567_3x01234567 =
1162 vextq_u8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 4);
1163 vout4x01234567_5x01234567 =
1164 vextq_u8(vout4x01234567_5x01234567, vout4x01234567_5x01234567, 4);
1165 vout6x01234567_7x01234567 =
1166 vextq_u8(vout6x01234567_7x01234567, vout6x01234567_7x01234567, 4);
1167 nr -= 4;
1168 }
1169 if (nr >= 2) {
1170 vst1q_lane_u16(
1171 __builtin_assume_aligned(c0, 1),
1172 vreinterpretq_u16_u8(vout0x01234567_1x01234567),
1173 0);
1174 c0 += 2;
1175 vst1q_lane_u16(
1176 __builtin_assume_aligned(c1, 1),
1177 vreinterpretq_u16_u8(vout0x01234567_1x01234567),
1178 4);
1179 c1 += 2;
1180 vst1q_lane_u16(
1181 __builtin_assume_aligned(c2, 1),
1182 vreinterpretq_u16_u8(vout2x01234567_3x01234567),
1183 0);
1184 c2 += 2;
1185 vst1q_lane_u16(
1186 __builtin_assume_aligned(c3, 1),
1187 vreinterpretq_u16_u8(vout2x01234567_3x01234567),
1188 4);
1189 c3 += 2;
1190 vst1q_lane_u16(
1191 __builtin_assume_aligned(c4, 1),
1192 vreinterpretq_u16_u8(vout4x01234567_5x01234567),
1193 0);
1194 c4 += 2;
1195 vst1q_lane_u16(
1196 __builtin_assume_aligned(c5, 1),
1197 vreinterpretq_u16_u8(vout4x01234567_5x01234567),
1198 4);
1199 c5 += 2;
1200 vst1q_lane_u16(
1201 __builtin_assume_aligned(c6, 1),
1202 vreinterpretq_u16_u8(vout6x01234567_7x01234567),
1203 0);
1204 c6 += 2;
1205 vst1q_lane_u16(
1206 __builtin_assume_aligned(c7, 1),
1207 vreinterpretq_u16_u8(vout6x01234567_7x01234567),
1208 4);
1209 c7 += 2;
1210 vout0x01234567_1x01234567 =
1211 vextq_u8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
1212 vout2x01234567_3x01234567 =
1213 vextq_u8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 2);
1214 vout4x01234567_5x01234567 =
1215 vextq_u8(vout4x01234567_5x01234567, vout4x01234567_5x01234567, 2);
1216 vout6x01234567_7x01234567 =
1217 vextq_u8(vout6x01234567_7x01234567, vout6x01234567_7x01234567, 2);
1218 nr -= 2;
1219 }
1220 if (nr != 0) {
1221 vst1q_lane_u8(c0, vout0x01234567_1x01234567, 0);
1222 vst1q_lane_u8(c1, vout0x01234567_1x01234567, 8);
1223 vst1q_lane_u8(c2, vout2x01234567_3x01234567, 0);
1224 vst1q_lane_u8(c3, vout2x01234567_3x01234567, 8);
1225 vst1q_lane_u8(c4, vout4x01234567_5x01234567, 0);
1226 vst1q_lane_u8(c5, vout4x01234567_5x01234567, 8);
1227 vst1q_lane_u8(c6, vout6x01234567_7x01234567, 0);
1228 vst1q_lane_u8(c7, vout6x01234567_7x01234567, 8);
1229 }
1230 }
1231 }
1232