xref: /aosp_15_r20/external/pytorch/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8conv/8x8-neon.c (revision da0073e96a02ea20f0ac840b70461e3646d07c45)
1 /*
2  * Copyright (c) Facebook, Inc. and its affiliates.
3  * All rights reserved.
4  *
5  * This source code is licensed under the BSD-style license found in the
6  * LICENSE file in the root directory of this source tree.
7  */
8 
9 #include <arm_neon.h>
10 
11 #include <qnnpack/q8conv.h>
12 #include <requantization/runtime-neon.h>
13 
14 void pytorch_q8conv_ukernel_8x8__neon(
15     size_t mr,
16     size_t nr,
17     size_t kc,
18     size_t ks,
19     const uint8_t** restrict a,
20     const void* restrict w,
21     uint8_t* restrict c,
22     size_t c_stride,
23     size_t output_channel_index,
24     const union pytorch_qnnp_conv_quantization_params
25         quantization_params[restrict static 1]) {
26   const uint8x8_t va_zero_point =
27       vld1_dup_u8((const uint8_t*)&quantization_params->neon.input_zero_point);
28   const uint8x8_t vb_zero_point =
29       vld1_u8((const uint8_t*)&quantization_params->neon.kernel_zero_points
30           [output_channel_index]);
31 
32   int32x4_t vacc0x0123 = vld1q_s32(w);
33   w = (void*)((uintptr_t)w + sizeof(int32x4_t));
34   int32x4_t vacc0x4567 = vld1q_s32(w);
35   w = (void*)((uintptr_t)w + sizeof(int32x4_t));
36   int32x4_t vacc1x0123 = vacc0x0123;
37   int32x4_t vacc1x4567 = vacc0x4567;
38   int32x4_t vacc2x0123 = vacc0x0123;
39   int32x4_t vacc2x4567 = vacc0x4567;
40   int32x4_t vacc3x0123 = vacc0x0123;
41   int32x4_t vacc3x4567 = vacc0x4567;
42   int32x4_t vacc4x0123 = vacc0x0123;
43   int32x4_t vacc4x4567 = vacc0x4567;
44   int32x4_t vacc5x0123 = vacc0x0123;
45   int32x4_t vacc5x4567 = vacc0x4567;
46   int32x4_t vacc6x0123 = vacc0x0123;
47   int32x4_t vacc6x4567 = vacc0x4567;
48   int32x4_t vacc7x0123 = vacc0x0123;
49   int32x4_t vacc7x4567 = vacc0x4567;
50 
51   do {
52     const uint8_t* restrict a0 = *a++;
53     const uint8_t* restrict a1 = *a++;
54     const uint8_t* restrict a2 = *a++;
55     const uint8_t* restrict a3 = *a++;
56     const uint8_t* restrict a4 = *a++;
57     const uint8_t* restrict a5 = *a++;
58     const uint8_t* restrict a6 = *a++;
59     const uint8_t* restrict a7 = *a++;
60 
61     size_t k = kc;
62     for (; k >= 8; k -= 8) {
63       const uint8x8_t va0 = vld1_u8(a0);
64       a0 += 8;
65       const uint8x8_t va1 = vld1_u8(a1);
66       a1 += 8;
67       const uint8x8_t va2 = vld1_u8(a2);
68       a2 += 8;
69       const uint8x8_t va3 = vld1_u8(a3);
70       a3 += 8;
71       const uint8x8_t va4 = vld1_u8(a4);
72       a4 += 8;
73       const uint8x8_t va5 = vld1_u8(a5);
74       a5 += 8;
75       const uint8x8_t va6 = vld1_u8(a6);
76       a6 += 8;
77       const uint8x8_t va7 = vld1_u8(a7);
78       a7 += 8;
79       const int16x8_t vxa0 =
80           vreinterpretq_s16_u16(sub_zero_point(va0, va_zero_point));
81       const int16x8_t vxa1 =
82           vreinterpretq_s16_u16(sub_zero_point(va1, va_zero_point));
83       const int16x8_t vxa2 =
84           vreinterpretq_s16_u16(sub_zero_point(va2, va_zero_point));
85       const int16x8_t vxa3 =
86           vreinterpretq_s16_u16(sub_zero_point(va3, va_zero_point));
87       const int16x8_t vxa4 =
88           vreinterpretq_s16_u16(sub_zero_point(va4, va_zero_point));
89       const int16x8_t vxa5 =
90           vreinterpretq_s16_u16(sub_zero_point(va5, va_zero_point));
91       const int16x8_t vxa6 =
92           vreinterpretq_s16_u16(sub_zero_point(va6, va_zero_point));
93       const int16x8_t vxa7 =
94           vreinterpretq_s16_u16(sub_zero_point(va7, va_zero_point));
95 
96       {
97         const uint8x8_t vb01234567 = vld1_u8(w);
98         w = (void*)((uintptr_t)w + sizeof(uint8x8_t));
99         const int16x8_t vxb01234567 =
100             vreinterpretq_s16_u16(vsubl_u8(vb01234567, vb_zero_point));
101 
102         vacc0x0123 = vmlal_lane_s16(
103             vacc0x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa0), 0);
104         vacc0x4567 = vmlal_lane_s16(
105             vacc0x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa0), 0);
106         vacc1x0123 = vmlal_lane_s16(
107             vacc1x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa1), 0);
108         vacc1x4567 = vmlal_lane_s16(
109             vacc1x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa1), 0);
110         vacc2x0123 = vmlal_lane_s16(
111             vacc2x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa2), 0);
112         vacc2x4567 = vmlal_lane_s16(
113             vacc2x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa2), 0);
114         vacc3x0123 = vmlal_lane_s16(
115             vacc3x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa3), 0);
116         vacc3x4567 = vmlal_lane_s16(
117             vacc3x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa3), 0);
118         vacc4x0123 = vmlal_lane_s16(
119             vacc4x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa4), 0);
120         vacc4x4567 = vmlal_lane_s16(
121             vacc4x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa4), 0);
122         vacc5x0123 = vmlal_lane_s16(
123             vacc5x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa5), 0);
124         vacc5x4567 = vmlal_lane_s16(
125             vacc5x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa5), 0);
126         vacc6x0123 = vmlal_lane_s16(
127             vacc6x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa6), 0);
128         vacc6x4567 = vmlal_lane_s16(
129             vacc6x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa6), 0);
130         vacc7x0123 = vmlal_lane_s16(
131             vacc7x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa7), 0);
132         vacc7x4567 = vmlal_lane_s16(
133             vacc7x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa7), 0);
134       }
135 
136       {
137         const uint8x8_t vb01234567 = vld1_u8(w);
138         w = (void*)((uintptr_t)w + sizeof(uint8x8_t));
139         const int16x8_t vxb01234567 =
140             vreinterpretq_s16_u16(vsubl_u8(vb01234567, vb_zero_point));
141 
142         vacc0x0123 = vmlal_lane_s16(
143             vacc0x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa0), 1);
144         vacc0x4567 = vmlal_lane_s16(
145             vacc0x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa0), 1);
146         vacc1x0123 = vmlal_lane_s16(
147             vacc1x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa1), 1);
148         vacc1x4567 = vmlal_lane_s16(
149             vacc1x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa1), 1);
150         vacc2x0123 = vmlal_lane_s16(
151             vacc2x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa2), 1);
152         vacc2x4567 = vmlal_lane_s16(
153             vacc2x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa2), 1);
154         vacc3x0123 = vmlal_lane_s16(
155             vacc3x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa3), 1);
156         vacc3x4567 = vmlal_lane_s16(
157             vacc3x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa3), 1);
158         vacc4x0123 = vmlal_lane_s16(
159             vacc4x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa4), 1);
160         vacc4x4567 = vmlal_lane_s16(
161             vacc4x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa4), 1);
162         vacc5x0123 = vmlal_lane_s16(
163             vacc5x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa5), 1);
164         vacc5x4567 = vmlal_lane_s16(
165             vacc5x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa5), 1);
166         vacc6x0123 = vmlal_lane_s16(
167             vacc6x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa6), 1);
168         vacc6x4567 = vmlal_lane_s16(
169             vacc6x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa6), 1);
170         vacc7x0123 = vmlal_lane_s16(
171             vacc7x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa7), 1);
172         vacc7x4567 = vmlal_lane_s16(
173             vacc7x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa7), 1);
174       }
175 
176       {
177         const uint8x8_t vb01234567 = vld1_u8(w);
178         w = (void*)((uintptr_t)w + sizeof(uint8x8_t));
179         const int16x8_t vxb01234567 =
180             vreinterpretq_s16_u16(vsubl_u8(vb01234567, vb_zero_point));
181 
182         vacc0x0123 = vmlal_lane_s16(
183             vacc0x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa0), 2);
184         vacc0x4567 = vmlal_lane_s16(
185             vacc0x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa0), 2);
186         vacc1x0123 = vmlal_lane_s16(
187             vacc1x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa1), 2);
188         vacc1x4567 = vmlal_lane_s16(
189             vacc1x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa1), 2);
190         vacc2x0123 = vmlal_lane_s16(
191             vacc2x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa2), 2);
192         vacc2x4567 = vmlal_lane_s16(
193             vacc2x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa2), 2);
194         vacc3x0123 = vmlal_lane_s16(
195             vacc3x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa3), 2);
196         vacc3x4567 = vmlal_lane_s16(
197             vacc3x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa3), 2);
198         vacc4x0123 = vmlal_lane_s16(
199             vacc4x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa4), 2);
200         vacc4x4567 = vmlal_lane_s16(
201             vacc4x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa4), 2);
202         vacc5x0123 = vmlal_lane_s16(
203             vacc5x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa5), 2);
204         vacc5x4567 = vmlal_lane_s16(
205             vacc5x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa5), 2);
206         vacc6x0123 = vmlal_lane_s16(
207             vacc6x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa6), 2);
208         vacc6x4567 = vmlal_lane_s16(
209             vacc6x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa6), 2);
210         vacc7x0123 = vmlal_lane_s16(
211             vacc7x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa7), 2);
212         vacc7x4567 = vmlal_lane_s16(
213             vacc7x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa7), 2);
214       }
215 
216       {
217         const uint8x8_t vb01234567 = vld1_u8(w);
218         w = (void*)((uintptr_t)w + sizeof(uint8x8_t));
219         const int16x8_t vxb01234567 =
220             vreinterpretq_s16_u16(vsubl_u8(vb01234567, vb_zero_point));
221 
222         vacc0x0123 = vmlal_lane_s16(
223             vacc0x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa0), 3);
224         vacc0x4567 = vmlal_lane_s16(
225             vacc0x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa0), 3);
226         vacc1x0123 = vmlal_lane_s16(
227             vacc1x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa1), 3);
228         vacc1x4567 = vmlal_lane_s16(
229             vacc1x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa1), 3);
230         vacc2x0123 = vmlal_lane_s16(
231             vacc2x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa2), 3);
232         vacc2x4567 = vmlal_lane_s16(
233             vacc2x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa2), 3);
234         vacc3x0123 = vmlal_lane_s16(
235             vacc3x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa3), 3);
236         vacc3x4567 = vmlal_lane_s16(
237             vacc3x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa3), 3);
238         vacc4x0123 = vmlal_lane_s16(
239             vacc4x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa4), 3);
240         vacc4x4567 = vmlal_lane_s16(
241             vacc4x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa4), 3);
242         vacc5x0123 = vmlal_lane_s16(
243             vacc5x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa5), 3);
244         vacc5x4567 = vmlal_lane_s16(
245             vacc5x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa5), 3);
246         vacc6x0123 = vmlal_lane_s16(
247             vacc6x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa6), 3);
248         vacc6x4567 = vmlal_lane_s16(
249             vacc6x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa6), 3);
250         vacc7x0123 = vmlal_lane_s16(
251             vacc7x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa7), 3);
252         vacc7x4567 = vmlal_lane_s16(
253             vacc7x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa7), 3);
254       }
255 
256       {
257         const uint8x8_t vb01234567 = vld1_u8(w);
258         w = (void*)((uintptr_t)w + sizeof(uint8x8_t));
259         const int16x8_t vxb01234567 =
260             vreinterpretq_s16_u16(vsubl_u8(vb01234567, vb_zero_point));
261 
262         vacc0x0123 = vmlal_lane_s16(
263             vacc0x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa0), 0);
264         vacc0x4567 = vmlal_lane_s16(
265             vacc0x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa0), 0);
266         vacc1x0123 = vmlal_lane_s16(
267             vacc1x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa1), 0);
268         vacc1x4567 = vmlal_lane_s16(
269             vacc1x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa1), 0);
270         vacc2x0123 = vmlal_lane_s16(
271             vacc2x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa2), 0);
272         vacc2x4567 = vmlal_lane_s16(
273             vacc2x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa2), 0);
274         vacc3x0123 = vmlal_lane_s16(
275             vacc3x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa3), 0);
276         vacc3x4567 = vmlal_lane_s16(
277             vacc3x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa3), 0);
278         vacc4x0123 = vmlal_lane_s16(
279             vacc4x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa4), 0);
280         vacc4x4567 = vmlal_lane_s16(
281             vacc4x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa4), 0);
282         vacc5x0123 = vmlal_lane_s16(
283             vacc5x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa5), 0);
284         vacc5x4567 = vmlal_lane_s16(
285             vacc5x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa5), 0);
286         vacc6x0123 = vmlal_lane_s16(
287             vacc6x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa6), 0);
288         vacc6x4567 = vmlal_lane_s16(
289             vacc6x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa6), 0);
290         vacc7x0123 = vmlal_lane_s16(
291             vacc7x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa7), 0);
292         vacc7x4567 = vmlal_lane_s16(
293             vacc7x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa7), 0);
294       }
295 
296       {
297         const uint8x8_t vb01234567 = vld1_u8(w);
298         w = (void*)((uintptr_t)w + sizeof(uint8x8_t));
299         const int16x8_t vxb01234567 =
300             vreinterpretq_s16_u16(vsubl_u8(vb01234567, vb_zero_point));
301 
302         vacc0x0123 = vmlal_lane_s16(
303             vacc0x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa0), 1);
304         vacc0x4567 = vmlal_lane_s16(
305             vacc0x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa0), 1);
306         vacc1x0123 = vmlal_lane_s16(
307             vacc1x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa1), 1);
308         vacc1x4567 = vmlal_lane_s16(
309             vacc1x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa1), 1);
310         vacc2x0123 = vmlal_lane_s16(
311             vacc2x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa2), 1);
312         vacc2x4567 = vmlal_lane_s16(
313             vacc2x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa2), 1);
314         vacc3x0123 = vmlal_lane_s16(
315             vacc3x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa3), 1);
316         vacc3x4567 = vmlal_lane_s16(
317             vacc3x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa3), 1);
318         vacc4x0123 = vmlal_lane_s16(
319             vacc4x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa4), 1);
320         vacc4x4567 = vmlal_lane_s16(
321             vacc4x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa4), 1);
322         vacc5x0123 = vmlal_lane_s16(
323             vacc5x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa5), 1);
324         vacc5x4567 = vmlal_lane_s16(
325             vacc5x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa5), 1);
326         vacc6x0123 = vmlal_lane_s16(
327             vacc6x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa6), 1);
328         vacc6x4567 = vmlal_lane_s16(
329             vacc6x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa6), 1);
330         vacc7x0123 = vmlal_lane_s16(
331             vacc7x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa7), 1);
332         vacc7x4567 = vmlal_lane_s16(
333             vacc7x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa7), 1);
334       }
335 
336       {
337         const uint8x8_t vb01234567 = vld1_u8(w);
338         w = (void*)((uintptr_t)w + sizeof(uint8x8_t));
339         const int16x8_t vxb01234567 =
340             vreinterpretq_s16_u16(vsubl_u8(vb01234567, vb_zero_point));
341 
342         vacc0x0123 = vmlal_lane_s16(
343             vacc0x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa0), 2);
344         vacc0x4567 = vmlal_lane_s16(
345             vacc0x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa0), 2);
346         vacc1x0123 = vmlal_lane_s16(
347             vacc1x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa1), 2);
348         vacc1x4567 = vmlal_lane_s16(
349             vacc1x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa1), 2);
350         vacc2x0123 = vmlal_lane_s16(
351             vacc2x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa2), 2);
352         vacc2x4567 = vmlal_lane_s16(
353             vacc2x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa2), 2);
354         vacc3x0123 = vmlal_lane_s16(
355             vacc3x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa3), 2);
356         vacc3x4567 = vmlal_lane_s16(
357             vacc3x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa3), 2);
358         vacc4x0123 = vmlal_lane_s16(
359             vacc4x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa4), 2);
360         vacc4x4567 = vmlal_lane_s16(
361             vacc4x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa4), 2);
362         vacc5x0123 = vmlal_lane_s16(
363             vacc5x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa5), 2);
364         vacc5x4567 = vmlal_lane_s16(
365             vacc5x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa5), 2);
366         vacc6x0123 = vmlal_lane_s16(
367             vacc6x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa6), 2);
368         vacc6x4567 = vmlal_lane_s16(
369             vacc6x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa6), 2);
370         vacc7x0123 = vmlal_lane_s16(
371             vacc7x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa7), 2);
372         vacc7x4567 = vmlal_lane_s16(
373             vacc7x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa7), 2);
374       }
375 
376       {
377         const uint8x8_t vb01234567 = vld1_u8(w);
378         w = (void*)((uintptr_t)w + sizeof(uint8x8_t));
379         const int16x8_t vxb01234567 =
380             vreinterpretq_s16_u16(vsubl_u8(vb01234567, vb_zero_point));
381 
382         vacc0x0123 = vmlal_lane_s16(
383             vacc0x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa0), 3);
384         vacc0x4567 = vmlal_lane_s16(
385             vacc0x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa0), 3);
386         vacc1x0123 = vmlal_lane_s16(
387             vacc1x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa1), 3);
388         vacc1x4567 = vmlal_lane_s16(
389             vacc1x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa1), 3);
390         vacc2x0123 = vmlal_lane_s16(
391             vacc2x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa2), 3);
392         vacc2x4567 = vmlal_lane_s16(
393             vacc2x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa2), 3);
394         vacc3x0123 = vmlal_lane_s16(
395             vacc3x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa3), 3);
396         vacc3x4567 = vmlal_lane_s16(
397             vacc3x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa3), 3);
398         vacc4x0123 = vmlal_lane_s16(
399             vacc4x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa4), 3);
400         vacc4x4567 = vmlal_lane_s16(
401             vacc4x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa4), 3);
402         vacc5x0123 = vmlal_lane_s16(
403             vacc5x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa5), 3);
404         vacc5x4567 = vmlal_lane_s16(
405             vacc5x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa5), 3);
406         vacc6x0123 = vmlal_lane_s16(
407             vacc6x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa6), 3);
408         vacc6x4567 = vmlal_lane_s16(
409             vacc6x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa6), 3);
410         vacc7x0123 = vmlal_lane_s16(
411             vacc7x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa7), 3);
412         vacc7x4567 = vmlal_lane_s16(
413             vacc7x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa7), 3);
414       }
415     }
416     if (k != 0) {
417       const size_t a_predecrement = 8 - k;
418       const int64x1_t va_shift = vmov_n_s64(-8 * a_predecrement);
419       const uint8x8_t va0 = vreinterpret_u8_u64(vshl_u64(
420           vreinterpret_u64_u8(vld1_u8(a0 - a_predecrement)), va_shift));
421       const uint8x8_t va1 = vreinterpret_u8_u64(vshl_u64(
422           vreinterpret_u64_u8(vld1_u8(a1 - a_predecrement)), va_shift));
423       const uint8x8_t va2 = vreinterpret_u8_u64(vshl_u64(
424           vreinterpret_u64_u8(vld1_u8(a2 - a_predecrement)), va_shift));
425       const uint8x8_t va3 = vreinterpret_u8_u64(vshl_u64(
426           vreinterpret_u64_u8(vld1_u8(a3 - a_predecrement)), va_shift));
427       const uint8x8_t va4 = vreinterpret_u8_u64(vshl_u64(
428           vreinterpret_u64_u8(vld1_u8(a4 - a_predecrement)), va_shift));
429       const uint8x8_t va5 = vreinterpret_u8_u64(vshl_u64(
430           vreinterpret_u64_u8(vld1_u8(a5 - a_predecrement)), va_shift));
431       const uint8x8_t va6 = vreinterpret_u8_u64(vshl_u64(
432           vreinterpret_u64_u8(vld1_u8(a6 - a_predecrement)), va_shift));
433       const uint8x8_t va7 = vreinterpret_u8_u64(vshl_u64(
434           vreinterpret_u64_u8(vld1_u8(a7 - a_predecrement)), va_shift));
435       const int16x8_t vxa0 =
436           vreinterpretq_s16_u16(sub_zero_point(va0, va_zero_point));
437       const int16x8_t vxa1 =
438           vreinterpretq_s16_u16(sub_zero_point(va1, va_zero_point));
439       const int16x8_t vxa2 =
440           vreinterpretq_s16_u16(sub_zero_point(va2, va_zero_point));
441       const int16x8_t vxa3 =
442           vreinterpretq_s16_u16(sub_zero_point(va3, va_zero_point));
443       const int16x8_t vxa4 =
444           vreinterpretq_s16_u16(sub_zero_point(va4, va_zero_point));
445       const int16x8_t vxa5 =
446           vreinterpretq_s16_u16(sub_zero_point(va5, va_zero_point));
447       const int16x8_t vxa6 =
448           vreinterpretq_s16_u16(sub_zero_point(va6, va_zero_point));
449       const int16x8_t vxa7 =
450           vreinterpretq_s16_u16(sub_zero_point(va7, va_zero_point));
451 
452       {
453         const uint8x8_t vb01234567 = vld1_u8(w);
454         w = (void*)((uintptr_t)w + sizeof(uint8x8_t));
455         const int16x8_t vxb01234567 =
456             vreinterpretq_s16_u16(vsubl_u8(vb01234567, vb_zero_point));
457 
458         vacc0x0123 = vmlal_lane_s16(
459             vacc0x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa0), 0);
460         vacc0x4567 = vmlal_lane_s16(
461             vacc0x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa0), 0);
462         vacc1x0123 = vmlal_lane_s16(
463             vacc1x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa1), 0);
464         vacc1x4567 = vmlal_lane_s16(
465             vacc1x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa1), 0);
466         vacc2x0123 = vmlal_lane_s16(
467             vacc2x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa2), 0);
468         vacc2x4567 = vmlal_lane_s16(
469             vacc2x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa2), 0);
470         vacc3x0123 = vmlal_lane_s16(
471             vacc3x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa3), 0);
472         vacc3x4567 = vmlal_lane_s16(
473             vacc3x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa3), 0);
474         vacc4x0123 = vmlal_lane_s16(
475             vacc4x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa4), 0);
476         vacc4x4567 = vmlal_lane_s16(
477             vacc4x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa4), 0);
478         vacc5x0123 = vmlal_lane_s16(
479             vacc5x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa5), 0);
480         vacc5x4567 = vmlal_lane_s16(
481             vacc5x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa5), 0);
482         vacc6x0123 = vmlal_lane_s16(
483             vacc6x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa6), 0);
484         vacc6x4567 = vmlal_lane_s16(
485             vacc6x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa6), 0);
486         vacc7x0123 = vmlal_lane_s16(
487             vacc7x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa7), 0);
488         vacc7x4567 = vmlal_lane_s16(
489             vacc7x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa7), 0);
490       }
491 
492       if (k >= 2) {
493         const uint8x8_t vb01234567 = vld1_u8(w);
494         w = (void*)((uintptr_t)w + sizeof(uint8x8_t));
495         const int16x8_t vxb01234567 =
496             vreinterpretq_s16_u16(vsubl_u8(vb01234567, vb_zero_point));
497 
498         vacc0x0123 = vmlal_lane_s16(
499             vacc0x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa0), 1);
500         vacc0x4567 = vmlal_lane_s16(
501             vacc0x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa0), 1);
502         vacc1x0123 = vmlal_lane_s16(
503             vacc1x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa1), 1);
504         vacc1x4567 = vmlal_lane_s16(
505             vacc1x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa1), 1);
506         vacc2x0123 = vmlal_lane_s16(
507             vacc2x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa2), 1);
508         vacc2x4567 = vmlal_lane_s16(
509             vacc2x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa2), 1);
510         vacc3x0123 = vmlal_lane_s16(
511             vacc3x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa3), 1);
512         vacc3x4567 = vmlal_lane_s16(
513             vacc3x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa3), 1);
514         vacc4x0123 = vmlal_lane_s16(
515             vacc4x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa4), 1);
516         vacc4x4567 = vmlal_lane_s16(
517             vacc4x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa4), 1);
518         vacc5x0123 = vmlal_lane_s16(
519             vacc5x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa5), 1);
520         vacc5x4567 = vmlal_lane_s16(
521             vacc5x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa5), 1);
522         vacc6x0123 = vmlal_lane_s16(
523             vacc6x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa6), 1);
524         vacc6x4567 = vmlal_lane_s16(
525             vacc6x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa6), 1);
526         vacc7x0123 = vmlal_lane_s16(
527             vacc7x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa7), 1);
528         vacc7x4567 = vmlal_lane_s16(
529             vacc7x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa7), 1);
530 
531         if (k > 2) {
532           const uint8x8_t vb01234567 = vld1_u8(w);
533           w = (void*)((uintptr_t)w + sizeof(uint8x8_t));
534           const int16x8_t vxb01234567 =
535               vreinterpretq_s16_u16(vsubl_u8(vb01234567, vb_zero_point));
536 
537           vacc0x0123 = vmlal_lane_s16(
538               vacc0x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa0), 2);
539           vacc0x4567 = vmlal_lane_s16(
540               vacc0x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa0), 2);
541           vacc1x0123 = vmlal_lane_s16(
542               vacc1x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa1), 2);
543           vacc1x4567 = vmlal_lane_s16(
544               vacc1x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa1), 2);
545           vacc2x0123 = vmlal_lane_s16(
546               vacc2x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa2), 2);
547           vacc2x4567 = vmlal_lane_s16(
548               vacc2x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa2), 2);
549           vacc3x0123 = vmlal_lane_s16(
550               vacc3x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa3), 2);
551           vacc3x4567 = vmlal_lane_s16(
552               vacc3x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa3), 2);
553           vacc4x0123 = vmlal_lane_s16(
554               vacc4x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa4), 2);
555           vacc4x4567 = vmlal_lane_s16(
556               vacc4x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa4), 2);
557           vacc5x0123 = vmlal_lane_s16(
558               vacc5x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa5), 2);
559           vacc5x4567 = vmlal_lane_s16(
560               vacc5x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa5), 2);
561           vacc6x0123 = vmlal_lane_s16(
562               vacc6x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa6), 2);
563           vacc6x4567 = vmlal_lane_s16(
564               vacc6x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa6), 2);
565           vacc7x0123 = vmlal_lane_s16(
566               vacc7x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa7), 2);
567           vacc7x4567 = vmlal_lane_s16(
568               vacc7x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa7), 2);
569 
570           if (k >= 4) {
571             const uint8x8_t vb01234567 = vld1_u8(w);
572             w = (void*)((uintptr_t)w + sizeof(uint8x8_t));
573             const int16x8_t vxb01234567 =
574                 vreinterpretq_s16_u16(vsubl_u8(vb01234567, vb_zero_point));
575 
576             vacc0x0123 = vmlal_lane_s16(
577                 vacc0x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa0), 3);
578             vacc0x4567 = vmlal_lane_s16(
579                 vacc0x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa0), 3);
580             vacc1x0123 = vmlal_lane_s16(
581                 vacc1x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa1), 3);
582             vacc1x4567 = vmlal_lane_s16(
583                 vacc1x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa1), 3);
584             vacc2x0123 = vmlal_lane_s16(
585                 vacc2x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa2), 3);
586             vacc2x4567 = vmlal_lane_s16(
587                 vacc2x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa2), 3);
588             vacc3x0123 = vmlal_lane_s16(
589                 vacc3x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa3), 3);
590             vacc3x4567 = vmlal_lane_s16(
591                 vacc3x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa3), 3);
592             vacc4x0123 = vmlal_lane_s16(
593                 vacc4x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa4), 3);
594             vacc4x4567 = vmlal_lane_s16(
595                 vacc4x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa4), 3);
596             vacc5x0123 = vmlal_lane_s16(
597                 vacc5x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa5), 3);
598             vacc5x4567 = vmlal_lane_s16(
599                 vacc5x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa5), 3);
600             vacc6x0123 = vmlal_lane_s16(
601                 vacc6x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa6), 3);
602             vacc6x4567 = vmlal_lane_s16(
603                 vacc6x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa6), 3);
604             vacc7x0123 = vmlal_lane_s16(
605                 vacc7x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa7), 3);
606             vacc7x4567 = vmlal_lane_s16(
607                 vacc7x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa7), 3);
608 
609             if (k > 4) {
610               const uint8x8_t vb01234567 = vld1_u8(w);
611               w = (void*)((uintptr_t)w + sizeof(uint8x8_t));
612               const int16x8_t vxb01234567 =
613                   vreinterpretq_s16_u16(vsubl_u8(vb01234567, vb_zero_point));
614 
615               vacc0x0123 = vmlal_lane_s16(
616                   vacc0x0123,
617                   vget_low_s16(vxb01234567),
618                   vget_high_s16(vxa0),
619                   0);
620               vacc0x4567 = vmlal_lane_s16(
621                   vacc0x4567,
622                   vget_high_s16(vxb01234567),
623                   vget_high_s16(vxa0),
624                   0);
625               vacc1x0123 = vmlal_lane_s16(
626                   vacc1x0123,
627                   vget_low_s16(vxb01234567),
628                   vget_high_s16(vxa1),
629                   0);
630               vacc1x4567 = vmlal_lane_s16(
631                   vacc1x4567,
632                   vget_high_s16(vxb01234567),
633                   vget_high_s16(vxa1),
634                   0);
635               vacc2x0123 = vmlal_lane_s16(
636                   vacc2x0123,
637                   vget_low_s16(vxb01234567),
638                   vget_high_s16(vxa2),
639                   0);
640               vacc2x4567 = vmlal_lane_s16(
641                   vacc2x4567,
642                   vget_high_s16(vxb01234567),
643                   vget_high_s16(vxa2),
644                   0);
645               vacc3x0123 = vmlal_lane_s16(
646                   vacc3x0123,
647                   vget_low_s16(vxb01234567),
648                   vget_high_s16(vxa3),
649                   0);
650               vacc3x4567 = vmlal_lane_s16(
651                   vacc3x4567,
652                   vget_high_s16(vxb01234567),
653                   vget_high_s16(vxa3),
654                   0);
655               vacc4x0123 = vmlal_lane_s16(
656                   vacc4x0123,
657                   vget_low_s16(vxb01234567),
658                   vget_high_s16(vxa4),
659                   0);
660               vacc4x4567 = vmlal_lane_s16(
661                   vacc4x4567,
662                   vget_high_s16(vxb01234567),
663                   vget_high_s16(vxa4),
664                   0);
665               vacc5x0123 = vmlal_lane_s16(
666                   vacc5x0123,
667                   vget_low_s16(vxb01234567),
668                   vget_high_s16(vxa5),
669                   0);
670               vacc5x4567 = vmlal_lane_s16(
671                   vacc5x4567,
672                   vget_high_s16(vxb01234567),
673                   vget_high_s16(vxa5),
674                   0);
675               vacc6x0123 = vmlal_lane_s16(
676                   vacc6x0123,
677                   vget_low_s16(vxb01234567),
678                   vget_high_s16(vxa6),
679                   0);
680               vacc6x4567 = vmlal_lane_s16(
681                   vacc6x4567,
682                   vget_high_s16(vxb01234567),
683                   vget_high_s16(vxa6),
684                   0);
685               vacc7x0123 = vmlal_lane_s16(
686                   vacc7x0123,
687                   vget_low_s16(vxb01234567),
688                   vget_high_s16(vxa7),
689                   0);
690               vacc7x4567 = vmlal_lane_s16(
691                   vacc7x4567,
692                   vget_high_s16(vxb01234567),
693                   vget_high_s16(vxa7),
694                   0);
695 
696               if (k >= 6) {
697                 const uint8x8_t vb01234567 = vld1_u8(w);
698                 w = (void*)((uintptr_t)w + sizeof(uint8x8_t));
699                 const int16x8_t vxb01234567 =
700                     vreinterpretq_s16_u16(vsubl_u8(vb01234567, vb_zero_point));
701 
702                 vacc0x0123 = vmlal_lane_s16(
703                     vacc0x0123,
704                     vget_low_s16(vxb01234567),
705                     vget_high_s16(vxa0),
706                     1);
707                 vacc0x4567 = vmlal_lane_s16(
708                     vacc0x4567,
709                     vget_high_s16(vxb01234567),
710                     vget_high_s16(vxa0),
711                     1);
712                 vacc1x0123 = vmlal_lane_s16(
713                     vacc1x0123,
714                     vget_low_s16(vxb01234567),
715                     vget_high_s16(vxa1),
716                     1);
717                 vacc1x4567 = vmlal_lane_s16(
718                     vacc1x4567,
719                     vget_high_s16(vxb01234567),
720                     vget_high_s16(vxa1),
721                     1);
722                 vacc2x0123 = vmlal_lane_s16(
723                     vacc2x0123,
724                     vget_low_s16(vxb01234567),
725                     vget_high_s16(vxa2),
726                     1);
727                 vacc2x4567 = vmlal_lane_s16(
728                     vacc2x4567,
729                     vget_high_s16(vxb01234567),
730                     vget_high_s16(vxa2),
731                     1);
732                 vacc3x0123 = vmlal_lane_s16(
733                     vacc3x0123,
734                     vget_low_s16(vxb01234567),
735                     vget_high_s16(vxa3),
736                     1);
737                 vacc3x4567 = vmlal_lane_s16(
738                     vacc3x4567,
739                     vget_high_s16(vxb01234567),
740                     vget_high_s16(vxa3),
741                     1);
742                 vacc4x0123 = vmlal_lane_s16(
743                     vacc4x0123,
744                     vget_low_s16(vxb01234567),
745                     vget_high_s16(vxa4),
746                     1);
747                 vacc4x4567 = vmlal_lane_s16(
748                     vacc4x4567,
749                     vget_high_s16(vxb01234567),
750                     vget_high_s16(vxa4),
751                     1);
752                 vacc5x0123 = vmlal_lane_s16(
753                     vacc5x0123,
754                     vget_low_s16(vxb01234567),
755                     vget_high_s16(vxa5),
756                     1);
757                 vacc5x4567 = vmlal_lane_s16(
758                     vacc5x4567,
759                     vget_high_s16(vxb01234567),
760                     vget_high_s16(vxa5),
761                     1);
762                 vacc6x0123 = vmlal_lane_s16(
763                     vacc6x0123,
764                     vget_low_s16(vxb01234567),
765                     vget_high_s16(vxa6),
766                     1);
767                 vacc6x4567 = vmlal_lane_s16(
768                     vacc6x4567,
769                     vget_high_s16(vxb01234567),
770                     vget_high_s16(vxa6),
771                     1);
772                 vacc7x0123 = vmlal_lane_s16(
773                     vacc7x0123,
774                     vget_low_s16(vxb01234567),
775                     vget_high_s16(vxa7),
776                     1);
777                 vacc7x4567 = vmlal_lane_s16(
778                     vacc7x4567,
779                     vget_high_s16(vxb01234567),
780                     vget_high_s16(vxa7),
781                     1);
782 
783                 if (k > 6) {
784                   const uint8x8_t vb01234567 = vld1_u8(w);
785                   w = (void*)((uintptr_t)w + sizeof(uint8x8_t));
786                   const int16x8_t vxb01234567 = vreinterpretq_s16_u16(
787                       vsubl_u8(vb01234567, vb_zero_point));
788 
789                   vacc0x0123 = vmlal_lane_s16(
790                       vacc0x0123,
791                       vget_low_s16(vxb01234567),
792                       vget_high_s16(vxa0),
793                       2);
794                   vacc0x4567 = vmlal_lane_s16(
795                       vacc0x4567,
796                       vget_high_s16(vxb01234567),
797                       vget_high_s16(vxa0),
798                       2);
799                   vacc1x0123 = vmlal_lane_s16(
800                       vacc1x0123,
801                       vget_low_s16(vxb01234567),
802                       vget_high_s16(vxa1),
803                       2);
804                   vacc1x4567 = vmlal_lane_s16(
805                       vacc1x4567,
806                       vget_high_s16(vxb01234567),
807                       vget_high_s16(vxa1),
808                       2);
809                   vacc2x0123 = vmlal_lane_s16(
810                       vacc2x0123,
811                       vget_low_s16(vxb01234567),
812                       vget_high_s16(vxa2),
813                       2);
814                   vacc2x4567 = vmlal_lane_s16(
815                       vacc2x4567,
816                       vget_high_s16(vxb01234567),
817                       vget_high_s16(vxa2),
818                       2);
819                   vacc3x0123 = vmlal_lane_s16(
820                       vacc3x0123,
821                       vget_low_s16(vxb01234567),
822                       vget_high_s16(vxa3),
823                       2);
824                   vacc3x4567 = vmlal_lane_s16(
825                       vacc3x4567,
826                       vget_high_s16(vxb01234567),
827                       vget_high_s16(vxa3),
828                       2);
829                   vacc4x0123 = vmlal_lane_s16(
830                       vacc4x0123,
831                       vget_low_s16(vxb01234567),
832                       vget_high_s16(vxa4),
833                       2);
834                   vacc4x4567 = vmlal_lane_s16(
835                       vacc4x4567,
836                       vget_high_s16(vxb01234567),
837                       vget_high_s16(vxa4),
838                       2);
839                   vacc5x0123 = vmlal_lane_s16(
840                       vacc5x0123,
841                       vget_low_s16(vxb01234567),
842                       vget_high_s16(vxa5),
843                       2);
844                   vacc5x4567 = vmlal_lane_s16(
845                       vacc5x4567,
846                       vget_high_s16(vxb01234567),
847                       vget_high_s16(vxa5),
848                       2);
849                   vacc6x0123 = vmlal_lane_s16(
850                       vacc6x0123,
851                       vget_low_s16(vxb01234567),
852                       vget_high_s16(vxa6),
853                       2);
854                   vacc6x4567 = vmlal_lane_s16(
855                       vacc6x4567,
856                       vget_high_s16(vxb01234567),
857                       vget_high_s16(vxa6),
858                       2);
859                   vacc7x0123 = vmlal_lane_s16(
860                       vacc7x0123,
861                       vget_low_s16(vxb01234567),
862                       vget_high_s16(vxa7),
863                       2);
864                   vacc7x4567 = vmlal_lane_s16(
865                       vacc7x4567,
866                       vget_high_s16(vxb01234567),
867                       vget_high_s16(vxa7),
868                       2);
869                 }
870               }
871             }
872           }
873         }
874       }
875     }
876   } while (--ks != 0);
877 
878   const float32x4_t requantization_scale_c0123 =
879       vld1q_f32(
880           &quantization_params->neon.requantization_scales[output_channel_index]
881           );
882   const float32x4_t requantization_scale_c4567 =
883       vld1q_f32(
884           &quantization_params->neon.requantization_scales[
885               output_channel_index + 4]);
886 
887   const float32x4_t vacc0x0123_f =
888     vmulq_f32(vcvtq_f32_s32(vacc0x0123), requantization_scale_c0123);
889   const float32x4_t vacc1x0123_f =
890     vmulq_f32(vcvtq_f32_s32(vacc1x0123), requantization_scale_c0123);
891   const float32x4_t vacc2x0123_f =
892     vmulq_f32(vcvtq_f32_s32(vacc2x0123), requantization_scale_c0123);
893   const float32x4_t vacc3x0123_f =
894     vmulq_f32(vcvtq_f32_s32(vacc3x0123), requantization_scale_c0123);
895   const float32x4_t vacc0x4567_f =
896     vmulq_f32(vcvtq_f32_s32(vacc0x4567), requantization_scale_c4567);
897   const float32x4_t vacc1x4567_f =
898     vmulq_f32(vcvtq_f32_s32(vacc1x4567), requantization_scale_c4567);
899   const float32x4_t vacc2x4567_f =
900     vmulq_f32(vcvtq_f32_s32(vacc2x4567), requantization_scale_c4567);
901   const float32x4_t vacc3x4567_f =
902     vmulq_f32(vcvtq_f32_s32(vacc3x4567), requantization_scale_c4567);
903   const float32x4_t vacc4x0123_f =
904     vmulq_f32(vcvtq_f32_s32(vacc4x0123), requantization_scale_c0123);
905   const float32x4_t vacc5x0123_f =
906     vmulq_f32(vcvtq_f32_s32(vacc5x0123), requantization_scale_c0123);
907   const float32x4_t vacc6x0123_f =
908     vmulq_f32(vcvtq_f32_s32(vacc6x0123), requantization_scale_c0123);
909   const float32x4_t vacc7x0123_f =
910     vmulq_f32(vcvtq_f32_s32(vacc7x0123), requantization_scale_c0123);
911   const float32x4_t vacc4x4567_f =
912     vmulq_f32(vcvtq_f32_s32(vacc4x4567), requantization_scale_c4567);
913   const float32x4_t vacc5x4567_f =
914     vmulq_f32(vcvtq_f32_s32(vacc5x4567), requantization_scale_c4567);
915   const float32x4_t vacc6x4567_f =
916     vmulq_f32(vcvtq_f32_s32(vacc6x4567), requantization_scale_c4567);
917   const float32x4_t vacc7x4567_f =
918     vmulq_f32(vcvtq_f32_s32(vacc7x4567), requantization_scale_c4567);
919 
920 #ifdef __aarch64__
921   const int16x8_t voutput_zero_point =
922       vld1q_dup_s16(&quantization_params->neon.output_zero_point);
923 
924   vacc0x0123 = vcvtnq_s32_f32(vacc0x0123_f);
925   vacc1x0123 = vcvtnq_s32_f32(vacc1x0123_f);
926   vacc2x0123 = vcvtnq_s32_f32(vacc2x0123_f);
927   vacc3x0123 = vcvtnq_s32_f32(vacc3x0123_f);
928   vacc0x4567 = vcvtnq_s32_f32(vacc0x4567_f);
929   vacc1x4567 = vcvtnq_s32_f32(vacc1x4567_f);
930   vacc2x4567 = vcvtnq_s32_f32(vacc2x4567_f);
931   vacc3x4567 = vcvtnq_s32_f32(vacc3x4567_f);
932   vacc4x0123 = vcvtnq_s32_f32(vacc4x0123_f);
933   vacc5x0123 = vcvtnq_s32_f32(vacc5x0123_f);
934   vacc6x0123 = vcvtnq_s32_f32(vacc6x0123_f);
935   vacc7x0123 = vcvtnq_s32_f32(vacc7x0123_f);
936   vacc4x4567 = vcvtnq_s32_f32(vacc4x4567_f);
937   vacc5x4567 = vcvtnq_s32_f32(vacc5x4567_f);
938   vacc6x4567 = vcvtnq_s32_f32(vacc6x4567_f);
939   vacc7x4567 = vcvtnq_s32_f32(vacc7x4567_f);
940 
941   const int16x8_t vacc0x01234567 = vqaddq_s16(
942       vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
943   const int16x8_t vacc1x01234567 = vqaddq_s16(
944       vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
945   const int16x8_t vacc2x01234567 = vqaddq_s16(
946       vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567), voutput_zero_point);
947   const int16x8_t vacc3x01234567 = vqaddq_s16(
948       vqmovn_high_s32(vqmovn_s32(vacc3x0123), vacc3x4567), voutput_zero_point);
949   const int16x8_t vacc4x01234567 = vqaddq_s16(
950       vqmovn_high_s32(vqmovn_s32(vacc4x0123), vacc4x4567), voutput_zero_point);
951   const int16x8_t vacc5x01234567 = vqaddq_s16(
952       vqmovn_high_s32(vqmovn_s32(vacc5x0123), vacc5x4567), voutput_zero_point);
953   const int16x8_t vacc6x01234567 = vqaddq_s16(
954       vqmovn_high_s32(vqmovn_s32(vacc6x0123), vacc6x4567), voutput_zero_point);
955   const int16x8_t vacc7x01234567 = vqaddq_s16(
956       vqmovn_high_s32(vqmovn_s32(vacc7x0123), vacc7x4567), voutput_zero_point);
957 
958   uint8x16_t vout0x01234567_1x01234567 =
959       vqmovun_high_s16(vqmovun_s16(vacc0x01234567), vacc1x01234567);
960   uint8x16_t vout2x01234567_3x01234567 =
961       vqmovun_high_s16(vqmovun_s16(vacc2x01234567), vacc3x01234567);
962   uint8x16_t vout4x01234567_5x01234567 =
963       vqmovun_high_s16(vqmovun_s16(vacc4x01234567), vacc5x01234567);
964   uint8x16_t vout6x01234567_7x01234567 =
965       vqmovun_high_s16(vqmovun_s16(vacc6x01234567), vacc7x01234567);
966 
967   const uint8x16_t voutput_min =
968       vld1q_dup_u8(&quantization_params->neon.output_min);
969   const uint8x16_t voutput_max =
970       vld1q_dup_u8(&quantization_params->neon.output_max);
971 
972   vout0x01234567_1x01234567 = vmaxq_u8(vout0x01234567_1x01234567, voutput_min);
973   vout2x01234567_3x01234567 = vmaxq_u8(vout2x01234567_3x01234567, voutput_min);
974   vout4x01234567_5x01234567 = vmaxq_u8(vout4x01234567_5x01234567, voutput_min);
975   vout6x01234567_7x01234567 = vmaxq_u8(vout6x01234567_7x01234567, voutput_min);
976   vout0x01234567_1x01234567 = vminq_u8(vout0x01234567_1x01234567, voutput_max);
977   vout2x01234567_3x01234567 = vminq_u8(vout2x01234567_3x01234567, voutput_max);
978   vout4x01234567_5x01234567 = vminq_u8(vout4x01234567_5x01234567, voutput_max);
979   vout6x01234567_7x01234567 = vminq_u8(vout6x01234567_7x01234567, voutput_max);
980 #else
981   const float32x4_t vfmin = vdupq_n_f32(quantization_params->neon.vfmin);
982   const float32x4_t vfmax = vdupq_n_f32(quantization_params->neon.vfmax);
983   const float32x4_t vfmagic = vdupq_n_f32(quantization_params->neon.vfmagic);
984   const int32x4_t vimagic = vdupq_n_s32(quantization_params->neon.vimagic);
985 
986   const float32x4_t vacc0x0123_f_clamped =
987       vminq_f32(vmaxq_f32(vacc0x0123_f, vfmin), vfmax);
988   const float32x4_t vacc1x0123_f_clamped =
989       vminq_f32(vmaxq_f32(vacc1x0123_f, vfmin), vfmax);
990   const float32x4_t vacc2x0123_f_clamped =
991       vminq_f32(vmaxq_f32(vacc2x0123_f, vfmin), vfmax);
992   const float32x4_t vacc3x0123_f_clamped =
993       vminq_f32(vmaxq_f32(vacc3x0123_f, vfmin), vfmax);
994   const float32x4_t vacc0x4567_f_clamped =
995       vminq_f32(vmaxq_f32(vacc0x4567_f, vfmin), vfmax);
996   const float32x4_t vacc1x4567_f_clamped =
997       vminq_f32(vmaxq_f32(vacc1x4567_f, vfmin), vfmax);
998   const float32x4_t vacc2x4567_f_clamped =
999       vminq_f32(vmaxq_f32(vacc2x4567_f, vfmin), vfmax);
1000   const float32x4_t vacc3x4567_f_clamped =
1001       vminq_f32(vmaxq_f32(vacc3x4567_f, vfmin), vfmax);
1002   const float32x4_t vacc4x0123_f_clamped =
1003       vminq_f32(vmaxq_f32(vacc4x0123_f, vfmin), vfmax);
1004   const float32x4_t vacc5x0123_f_clamped =
1005       vminq_f32(vmaxq_f32(vacc5x0123_f, vfmin), vfmax);
1006   const float32x4_t vacc6x0123_f_clamped =
1007       vminq_f32(vmaxq_f32(vacc6x0123_f, vfmin), vfmax);
1008   const float32x4_t vacc7x0123_f_clamped =
1009       vminq_f32(vmaxq_f32(vacc7x0123_f, vfmin), vfmax);
1010   const float32x4_t vacc4x4567_f_clamped =
1011       vminq_f32(vmaxq_f32(vacc4x4567_f, vfmin), vfmax);
1012   const float32x4_t vacc5x4567_f_clamped =
1013       vminq_f32(vmaxq_f32(vacc5x4567_f, vfmin), vfmax);
1014   const float32x4_t vacc6x4567_f_clamped =
1015       vminq_f32(vmaxq_f32(vacc6x4567_f, vfmin), vfmax);
1016   const float32x4_t vacc7x4567_f_clamped =
1017       vminq_f32(vmaxq_f32(vacc7x4567_f, vfmin), vfmax);
1018 
1019   vacc0x0123 = vsubq_s32(
1020       vreinterpretq_s32_f32(vaddq_f32(vacc0x0123_f_clamped, vfmagic)), vimagic);
1021   vacc1x0123 = vsubq_s32(
1022       vreinterpretq_s32_f32(vaddq_f32(vacc1x0123_f_clamped, vfmagic)), vimagic);
1023   vacc2x0123 = vsubq_s32(
1024       vreinterpretq_s32_f32(vaddq_f32(vacc2x0123_f_clamped, vfmagic)), vimagic);
1025   vacc3x0123 = vsubq_s32(
1026       vreinterpretq_s32_f32(vaddq_f32(vacc3x0123_f_clamped, vfmagic)), vimagic);
1027   vacc0x4567 = vsubq_s32(
1028       vreinterpretq_s32_f32(vaddq_f32(vacc0x4567_f_clamped, vfmagic)), vimagic);
1029   vacc1x4567 = vsubq_s32(
1030       vreinterpretq_s32_f32(vaddq_f32(vacc1x4567_f_clamped, vfmagic)), vimagic);
1031   vacc2x4567 = vsubq_s32(
1032       vreinterpretq_s32_f32(vaddq_f32(vacc2x4567_f_clamped, vfmagic)), vimagic);
1033   vacc3x4567 = vsubq_s32(
1034       vreinterpretq_s32_f32(vaddq_f32(vacc3x4567_f_clamped, vfmagic)), vimagic);
1035   vacc4x0123 = vsubq_s32(
1036       vreinterpretq_s32_f32(vaddq_f32(vacc4x0123_f_clamped, vfmagic)), vimagic);
1037   vacc5x0123 = vsubq_s32(
1038       vreinterpretq_s32_f32(vaddq_f32(vacc5x0123_f_clamped, vfmagic)), vimagic);
1039   vacc6x0123 = vsubq_s32(
1040       vreinterpretq_s32_f32(vaddq_f32(vacc6x0123_f_clamped, vfmagic)), vimagic);
1041   vacc7x0123 = vsubq_s32(
1042       vreinterpretq_s32_f32(vaddq_f32(vacc7x0123_f_clamped, vfmagic)), vimagic);
1043   vacc4x4567 = vsubq_s32(
1044       vreinterpretq_s32_f32(vaddq_f32(vacc4x4567_f_clamped, vfmagic)), vimagic);
1045   vacc5x4567 = vsubq_s32(
1046       vreinterpretq_s32_f32(vaddq_f32(vacc5x4567_f_clamped, vfmagic)), vimagic);
1047   vacc6x4567 = vsubq_s32(
1048       vreinterpretq_s32_f32(vaddq_f32(vacc6x4567_f_clamped, vfmagic)), vimagic);
1049   vacc7x4567 = vsubq_s32(
1050       vreinterpretq_s32_f32(vaddq_f32(vacc7x4567_f_clamped, vfmagic)), vimagic);
1051 
1052   const int16x8_t vacc0x01234567 =
1053       vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567));
1054   const int16x8_t vacc1x01234567 =
1055       vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567));
1056   const int16x8_t vacc2x01234567 =
1057       vcombine_s16(vqmovn_s32(vacc2x0123), vqmovn_s32(vacc2x4567));
1058   const int16x8_t vacc3x01234567 =
1059       vcombine_s16(vqmovn_s32(vacc3x0123), vqmovn_s32(vacc3x4567));
1060   const int16x8_t vacc4x01234567 =
1061       vcombine_s16(vqmovn_s32(vacc4x0123), vqmovn_s32(vacc4x4567));
1062   const int16x8_t vacc5x01234567 =
1063       vcombine_s16(vqmovn_s32(vacc5x0123), vqmovn_s32(vacc5x4567));
1064   const int16x8_t vacc6x01234567 =
1065       vcombine_s16(vqmovn_s32(vacc6x0123), vqmovn_s32(vacc6x4567));
1066   const int16x8_t vacc7x01234567 =
1067       vcombine_s16(vqmovn_s32(vacc7x0123), vqmovn_s32(vacc7x4567));
1068 
1069   uint8x16_t vout0x01234567_1x01234567 =
1070       vcombine_u8(vqmovun_s16(vacc0x01234567), vqmovun_s16(vacc1x01234567));
1071   uint8x16_t vout2x01234567_3x01234567 =
1072       vcombine_u8(vqmovun_s16(vacc2x01234567), vqmovun_s16(vacc3x01234567));
1073   uint8x16_t vout4x01234567_5x01234567 =
1074       vcombine_u8(vqmovun_s16(vacc4x01234567), vqmovun_s16(vacc5x01234567));
1075   uint8x16_t vout6x01234567_7x01234567 =
1076       vcombine_u8(vqmovun_s16(vacc6x01234567), vqmovun_s16(vacc7x01234567));
1077 #endif
1078 
1079   uint8_t* c0 = c;
1080   uint8_t* c1 = (uint8_t*)((uintptr_t)c0 + c_stride);
1081   if (mr < 2) {
1082     c1 = c0;
1083   }
1084   uint8_t* c2 = (uint8_t*)((uintptr_t)c1 + c_stride);
1085   if (mr <= 2) {
1086     c2 = c1;
1087   }
1088   uint8_t* c3 = (uint8_t*)((uintptr_t)c2 + c_stride);
1089   if (mr < 4) {
1090     c3 = c2;
1091   }
1092   uint8_t* c4 = (uint8_t*)((uintptr_t)c3 + c_stride);
1093   if (mr <= 4) {
1094     c4 = c3;
1095   }
1096   uint8_t* c5 = (uint8_t*)((uintptr_t)c4 + c_stride);
1097   if (mr < 6) {
1098     c5 = c4;
1099   }
1100   uint8_t* c6 = (uint8_t*)((uintptr_t)c5 + c_stride);
1101   if (mr <= 6) {
1102     c6 = c5;
1103   }
1104   uint8_t* c7 = (uint8_t*)((uintptr_t)c6 + c_stride);
1105   if (mr != 8) {
1106     c7 = c6;
1107   }
1108   if (nr == 8) {
1109     vst1_u8(c0, vget_low_u8(vout0x01234567_1x01234567));
1110     vst1_u8(c1, vget_high_u8(vout0x01234567_1x01234567));
1111     vst1_u8(c2, vget_low_u8(vout2x01234567_3x01234567));
1112     vst1_u8(c3, vget_high_u8(vout2x01234567_3x01234567));
1113     vst1_u8(c4, vget_low_u8(vout4x01234567_5x01234567));
1114     vst1_u8(c5, vget_high_u8(vout4x01234567_5x01234567));
1115     vst1_u8(c6, vget_low_u8(vout6x01234567_7x01234567));
1116     vst1_u8(c7, vget_high_u8(vout6x01234567_7x01234567));
1117   } else {
1118     if (nr >= 4) {
1119       vst1q_lane_u32(
1120           __builtin_assume_aligned(c0, 1),
1121           vreinterpretq_u32_u8(vout0x01234567_1x01234567),
1122           0);
1123       c0 += 4;
1124       vst1q_lane_u32(
1125           __builtin_assume_aligned(c1, 1),
1126           vreinterpretq_u32_u8(vout0x01234567_1x01234567),
1127           2);
1128       c1 += 4;
1129       vst1q_lane_u32(
1130           __builtin_assume_aligned(c2, 1),
1131           vreinterpretq_u32_u8(vout2x01234567_3x01234567),
1132           0);
1133       c2 += 4;
1134       vst1q_lane_u32(
1135           __builtin_assume_aligned(c3, 1),
1136           vreinterpretq_u32_u8(vout2x01234567_3x01234567),
1137           2);
1138       c3 += 4;
1139       vst1q_lane_u32(
1140           __builtin_assume_aligned(c4, 1),
1141           vreinterpretq_u32_u8(vout4x01234567_5x01234567),
1142           0);
1143       c4 += 4;
1144       vst1q_lane_u32(
1145           __builtin_assume_aligned(c5, 1),
1146           vreinterpretq_u32_u8(vout4x01234567_5x01234567),
1147           2);
1148       c5 += 4;
1149       vst1q_lane_u32(
1150           __builtin_assume_aligned(c6, 1),
1151           vreinterpretq_u32_u8(vout6x01234567_7x01234567),
1152           0);
1153       c6 += 4;
1154       vst1q_lane_u32(
1155           __builtin_assume_aligned(c7, 1),
1156           vreinterpretq_u32_u8(vout6x01234567_7x01234567),
1157           2);
1158       c7 += 4;
1159       vout0x01234567_1x01234567 =
1160           vextq_u8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
1161       vout2x01234567_3x01234567 =
1162           vextq_u8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 4);
1163       vout4x01234567_5x01234567 =
1164           vextq_u8(vout4x01234567_5x01234567, vout4x01234567_5x01234567, 4);
1165       vout6x01234567_7x01234567 =
1166           vextq_u8(vout6x01234567_7x01234567, vout6x01234567_7x01234567, 4);
1167       nr -= 4;
1168     }
1169     if (nr >= 2) {
1170       vst1q_lane_u16(
1171           __builtin_assume_aligned(c0, 1),
1172           vreinterpretq_u16_u8(vout0x01234567_1x01234567),
1173           0);
1174       c0 += 2;
1175       vst1q_lane_u16(
1176           __builtin_assume_aligned(c1, 1),
1177           vreinterpretq_u16_u8(vout0x01234567_1x01234567),
1178           4);
1179       c1 += 2;
1180       vst1q_lane_u16(
1181           __builtin_assume_aligned(c2, 1),
1182           vreinterpretq_u16_u8(vout2x01234567_3x01234567),
1183           0);
1184       c2 += 2;
1185       vst1q_lane_u16(
1186           __builtin_assume_aligned(c3, 1),
1187           vreinterpretq_u16_u8(vout2x01234567_3x01234567),
1188           4);
1189       c3 += 2;
1190       vst1q_lane_u16(
1191           __builtin_assume_aligned(c4, 1),
1192           vreinterpretq_u16_u8(vout4x01234567_5x01234567),
1193           0);
1194       c4 += 2;
1195       vst1q_lane_u16(
1196           __builtin_assume_aligned(c5, 1),
1197           vreinterpretq_u16_u8(vout4x01234567_5x01234567),
1198           4);
1199       c5 += 2;
1200       vst1q_lane_u16(
1201           __builtin_assume_aligned(c6, 1),
1202           vreinterpretq_u16_u8(vout6x01234567_7x01234567),
1203           0);
1204       c6 += 2;
1205       vst1q_lane_u16(
1206           __builtin_assume_aligned(c7, 1),
1207           vreinterpretq_u16_u8(vout6x01234567_7x01234567),
1208           4);
1209       c7 += 2;
1210       vout0x01234567_1x01234567 =
1211           vextq_u8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
1212       vout2x01234567_3x01234567 =
1213           vextq_u8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 2);
1214       vout4x01234567_5x01234567 =
1215           vextq_u8(vout4x01234567_5x01234567, vout4x01234567_5x01234567, 2);
1216       vout6x01234567_7x01234567 =
1217           vextq_u8(vout6x01234567_7x01234567, vout6x01234567_7x01234567, 2);
1218       nr -= 2;
1219     }
1220     if (nr != 0) {
1221       vst1q_lane_u8(c0, vout0x01234567_1x01234567, 0);
1222       vst1q_lane_u8(c1, vout0x01234567_1x01234567, 8);
1223       vst1q_lane_u8(c2, vout2x01234567_3x01234567, 0);
1224       vst1q_lane_u8(c3, vout2x01234567_3x01234567, 8);
1225       vst1q_lane_u8(c4, vout4x01234567_5x01234567, 0);
1226       vst1q_lane_u8(c5, vout4x01234567_5x01234567, 8);
1227       vst1q_lane_u8(c6, vout6x01234567_7x01234567, 0);
1228       vst1q_lane_u8(c7, vout6x01234567_7x01234567, 8);
1229     }
1230   }
1231 }
1232