// Auto-generated file. Do not edit!
//   Template: src/qs8-gavgpool/multipass-neon.c.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>

#include <arm_neon.h>

#include <xnnpack/gavgpool.h>
#include <xnnpack/intrinsics-polyfill.h>
#include <xnnpack/math.h>

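// Multipass global average pooling microkernel for QS8 (signed 8-bit) data,
// processing 24 channels per loop iteration. The first pass sums 7 input rows
// into an int32 scratch buffer, intermediate passes accumulate 7 further rows
// each, and the final pass requantizes the sums to int8 with FP32 arithmetic.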
void xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c24(
    size_t rows,
    size_t channels,
    const int8_t* input,
    size_t input_stride,
    const int8_t* zero,
    int32_t* buffer,
    int8_t* output,
    const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(rows > 7);
  assert(channels != 0);

  const int8_t* i0 = input;
  const int8_t* i1 = (const int8_t*) ((uintptr_t) i0 + input_stride);
  const int8_t* i2 = (const int8_t*) ((uintptr_t) i1 + input_stride);
  const int8_t* i3 = (const int8_t*) ((uintptr_t) i2 + input_stride);
  const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
  const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
  const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
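  // Advance each row pointer by 7 rows while rewinding the channels already
  // consumed, rounded up to the 8-element load width.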
  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(int8_t);

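  // First pass: sum the first 7 rows of every channel, add the accumulator
  // init bias, and store the 32-bit partial sums to the scratch buffer.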
  const int32x4_t vinit_bias = vld1q_dup_s32(&params->fp32_neonv8.init_bias);
  int32_t* b = buffer;
  size_t c = channels;
  for (; c >= 24; c -= 24) {
    const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8;
    const int8x8_t vi0x89ABCDEF = vld1_s8(i0); i0 += 8;
    const int8x8_t vi0xGHIJKLMN = vld1_s8(i0); i0 += 8;
    const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8;
    const int8x8_t vi1x89ABCDEF = vld1_s8(i1); i1 += 8;
    const int8x8_t vi1xGHIJKLMN = vld1_s8(i1); i1 += 8;

    const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8;
    int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
    const int8x8_t vi2x89ABCDEF = vld1_s8(i2); i2 += 8;
    int16x8_t vsum89ABCDEF = vaddl_s8(vi0x89ABCDEF, vi1x89ABCDEF);
    const int8x8_t vi2xGHIJKLMN = vld1_s8(i2); i2 += 8;
    int16x8_t vsumGHIJKLMN = vaddl_s8(vi0xGHIJKLMN, vi1xGHIJKLMN);

    const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8;
    vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567);
    const int8x8_t vi3x89ABCDEF = vld1_s8(i3); i3 += 8;
    vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi2x89ABCDEF);
    const int8x8_t vi3xGHIJKLMN = vld1_s8(i3); i3 += 8;
    vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi2xGHIJKLMN);
    const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8;
    vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567);
    const int8x8_t vi4x89ABCDEF = vld1_s8(i4); i4 += 8;
    vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi3x89ABCDEF);
    const int8x8_t vi4xGHIJKLMN = vld1_s8(i4); i4 += 8;
    vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi3xGHIJKLMN);
    const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8;
    vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567);
    const int8x8_t vi5x89ABCDEF = vld1_s8(i5); i5 += 8;
    vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi4x89ABCDEF);
    const int8x8_t vi5xGHIJKLMN = vld1_s8(i5); i5 += 8;
    vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi4xGHIJKLMN);
    const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8;
    vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567);
    const int8x8_t vi6x89ABCDEF = vld1_s8(i6); i6 += 8;
    vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi5x89ABCDEF);
    const int8x8_t vi6xGHIJKLMN = vld1_s8(i6); i6 += 8;
    vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi5xGHIJKLMN);
    vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567);
    vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi6x89ABCDEF);
    vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi6xGHIJKLMN);

    const int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vsum01234567));
    const int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vsum01234567));
    const int32x4_t vacc89AB = vaddw_s16(vinit_bias, vget_low_s16(vsum89ABCDEF));
    const int32x4_t vaccCDEF = vaddw_s16(vinit_bias, vget_high_s16(vsum89ABCDEF));
    const int32x4_t vaccGHIJ = vaddw_s16(vinit_bias, vget_low_s16(vsumGHIJKLMN));
    const int32x4_t vaccKLMN = vaddw_s16(vinit_bias, vget_high_s16(vsumGHIJKLMN));

    vst1q_s32(b, vacc0123); b += 4;
    vst1q_s32(b, vacc4567); b += 4;
    vst1q_s32(b, vacc89AB); b += 4;
    vst1q_s32(b, vaccCDEF); b += 4;
    vst1q_s32(b, vaccGHIJ); b += 4;
    vst1q_s32(b, vaccKLMN); b += 4;
  }
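  // First-pass channel remainder, handled 8 channels at a time; the final
  // group may read past the last channel, which the XNN_OOB_READS annotation
  // on the function declares as intentional.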
  if XNN_UNLIKELY(c != 0) {
    do {
      const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8;
      const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8;
      const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8;
      int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567);

      const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8;
      vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567);
      const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8;
      vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567);
      const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8;
      vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567);
      const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8;
      vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567);
      vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567);

      const int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vsum01234567));
      const int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vsum01234567));

      vst1q_s32(b, vacc0123); b += 4;
      vst1q_s32(b, vacc4567); b += 4;

      c = doz(c, 8);
    } while (c != 0);
  }

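  // Intermediate passes: as long as more than 7 rows remain, accumulate the
  // next 7 rows on top of the partial sums already in the buffer.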
  for (rows -= 7; rows > 7; rows -= 7) {
    i0 = (const int8_t*) ((uintptr_t) i0 + input_increment);
    i1 = (const int8_t*) ((uintptr_t) i1 + input_increment);
    i2 = (const int8_t*) ((uintptr_t) i2 + input_increment);
    i3 = (const int8_t*) ((uintptr_t) i3 + input_increment);
    i4 = (const int8_t*) ((uintptr_t) i4 + input_increment);
    i5 = (const int8_t*) ((uintptr_t) i5 + input_increment);
    i6 = (const int8_t*) ((uintptr_t) i6 + input_increment);

    int32_t* b = buffer;
    size_t c = channels;
    for (; c >= 24; c -= 24) {
      const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8;
      const int8x8_t vi0x89ABCDEF = vld1_s8(i0); i0 += 8;
      const int8x8_t vi0xGHIJKLMN = vld1_s8(i0); i0 += 8;
      const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8;
      const int8x8_t vi1x89ABCDEF = vld1_s8(i1); i1 += 8;
      const int8x8_t vi1xGHIJKLMN = vld1_s8(i1); i1 += 8;

      const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8;
      int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
      const int8x8_t vi2x89ABCDEF = vld1_s8(i2); i2 += 8;
      int16x8_t vsum89ABCDEF = vaddl_s8(vi0x89ABCDEF, vi1x89ABCDEF);
      const int8x8_t vi2xGHIJKLMN = vld1_s8(i2); i2 += 8;
      int16x8_t vsumGHIJKLMN = vaddl_s8(vi0xGHIJKLMN, vi1xGHIJKLMN);

      const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8;
      vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567);
      const int8x8_t vi3x89ABCDEF = vld1_s8(i3); i3 += 8;
      vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi2x89ABCDEF);
      const int8x8_t vi3xGHIJKLMN = vld1_s8(i3); i3 += 8;
      vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi2xGHIJKLMN);
      const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8;
      vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567);
      const int8x8_t vi4x89ABCDEF = vld1_s8(i4); i4 += 8;
      vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi3x89ABCDEF);
      const int8x8_t vi4xGHIJKLMN = vld1_s8(i4); i4 += 8;
      vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi3xGHIJKLMN);
      const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8;
      vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567);
      const int8x8_t vi5x89ABCDEF = vld1_s8(i5); i5 += 8;
      vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi4x89ABCDEF);
      const int8x8_t vi5xGHIJKLMN = vld1_s8(i5); i5 += 8;
      vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi4xGHIJKLMN);
      const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8;
      vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567);
      const int8x8_t vi6x89ABCDEF = vld1_s8(i6); i6 += 8;
      vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi5x89ABCDEF);
      const int8x8_t vi6xGHIJKLMN = vld1_s8(i6); i6 += 8;
      vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi5xGHIJKLMN);
      int32x4_t vacc0123 = vld1q_s32(b);
      int32x4_t vacc4567 = vld1q_s32(b + 4);
      vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567);
      int32x4_t vacc89AB = vld1q_s32(b + 8);
      int32x4_t vaccCDEF = vld1q_s32(b + 12);
      vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi6x89ABCDEF);
      int32x4_t vaccGHIJ = vld1q_s32(b + 16);
      int32x4_t vaccKLMN = vld1q_s32(b + 20);
      vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi6xGHIJKLMN);

      vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vsum01234567));
      vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vsum01234567));
      vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vsum89ABCDEF));
      vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vsum89ABCDEF));
      vaccGHIJ = vaddw_s16(vaccGHIJ, vget_low_s16(vsumGHIJKLMN));
      vaccKLMN = vaddw_s16(vaccKLMN, vget_high_s16(vsumGHIJKLMN));

      vst1q_s32(b, vacc0123); b += 4;
      vst1q_s32(b, vacc4567); b += 4;
      vst1q_s32(b, vacc89AB); b += 4;
      vst1q_s32(b, vaccCDEF); b += 4;
      vst1q_s32(b, vaccGHIJ); b += 4;
      vst1q_s32(b, vaccKLMN); b += 4;
    }
    if XNN_UNLIKELY(c != 0) {
      do {
        const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8;
        const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8;
        const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8;
        int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567);

        const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8;
        vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567);
        const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8;
        vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567);
        const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8;
        vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567);
        const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8;
        vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567);
        int32x4_t vacc0123 = vld1q_s32(b);
        int32x4_t vacc4567 = vld1q_s32(b + 4);
        vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567);

        vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vsum01234567));
        vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vsum01234567));

        vst1q_s32(b, vacc0123); b += 4;
        vst1q_s32(b, vacc4567); b += 4;

        c = doz(c, 8);
      } while (c != 0);
    }
  }

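  // Final pass: at most 7 rows remain. Row pointers beyond the remaining rows
  // are redirected to the zero vector so they contribute nothing to the sum.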
  i0 = (const int8_t*) ((uintptr_t) i0 + input_increment);
  i1 = (const int8_t*) ((uintptr_t) i1 + input_increment);
  if XNN_UNPREDICTABLE(rows < 2) {
    i1 = zero;
  }
  i2 = (const int8_t*) ((uintptr_t) i2 + input_increment);
  if XNN_UNPREDICTABLE(rows <= 2) {
    i2 = zero;
  }
  i3 = (const int8_t*) ((uintptr_t) i3 + input_increment);
  if XNN_UNPREDICTABLE(rows < 4) {
    i3 = zero;
  }
  i4 = (const int8_t*) ((uintptr_t) i4 + input_increment);
  if XNN_UNPREDICTABLE(rows <= 4) {
    i4 = zero;
  }
  i5 = (const int8_t*) ((uintptr_t) i5 + input_increment);
  if XNN_UNPREDICTABLE(rows < 6) {
    i5 = zero;
  }
  i6 = (const int8_t*) ((uintptr_t) i6 + input_increment);
  if XNN_UNPREDICTABLE(rows <= 6) {
    i6 = zero;
  }

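  // Requantization parameters: FP32 scale, output zero point, and the int8
  // clamping bounds.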
  const float32x4_t vscale = vld1q_dup_f32(&params->fp32_neonv8.scale);
  const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->fp32_neonv8.output_zero_point);
  const int8x16_t voutput_min = vld1q_dup_s8(&params->fp32_neonv8.output_min);
  const int8x16_t voutput_max = vld1q_dup_s8(&params->fp32_neonv8.output_max);
  for (; channels >= 24; channels -= 24) {
    const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8;
    const int8x8_t vi0x89ABCDEF = vld1_s8(i0); i0 += 8;
    const int8x8_t vi0xGHIJKLMN = vld1_s8(i0); i0 += 8;
    const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8;
    const int8x8_t vi1x89ABCDEF = vld1_s8(i1); i1 += 8;
    const int8x8_t vi1xGHIJKLMN = vld1_s8(i1); i1 += 8;

    const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8;
    int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
    const int8x8_t vi2x89ABCDEF = vld1_s8(i2); i2 += 8;
    int16x8_t vsum89ABCDEF = vaddl_s8(vi0x89ABCDEF, vi1x89ABCDEF);
    const int8x8_t vi2xGHIJKLMN = vld1_s8(i2); i2 += 8;
    int16x8_t vsumGHIJKLMN = vaddl_s8(vi0xGHIJKLMN, vi1xGHIJKLMN);

    const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8;
    vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567);
    const int8x8_t vi3x89ABCDEF = vld1_s8(i3); i3 += 8;
    vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi2x89ABCDEF);
    const int8x8_t vi3xGHIJKLMN = vld1_s8(i3); i3 += 8;
    vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi2xGHIJKLMN);
    const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8;
    vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567);
    const int8x8_t vi4x89ABCDEF = vld1_s8(i4); i4 += 8;
    vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi3x89ABCDEF);
    const int8x8_t vi4xGHIJKLMN = vld1_s8(i4); i4 += 8;
    vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi3xGHIJKLMN);
    const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8;
    vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567);
    const int8x8_t vi5x89ABCDEF = vld1_s8(i5); i5 += 8;
    vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi4x89ABCDEF);
    const int8x8_t vi5xGHIJKLMN = vld1_s8(i5); i5 += 8;
    vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi4xGHIJKLMN);
    const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8;
    vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567);
    const int8x8_t vi6x89ABCDEF = vld1_s8(i6); i6 += 8;
    vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi5x89ABCDEF);
    const int8x8_t vi6xGHIJKLMN = vld1_s8(i6); i6 += 8;
    vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi5xGHIJKLMN);
    int32x4_t vacc0123 = vld1q_s32(buffer); buffer += 4;
    int32x4_t vacc4567 = vld1q_s32(buffer); buffer += 4;
    vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567);
    int32x4_t vacc89AB = vld1q_s32(buffer); buffer += 4;
    int32x4_t vaccCDEF = vld1q_s32(buffer); buffer += 4;
    vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi6x89ABCDEF);
    int32x4_t vaccGHIJ = vld1q_s32(buffer); buffer += 4;
    int32x4_t vaccKLMN = vld1q_s32(buffer); buffer += 4;
    vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi6xGHIJKLMN);

    vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vsum01234567));
    vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vsum01234567));
    vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vsum89ABCDEF));
    vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vsum89ABCDEF));
    vaccGHIJ = vaddw_s16(vaccGHIJ, vget_low_s16(vsumGHIJKLMN));
    vaccKLMN = vaddw_s16(vaccKLMN, vget_high_s16(vsumGHIJKLMN));

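    // Requantize: convert the 32-bit sums to float, multiply by the scale,
    // round back to int32 with vcvtnq (round to nearest, ties to even),
    // saturate-narrow to int16, add the output zero point with saturation,
    // saturate-narrow to int8, and clamp to the output min/max.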
    float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123);
    float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567);
    float32x4_t vfpacc89AB = vcvtq_f32_s32(vacc89AB);
    float32x4_t vfpaccCDEF = vcvtq_f32_s32(vaccCDEF);
    float32x4_t vfpaccGHIJ = vcvtq_f32_s32(vaccGHIJ);
    float32x4_t vfpaccKLMN = vcvtq_f32_s32(vaccKLMN);

    vfpacc0123 = vmulq_f32(vfpacc0123, vscale);
    vfpacc4567 = vmulq_f32(vfpacc4567, vscale);
    vfpacc89AB = vmulq_f32(vfpacc89AB, vscale);
    vfpaccCDEF = vmulq_f32(vfpaccCDEF, vscale);
    vfpaccGHIJ = vmulq_f32(vfpaccGHIJ, vscale);
    vfpaccKLMN = vmulq_f32(vfpaccKLMN, vscale);

    vacc0123 = vcvtnq_s32_f32(vfpacc0123);
    vacc4567 = vcvtnq_s32_f32(vfpacc4567);
    vacc89AB = vcvtnq_s32_f32(vfpacc89AB);
    vaccCDEF = vcvtnq_s32_f32(vfpaccCDEF);
    vaccGHIJ = vcvtnq_s32_f32(vfpaccGHIJ);
    vaccKLMN = vcvtnq_s32_f32(vfpaccKLMN);

    #if XNN_ARCH_ARM64
      int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
      int16x8_t vacc89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF);
      int16x8_t vaccGHIJKLMN = vqmovn_high_s32(vqmovn_s32(vaccGHIJ), vaccKLMN);
    #else  // !XNN_ARCH_ARM64
      int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
      int16x8_t vacc89ABCDEF = vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF));
      int16x8_t vaccGHIJKLMN = vcombine_s16(vqmovn_s32(vaccGHIJ), vqmovn_s32(vaccKLMN));
    #endif  // !XNN_ARCH_ARM64

    vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point);
    vacc89ABCDEF = vqaddq_s16(vacc89ABCDEF, voutput_zero_point);
    vaccGHIJKLMN = vqaddq_s16(vaccGHIJKLMN, voutput_zero_point);

    #if XNN_ARCH_ARM64
      int8x16_t vout0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc01234567), vacc89ABCDEF);
      int8x8_t voutGHIJKLMN = vqmovn_s16(vaccGHIJKLMN);
    #else  // !XNN_ARCH_ARM64
      int8x16_t vout0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc01234567), vqmovn_s16(vacc89ABCDEF));
      int8x8_t voutGHIJKLMN = vqmovn_s16(vaccGHIJKLMN);
    #endif  // !XNN_ARCH_ARM64

    vout0123456789ABCDEF = vmaxq_s8(vout0123456789ABCDEF, voutput_min);
    voutGHIJKLMN = vmax_s8(voutGHIJKLMN, vget_low_s8(voutput_min));

    vout0123456789ABCDEF = vminq_s8(vout0123456789ABCDEF, voutput_max);
    voutGHIJKLMN = vmin_s8(voutGHIJKLMN, vget_low_s8(voutput_max));

    vst1q_s8(output, vout0123456789ABCDEF); output += 16;
    vst1_s8(output, voutGHIJKLMN); output += 8;
  }
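  // Output channel remainder: fewer than 24 channels left. Process up to 8
  // channels per iteration and use partial stores (4/2/1 lanes) for the last
  // group.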
  if XNN_UNLIKELY(channels != 0) {
    do {
      const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8;
      const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8;
      const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8;
      int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567);

      const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8;
      vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567);
      const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8;
      vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567);
      const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8;
      vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567);
      const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8;
      vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567);
      int32x4_t vacc0123 = vld1q_s32(buffer); buffer += 4;
      int32x4_t vacc4567 = vld1q_s32(buffer); buffer += 4;
      vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567);

      vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vsum01234567));
      vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vsum01234567));

      float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123);
      float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567);

      vfpacc0123 = vmulq_f32(vfpacc0123, vscale);
      vfpacc4567 = vmulq_f32(vfpacc4567, vscale);

      vacc0123 = vcvtnq_s32_f32(vfpacc0123);
      vacc4567 = vcvtnq_s32_f32(vfpacc4567);

      #if XNN_ARCH_ARM64
        int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
      #else
        int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
      #endif
      vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point);

      int8x8_t vout01234567 = vqmovn_s16(vacc01234567);
      vout01234567 = vmax_s8(vout01234567, vget_low_s8(voutput_min));
      vout01234567 = vmin_s8(vout01234567, vget_low_s8(voutput_max));

      if XNN_LIKELY(channels >= 8) {
        vst1_s8(output, vout01234567); output += 8;
        channels -= 8;
      } else {
        if (channels & 4) {
          vst1_lane_u32((void*) output, vreinterpret_u32_s8(vout01234567), 0); output += 4;
          vout01234567 = vext_s8(vout01234567, vout01234567, 4);
        }
        if (channels & 2) {
          vst1_lane_u16((void*) output, vreinterpret_u16_s8(vout01234567), 0); output += 2;
          vout01234567 = vext_s8(vout01234567, vout01234567, 2);
        }
        if (channels & 1) {
          vst1_lane_s8(output, vout01234567, 0); output += 1;
        }
        channels = 0;
      }
    } while (channels != 0);
  }
}