1 // Auto-generated file. Do not edit!
2 // Template: src/qs8-gavgpool/multipass-neon.c.in
3 // Generator: tools/xngen
4 //
5 // Copyright 2020 Google LLC
6 //
7 // This source code is licensed under the BSD-style license found in the
8 // LICENSE file in the root directory of this source tree.
9
10 #include <assert.h>
11
12 #include <arm_neon.h>
13
14 #include <xnnpack/gavgpool.h>
15 #include <xnnpack/intrinsics-polyfill.h>
16 #include <xnnpack/math.h>
17
18
xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c32(size_t rows,size_t channels,const int8_t * input,size_t input_stride,const int8_t * zero,int32_t * buffer,int8_t * output,const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])19 void xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c32(
20 size_t rows,
21 size_t channels,
22 const int8_t* input,
23 size_t input_stride,
24 const int8_t* zero,
25 int32_t* buffer,
26 int8_t* output,
27 const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
28 {
29 assert(rows > 7);
30 assert(channels != 0);
31
32 const int8_t* i0 = input;
33 const int8_t* i1 = (const int8_t*) ((uintptr_t) i0 + input_stride);
34 const int8_t* i2 = (const int8_t*) ((uintptr_t) i1 + input_stride);
35 const int8_t* i3 = (const int8_t*) ((uintptr_t) i2 + input_stride);
36 const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
37 const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
38 const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
39 const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(int8_t);
40
41 const int32x4_t vinit_bias = vld1q_dup_s32(¶ms->fp32_neonv8.init_bias);
42 int32_t* b = buffer;
43 size_t c = channels;
44 for (; c >= 32; c -= 32) {
45 const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8;
46 const int8x8_t vi0x89ABCDEF = vld1_s8(i0); i0 += 8;
47 const int8x8_t vi0xGHIJKLMN = vld1_s8(i0); i0 += 8;
48 const int8x8_t vi0xOPQRSTUV = vld1_s8(i0); i0 += 8;
49 const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8;
50 const int8x8_t vi1x89ABCDEF = vld1_s8(i1); i1 += 8;
51 const int8x8_t vi1xGHIJKLMN = vld1_s8(i1); i1 += 8;
52 const int8x8_t vi1xOPQRSTUV = vld1_s8(i1); i1 += 8;
53
54 const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8;
55 int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
56 const int8x8_t vi2x89ABCDEF = vld1_s8(i2); i2 += 8;
57 int16x8_t vsum89ABCDEF = vaddl_s8(vi0x89ABCDEF, vi1x89ABCDEF);
58 const int8x8_t vi2xGHIJKLMN = vld1_s8(i2); i2 += 8;
59 int16x8_t vsumGHIJKLMN = vaddl_s8(vi0xGHIJKLMN, vi1xGHIJKLMN);
60 const int8x8_t vi2xOPQRSTUV = vld1_s8(i2); i2 += 8;
61 int16x8_t vsumOPQRSTUV = vaddl_s8(vi0xOPQRSTUV, vi1xOPQRSTUV);
62
63 const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8;
64 vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567);
65 const int8x8_t vi3x89ABCDEF = vld1_s8(i3); i3 += 8;
66 vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi2x89ABCDEF);
67 const int8x8_t vi3xGHIJKLMN = vld1_s8(i3); i3 += 8;
68 vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi2xGHIJKLMN);
69 const int8x8_t vi3xOPQRSTUV = vld1_s8(i3); i3 += 8;
70 vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi2xOPQRSTUV);
71 const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8;
72 vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567);
73 const int8x8_t vi4x89ABCDEF = vld1_s8(i4); i4 += 8;
74 vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi3x89ABCDEF);
75 const int8x8_t vi4xGHIJKLMN = vld1_s8(i4); i4 += 8;
76 vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi3xGHIJKLMN);
77 const int8x8_t vi4xOPQRSTUV = vld1_s8(i4); i4 += 8;
78 vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi3xOPQRSTUV);
79 const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8;
80 vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567);
81 const int8x8_t vi5x89ABCDEF = vld1_s8(i5); i5 += 8;
82 vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi4x89ABCDEF);
83 const int8x8_t vi5xGHIJKLMN = vld1_s8(i5); i5 += 8;
84 vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi4xGHIJKLMN);
85 const int8x8_t vi5xOPQRSTUV = vld1_s8(i5); i5 += 8;
86 vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi4xOPQRSTUV);
87 const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8;
88 vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567);
89 const int8x8_t vi6x89ABCDEF = vld1_s8(i6); i6 += 8;
90 vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi5x89ABCDEF);
91 const int8x8_t vi6xGHIJKLMN = vld1_s8(i6); i6 += 8;
92 vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi5xGHIJKLMN);
93 const int8x8_t vi6xOPQRSTUV = vld1_s8(i6); i6 += 8;
94 vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi5xOPQRSTUV);
95 vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567);
96 vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi6x89ABCDEF);
97 vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi6xGHIJKLMN);
98 vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi6xOPQRSTUV);
99
100 const int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vsum01234567));
101 const int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vsum01234567));
102 const int32x4_t vacc89AB = vaddw_s16(vinit_bias, vget_low_s16(vsum89ABCDEF));
103 const int32x4_t vaccCDEF = vaddw_s16(vinit_bias, vget_high_s16(vsum89ABCDEF));
104 const int32x4_t vaccGHIJ = vaddw_s16(vinit_bias, vget_low_s16(vsumGHIJKLMN));
105 const int32x4_t vaccKLMN = vaddw_s16(vinit_bias, vget_high_s16(vsumGHIJKLMN));
106 const int32x4_t vaccOPQR = vaddw_s16(vinit_bias, vget_low_s16(vsumOPQRSTUV));
107 const int32x4_t vaccSTUV = vaddw_s16(vinit_bias, vget_high_s16(vsumOPQRSTUV));
108
109 vst1q_s32(b, vacc0123); b += 4;
110 vst1q_s32(b, vacc4567); b += 4;
111 vst1q_s32(b, vacc89AB); b += 4;
112 vst1q_s32(b, vaccCDEF); b += 4;
113 vst1q_s32(b, vaccGHIJ); b += 4;
114 vst1q_s32(b, vaccKLMN); b += 4;
115 vst1q_s32(b, vaccOPQR); b += 4;
116 vst1q_s32(b, vaccSTUV); b += 4;
117 }
118 if XNN_UNLIKELY(c != 0) {
119 do {
120 const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8;
121 const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8;
122 const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8;
123 int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
124
125 const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8;
126 vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567);
127 const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8;
128 vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567);
129 const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8;
130 vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567);
131 const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8;
132 vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567);
133 vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567);
134
135 const int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vsum01234567));
136 const int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vsum01234567));
137
138 vst1q_s32(b, vacc0123); b += 4;
139 vst1q_s32(b, vacc4567); b += 4;
140
141 c = doz(c, 8);
142 } while (c != 0);
143 }
144
145 for (rows -= 7; rows > 7; rows -= 7) {
146 i0 = (const int8_t*) ((uintptr_t) i0 + input_increment);
147 i1 = (const int8_t*) ((uintptr_t) i1 + input_increment);
148 i2 = (const int8_t*) ((uintptr_t) i2 + input_increment);
149 i3 = (const int8_t*) ((uintptr_t) i3 + input_increment);
150 i4 = (const int8_t*) ((uintptr_t) i4 + input_increment);
151 i5 = (const int8_t*) ((uintptr_t) i5 + input_increment);
152 i6 = (const int8_t*) ((uintptr_t) i6 + input_increment);
153
154 int32_t* b = buffer;
155 size_t c = channels;
156 for (; c >= 32; c -= 32) {
157 const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8;
158 const int8x8_t vi0x89ABCDEF = vld1_s8(i0); i0 += 8;
159 const int8x8_t vi0xGHIJKLMN = vld1_s8(i0); i0 += 8;
160 const int8x8_t vi0xOPQRSTUV = vld1_s8(i0); i0 += 8;
161 const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8;
162 const int8x8_t vi1x89ABCDEF = vld1_s8(i1); i1 += 8;
163 const int8x8_t vi1xGHIJKLMN = vld1_s8(i1); i1 += 8;
164 const int8x8_t vi1xOPQRSTUV = vld1_s8(i1); i1 += 8;
165
166 const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8;
167 int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
168 const int8x8_t vi2x89ABCDEF = vld1_s8(i2); i2 += 8;
169 int16x8_t vsum89ABCDEF = vaddl_s8(vi0x89ABCDEF, vi1x89ABCDEF);
170 const int8x8_t vi2xGHIJKLMN = vld1_s8(i2); i2 += 8;
171 int16x8_t vsumGHIJKLMN = vaddl_s8(vi0xGHIJKLMN, vi1xGHIJKLMN);
172 const int8x8_t vi2xOPQRSTUV = vld1_s8(i2); i2 += 8;
173 int16x8_t vsumOPQRSTUV = vaddl_s8(vi0xOPQRSTUV, vi1xOPQRSTUV);
174
175 const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8;
176 vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567);
177 const int8x8_t vi3x89ABCDEF = vld1_s8(i3); i3 += 8;
178 vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi2x89ABCDEF);
179 const int8x8_t vi3xGHIJKLMN = vld1_s8(i3); i3 += 8;
180 vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi2xGHIJKLMN);
181 const int8x8_t vi3xOPQRSTUV = vld1_s8(i3); i3 += 8;
182 vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi2xOPQRSTUV);
183 const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8;
184 vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567);
185 const int8x8_t vi4x89ABCDEF = vld1_s8(i4); i4 += 8;
186 vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi3x89ABCDEF);
187 const int8x8_t vi4xGHIJKLMN = vld1_s8(i4); i4 += 8;
188 vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi3xGHIJKLMN);
189 const int8x8_t vi4xOPQRSTUV = vld1_s8(i4); i4 += 8;
190 vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi3xOPQRSTUV);
191 const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8;
192 vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567);
193 const int8x8_t vi5x89ABCDEF = vld1_s8(i5); i5 += 8;
194 vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi4x89ABCDEF);
195 const int8x8_t vi5xGHIJKLMN = vld1_s8(i5); i5 += 8;
196 vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi4xGHIJKLMN);
197 const int8x8_t vi5xOPQRSTUV = vld1_s8(i5); i5 += 8;
198 vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi4xOPQRSTUV);
199 const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8;
200 vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567);
201 const int8x8_t vi6x89ABCDEF = vld1_s8(i6); i6 += 8;
202 vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi5x89ABCDEF);
203 const int8x8_t vi6xGHIJKLMN = vld1_s8(i6); i6 += 8;
204 vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi5xGHIJKLMN);
205 const int8x8_t vi6xOPQRSTUV = vld1_s8(i6); i6 += 8;
206 vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi5xOPQRSTUV);
207 int32x4_t vacc0123 = vld1q_s32(b);
208 int32x4_t vacc4567 = vld1q_s32(b + 4);
209 vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567);
210 int32x4_t vacc89AB = vld1q_s32(b + 8);
211 int32x4_t vaccCDEF = vld1q_s32(b + 12);
212 vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi6x89ABCDEF);
213 int32x4_t vaccGHIJ = vld1q_s32(b + 16);
214 int32x4_t vaccKLMN = vld1q_s32(b + 20);
215 vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi6xGHIJKLMN);
216 int32x4_t vaccOPQR = vld1q_s32(b + 24);
217 int32x4_t vaccSTUV = vld1q_s32(b + 28);
218 vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi6xOPQRSTUV);
219
220 vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vsum01234567));
221 vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vsum01234567));
222 vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vsum89ABCDEF));
223 vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vsum89ABCDEF));
224 vaccGHIJ = vaddw_s16(vaccGHIJ, vget_low_s16(vsumGHIJKLMN));
225 vaccKLMN = vaddw_s16(vaccKLMN, vget_high_s16(vsumGHIJKLMN));
226 vaccOPQR = vaddw_s16(vaccOPQR, vget_low_s16(vsumOPQRSTUV));
227 vaccSTUV = vaddw_s16(vaccSTUV, vget_high_s16(vsumOPQRSTUV));
228
229 vst1q_s32(b, vacc0123); b += 4;
230 vst1q_s32(b, vacc4567); b += 4;
231 vst1q_s32(b, vacc89AB); b += 4;
232 vst1q_s32(b, vaccCDEF); b += 4;
233 vst1q_s32(b, vaccGHIJ); b += 4;
234 vst1q_s32(b, vaccKLMN); b += 4;
235 vst1q_s32(b, vaccOPQR); b += 4;
236 vst1q_s32(b, vaccSTUV); b += 4;
237 }
238 if XNN_UNLIKELY(c != 0) {
239 do {
240 const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8;
241 const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8;
242 const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8;
243 int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
244
245 const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8;
246 vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567);
247 const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8;
248 vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567);
249 const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8;
250 vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567);
251 const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8;
252 vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567);
253 int32x4_t vacc0123 = vld1q_s32(b);
254 int32x4_t vacc4567 = vld1q_s32(b + 4);
255 vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567);
256
257 vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vsum01234567));
258 vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vsum01234567));
259
260 vst1q_s32(b, vacc0123); b += 4;
261 vst1q_s32(b, vacc4567); b += 4;
262
263 c = doz(c, 8);
264 } while (c != 0);
265 }
266 }
267
268 i0 = (const int8_t*) ((uintptr_t) i0 + input_increment);
269 i1 = (const int8_t*) ((uintptr_t) i1 + input_increment);
270 if XNN_UNPREDICTABLE(rows < 2) {
271 i1 = zero;
272 }
273 i2 = (const int8_t*) ((uintptr_t) i2 + input_increment);
274 if XNN_UNPREDICTABLE(rows <= 2) {
275 i2 = zero;
276 }
277 i3 = (const int8_t*) ((uintptr_t) i3 + input_increment);
278 if XNN_UNPREDICTABLE(rows < 4) {
279 i3 = zero;
280 }
281 i4 = (const int8_t*) ((uintptr_t) i4 + input_increment);
282 if XNN_UNPREDICTABLE(rows <= 4) {
283 i4 = zero;
284 }
285 i5 = (const int8_t*) ((uintptr_t) i5 + input_increment);
286 if XNN_UNPREDICTABLE(rows < 6) {
287 i5 = zero;
288 }
289 i6 = (const int8_t*) ((uintptr_t) i6 + input_increment);
290 if XNN_UNPREDICTABLE(rows <= 6) {
291 i6 = zero;
292 }
293
294 const float32x4_t vscale = vld1q_dup_f32(¶ms->fp32_neonv8.scale);
295 const int16x8_t voutput_zero_point = vld1q_dup_s16(¶ms->fp32_neonv8.output_zero_point);
296 const int8x16_t voutput_min = vld1q_dup_s8(¶ms->fp32_neonv8.output_min);
297 const int8x16_t voutput_max = vld1q_dup_s8(¶ms->fp32_neonv8.output_max);
298 for (; channels >= 32; channels -= 32) {
299 const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8;
300 const int8x8_t vi0x89ABCDEF = vld1_s8(i0); i0 += 8;
301 const int8x8_t vi0xGHIJKLMN = vld1_s8(i0); i0 += 8;
302 const int8x8_t vi0xOPQRSTUV = vld1_s8(i0); i0 += 8;
303 const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8;
304 const int8x8_t vi1x89ABCDEF = vld1_s8(i1); i1 += 8;
305 const int8x8_t vi1xGHIJKLMN = vld1_s8(i1); i1 += 8;
306 const int8x8_t vi1xOPQRSTUV = vld1_s8(i1); i1 += 8;
307
308 const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8;
309 int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
310 const int8x8_t vi2x89ABCDEF = vld1_s8(i2); i2 += 8;
311 int16x8_t vsum89ABCDEF = vaddl_s8(vi0x89ABCDEF, vi1x89ABCDEF);
312 const int8x8_t vi2xGHIJKLMN = vld1_s8(i2); i2 += 8;
313 int16x8_t vsumGHIJKLMN = vaddl_s8(vi0xGHIJKLMN, vi1xGHIJKLMN);
314 const int8x8_t vi2xOPQRSTUV = vld1_s8(i2); i2 += 8;
315 int16x8_t vsumOPQRSTUV = vaddl_s8(vi0xOPQRSTUV, vi1xOPQRSTUV);
316
317 const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8;
318 vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567);
319 const int8x8_t vi3x89ABCDEF = vld1_s8(i3); i3 += 8;
320 vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi2x89ABCDEF);
321 const int8x8_t vi3xGHIJKLMN = vld1_s8(i3); i3 += 8;
322 vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi2xGHIJKLMN);
323 const int8x8_t vi3xOPQRSTUV = vld1_s8(i3); i3 += 8;
324 vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi2xOPQRSTUV);
325 const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8;
326 vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567);
327 const int8x8_t vi4x89ABCDEF = vld1_s8(i4); i4 += 8;
328 vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi3x89ABCDEF);
329 const int8x8_t vi4xGHIJKLMN = vld1_s8(i4); i4 += 8;
330 vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi3xGHIJKLMN);
331 const int8x8_t vi4xOPQRSTUV = vld1_s8(i4); i4 += 8;
332 vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi3xOPQRSTUV);
333 const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8;
334 vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567);
335 const int8x8_t vi5x89ABCDEF = vld1_s8(i5); i5 += 8;
336 vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi4x89ABCDEF);
337 const int8x8_t vi5xGHIJKLMN = vld1_s8(i5); i5 += 8;
338 vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi4xGHIJKLMN);
339 const int8x8_t vi5xOPQRSTUV = vld1_s8(i5); i5 += 8;
340 vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi4xOPQRSTUV);
341 const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8;
342 vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567);
343 const int8x8_t vi6x89ABCDEF = vld1_s8(i6); i6 += 8;
344 vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi5x89ABCDEF);
345 const int8x8_t vi6xGHIJKLMN = vld1_s8(i6); i6 += 8;
346 vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi5xGHIJKLMN);
347 const int8x8_t vi6xOPQRSTUV = vld1_s8(i6); i6 += 8;
348 vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi5xOPQRSTUV);
349 int32x4_t vacc0123 = vld1q_s32(buffer); buffer += 4;
350 int32x4_t vacc4567 = vld1q_s32(buffer); buffer += 4;
351 vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567);
352 int32x4_t vacc89AB = vld1q_s32(buffer); buffer += 4;
353 int32x4_t vaccCDEF = vld1q_s32(buffer); buffer += 4;
354 vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi6x89ABCDEF);
355 int32x4_t vaccGHIJ = vld1q_s32(buffer); buffer += 4;
356 int32x4_t vaccKLMN = vld1q_s32(buffer); buffer += 4;
357 vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi6xGHIJKLMN);
358 int32x4_t vaccOPQR = vld1q_s32(buffer); buffer += 4;
359 int32x4_t vaccSTUV = vld1q_s32(buffer); buffer += 4;
360 vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi6xOPQRSTUV);
361
362 vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vsum01234567));
363 vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vsum01234567));
364 vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vsum89ABCDEF));
365 vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vsum89ABCDEF));
366 vaccGHIJ = vaddw_s16(vaccGHIJ, vget_low_s16(vsumGHIJKLMN));
367 vaccKLMN = vaddw_s16(vaccKLMN, vget_high_s16(vsumGHIJKLMN));
368 vaccOPQR = vaddw_s16(vaccOPQR, vget_low_s16(vsumOPQRSTUV));
369 vaccSTUV = vaddw_s16(vaccSTUV, vget_high_s16(vsumOPQRSTUV));
370
371 float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123);
372 float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567);
373 float32x4_t vfpacc89AB = vcvtq_f32_s32(vacc89AB);
374 float32x4_t vfpaccCDEF = vcvtq_f32_s32(vaccCDEF);
375 float32x4_t vfpaccGHIJ = vcvtq_f32_s32(vaccGHIJ);
376 float32x4_t vfpaccKLMN = vcvtq_f32_s32(vaccKLMN);
377 float32x4_t vfpaccOPQR = vcvtq_f32_s32(vaccOPQR);
378 float32x4_t vfpaccSTUV = vcvtq_f32_s32(vaccSTUV);
379
380 vfpacc0123 = vmulq_f32(vfpacc0123, vscale);
381 vfpacc4567 = vmulq_f32(vfpacc4567, vscale);
382 vfpacc89AB = vmulq_f32(vfpacc89AB, vscale);
383 vfpaccCDEF = vmulq_f32(vfpaccCDEF, vscale);
384 vfpaccGHIJ = vmulq_f32(vfpaccGHIJ, vscale);
385 vfpaccKLMN = vmulq_f32(vfpaccKLMN, vscale);
386 vfpaccOPQR = vmulq_f32(vfpaccOPQR, vscale);
387 vfpaccSTUV = vmulq_f32(vfpaccSTUV, vscale);
388
389 vacc0123 = vcvtnq_s32_f32(vfpacc0123);
390 vacc4567 = vcvtnq_s32_f32(vfpacc4567);
391 vacc89AB = vcvtnq_s32_f32(vfpacc89AB);
392 vaccCDEF = vcvtnq_s32_f32(vfpaccCDEF);
393 vaccGHIJ = vcvtnq_s32_f32(vfpaccGHIJ);
394 vaccKLMN = vcvtnq_s32_f32(vfpaccKLMN);
395 vaccOPQR = vcvtnq_s32_f32(vfpaccOPQR);
396 vaccSTUV = vcvtnq_s32_f32(vfpaccSTUV);
397
398 #if XNN_ARCH_ARM64
399 int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
400 int16x8_t vacc89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF);
401 int16x8_t vaccGHIJKLMN = vqmovn_high_s32(vqmovn_s32(vaccGHIJ), vaccKLMN);
402 int16x8_t vaccOPQRSTUV = vqmovn_high_s32(vqmovn_s32(vaccOPQR), vaccSTUV);
403 #else // !XNN_ARCH_ARM64
404 int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
405 int16x8_t vacc89ABCDEF = vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF));
406 int16x8_t vaccGHIJKLMN = vcombine_s16(vqmovn_s32(vaccGHIJ), vqmovn_s32(vaccKLMN));
407 int16x8_t vaccOPQRSTUV = vcombine_s16(vqmovn_s32(vaccOPQR), vqmovn_s32(vaccSTUV));
408 #endif // !XNN_ARCH_ARM64
409
410 vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point);
411 vacc89ABCDEF = vqaddq_s16(vacc89ABCDEF, voutput_zero_point);
412 vaccGHIJKLMN = vqaddq_s16(vaccGHIJKLMN, voutput_zero_point);
413 vaccOPQRSTUV = vqaddq_s16(vaccOPQRSTUV, voutput_zero_point);
414
415 #if XNN_ARCH_ARM64
416 int8x16_t vout0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc01234567), vacc89ABCDEF);
417 int8x16_t voutGHIJKLMNOPQRSTUV = vqmovn_high_s16(vqmovn_s16(vaccGHIJKLMN), vaccOPQRSTUV);
418 #else // !XNN_ARCH_ARM64
419 int8x16_t vout0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc01234567), vqmovn_s16(vacc89ABCDEF));
420 int8x16_t voutGHIJKLMNOPQRSTUV = vcombine_s8(vqmovn_s16(vaccGHIJKLMN), vqmovn_s16(vaccOPQRSTUV));
421 #endif // !XNN_ARCH_ARM64
422
423 vout0123456789ABCDEF = vmaxq_s8(vout0123456789ABCDEF, voutput_min);
424 voutGHIJKLMNOPQRSTUV = vmaxq_s8(voutGHIJKLMNOPQRSTUV, voutput_min);
425
426 vout0123456789ABCDEF = vminq_s8(vout0123456789ABCDEF, voutput_max);
427 voutGHIJKLMNOPQRSTUV = vminq_s8(voutGHIJKLMNOPQRSTUV, voutput_max);
428
429 vst1q_s8(output, vout0123456789ABCDEF); output += 16;
430 vst1q_s8(output, voutGHIJKLMNOPQRSTUV); output += 16;
431 }
432 if XNN_UNLIKELY(channels != 0) {
433 do {
434 const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8;
435 const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8;
436 const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8;
437 int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
438
439 const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8;
440 vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567);
441 const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8;
442 vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567);
443 const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8;
444 vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567);
445 const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8;
446 vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567);
447 int32x4_t vacc0123 = vld1q_s32(buffer); buffer += 4;
448 int32x4_t vacc4567 = vld1q_s32(buffer); buffer += 4;
449 vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567);
450
451 vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vsum01234567));
452 vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vsum01234567));
453
454 float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123);
455 float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567);
456
457 vfpacc0123 = vmulq_f32(vfpacc0123, vscale);
458 vfpacc4567 = vmulq_f32(vfpacc4567, vscale);
459
460 vacc0123 = vcvtnq_s32_f32(vfpacc0123);
461 vacc4567 = vcvtnq_s32_f32(vfpacc4567);
462
463 #if XNN_ARCH_ARM64
464 int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
465 #else
466 int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
467 #endif
468 vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point);
469
470 int8x8_t vout01234567 = vqmovn_s16(vacc01234567);
471 vout01234567 = vmax_s8(vout01234567, vget_low_s8(voutput_min));
472 vout01234567 = vmin_s8(vout01234567, vget_low_s8(voutput_max));
473
474 if XNN_LIKELY(channels >= 8) {
475 vst1_s8(output, vout01234567); output += 8;
476 channels -= 8;
477 } else {
478 if (channels & 4) {
479 vst1_lane_u32((void*) output, vreinterpret_u32_s8(vout01234567), 0); output += 4;
480 vout01234567 = vext_s8(vout01234567, vout01234567, 4);
481 }
482 if (channels & 2) {
483 vst1_lane_u16((void*) output, vreinterpret_u16_s8(vout01234567), 0); output += 2;
484 vout01234567 = vext_s8(vout01234567, vout01234567, 2);
485 }
486 if (channels & 1) {
487 vst1_lane_s8(output, vout01234567, 0); output += 1;
488 }
489 channels = 0;
490 }
491 } while (channels != 0);
492 }
493 }
494