// Auto-generated file. Do not edit!
//   Template: src/qs8-gavgpool/multipass-neon.c.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>

#include <arm_neon.h>

#include <xnnpack/gavgpool.h>
#include <xnnpack/intrinsics-polyfill.h>
#include <xnnpack/math.h>

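// Multipass global average pooling microkernel for unsigned 8-bit inputs with
// fp32 requantization. Rows are consumed 7 at a time: the first pass seeds an
// int32 accumulator buffer with the init bias, each middle pass accumulates 7
// more rows into the buffer, and the final pass (at most 7 rows, with missing
// rows redirected to the zero vector) scales, rounds, clamps, and stores the
// uint8 outputs, 24 channels per iteration.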
void xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c24(
    size_t rows,
    size_t channels,
    const uint8_t* input,
    size_t input_stride,
    const uint8_t* zero,
    int32_t* buffer,
    uint8_t* output,
    const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(rows > 7);
  assert(channels != 0);

  const uint8_t* i0 = input;
  const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
  const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
  const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
  const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
  const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
  const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(uint8_t);

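  // First pass: sum the first 7 input rows, 24 channels at a time, and store
  // the bias-seeded int32 accumulators to the scratch buffer.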
  const int32x4_t vinit_bias = vld1q_dup_s32(&params->fp32_neonv8.init_bias);
  int32_t* b = buffer;
  size_t c = channels;
  for (; c >= 24; c -= 24) {
    const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8;
    const uint8x8_t vi0x89ABCDEF = vld1_u8(i0); i0 += 8;
    const uint8x8_t vi0xGHIJKLMN = vld1_u8(i0); i0 += 8;
    const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8;
    const uint8x8_t vi1x89ABCDEF = vld1_u8(i1); i1 += 8;
    const uint8x8_t vi1xGHIJKLMN = vld1_u8(i1); i1 += 8;

    const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8;
    uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567);
    const uint8x8_t vi2x89ABCDEF = vld1_u8(i2); i2 += 8;
    uint16x8_t vsum89ABCDEF = vaddl_u8(vi0x89ABCDEF, vi1x89ABCDEF);
    const uint8x8_t vi2xGHIJKLMN = vld1_u8(i2); i2 += 8;
    uint16x8_t vsumGHIJKLMN = vaddl_u8(vi0xGHIJKLMN, vi1xGHIJKLMN);

    const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8;
    vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567);
    const uint8x8_t vi3x89ABCDEF = vld1_u8(i3); i3 += 8;
    vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi2x89ABCDEF);
    const uint8x8_t vi3xGHIJKLMN = vld1_u8(i3); i3 += 8;
    vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi2xGHIJKLMN);
    const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8;
    vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567);
    const uint8x8_t vi4x89ABCDEF = vld1_u8(i4); i4 += 8;
    vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi3x89ABCDEF);
    const uint8x8_t vi4xGHIJKLMN = vld1_u8(i4); i4 += 8;
    vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi3xGHIJKLMN);
    const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8;
    vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567);
    const uint8x8_t vi5x89ABCDEF = vld1_u8(i5); i5 += 8;
    vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi4x89ABCDEF);
    const uint8x8_t vi5xGHIJKLMN = vld1_u8(i5); i5 += 8;
    vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi4xGHIJKLMN);
    const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8;
    vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567);
    const uint8x8_t vi6x89ABCDEF = vld1_u8(i6); i6 += 8;
    vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi5x89ABCDEF);
    const uint8x8_t vi6xGHIJKLMN = vld1_u8(i6); i6 += 8;
    vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi5xGHIJKLMN);
    vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567);
    vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi6x89ABCDEF);
    vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi6xGHIJKLMN);

    const int32x4_t vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum01234567)));
    const int32x4_t vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum01234567)));
    const int32x4_t vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum89ABCDEF)));
    const int32x4_t vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum89ABCDEF)));
    const int32x4_t vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsumGHIJKLMN)));
    const int32x4_t vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsumGHIJKLMN)));

    vst1q_s32(b, vacc0123); b += 4;
    vst1q_s32(b, vacc4567); b += 4;
    vst1q_s32(b, vacc89AB); b += 4;
    vst1q_s32(b, vaccCDEF); b += 4;
    vst1q_s32(b, vaccGHIJ); b += 4;
    vst1q_s32(b, vaccKLMN); b += 4;
  }
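  // First-pass remainder: cover the last 1-23 channels in groups of 8 (the
  // loads may read past the last channel; the kernel is declared
  // XNN_OOB_READS, and the buffer is sized for channels rounded up to 8).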
  if XNN_UNLIKELY(c != 0) {
    do {
      const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8;
      const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8;
      const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8;
      uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567);

      const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8;
      vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567);
      const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8;
      vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567);
      const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8;
      vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567);
      const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8;
      vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567);
      vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567);

      const int32x4_t vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum01234567)));
      const int32x4_t vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum01234567)));

      vst1q_s32(b, vacc0123); b += 4;
      vst1q_s32(b, vacc4567); b += 4;

      c = doz(c, 8);
    } while (c != 0);
  }

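  // Middle passes: while more than 7 rows remain, rewind to the start of the
  // buffer and accumulate another group of 7 rows into the stored sums.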
  for (rows -= 7; rows > 7; rows -= 7) {
    i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
    i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
    i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment);
    i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment);
    i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment);
    i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
    i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);

    int32_t* b = buffer;
    size_t c = channels;
    for (; c >= 24; c -= 24) {
      const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8;
      const uint8x8_t vi0x89ABCDEF = vld1_u8(i0); i0 += 8;
      const uint8x8_t vi0xGHIJKLMN = vld1_u8(i0); i0 += 8;
      const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8;
      const uint8x8_t vi1x89ABCDEF = vld1_u8(i1); i1 += 8;
      const uint8x8_t vi1xGHIJKLMN = vld1_u8(i1); i1 += 8;

      const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8;
      uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567);
      const uint8x8_t vi2x89ABCDEF = vld1_u8(i2); i2 += 8;
      uint16x8_t vsum89ABCDEF = vaddl_u8(vi0x89ABCDEF, vi1x89ABCDEF);
      const uint8x8_t vi2xGHIJKLMN = vld1_u8(i2); i2 += 8;
      uint16x8_t vsumGHIJKLMN = vaddl_u8(vi0xGHIJKLMN, vi1xGHIJKLMN);

      const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8;
      vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567);
      const uint8x8_t vi3x89ABCDEF = vld1_u8(i3); i3 += 8;
      vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi2x89ABCDEF);
      const uint8x8_t vi3xGHIJKLMN = vld1_u8(i3); i3 += 8;
      vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi2xGHIJKLMN);
      const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8;
      vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567);
      const uint8x8_t vi4x89ABCDEF = vld1_u8(i4); i4 += 8;
      vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi3x89ABCDEF);
      const uint8x8_t vi4xGHIJKLMN = vld1_u8(i4); i4 += 8;
      vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi3xGHIJKLMN);
      const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8;
      vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567);
      const uint8x8_t vi5x89ABCDEF = vld1_u8(i5); i5 += 8;
      vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi4x89ABCDEF);
      const uint8x8_t vi5xGHIJKLMN = vld1_u8(i5); i5 += 8;
      vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi4xGHIJKLMN);
      const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8;
      vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567);
      const uint8x8_t vi6x89ABCDEF = vld1_u8(i6); i6 += 8;
      vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi5x89ABCDEF);
      const uint8x8_t vi6xGHIJKLMN = vld1_u8(i6); i6 += 8;
      vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi5xGHIJKLMN);
      int32x4_t vacc0123 = vld1q_s32(b);
      int32x4_t vacc4567 = vld1q_s32(b + 4);
      vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567);
      int32x4_t vacc89AB = vld1q_s32(b + 8);
      int32x4_t vaccCDEF = vld1q_s32(b + 12);
      vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi6x89ABCDEF);
      int32x4_t vaccGHIJ = vld1q_s32(b + 16);
      int32x4_t vaccKLMN = vld1q_s32(b + 20);
      vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi6xGHIJKLMN);

      vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vsum01234567)));
      vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vsum01234567)));
      vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vsum89ABCDEF)));
      vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vsum89ABCDEF)));
      vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vsumGHIJKLMN)));
      vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vsumGHIJKLMN)));

      vst1q_s32(b, vacc0123); b += 4;
      vst1q_s32(b, vacc4567); b += 4;
      vst1q_s32(b, vacc89AB); b += 4;
      vst1q_s32(b, vaccCDEF); b += 4;
      vst1q_s32(b, vaccGHIJ); b += 4;
      vst1q_s32(b, vaccKLMN); b += 4;
    }
    if XNN_UNLIKELY(c != 0) {
      do {
        const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8;
        const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8;
        const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8;
        uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567);

        const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8;
        vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567);
        const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8;
        vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567);
        const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8;
        vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567);
        const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8;
        vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567);
        int32x4_t vacc0123 = vld1q_s32(b);
        int32x4_t vacc4567 = vld1q_s32(b + 4);
        vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567);

        vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vsum01234567)));
        vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vsum01234567)));

        vst1q_s32(b, vacc0123); b += 4;
        vst1q_s32(b, vacc4567); b += 4;

        c = doz(c, 8);
      } while (c != 0);
    }
  }

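  // Final pass: 1 to 7 rows remain. Redirect the pointers of the rows that do
  // not exist to the zero vector so they contribute nothing to the sums.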
  i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
  i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
  if XNN_UNPREDICTABLE(rows < 2) {
    i1 = zero;
  }
  i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment);
  if XNN_UNPREDICTABLE(rows <= 2) {
    i2 = zero;
  }
  i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment);
  if XNN_UNPREDICTABLE(rows < 4) {
    i3 = zero;
  }
  i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment);
  if XNN_UNPREDICTABLE(rows <= 4) {
    i4 = zero;
  }
  i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
  if XNN_UNPREDICTABLE(rows < 6) {
    i5 = zero;
  }
  i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);
  if XNN_UNPREDICTABLE(rows <= 6) {
    i6 = zero;
  }

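  // Requantization parameters for the final pass (the fp32 scale folds in the
  // averaging divisor).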
  const float32x4_t vscale = vld1q_dup_f32(&params->fp32_neonv8.scale);
  const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->fp32_neonv8.output_zero_point);
  const uint8x16_t voutput_min = vld1q_dup_u8(&params->fp32_neonv8.output_min);
  const uint8x16_t voutput_max = vld1q_dup_u8(&params->fp32_neonv8.output_max);
  for (; channels >= 24; channels -= 24) {
    const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8;
    const uint8x8_t vi0x89ABCDEF = vld1_u8(i0); i0 += 8;
    const uint8x8_t vi0xGHIJKLMN = vld1_u8(i0); i0 += 8;
    const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8;
    const uint8x8_t vi1x89ABCDEF = vld1_u8(i1); i1 += 8;
    const uint8x8_t vi1xGHIJKLMN = vld1_u8(i1); i1 += 8;

    const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8;
    uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567);
    const uint8x8_t vi2x89ABCDEF = vld1_u8(i2); i2 += 8;
    uint16x8_t vsum89ABCDEF = vaddl_u8(vi0x89ABCDEF, vi1x89ABCDEF);
    const uint8x8_t vi2xGHIJKLMN = vld1_u8(i2); i2 += 8;
    uint16x8_t vsumGHIJKLMN = vaddl_u8(vi0xGHIJKLMN, vi1xGHIJKLMN);

    const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8;
    vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567);
    const uint8x8_t vi3x89ABCDEF = vld1_u8(i3); i3 += 8;
    vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi2x89ABCDEF);
    const uint8x8_t vi3xGHIJKLMN = vld1_u8(i3); i3 += 8;
    vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi2xGHIJKLMN);
    const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8;
    vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567);
    const uint8x8_t vi4x89ABCDEF = vld1_u8(i4); i4 += 8;
    vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi3x89ABCDEF);
    const uint8x8_t vi4xGHIJKLMN = vld1_u8(i4); i4 += 8;
    vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi3xGHIJKLMN);
    const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8;
    vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567);
    const uint8x8_t vi5x89ABCDEF = vld1_u8(i5); i5 += 8;
    vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi4x89ABCDEF);
    const uint8x8_t vi5xGHIJKLMN = vld1_u8(i5); i5 += 8;
    vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi4xGHIJKLMN);
    const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8;
    vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567);
    const uint8x8_t vi6x89ABCDEF = vld1_u8(i6); i6 += 8;
    vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi5x89ABCDEF);
    const uint8x8_t vi6xGHIJKLMN = vld1_u8(i6); i6 += 8;
    vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi5xGHIJKLMN);
    int32x4_t vacc0123 = vld1q_s32(buffer); buffer += 4;
    int32x4_t vacc4567 = vld1q_s32(buffer); buffer += 4;
    vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567);
    int32x4_t vacc89AB = vld1q_s32(buffer); buffer += 4;
    int32x4_t vaccCDEF = vld1q_s32(buffer); buffer += 4;
    vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi6x89ABCDEF);
    int32x4_t vaccGHIJ = vld1q_s32(buffer); buffer += 4;
    int32x4_t vaccKLMN = vld1q_s32(buffer); buffer += 4;
    vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi6xGHIJKLMN);

    vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vsum01234567)));
    vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vsum01234567)));
    vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vsum89ABCDEF)));
    vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vsum89ABCDEF)));
    vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vsumGHIJKLMN)));
    vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vsumGHIJKLMN)));

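    // Convert the accumulators to fp32 and apply the requantization scale.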
    float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123);
    float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567);
    float32x4_t vfpacc89AB = vcvtq_f32_s32(vacc89AB);
    float32x4_t vfpaccCDEF = vcvtq_f32_s32(vaccCDEF);
    float32x4_t vfpaccGHIJ = vcvtq_f32_s32(vaccGHIJ);
    float32x4_t vfpaccKLMN = vcvtq_f32_s32(vaccKLMN);

    vfpacc0123 = vmulq_f32(vfpacc0123, vscale);
    vfpacc4567 = vmulq_f32(vfpacc4567, vscale);
    vfpacc89AB = vmulq_f32(vfpacc89AB, vscale);
    vfpaccCDEF = vmulq_f32(vfpaccCDEF, vscale);
    vfpaccGHIJ = vmulq_f32(vfpaccGHIJ, vscale);
    vfpaccKLMN = vmulq_f32(vfpaccKLMN, vscale);

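    // Round back to int32 with round-to-nearest-even (the ARMv8 VCVTN/FCVTNS
    // conversion, hence the neonv8 variant of this kernel).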
    vacc0123 = vcvtnq_s32_f32(vfpacc0123);
    vacc4567 = vcvtnq_s32_f32(vfpacc4567);
    vacc89AB = vcvtnq_s32_f32(vfpacc89AB);
    vaccCDEF = vcvtnq_s32_f32(vfpaccCDEF);
    vaccGHIJ = vcvtnq_s32_f32(vfpaccGHIJ);
    vaccKLMN = vcvtnq_s32_f32(vfpaccKLMN);

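    // Saturating narrow to int16, add the output zero point, then narrow to uint8.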
#if XNN_ARCH_ARM64
    int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
    int16x8_t vacc89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF);
    int16x8_t vaccGHIJKLMN = vqmovn_high_s32(vqmovn_s32(vaccGHIJ), vaccKLMN);
#else  // !XNN_ARCH_ARM64
    int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
    int16x8_t vacc89ABCDEF = vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF));
    int16x8_t vaccGHIJKLMN = vcombine_s16(vqmovn_s32(vaccGHIJ), vqmovn_s32(vaccKLMN));
#endif  // !XNN_ARCH_ARM64

    vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point);
    vacc89ABCDEF = vqaddq_s16(vacc89ABCDEF, voutput_zero_point);
    vaccGHIJKLMN = vqaddq_s16(vaccGHIJKLMN, voutput_zero_point);

#if XNN_ARCH_ARM64
    uint8x16_t vout0123456789ABCDEF = vqmovun_high_s16(vqmovun_s16(vacc01234567), vacc89ABCDEF);
    uint8x8_t voutGHIJKLMN = vqmovun_s16(vaccGHIJKLMN);
#else  // !XNN_ARCH_ARM64
    uint8x16_t vout0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc01234567), vqmovun_s16(vacc89ABCDEF));
    uint8x8_t voutGHIJKLMN = vqmovun_s16(vaccGHIJKLMN);
#endif  // !XNN_ARCH_ARM64

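    // Clamp to the output range [output_min, output_max].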
    vout0123456789ABCDEF = vmaxq_u8(vout0123456789ABCDEF, voutput_min);
    voutGHIJKLMN = vmax_u8(voutGHIJKLMN, vget_low_u8(voutput_min));

    vout0123456789ABCDEF = vminq_u8(vout0123456789ABCDEF, voutput_max);
    voutGHIJKLMN = vmin_u8(voutGHIJKLMN, vget_low_u8(voutput_max));

    vst1q_u8(output, vout0123456789ABCDEF); output += 16;
    vst1_u8(output, voutGHIJKLMN); output += 8;
  }
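  // Remainder: requantize the last 1-23 channels in groups of 8, finishing
  // with 4-, 2-, and 1-byte stores for a final partial group.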
  if XNN_UNLIKELY(channels != 0) {
    do {
      const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8;
      const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8;
      const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8;
      uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567);

      const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8;
      vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567);
      const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8;
      vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567);
      const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8;
      vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567);
      const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8;
      vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567);
      int32x4_t vacc0123 = vld1q_s32(buffer); buffer += 4;
      int32x4_t vacc4567 = vld1q_s32(buffer); buffer += 4;
      vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567);

      vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vsum01234567)));
      vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vsum01234567)));

      float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123);
      float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567);

      vfpacc0123 = vmulq_f32(vfpacc0123, vscale);
      vfpacc4567 = vmulq_f32(vfpacc4567, vscale);

      vacc0123 = vcvtnq_s32_f32(vfpacc0123);
      vacc4567 = vcvtnq_s32_f32(vfpacc4567);

#if XNN_ARCH_ARM64
      int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
#else
      int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
#endif
      vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point);

      uint8x8_t vout01234567 = vqmovun_s16(vacc01234567);
      vout01234567 = vmax_u8(vout01234567, vget_low_u8(voutput_min));
      vout01234567 = vmin_u8(vout01234567, vget_low_u8(voutput_max));

      if XNN_LIKELY(channels >= 8) {
        vst1_u8(output, vout01234567); output += 8;
        channels -= 8;
      } else {
        if (channels & 4) {
          vst1_lane_u32((void*) output, vreinterpret_u32_u8(vout01234567), 0); output += 4;
          vout01234567 = vext_u8(vout01234567, vout01234567, 4);
        }
        if (channels & 2) {
          vst1_lane_u16((void*) output, vreinterpret_u16_u8(vout01234567), 0); output += 2;
          vout01234567 = vext_u8(vout01234567, vout01234567, 2);
        }
        if (channels & 1) {
          vst1_lane_u8(output, vout01234567, 0); output += 1;
        }
        channels = 0;
      }
    } while (channels != 0);
  }
}