// Auto-generated file. Do not edit!
//   Template: src/qs8-gavgpool/multipass-neon.c.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>

#include <arm_neon.h>

#include <xnnpack/gavgpool.h>
#include <xnnpack/math.h>


void xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c24(
    size_t rows,
    size_t channels,
    const uint8_t* input,
    size_t input_stride,
    const uint8_t* zero,
    int32_t* buffer,
    uint8_t* output,
    const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(rows > 7);
  assert(channels != 0);

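  // Seven row pointers feed one pass; the kernel consumes the input 7 rows at a
  // time, carrying 32-bit per-channel sums in `buffer` between passes.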
  const uint8_t* i0 = input;
  const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
  const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
  const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
  const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
  const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
  const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
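  // Each pass reads round_up_po2(channels, 8) bytes per row (8-byte loads may
  // overrun the last channels, which XNN_OOB_READS permits), so step the row
  // pointers by 7 strides minus the bytes already consumed.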
  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(uint8_t);

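  // First pass: sum rows 0-6, add the initialization bias, and store 32-bit
  // partial sums to the buffer, 24 channels per iteration.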
  const int32x4_t vinit_bias = vld1q_dup_s32(&params->fp32_neon.init_bias);
  int32_t* b = buffer;
  size_t c = channels;
  for (; c >= 24; c -= 24) {
    const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8;
    const uint8x8_t vi0x89ABCDEF = vld1_u8(i0); i0 += 8;
    const uint8x8_t vi0xGHIJKLMN = vld1_u8(i0); i0 += 8;
    const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8;
    const uint8x8_t vi1x89ABCDEF = vld1_u8(i1); i1 += 8;
    const uint8x8_t vi1xGHIJKLMN = vld1_u8(i1); i1 += 8;

    const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8;
    uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567);
    const uint8x8_t vi2x89ABCDEF = vld1_u8(i2); i2 += 8;
    uint16x8_t vsum89ABCDEF = vaddl_u8(vi0x89ABCDEF, vi1x89ABCDEF);
    const uint8x8_t vi2xGHIJKLMN = vld1_u8(i2); i2 += 8;
    uint16x8_t vsumGHIJKLMN = vaddl_u8(vi0xGHIJKLMN, vi1xGHIJKLMN);

    const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8;
    vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567);
    const uint8x8_t vi3x89ABCDEF = vld1_u8(i3); i3 += 8;
    vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi2x89ABCDEF);
    const uint8x8_t vi3xGHIJKLMN = vld1_u8(i3); i3 += 8;
    vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi2xGHIJKLMN);
    const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8;
    vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567);
    const uint8x8_t vi4x89ABCDEF = vld1_u8(i4); i4 += 8;
    vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi3x89ABCDEF);
    const uint8x8_t vi4xGHIJKLMN = vld1_u8(i4); i4 += 8;
    vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi3xGHIJKLMN);
    const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8;
    vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567);
    const uint8x8_t vi5x89ABCDEF = vld1_u8(i5); i5 += 8;
    vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi4x89ABCDEF);
    const uint8x8_t vi5xGHIJKLMN = vld1_u8(i5); i5 += 8;
    vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi4xGHIJKLMN);
    const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8;
    vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567);
    const uint8x8_t vi6x89ABCDEF = vld1_u8(i6); i6 += 8;
    vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi5x89ABCDEF);
    const uint8x8_t vi6xGHIJKLMN = vld1_u8(i6); i6 += 8;
    vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi5xGHIJKLMN);
    vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567);
    vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi6x89ABCDEF);
    vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi6xGHIJKLMN);

    const int32x4_t vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum01234567)));
    const int32x4_t vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum01234567)));
    const int32x4_t vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum89ABCDEF)));
    const int32x4_t vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum89ABCDEF)));
    const int32x4_t vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsumGHIJKLMN)));
    const int32x4_t vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsumGHIJKLMN)));

    vst1q_s32(b, vacc0123); b += 4;
    vst1q_s32(b, vacc4567); b += 4;
    vst1q_s32(b, vacc89AB); b += 4;
    vst1q_s32(b, vaccCDEF); b += 4;
    vst1q_s32(b, vaccGHIJ); b += 4;
    vst1q_s32(b, vaccKLMN); b += 4;
  }
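  // Remaining channels are summed 8 at a time; doz() (difference-or-zero)
  // saturates the count so a 1-7 channel tail still ends the loop.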
  if XNN_UNLIKELY(c != 0) {
    do {
      const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8;
      const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8;
      const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8;
      uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567);

      const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8;
      vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567);
      const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8;
      vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567);
      const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8;
      vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567);
      const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8;
      vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567);
      vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567);

      const int32x4_t vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum01234567)));
      const int32x4_t vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum01234567)));

      vst1q_s32(b, vacc0123); b += 4;
      vst1q_s32(b, vacc4567); b += 4;

      c = doz(c, 8);
    } while (c != 0);
  }

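  // Middle passes: while more than 7 rows remain, accumulate another 7 rows of
  // input on top of the partial sums already in the buffer.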
  for (rows -= 7; rows > 7; rows -= 7) {
    i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
    i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
    i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment);
    i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment);
    i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment);
    i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
    i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);

    int32_t* b = buffer;
    size_t c = channels;
    for (; c >= 24; c -= 24) {
      const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8;
      const uint8x8_t vi0x89ABCDEF = vld1_u8(i0); i0 += 8;
      const uint8x8_t vi0xGHIJKLMN = vld1_u8(i0); i0 += 8;
      const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8;
      const uint8x8_t vi1x89ABCDEF = vld1_u8(i1); i1 += 8;
      const uint8x8_t vi1xGHIJKLMN = vld1_u8(i1); i1 += 8;

      const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8;
      uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567);
      const uint8x8_t vi2x89ABCDEF = vld1_u8(i2); i2 += 8;
      uint16x8_t vsum89ABCDEF = vaddl_u8(vi0x89ABCDEF, vi1x89ABCDEF);
      const uint8x8_t vi2xGHIJKLMN = vld1_u8(i2); i2 += 8;
      uint16x8_t vsumGHIJKLMN = vaddl_u8(vi0xGHIJKLMN, vi1xGHIJKLMN);

      const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8;
      vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567);
      const uint8x8_t vi3x89ABCDEF = vld1_u8(i3); i3 += 8;
      vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi2x89ABCDEF);
      const uint8x8_t vi3xGHIJKLMN = vld1_u8(i3); i3 += 8;
      vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi2xGHIJKLMN);
      const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8;
      vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567);
      const uint8x8_t vi4x89ABCDEF = vld1_u8(i4); i4 += 8;
      vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi3x89ABCDEF);
      const uint8x8_t vi4xGHIJKLMN = vld1_u8(i4); i4 += 8;
      vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi3xGHIJKLMN);
      const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8;
      vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567);
      const uint8x8_t vi5x89ABCDEF = vld1_u8(i5); i5 += 8;
      vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi4x89ABCDEF);
      const uint8x8_t vi5xGHIJKLMN = vld1_u8(i5); i5 += 8;
      vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi4xGHIJKLMN);
      const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8;
      vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567);
      const uint8x8_t vi6x89ABCDEF = vld1_u8(i6); i6 += 8;
      vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi5x89ABCDEF);
      const uint8x8_t vi6xGHIJKLMN = vld1_u8(i6); i6 += 8;
      vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi5xGHIJKLMN);
      int32x4_t vacc0123 = vld1q_s32(b);
      int32x4_t vacc4567 = vld1q_s32(b + 4);
      vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567);
      int32x4_t vacc89AB = vld1q_s32(b + 8);
      int32x4_t vaccCDEF = vld1q_s32(b + 12);
      vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi6x89ABCDEF);
      int32x4_t vaccGHIJ = vld1q_s32(b + 16);
      int32x4_t vaccKLMN = vld1q_s32(b + 20);
      vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi6xGHIJKLMN);

      vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vsum01234567)));
      vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vsum01234567)));
      vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vsum89ABCDEF)));
      vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vsum89ABCDEF)));
      vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vsumGHIJKLMN)));
      vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vsumGHIJKLMN)));

      vst1q_s32(b, vacc0123); b += 4;
      vst1q_s32(b, vacc4567); b += 4;
      vst1q_s32(b, vacc89AB); b += 4;
      vst1q_s32(b, vaccCDEF); b += 4;
      vst1q_s32(b, vaccGHIJ); b += 4;
      vst1q_s32(b, vaccKLMN); b += 4;
    }
    if XNN_UNLIKELY(c != 0) {
      do {
        const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8;
        const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8;
        const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8;
        uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567);

        const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8;
        vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567);
        const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8;
        vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567);
        const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8;
        vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567);
        const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8;
        vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567);
        int32x4_t vacc0123 = vld1q_s32(b);
        int32x4_t vacc4567 = vld1q_s32(b + 4);
        vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567);

        vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vsum01234567)));
        vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vsum01234567)));

        vst1q_s32(b, vacc0123); b += 4;
        vst1q_s32(b, vacc4567); b += 4;

        c = doz(c, 8);
      } while (c != 0);
    }
  }

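  // Final pass: 1-7 rows remain. Row pointers past the end are redirected to
  // the zero vector so they contribute nothing to the sums.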
  i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
  i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
  if XNN_UNPREDICTABLE(rows < 2) {
    i1 = zero;
  }
  i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment);
  if XNN_UNPREDICTABLE(rows <= 2) {
    i2 = zero;
  }
  i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment);
  if XNN_UNPREDICTABLE(rows < 4) {
    i3 = zero;
  }
  i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment);
  if XNN_UNPREDICTABLE(rows <= 4) {
    i4 = zero;
  }
  i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
  if XNN_UNPREDICTABLE(rows < 6) {
    i5 = zero;
  }
  i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);
  if XNN_UNPREDICTABLE(rows <= 6) {
    i6 = zero;
  }

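  // Requantization parameters: fp32 scale, magic bias for float-to-int
  // rounding, and the uint8 output clamping bounds.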
  const float32x4_t vscale = vld1q_dup_f32(&params->fp32_neon.scale);
  const float32x4_t vmagic_bias = vld1q_dup_f32(&params->fp32_neon.magic_bias);
  const int32x4_t vmagic_bias_less_output_zero_point = vld1q_dup_s32(&params->fp32_neon.magic_bias_less_output_zero_point);
  const uint8x16_t voutput_min = vld1q_dup_u8(&params->fp32_neon.output_min);
  const uint8x16_t voutput_max = vld1q_dup_u8(&params->fp32_neon.output_max);
  for (; channels >= 24; channels -= 24) {
    const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8;
    const uint8x8_t vi0x89ABCDEF = vld1_u8(i0); i0 += 8;
    const uint8x8_t vi0xGHIJKLMN = vld1_u8(i0); i0 += 8;
    const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8;
    const uint8x8_t vi1x89ABCDEF = vld1_u8(i1); i1 += 8;
    const uint8x8_t vi1xGHIJKLMN = vld1_u8(i1); i1 += 8;

    const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8;
    uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567);
    const uint8x8_t vi2x89ABCDEF = vld1_u8(i2); i2 += 8;
    uint16x8_t vsum89ABCDEF = vaddl_u8(vi0x89ABCDEF, vi1x89ABCDEF);
    const uint8x8_t vi2xGHIJKLMN = vld1_u8(i2); i2 += 8;
    uint16x8_t vsumGHIJKLMN = vaddl_u8(vi0xGHIJKLMN, vi1xGHIJKLMN);

    const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8;
    vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567);
    const uint8x8_t vi3x89ABCDEF = vld1_u8(i3); i3 += 8;
    vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi2x89ABCDEF);
    const uint8x8_t vi3xGHIJKLMN = vld1_u8(i3); i3 += 8;
    vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi2xGHIJKLMN);
    const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8;
    vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567);
    const uint8x8_t vi4x89ABCDEF = vld1_u8(i4); i4 += 8;
    vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi3x89ABCDEF);
    const uint8x8_t vi4xGHIJKLMN = vld1_u8(i4); i4 += 8;
    vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi3xGHIJKLMN);
    const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8;
    vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567);
    const uint8x8_t vi5x89ABCDEF = vld1_u8(i5); i5 += 8;
    vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi4x89ABCDEF);
    const uint8x8_t vi5xGHIJKLMN = vld1_u8(i5); i5 += 8;
    vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi4xGHIJKLMN);
    const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8;
    vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567);
    const uint8x8_t vi6x89ABCDEF = vld1_u8(i6); i6 += 8;
    vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi5x89ABCDEF);
    const uint8x8_t vi6xGHIJKLMN = vld1_u8(i6); i6 += 8;
    vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi5xGHIJKLMN);
    int32x4_t vacc0123 = vld1q_s32(buffer); buffer += 4;
    int32x4_t vacc4567 = vld1q_s32(buffer); buffer += 4;
    vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567);
    int32x4_t vacc89AB = vld1q_s32(buffer); buffer += 4;
    int32x4_t vaccCDEF = vld1q_s32(buffer); buffer += 4;
    vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi6x89ABCDEF);
    int32x4_t vaccGHIJ = vld1q_s32(buffer); buffer += 4;
    int32x4_t vaccKLMN = vld1q_s32(buffer); buffer += 4;
    vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi6xGHIJKLMN);

    vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vsum01234567)));
    vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vsum01234567)));
    vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vsum89ABCDEF)));
    vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vsum89ABCDEF)));
    vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vsumGHIJKLMN)));
    vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vsumGHIJKLMN)));

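    // Requantize: convert the sums to float, multiply by the precomputed scale,
    // round by adding the magic bias and reinterpreting the float bits as an
    // integer, then saturating-subtract (magic bias - output zero point).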
    float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123);
    float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567);
    float32x4_t vfpacc89AB = vcvtq_f32_s32(vacc89AB);
    float32x4_t vfpaccCDEF = vcvtq_f32_s32(vaccCDEF);
    float32x4_t vfpaccGHIJ = vcvtq_f32_s32(vaccGHIJ);
    float32x4_t vfpaccKLMN = vcvtq_f32_s32(vaccKLMN);

    vfpacc0123 = vmulq_f32(vfpacc0123, vscale);
    vfpacc4567 = vmulq_f32(vfpacc4567, vscale);
    vfpacc89AB = vmulq_f32(vfpacc89AB, vscale);
    vfpaccCDEF = vmulq_f32(vfpaccCDEF, vscale);
    vfpaccGHIJ = vmulq_f32(vfpaccGHIJ, vscale);
    vfpaccKLMN = vmulq_f32(vfpaccKLMN, vscale);

    vacc0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0123, vmagic_bias));
    vacc4567 = vreinterpretq_s32_f32(vaddq_f32(vfpacc4567, vmagic_bias));
    vacc89AB = vreinterpretq_s32_f32(vaddq_f32(vfpacc89AB, vmagic_bias));
    vaccCDEF = vreinterpretq_s32_f32(vaddq_f32(vfpaccCDEF, vmagic_bias));
    vaccGHIJ = vreinterpretq_s32_f32(vaddq_f32(vfpaccGHIJ, vmagic_bias));
    vaccKLMN = vreinterpretq_s32_f32(vaddq_f32(vfpaccKLMN, vmagic_bias));

    vacc0123 = vqsubq_s32(vacc0123, vmagic_bias_less_output_zero_point);
    vacc4567 = vqsubq_s32(vacc4567, vmagic_bias_less_output_zero_point);
    vacc89AB = vqsubq_s32(vacc89AB, vmagic_bias_less_output_zero_point);
    vaccCDEF = vqsubq_s32(vaccCDEF, vmagic_bias_less_output_zero_point);
    vaccGHIJ = vqsubq_s32(vaccGHIJ, vmagic_bias_less_output_zero_point);
    vaccKLMN = vqsubq_s32(vaccKLMN, vmagic_bias_less_output_zero_point);

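    // Narrow to 16 bits with signed saturation; AArch64 fuses the narrow and
    // the insert into the high half in one instruction.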
    #if XNN_ARCH_ARM64
      int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
      int16x8_t vacc89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF);
      int16x8_t vaccGHIJKLMN = vqmovn_high_s32(vqmovn_s32(vaccGHIJ), vaccKLMN);
    #else  // !XNN_ARCH_ARM64
      int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
      int16x8_t vacc89ABCDEF = vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF));
      int16x8_t vaccGHIJKLMN = vcombine_s16(vqmovn_s32(vaccGHIJ), vqmovn_s32(vaccKLMN));
    #endif  // !XNN_ARCH_ARM64

    #if XNN_ARCH_ARM64
      uint8x16_t vout0123456789ABCDEF = vqmovun_high_s16(vqmovun_s16(vacc01234567), vacc89ABCDEF);
      uint8x8_t voutGHIJKLMN = vqmovun_s16(vaccGHIJKLMN);
    #else  // !XNN_ARCH_ARM64
      uint8x16_t vout0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc01234567), vqmovun_s16(vacc89ABCDEF));
      uint8x8_t voutGHIJKLMN = vqmovun_s16(vaccGHIJKLMN);
    #endif  // !XNN_ARCH_ARM64

    vout0123456789ABCDEF = vmaxq_u8(vout0123456789ABCDEF, voutput_min);
    voutGHIJKLMN = vmax_u8(voutGHIJKLMN, vget_low_u8(voutput_min));

    vout0123456789ABCDEF = vminq_u8(vout0123456789ABCDEF, voutput_max);
    voutGHIJKLMN = vmin_u8(voutGHIJKLMN, vget_low_u8(voutput_max));

    vst1q_u8(output, vout0123456789ABCDEF); output += 16;
    vst1_u8(output, voutGHIJKLMN); output += 8;
  }
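  // Channel remainder: process the last 1-23 channels 8 at a time; only the
  // final group may store fewer than 8 bytes.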
  if XNN_UNLIKELY(channels != 0) {
    do {
      const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8;
      const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8;
      const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8;
      uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567);

      const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8;
      vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567);
      const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8;
      vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567);
      const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8;
      vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567);
      const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8;
      vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567);
      int32x4_t vacc0123 = vld1q_s32(buffer); buffer += 4;
      int32x4_t vacc4567 = vld1q_s32(buffer); buffer += 4;
      vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567);

      vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vsum01234567)));
      vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vsum01234567)));

      float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123);
      float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567);

      vfpacc0123 = vmulq_f32(vfpacc0123, vscale);
      vfpacc4567 = vmulq_f32(vfpacc4567, vscale);

      vacc0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0123, vmagic_bias));
      vacc4567 = vreinterpretq_s32_f32(vaddq_f32(vfpacc4567, vmagic_bias));

      vacc0123 = vqsubq_s32(vacc0123, vmagic_bias_less_output_zero_point);
      vacc4567 = vqsubq_s32(vacc4567, vmagic_bias_less_output_zero_point);

      #if XNN_ARCH_ARM64
        int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
      #else
        int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
      #endif

      uint8x8_t vout01234567 = vqmovun_s16(vacc01234567);
      vout01234567 = vmax_u8(vout01234567, vget_low_u8(voutput_min));
      vout01234567 = vmin_u8(vout01234567, vget_low_u8(voutput_max));

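      // Full groups store 8 bytes; a partial tail is written as up to a 4-byte,
      // then 2-byte, then 1-byte piece, rotating the vector with vext in between.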
      if XNN_LIKELY(channels >= 8) {
        vst1_u8(output, vout01234567); output += 8;
        channels -= 8;
      } else {
        if (channels & 4) {
          vst1_lane_u32((void*) output, vreinterpret_u32_u8(vout01234567), 0); output += 4;
          vout01234567 = vext_u8(vout01234567, vout01234567, 4);
        }
        if (channels & 2) {
          vst1_lane_u16((void*) output, vreinterpret_u16_u8(vout01234567), 0); output += 2;
          vout01234567 = vext_u8(vout01234567, vout01234567, 2);
        }
        if (channels & 1) {
          vst1_lane_u8(output, vout01234567, 0); output += 1;
        }
        channels = 0;
      }
    } while (channels != 0);
  }
}