// Auto-generated file. Do not edit!
//   Template: src/qs8-gavgpool/multipass-sse2.c.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>

#include <emmintrin.h>

#include <xnnpack/gavgpool.h>
#include <xnnpack/math.h>
#include <xnnpack/unaligned.h>


void xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c8(
    size_t rows,
    size_t channels,
    const uint8_t* input,
    size_t input_stride,
    const uint8_t* zero,
    int32_t* buffer,
    uint8_t* output,
    const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(rows > 7);
  assert(channels != 0);

  const uint8_t* i0 = input;
  const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
  const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
  const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
  const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
  const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
  const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(uint8_t);

  const __m128i vinit_bias = _mm_load_si128((const __m128i*) params->fp32_sse2.init_bias);
  const __m128i vzero = _mm_setzero_si128();
  int32_t* b = buffer;
  size_t c = channels;
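  // First pass: sum rows 0-6 for each group of 8 channels, add the
  // initialization bias from params, and store 32-bit partial sums to buffer.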
  for (; c != 0; c = doz(c, 8)) {

    const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
    i0 += 8;

    const __m128i vxi0x01234567 = _mm_unpacklo_epi8(vi0x01234567, vzero);
    const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
    i1 += 8;

    const __m128i vxi1x01234567 = _mm_unpacklo_epi8(vi1x01234567, vzero);
    const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
    i2 += 8;

    __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
    const __m128i vxi2x01234567 = _mm_unpacklo_epi8(vi2x01234567, vzero);
    const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
    i3 += 8;

    vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
    const __m128i vxi3x01234567 = _mm_unpacklo_epi8(vi3x01234567, vzero);
    const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
    i4 += 8;

    vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
    const __m128i vxi4x01234567 = _mm_unpacklo_epi8(vi4x01234567, vzero);
    const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
    i5 += 8;

    vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
    const __m128i vxi5x01234567 = _mm_unpacklo_epi8(vi5x01234567, vzero);
    const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
    i6 += 8;

    vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
    const __m128i vxi6x01234567 = _mm_unpacklo_epi8(vi6x01234567, vzero);

    vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);

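    // Widen the 16-bit sums to 32 bits; a sum of 7 uint8 rows is at most
    // 7 * 255 = 1785, so the 16-bit accumulator cannot overflow.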
    __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vzero);
    __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vzero);

    vacc0123 = _mm_add_epi32(vacc0123, vinit_bias);
    vacc4567 = _mm_add_epi32(vacc4567, vinit_bias);

    _mm_store_si128((__m128i*) b, vacc0123);
    _mm_store_si128((__m128i*) (b + 4), vacc4567);
    b += 8;
  }

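  // Intermediate passes: accumulate 7 more rows into the 32-bit buffer per
  // iteration until no more than 7 rows remain.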
  for (rows -= 7; rows > 7; rows -= 7) {
    i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
    i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
    i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment);
    i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment);
    i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment);
    i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
    i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);

    int32_t* b = buffer;
    size_t c = channels;
    for (; c != 0; c = doz(c, 8)) {

      const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
      i0 += 8;

      const __m128i vxi0x01234567 = _mm_unpacklo_epi8(vi0x01234567, vzero);
      const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
      i1 += 8;

      const __m128i vxi1x01234567 = _mm_unpacklo_epi8(vi1x01234567, vzero);
      const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
      i2 += 8;

      __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
      const __m128i vxi2x01234567 = _mm_unpacklo_epi8(vi2x01234567, vzero);
      const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
      i3 += 8;

      vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
      const __m128i vxi3x01234567 = _mm_unpacklo_epi8(vi3x01234567, vzero);
      const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
      i4 += 8;

      vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
      const __m128i vxi4x01234567 = _mm_unpacklo_epi8(vi4x01234567, vzero);
      const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
      i5 += 8;

      vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
      const __m128i vxi5x01234567 = _mm_unpacklo_epi8(vi5x01234567, vzero);
      const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
      i6 += 8;

      vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
      const __m128i vxi6x01234567 = _mm_unpacklo_epi8(vi6x01234567, vzero);

      vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);

      __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vzero);
      __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vzero);

      vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) b));
      vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (b + 4)));

      _mm_store_si128((__m128i*) b, vacc0123);
      _mm_store_si128((__m128i*) (b + 4), vacc4567);
      b += 8;
    }
  }

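  // Last pass: between 1 and 7 rows remain. Redirect the row pointers that
  // would read past the end of the input to the zero buffer so they add 0.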
  i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
  i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
  if XNN_UNPREDICTABLE(rows < 2) {
    i1 = zero;
  }
  i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment);
  if XNN_UNPREDICTABLE(rows <= 2) {
    i2 = zero;
  }
  i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment);
  if XNN_UNPREDICTABLE(rows < 4) {
    i3 = zero;
  }
  i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment);
  if XNN_UNPREDICTABLE(rows <= 4) {
    i4 = zero;
  }
  i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
  if XNN_UNPREDICTABLE(rows < 6) {
    i5 = zero;
  }
  i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);
  if XNN_UNPREDICTABLE(rows <= 6) {
    i6 = zero;
  }

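  // Load the fp32 requantization parameters: the combined scale, the
  // clamping bounds, and the output zero point.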
  const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
  const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse2.output_max_less_zero_point);
  const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
  const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min);
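  // Final pass main loop: add the remaining rows to the buffered sums, then
  // requantize and store 8 output channels per iteration.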
  for (; channels >= 8; channels -= 8) {

    const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
    i0 += 8;

    const __m128i vxi0x01234567 = _mm_unpacklo_epi8(vi0x01234567, vzero);
    const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
    i1 += 8;

    const __m128i vxi1x01234567 = _mm_unpacklo_epi8(vi1x01234567, vzero);
    const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
    i2 += 8;

    __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
    const __m128i vxi2x01234567 = _mm_unpacklo_epi8(vi2x01234567, vzero);
    const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
    i3 += 8;

    vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
    const __m128i vxi3x01234567 = _mm_unpacklo_epi8(vi3x01234567, vzero);
    const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
    i4 += 8;

    vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
    const __m128i vxi4x01234567 = _mm_unpacklo_epi8(vi4x01234567, vzero);
    const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
    i5 += 8;

    vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
    const __m128i vxi5x01234567 = _mm_unpacklo_epi8(vi5x01234567, vzero);
    const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
    i6 += 8;

    vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
    const __m128i vxi6x01234567 = _mm_unpacklo_epi8(vi6x01234567, vzero);

    vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);

    __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vzero);
    __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vzero);

    vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) buffer));
    vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (buffer + 4)));
    buffer += 8;

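    // Requantize: convert the 32-bit sums to float, multiply by the
    // precomputed scale, clamp against the output maximum, and round back
    // to int32 (_mm_cvtps_epi32 rounds to nearest under the default MXCSR
    // rounding mode).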
    __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
    __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567);

    vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
    vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);

    vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point);
    vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point);

    vacc0123 = _mm_cvtps_epi32(vfpacc0123);
    vacc4567 = _mm_cvtps_epi32(vfpacc4567);

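    // Add the output zero point while narrowing to 16 bits (saturating),
    // pack to unsigned 8 bits, and apply the lower output bound.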
    __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);

    __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);

    vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min);

    _mm_storel_epi64((__m128i*) output, vout0123456701234567);
    output += 8;
  }
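  // Remainder: 1-7 channels are left. The 8-byte loads may read past the
  // valid data (permitted by XNN_OOB_READS), but only `channels` bytes are
  // written to the output.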
  if XNN_UNLIKELY(channels != 0) {
    {

      const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
      i0 += 8;

      const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
      i1 += 8;

      const __m128i vxi0x01234567 = _mm_unpacklo_epi8(vi0x01234567, vzero);
      const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
      i2 += 8;

      const __m128i vxi1x01234567 = _mm_unpacklo_epi8(vi1x01234567, vzero);
      const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
      i3 += 8;

      __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
      const __m128i vxi2x01234567 = _mm_unpacklo_epi8(vi2x01234567, vzero);
      const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
      i4 += 8;

      vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
      const __m128i vxi3x01234567 = _mm_unpacklo_epi8(vi3x01234567, vzero);
      const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
      i5 += 8;

      vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
      const __m128i vxi4x01234567 = _mm_unpacklo_epi8(vi4x01234567, vzero);
      const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
      i6 += 8;

      vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
      const __m128i vxi5x01234567 = _mm_unpacklo_epi8(vi5x01234567, vzero);

      vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
      const __m128i vxi6x01234567 = _mm_unpacklo_epi8(vi6x01234567, vzero);

      vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);

      __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vzero);
      __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vzero);

      vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) buffer));
      vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (buffer + 4)));
      buffer += 8;

      __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
      __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567);

      vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
      vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);

      vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point);
      vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point);

      vacc0123 = _mm_cvtps_epi32(vfpacc0123);
      vacc4567 = _mm_cvtps_epi32(vfpacc4567);

      __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);

      __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
      vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min);

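      // Store the remaining 1-7 bytes: first 4, then 2, then 1, shifting the
      // stored lanes out of the vector after each partial store.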
      if (channels & 4) {
        unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
        vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
        output += 4;
      }
      uint32_t vout0123 = (uint32_t) _mm_cvtsi128_si32(vout0123456701234567);
      if (channels & 2) {
        unaligned_store_u16(output, (uint16_t) vout0123);
        vout0123 >>= 16;
        output += 2;
      }
      if (channels & 1) {
        *output = (uint8_t) vout0123;
      }
    }
  }
}