// Auto-generated file. Do not edit!
//   Template: src/qs8-gavgpool/multipass-sse4.c.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>

#include <smmintrin.h>

#include <xnnpack/gavgpool.h>
#include <xnnpack/math.h>
#include <xnnpack/unaligned.h>

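// Multipass QU8 global average pooling with fp32 requantization: rows are
// accumulated 7 at a time into an int32 buffer, and the final pass scales,
// clamps, and packs the sums to uint8, processing 24 channels per iteration.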
void xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c24(
    size_t rows,
    size_t channels,
    const uint8_t* input,
    size_t input_stride,
    const uint8_t* zero,
    int32_t* buffer,
    uint8_t* output,
    const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(rows > 7);
  assert(channels != 0);

  const uint8_t* i0 = input;
  const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
  const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
  const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
  const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
  const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
  const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(uint8_t);

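  // First pass: sum the first 7 rows into the int32 buffer, seeding each
  // accumulator with the init bias from the params.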
  const __m128i vinit_bias = _mm_load_si128((const __m128i*) params->fp32_sse4.init_bias);
  int32_t* b = buffer;
  size_t c = channels;
  for (; c >= 24; c -= 24) {
    const __m128i vxi0x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i0));
    const __m128i vxi0x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i0 + 8)));
    const __m128i vxi0xGHIJKLMN = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i0 + 16)));
    i0 += 24;
    const __m128i vxi1x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i1));
    const __m128i vxi1x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i1 + 8)));
    const __m128i vxi1xGHIJKLMN = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i1 + 16)));
    i1 += 24;

    __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
    const __m128i vxi2x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i2));
    __m128i vacc89ABCDEF = _mm_add_epi16(vxi0x89ABCDEF, vxi1x89ABCDEF);
    const __m128i vxi2x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i2 + 8)));
    __m128i vaccGHIJKLMN = _mm_add_epi16(vxi0xGHIJKLMN, vxi1xGHIJKLMN);
    const __m128i vxi2xGHIJKLMN = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i2 + 16)));
    i2 += 24;

    vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
    const __m128i vxi3x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i3));
    vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi2x89ABCDEF);
    const __m128i vxi3x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i3 + 8)));
    vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi2xGHIJKLMN);
    const __m128i vxi3xGHIJKLMN = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i3 + 16)));
    i3 += 24;
    vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
    const __m128i vxi4x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i4));
    vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi3x89ABCDEF);
    const __m128i vxi4x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i4 + 8)));
    vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi3xGHIJKLMN);
    const __m128i vxi4xGHIJKLMN = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i4 + 16)));
    i4 += 24;
    vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
    const __m128i vxi5x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i5));
    vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi4x89ABCDEF);
    const __m128i vxi5x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i5 + 8)));
    vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi4xGHIJKLMN);
    const __m128i vxi5xGHIJKLMN = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i5 + 16)));
    i5 += 24;
    vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
    const __m128i vxi6x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i6));
    vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi5x89ABCDEF);
    const __m128i vxi6x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i6 + 8)));
    vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi5xGHIJKLMN);
    const __m128i vxi6xGHIJKLMN = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i6 + 16)));
    i6 += 24;

    vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
    vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi6x89ABCDEF);
    vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi6xGHIJKLMN);

    const __m128i vzero = _mm_setzero_si128();
    __m128i vacc0123 = _mm_cvtepu16_epi32(vacc01234567);
    __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vzero);
    __m128i vacc89AB = _mm_cvtepu16_epi32(vacc89ABCDEF);
    __m128i vaccCDEF = _mm_unpackhi_epi16(vacc89ABCDEF, vzero);
    __m128i vaccGHIJ = _mm_cvtepu16_epi32(vaccGHIJKLMN);
    __m128i vaccKLMN = _mm_unpackhi_epi16(vaccGHIJKLMN, vzero);

    vacc0123 = _mm_add_epi32(vacc0123, vinit_bias);
    vacc4567 = _mm_add_epi32(vacc4567, vinit_bias);
    vacc89AB = _mm_add_epi32(vacc89AB, vinit_bias);
    vaccCDEF = _mm_add_epi32(vaccCDEF, vinit_bias);
    vaccGHIJ = _mm_add_epi32(vaccGHIJ, vinit_bias);
    vaccKLMN = _mm_add_epi32(vaccKLMN, vinit_bias);

    _mm_store_si128((__m128i*) b, vacc0123);
    _mm_store_si128((__m128i*) (b + 4), vacc4567);
    _mm_store_si128((__m128i*) (b + 8), vacc89AB);
    _mm_store_si128((__m128i*) (b + 12), vaccCDEF);
    _mm_store_si128((__m128i*) (b + 16), vaccGHIJ);
    _mm_store_si128((__m128i*) (b + 20), vaccKLMN);
    b += 24;
  }
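  // First-pass remainder: process the last 1-23 channels in groups of 8. The
  // buffer holds round_up_po2(channels, 8) elements, and XNN_OOB_READS permits
  // reading a full 8-byte group past the end of each row.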
  if XNN_UNLIKELY(c != 0) {
    do {
      const __m128i vxi0x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i0));
      i0 += 8;
      const __m128i vxi1x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i1));
      i1 += 8;

      __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
      const __m128i vxi2x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i2));
      i2 += 8;

      vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
      const __m128i vxi3x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i3));
      i3 += 8;
      vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
      const __m128i vxi4x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i4));
      i4 += 8;
      vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
      const __m128i vxi5x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i5));
      i5 += 8;
      vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
      const __m128i vxi6x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i6));
      i6 += 8;

      vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);

      __m128i vacc0123 = _mm_cvtepu16_epi32(vacc01234567);
      __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, _mm_setzero_si128());

      vacc0123 = _mm_add_epi32(vacc0123, vinit_bias);
      vacc4567 = _mm_add_epi32(vacc4567, vinit_bias);

      _mm_store_si128((__m128i*) b, vacc0123);
      _mm_store_si128((__m128i*) (b + 4), vacc4567);
      b += 8;

      c = doz(c, 8);
    } while (c != 0);
  }

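  // Intermediate passes: accumulate 7 more rows into the buffer per iteration
  // until no more than 7 rows remain.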
  for (rows -= 7; rows > 7; rows -= 7) {
    i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
    i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
    i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment);
    i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment);
    i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment);
    i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
    i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);

    int32_t* b = buffer;
    size_t c = channels;
    for (; c >= 24; c -= 24) {
      const __m128i vxi0x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i0));
      const __m128i vxi0x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i0 + 8)));
      const __m128i vxi0xGHIJKLMN = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i0 + 16)));
      i0 += 24;
      const __m128i vxi1x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i1));
      const __m128i vxi1x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i1 + 8)));
      const __m128i vxi1xGHIJKLMN = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i1 + 16)));
      i1 += 24;

      __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
      const __m128i vxi2x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i2));
      __m128i vacc89ABCDEF = _mm_add_epi16(vxi0x89ABCDEF, vxi1x89ABCDEF);
      const __m128i vxi2x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i2 + 8)));
      __m128i vaccGHIJKLMN = _mm_add_epi16(vxi0xGHIJKLMN, vxi1xGHIJKLMN);
      const __m128i vxi2xGHIJKLMN = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i2 + 16)));
      i2 += 24;

      vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
      const __m128i vxi3x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i3));
      vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi2x89ABCDEF);
      const __m128i vxi3x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i3 + 8)));
      vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi2xGHIJKLMN);
      const __m128i vxi3xGHIJKLMN = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i3 + 16)));
      i3 += 24;
      vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
      const __m128i vxi4x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i4));
      vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi3x89ABCDEF);
      const __m128i vxi4x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i4 + 8)));
      vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi3xGHIJKLMN);
      const __m128i vxi4xGHIJKLMN = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i4 + 16)));
      i4 += 24;
      vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
      const __m128i vxi5x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i5));
      vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi4x89ABCDEF);
      const __m128i vxi5x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i5 + 8)));
      vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi4xGHIJKLMN);
      const __m128i vxi5xGHIJKLMN = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i5 + 16)));
      i5 += 24;
      vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
      const __m128i vxi6x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i6));
      vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi5x89ABCDEF);
      const __m128i vxi6x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i6 + 8)));
      vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi5xGHIJKLMN);
      const __m128i vxi6xGHIJKLMN = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i6 + 16)));
      i6 += 24;

      vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
      vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi6x89ABCDEF);
      vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi6xGHIJKLMN);

      const __m128i vzero = _mm_setzero_si128();
      __m128i vacc0123 = _mm_cvtepu16_epi32(vacc01234567);
      __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vzero);
      __m128i vacc89AB = _mm_cvtepu16_epi32(vacc89ABCDEF);
      __m128i vaccCDEF = _mm_unpackhi_epi16(vacc89ABCDEF, vzero);
      __m128i vaccGHIJ = _mm_cvtepu16_epi32(vaccGHIJKLMN);
      __m128i vaccKLMN = _mm_unpackhi_epi16(vaccGHIJKLMN, vzero);

      vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) b));
      vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (b + 4)));
      vacc89AB = _mm_add_epi32(vacc89AB, _mm_load_si128((const __m128i*) (b + 8)));
      vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_load_si128((const __m128i*) (b + 12)));
      vaccGHIJ = _mm_add_epi32(vaccGHIJ, _mm_load_si128((const __m128i*) (b + 16)));
      vaccKLMN = _mm_add_epi32(vaccKLMN, _mm_load_si128((const __m128i*) (b + 20)));

      _mm_store_si128((__m128i*) b, vacc0123);
      _mm_store_si128((__m128i*) (b + 4), vacc4567);
      _mm_store_si128((__m128i*) (b + 8), vacc89AB);
      _mm_store_si128((__m128i*) (b + 12), vaccCDEF);
      _mm_store_si128((__m128i*) (b + 16), vaccGHIJ);
      _mm_store_si128((__m128i*) (b + 20), vaccKLMN);
      b += 24;
    }
    if XNN_UNLIKELY(c != 0) {
      do {
        const __m128i vxi0x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i0));
        i0 += 8;
        const __m128i vxi1x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i1));
        i1 += 8;

        __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
        const __m128i vxi2x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i2));
        i2 += 8;

        vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
        const __m128i vxi3x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i3));
        i3 += 8;
        vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
        const __m128i vxi4x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i4));
        i4 += 8;
        vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
        const __m128i vxi5x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i5));
        i5 += 8;
        vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
        const __m128i vxi6x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i6));
        i6 += 8;

        vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);

        __m128i vacc0123 = _mm_cvtepu16_epi32(vacc01234567);
        __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, _mm_setzero_si128());

        vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) b));
        vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (b + 4)));

        _mm_store_si128((__m128i*) b, vacc0123);
        _mm_store_si128((__m128i*) (b + 4), vacc4567);
        b += 8;

        c = doz(c, 8);
      } while (c != 0);
    }
  }

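  // Final pass: up to 7 rows remain. Point the pointers of the rows past the
  // end at the zero vector so they contribute nothing to the sums.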
  i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
  i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
  if XNN_UNPREDICTABLE(rows < 2) {
    i1 = zero;
  }
  i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment);
  if XNN_UNPREDICTABLE(rows <= 2) {
    i2 = zero;
  }
  i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment);
  if XNN_UNPREDICTABLE(rows < 4) {
    i3 = zero;
  }
  i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment);
  if XNN_UNPREDICTABLE(rows <= 4) {
    i4 = zero;
  }
  i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
  if XNN_UNPREDICTABLE(rows < 6) {
    i5 = zero;
  }
  i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);
  if XNN_UNPREDICTABLE(rows <= 6) {
    i6 = zero;
  }

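  // Requantization constants: fp32 scale, upper clamp bound (stored as
  // output_max - output_zero_point), output zero point, and lower clamp bound.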
  const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
  const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point);
  const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
  const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse4.output_min);
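  // Add the last rows to the buffered sums, convert to float, scale, clamp
  // from above, round back to int32 (nearest-even under the default MXCSR
  // rounding mode), and pack to uint8 with the lower clamp applied last.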
  for (; channels >= 24; channels -= 24) {
    const __m128i vxi0x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i0));
    const __m128i vxi0x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i0 + 8)));
    const __m128i vxi0xGHIJKLMN = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i0 + 16)));
    i0 += 24;
    const __m128i vxi1x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i1));
    const __m128i vxi1x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i1 + 8)));
    const __m128i vxi1xGHIJKLMN = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i1 + 16)));
    i1 += 24;

    __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
    const __m128i vxi2x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i2));
    __m128i vacc89ABCDEF = _mm_add_epi16(vxi0x89ABCDEF, vxi1x89ABCDEF);
    const __m128i vxi2x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i2 + 8)));
    __m128i vaccGHIJKLMN = _mm_add_epi16(vxi0xGHIJKLMN, vxi1xGHIJKLMN);
    const __m128i vxi2xGHIJKLMN = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i2 + 16)));
    i2 += 24;

    vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
    const __m128i vxi3x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i3));
    vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi2x89ABCDEF);
    const __m128i vxi3x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i3 + 8)));
    vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi2xGHIJKLMN);
    const __m128i vxi3xGHIJKLMN = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i3 + 16)));
    i3 += 24;
    vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
    const __m128i vxi4x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i4));
    vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi3x89ABCDEF);
    const __m128i vxi4x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i4 + 8)));
    vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi3xGHIJKLMN);
    const __m128i vxi4xGHIJKLMN = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i4 + 16)));
    i4 += 24;
    vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
    const __m128i vxi5x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i5));
    vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi4x89ABCDEF);
    const __m128i vxi5x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i5 + 8)));
    vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi4xGHIJKLMN);
    const __m128i vxi5xGHIJKLMN = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i5 + 16)));
    i5 += 24;
    vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
    const __m128i vxi6x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i6));
    vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi5x89ABCDEF);
    const __m128i vxi6x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i6 + 8)));
    vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi5xGHIJKLMN);
    const __m128i vxi6xGHIJKLMN = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i6 + 16)));
    i6 += 24;

    vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
    vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi6x89ABCDEF);
    vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi6xGHIJKLMN);

    const __m128i vzero = _mm_setzero_si128();
    __m128i vacc0123 = _mm_cvtepu16_epi32(vacc01234567);
    __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vzero);
    __m128i vacc89AB = _mm_cvtepu16_epi32(vacc89ABCDEF);
    __m128i vaccCDEF = _mm_unpackhi_epi16(vacc89ABCDEF, vzero);
    __m128i vaccGHIJ = _mm_cvtepu16_epi32(vaccGHIJKLMN);
    __m128i vaccKLMN = _mm_unpackhi_epi16(vaccGHIJKLMN, vzero);

    vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) buffer));
    vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (buffer + 4)));
    vacc89AB = _mm_add_epi32(vacc89AB, _mm_load_si128((const __m128i*) (buffer + 8)));
    vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_load_si128((const __m128i*) (buffer + 12)));
    vaccGHIJ = _mm_add_epi32(vaccGHIJ, _mm_load_si128((const __m128i*) (buffer + 16)));
    vaccKLMN = _mm_add_epi32(vaccKLMN, _mm_load_si128((const __m128i*) (buffer + 20)));
    buffer += 24;

    __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
    __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567);
    __m128 vfpacc89AB = _mm_cvtepi32_ps(vacc89AB);
    __m128 vfpaccCDEF = _mm_cvtepi32_ps(vaccCDEF);
    __m128 vfpaccGHIJ = _mm_cvtepi32_ps(vaccGHIJ);
    __m128 vfpaccKLMN = _mm_cvtepi32_ps(vaccKLMN);

    vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
    vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
    vfpacc89AB = _mm_mul_ps(vfpacc89AB, vscale);
    vfpaccCDEF = _mm_mul_ps(vfpaccCDEF, vscale);
    vfpaccGHIJ = _mm_mul_ps(vfpaccGHIJ, vscale);
    vfpaccKLMN = _mm_mul_ps(vfpaccKLMN, vscale);

    vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point);
    vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point);
    vfpacc89AB = _mm_min_ps(vfpacc89AB, voutput_max_less_zero_point);
    vfpaccCDEF = _mm_min_ps(vfpaccCDEF, voutput_max_less_zero_point);
    vfpaccGHIJ = _mm_min_ps(vfpaccGHIJ, voutput_max_less_zero_point);
    vfpaccKLMN = _mm_min_ps(vfpaccKLMN, voutput_max_less_zero_point);

    vacc0123 = _mm_cvtps_epi32(vfpacc0123);
    vacc4567 = _mm_cvtps_epi32(vfpacc4567);
    vacc89AB = _mm_cvtps_epi32(vfpacc89AB);
    vaccCDEF = _mm_cvtps_epi32(vfpaccCDEF);
    vaccGHIJ = _mm_cvtps_epi32(vfpaccGHIJ);
    vaccKLMN = _mm_cvtps_epi32(vfpaccKLMN);

    __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
    __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
    __m128i voutGHIJKLMN = _mm_adds_epi16(_mm_packs_epi32(vaccGHIJ, vaccKLMN), voutput_zero_point);

    __m128i vout0123456789ABCDEF = _mm_packus_epi16(vout01234567, vout89ABCDEF);
    __m128i voutGHIJKLMNGHIJKLMN = _mm_packus_epi16(voutGHIJKLMN, voutGHIJKLMN);

    vout0123456789ABCDEF = _mm_max_epu8(vout0123456789ABCDEF, voutput_min);
    voutGHIJKLMNGHIJKLMN = _mm_max_epu8(voutGHIJKLMNGHIJKLMN, voutput_min);

    _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
    _mm_storel_epi64((__m128i*) (output + 16), voutGHIJKLMNGHIJKLMN);
    output += 24;
  }
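  // Final-pass remainder: handle the last 1-23 channels in groups of 8,
  // storing the final partial group 4, 2, and 1 bytes at a time.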
  if XNN_UNLIKELY(channels != 0) {
    do {
      const __m128i vxi0x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i0));
      i0 += 8;
      const __m128i vxi1x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i1));
      i1 += 8;

      __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
      const __m128i vxi2x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i2));
      i2 += 8;

      vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
      const __m128i vxi3x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i3));
      i3 += 8;
      vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
      const __m128i vxi4x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i4));
      i4 += 8;
      vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
      const __m128i vxi5x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i5));
      i5 += 8;
      vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
      const __m128i vxi6x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i6));
      i6 += 8;

      vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);

      __m128i vacc0123 = _mm_cvtepu16_epi32(vacc01234567);
      __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, _mm_setzero_si128());

      vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) buffer));
      vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (buffer + 4)));
      buffer += 8;

      __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
      __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567);

      vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
      vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);

      vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point);
      vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point);

      vacc0123 = _mm_cvtps_epi32(vfpacc0123);
      vacc4567 = _mm_cvtps_epi32(vfpacc4567);

      __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);

      __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
      vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min);

      if XNN_LIKELY(channels >= 8) {
        _mm_storel_epi64((__m128i*) output, vout0123456701234567);
        output += 8;
        channels -= 8;
      } else {
        if (channels & 4) {
          unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
          vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
          output += 4;
        }
        if (channels & 2) {
          unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout0123456701234567, 0));
          vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
          output += 2;
        }
        if (channels & 1) {
          *output = (uint8_t) _mm_extract_epi8(vout0123456701234567, 0);
          output += 1;
        }
        channels = 0;
      }
    } while (channels != 0);
  }
}